From e09866e7f2c4e900c5aac812b8b87c0d1f1af093 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 1 May 2023 16:42:44 +0000 Subject: [PATCH 001/182] add prior --- src/diffusers/__init__.py | 1 + src/diffusers/pipelines/__init__.py | 1 + src/diffusers/pipelines/kandinsky/__init__.py | 16 ++ .../kandinsky/pipeline_kandinsky_prior.py | 269 ++++++++++++++++++ 4 files changed, 287 insertions(+) create mode 100644 src/diffusers/pipelines/kandinsky/__init__.py create mode 100644 src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index f21a550517eb..d5ffbc224318 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -145,6 +145,7 @@ TextToVideoZeroPipeline, UnCLIPImageVariationPipeline, UnCLIPPipeline, + KandinskyPipeline, VersatileDiffusionDualGuidedPipeline, VersatileDiffusionImageVariationPipeline, VersatileDiffusionPipeline, diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index 10da653a1377..6ef385ae5f19 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -78,6 +78,7 @@ from .stable_diffusion_safe import StableDiffusionPipelineSafe from .text_to_video_synthesis import TextToVideoSDPipeline, TextToVideoZeroPipeline from .unclip import UnCLIPImageVariationPipeline, UnCLIPPipeline + from .kandinsky import KandinskyPipeline from .versatile_diffusion import ( VersatileDiffusionDualGuidedPipeline, VersatileDiffusionImageVariationPipeline, diff --git a/src/diffusers/pipelines/kandinsky/__init__.py b/src/diffusers/pipelines/kandinsky/__init__.py new file mode 100644 index 000000000000..7996ed2d581f --- /dev/null +++ b/src/diffusers/pipelines/kandinsky/__init__.py @@ -0,0 +1,16 @@ +from ...utils import ( + OptionalDependencyNotAvailable, + is_torch_available, + is_transformers_available, + is_transformers_version, +) + + +try: + if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.25.0")): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + print("to-do") +# from ...utils.dummy_torch_and_transformers_objects import UnCLIPImageVariationPipeline, UnCLIPPipeline +else: + from .pipeline_kandinsky_prior import KandinskyPipeline diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py new file mode 100644 index 000000000000..83de5526d04e --- /dev/null +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py @@ -0,0 +1,269 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
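The new file below implements the *prior* stage of Kandinsky 2.1: given a text prompt, it runs a CLIP text encoder and a `PriorTransformer` under an `UnCLIPScheduler` to produce CLIP image embeddings, which the decoder UNet added in a later patch consumes. As a rough orientation only, here is a sketch of how the class is meant to be called once weights are published; the repo id is an assumption and not part of this patch, while the argument names (`prior_num_inference_steps`, `prior_guidance_scale`) match the `__call__` signature defined below.

```python
# Hypothetical usage sketch -- the repo id "kandinsky-community/kandinsky-2-1-prior" is assumed, not defined here.
import torch
from diffusers import KandinskyPipeline  # prior pipeline registered by this patch

pipe_prior = KandinskyPipeline.from_pretrained("kandinsky-community/kandinsky-2-1-prior")
pipe_prior.to("cuda")

# Returns CLIP image embeddings that later condition the Kandinsky decoder UNet.
image_embeds = pipe_prior(
    "a portrait of a cat wearing a space suit",
    prior_num_inference_steps=25,
    prior_guidance_scale=4.0,
    generator=torch.Generator("cuda").manual_seed(0),
)
```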
+ +import inspect +from typing import List, Optional, Tuple, Union + +import torch +from transformers import CLIPTextModelWithProjection, CLIPTokenizer + +from ...models import PriorTransformer, UNet2DConditionModel +from ...pipelines import DiffusionPipeline +from ...schedulers import UnCLIPScheduler + +from ...utils import ( + logging, + randn_tensor, +) + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +class KandinskyPipeline(DiffusionPipeline): + """ + Pipeline for generate image prior for Kandinsky + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + text_encoder ([`CLIPTextModelWithProjection`]): + Frozen text-encoder. + prior_tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + prior ([`PriorTransformer`]): + The canonincal unCLIP prior to approximate the image embedding from the text embedding. + scheduler ([`UnCLIPScheduler`]): + A scheduler to be used in combination with `prior` to generate image embedding. + """ + + def __init__( + self, + prior: PriorTransformer, + text_encoder: CLIPTextModelWithProjection, + prior_tokenizer: CLIPTokenizer, + prior_scheduler: UnCLIPScheduler, + ): + super().__init__() + + self.register_modules( + prior=prior, + text_encoder=text_encoder, + prior_tokenizer=prior_tokenizer, + prior_scheduler=prior_scheduler, + ) + + def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + if latents.shape != shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") + latents = latents.to(device) + + latents = latents * scheduler.init_noise_sigma + return latents + + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + ): + + batch_size = len(prompt) if isinstance(prompt, list) else 1 + # get prompt text embeddings + text_inputs = self.prior_tokenizer( + prompt, + padding="max_length", + max_length=self.prior_tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + text_mask = text_inputs.attention_mask.bool().to(device) + + untruncated_ids = self.prior_tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.prior_tokenizer.batch_decode( + untruncated_ids[:, self.prior_tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.prior_tokenizer.model_max_length} tokens: {removed_text}" + ) + text_input_ids = text_input_ids[:, : self.prior_tokenizer.model_max_length] + + text_encoder_output = self.text_encoder(text_input_ids.to(device)) + + prompt_embeds = text_encoder_output.text_embeds + text_encoder_hidden_states = text_encoder_output.last_hidden_state + + prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0) + text_encoder_hidden_states = text_encoder_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + text_mask = 
text_mask.repeat_interleave(num_images_per_prompt, dim=0) + + if do_classifier_free_guidance: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + uncond_input = self.prior_tokenizer( + uncond_tokens, + padding="max_length", + max_length=self.prior_tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + uncond_text_mask = uncond_input.attention_mask.bool().to(device) + negative_prompt_embeds_text_encoder_output = self.text_encoder(uncond_input.input_ids.to(device)) + + negative_prompt_embeds = negative_prompt_embeds_text_encoder_output.text_embeds + uncond_text_encoder_hidden_states = negative_prompt_embeds_text_encoder_output.last_hidden_state + + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + + seq_len = negative_prompt_embeds.shape[1] + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len) + + seq_len = uncond_text_encoder_hidden_states.shape[1] + uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.repeat(1, num_images_per_prompt, 1) + uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.view( + batch_size * num_images_per_prompt, seq_len, -1 + ) + uncond_text_mask = uncond_text_mask.repeat_interleave(num_images_per_prompt, dim=0) + + # done duplicates + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + text_encoder_hidden_states = torch.cat([uncond_text_encoder_hidden_states, text_encoder_hidden_states]) + + text_mask = torch.cat([uncond_text_mask, text_mask]) + + return prompt_embeds, text_encoder_hidden_states, text_mask + + @property + def _execution_device(self): + r""" + Returns the device on which the pipeline's models will be executed. After calling + `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module + hooks. 
+ """ + # TO_DO + return self.device + + @torch.no_grad() + def __call__( + self, + prompt, + num_images_per_prompt: int = 1, + prior_num_inference_steps: int =5, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + prior_latents: Optional[torch.FloatTensor] = None, + negative_prompt: Optional[Union[str, List[str]]] = None, + prior_guidance_scale: float = 4.0, + output_type: Optional[str] = "pt", + return_dict: bool = True, + ): + if isinstance(prompt, str): + batch_size = 1 + elif isinstance(prompt, list): + batch_size = len(prompt) + else: + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + device = self._execution_device + + batch_size = batch_size * num_images_per_prompt + + do_classifier_free_guidance = prior_guidance_scale > 1.0 + prompt_embeds, text_encoder_hidden_states, text_mask = self._encode_prompt( + prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt + ) + + # prior + self.prior_scheduler.set_timesteps(prior_num_inference_steps, device=device) + prior_timesteps_tensor = self.prior_scheduler.timesteps + + embedding_dim = self.prior.config.embedding_dim + + prior_latents = self.prepare_latents( + (batch_size, embedding_dim), + prompt_embeds.dtype, + device, + generator, + prior_latents, + self.prior_scheduler, + ) + + for i, t in enumerate(self.progress_bar(prior_timesteps_tensor)): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([prior_latents] * 2) if do_classifier_free_guidance else prior_latents + + predicted_image_embedding = self.prior( + latent_model_input, + timestep=t, + proj_embedding=prompt_embeds, + encoder_hidden_states=text_encoder_hidden_states, + attention_mask=text_mask, + ).predicted_image_embedding + + if do_classifier_free_guidance: + predicted_image_embedding_uncond, predicted_image_embedding_text = predicted_image_embedding.chunk(2) + predicted_image_embedding = predicted_image_embedding_uncond + prior_guidance_scale * ( + predicted_image_embedding_text - predicted_image_embedding_uncond + ) + + if i + 1 == prior_timesteps_tensor.shape[0]: + prev_timestep = None + else: + prev_timestep = prior_timesteps_tensor[i + 1] + + prior_latents = self.prior_scheduler.step( + predicted_image_embedding, + timestep=t, + sample=prior_latents, + generator=generator, + prev_timestep=prev_timestep, + ).prev_sample + + prior_latents = self.prior.post_process_latents(prior_latents) + + image_embeddings = prior_latents + + return image_embeddings \ No newline at end of file From f28ad03bb726ada29616a08ab8ff57988ed73e90 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 1 May 2023 17:07:13 +0000 Subject: [PATCH 002/182] add conversion script --- scripts/convert_kandinsky_to_diffusers.py | 318 ++++++++++++++++++ ...ndinsky_prior.py => pipeline_kandinsky.py} | 0 2 files changed, 318 insertions(+) create mode 100644 scripts/convert_kandinsky_to_diffusers.py rename src/diffusers/pipelines/kandinsky/{pipeline_kandinsky_prior.py => pipeline_kandinsky.py} (100%) diff --git a/scripts/convert_kandinsky_to_diffusers.py b/scripts/convert_kandinsky_to_diffusers.py new file mode 100644 index 000000000000..d24d8a5cd122 --- /dev/null +++ b/scripts/convert_kandinsky_to_diffusers.py @@ -0,0 +1,318 @@ +import argparse +import tempfile + +import torch +from accelerate import load_checkpoint_and_dispatch +from diffusers.models.prior_transformer import PriorTransformer + + +""" +Example - From the diffusers root directory: + +Download weights: +```sh +$ 
wget https://huggingface.co/ai-forever/Kandinsky_2.1/blob/main/prior_fp16.ckpt +``` + +Convert the model: +```sh +python scripts/convert_kandinsky_to_diffusers.py \ + --prior_checkpoint_path /home/yiyi_huggingface_co/Kandinsky-2/checkpoints_Kandinsky_2.1/prior_fp16.ckpt \ + --clip_stat_path /home/yiyi_huggingface_co/Kandinsky-2/checkpoints_Kandinsky_2.1/ViT-L-14_stats.th \ + --dump_path ./kandinsky_model \ + --debug prior +``` +""" + +def split_attentions(*, weight, bias, split, chunk_size): + weights = [None] * split + biases = [None] * split + + weights_biases_idx = 0 + + for starting_row_index in range(0, weight.shape[0], chunk_size): + row_indices = torch.arange(starting_row_index, starting_row_index + chunk_size) + + weight_rows = weight[row_indices, :] + bias_rows = bias[row_indices] + + if weights[weights_biases_idx] is None: + assert weights[weights_biases_idx] is None + weights[weights_biases_idx] = weight_rows + biases[weights_biases_idx] = bias_rows + else: + assert weights[weights_biases_idx] is not None + weights[weights_biases_idx] = torch.concat([weights[weights_biases_idx], weight_rows]) + biases[weights_biases_idx] = torch.concat([biases[weights_biases_idx], bias_rows]) + + weights_biases_idx = (weights_biases_idx + 1) % split + + return weights, biases + + + +# prior + +PRIOR_ORIGINAL_PREFIX = "model" + +# Uses default arguments +PRIOR_CONFIG = {} + + +def prior_model_from_original_config(): + model = PriorTransformer(**PRIOR_CONFIG) + + return model + + +def prior_original_checkpoint_to_diffusers_checkpoint(model, checkpoint, clip_stats_checkpoint): + diffusers_checkpoint = {} + + # .time_embed.0 -> .time_embedding.linear_1 + diffusers_checkpoint.update( + { + "time_embedding.linear_1.weight": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.time_embed.0.weight"], + "time_embedding.linear_1.bias": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.time_embed.0.bias"], + } + ) + + # .clip_img_proj -> .proj_in + diffusers_checkpoint.update( + { + "proj_in.weight": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.clip_img_proj.weight"], + "proj_in.bias": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.clip_img_proj.bias"], + } + ) + + # .text_emb_proj -> .embedding_proj + diffusers_checkpoint.update( + { + "embedding_proj.weight": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.text_emb_proj.weight"], + "embedding_proj.bias": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.text_emb_proj.bias"], + } + ) + + # .text_enc_proj -> .encoder_hidden_states_proj + diffusers_checkpoint.update( + { + "encoder_hidden_states_proj.weight": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.text_enc_proj.weight"], + "encoder_hidden_states_proj.bias": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.text_enc_proj.bias"], + } + ) + + # .positional_embedding -> .positional_embedding + diffusers_checkpoint.update({"positional_embedding": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.positional_embedding"]}) + + # .prd_emb -> .prd_embedding + diffusers_checkpoint.update({"prd_embedding": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.prd_emb"]}) + + # .time_embed.2 -> .time_embedding.linear_2 + diffusers_checkpoint.update( + { + "time_embedding.linear_2.weight": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.time_embed.2.weight"], + "time_embedding.linear_2.bias": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.time_embed.2.bias"], + } + ) + + # .resblocks. -> .transformer_blocks. 
+ for idx in range(len(model.transformer_blocks)): + diffusers_transformer_prefix = f"transformer_blocks.{idx}" + original_transformer_prefix = f"{PRIOR_ORIGINAL_PREFIX}.transformer.resblocks.{idx}" + + # .attn -> .attn1 + diffusers_attention_prefix = f"{diffusers_transformer_prefix}.attn1" + original_attention_prefix = f"{original_transformer_prefix}.attn" + diffusers_checkpoint.update( + prior_attention_to_diffusers( + checkpoint, + diffusers_attention_prefix=diffusers_attention_prefix, + original_attention_prefix=original_attention_prefix, + attention_head_dim=model.attention_head_dim, + ) + ) + + # .mlp -> .ff + diffusers_ff_prefix = f"{diffusers_transformer_prefix}.ff" + original_ff_prefix = f"{original_transformer_prefix}.mlp" + diffusers_checkpoint.update( + prior_ff_to_diffusers( + checkpoint, diffusers_ff_prefix=diffusers_ff_prefix, original_ff_prefix=original_ff_prefix + ) + ) + + # .ln_1 -> .norm1 + diffusers_checkpoint.update( + { + f"{diffusers_transformer_prefix}.norm1.weight": checkpoint[ + f"{original_transformer_prefix}.ln_1.weight" + ], + f"{diffusers_transformer_prefix}.norm1.bias": checkpoint[f"{original_transformer_prefix}.ln_1.bias"], + } + ) + + # .ln_2 -> .norm3 + diffusers_checkpoint.update( + { + f"{diffusers_transformer_prefix}.norm3.weight": checkpoint[ + f"{original_transformer_prefix}.ln_2.weight" + ], + f"{diffusers_transformer_prefix}.norm3.bias": checkpoint[f"{original_transformer_prefix}.ln_2.bias"], + } + ) + + # .final_ln -> .norm_out + diffusers_checkpoint.update( + { + "norm_out.weight": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.final_ln.weight"], + "norm_out.bias": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.final_ln.bias"], + } + ) + + # .out_proj -> .proj_to_clip_embeddings + diffusers_checkpoint.update( + { + "proj_to_clip_embeddings.weight": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.out_proj.weight"], + "proj_to_clip_embeddings.bias": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.out_proj.bias"], + } + ) + + # clip stats + clip_mean, clip_std = clip_stats_checkpoint + clip_mean = clip_mean[None, :] + clip_std = clip_std[None, :] + + diffusers_checkpoint.update({"clip_mean": clip_mean, "clip_std": clip_std}) + + return diffusers_checkpoint + + +def prior_attention_to_diffusers( + checkpoint, *, diffusers_attention_prefix, original_attention_prefix, attention_head_dim +): + diffusers_checkpoint = {} + + # .c_qkv -> .{to_q, to_k, to_v} + [q_weight, k_weight, v_weight], [q_bias, k_bias, v_bias] = split_attentions( + weight=checkpoint[f"{original_attention_prefix}.c_qkv.weight"], + bias=checkpoint[f"{original_attention_prefix}.c_qkv.bias"], + split=3, + chunk_size=attention_head_dim, + ) + + diffusers_checkpoint.update( + { + f"{diffusers_attention_prefix}.to_q.weight": q_weight, + f"{diffusers_attention_prefix}.to_q.bias": q_bias, + f"{diffusers_attention_prefix}.to_k.weight": k_weight, + f"{diffusers_attention_prefix}.to_k.bias": k_bias, + f"{diffusers_attention_prefix}.to_v.weight": v_weight, + f"{diffusers_attention_prefix}.to_v.bias": v_bias, + } + ) + + # .c_proj -> .to_out.0 + diffusers_checkpoint.update( + { + f"{diffusers_attention_prefix}.to_out.0.weight": checkpoint[f"{original_attention_prefix}.c_proj.weight"], + f"{diffusers_attention_prefix}.to_out.0.bias": checkpoint[f"{original_attention_prefix}.c_proj.bias"], + } + ) + + return diffusers_checkpoint + + +def prior_ff_to_diffusers(checkpoint, *, diffusers_ff_prefix, original_ff_prefix): + diffusers_checkpoint = { + # .c_fc -> .net.0.proj + f"{diffusers_ff_prefix}.net.{0}.proj.weight": 
checkpoint[f"{original_ff_prefix}.c_fc.weight"], + f"{diffusers_ff_prefix}.net.{0}.proj.bias": checkpoint[f"{original_ff_prefix}.c_fc.bias"], + # .c_proj -> .net.2 + f"{diffusers_ff_prefix}.net.{2}.weight": checkpoint[f"{original_ff_prefix}.c_proj.weight"], + f"{diffusers_ff_prefix}.net.{2}.bias": checkpoint[f"{original_ff_prefix}.c_proj.bias"], + } + + return diffusers_checkpoint + + +# done prior + +def prior(*, args, checkpoint_map_location): + print("loading prior") + + prior_checkpoint = torch.load(args.prior_checkpoint_path, map_location=checkpoint_map_location) + + clip_stats_checkpoint = torch.load(args.clip_stat_path, map_location=checkpoint_map_location) + + prior_model = prior_model_from_original_config() + + prior_diffusers_checkpoint = prior_original_checkpoint_to_diffusers_checkpoint( + prior_model, prior_checkpoint, clip_stats_checkpoint + ) + + del prior_checkpoint + del clip_stats_checkpoint + + load_checkpoint_to_model(prior_diffusers_checkpoint, prior_model, strict=True) + + print("done loading prior") + + return prior_model + +def load_checkpoint_to_model(checkpoint, model, strict=False): + with tempfile.NamedTemporaryFile() as file: + torch.save(checkpoint, file.name) + del checkpoint + if strict: + model.load_state_dict(torch.load(file.name), strict=True) + else: + load_checkpoint_and_dispatch(model, file.name, device_map="auto") + + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.") + + parser.add_argument( + "--prior_checkpoint_path", + default=None, + type=str, + required=True, + help="Path to the prior checkpoint to convert.", + ) + parser.add_argument( + "--clip_stat_path", default=None, type=str, required=True, help="Path to the clip stats checkpoint to convert." + ) + parser.add_argument( + "--checkpoint_load_device", + default="cpu", + type=str, + required=False, + help="The device passed to `map_location` when loading checkpoints.", + ) + + parser.add_argument( + "--debug", + default=None, + type=str, + required=False, + help="Only run a specific stage of the convert script. 
Used for debugging", + ) + + args = parser.parse_args() + + print(f"loading checkpoints to {args.checkpoint_load_device}") + + checkpoint_map_location = torch.device(args.checkpoint_load_device) + + if args.debug is not None: + print(f"debug: only executing {args.debug}") + + if args.debug is None: + print("to-do") + elif args.debug == "prior": + prior_model = prior(args=args, checkpoint_map_location=checkpoint_map_location) + prior_model.save_pretrained(args.dump_path) + else: + raise ValueError(f"unknown debug value : {args.debug}") \ No newline at end of file diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py similarity index 100% rename from src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py rename to src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py From 09da58a2249de5cc4e8f1ba2c549711b8bd76fc4 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 3 May 2023 00:31:31 +0000 Subject: [PATCH 003/182] fix file name --- src/diffusers/pipelines/kandinsky/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/kandinsky/__init__.py b/src/diffusers/pipelines/kandinsky/__init__.py index 7996ed2d581f..fb4746bd6087 100644 --- a/src/diffusers/pipelines/kandinsky/__init__.py +++ b/src/diffusers/pipelines/kandinsky/__init__.py @@ -13,4 +13,4 @@ print("to-do") # from ...utils.dummy_torch_and_transformers_objects import UnCLIPImageVariationPipeline, UnCLIPPipeline else: - from .pipeline_kandinsky_prior import KandinskyPipeline + from .pipeline_kandinsky import KandinskyPipeline From 6c95524d7bb0d9b4a427f3a0f25d039ac938bfdb Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 4 May 2023 02:31:24 +0000 Subject: [PATCH 004/182] add unet + text_proj --- scripts/convert_kandinsky_to_diffusers.py | 561 +++++++++++++++++- .../pipelines/kandinsky/text_proj.py | 77 +++ 2 files changed, 611 insertions(+), 27 deletions(-) create mode 100644 src/diffusers/pipelines/kandinsky/text_proj.py diff --git a/scripts/convert_kandinsky_to_diffusers.py b/scripts/convert_kandinsky_to_diffusers.py index d24d8a5cd122..00941d3d2a3b 100644 --- a/scripts/convert_kandinsky_to_diffusers.py +++ b/scripts/convert_kandinsky_to_diffusers.py @@ -4,6 +4,8 @@ import torch from accelerate import load_checkpoint_and_dispatch from diffusers.models.prior_transformer import PriorTransformer +from diffusers.pipelines.kandinsky.text_proj import KandinskyTextProjModel +from diffusers import UNet2DConditionModel """ @@ -19,37 +21,12 @@ python scripts/convert_kandinsky_to_diffusers.py \ --prior_checkpoint_path /home/yiyi_huggingface_co/Kandinsky-2/checkpoints_Kandinsky_2.1/prior_fp16.ckpt \ --clip_stat_path /home/yiyi_huggingface_co/Kandinsky-2/checkpoints_Kandinsky_2.1/ViT-L-14_stats.th \ + --text2img_checkpoint_path /home/yiyi_huggingface_co/Kandinsky-2/checkpoints_Kandinsky_2.1/decoder_fp16.ckpt \ --dump_path ./kandinsky_model \ - --debug prior + --debug text2img ``` """ -def split_attentions(*, weight, bias, split, chunk_size): - weights = [None] * split - biases = [None] * split - - weights_biases_idx = 0 - - for starting_row_index in range(0, weight.shape[0], chunk_size): - row_indices = torch.arange(starting_row_index, starting_row_index + chunk_size) - - weight_rows = weight[row_indices, :] - bias_rows = bias[row_indices] - - if weights[weights_biases_idx] is None: - assert weights[weights_biases_idx] is None - weights[weights_biases_idx] = weight_rows - biases[weights_biases_idx] = bias_rows - else: - 
assert weights[weights_biases_idx] is not None - weights[weights_biases_idx] = torch.concat([weights[weights_biases_idx], weight_rows]) - biases[weights_biases_idx] = torch.concat([biases[weights_biases_idx], bias_rows]) - - weights_biases_idx = (weights_biases_idx + 1) % split - - return weights, biases - - # prior @@ -236,6 +213,490 @@ def prior_ff_to_diffusers(checkpoint, *, diffusers_ff_prefix, original_ff_prefix # done prior +# unet + +# We are hardcoding the model configuration for now. If we need to generalize to more model configurations, we can +# update then. + +UNET_CONFIG = { + "act_fn":"silu", + "attention_head_dim": 64, + "block_out_channels": (384, 768, 1152, 1536), + "center_input_sample": False, + "class_embed_type": "identity", + "cross_attention_dim": 768, + "down_block_types": ( + "ResnetDownsampleBlock2D", + "SimpleCrossAttnDownBlock2D", + "SimpleCrossAttnDownBlock2D", + "SimpleCrossAttnDownBlock2D", + ), + "downsample_padding": 1, + "dual_cross_attention": False, + "flip_sin_to_cos": True, + "freq_shift": 0, + "in_channels": 4, + "layers_per_block": 3, + "mid_block_scale_factor": 1, + "mid_block_type": "UNetMidBlock2DSimpleCrossAttn", + "norm_eps": 1e-05, + "norm_num_groups": 32, + "only_cross_attention": False, + "out_channels": 8, + "resnet_time_scale_shift": "scale_shift", + "sample_size": 64, + "up_block_types": ( + "SimpleCrossAttnUpBlock2D", + "SimpleCrossAttnUpBlock2D", + "SimpleCrossAttnUpBlock2D", + "ResnetUpsampleBlock2D", + ), + "upcast_attention": False, + "use_linear_projection": False +} + +def unet_model_from_original_config(): + model = UNet2DConditionModel(**UNET_CONFIG) + + return model + +def unet_original_checkpoint_to_diffusers_checkpoint(model, checkpoint): + diffusers_checkpoint = {} + + num_head_channels = UNET_CONFIG["attention_head_dim"] + + diffusers_checkpoint.update(unet_time_embeddings(checkpoint)) + diffusers_checkpoint.update(unet_conv_in(checkpoint)) + + # .input_blocks -> .down_blocks + + original_down_block_idx = 1 + + for diffusers_down_block_idx in range(len(model.down_blocks)): + checkpoint_update, num_original_down_blocks = unet_downblock_to_diffusers_checkpoint( + model, + checkpoint, + diffusers_down_block_idx=diffusers_down_block_idx, + original_down_block_idx=original_down_block_idx, + num_head_channels=num_head_channels, + ) + + original_down_block_idx += num_original_down_blocks + + diffusers_checkpoint.update(checkpoint_update) + + # done .input_blocks -> .down_blocks + + diffusers_checkpoint.update( + unet_midblock_to_diffusers_checkpoint( + model, + checkpoint, + num_head_channels=num_head_channels, + ) + ) + + # .output_blocks -> .up_blocks + + original_up_block_idx = 0 + + for diffusers_up_block_idx in range(len(model.up_blocks)): + checkpoint_update, num_original_up_blocks = unet_upblock_to_diffusers_checkpoint( + model, + checkpoint, + diffusers_up_block_idx=diffusers_up_block_idx, + original_up_block_idx=original_up_block_idx, + num_head_channels=num_head_channels, + ) + + original_up_block_idx += num_original_up_blocks + + diffusers_checkpoint.update(checkpoint_update) + + # done .output_blocks -> .up_blocks + + diffusers_checkpoint.update(unet_conv_norm_out(checkpoint)) + diffusers_checkpoint.update(unet_conv_out(checkpoint)) + + return diffusers_checkpoint + +# done unet + +# text proj + +TEXT_PROJ_CONFIG = {} + +def text_proj_from_original_config(): + model = KandinskyTextProjModel(**TEXT_PROJ_CONFIG) + return model + +# Note that the input checkpoint is the original text2img model checkpoint +def 
text_proj_original_checkpoint_to_diffusers_checkpoint(checkpoint): + diffusers_checkpoint = { + # .text_seq_proj.0 -> .encoder_hidden_states_proj + "encoder_hidden_states_proj.weight": checkpoint["to_model_dim_n.weight"], + "encoder_hidden_states_proj.bias": checkpoint["to_model_dim_n.bias"], + # .clip_tok_proj -> .clip_extra_context_tokens_proj + "clip_extra_context_tokens_proj.weight": checkpoint["clip_to_seq.weight"], + "clip_extra_context_tokens_proj.bias": checkpoint["clip_to_seq.bias"], + # .proj_n -> .embedding_proj + "embedding_proj.weight": checkpoint["proj_n.weight"], + "embedding_proj.bias": checkpoint["proj_n.bias"], + # .ln_model_n -> .embedding_norm + "embedding_norm.weight": checkpoint["ln_model_n.weight"], + "embedding_norm.bias": checkpoint["ln_model_n.bias"], + # .clip_emb -> .clip_image_embeddings_project_to_time_embeddings + "clip_image_embeddings_project_to_time_embeddings.weight": checkpoint["img_layer.weight"], + "clip_image_embeddings_project_to_time_embeddings.bias": checkpoint["img_layer.bias"], + } + + return diffusers_checkpoint + +# unet utils + +# .time_embed -> .time_embedding +def unet_time_embeddings(checkpoint): + diffusers_checkpoint = {} + + diffusers_checkpoint.update( + { + "time_embedding.linear_1.weight": checkpoint["time_embed.0.weight"], + "time_embedding.linear_1.bias": checkpoint["time_embed.0.bias"], + "time_embedding.linear_2.weight": checkpoint["time_embed.2.weight"], + "time_embedding.linear_2.bias": checkpoint["time_embed.2.bias"], + } + ) + + return diffusers_checkpoint + +# .input_blocks.0 -> .conv_in +def unet_conv_in(checkpoint): + diffusers_checkpoint = {} + + diffusers_checkpoint.update( + { + "conv_in.weight": checkpoint["input_blocks.0.0.weight"], + "conv_in.bias": checkpoint["input_blocks.0.0.bias"], + } + ) + + return diffusers_checkpoint + +# .out.0 -> .conv_norm_out +def unet_conv_norm_out(checkpoint): + diffusers_checkpoint = {} + + diffusers_checkpoint.update( + { + "conv_norm_out.weight": checkpoint["out.0.weight"], + "conv_norm_out.bias": checkpoint["out.0.bias"], + } + ) + + return diffusers_checkpoint + +# .out.2 -> .conv_out +def unet_conv_out(checkpoint): + diffusers_checkpoint = {} + + diffusers_checkpoint.update( + { + "conv_out.weight": checkpoint["out.2.weight"], + "conv_out.bias": checkpoint["out.2.bias"], + } + ) + + return diffusers_checkpoint + +# .input_blocks -> .down_blocks +def unet_downblock_to_diffusers_checkpoint( + model, checkpoint, *, diffusers_down_block_idx, original_down_block_idx, num_head_channels +): + diffusers_checkpoint = {} + + diffusers_resnet_prefix = f"down_blocks.{diffusers_down_block_idx}.resnets" + original_down_block_prefix = "input_blocks" + + down_block = model.down_blocks[diffusers_down_block_idx] + + num_resnets = len(down_block.resnets) + + if down_block.downsamplers is None: + downsampler = False + else: + assert len(down_block.downsamplers) == 1 + downsampler = True + # The downsample block is also a resnet + num_resnets += 1 + + for resnet_idx_inc in range(num_resnets): + full_resnet_prefix = f"{original_down_block_prefix}.{original_down_block_idx + resnet_idx_inc}.0" + + if downsampler and resnet_idx_inc == num_resnets - 1: + # this is a downsample block + full_diffusers_resnet_prefix = f"down_blocks.{diffusers_down_block_idx}.downsamplers.0" + else: + # this is a regular resnet block + full_diffusers_resnet_prefix = f"{diffusers_resnet_prefix}.{resnet_idx_inc}" + + diffusers_checkpoint.update( + resnet_to_diffusers_checkpoint( + checkpoint, resnet_prefix=full_resnet_prefix, 
diffusers_resnet_prefix=full_diffusers_resnet_prefix + ) + ) + + if hasattr(down_block, "attentions"): + num_attentions = len(down_block.attentions) + diffusers_attention_prefix = f"down_blocks.{diffusers_down_block_idx}.attentions" + + for attention_idx_inc in range(num_attentions): + full_attention_prefix = f"{original_down_block_prefix}.{original_down_block_idx + attention_idx_inc}.1" + full_diffusers_attention_prefix = f"{diffusers_attention_prefix}.{attention_idx_inc}" + + diffusers_checkpoint.update( + attention_to_diffusers_checkpoint( + checkpoint, + attention_prefix=full_attention_prefix, + diffusers_attention_prefix=full_diffusers_attention_prefix, + num_head_channels=num_head_channels, + ) + ) + + num_original_down_blocks = num_resnets + + return diffusers_checkpoint, num_original_down_blocks + +# .middle_block -> .mid_block +def unet_midblock_to_diffusers_checkpoint(model, checkpoint, *, num_head_channels): + diffusers_checkpoint = {} + + # block 0 + + original_block_idx = 0 + + diffusers_checkpoint.update( + resnet_to_diffusers_checkpoint( + checkpoint, + diffusers_resnet_prefix="mid_block.resnets.0", + resnet_prefix=f"middle_block.{original_block_idx}", + ) + ) + + original_block_idx += 1 + + # optional block 1 + + if hasattr(model.mid_block, "attentions") and model.mid_block.attentions[0] is not None: + diffusers_checkpoint.update( + attention_to_diffusers_checkpoint( + checkpoint, + diffusers_attention_prefix="mid_block.attentions.0", + attention_prefix=f"middle_block.{original_block_idx}", + num_head_channels=num_head_channels, + ) + ) + original_block_idx += 1 + + # block 1 or block 2 + + diffusers_checkpoint.update( + resnet_to_diffusers_checkpoint( + checkpoint, + diffusers_resnet_prefix="mid_block.resnets.1", + resnet_prefix=f"middle_block.{original_block_idx}", + ) + ) + + return diffusers_checkpoint + + +# .output_blocks -> .up_blocks +def unet_upblock_to_diffusers_checkpoint( + model, checkpoint, *, diffusers_up_block_idx, original_up_block_idx, num_head_channels +): + diffusers_checkpoint = {} + + diffusers_resnet_prefix = f"up_blocks.{diffusers_up_block_idx}.resnets" + original_up_block_prefix = "output_blocks" + + up_block = model.up_blocks[diffusers_up_block_idx] + + num_resnets = len(up_block.resnets) + + if up_block.upsamplers is None: + upsampler = False + else: + assert len(up_block.upsamplers) == 1 + upsampler = True + # The upsample block is also a resnet + num_resnets += 1 + + has_attentions = hasattr(up_block, "attentions") + + for resnet_idx_inc in range(num_resnets): + if upsampler and resnet_idx_inc == num_resnets - 1: + # this is an upsample block + if has_attentions: + # There is a middle attention block that we skip + original_resnet_block_idx = 2 + else: + original_resnet_block_idx = 1 + + # we add the `minus 1` because the last two resnets are stuck together in the same output block + full_resnet_prefix = ( + f"{original_up_block_prefix}.{original_up_block_idx + resnet_idx_inc - 1}.{original_resnet_block_idx}" + ) + + full_diffusers_resnet_prefix = f"up_blocks.{diffusers_up_block_idx}.upsamplers.0" + else: + # this is a regular resnet block + full_resnet_prefix = f"{original_up_block_prefix}.{original_up_block_idx + resnet_idx_inc}.0" + full_diffusers_resnet_prefix = f"{diffusers_resnet_prefix}.{resnet_idx_inc}" + + diffusers_checkpoint.update( + resnet_to_diffusers_checkpoint( + checkpoint, resnet_prefix=full_resnet_prefix, diffusers_resnet_prefix=full_diffusers_resnet_prefix + ) + ) + + if has_attentions: + num_attentions = 
len(up_block.attentions) + diffusers_attention_prefix = f"up_blocks.{diffusers_up_block_idx}.attentions" + + for attention_idx_inc in range(num_attentions): + full_attention_prefix = f"{original_up_block_prefix}.{original_up_block_idx + attention_idx_inc}.1" + full_diffusers_attention_prefix = f"{diffusers_attention_prefix}.{attention_idx_inc}" + + diffusers_checkpoint.update( + attention_to_diffusers_checkpoint( + checkpoint, + attention_prefix=full_attention_prefix, + diffusers_attention_prefix=full_diffusers_attention_prefix, + num_head_channels=num_head_channels, + ) + ) + + num_original_down_blocks = num_resnets - 1 if upsampler else num_resnets + + return diffusers_checkpoint, num_original_down_blocks + + +def resnet_to_diffusers_checkpoint(checkpoint, *, diffusers_resnet_prefix, resnet_prefix): + diffusers_checkpoint = { + f"{diffusers_resnet_prefix}.norm1.weight": checkpoint[f"{resnet_prefix}.in_layers.0.weight"], + f"{diffusers_resnet_prefix}.norm1.bias": checkpoint[f"{resnet_prefix}.in_layers.0.bias"], + f"{diffusers_resnet_prefix}.conv1.weight": checkpoint[f"{resnet_prefix}.in_layers.2.weight"], + f"{diffusers_resnet_prefix}.conv1.bias": checkpoint[f"{resnet_prefix}.in_layers.2.bias"], + f"{diffusers_resnet_prefix}.time_emb_proj.weight": checkpoint[f"{resnet_prefix}.emb_layers.1.weight"], + f"{diffusers_resnet_prefix}.time_emb_proj.bias": checkpoint[f"{resnet_prefix}.emb_layers.1.bias"], + f"{diffusers_resnet_prefix}.norm2.weight": checkpoint[f"{resnet_prefix}.out_layers.0.weight"], + f"{diffusers_resnet_prefix}.norm2.bias": checkpoint[f"{resnet_prefix}.out_layers.0.bias"], + f"{diffusers_resnet_prefix}.conv2.weight": checkpoint[f"{resnet_prefix}.out_layers.3.weight"], + f"{diffusers_resnet_prefix}.conv2.bias": checkpoint[f"{resnet_prefix}.out_layers.3.bias"], + } + + skip_connection_prefix = f"{resnet_prefix}.skip_connection" + + if f"{skip_connection_prefix}.weight" in checkpoint: + diffusers_checkpoint.update( + { + f"{diffusers_resnet_prefix}.conv_shortcut.weight": checkpoint[f"{skip_connection_prefix}.weight"], + f"{diffusers_resnet_prefix}.conv_shortcut.bias": checkpoint[f"{skip_connection_prefix}.bias"], + } + ) + + return diffusers_checkpoint + + +def attention_to_diffusers_checkpoint(checkpoint, *, diffusers_attention_prefix, attention_prefix, num_head_channels): + diffusers_checkpoint = {} + + # .norm -> .group_norm + diffusers_checkpoint.update( + { + f"{diffusers_attention_prefix}.group_norm.weight": checkpoint[f"{attention_prefix}.norm.weight"], + f"{diffusers_attention_prefix}.group_norm.bias": checkpoint[f"{attention_prefix}.norm.bias"], + } + ) + + # .qkv -> .{query, key, value} + [q_weight, k_weight, v_weight], [q_bias, k_bias, v_bias] = split_attentions( + weight=checkpoint[f"{attention_prefix}.qkv.weight"][:, :, 0], + bias=checkpoint[f"{attention_prefix}.qkv.bias"], + split=3, + chunk_size=num_head_channels, + ) + + diffusers_checkpoint.update( + { + f"{diffusers_attention_prefix}.to_q.weight": q_weight, + f"{diffusers_attention_prefix}.to_q.bias": q_bias, + f"{diffusers_attention_prefix}.to_k.weight": k_weight, + f"{diffusers_attention_prefix}.to_k.bias": k_bias, + f"{diffusers_attention_prefix}.to_v.weight": v_weight, + f"{diffusers_attention_prefix}.to_v.bias": v_bias, + } + ) + + # .encoder_kv -> .{context_key, context_value} + [encoder_k_weight, encoder_v_weight], [encoder_k_bias, encoder_v_bias] = split_attentions( + weight=checkpoint[f"{attention_prefix}.encoder_kv.weight"][:, :, 0], + bias=checkpoint[f"{attention_prefix}.encoder_kv.bias"], + split=2, + 
chunk_size=num_head_channels, + ) + + diffusers_checkpoint.update( + { + f"{diffusers_attention_prefix}.add_k_proj.weight": encoder_k_weight, + f"{diffusers_attention_prefix}.add_k_proj.bias": encoder_k_bias, + f"{diffusers_attention_prefix}.add_v_proj.weight": encoder_v_weight, + f"{diffusers_attention_prefix}.add_v_proj.bias": encoder_v_bias, + } + ) + + # .proj_out (1d conv) -> .proj_attn (linear) + diffusers_checkpoint.update( + { + f"{diffusers_attention_prefix}.to_out.0.weight": checkpoint[f"{attention_prefix}.proj_out.weight"][ + :, :, 0 + ], + f"{diffusers_attention_prefix}.to_out.0.bias": checkpoint[f"{attention_prefix}.proj_out.bias"], + } + ) + + return diffusers_checkpoint + + +# TODO maybe document and/or can do more efficiently (build indices in for loop and extract once for each split?) +def split_attentions(*, weight, bias, split, chunk_size): + weights = [None] * split + biases = [None] * split + + weights_biases_idx = 0 + + for starting_row_index in range(0, weight.shape[0], chunk_size): + row_indices = torch.arange(starting_row_index, starting_row_index + chunk_size) + + weight_rows = weight[row_indices, :] + bias_rows = bias[row_indices] + + if weights[weights_biases_idx] is None: + assert weights[weights_biases_idx] is None + weights[weights_biases_idx] = weight_rows + biases[weights_biases_idx] = bias_rows + else: + assert weights[weights_biases_idx] is not None + weights[weights_biases_idx] = torch.concat([weights[weights_biases_idx], weight_rows]) + biases[weights_biases_idx] = torch.concat([biases[weights_biases_idx], bias_rows]) + + weights_biases_idx = (weights_biases_idx + 1) % split + + return weights, biases + + +# done unet utils + + def prior(*, args, checkpoint_map_location): print("loading prior") @@ -258,6 +719,41 @@ def prior(*, args, checkpoint_map_location): return prior_model + +def text2img(*, args, checkpoint_map_location): + + print("loading text2img") + + text2img_checkpoint = torch.load(args.text2img_checkpoint_path, map_location=checkpoint_map_location) + + unet_model = unet_model_from_original_config() + + unet_diffusers_checkpoint = unet_original_checkpoint_to_diffusers_checkpoint( + unet_model, text2img_checkpoint + ) + + # text proj interlude + + # The original decoder implementation includes a set of parameters that are used + # for creating the `encoder_hidden_states` which are what the U-net is conditioned + # on. The diffusers conditional unet directly takes the encoder_hidden_states. We pull + # the parameters into the KandinskyTextProjModel class + text_proj_model = text_proj_from_original_config() + + text_proj_checkpoint = text_proj_original_checkpoint_to_diffusers_checkpoint(text2img_checkpoint) + + load_checkpoint_to_model(text_proj_checkpoint, text_proj_model, strict=True) + + + del text2img_checkpoint + + load_checkpoint_to_model(unet_diffusers_checkpoint, unet_model, strict=True) + + print("done loading text2img") + + return unet_model, text_proj_model + + def load_checkpoint_to_model(checkpoint, model, strict=False): with tempfile.NamedTemporaryFile() as file: torch.save(checkpoint, file.name) @@ -284,6 +780,13 @@ def load_checkpoint_to_model(checkpoint, model, strict=False): parser.add_argument( "--clip_stat_path", default=None, type=str, required=True, help="Path to the clip stats checkpoint to convert." 
) + parser.add_argument( + "--text2img_checkpoint_path", + default=None, + type=str, + required=True, + help="Path to the text2img checkpoint to convert.", + ) parser.add_argument( "--checkpoint_load_device", default="cpu", @@ -314,5 +817,9 @@ def load_checkpoint_to_model(checkpoint, model, strict=False): elif args.debug == "prior": prior_model = prior(args=args, checkpoint_map_location=checkpoint_map_location) prior_model.save_pretrained(args.dump_path) + elif args.debug == 'text2img': + unet_model, text_proj_model = text2img(args=args, checkpoint_map_location=checkpoint_map_location) + unet_model.save_pretrained(f"{args.dump_path}/unet") + text_proj_model.save_pretrained(f"{args.dump_path}/text_proj") else: raise ValueError(f"unknown debug value : {args.debug}") \ No newline at end of file diff --git a/src/diffusers/pipelines/kandinsky/text_proj.py b/src/diffusers/pipelines/kandinsky/text_proj.py new file mode 100644 index 000000000000..2aa2bdec4ea7 --- /dev/null +++ b/src/diffusers/pipelines/kandinsky/text_proj.py @@ -0,0 +1,77 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +from torch import nn + +from ...configuration_utils import ConfigMixin, register_to_config +from ...models import ModelMixin + + +class KandinskyTextProjModel(ModelMixin, ConfigMixin): + """ + Utility class for Kandingsky text embeddings. Used to combine the image and text embeddings into a format usable by the + unet diffusion model. 
+ """ + + @register_to_config + def __init__( + self, + *, + clip_extra_context_tokens: int = 10, + clip_text_encoder_hidden_states_dim: int = 1024, + clip_embeddings_dim: int = 768, + time_embed_dim: int = 1536, + cross_attention_dim: int = 768, + ): + super().__init__() + + # parameters for additional clip time embeddings + self.embedding_proj = nn.Linear(clip_embeddings_dim, time_embed_dim) + self.embedding_norm = nn.LayerNorm(time_embed_dim) + self.clip_image_embeddings_project_to_time_embeddings = nn.Linear(clip_embeddings_dim, time_embed_dim) + + # parameters for encoder hidden states + self.clip_extra_context_tokens = clip_extra_context_tokens + self.clip_extra_context_tokens_proj = nn.Linear( + clip_embeddings_dim, self.clip_extra_context_tokens * cross_attention_dim + ) + self.encoder_hidden_states_proj = nn.Linear(clip_text_encoder_hidden_states_dim, cross_attention_dim) + + def forward(self, *, image_embeddings, prompt_embeds, text_encoder_hidden_states): + + # The image embeddings batch size and the text embeddings batch size are equal + assert image_embeddings.shape[0] == prompt_embeds.shape[0] == text_encoder_hidden_states.shape[0] + + batch_size = prompt_embeds.shape[0] + + # project text and image embeddings to add to the existing timestep embedding + time_projected_prompt_embeds = self.embedding_proj(prompt_embeds) + time_projected_prompt_embeds = self.embedding_norm(time_projected_prompt_embeds) + time_projected_image_embeddings = self.clip_image_embeddings_project_to_time_embeddings(image_embeddings) + additive_clip_time_embeddings = time_projected_image_embeddings + time_projected_prompt_embeds + + # image_embeddings -> linear (2 x 7680) -> (2, 10, 768) + # text_encoder_hidden_states -> linear -> (2, 77, 768) + # (2, 87, 768) + # ... 
and by projecting CLIP embeddings into 10 + # extra tokens of context that are concatenated to the sequence of outputs from the GLIDE text encoder" + clip_extra_context_tokens = self.clip_extra_context_tokens_proj(image_embeddings) + clip_extra_context_tokens = clip_extra_context_tokens.reshape(batch_size, -1, self.clip_extra_context_tokens) + clip_extra_context_tokens = clip_extra_context_tokens.permute(0, 2, 1) + + text_encoder_hidden_states = self.encoder_hidden_states_proj(text_encoder_hidden_states) + text_encoder_hidden_states = torch.cat([clip_extra_context_tokens, text_encoder_hidden_states], dim=1) + + return text_encoder_hidden_states, additive_clip_time_embeddings From fee1bba57e838a8f91adfe8d628cc9657ce2f697 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 6 May 2023 07:50:21 +0000 Subject: [PATCH 005/182] soft dependency on m-clip --- src/diffusers/__init__.py | 9 +++++++++ src/diffusers/pipelines/__init__.py | 9 +++++++++ src/diffusers/pipelines/kandinsky/__init__.py | 7 ++++--- src/diffusers/utils/__init__.py | 1 + .../dummy_torch_and_transformers_objects.py | 15 +++++++++++++++ src/diffusers/utils/import_utils.py | 19 +++++++++++++++++++ 6 files changed, 57 insertions(+), 3 deletions(-) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 92ee4c0bf040..fb10b02405b9 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -13,6 +13,7 @@ is_scipy_available, is_torch_available, is_torchsde_available, + is_multilingual_clip_available, is_transformers_available, is_transformers_version, is_unidecode_available, @@ -170,6 +171,14 @@ else: from .pipelines import StableDiffusionKDiffusionPipeline +try: + if not(is_torch_available() and is_transformers_available() and is_multilingual_clip_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from .utils.dummy_torch_and_transformers_and_multilingual_clip_objects import * # noqa F403 +else: + from .pipelines import KandinskyPipeline + try: if not (is_torch_available() and is_transformers_available() and is_onnx_available()): raise OptionalDependencyNotAvailable() diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index b38cddeffac4..b8b640fcc38a 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -7,6 +7,7 @@ is_onnx_available, is_torch_available, is_transformers_available, + is_multilingual_clip_available, ) @@ -119,6 +120,14 @@ else: from .stable_diffusion import StableDiffusionKDiffusionPipeline +try: + if not (is_torch_available() and is_transformers_available() and is_multilingual_clip_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ..utils.dummy_torch_and_transformers_and_multilingual_clip_diffusion_objects import * # noqa F403 +else: + from .kandinsky import KandinskyPipeline + try: if not is_flax_available(): raise OptionalDependencyNotAvailable() diff --git a/src/diffusers/pipelines/kandinsky/__init__.py b/src/diffusers/pipelines/kandinsky/__init__.py index fb4746bd6087..54a77f3037f6 100644 --- a/src/diffusers/pipelines/kandinsky/__init__.py +++ b/src/diffusers/pipelines/kandinsky/__init__.py @@ -3,14 +3,15 @@ is_torch_available, is_transformers_available, is_transformers_version, + is_multilingual_clip_available, ) try: - if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.25.0")): + if not (is_transformers_available() and is_torch_available() and 
is_transformers_version(">=", "4.25.0")) and is_multilingual_clip_available: raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - print("to-do") -# from ...utils.dummy_torch_and_transformers_objects import UnCLIPImageVariationPipeline, UnCLIPPipeline + from ...utils.dummy_torch_and_transformers_and_multilingual_clip_diffusion_objects import KandinskyPipeline else: from .pipeline_kandinsky import KandinskyPipeline + from .text_proj import KandinskyTextProjModel diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py index cd3a1b8f3dd4..31b660086b22 100644 --- a/src/diffusers/utils/__init__.py +++ b/src/diffusers/utils/__init__.py @@ -75,6 +75,7 @@ is_transformers_version, is_unidecode_available, is_wandb_available, + is_multilingual_clip_available, is_xformers_available, requires_backends, ) diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py index f3708107e82a..2c0996e58e8a 100644 --- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py +++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -542,6 +542,21 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "transformers"]) +class KandinskyPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + class VersatileDiffusionDualGuidedPipeline(metaclass=DummyObject): _backends = ["torch", "transformers"] diff --git a/src/diffusers/utils/import_utils.py b/src/diffusers/utils/import_utils.py index 4ded0f272462..27ce5c2c2f2b 100644 --- a/src/diffusers/utils/import_utils.py +++ b/src/diffusers/utils/import_utils.py @@ -295,6 +295,14 @@ _torchsde_available = False +_multilingual_clip_available = importlib.util.find_spec("multilingual-clip") is not None +try: + _multilingual_clip_version = importlib_metadata.version("multilingual-clip") + logger.debug(f"Successfully imported multilingual-clip version {_multilingual_clip_version}") +except importlib_metadata.PackageNotFoundError: + _multilingual_clip_available = False + + def is_torch_available(): return _torch_available @@ -383,6 +391,10 @@ def is_torchsde_available(): return _torchsde_available +def is_multilingual_clip_available(): + return is_multilingual_clip_available + + # docstyle-ignore FLAX_IMPORT_ERROR = """ {0} requires the FLAX library but it was not found in your environment. Checkout the instructions on the @@ -492,6 +504,12 @@ def is_torchsde_available(): """ +# docstyle-ignore +MULTILINGUAL_CLIP_IMPORT_ERROR = """ +{0} requires the multilingual-clip library but it was not found in your environment. 
You can install it with pip: `pip install multilingual-clip` +""" + + BACKENDS_MAPPING = OrderedDict( [ ("bs4", (is_bs4_available, BS4_IMPORT_ERROR)), @@ -512,6 +530,7 @@ def is_torchsde_available(): ("compel", (_compel_available, COMPEL_IMPORT_ERROR)), ("ftfy", (is_ftfy_available, FTFY_IMPORT_ERROR)), ("torchsde", (_torchsde_available, TORCHSDE_IMPORT_ERROR)), + ("multilingual-clip", (_multilingual_clip_available, MULTILINGUAL_CLIP_IMPORT_ERROR)), ] ) From 06499f70a2c0a710e16c96b6eeaec76245c88055 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 6 May 2023 08:29:19 +0000 Subject: [PATCH 006/182] add dynamic thresholding to unclip --- src/diffusers/schedulers/scheduling_unclip.py | 51 ++++++++++++++++++- 1 file changed, 49 insertions(+), 2 deletions(-) diff --git a/src/diffusers/schedulers/scheduling_unclip.py b/src/diffusers/schedulers/scheduling_unclip.py index 6403ee3f1518..218d4588a88d 100644 --- a/src/diffusers/schedulers/scheduling_unclip.py +++ b/src/diffusers/schedulers/scheduling_unclip.py @@ -97,6 +97,15 @@ class UnCLIPScheduler(SchedulerMixin, ConfigMixin): prediction_type (`str`, default `epsilon`, optional): prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion process) or `sample` (directly predicting the noisy sample`) + thresholding (`bool`, default `False`): + whether to use the "dynamic thresholding" method (introduced by Imagen, https://arxiv.org/abs/2205.11487). + Note that the thresholding method is unsuitable for latent-space diffusion models (such as + stable-diffusion). + dynamic_thresholding_ratio (`float`, default `0.995`): + the ratio for the dynamic thresholding method. Default is `0.995`, the same as Imagen + (https://arxiv.org/abs/2205.11487). Valid only when `thresholding=True`. + sample_max_value (`float`, default `1.0`): + the threshold value for dynamic thresholding. Valid only when `thresholding=True`. """ @register_to_config @@ -106,6 +115,9 @@ def __init__( variance_type: str = "fixed_small_log", clip_sample: bool = True, clip_sample_range: Optional[float] = 1.0, + thresholding: bool = False, + dynamic_thresholding_ratio: float = 0.995, + sample_max_value: float = 1.0, prediction_type: str = "epsilon", beta_schedule: str = "squaredcos_cap_v2", ): @@ -194,6 +206,40 @@ def _get_variance(self, t, prev_timestep=None, predicted_variance=None, variance return variance + def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + """ + "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the + prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by + s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing + pixels from saturation at each step. We find that dynamic thresholding results in significantly better + photorealism as well as better image-text alignment, especially when using very large guidance weights." 
+ + https://arxiv.org/abs/2205.11487 + """ + dtype = sample.dtype + batch_size, channels, height, width = sample.shape + + if dtype not in (torch.float32, torch.float64): + sample = sample.float() # upcast for quantile calculation, and clamp not implemented for cpu half + + # Flatten sample for doing quantile calculation along each image + sample = sample.reshape(batch_size, channels * height * width) + + abs_sample = sample.abs() # "a certain percentile absolute pixel value" + + s = torch.quantile(abs_sample, self.config.dynamic_thresholding_ratio, dim=1) + s = torch.clamp( + s, min=1, max=self.config.sample_max_value + ) # When clamped to min=1, equivalent to standard clipping to [-1, 1] + + s = s.unsqueeze(1) # (batch_size, 1) because clamp will broadcast along dim=0 + sample = torch.clamp(sample, -s, s) / s # "we threshold xt0 to the range [-s, s] and then divide by s" + + sample = sample.reshape(batch_size, channels, height, width) + sample = sample.to(dtype) + + return sample + def step( self, model_output: torch.FloatTensor, @@ -258,12 +304,13 @@ def step( " for the UnCLIPScheduler." ) - # 3. Clip "predicted x_0" + # 3. Clip/threhold "predicted x_0" if self.config.clip_sample: pred_original_sample = torch.clamp( pred_original_sample, -self.config.clip_sample_range, self.config.clip_sample_range ) - + if self.config.thresholding: + pred_original_sample = self._threshold_sample(pred_original_sample) # 4. Compute coefficients for pred_original_sample x_0 and current sample x_t # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf pred_original_sample_coeff = (alpha_prod_t_prev ** (0.5) * beta) / beta_prod_t From 2e93e81f669a2c80bfb49b91a821c04bab500997 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sun, 7 May 2023 00:27:52 +0000 Subject: [PATCH 007/182] Revert "soft dependency on m-clip" This reverts commit fee1bba57e838a8f91adfe8d628cc9657ce2f697. 
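The dynamic thresholding added to `UnCLIPScheduler` in PATCH 006 above follows the Imagen recipe quoted in its docstring: at each step, take a high percentile `s` of `|x_0|` per sample, clamp `s` to at least 1 (and at most `sample_max_value`), clip `x_0` to `[-s, s]`, and divide by `s`. A standalone numeric sketch of that operation is shown below; it mirrors `_threshold_sample` but is not the scheduler code itself, and the tensor is a random placeholder.

```python
import torch

def dynamic_threshold(x0: torch.Tensor, ratio: float = 0.995, max_value: float = 1.0) -> torch.Tensor:
    # x0: (batch, channels, height, width) prediction of the clean sample x_0
    batch = x0.shape[0]
    flat = x0.reshape(batch, -1)
    s = torch.quantile(flat.abs(), ratio, dim=1)              # per-sample percentile of |x_0|
    s = torch.clamp(s, min=1.0, max=max_value).unsqueeze(1)   # s == 1 reduces to plain [-1, 1] clipping
    return (torch.clamp(flat, -s, s) / s).reshape(x0.shape)

x0 = torch.randn(2, 4, 8, 8) * 3.0                        # deliberately over-saturated prediction
print(dynamic_threshold(x0, max_value=1.5).abs().max())   # <= 1.0 after rescaling by s
```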
--- src/diffusers/__init__.py | 9 --------- src/diffusers/pipelines/__init__.py | 9 --------- src/diffusers/pipelines/kandinsky/__init__.py | 7 +++---- src/diffusers/utils/__init__.py | 1 - .../dummy_torch_and_transformers_objects.py | 15 --------------- src/diffusers/utils/import_utils.py | 19 ------------------- 6 files changed, 3 insertions(+), 57 deletions(-) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index fb10b02405b9..92ee4c0bf040 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -13,7 +13,6 @@ is_scipy_available, is_torch_available, is_torchsde_available, - is_multilingual_clip_available, is_transformers_available, is_transformers_version, is_unidecode_available, @@ -171,14 +170,6 @@ else: from .pipelines import StableDiffusionKDiffusionPipeline -try: - if not(is_torch_available() and is_transformers_available() and is_multilingual_clip_available()): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - from .utils.dummy_torch_and_transformers_and_multilingual_clip_objects import * # noqa F403 -else: - from .pipelines import KandinskyPipeline - try: if not (is_torch_available() and is_transformers_available() and is_onnx_available()): raise OptionalDependencyNotAvailable() diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index b8b640fcc38a..b38cddeffac4 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -7,7 +7,6 @@ is_onnx_available, is_torch_available, is_transformers_available, - is_multilingual_clip_available, ) @@ -120,14 +119,6 @@ else: from .stable_diffusion import StableDiffusionKDiffusionPipeline -try: - if not (is_torch_available() and is_transformers_available() and is_multilingual_clip_available()): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - from ..utils.dummy_torch_and_transformers_and_multilingual_clip_diffusion_objects import * # noqa F403 -else: - from .kandinsky import KandinskyPipeline - try: if not is_flax_available(): raise OptionalDependencyNotAvailable() diff --git a/src/diffusers/pipelines/kandinsky/__init__.py b/src/diffusers/pipelines/kandinsky/__init__.py index 54a77f3037f6..fb4746bd6087 100644 --- a/src/diffusers/pipelines/kandinsky/__init__.py +++ b/src/diffusers/pipelines/kandinsky/__init__.py @@ -3,15 +3,14 @@ is_torch_available, is_transformers_available, is_transformers_version, - is_multilingual_clip_available, ) try: - if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.25.0")) and is_multilingual_clip_available: + if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.25.0")): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ...utils.dummy_torch_and_transformers_and_multilingual_clip_diffusion_objects import KandinskyPipeline + print("to-do") +# from ...utils.dummy_torch_and_transformers_objects import UnCLIPImageVariationPipeline, UnCLIPPipeline else: from .pipeline_kandinsky import KandinskyPipeline - from .text_proj import KandinskyTextProjModel diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py index 31b660086b22..cd3a1b8f3dd4 100644 --- a/src/diffusers/utils/__init__.py +++ b/src/diffusers/utils/__init__.py @@ -75,7 +75,6 @@ is_transformers_version, is_unidecode_available, is_wandb_available, - is_multilingual_clip_available, is_xformers_available, requires_backends, ) diff --git 
a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py index 2c0996e58e8a..f3708107e82a 100644 --- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py +++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -542,21 +542,6 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "transformers"]) -class KandinskyPipeline(metaclass=DummyObject): - _backends = ["torch", "transformers"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch", "transformers"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["torch", "transformers"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["torch", "transformers"]) - - class VersatileDiffusionDualGuidedPipeline(metaclass=DummyObject): _backends = ["torch", "transformers"] diff --git a/src/diffusers/utils/import_utils.py b/src/diffusers/utils/import_utils.py index 27ce5c2c2f2b..4ded0f272462 100644 --- a/src/diffusers/utils/import_utils.py +++ b/src/diffusers/utils/import_utils.py @@ -295,14 +295,6 @@ _torchsde_available = False -_multilingual_clip_available = importlib.util.find_spec("multilingual-clip") is not None -try: - _multilingual_clip_version = importlib_metadata.version("multilingual-clip") - logger.debug(f"Successfully imported multilingual-clip version {_multilingual_clip_version}") -except importlib_metadata.PackageNotFoundError: - _multilingual_clip_available = False - - def is_torch_available(): return _torch_available @@ -391,10 +383,6 @@ def is_torchsde_available(): return _torchsde_available -def is_multilingual_clip_available(): - return is_multilingual_clip_available - - # docstyle-ignore FLAX_IMPORT_ERROR = """ {0} requires the FLAX library but it was not found in your environment. Checkout the instructions on the @@ -504,12 +492,6 @@ def is_multilingual_clip_available(): """ -# docstyle-ignore -MULTILINGUAL_CLIP_IMPORT_ERROR = """ -{0} requires the multilingual-clip library but it was not found in your environment. You can install it with pip: `pip install multilingual-clip` -""" - - BACKENDS_MAPPING = OrderedDict( [ ("bs4", (is_bs4_available, BS4_IMPORT_ERROR)), @@ -530,7 +512,6 @@ def is_multilingual_clip_available(): ("compel", (_compel_available, COMPEL_IMPORT_ERROR)), ("ftfy", (is_ftfy_available, FTFY_IMPORT_ERROR)), ("torchsde", (_torchsde_available, TORCHSDE_IMPORT_ERROR)), - ("multilingual-clip", (_multilingual_clip_available, MULTILINGUAL_CLIP_IMPORT_ERROR)), ] ) From a6bacf29ce4c2187cf702a58629713539e34953f Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sun, 7 May 2023 05:57:41 +0000 Subject: [PATCH 008/182] fix a bug in text_proj --- src/diffusers/pipelines/kandinsky/text_proj.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/text_proj.py b/src/diffusers/pipelines/kandinsky/text_proj.py index 2aa2bdec4ea7..fc485f47a632 100644 --- a/src/diffusers/pipelines/kandinsky/text_proj.py +++ b/src/diffusers/pipelines/kandinsky/text_proj.py @@ -68,8 +68,7 @@ def forward(self, *, image_embeddings, prompt_embeds, text_encoder_hidden_states # ... 
and by projecting CLIP embeddings into 10 # extra tokens of context that are concatenated to the sequence of outputs from the GLIDE text encoder" clip_extra_context_tokens = self.clip_extra_context_tokens_proj(image_embeddings) - clip_extra_context_tokens = clip_extra_context_tokens.reshape(batch_size, -1, self.clip_extra_context_tokens) - clip_extra_context_tokens = clip_extra_context_tokens.permute(0, 2, 1) + clip_extra_context_tokens = clip_extra_context_tokens.reshape(batch_size, self.clip_extra_context_tokens, -1) text_encoder_hidden_states = self.encoder_hidden_states_proj(text_encoder_hidden_states) text_encoder_hidden_states = torch.cat([clip_extra_context_tokens, text_encoder_hidden_states], dim=1) From 3888b2d74134ffd26e3cb5443fd296f8c40da0bb Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 8 May 2023 17:17:13 +0000 Subject: [PATCH 009/182] add PipelineKandinsky update init testing adding testing lines finalize pipeline remove print lines clean up scheduling_unclip.py --- src/diffusers/__init__.py | 1 + src/diffusers/pipelines/__init__.py | 2 +- src/diffusers/pipelines/kandinsky/__init__.py | 3 +- .../pipelines/kandinsky/pipeline_kandinsky.py | 197 +++++++++++++++++- src/diffusers/schedulers/scheduling_unclip.py | 46 +++- 5 files changed, 230 insertions(+), 19 deletions(-) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 92ee4c0bf040..ae38cb005bfb 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -154,6 +154,7 @@ TextToVideoZeroPipeline, UnCLIPImageVariationPipeline, UnCLIPPipeline, + KandinskyPriorPipeline, KandinskyPipeline, VersatileDiffusionDualGuidedPipeline, VersatileDiffusionImageVariationPipeline, diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index b38cddeffac4..041b34089af9 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -79,7 +79,7 @@ from .stable_diffusion_safe import StableDiffusionPipelineSafe from .text_to_video_synthesis import TextToVideoSDPipeline, TextToVideoZeroPipeline from .unclip import UnCLIPImageVariationPipeline, UnCLIPPipeline - from .kandinsky import KandinskyPipeline + from .kandinsky import KandinskyPipeline, KandinskyPriorPipeline from .versatile_diffusion import ( VersatileDiffusionDualGuidedPipeline, VersatileDiffusionImageVariationPipeline, diff --git a/src/diffusers/pipelines/kandinsky/__init__.py b/src/diffusers/pipelines/kandinsky/__init__.py index fb4746bd6087..d42fc574e5e0 100644 --- a/src/diffusers/pipelines/kandinsky/__init__.py +++ b/src/diffusers/pipelines/kandinsky/__init__.py @@ -13,4 +13,5 @@ print("to-do") # from ...utils.dummy_torch_and_transformers_objects import UnCLIPImageVariationPipeline, UnCLIPPipeline else: - from .pipeline_kandinsky import KandinskyPipeline + from .pipeline_kandinsky import KandinskyPipeline, KandinskyPriorPipeline + from .text_proj import KandinskyTextProjModel diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py index 83de5526d04e..e8b2af450515 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py @@ -21,6 +21,7 @@ from ...models import PriorTransformer, UNet2DConditionModel from ...pipelines import DiffusionPipeline from ...schedulers import UnCLIPScheduler +from .text_proj import KandinskyTextProjModel from ...utils import ( logging, @@ -29,7 +30,17 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name 
-class KandinskyPipeline(DiffusionPipeline): + +def get_new_h_w(h, w): + new_h = h // 64 + if h % 64 != 0: + new_h += 1 + new_w = w // 64 + if w % 64 != 0: + new_w += 1 + return new_h * 8, new_w * 8 + +class KandinskyPriorPipeline(DiffusionPipeline): """ Pipeline for generate image prior for Kandinsky @@ -37,21 +48,21 @@ class KandinskyPipeline(DiffusionPipeline): library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) Args: - text_encoder ([`CLIPTextModelWithProjection`]): + prior ([`PriorTransformer`]): + The canonincal unCLIP prior to approximate the image embedding from the text embedding. + prior_text_encoder ([`CLIPTextModelWithProjection`]): Frozen text-encoder. prior_tokenizer (`CLIPTokenizer`): Tokenizer of class [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - prior ([`PriorTransformer`]): - The canonincal unCLIP prior to approximate the image embedding from the text embedding. - scheduler ([`UnCLIPScheduler`]): + prior_scheduler ([`UnCLIPScheduler`]): A scheduler to be used in combination with `prior` to generate image embedding. """ def __init__( self, prior: PriorTransformer, - text_encoder: CLIPTextModelWithProjection, + prior_text_encoder: CLIPTextModelWithProjection, prior_tokenizer: CLIPTokenizer, prior_scheduler: UnCLIPScheduler, ): @@ -59,7 +70,7 @@ def __init__( self.register_modules( prior=prior, - text_encoder=text_encoder, + prior_text_encoder=prior_text_encoder, prior_tokenizer=prior_tokenizer, prior_scheduler=prior_scheduler, ) @@ -110,7 +121,7 @@ def _encode_prompt( ) text_input_ids = text_input_ids[:, : self.prior_tokenizer.model_max_length] - text_encoder_output = self.text_encoder(text_input_ids.to(device)) + text_encoder_output = self.prior_text_encoder(text_input_ids.to(device)) prompt_embeds = text_encoder_output.text_embeds text_encoder_hidden_states = text_encoder_output.last_hidden_state @@ -147,7 +158,7 @@ def _encode_prompt( return_tensors="pt", ) uncond_text_mask = uncond_input.attention_mask.bool().to(device) - negative_prompt_embeds_text_encoder_output = self.text_encoder(uncond_input.input_ids.to(device)) + negative_prompt_embeds_text_encoder_output = self.prior_text_encoder(uncond_input.input_ids.to(device)) negative_prompt_embeds = negative_prompt_embeds_text_encoder_output.text_embeds uncond_text_encoder_hidden_states = negative_prompt_embeds_text_encoder_output.last_hidden_state @@ -266,4 +277,170 @@ def __call__( image_embeddings = prior_latents - return image_embeddings \ No newline at end of file + return image_embeddings + + +class KandinskyPipeline(DiffusionPipeline): + """ + Pipeline for image based on text prompt and image prior for Kandinsky + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + text_encoder: + to-add + tokenizer: + to-add + image_encoder: + to-add + scheduler ([`UnCLIPScheduler`]): + A scheduler to be used in combination with `unet` to generate image latents. + unet ([`UNet2DConditionModel`]): + Conditional U-Net architecture to denoise the image embedding. + text_proj ([`KandinskyTextProjModel`]): + Utility class to prepare and combine the embeddings before they are passed to the decoder. 
+ """ + + def __init__( + self, + text_proj: KandinskyTextProjModel, + unet: UNet2DConditionModel, + scheduler: UnCLIPScheduler, + ): + super().__init__() + + self.register_modules( + text_proj=text_proj, + unet=unet, + scheduler=scheduler, + ) + + def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + if latents.shape != shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") + latents = latents.to(device) + + latents = latents * scheduler.init_noise_sigma + return latents + + @property + def _execution_device(self): + r""" + Returns the device on which the pipeline's models will be executed. After calling + `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module + hooks. + """ + # TO_DO + return self.device + + @torch.no_grad() + def __call__( + self, + prompt: Optional[Union[str, List[str]]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 100, + guidance_scale: float = 4.0, + num_images_per_prompt: int = 1, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + text_encoder_hidden_states: Optional[torch.FloatTensor] = None, + image_embeds: Optional[torch.FloatTensor] = None, + negative_image_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + ): + + if prompt is not None: + if isinstance(prompt, str): + batch_size = 1 + elif isinstance(prompt, list): + batch_size = len(prompt) + else: + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + else: + batch_size = prompt_embeds.shape[0] //2 + + device = self._execution_device + + batch_size = batch_size * num_images_per_prompt + + do_classifier_free_guidance = guidance_scale > 1.0 + + # TO_DO[1] add encode_text step https://github.com/ai-forever/Kandinsky-2/blob/main/kandinsky2/kandinsky2_1_model.py#L208 + # Here we pass prompt_embeds and text_encoder_hidden_states directly + + # prompt_embeds, text_encoder_hidden_states = self._encode_prompt(prompt, device, num_images_per_prompt, do_classifier_free_guidance) + + # TO_DO [2] add a step to create negative_image_embeds https://github.com/ai-forever/Kandinsky-2/blob/main/kandinsky2/kandinsky2_1_model.py#L322 + image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to(device) + + text_encoder_hidden_states, additive_clip_time_embeddings = self.text_proj( + image_embeddings=image_embeds, + prompt_embeds=prompt_embeds, + text_encoder_hidden_states=text_encoder_hidden_states, + ) + + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps_tensor = self.scheduler.timesteps + + num_channels_latents = self.unet.config.in_channels + + height = height or self.unet.config.sample_size + width = width or self.unet.config.sample_size + height, width = get_new_h_w(height, width) + + # create initial latent + latents = self.prepare_latents( + (batch_size, num_channels_latents, height, width), + text_encoder_hidden_states.dtype, + device, + generator, + latents, + self.scheduler, + ) + + # expand the latents if we are doing classifier free guidance + latents = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + + for i, t in 
enumerate(self.progress_bar(timesteps_tensor)): + noise_pred = self.unet( + sample=latents, #[2, 4, 96, 96] + timestep=t, + encoder_hidden_states=text_encoder_hidden_states, + class_labels=additive_clip_time_embeddings, + ).sample + + # YiYi Notes: CFG is currently implemented exactly as original repo as a baseline, + # i.e. we apply cfg to predicted noise, and take predicted variance as it is (uncond + cond) + # this means the our latent shape is batch_size *2 instad batch_size + + # YiYi's TO-DO: test it to see if we can do it differently: apply cfg to predicted noise and only take cond portion in predicted variance + if do_classifier_free_guidance: + noise_pred, variance_pred = noise_pred.split(latents.shape[1], dim=1) + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + variance_pred_uncond, variance_pred_text = variance_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = torch.cat([noise_pred] * 2) + variance_pred = torch.cat([variance_pred_uncond, variance_pred_text]) + noise_pred = torch.cat([noise_pred, variance_pred], dim=1) + + if i + 1 == timesteps_tensor.shape[0]: + prev_timestep = None + else: + prev_timestep = timesteps_tensor[i + 1] + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step( + noise_pred, t, latents, prev_timestep=prev_timestep, generator=generator, batch_size=batch_size, + ).prev_sample + + _, latents = latents.chunk(2) + + + return latents \ No newline at end of file diff --git a/src/diffusers/schedulers/scheduling_unclip.py b/src/diffusers/schedulers/scheduling_unclip.py index 218d4588a88d..ef5ef002e740 100644 --- a/src/diffusers/schedulers/scheduling_unclip.py +++ b/src/diffusers/schedulers/scheduling_unclip.py @@ -23,6 +23,14 @@ from ..utils import BaseOutput, randn_tensor from .scheduling_utils import SchedulerMixin +def dynamic_threholding_test(x): + x2 = torch.clone(x).cpu().detach().numpy() + p = 99.5 + s = np.percentile(np.abs(x2), p, axis=tuple(range(1, x2.ndim)))[0] + s = max(s, 1.0) + x = torch.clip(x, -s, s) / s + return x # x.clamp(-1, 1) + @dataclass # Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->UnCLIP @@ -119,12 +127,21 @@ def __init__( dynamic_thresholding_ratio: float = 0.995, sample_max_value: float = 1.0, prediction_type: str = "epsilon", - beta_schedule: str = "squaredcos_cap_v2", + beta_schedule: str = "squaredcos_cap_v2", # "linear" + beta_start: float = 0.0001, + beta_end: float = 0.02, + ): - if beta_schedule != "squaredcos_cap_v2": - raise ValueError("UnCLIPScheduler only supports `beta_schedule`: 'squaredcos_cap_v2'") - self.betas = betas_for_alpha_bar(num_train_timesteps) + if beta_schedule == "squaredcos_cap_v2": + self.betas = betas_for_alpha_bar(num_train_timesteps) + elif beta_schedule == "linear": + # Linear schedule from Ho et al, extended to work for any number of diffusion steps. 
+ scale = 1000 / num_train_timesteps + self.betas = torch.linspace(beta_start * scale, beta_end * scale, num_train_timesteps, dtype=torch.float64) + else: + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") + self.alphas = 1.0 - self.betas self.alphas_cumprod = torch.cumprod(self.alphas, dim=0) @@ -202,8 +219,10 @@ def _get_variance(self, t, prev_timestep=None, predicted_variance=None, variance max_log = beta.log() frac = (predicted_variance + 1) / 2 + # this is log variance variance = frac * max_log + (1 - frac) * min_log + return variance def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: @@ -248,6 +267,8 @@ def step( prev_timestep: Optional[int] = None, generator=None, return_dict: bool = True, + # YiYi notes: added for testing kandinsky (will try to remove) + batch_size: Optional[int] = None, ) -> Union[UnCLIPSchedulerOutput, Tuple]: """ Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion @@ -309,8 +330,12 @@ def step( pred_original_sample = torch.clamp( pred_original_sample, -self.config.clip_sample_range, self.config.clip_sample_range ) + if self.config.thresholding: - pred_original_sample = self._threshold_sample(pred_original_sample) + # yiyi Notes, testing with dynamic_threholding_test, need to make it work with _threshold_sample + #pred_original_sample = self._threshold_sample(pred_original_sample) + pred_original_sample = dynamic_threholding_test(pred_original_sample) + # 4. Compute coefficients for pred_original_sample x_0 and current sample x_t # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf pred_original_sample_coeff = (alpha_prod_t_prev ** (0.5) * beta) / beta_prod_t @@ -323,9 +348,16 @@ def step( # 6. Add noise variance = 0 if t > 0: + # YiYi Notes: test to see if we can sampling with latent shape [batch_size, ...] 
and change this back variance_noise = randn_tensor( - model_output.shape, dtype=model_output.dtype, generator=generator, device=model_output.device - ) + (batch_size if batch_size is not None else model_output.shape[0], *model_output.shape[1:]) , dtype=model_output.dtype, generator=generator, device=model_output.device + ) + # variance_noise = randn_tensor( + # model_output.shape, dtype=model_output.dtype, generator=generator, device=model_output.device + # ) + + if batch_size is not None: + variance_noise = torch.cat([variance_noise, variance_noise], dim=0) variance = self._get_variance( t, From ac123e57664285ca732df463e883ff75695c0da8 Mon Sep 17 00:00:00 2001 From: Ayush Mangal <43698245+ayushtues@users.noreply.github.com> Date: Wed, 10 May 2023 18:59:06 +0530 Subject: [PATCH 010/182] [WIP] add Kandinsky image encoder and Multi-clip model (#3373) Add multiclip and image encoder Co-authored-by: ayushmangal Co-authored-by: YiYi Xu --- src/diffusers/pipelines/kandinsky/multiclip.py | 18 ++++++++++++++++++ .../pipelines/kandinsky/pipeline_kandinsky.py | 17 ++++++++++++++++- 2 files changed, 34 insertions(+), 1 deletion(-) create mode 100644 src/diffusers/pipelines/kandinsky/multiclip.py diff --git a/src/diffusers/pipelines/kandinsky/multiclip.py b/src/diffusers/pipelines/kandinsky/multiclip.py new file mode 100644 index 000000000000..cf0b4d8f82ce --- /dev/null +++ b/src/diffusers/pipelines/kandinsky/multiclip.py @@ -0,0 +1,18 @@ +import torch +from torch import nn +from transformers import XLMRobertaPreTrainedModel, XLMRobertaModel + +class MultilingualCLIP(XLMRobertaPreTrainedModel): + def __init__(self, config, in_features=1024, out_features=768): # 1024, 768 + super().__init__(config) + self.transformer = XLMRobertaModel(config) + self.LinearTransformation = torch.nn.Linear( + in_features=in_features, out_features=out_features + ) + + def forward(self, input_ids, attention_mask): + embs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)[0] + embs2 = (embs * attention_mask.unsqueeze(2)).sum(dim=1) / attention_mask.sum( + dim=1 + )[:, None] + return self.LinearTransformation(embs2), embs diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py index e8b2af450515..d5b619535aa6 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py @@ -16,11 +16,13 @@ from typing import List, Optional, Tuple, Union import torch -from transformers import CLIPTextModelWithProjection, CLIPTokenizer +from transformers import CLIPTextModelWithProjection, CLIPVisionModelWithProjection, CLIPTokenizer, XLMRobertaTokenizerFast from ...models import PriorTransformer, UNet2DConditionModel from ...pipelines import DiffusionPipeline from ...schedulers import UnCLIPScheduler + +from .multiclip import MultilingualCLIP from .text_proj import KandinskyTextProjModel from ...utils import ( @@ -52,19 +54,29 @@ class KandinskyPriorPipeline(DiffusionPipeline): The canonincal unCLIP prior to approximate the image embedding from the text embedding. prior_text_encoder ([`CLIPTextModelWithProjection`]): Frozen text-encoder. + image_encoder ([`CLIPVisionModelWithProjection`]): + Frozen image-encoder. prior_tokenizer (`CLIPTokenizer`): Tokenizer of class [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). 
prior_scheduler ([`UnCLIPScheduler`]): A scheduler to be used in combination with `prior` to generate image embedding. + multiclip ([`MultilingualCLIP`]): + A multilingual text encoder. + multiclip_tokenizer ([`XLMRobertaTokenizerFast`]): + Tokenizer for multiclip """ def __init__( self, prior: PriorTransformer, + text_encoder: CLIPTextModelWithProjection, + image_encoder: CLIPVisionModelWithProjection, prior_text_encoder: CLIPTextModelWithProjection, prior_tokenizer: CLIPTokenizer, prior_scheduler: UnCLIPScheduler, + multiclip: MultilingualCLIP, + multiclip_tokenizer: XLMRobertaTokenizerFast, ): super().__init__() @@ -73,6 +85,9 @@ def __init__( prior_text_encoder=prior_text_encoder, prior_tokenizer=prior_tokenizer, prior_scheduler=prior_scheduler, + image_encoder=image_encoder, + multiclip=multiclip, + multiclip_tokenizer=multiclip_tokenizer, ) def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): From e0937228c80f3f55cd2bf1a53cb7380b77b1ef23 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 10 May 2023 15:35:33 +0000 Subject: [PATCH 011/182] add image_encoder to prior --- src/diffusers/pipelines/kandinsky/__init__.py | 1 + .../pipelines/kandinsky/pipeline_kandinsky.py | 124 ++++++++++-------- 2 files changed, 69 insertions(+), 56 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/__init__.py b/src/diffusers/pipelines/kandinsky/__init__.py index d42fc574e5e0..613bd3fdf820 100644 --- a/src/diffusers/pipelines/kandinsky/__init__.py +++ b/src/diffusers/pipelines/kandinsky/__init__.py @@ -15,3 +15,4 @@ else: from .pipeline_kandinsky import KandinskyPipeline, KandinskyPriorPipeline from .text_proj import KandinskyTextProjModel + from .text_encoder import MultilingualCLIP diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py index d5b619535aa6..7e1bdc72f093 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py @@ -70,13 +70,13 @@ class KandinskyPriorPipeline(DiffusionPipeline): def __init__( self, prior: PriorTransformer, - text_encoder: CLIPTextModelWithProjection, - image_encoder: CLIPVisionModelWithProjection, + #text_encoder: CLIPTextModelWithProjection, + prior_image_encoder: CLIPVisionModelWithProjection, prior_text_encoder: CLIPTextModelWithProjection, prior_tokenizer: CLIPTokenizer, prior_scheduler: UnCLIPScheduler, - multiclip: MultilingualCLIP, - multiclip_tokenizer: XLMRobertaTokenizerFast, + #multiclip: MultilingualCLIP, + #multiclip_tokenizer: XLMRobertaTokenizerFast, ): super().__init__() @@ -85,9 +85,9 @@ def __init__( prior_text_encoder=prior_text_encoder, prior_tokenizer=prior_tokenizer, prior_scheduler=prior_scheduler, - image_encoder=image_encoder, - multiclip=multiclip, - multiclip_tokenizer=multiclip_tokenizer, + prior_image_encoder=prior_image_encoder, + #multiclip=multiclip, + #multiclip_tokenizer=multiclip_tokenizer, ) def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): @@ -101,6 +101,12 @@ def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): latents = latents * scheduler.init_noise_sigma return latents + def create_zero_img_emb(self, batch_size, device): + zero_img = torch.zeros(1, 3, 224, 224).to(device=device) + zero_image_emb = self.prior_image_encoder(zero_img)["image_embeds"] + zero_image_emb = zero_image_emb.repeat(batch_size,1) + return zero_image_emb + def _encode_prompt( self, prompt, @@ -237,60 +243,66 @@ def __call__( 
batch_size = batch_size * num_images_per_prompt - do_classifier_free_guidance = prior_guidance_scale > 1.0 - prompt_embeds, text_encoder_hidden_states, text_mask = self._encode_prompt( - prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt - ) - - # prior - self.prior_scheduler.set_timesteps(prior_num_inference_steps, device=device) - prior_timesteps_tensor = self.prior_scheduler.timesteps - - embedding_dim = self.prior.config.embedding_dim - - prior_latents = self.prepare_latents( - (batch_size, embedding_dim), - prompt_embeds.dtype, - device, - generator, - prior_latents, - self.prior_scheduler, - ) - - for i, t in enumerate(self.progress_bar(prior_timesteps_tensor)): - # expand the latents if we are doing classifier free guidance - latent_model_input = torch.cat([prior_latents] * 2) if do_classifier_free_guidance else prior_latents - - predicted_image_embedding = self.prior( - latent_model_input, - timestep=t, - proj_embedding=prompt_embeds, - encoder_hidden_states=text_encoder_hidden_states, - attention_mask=text_mask, - ).predicted_image_embedding + if prompt == '' or prompt[0] == '': + + image_embeddings = self.create_zero_img_emb(batch_size=batch_size, device=device) + + else: - if do_classifier_free_guidance: - predicted_image_embedding_uncond, predicted_image_embedding_text = predicted_image_embedding.chunk(2) - predicted_image_embedding = predicted_image_embedding_uncond + prior_guidance_scale * ( - predicted_image_embedding_text - predicted_image_embedding_uncond - ) + do_classifier_free_guidance = prior_guidance_scale > 1.0 + prompt_embeds, text_encoder_hidden_states, text_mask = self._encode_prompt( + prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt + ) - if i + 1 == prior_timesteps_tensor.shape[0]: - prev_timestep = None - else: - prev_timestep = prior_timesteps_tensor[i + 1] + # prior + self.prior_scheduler.set_timesteps(prior_num_inference_steps, device=device) + prior_timesteps_tensor = self.prior_scheduler.timesteps - prior_latents = self.prior_scheduler.step( - predicted_image_embedding, - timestep=t, - sample=prior_latents, - generator=generator, - prev_timestep=prev_timestep, - ).prev_sample + embedding_dim = self.prior.config.embedding_dim - prior_latents = self.prior.post_process_latents(prior_latents) + prior_latents = self.prepare_latents( + (batch_size, embedding_dim), + prompt_embeds.dtype, + device, + generator, + prior_latents, + self.prior_scheduler, + ) - image_embeddings = prior_latents + for i, t in enumerate(self.progress_bar(prior_timesteps_tensor)): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([prior_latents] * 2) if do_classifier_free_guidance else prior_latents + + predicted_image_embedding = self.prior( + latent_model_input, + timestep=t, + proj_embedding=prompt_embeds, + encoder_hidden_states=text_encoder_hidden_states, + attention_mask=text_mask, + ).predicted_image_embedding + + if do_classifier_free_guidance: + predicted_image_embedding_uncond, predicted_image_embedding_text = predicted_image_embedding.chunk(2) + predicted_image_embedding = predicted_image_embedding_uncond + prior_guidance_scale * ( + predicted_image_embedding_text - predicted_image_embedding_uncond + ) + + if i + 1 == prior_timesteps_tensor.shape[0]: + prev_timestep = None + else: + prev_timestep = prior_timesteps_tensor[i + 1] + + prior_latents = self.prior_scheduler.step( + predicted_image_embedding, + timestep=t, + sample=prior_latents, + generator=generator, + 
prev_timestep=prev_timestep, + ).prev_sample + + prior_latents = self.prior.post_process_latents(prior_latents) + + image_embeddings = prior_latents return image_embeddings From ca2b15d7841add486727debcdbeaf4904f99e1ef Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 10 May 2023 16:40:19 +0000 Subject: [PATCH 012/182] add mclip text encoder/tokenizer --- .../pipelines/kandinsky/pipeline_kandinsky.py | 127 ++++++++++++++++-- 1 file changed, 114 insertions(+), 13 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py index 7e1bdc72f093..b337cbd84c21 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py @@ -22,7 +22,7 @@ from ...pipelines import DiffusionPipeline from ...schedulers import UnCLIPScheduler -from .multiclip import MultilingualCLIP +from .text_encoder import MultilingualCLIP from .text_proj import KandinskyTextProjModel from ...utils import ( @@ -70,13 +70,10 @@ class KandinskyPriorPipeline(DiffusionPipeline): def __init__( self, prior: PriorTransformer, - #text_encoder: CLIPTextModelWithProjection, prior_image_encoder: CLIPVisionModelWithProjection, prior_text_encoder: CLIPTextModelWithProjection, prior_tokenizer: CLIPTokenizer, prior_scheduler: UnCLIPScheduler, - #multiclip: MultilingualCLIP, - #multiclip_tokenizer: XLMRobertaTokenizerFast, ): super().__init__() @@ -244,7 +241,7 @@ def __call__( batch_size = batch_size * num_images_per_prompt if prompt == '' or prompt[0] == '': - + image_embeddings = self.create_zero_img_emb(batch_size=batch_size, device=device) else: @@ -319,8 +316,6 @@ class KandinskyPipeline(DiffusionPipeline): to-add tokenizer: to-add - image_encoder: - to-add scheduler ([`UnCLIPScheduler`]): A scheduler to be used in combination with `unet` to generate image latents. 
unet ([`UNet2DConditionModel`]): @@ -331,6 +326,8 @@ class KandinskyPipeline(DiffusionPipeline): def __init__( self, + text_encoder: MultilingualCLIP, + tokenizer: XLMRobertaTokenizerFast, text_proj: KandinskyTextProjModel, unet: UNet2DConditionModel, scheduler: UnCLIPScheduler, @@ -338,6 +335,8 @@ def __init__( super().__init__() self.register_modules( + text_encoder=text_encoder, + tokenizer=tokenizer, text_proj=text_proj, unet=unet, scheduler=scheduler, @@ -354,6 +353,110 @@ def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): latents = latents * scheduler.init_noise_sigma return latents + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + ): + + batch_size = len(prompt) if isinstance(prompt, list) else 1 + # get prompt text embeddings + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_attention_mask=True, + add_special_tokens=True, + return_tensors="pt", + ) + + text_input_ids = text_inputs.input_ids.to(device) + text_mask = text_inputs.attention_mask.to(device) + + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length] + + prompt_embeds, text_encoder_hidden_states = self.text_encoder(input_ids=text_input_ids, attention_mask=text_mask) + + prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0) + text_encoder_hidden_states = text_encoder_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + text_mask = text_mask.repeat_interleave(num_images_per_prompt, dim=0) + + if do_classifier_free_guidance: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." 
+ ) + else: + uncond_tokens = negative_prompt + + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_attention_mask=True, + add_special_tokens=True, + return_tensors="pt", + ) + uncond_text_input_ids = uncond_input.input_ids.to(device) + uncond_text_mask = uncond_input.attention_mask.to(device) + + negative_prompt_embeds, uncond_text_encoder_hidden_states = self.text_encoder(input_ids=uncond_text_input_ids, attention_mask=uncond_text_mask) + + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + + seq_len = negative_prompt_embeds.shape[1] + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len) + + seq_len = uncond_text_encoder_hidden_states.shape[1] + uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.repeat(1, num_images_per_prompt, 1) + uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.view( + batch_size * num_images_per_prompt, seq_len, -1 + ) + uncond_text_mask = uncond_text_mask.repeat_interleave(num_images_per_prompt, dim=0) + + # done duplicates + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + text_encoder_hidden_states = torch.cat([uncond_text_encoder_hidden_states, text_encoder_hidden_states]) + + text_mask = torch.cat([uncond_text_mask, text_mask]) + + return prompt_embeds, text_encoder_hidden_states, text_mask + + @property def _execution_device(self): r""" @@ -373,10 +476,11 @@ def __call__( num_inference_steps: int = 100, guidance_scale: float = 4.0, num_images_per_prompt: int = 1, + negative_prompt: Optional[Union[str, List[str]]] = None, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - text_encoder_hidden_states: Optional[torch.FloatTensor] = None, + #prompt_embeds: Optional[torch.FloatTensor] = None, + #text_encoder_hidden_states: Optional[torch.FloatTensor] = None, image_embeds: Optional[torch.FloatTensor] = None, negative_image_embeds: Optional[torch.FloatTensor] = None, output_type: Optional[str] = "pil", @@ -399,10 +503,7 @@ def __call__( do_classifier_free_guidance = guidance_scale > 1.0 - # TO_DO[1] add encode_text step https://github.com/ai-forever/Kandinsky-2/blob/main/kandinsky2/kandinsky2_1_model.py#L208 - # Here we pass prompt_embeds and text_encoder_hidden_states directly - - # prompt_embeds, text_encoder_hidden_states = self._encode_prompt(prompt, device, num_images_per_prompt, do_classifier_free_guidance) + prompt_embeds, text_encoder_hidden_states, _ = self._encode_prompt(prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt) # TO_DO [2] add a step to create negative_image_embeds https://github.com/ai-forever/Kandinsky-2/blob/main/kandinsky2/kandinsky2_1_model.py#L322 image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to(device) From d3465f0f3cea060b2c4c39a6ec7953bf6fcbe6a5 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 10 May 2023 16:42:28 +0000 Subject: [PATCH 013/182] add --- .../pipelines/kandinsky/multiclip.py | 18 ---------- .../pipelines/kandinsky/text_encoder.py | 35 
+++++++++++++++++++ 2 files changed, 35 insertions(+), 18 deletions(-) delete mode 100644 src/diffusers/pipelines/kandinsky/multiclip.py create mode 100644 src/diffusers/pipelines/kandinsky/text_encoder.py diff --git a/src/diffusers/pipelines/kandinsky/multiclip.py b/src/diffusers/pipelines/kandinsky/multiclip.py deleted file mode 100644 index cf0b4d8f82ce..000000000000 --- a/src/diffusers/pipelines/kandinsky/multiclip.py +++ /dev/null @@ -1,18 +0,0 @@ -import torch -from torch import nn -from transformers import XLMRobertaPreTrainedModel, XLMRobertaModel - -class MultilingualCLIP(XLMRobertaPreTrainedModel): - def __init__(self, config, in_features=1024, out_features=768): # 1024, 768 - super().__init__(config) - self.transformer = XLMRobertaModel(config) - self.LinearTransformation = torch.nn.Linear( - in_features=in_features, out_features=out_features - ) - - def forward(self, input_ids, attention_mask): - embs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)[0] - embs2 = (embs * attention_mask.unsqueeze(2)).sum(dim=1) / attention_mask.sum( - dim=1 - )[:, None] - return self.LinearTransformation(embs2), embs diff --git a/src/diffusers/pipelines/kandinsky/text_encoder.py b/src/diffusers/pipelines/kandinsky/text_encoder.py new file mode 100644 index 000000000000..cc22e8f76871 --- /dev/null +++ b/src/diffusers/pipelines/kandinsky/text_encoder.py @@ -0,0 +1,35 @@ +from transformers import PreTrainedModel, PretrainedConfig, AutoModel +import torch + + + +class MCLIPConfig(PretrainedConfig): + model_type = "M-CLIP" + + def __init__(self, modelBase='xlm-roberta-large', transformerDimSize=1024, imageDimSize=768, **kwargs): + self.transformerDimensions = transformerDimSize + self.numDims = imageDimSize + self.modelBase = modelBase + super().__init__(**kwargs) + + +class MultilingualCLIP(PreTrainedModel): + config_class = MCLIPConfig + + def __init__(self, config, *args, **kwargs): + super().__init__(config, *args, **kwargs) + self.transformer = AutoModel.from_pretrained(config.modelBase, cache_dir=kwargs.get("cache_dir")) + self.LinearTransformation = torch.nn.Linear(in_features=config.transformerDimensions, + out_features=config.numDims) + + def forward(self, input_ids, attention_mask): + embs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)[0] + embs2 = (embs * attention_mask.unsqueeze(2)).sum(dim=1) / attention_mask.sum( + dim=1 + )[:, None] + return self.LinearTransformation(embs2), embs + + @classmethod + def _load_state_dict_into_model(cls, model, state_dict, pretrained_model_name_or_path, _fast_init=True): + model.load_state_dict(state_dict) + return model, [], [], [] \ No newline at end of file From f6c55a5d6815ca16a2cf232e7fe2f217a43ad89d Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 10 May 2023 20:41:16 +0000 Subject: [PATCH 014/182] remove comments about batch_size --- src/diffusers/schedulers/scheduling_unclip.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/src/diffusers/schedulers/scheduling_unclip.py b/src/diffusers/schedulers/scheduling_unclip.py index ef5ef002e740..7b70c9e96ff9 100644 --- a/src/diffusers/schedulers/scheduling_unclip.py +++ b/src/diffusers/schedulers/scheduling_unclip.py @@ -267,7 +267,6 @@ def step( prev_timestep: Optional[int] = None, generator=None, return_dict: bool = True, - # YiYi notes: added for testing kandinsky (will try to remove) batch_size: Optional[int] = None, ) -> Union[UnCLIPSchedulerOutput, Tuple]: """ @@ -348,16 +347,15 @@ def step( # 6. 
Add noise variance = 0 if t > 0: - # YiYi Notes: test to see if we can sampling with latent shape [batch_size, ...] and change this back - variance_noise = randn_tensor( - (batch_size if batch_size is not None else model_output.shape[0], *model_output.shape[1:]) , dtype=model_output.dtype, generator=generator, device=model_output.device - ) - # variance_noise = randn_tensor( - # model_output.shape, dtype=model_output.dtype, generator=generator, device=model_output.device - # ) - if batch_size is not None: + assert batch_size * 2 == model_output.shape[0] + variance_noise = randn_tensor( + (batch_size, *model_output.shape[1:]) , dtype=model_output.dtype, generator=generator, device=model_output.device + ) + variance_noise = torch.cat([variance_noise, variance_noise], dim=0) + else: + variance_noise = randn_tensor(model_output.shape , dtype=model_output.dtype, generator=generator, device=model_output.device) variance = self._get_variance( t, From 755f026512e406c955d0954f6089c6bba15aaf28 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 10 May 2023 21:03:48 +0000 Subject: [PATCH 015/182] refactor dynamic threholding --- .../pipelines/kandinsky/pipeline_kandinsky.py | 1 - src/diffusers/schedulers/scheduling_unclip.py | 17 ++++------------- 2 files changed, 4 insertions(+), 14 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py index b337cbd84c21..fdd44f235012 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py @@ -548,7 +548,6 @@ def __call__( # i.e. we apply cfg to predicted noise, and take predicted variance as it is (uncond + cond) # this means the our latent shape is batch_size *2 instad batch_size - # YiYi's TO-DO: test it to see if we can do it differently: apply cfg to predicted noise and only take cond portion in predicted variance if do_classifier_free_guidance: noise_pred, variance_pred = noise_pred.split(latents.shape[1], dim=1) noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) diff --git a/src/diffusers/schedulers/scheduling_unclip.py b/src/diffusers/schedulers/scheduling_unclip.py index 7b70c9e96ff9..7a501d7c7743 100644 --- a/src/diffusers/schedulers/scheduling_unclip.py +++ b/src/diffusers/schedulers/scheduling_unclip.py @@ -23,14 +23,6 @@ from ..utils import BaseOutput, randn_tensor from .scheduling_utils import SchedulerMixin -def dynamic_threholding_test(x): - x2 = torch.clone(x).cpu().detach().numpy() - p = 99.5 - s = np.percentile(np.abs(x2), p, axis=tuple(range(1, x2.ndim)))[0] - s = max(s, 1.0) - x = torch.clip(x, -s, s) / s - return x # x.clamp(-1, 1) - @dataclass # Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->UnCLIP @@ -125,7 +117,8 @@ def __init__( clip_sample_range: Optional[float] = 1.0, thresholding: bool = False, dynamic_thresholding_ratio: float = 0.995, - sample_max_value: float = 1.0, + sample_min_value: Optional[float] = None, + sample_max_value: Optional[float] = 1.0, prediction_type: str = "epsilon", beta_schedule: str = "squaredcos_cap_v2", # "linear" beta_start: float = 0.0001, @@ -248,7 +241,7 @@ def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: s = torch.quantile(abs_sample, self.config.dynamic_thresholding_ratio, dim=1) s = torch.clamp( - s, min=1, max=self.config.sample_max_value + s, min=self.config.sample_min_value, max=self.config.sample_max_value, ) # When clamped to min=1, equivalent to standard clipping to [-1, 1] s 
= s.unsqueeze(1) # (batch_size, 1) because clamp will broadcast along dim=0 @@ -331,9 +324,7 @@ def step( ) if self.config.thresholding: - # yiyi Notes, testing with dynamic_threholding_test, need to make it work with _threshold_sample - #pred_original_sample = self._threshold_sample(pred_original_sample) - pred_original_sample = dynamic_threholding_test(pred_original_sample) + pred_original_sample = self._threshold_sample(pred_original_sample) # 4. Compute coefficients for pred_original_sample x_0 and current sample x_t # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf From 843a4d60fd5840f0f9e34f9590893dceb12920b8 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 10 May 2023 21:13:37 +0000 Subject: [PATCH 016/182] put prior pipeline into a seperate file --- src/diffusers/pipelines/kandinsky/__init__.py | 3 +- .../pipelines/kandinsky/pipeline_kandinsky.py | 263 +----------------- 2 files changed, 3 insertions(+), 263 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/__init__.py b/src/diffusers/pipelines/kandinsky/__init__.py index 613bd3fdf820..67062f427390 100644 --- a/src/diffusers/pipelines/kandinsky/__init__.py +++ b/src/diffusers/pipelines/kandinsky/__init__.py @@ -13,6 +13,7 @@ print("to-do") # from ...utils.dummy_torch_and_transformers_objects import UnCLIPImageVariationPipeline, UnCLIPPipeline else: - from .pipeline_kandinsky import KandinskyPipeline, KandinskyPriorPipeline + from .pipeline_kandinsky import KandinskyPipeline + from .pipeline_kandinsky_prior import KandinskyPriorPipeline from .text_proj import KandinskyTextProjModel from .text_encoder import MultilingualCLIP diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py index fdd44f235012..71690924749b 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py @@ -22,6 +22,7 @@ from ...pipelines import DiffusionPipeline from ...schedulers import UnCLIPScheduler +from .pipeline_kandinsky_prior import KandinskyPriorPipeline from .text_encoder import MultilingualCLIP from .text_proj import KandinskyTextProjModel @@ -42,268 +43,6 @@ def get_new_h_w(h, w): new_w += 1 return new_h * 8, new_w * 8 -class KandinskyPriorPipeline(DiffusionPipeline): - """ - Pipeline for generate image prior for Kandinsky - - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) - - Args: - prior ([`PriorTransformer`]): - The canonincal unCLIP prior to approximate the image embedding from the text embedding. - prior_text_encoder ([`CLIPTextModelWithProjection`]): - Frozen text-encoder. - image_encoder ([`CLIPVisionModelWithProjection`]): - Frozen image-encoder. - prior_tokenizer (`CLIPTokenizer`): - Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - prior_scheduler ([`UnCLIPScheduler`]): - A scheduler to be used in combination with `prior` to generate image embedding. - multiclip ([`MultilingualCLIP`]): - A multilingual text encoder. 
- multiclip_tokenizer ([`XLMRobertaTokenizerFast`]): - Tokenizer for multiclip - """ - - def __init__( - self, - prior: PriorTransformer, - prior_image_encoder: CLIPVisionModelWithProjection, - prior_text_encoder: CLIPTextModelWithProjection, - prior_tokenizer: CLIPTokenizer, - prior_scheduler: UnCLIPScheduler, - ): - super().__init__() - - self.register_modules( - prior=prior, - prior_text_encoder=prior_text_encoder, - prior_tokenizer=prior_tokenizer, - prior_scheduler=prior_scheduler, - prior_image_encoder=prior_image_encoder, - #multiclip=multiclip, - #multiclip_tokenizer=multiclip_tokenizer, - ) - - def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): - if latents is None: - latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) - else: - if latents.shape != shape: - raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") - latents = latents.to(device) - - latents = latents * scheduler.init_noise_sigma - return latents - - def create_zero_img_emb(self, batch_size, device): - zero_img = torch.zeros(1, 3, 224, 224).to(device=device) - zero_image_emb = self.prior_image_encoder(zero_img)["image_embeds"] - zero_image_emb = zero_image_emb.repeat(batch_size,1) - return zero_image_emb - - def _encode_prompt( - self, - prompt, - device, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - ): - - batch_size = len(prompt) if isinstance(prompt, list) else 1 - # get prompt text embeddings - text_inputs = self.prior_tokenizer( - prompt, - padding="max_length", - max_length=self.prior_tokenizer.model_max_length, - truncation=True, - return_tensors="pt", - ) - text_input_ids = text_inputs.input_ids - text_mask = text_inputs.attention_mask.bool().to(device) - - untruncated_ids = self.prior_tokenizer(prompt, padding="longest", return_tensors="pt").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( - text_input_ids, untruncated_ids - ): - removed_text = self.prior_tokenizer.batch_decode( - untruncated_ids[:, self.prior_tokenizer.model_max_length - 1 : -1] - ) - logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.prior_tokenizer.model_max_length} tokens: {removed_text}" - ) - text_input_ids = text_input_ids[:, : self.prior_tokenizer.model_max_length] - - text_encoder_output = self.prior_text_encoder(text_input_ids.to(device)) - - prompt_embeds = text_encoder_output.text_embeds - text_encoder_hidden_states = text_encoder_output.last_hidden_state - - prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0) - text_encoder_hidden_states = text_encoder_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) - text_mask = text_mask.repeat_interleave(num_images_per_prompt, dim=0) - - if do_classifier_free_guidance: - uncond_tokens: List[str] - if negative_prompt is None: - uncond_tokens = [""] * batch_size - elif type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." 
- ) - else: - uncond_tokens = negative_prompt - - uncond_input = self.prior_tokenizer( - uncond_tokens, - padding="max_length", - max_length=self.prior_tokenizer.model_max_length, - truncation=True, - return_tensors="pt", - ) - uncond_text_mask = uncond_input.attention_mask.bool().to(device) - negative_prompt_embeds_text_encoder_output = self.prior_text_encoder(uncond_input.input_ids.to(device)) - - negative_prompt_embeds = negative_prompt_embeds_text_encoder_output.text_embeds - uncond_text_encoder_hidden_states = negative_prompt_embeds_text_encoder_output.last_hidden_state - - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - - seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt) - negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len) - - seq_len = uncond_text_encoder_hidden_states.shape[1] - uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.repeat(1, num_images_per_prompt, 1) - uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.view( - batch_size * num_images_per_prompt, seq_len, -1 - ) - uncond_text_mask = uncond_text_mask.repeat_interleave(num_images_per_prompt, dim=0) - - # done duplicates - - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) - text_encoder_hidden_states = torch.cat([uncond_text_encoder_hidden_states, text_encoder_hidden_states]) - - text_mask = torch.cat([uncond_text_mask, text_mask]) - - return prompt_embeds, text_encoder_hidden_states, text_mask - - @property - def _execution_device(self): - r""" - Returns the device on which the pipeline's models will be executed. After calling - `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module - hooks. 
- """ - # TO_DO - return self.device - - @torch.no_grad() - def __call__( - self, - prompt, - num_images_per_prompt: int = 1, - prior_num_inference_steps: int =5, - generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - prior_latents: Optional[torch.FloatTensor] = None, - negative_prompt: Optional[Union[str, List[str]]] = None, - prior_guidance_scale: float = 4.0, - output_type: Optional[str] = "pt", - return_dict: bool = True, - ): - if isinstance(prompt, str): - batch_size = 1 - elif isinstance(prompt, list): - batch_size = len(prompt) - else: - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - device = self._execution_device - - batch_size = batch_size * num_images_per_prompt - - if prompt == '' or prompt[0] == '': - - image_embeddings = self.create_zero_img_emb(batch_size=batch_size, device=device) - - else: - - do_classifier_free_guidance = prior_guidance_scale > 1.0 - prompt_embeds, text_encoder_hidden_states, text_mask = self._encode_prompt( - prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt - ) - - # prior - self.prior_scheduler.set_timesteps(prior_num_inference_steps, device=device) - prior_timesteps_tensor = self.prior_scheduler.timesteps - - embedding_dim = self.prior.config.embedding_dim - - prior_latents = self.prepare_latents( - (batch_size, embedding_dim), - prompt_embeds.dtype, - device, - generator, - prior_latents, - self.prior_scheduler, - ) - - for i, t in enumerate(self.progress_bar(prior_timesteps_tensor)): - # expand the latents if we are doing classifier free guidance - latent_model_input = torch.cat([prior_latents] * 2) if do_classifier_free_guidance else prior_latents - - predicted_image_embedding = self.prior( - latent_model_input, - timestep=t, - proj_embedding=prompt_embeds, - encoder_hidden_states=text_encoder_hidden_states, - attention_mask=text_mask, - ).predicted_image_embedding - - if do_classifier_free_guidance: - predicted_image_embedding_uncond, predicted_image_embedding_text = predicted_image_embedding.chunk(2) - predicted_image_embedding = predicted_image_embedding_uncond + prior_guidance_scale * ( - predicted_image_embedding_text - predicted_image_embedding_uncond - ) - - if i + 1 == prior_timesteps_tensor.shape[0]: - prev_timestep = None - else: - prev_timestep = prior_timesteps_tensor[i + 1] - - prior_latents = self.prior_scheduler.step( - predicted_image_embedding, - timestep=t, - sample=prior_latents, - generator=generator, - prev_timestep=prev_timestep, - ).prev_sample - - prior_latents = self.prior.post_process_latents(prior_latents) - - image_embeddings = prior_latents - - return image_embeddings - - class KandinskyPipeline(DiffusionPipeline): """ Pipeline for image based on text prompt and image prior for Kandinsky From 9be88872993d5d28e515946c285220c0903eb6be Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 10 May 2023 21:30:31 +0000 Subject: [PATCH 017/182] add more copy from methods --- .../pipelines/kandinsky/pipeline_kandinsky.py | 65 ++++++++++++++++++- 1 file changed, 64 insertions(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py index 71690924749b..d5b0ff8041d1 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py @@ -195,15 +195,78 @@ def _encode_prompt( return prompt_embeds, text_encoder_hidden_states, text_mask + # Copied from 
diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_sequential_cpu_offload + def enable_sequential_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet, + text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a + `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called. + Note that offloading happens on a submodule basis. Memory savings are higher than with + `enable_model_cpu_offload`, but performance is lower. + """ + if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"): + from accelerate import cpu_offload + else: + raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]: + cpu_offload(cpu_offloaded_model, device) + + if self.safety_checker is not None: + cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_model_cpu_offload + def enable_model_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared + to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` + method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with + `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. + """ + if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): + from accelerate import cpu_offload_with_hook + else: + raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + hook = None + for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]: + _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) + + if self.safety_checker is not None: + _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) + + # We'll offload the last model manually. + self.final_offload_hook = hook @property + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device def _execution_device(self): r""" Returns the device on which the pipeline's models will be executed. After calling `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module hooks. 
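        For example, after `enable_sequential_cpu_offload` the sub-modules themselves report the `meta`
        device, so the GPU that will actually run the forward passes (e.g. `cuda:0`) has to be read from the
        `_hf_hook.execution_device` attribute that Accelerate attaches to them, which is exactly what the
        lookup below does.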
""" - # TO_DO + if not hasattr(self.unet, "_hf_hook"): + return self.device + for module in self.unet.modules(): + if ( + hasattr(module, "_hf_hook") + and hasattr(module._hf_hook, "execution_device") + and module._hf_hook.execution_device is not None + ): + return torch.device(module._hf_hook.execution_device) return self.device @torch.no_grad() From e3ee4a33cc9a3cf1a122bcba2c47bd4822b8bf0b Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 10 May 2023 21:46:12 +0000 Subject: [PATCH 018/182] make style --- scripts/convert_kandinsky_to_diffusers.py | 102 +++--- src/diffusers/__init__.py | 4 +- src/diffusers/pipelines/__init__.py | 2 +- src/diffusers/pipelines/kandinsky/__init__.py | 2 +- .../pipelines/kandinsky/pipeline_kandinsky.py | 103 +++--- .../kandinsky/pipeline_kandinsky_prior.py | 342 ++++++++++++++++++ .../pipelines/kandinsky/text_encoder.py | 16 +- .../pipelines/kandinsky/text_proj.py | 7 +- src/diffusers/schedulers/scheduling_unclip.py | 25 +- 9 files changed, 475 insertions(+), 128 deletions(-) create mode 100644 src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py diff --git a/scripts/convert_kandinsky_to_diffusers.py b/scripts/convert_kandinsky_to_diffusers.py index 00941d3d2a3b..107f4a7eda8d 100644 --- a/scripts/convert_kandinsky_to_diffusers.py +++ b/scripts/convert_kandinsky_to_diffusers.py @@ -3,9 +3,10 @@ import torch from accelerate import load_checkpoint_and_dispatch + +from diffusers import UNet2DConditionModel from diffusers.models.prior_transformer import PriorTransformer from diffusers.pipelines.kandinsky.text_proj import KandinskyTextProjModel -from diffusers import UNet2DConditionModel """ @@ -213,53 +214,55 @@ def prior_ff_to_diffusers(checkpoint, *, diffusers_ff_prefix, original_ff_prefix # done prior -# unet +# unet # We are hardcoding the model configuration for now. If we need to generalize to more model configurations, we can # update then. 
UNET_CONFIG = { - "act_fn":"silu", - "attention_head_dim": 64, - "block_out_channels": (384, 768, 1152, 1536), - "center_input_sample": False, - "class_embed_type": "identity", - "cross_attention_dim": 768, - "down_block_types": ( - "ResnetDownsampleBlock2D", - "SimpleCrossAttnDownBlock2D", - "SimpleCrossAttnDownBlock2D", - "SimpleCrossAttnDownBlock2D", - ), - "downsample_padding": 1, - "dual_cross_attention": False, - "flip_sin_to_cos": True, - "freq_shift": 0, - "in_channels": 4, - "layers_per_block": 3, - "mid_block_scale_factor": 1, - "mid_block_type": "UNetMidBlock2DSimpleCrossAttn", - "norm_eps": 1e-05, - "norm_num_groups": 32, - "only_cross_attention": False, - "out_channels": 8, - "resnet_time_scale_shift": "scale_shift", - "sample_size": 64, - "up_block_types": ( - "SimpleCrossAttnUpBlock2D", - "SimpleCrossAttnUpBlock2D", - "SimpleCrossAttnUpBlock2D", - "ResnetUpsampleBlock2D", - ), - "upcast_attention": False, - "use_linear_projection": False + "act_fn": "silu", + "attention_head_dim": 64, + "block_out_channels": (384, 768, 1152, 1536), + "center_input_sample": False, + "class_embed_type": "identity", + "cross_attention_dim": 768, + "down_block_types": ( + "ResnetDownsampleBlock2D", + "SimpleCrossAttnDownBlock2D", + "SimpleCrossAttnDownBlock2D", + "SimpleCrossAttnDownBlock2D", + ), + "downsample_padding": 1, + "dual_cross_attention": False, + "flip_sin_to_cos": True, + "freq_shift": 0, + "in_channels": 4, + "layers_per_block": 3, + "mid_block_scale_factor": 1, + "mid_block_type": "UNetMidBlock2DSimpleCrossAttn", + "norm_eps": 1e-05, + "norm_num_groups": 32, + "only_cross_attention": False, + "out_channels": 8, + "resnet_time_scale_shift": "scale_shift", + "sample_size": 64, + "up_block_types": ( + "SimpleCrossAttnUpBlock2D", + "SimpleCrossAttnUpBlock2D", + "SimpleCrossAttnUpBlock2D", + "ResnetUpsampleBlock2D", + ), + "upcast_attention": False, + "use_linear_projection": False, } + def unet_model_from_original_config(): model = UNet2DConditionModel(**UNET_CONFIG) return model + def unet_original_checkpoint_to_diffusers_checkpoint(model, checkpoint): diffusers_checkpoint = {} @@ -319,16 +322,19 @@ def unet_original_checkpoint_to_diffusers_checkpoint(model, checkpoint): return diffusers_checkpoint + # done unet -# text proj +# text proj TEXT_PROJ_CONFIG = {} + def text_proj_from_original_config(): model = KandinskyTextProjModel(**TEXT_PROJ_CONFIG) return model + # Note that the input checkpoint is the original text2img model checkpoint def text_proj_original_checkpoint_to_diffusers_checkpoint(checkpoint): diffusers_checkpoint = { @@ -351,8 +357,10 @@ def text_proj_original_checkpoint_to_diffusers_checkpoint(checkpoint): return diffusers_checkpoint + # unet utils + # .time_embed -> .time_embedding def unet_time_embeddings(checkpoint): diffusers_checkpoint = {} @@ -368,6 +376,7 @@ def unet_time_embeddings(checkpoint): return diffusers_checkpoint + # .input_blocks.0 -> .conv_in def unet_conv_in(checkpoint): diffusers_checkpoint = {} @@ -381,6 +390,7 @@ def unet_conv_in(checkpoint): return diffusers_checkpoint + # .out.0 -> .conv_norm_out def unet_conv_norm_out(checkpoint): diffusers_checkpoint = {} @@ -394,6 +404,7 @@ def unet_conv_norm_out(checkpoint): return diffusers_checkpoint + # .out.2 -> .conv_out def unet_conv_out(checkpoint): diffusers_checkpoint = {} @@ -407,6 +418,7 @@ def unet_conv_out(checkpoint): return diffusers_checkpoint + # .input_blocks -> .down_blocks def unet_downblock_to_diffusers_checkpoint( model, checkpoint, *, diffusers_down_block_idx, original_down_block_idx, 
num_head_channels @@ -465,6 +477,7 @@ def unet_downblock_to_diffusers_checkpoint( return diffusers_checkpoint, num_original_down_blocks + # .middle_block -> .mid_block def unet_midblock_to_diffusers_checkpoint(model, checkpoint, *, num_head_channels): diffusers_checkpoint = {} @@ -721,29 +734,25 @@ def prior(*, args, checkpoint_map_location): def text2img(*, args, checkpoint_map_location): - print("loading text2img") text2img_checkpoint = torch.load(args.text2img_checkpoint_path, map_location=checkpoint_map_location) unet_model = unet_model_from_original_config() - unet_diffusers_checkpoint = unet_original_checkpoint_to_diffusers_checkpoint( - unet_model, text2img_checkpoint - ) + unet_diffusers_checkpoint = unet_original_checkpoint_to_diffusers_checkpoint(unet_model, text2img_checkpoint) # text proj interlude - + # The original decoder implementation includes a set of parameters that are used # for creating the `encoder_hidden_states` which are what the U-net is conditioned # on. The diffusers conditional unet directly takes the encoder_hidden_states. We pull # the parameters into the KandinskyTextProjModel class text_proj_model = text_proj_from_original_config() - + text_proj_checkpoint = text_proj_original_checkpoint_to_diffusers_checkpoint(text2img_checkpoint) load_checkpoint_to_model(text_proj_checkpoint, text_proj_model, strict=True) - del text2img_checkpoint @@ -764,7 +773,6 @@ def load_checkpoint_to_model(checkpoint, model, strict=False): load_checkpoint_and_dispatch(model, file.name, device_map="auto") - if __name__ == "__main__": parser = argparse.ArgumentParser() @@ -817,9 +825,9 @@ def load_checkpoint_to_model(checkpoint, model, strict=False): elif args.debug == "prior": prior_model = prior(args=args, checkpoint_map_location=checkpoint_map_location) prior_model.save_pretrained(args.dump_path) - elif args.debug == 'text2img': + elif args.debug == "text2img": unet_model, text_proj_model = text2img(args=args, checkpoint_map_location=checkpoint_map_location) unet_model.save_pretrained(f"{args.dump_path}/unet") text_proj_model.save_pretrained(f"{args.dump_path}/text_proj") else: - raise ValueError(f"unknown debug value : {args.debug}") \ No newline at end of file + raise ValueError(f"unknown debug value : {args.debug}") diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index ae38cb005bfb..265c263881ca 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -128,6 +128,8 @@ IFInpaintingSuperResolutionPipeline, IFPipeline, IFSuperResolutionPipeline, + KandinskyPipeline, + KandinskyPriorPipeline, LDMTextToImagePipeline, PaintByExamplePipeline, SemanticStableDiffusionPipeline, @@ -154,8 +156,6 @@ TextToVideoZeroPipeline, UnCLIPImageVariationPipeline, UnCLIPPipeline, - KandinskyPriorPipeline, - KandinskyPipeline, VersatileDiffusionDualGuidedPipeline, VersatileDiffusionImageVariationPipeline, VersatileDiffusionPipeline, diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index 041b34089af9..da4733b07e1c 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -52,6 +52,7 @@ IFPipeline, IFSuperResolutionPipeline, ) + from .kandinsky import KandinskyPipeline, KandinskyPriorPipeline from .latent_diffusion import LDMTextToImagePipeline from .paint_by_example import PaintByExamplePipeline from .semantic_stable_diffusion import SemanticStableDiffusionPipeline @@ -79,7 +80,6 @@ from .stable_diffusion_safe import StableDiffusionPipelineSafe from .text_to_video_synthesis import 
TextToVideoSDPipeline, TextToVideoZeroPipeline from .unclip import UnCLIPImageVariationPipeline, UnCLIPPipeline - from .kandinsky import KandinskyPipeline, KandinskyPriorPipeline from .versatile_diffusion import ( VersatileDiffusionDualGuidedPipeline, VersatileDiffusionImageVariationPipeline, diff --git a/src/diffusers/pipelines/kandinsky/__init__.py b/src/diffusers/pipelines/kandinsky/__init__.py index 67062f427390..79a89cdc46e4 100644 --- a/src/diffusers/pipelines/kandinsky/__init__.py +++ b/src/diffusers/pipelines/kandinsky/__init__.py @@ -15,5 +15,5 @@ else: from .pipeline_kandinsky import KandinskyPipeline from .pipeline_kandinsky_prior import KandinskyPriorPipeline - from .text_proj import KandinskyTextProjModel from .text_encoder import MultilingualCLIP + from .text_proj import KandinskyTextProjModel diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py index d5b0ff8041d1..28682d231df2 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py @@ -12,24 +12,25 @@ # See the License for the specific language governing permissions and # limitations under the License. -import inspect -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Union import torch -from transformers import CLIPTextModelWithProjection, CLIPVisionModelWithProjection, CLIPTokenizer, XLMRobertaTokenizerFast +from transformers import ( + XLMRobertaTokenizerFast, +) -from ...models import PriorTransformer, UNet2DConditionModel +from ...models import UNet2DConditionModel from ...pipelines import DiffusionPipeline from ...schedulers import UnCLIPScheduler - -from .pipeline_kandinsky_prior import KandinskyPriorPipeline -from .text_encoder import MultilingualCLIP -from .text_proj import KandinskyTextProjModel - from ...utils import ( - logging, + is_accelerate_available, + is_accelerate_version, + logging, randn_tensor, ) +from .text_encoder import MultilingualCLIP +from .text_proj import KandinskyTextProjModel + logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -43,6 +44,7 @@ def get_new_h_w(h, w): new_w += 1 return new_h * 8, new_w * 8 + class KandinskyPipeline(DiffusionPipeline): """ Pipeline for image based on text prompt and image prior for Kandinsky @@ -51,9 +53,9 @@ class KandinskyPipeline(DiffusionPipeline): library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) Args: - text_encoder: + text_encoder: to-add - tokenizer: + tokenizer: to-add scheduler ([`UnCLIPScheduler`]): A scheduler to be used in combination with `unet` to generate image latents. @@ -62,7 +64,7 @@ class KandinskyPipeline(DiffusionPipeline): text_proj ([`KandinskyTextProjModel`]): Utility class to prepare and combine the embeddings before they are passed to the decoder. 
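        Example (an illustrative sketch rather than a tested snippet: the checkpoint paths are hypothetical
        placeholders, and at this stage of the implementation the pipeline returns raw latents instead of
        decoded images):

            from diffusers import KandinskyPriorPipeline, KandinskyPipeline

            pipe_prior = KandinskyPriorPipeline.from_pretrained("<path-to-converted-prior>")
            pipe = KandinskyPipeline.from_pretrained("<path-to-converted-text2img>")

            prompt = "red cat, 4k photo"
            image_embeds = pipe_prior(prompt)
            negative_image_embeds = pipe_prior("")  # an empty prompt falls back to the zero image embedding

            latents = pipe(
                prompt,
                image_embeds=image_embeds,
                negative_image_embeds=negative_image_embeds,
                height=768,
                width=768,
                num_inference_steps=100,
            )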
""" - + def __init__( self, text_encoder: MultilingualCLIP, @@ -100,7 +102,6 @@ def _encode_prompt( do_classifier_free_guidance, negative_prompt=None, ): - batch_size = len(prompt) if isinstance(prompt, list) else 1 # get prompt text embeddings text_inputs = self.tokenizer( @@ -118,19 +119,17 @@ def _encode_prompt( untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( - text_input_ids, untruncated_ids - ): - removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] - ) + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" f" {self.tokenizer.model_max_length} tokens: {removed_text}" ) text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length] - prompt_embeds, text_encoder_hidden_states = self.text_encoder(input_ids=text_input_ids, attention_mask=text_mask) + prompt_embeds, text_encoder_hidden_states = self.text_encoder( + input_ids=text_input_ids, attention_mask=text_mask + ) prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0) text_encoder_hidden_states = text_encoder_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) @@ -168,7 +167,9 @@ def _encode_prompt( uncond_text_input_ids = uncond_input.input_ids.to(device) uncond_text_mask = uncond_input.attention_mask.to(device) - negative_prompt_embeds, uncond_text_encoder_hidden_states = self.text_encoder(input_ids=uncond_text_input_ids, attention_mask=uncond_text_mask) + negative_prompt_embeds, uncond_text_encoder_hidden_states = self.text_encoder( + input_ids=uncond_text_input_ids, attention_mask=uncond_text_mask + ) # duplicate unconditional embeddings for each generation per prompt, using mps friendly method @@ -194,7 +195,7 @@ def _encode_prompt( text_mask = torch.cat([uncond_text_mask, text_mask]) return prompt_embeds, text_encoder_hidden_states, text_mask - + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_sequential_cpu_offload def enable_sequential_cpu_offload(self, gpu_id=0): r""" @@ -272,7 +273,7 @@ def _execution_device(self): @torch.no_grad() def __call__( self, - prompt: Optional[Union[str, List[str]]] = None, + prompt: Union[str, List[str]], height: Optional[int] = None, width: Optional[int] = None, num_inference_steps: int = 100, @@ -281,40 +282,36 @@ def __call__( negative_prompt: Optional[Union[str, List[str]]] = None, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, latents: Optional[torch.FloatTensor] = None, - #prompt_embeds: Optional[torch.FloatTensor] = None, - #text_encoder_hidden_states: Optional[torch.FloatTensor] = None, image_embeds: Optional[torch.FloatTensor] = None, negative_image_embeds: Optional[torch.FloatTensor] = None, output_type: Optional[str] = "pil", - return_dict: bool = True, + return_dict: bool = True, ): - - if prompt is not None: - if isinstance(prompt, str): - batch_size = 1 - elif isinstance(prompt, list): - batch_size = len(prompt) - else: - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + if isinstance(prompt, str): + batch_size = 1 + elif isinstance(prompt, list): + batch_size = len(prompt) 
else: - batch_size = prompt_embeds.shape[0] //2 - + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + device = self._execution_device batch_size = batch_size * num_images_per_prompt do_classifier_free_guidance = guidance_scale > 1.0 - prompt_embeds, text_encoder_hidden_states, _ = self._encode_prompt(prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt) + prompt_embeds, text_encoder_hidden_states, _ = self._encode_prompt( + prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt + ) # TO_DO [2] add a step to create negative_image_embeds https://github.com/ai-forever/Kandinsky-2/blob/main/kandinsky2/kandinsky2_1_model.py#L322 image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to(device) - + text_encoder_hidden_states, additive_clip_time_embeddings = self.text_proj( image_embeddings=image_embeds, prompt_embeds=prompt_embeds, text_encoder_hidden_states=text_encoder_hidden_states, - ) + ) self.scheduler.set_timesteps(num_inference_steps, device=device) timesteps_tensor = self.scheduler.timesteps @@ -324,7 +321,7 @@ def __call__( height = height or self.unet.config.sample_size width = width or self.unet.config.sample_size height, width = get_new_h_w(height, width) - + # create initial latent latents = self.prepare_latents( (batch_size, num_channels_latents, height, width), @@ -334,28 +331,28 @@ def __call__( latents, self.scheduler, ) - + # expand the latents if we are doing classifier free guidance latents = torch.cat([latents] * 2) if do_classifier_free_guidance else latents - + for i, t in enumerate(self.progress_bar(timesteps_tensor)): noise_pred = self.unet( - sample=latents, #[2, 4, 96, 96] + sample=latents, # [2, 4, 96, 96] timestep=t, encoder_hidden_states=text_encoder_hidden_states, class_labels=additive_clip_time_embeddings, ).sample - # YiYi Notes: CFG is currently implemented exactly as original repo as a baseline, - # i.e. we apply cfg to predicted noise, and take predicted variance as it is (uncond + cond) - # this means the our latent shape is batch_size *2 instad batch_size + # YiYi Notes: CFG is currently implemented exactly as original repo as a baseline, + # i.e. 
we apply cfg to predicted noise, and take predicted variance as it is (uncond + cond) + # this means the our latent shape is batch_size *2 instad batch_size if do_classifier_free_guidance: noise_pred, variance_pred = noise_pred.split(latents.shape[1], dim=1) noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) variance_pred_uncond, variance_pred_text = variance_pred.chunk(2) noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - noise_pred = torch.cat([noise_pred] * 2) + noise_pred = torch.cat([noise_pred] * 2) variance_pred = torch.cat([variance_pred_uncond, variance_pred_text]) noise_pred = torch.cat([noise_pred, variance_pred], dim=1) @@ -366,10 +363,14 @@ def __call__( # compute the previous noisy sample x_t -> x_t-1 latents = self.scheduler.step( - noise_pred, t, latents, prev_timestep=prev_timestep, generator=generator, batch_size=batch_size, + noise_pred, + t, + latents, + prev_timestep=prev_timestep, + generator=generator, + batch_size=batch_size, ).prev_sample _, latents = latents.chunk(2) - - return latents \ No newline at end of file + return latents diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py new file mode 100644 index 000000000000..b494a5bb6cac --- /dev/null +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py @@ -0,0 +1,342 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Optional, Union + +import torch +from transformers import CLIPTextModelWithProjection, CLIPTokenizer, CLIPVisionModelWithProjection + +from ...models import PriorTransformer +from ...pipelines import DiffusionPipeline +from ...schedulers import UnCLIPScheduler +from ...utils import ( + is_accelerate_available, + is_accelerate_version, + logging, + randn_tensor, +) + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +class KandinskyPriorPipeline(DiffusionPipeline): + """ + Pipeline for generate image prior for Kandinsky + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + prior ([`PriorTransformer`]): + The canonincal unCLIP prior to approximate the image embedding from the text embedding. + prior_text_encoder ([`CLIPTextModelWithProjection`]): + Frozen text-encoder. + image_encoder ([`CLIPVisionModelWithProjection`]): + Frozen image-encoder. + prior_tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + prior_scheduler ([`UnCLIPScheduler`]): + A scheduler to be used in combination with `prior` to generate image embedding. + multiclip ([`MultilingualCLIP`]): + A multilingual text encoder. 
+ multiclip_tokenizer ([`XLMRobertaTokenizerFast`]): + Tokenizer for multiclip + """ + + def __init__( + self, + prior: PriorTransformer, + prior_image_encoder: CLIPVisionModelWithProjection, + prior_text_encoder: CLIPTextModelWithProjection, + prior_tokenizer: CLIPTokenizer, + prior_scheduler: UnCLIPScheduler, + ): + super().__init__() + + self.register_modules( + prior=prior, + prior_text_encoder=prior_text_encoder, + prior_tokenizer=prior_tokenizer, + prior_scheduler=prior_scheduler, + prior_image_encoder=prior_image_encoder, + ) + + def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + if latents.shape != shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") + latents = latents.to(device) + + latents = latents * scheduler.init_noise_sigma + return latents + + def create_zero_img_emb(self, batch_size, device): + zero_img = torch.zeros(1, 3, 224, 224).to(device=device) + zero_image_emb = self.prior_image_encoder(zero_img)["image_embeds"] + zero_image_emb = zero_image_emb.repeat(batch_size, 1) + return zero_image_emb + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_sequential_cpu_offload + def enable_sequential_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet, + text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a + `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called. + Note that offloading happens on a submodule basis. Memory savings are higher than with + `enable_model_cpu_offload`, but performance is lower. + """ + if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"): + from accelerate import cpu_offload + else: + raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]: + cpu_offload(cpu_offloaded_model, device) + + if self.safety_checker is not None: + cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_model_cpu_offload + def enable_model_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared + to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` + method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with + `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. 
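+
+        For reference, usage mirrors the `StableDiffusionPipeline` method this is copied from; an
+        illustrative sketch with a Stable Diffusion checkpoint (assumes a recent `accelerate` install):
+
+            import torch
+            from diffusers import StableDiffusionPipeline
+
+            pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
+            pipe.enable_model_cpu_offload()
+            image = pipe("a photo of an astronaut riding a horse on mars").images[0]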
+ """ + if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): + from accelerate import cpu_offload_with_hook + else: + raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + hook = None + for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]: + _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) + + if self.safety_checker is not None: + _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) + + # We'll offload the last model manually. + self.final_offload_hook = hook + + @property + def _execution_device(self): + r""" + Returns the device on which the pipeline's models will be executed. After calling + `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module + hooks. + """ + return self.device + + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + ): + batch_size = len(prompt) if isinstance(prompt, list) else 1 + # get prompt text embeddings + text_inputs = self.prior_tokenizer( + prompt, + padding="max_length", + max_length=self.prior_tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + text_mask = text_inputs.attention_mask.bool().to(device) + + untruncated_ids = self.prior_tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids): + removed_text = self.prior_tokenizer.batch_decode( + untruncated_ids[:, self.prior_tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.prior_tokenizer.model_max_length} tokens: {removed_text}" + ) + text_input_ids = text_input_ids[:, : self.prior_tokenizer.model_max_length] + + text_encoder_output = self.prior_text_encoder(text_input_ids.to(device)) + + prompt_embeds = text_encoder_output.text_embeds + text_encoder_hidden_states = text_encoder_output.last_hidden_state + + prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0) + text_encoder_hidden_states = text_encoder_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + text_mask = text_mask.repeat_interleave(num_images_per_prompt, dim=0) + + if do_classifier_free_guidance: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." 
+ ) + else: + uncond_tokens = negative_prompt + + uncond_input = self.prior_tokenizer( + uncond_tokens, + padding="max_length", + max_length=self.prior_tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + uncond_text_mask = uncond_input.attention_mask.bool().to(device) + negative_prompt_embeds_text_encoder_output = self.prior_text_encoder(uncond_input.input_ids.to(device)) + + negative_prompt_embeds = negative_prompt_embeds_text_encoder_output.text_embeds + uncond_text_encoder_hidden_states = negative_prompt_embeds_text_encoder_output.last_hidden_state + + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + + seq_len = negative_prompt_embeds.shape[1] + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len) + + seq_len = uncond_text_encoder_hidden_states.shape[1] + uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.repeat(1, num_images_per_prompt, 1) + uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.view( + batch_size * num_images_per_prompt, seq_len, -1 + ) + uncond_text_mask = uncond_text_mask.repeat_interleave(num_images_per_prompt, dim=0) + + # done duplicates + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + text_encoder_hidden_states = torch.cat([uncond_text_encoder_hidden_states, text_encoder_hidden_states]) + + text_mask = torch.cat([uncond_text_mask, text_mask]) + + return prompt_embeds, text_encoder_hidden_states, text_mask + + @torch.no_grad() + def __call__( + self, + prompt, + num_images_per_prompt: int = 1, + prior_num_inference_steps: int = 5, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + prior_latents: Optional[torch.FloatTensor] = None, + negative_prompt: Optional[Union[str, List[str]]] = None, + prior_guidance_scale: float = 4.0, + output_type: Optional[str] = "pt", + return_dict: bool = True, + ): + if isinstance(prompt, str): + batch_size = 1 + elif isinstance(prompt, list): + batch_size = len(prompt) + else: + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + device = self._execution_device + + batch_size = batch_size * num_images_per_prompt + + if prompt == "" or prompt[0] == "": + image_embeddings = self.create_zero_img_emb(batch_size=batch_size, device=device) + + else: + do_classifier_free_guidance = prior_guidance_scale > 1.0 + prompt_embeds, text_encoder_hidden_states, text_mask = self._encode_prompt( + prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt + ) + + # prior + self.prior_scheduler.set_timesteps(prior_num_inference_steps, device=device) + prior_timesteps_tensor = self.prior_scheduler.timesteps + + embedding_dim = self.prior.config.embedding_dim + + prior_latents = self.prepare_latents( + (batch_size, embedding_dim), + prompt_embeds.dtype, + device, + generator, + prior_latents, + self.prior_scheduler, + ) + + for i, t in enumerate(self.progress_bar(prior_timesteps_tensor)): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([prior_latents] * 2) if do_classifier_free_guidance else prior_latents + + predicted_image_embedding = self.prior( + latent_model_input, + 
timestep=t, + proj_embedding=prompt_embeds, + encoder_hidden_states=text_encoder_hidden_states, + attention_mask=text_mask, + ).predicted_image_embedding + + if do_classifier_free_guidance: + predicted_image_embedding_uncond, predicted_image_embedding_text = predicted_image_embedding.chunk( + 2 + ) + predicted_image_embedding = predicted_image_embedding_uncond + prior_guidance_scale * ( + predicted_image_embedding_text - predicted_image_embedding_uncond + ) + + if i + 1 == prior_timesteps_tensor.shape[0]: + prev_timestep = None + else: + prev_timestep = prior_timesteps_tensor[i + 1] + + prior_latents = self.prior_scheduler.step( + predicted_image_embedding, + timestep=t, + sample=prior_latents, + generator=generator, + prev_timestep=prev_timestep, + ).prev_sample + + prior_latents = self.prior.post_process_latents(prior_latents) + + image_embeddings = prior_latents + + return image_embeddings diff --git a/src/diffusers/pipelines/kandinsky/text_encoder.py b/src/diffusers/pipelines/kandinsky/text_encoder.py index cc22e8f76871..4906f95d387b 100644 --- a/src/diffusers/pipelines/kandinsky/text_encoder.py +++ b/src/diffusers/pipelines/kandinsky/text_encoder.py @@ -1,12 +1,11 @@ -from transformers import PreTrainedModel, PretrainedConfig, AutoModel import torch - +from transformers import AutoModel, PretrainedConfig, PreTrainedModel class MCLIPConfig(PretrainedConfig): model_type = "M-CLIP" - def __init__(self, modelBase='xlm-roberta-large', transformerDimSize=1024, imageDimSize=768, **kwargs): + def __init__(self, modelBase="xlm-roberta-large", transformerDimSize=1024, imageDimSize=768, **kwargs): self.transformerDimensions = transformerDimSize self.numDims = imageDimSize self.modelBase = modelBase @@ -19,17 +18,16 @@ class MultilingualCLIP(PreTrainedModel): def __init__(self, config, *args, **kwargs): super().__init__(config, *args, **kwargs) self.transformer = AutoModel.from_pretrained(config.modelBase, cache_dir=kwargs.get("cache_dir")) - self.LinearTransformation = torch.nn.Linear(in_features=config.transformerDimensions, - out_features=config.numDims) + self.LinearTransformation = torch.nn.Linear( + in_features=config.transformerDimensions, out_features=config.numDims + ) def forward(self, input_ids, attention_mask): embs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)[0] - embs2 = (embs * attention_mask.unsqueeze(2)).sum(dim=1) / attention_mask.sum( - dim=1 - )[:, None] + embs2 = (embs * attention_mask.unsqueeze(2)).sum(dim=1) / attention_mask.sum(dim=1)[:, None] return self.LinearTransformation(embs2), embs @classmethod def _load_state_dict_into_model(cls, model, state_dict, pretrained_model_name_or_path, _fast_init=True): model.load_state_dict(state_dict) - return model, [], [], [] \ No newline at end of file + return model, [], [], [] diff --git a/src/diffusers/pipelines/kandinsky/text_proj.py b/src/diffusers/pipelines/kandinsky/text_proj.py index fc485f47a632..ac8f712761c3 100644 --- a/src/diffusers/pipelines/kandinsky/text_proj.py +++ b/src/diffusers/pipelines/kandinsky/text_proj.py @@ -30,7 +30,7 @@ def __init__( self, *, clip_extra_context_tokens: int = 10, - clip_text_encoder_hidden_states_dim: int = 1024, + clip_text_encoder_hidden_states_dim: int = 1024, clip_embeddings_dim: int = 768, time_embed_dim: int = 1536, cross_attention_dim: int = 768, @@ -50,7 +50,6 @@ def __init__( self.encoder_hidden_states_proj = nn.Linear(clip_text_encoder_hidden_states_dim, cross_attention_dim) def forward(self, *, image_embeddings, prompt_embeds, text_encoder_hidden_states): 
- # The image embeddings batch size and the text embeddings batch size are equal assert image_embeddings.shape[0] == prompt_embeds.shape[0] == text_encoder_hidden_states.shape[0] @@ -62,10 +61,6 @@ def forward(self, *, image_embeddings, prompt_embeds, text_encoder_hidden_states time_projected_image_embeddings = self.clip_image_embeddings_project_to_time_embeddings(image_embeddings) additive_clip_time_embeddings = time_projected_image_embeddings + time_projected_prompt_embeds - # image_embeddings -> linear (2 x 7680) -> (2, 10, 768) - # text_encoder_hidden_states -> linear -> (2, 77, 768) - # (2, 87, 768) - # ... and by projecting CLIP embeddings into 10 # extra tokens of context that are concatenated to the sequence of outputs from the GLIDE text encoder" clip_extra_context_tokens = self.clip_extra_context_tokens_proj(image_embeddings) clip_extra_context_tokens = clip_extra_context_tokens.reshape(batch_size, self.clip_extra_context_tokens, -1) diff --git a/src/diffusers/schedulers/scheduling_unclip.py b/src/diffusers/schedulers/scheduling_unclip.py index 7a501d7c7743..4ee549f98830 100644 --- a/src/diffusers/schedulers/scheduling_unclip.py +++ b/src/diffusers/schedulers/scheduling_unclip.py @@ -120,12 +120,10 @@ def __init__( sample_min_value: Optional[float] = None, sample_max_value: Optional[float] = 1.0, prediction_type: str = "epsilon", - beta_schedule: str = "squaredcos_cap_v2", # "linear" + beta_schedule: str = "squaredcos_cap_v2", # "linear" beta_start: float = 0.0001, beta_end: float = 0.02, - ): - if beta_schedule == "squaredcos_cap_v2": self.betas = betas_for_alpha_bar(num_train_timesteps) elif beta_schedule == "linear": @@ -135,7 +133,6 @@ def __init__( else: raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") - self.alphas = 1.0 - self.betas self.alphas_cumprod = torch.cumprod(self.alphas, dim=0) self.one = torch.tensor(1.0) @@ -215,7 +212,6 @@ def _get_variance(self, t, prev_timestep=None, predicted_variance=None, variance # this is log variance variance = frac * max_log + (1 - frac) * min_log - return variance def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: @@ -241,7 +237,9 @@ def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: s = torch.quantile(abs_sample, self.config.dynamic_thresholding_ratio, dim=1) s = torch.clamp( - s, min=self.config.sample_min_value, max=self.config.sample_max_value, + s, + min=self.config.sample_min_value, + max=self.config.sample_max_value, ) # When clamped to min=1, equivalent to standard clipping to [-1, 1] s = s.unsqueeze(1) # (batch_size, 1) because clamp will broadcast along dim=0 @@ -317,7 +315,7 @@ def step( " for the UnCLIPScheduler." ) - # 3. Clip/threhold "predicted x_0" + # 3. Clip/threhold "predicted x_0" if self.config.clip_sample: pred_original_sample = torch.clamp( pred_original_sample, -self.config.clip_sample_range, self.config.clip_sample_range @@ -325,7 +323,7 @@ def step( if self.config.thresholding: pred_original_sample = self._threshold_sample(pred_original_sample) - + # 4. 
Compute coefficients for pred_original_sample x_0 and current sample x_t # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf pred_original_sample_coeff = (alpha_prod_t_prev ** (0.5) * beta) / beta_prod_t @@ -341,12 +339,17 @@ def step( if batch_size is not None: assert batch_size * 2 == model_output.shape[0] variance_noise = randn_tensor( - (batch_size, *model_output.shape[1:]) , dtype=model_output.dtype, generator=generator, device=model_output.device - ) + (batch_size, *model_output.shape[1:]), + dtype=model_output.dtype, + generator=generator, + device=model_output.device, + ) variance_noise = torch.cat([variance_noise, variance_noise], dim=0) else: - variance_noise = randn_tensor(model_output.shape , dtype=model_output.dtype, generator=generator, device=model_output.device) + variance_noise = randn_tensor( + model_output.shape, dtype=model_output.dtype, generator=generator, device=model_output.device + ) variance = self._get_variance( t, From 5a57f2398b4719a21daf36d5b96f1e02c4ce0525 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 10 May 2023 21:51:01 +0000 Subject: [PATCH 019/182] fix _execution_device for prior pipeline --- .../pipelines/kandinsky/pipeline_kandinsky_prior.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py index b494a5bb6cac..d8fd2b5bfc66 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py @@ -153,6 +153,15 @@ def _execution_device(self): `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module hooks. """ + if self.device != torch.device("meta") or not hasattr(self.decoder, "_hf_hook"): + return self.device + for module in self.decoder.modules(): + if ( + hasattr(module, "_hf_hook") + and hasattr(module._hf_hook, "execution_device") + and module._hf_hook.execution_device is not None + ): + return torch.device(module._hf_hook.execution_device) return self.device def _encode_prompt( From b90b1a9ad78d0513801c3d35a41520d8939e61b6 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 10 May 2023 21:51:50 +0000 Subject: [PATCH 020/182] make fix-copies --- .../dummy_torch_and_transformers_objects.py | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py index f3708107e82a..55d057ffc6c0 100644 --- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py +++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -152,6 +152,36 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "transformers"]) +class KandinskyPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class KandinskyPriorPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def 
from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + class LDMTextToImagePipeline(metaclass=DummyObject): _backends = ["torch", "transformers"] From d17736616f7c5c76833e4c1d9c8f8b184f04aaa0 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 10 May 2023 22:01:16 +0000 Subject: [PATCH 021/182] make style --- src/diffusers/pipelines/kandinsky/text_proj.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/text_proj.py b/src/diffusers/pipelines/kandinsky/text_proj.py index ac8f712761c3..ab985ebbd056 100644 --- a/src/diffusers/pipelines/kandinsky/text_proj.py +++ b/src/diffusers/pipelines/kandinsky/text_proj.py @@ -21,8 +21,8 @@ class KandinskyTextProjModel(ModelMixin, ConfigMixin): """ - Utility class for Kandingsky text embeddings. Used to combine the image and text embeddings into a format usable by the - unet diffusion model. + Utility class for Kandingsky text embeddings. Used to combine the image and text embeddings into a format usable by + the unet diffusion model. """ @register_to_config From a122d319d7436989cc09415d25b5f0dccbef2855 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 10 May 2023 22:21:10 +0000 Subject: [PATCH 022/182] fix some doc string --- src/diffusers/pipelines/kandinsky/__init__.py | 3 +-- .../pipelines/kandinsky/pipeline_kandinsky.py | 10 +++++----- .../pipelines/kandinsky/pipeline_kandinsky_prior.py | 8 ++------ 3 files changed, 8 insertions(+), 13 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/__init__.py b/src/diffusers/pipelines/kandinsky/__init__.py index 79a89cdc46e4..ced8028613c3 100644 --- a/src/diffusers/pipelines/kandinsky/__init__.py +++ b/src/diffusers/pipelines/kandinsky/__init__.py @@ -10,8 +10,7 @@ if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.25.0")): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - print("to-do") -# from ...utils.dummy_torch_and_transformers_objects import UnCLIPImageVariationPipeline, UnCLIPPipeline + from ...utils.dummy_torch_and_transformers_objects import KandinskyPriorPipeline, KandinskyPipeline else: from .pipeline_kandinsky import KandinskyPipeline from .pipeline_kandinsky_prior import KandinskyPriorPipeline diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py index 28682d231df2..dfd3e6e50002 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py @@ -47,16 +47,16 @@ def get_new_h_w(h, w): class KandinskyPipeline(DiffusionPipeline): """ - Pipeline for image based on text prompt and image prior for Kandinsky + Pipeline for text-to-image generation using Kandinsky This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) Args: - text_encoder: - to-add - tokenizer: - to-add + text_encoder ([`MultilingualCLIP`]): + Frozen text-encoder. + tokenizer ([`XLMRobertaTokenizerFast`]): + Tokenizer of class scheduler ([`UnCLIPScheduler`]): A scheduler to be used in combination with `unet` to generate image latents. 
unet ([`UNet2DConditionModel`]): diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py index d8fd2b5bfc66..9110015c41c9 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py @@ -41,19 +41,15 @@ class KandinskyPriorPipeline(DiffusionPipeline): Args: prior ([`PriorTransformer`]): The canonincal unCLIP prior to approximate the image embedding from the text embedding. + prior_image_encoder ([`CLIPVisionModelWithProjection`]): + Frozen image-encoder. prior_text_encoder ([`CLIPTextModelWithProjection`]): Frozen text-encoder. - image_encoder ([`CLIPVisionModelWithProjection`]): - Frozen image-encoder. prior_tokenizer (`CLIPTokenizer`): Tokenizer of class [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). prior_scheduler ([`UnCLIPScheduler`]): A scheduler to be used in combination with `prior` to generate image embedding. - multiclip ([`MultilingualCLIP`]): - A multilingual text encoder. - multiclip_tokenizer ([`XLMRobertaTokenizerFast`]): - Tokenizer for multiclip """ def __init__( From ce2ffb32ffcfe391b983e1655c1aa03e05afdcee Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 10 May 2023 22:22:13 +0000 Subject: [PATCH 023/182] style --- src/diffusers/pipelines/kandinsky/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/kandinsky/__init__.py b/src/diffusers/pipelines/kandinsky/__init__.py index ced8028613c3..bffc40ea1ae5 100644 --- a/src/diffusers/pipelines/kandinsky/__init__.py +++ b/src/diffusers/pipelines/kandinsky/__init__.py @@ -10,7 +10,7 @@ if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.25.0")): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ...utils.dummy_torch_and_transformers_objects import KandinskyPriorPipeline, KandinskyPipeline + from ...utils.dummy_torch_and_transformers_objects import KandinskyPipeline, KandinskyPriorPipeline else: from .pipeline_kandinsky import KandinskyPipeline from .pipeline_kandinsky_prior import KandinskyPriorPipeline From a7a4a5557078d99117b7f563465ebc07166d4681 Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Thu, 11 May 2023 11:15:50 -1000 Subject: [PATCH 024/182] Update src/diffusers/pipelines/kandinsky/__init__.py Co-authored-by: Patrick von Platen --- src/diffusers/pipelines/kandinsky/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/kandinsky/__init__.py b/src/diffusers/pipelines/kandinsky/__init__.py index bffc40ea1ae5..60fc387404ba 100644 --- a/src/diffusers/pipelines/kandinsky/__init__.py +++ b/src/diffusers/pipelines/kandinsky/__init__.py @@ -7,7 +7,7 @@ try: - if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.25.0")): + if not (is_transformers_available() and is_torch_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: from ...utils.dummy_torch_and_transformers_objects import KandinskyPipeline, KandinskyPriorPipeline From 7303cb520407be2b5ffb3b8e0c6aade1a772a24a Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Thu, 11 May 2023 11:16:18 -1000 Subject: [PATCH 025/182] Update src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py Co-authored-by: Patrick von Platen --- src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py index 9110015c41c9..9348ad535b96 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py @@ -41,7 +41,7 @@ class KandinskyPriorPipeline(DiffusionPipeline): Args: prior ([`PriorTransformer`]): The canonincal unCLIP prior to approximate the image embedding from the text embedding. - prior_image_encoder ([`CLIPVisionModelWithProjection`]): + image_encoder ([`CLIPVisionModelWithProjection`]): Frozen image-encoder. prior_text_encoder ([`CLIPTextModelWithProjection`]): Frozen text-encoder. From dc219b894bb43454c2ff9f5db95aba6b28a3ccb0 Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Thu, 11 May 2023 11:16:28 -1000 Subject: [PATCH 026/182] Update src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py Co-authored-by: Patrick von Platen --- src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py index 9348ad535b96..80f9a677753c 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py @@ -43,7 +43,7 @@ class KandinskyPriorPipeline(DiffusionPipeline): The canonincal unCLIP prior to approximate the image embedding from the text embedding. image_encoder ([`CLIPVisionModelWithProjection`]): Frozen image-encoder. - prior_text_encoder ([`CLIPTextModelWithProjection`]): + text_encoder ([`CLIPTextModelWithProjection`]): Frozen text-encoder. prior_tokenizer (`CLIPTokenizer`): Tokenizer of class From 55adf823157eef0f7dbfd70e3cd056908d9ae533 Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Thu, 11 May 2023 11:16:37 -1000 Subject: [PATCH 027/182] Update src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py Co-authored-by: Patrick von Platen --- src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py index 80f9a677753c..ffae9355263d 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py @@ -45,7 +45,7 @@ class KandinskyPriorPipeline(DiffusionPipeline): Frozen image-encoder. text_encoder ([`CLIPTextModelWithProjection`]): Frozen text-encoder. - prior_tokenizer (`CLIPTokenizer`): + tokenizer (`CLIPTokenizer`): Tokenizer of class [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). 
prior_scheduler ([`UnCLIPScheduler`]): From 54cf75214118f3c116a19cf1c53852f46c307faa Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Thu, 11 May 2023 11:16:44 -1000 Subject: [PATCH 028/182] Update src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py Co-authored-by: Patrick von Platen --- src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py index ffae9355263d..8f377232d0ff 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py @@ -48,7 +48,7 @@ class KandinskyPriorPipeline(DiffusionPipeline): tokenizer (`CLIPTokenizer`): Tokenizer of class [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - prior_scheduler ([`UnCLIPScheduler`]): + scheduler ([`UnCLIPScheduler`]): A scheduler to be used in combination with `prior` to generate image embedding. """ From d63b210f7361bc96bb33e3eee2fde1aededb02e9 Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Thu, 11 May 2023 11:17:11 -1000 Subject: [PATCH 029/182] Update src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py Co-authored-by: Patrick von Platen --- .../pipelines/kandinsky/pipeline_kandinsky_prior.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py index 8f377232d0ff..b359f50341e0 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py @@ -55,10 +55,10 @@ class KandinskyPriorPipeline(DiffusionPipeline): def __init__( self, prior: PriorTransformer, - prior_image_encoder: CLIPVisionModelWithProjection, - prior_text_encoder: CLIPTextModelWithProjection, - prior_tokenizer: CLIPTokenizer, - prior_scheduler: UnCLIPScheduler, + image_encoder: CLIPVisionModelWithProjection, + text_encoder: CLIPTextModelWithProjection, + tokenizer: CLIPTokenizer, + scheduler: UnCLIPScheduler, ): super().__init__() From e956043112b50dc1f48776d1d316c27faa6764c3 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 11 May 2023 22:07:57 +0000 Subject: [PATCH 030/182] fix the module names --- .../pipelines/kandinsky/pipeline_kandinsky.py | 1 - .../kandinsky/pipeline_kandinsky_prior.py | 40 +++++++++---------- 2 files changed, 20 insertions(+), 21 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py index dfd3e6e50002..1399d867cc48 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py @@ -304,7 +304,6 @@ def __call__( prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt ) - # TO_DO [2] add a step to create negative_image_embeds https://github.com/ai-forever/Kandinsky-2/blob/main/kandinsky2/kandinsky2_1_model.py#L322 image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to(device) text_encoder_hidden_states, additive_clip_time_embeddings = self.text_proj( diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py index b359f50341e0..8ef7b369d096 100644 --- 
a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py @@ -64,10 +64,10 @@ def __init__( self.register_modules( prior=prior, - prior_text_encoder=prior_text_encoder, - prior_tokenizer=prior_tokenizer, - prior_scheduler=prior_scheduler, - prior_image_encoder=prior_image_encoder, + text_encoder=text_encoder, + tokenizer=tokenizer, + scheduler=scheduler, + image_encoder=image_encoder, ) def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): @@ -83,7 +83,7 @@ def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): def create_zero_img_emb(self, batch_size, device): zero_img = torch.zeros(1, 3, 224, 224).to(device=device) - zero_image_emb = self.prior_image_encoder(zero_img)["image_embeds"] + zero_image_emb = self.image_encoder(zero_img)["image_embeds"] zero_image_emb = zero_image_emb.repeat(batch_size, 1) return zero_image_emb @@ -170,29 +170,29 @@ def _encode_prompt( ): batch_size = len(prompt) if isinstance(prompt, list) else 1 # get prompt text embeddings - text_inputs = self.prior_tokenizer( + text_inputs = self.tokenizer( prompt, padding="max_length", - max_length=self.prior_tokenizer.model_max_length, + max_length=self.tokenizer.model_max_length, truncation=True, return_tensors="pt", ) text_input_ids = text_inputs.input_ids text_mask = text_inputs.attention_mask.bool().to(device) - untruncated_ids = self.prior_tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids): - removed_text = self.prior_tokenizer.batch_decode( - untruncated_ids[:, self.prior_tokenizer.model_max_length - 1 : -1] + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] ) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.prior_tokenizer.model_max_length} tokens: {removed_text}" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" ) - text_input_ids = text_input_ids[:, : self.prior_tokenizer.model_max_length] + text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length] - text_encoder_output = self.prior_text_encoder(text_input_ids.to(device)) + text_encoder_output = self.text_encoder(text_input_ids.to(device)) prompt_embeds = text_encoder_output.text_embeds text_encoder_hidden_states = text_encoder_output.last_hidden_state @@ -221,15 +221,15 @@ def _encode_prompt( else: uncond_tokens = negative_prompt - uncond_input = self.prior_tokenizer( + uncond_input = self.tokenizer( uncond_tokens, padding="max_length", - max_length=self.prior_tokenizer.model_max_length, + max_length=self.tokenizer.model_max_length, truncation=True, return_tensors="pt", ) uncond_text_mask = uncond_input.attention_mask.bool().to(device) - negative_prompt_embeds_text_encoder_output = self.prior_text_encoder(uncond_input.input_ids.to(device)) + negative_prompt_embeds_text_encoder_output = self.text_encoder(uncond_input.input_ids.to(device)) negative_prompt_embeds = negative_prompt_embeds_text_encoder_output.text_embeds uncond_text_encoder_hidden_states = negative_prompt_embeds_text_encoder_output.last_hidden_state @@ -293,8 +293,8 @@ def __call__( ) # prior - self.prior_scheduler.set_timesteps(prior_num_inference_steps, device=device) - prior_timesteps_tensor = 
self.prior_scheduler.timesteps + self.scheduler.set_timesteps(prior_num_inference_steps, device=device) + prior_timesteps_tensor = self.scheduler.timesteps embedding_dim = self.prior.config.embedding_dim @@ -304,7 +304,7 @@ def __call__( device, generator, prior_latents, - self.prior_scheduler, + self.scheduler, ) for i, t in enumerate(self.progress_bar(prior_timesteps_tensor)): @@ -332,7 +332,7 @@ def __call__( else: prev_timestep = prior_timesteps_tensor[i + 1] - prior_latents = self.prior_scheduler.step( + prior_latents = self.scheduler.step( predicted_image_embedding, timestep=t, sample=prior_latents, From bfa172babae9e10a99cf7face56e45e1269b9376 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Fri, 12 May 2023 02:09:28 +0000 Subject: [PATCH 031/182] fix height and width arg --- src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py index 1399d867cc48..badc96f17060 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py @@ -45,7 +45,7 @@ def get_new_h_w(h, w): return new_h * 8, new_w * 8 -class KandinskyPipeline(DiffusionPipeline): +class KandinskyInpaintPipeline(DiffusionPipeline): """ Pipeline for text-to-image generation using Kandinsky @@ -274,8 +274,8 @@ def _execution_device(self): def __call__( self, prompt: Union[str, List[str]], - height: Optional[int] = None, - width: Optional[int] = None, + height: int = 512, + width: int = 512, num_inference_steps: int = 100, guidance_scale: float = 4.0, num_images_per_prompt: int = 1, @@ -317,8 +317,6 @@ def __call__( num_channels_latents = self.unet.config.in_channels - height = height or self.unet.config.sample_size - width = width or self.unet.config.sample_size height, width = get_new_h_w(height, width) # create initial latent From 05687061096b72315d77be1221bff88ee8988bac Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Fri, 12 May 2023 05:23:47 +0000 Subject: [PATCH 032/182] add inpaint --- .../kandinsky/pipeline_kandinsky_inpaint.py | 446 ++++++++++++++++++ 1 file changed, 446 insertions(+) create mode 100644 src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py new file mode 100644 index 000000000000..7218103c06a5 --- /dev/null +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py @@ -0,0 +1,446 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
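For reference, the `height, width = get_new_h_w(height, width)` call kept above rounds the requested pixel size up to a multiple of 64 and returns the latent resolution the UNet operates on. A standalone sketch of that rounding, assuming the same helper definition used in the Kandinsky pipelines:

def get_new_h_w(h, w):
    # round the pixel size up to a multiple of 64, then divide by 8 (latent scale)
    new_h = h // 64 + (1 if h % 64 != 0 else 0)
    new_w = w // 64 + (1 if w % 64 != 0 else 0)
    return new_h * 8, new_w * 8

assert get_new_h_w(512, 512) == (64, 64)  # the new 512 px default -> 64x64 latent grid
assert get_new_h_w(768, 520) == (96, 72)  # 520 px rounds up to 576 px -> 72 latents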
+ +from typing import List, Optional, Union + +import torch +from transformers import ( + XLMRobertaTokenizerFast, +) + +from ...models import UNet2DConditionModel +from ...pipelines import DiffusionPipeline +from ...schedulers import UnCLIPScheduler +from ...utils import ( + is_accelerate_available, + is_accelerate_version, + logging, + randn_tensor, +) +from .text_encoder import MultilingualCLIP +from .text_proj import KandinskyTextProjModel +from copy import deepcopy + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +def get_new_h_w(h, w): + new_h = h // 64 + if h % 64 != 0: + new_h += 1 + new_w = w // 64 + if w % 64 != 0: + new_w += 1 + return new_h * 8, new_w * 8 + +def prepare_mask(mask): + mask = mask.float()[0] + old_mask = deepcopy(mask) + for i in range(mask.shape[1]): + for j in range(mask.shape[2]): + if old_mask[0][i][j] == 1: + continue + if i != 0: + mask[:, i - 1, j] = 0 + if j != 0: + mask[:, i, j - 1] = 0 + if i != 0 and j != 0: + mask[:, i - 1, j - 1] = 0 + if i != mask.shape[1] - 1: + mask[:, i + 1, j] = 0 + if j != mask.shape[2] - 1: + mask[:, i, j + 1] = 0 + if i != mask.shape[1] - 1 and j != mask.shape[2] - 1: + mask[:, i + 1, j + 1] = 0 + return mask.unsqueeze(0) + + +def prepare_image(pil_image, w=512, h=512): + pil_image = pil_image.resize((w, h), resample=Image.BICUBIC, reducing_gap=1) + arr = np.array(pil_image.convert("RGB")) + arr = arr.astype(np.float32) / 127.5 - 1 + arr = np.transpose(arr, [2, 0, 1]) + image = torch.from_numpy(arr).unsqueeze(0) + return image + + +class KandinskyPipeline(DiffusionPipeline): + """ + Pipeline for text-to-image generation using Kandinsky + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + text_encoder ([`MultilingualCLIP`]): + Frozen text-encoder. + tokenizer ([`XLMRobertaTokenizerFast`]): + Tokenizer of class + scheduler ([`UnCLIPScheduler`]): + A scheduler to be used in combination with `unet` to generate image latents. + unet ([`UNet2DConditionModel`]): + Conditional U-Net architecture to denoise the image embedding. + text_proj ([`KandinskyTextProjModel`]): + Utility class to prepare and combine the embeddings before they are passed to the decoder. 
+ """ + + def __init__( + self, + text_encoder: MultilingualCLIP, + #image_encoder: MOVQ # TO_DO add this later + tokenizer: XLMRobertaTokenizerFast, + text_proj: KandinskyTextProjModel, + unet: UNet2DConditionModel, + scheduler: UnCLIPScheduler, + ): + super().__init__() + + self.register_modules( + text_encoder=text_encoder, + tokenizer=tokenizer, + text_proj=text_proj, + unet=unet, + scheduler=scheduler, + ) + + def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + if latents.shape != shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") + latents = latents.to(device) + + latents = latents * scheduler.init_noise_sigma + return latents + + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + ): + batch_size = len(prompt) if isinstance(prompt, list) else 1 + # get prompt text embeddings + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_attention_mask=True, + add_special_tokens=True, + return_tensors="pt", + ) + + text_input_ids = text_inputs.input_ids.to(device) + text_mask = text_inputs.attention_mask.to(device) + + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length] + + prompt_embeds, text_encoder_hidden_states = self.text_encoder( + input_ids=text_input_ids, attention_mask=text_mask + ) + + prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0) + text_encoder_hidden_states = text_encoder_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + text_mask = text_mask.repeat_interleave(num_images_per_prompt, dim=0) + + if do_classifier_free_guidance: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." 
+ ) + else: + uncond_tokens = negative_prompt + + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_attention_mask=True, + add_special_tokens=True, + return_tensors="pt", + ) + uncond_text_input_ids = uncond_input.input_ids.to(device) + uncond_text_mask = uncond_input.attention_mask.to(device) + + negative_prompt_embeds, uncond_text_encoder_hidden_states = self.text_encoder( + input_ids=uncond_text_input_ids, attention_mask=uncond_text_mask + ) + + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + + seq_len = negative_prompt_embeds.shape[1] + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len) + + seq_len = uncond_text_encoder_hidden_states.shape[1] + uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.repeat(1, num_images_per_prompt, 1) + uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.view( + batch_size * num_images_per_prompt, seq_len, -1 + ) + uncond_text_mask = uncond_text_mask.repeat_interleave(num_images_per_prompt, dim=0) + + # done duplicates + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + text_encoder_hidden_states = torch.cat([uncond_text_encoder_hidden_states, text_encoder_hidden_states]) + + text_mask = torch.cat([uncond_text_mask, text_mask]) + + return prompt_embeds, text_encoder_hidden_states, text_mask + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_sequential_cpu_offload + def enable_sequential_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet, + text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a + `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called. + Note that offloading happens on a submodule basis. Memory savings are higher than with + `enable_model_cpu_offload`, but performance is lower. + """ + if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"): + from accelerate import cpu_offload + else: + raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]: + cpu_offload(cpu_offloaded_model, device) + + if self.safety_checker is not None: + cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_model_cpu_offload + def enable_model_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. 
Compared + to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` + method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with + `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. + """ + if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): + from accelerate import cpu_offload_with_hook + else: + raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + hook = None + for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]: + _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) + + if self.safety_checker is not None: + _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) + + # We'll offload the last model manually. + self.final_offload_hook = hook + + @property + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device + def _execution_device(self): + r""" + Returns the device on which the pipeline's models will be executed. After calling + `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module + hooks. + """ + if not hasattr(self.unet, "_hf_hook"): + return self.device + for module in self.unet.modules(): + if ( + hasattr(module, "_hf_hook") + and hasattr(module._hf_hook, "execution_device") + and module._hf_hook.execution_device is not None + ): + return torch.device(module._hf_hook.execution_device) + return self.device + + @torch.no_grad() + def __call__( + self, + prompt: Union[str, List[str]], + image: Union[torch.FloatTensor, PIL.Image.Image] = None, + mask_image: Union[torch.FloatTensor, PIL.Image.Image] = None, + height: int = 512, + width: int = 512, + num_inference_steps: int = 100, + guidance_scale: float = 4.0, + num_images_per_prompt: int = 1, + negative_prompt: Optional[Union[str, List[str]]] = None, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + image_embeds: Optional[torch.FloatTensor] = None, + negative_image_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + ): + # Define call parameters + if isinstance(prompt, str): + batch_size = 1 + elif isinstance(prompt, list): + batch_size = len(prompt) + else: + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + device = self._execution_device + + batch_size = batch_size * num_images_per_prompt + + do_classifier_free_guidance = guidance_scale > 1.0 + + prompt_embeds, text_encoder_hidden_states, _ = self._encode_prompt( + prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt + ) + + image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to(device) + + text_encoder_hidden_states, additive_clip_time_embeddings = self.text_proj( + image_embeddings=image_embeds, + prompt_embeds=prompt_embeds, + text_encoder_hidden_states=text_encoder_hidden_states, + ) + + # preprocess image and mask + ## Encode the image + image = prepare_image(image, width, height).to(device) + image = 
self.image_encoder.encode(image) + + ## prepared mask + mask_image = torch.from_numpy(mask_image).unsqueeze(0).unsqueeze(0) + image_shape = tuple(image.shape[-2:]) + mask_image = F.interpolate( + mask_image, + image_shape, + mode="nearest", + ) + mask_image = prepare_mask(mask_image).to(device) + + ## apply mask on image + masked_image = image * mask_image + + if do_classifier_free_guidance: + mask_image = mask_image.repeat(2, 1, 1, 1) + masked_image = masked_image.repeat(2, 1, 1, 1) + + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps_tensor = self.scheduler.timesteps + + # YiYi's TO-DO: hard-code to be 4, need to set it to be the z_channels in MoVQ encoder's config once it's added + num_channels_latents = 4 + #num_channels_latents = self.image_encoder.config.z_channels + + # get h, w for latents + height, width = get_new_h_w(height, width) + + # create initial latent + latents = self.prepare_latents( + (batch_size, num_channels_latents, height, width), + text_encoder_hidden_states.dtype, + device, + generator, + latents, + self.scheduler, + ) + + # Check that sizes of mask, masked image and latents match with expected + num_channels_mask = mask_image.shape[1] + num_channels_masked_image = masked_image.shape[1] + if num_channels_latents + num_channels_mask + num_channels_masked_image != self.unet.config.in_channels: + raise ValueError( + f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects" + f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +" + f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}" + f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of" + " `pipeline.unet` or your `mask_image` or `image` input." + ) + + # expand the latents if we are doing classifier free guidance + latents = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + + + for i, t in enumerate(self.progress_bar(timesteps_tensor)): + latent_model_input = + noise_pred = self.unet( + sample=latents, # [2, 4, 96, 96] + timestep=t, + encoder_hidden_states=text_encoder_hidden_states, + class_labels=additive_clip_time_embeddings, + ).sample + + # YiYi Notes: CFG is currently implemented exactly as original repo as a baseline, + # i.e. 
we apply cfg to predicted noise, and take predicted variance as it is (uncond + cond) + # this means the our latent shape is batch_size *2 instad batch_size + + if do_classifier_free_guidance: + noise_pred, variance_pred = noise_pred.split(latents.shape[1], dim=1) + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + variance_pred_uncond, variance_pred_text = variance_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = torch.cat([noise_pred] * 2) + variance_pred = torch.cat([variance_pred_uncond, variance_pred_text]) + noise_pred = torch.cat([noise_pred, variance_pred], dim=1) + + if i + 1 == timesteps_tensor.shape[0]: + prev_timestep = None + else: + prev_timestep = timesteps_tensor[i + 1] + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step( + noise_pred, + t, + latents, + prev_timestep=prev_timestep, + generator=generator, + batch_size=batch_size, + ).prev_sample + + _, latents = latents.chunk(2) + + return latents From 323aee64dd4184ad75fa594231f345a7190c3e5e Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Fri, 12 May 2023 05:34:47 +0000 Subject: [PATCH 033/182] refactor text2imge pipeline --- .../pipelines/kandinsky/pipeline_kandinsky.py | 11 ++++++----- .../pipelines/kandinsky/pipeline_kandinsky_inpaint.py | 2 +- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py index badc96f17060..d988f38506ea 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py @@ -45,7 +45,7 @@ def get_new_h_w(h, w): return new_h * 8, new_w * 8 -class KandinskyInpaintPipeline(DiffusionPipeline): +class KandinskyPipeline(DiffusionPipeline): """ Pipeline for text-to-image generation using Kandinsky @@ -329,12 +329,13 @@ def __call__( self.scheduler, ) - # expand the latents if we are doing classifier free guidance - latents = torch.cat([latents] * 2) if do_classifier_free_guidance else latents for i, t in enumerate(self.progress_bar(timesteps_tensor)): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + noise_pred = self.unet( - sample=latents, # [2, 4, 96, 96] + sample=latent_model_input, # [2, 4, 96, 96] timestep=t, encoder_hidden_states=text_encoder_hidden_states, class_labels=additive_clip_time_embeddings, @@ -368,6 +369,6 @@ def __call__( batch_size=batch_size, ).prev_sample - _, latents = latents.chunk(2) + _, latents = latents.chunk(2) return latents diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py index 7218103c06a5..1dc00c61bb0d 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py @@ -75,7 +75,7 @@ def prepare_image(pil_image, w=512, h=512): return image -class KandinskyPipeline(DiffusionPipeline): +class KandinskyInpaintPipeline(DiffusionPipeline): """ Pipeline for text-to-image generation using Kandinsky From faa08f944db8a1d04611e593ca0731c2897bcff4 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Fri, 12 May 2023 05:42:57 +0000 Subject: [PATCH 034/182] add a note about batch_size argument --- src/diffusers/schedulers/scheduling_unclip.py | 1 + 1 file changed, 1 insertion(+) diff --git 
a/src/diffusers/schedulers/scheduling_unclip.py b/src/diffusers/schedulers/scheduling_unclip.py index 4ee549f98830..009bbff398a4 100644 --- a/src/diffusers/schedulers/scheduling_unclip.py +++ b/src/diffusers/schedulers/scheduling_unclip.py @@ -258,6 +258,7 @@ def step( prev_timestep: Optional[int] = None, generator=None, return_dict: bool = True, + # YiYi's TO-DO: batch_size argument for testing, remove this later batch_size: Optional[int] = None, ) -> Union[UnCLIPSchedulerOutput, Tuple]: """ From 28eb8168cebdd34ae5dc1afa03ce559deb22c87a Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Fri, 12 May 2023 08:18:09 +0000 Subject: [PATCH 035/182] finish inpaint pipeline first draft --- scripts/convert_kandinsky_to_diffusers.py | 156 +++++++++++++++++- src/diffusers/__init__.py | 1 + src/diffusers/pipelines/__init__.py | 2 +- src/diffusers/pipelines/kandinsky/__init__.py | 1 + .../kandinsky/pipeline_kandinsky_inpaint.py | 19 ++- 5 files changed, 168 insertions(+), 11 deletions(-) diff --git a/scripts/convert_kandinsky_to_diffusers.py b/scripts/convert_kandinsky_to_diffusers.py index 107f4a7eda8d..7e22f0559619 100644 --- a/scripts/convert_kandinsky_to_diffusers.py +++ b/scripts/convert_kandinsky_to_diffusers.py @@ -23,8 +23,9 @@ --prior_checkpoint_path /home/yiyi_huggingface_co/Kandinsky-2/checkpoints_Kandinsky_2.1/prior_fp16.ckpt \ --clip_stat_path /home/yiyi_huggingface_co/Kandinsky-2/checkpoints_Kandinsky_2.1/ViT-L-14_stats.th \ --text2img_checkpoint_path /home/yiyi_huggingface_co/Kandinsky-2/checkpoints_Kandinsky_2.1/decoder_fp16.ckpt \ - --dump_path ./kandinsky_model \ - --debug text2img + --inpaint_text2img_checkpoint_path /home/yiyi_huggingface_co/Kandinsky-2/checkpoints_Kandinsky_2.1/inpainting_fp16.ckpt \ + --dump_path /home/yiyi_huggingface_co/model_repo/Kandinsky-inpaint \ + --debug inpaint_text2img ``` """ @@ -256,7 +257,6 @@ def prior_ff_to_diffusers(checkpoint, *, diffusers_ff_prefix, original_ff_prefix "use_linear_projection": False, } - def unet_model_from_original_config(): model = UNet2DConditionModel(**UNET_CONFIG) @@ -325,6 +325,116 @@ def unet_original_checkpoint_to_diffusers_checkpoint(model, checkpoint): # done unet +# inpaint unet + +# We are hardcoding the model configuration for now. If we need to generalize to more model configurations, we can +# update then. 
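As a quick sanity check on the hard-coded inpaint UNet config that follows, the channel bookkeeping works out as below (assuming the 4-channel MoVQ latent space the pipeline hard-codes for now): the inpaint UNet sees the noisy latents, the encoded masked image and the mask concatenated on the channel axis, and predicts noise plus a variance term.

num_channels_latents = 4       # noisy MoVQ latents
num_channels_masked_image = 4  # MoVQ encoding of image * mask
num_channels_mask = 1          # binary mask resized to the latent resolution
assert num_channels_latents + num_channels_masked_image + num_channels_mask == 9  # in_channels below
assert 2 * num_channels_latents == 8  # out_channels below: predicted noise + predicted variance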
+ +INPAINT_UNET_CONFIG = { + "act_fn": "silu", + "attention_head_dim": 64, + "block_out_channels": (384, 768, 1152, 1536), + "center_input_sample": False, + "class_embed_type": "identity", + "cross_attention_dim": 768, + "down_block_types": ( + "ResnetDownsampleBlock2D", + "SimpleCrossAttnDownBlock2D", + "SimpleCrossAttnDownBlock2D", + "SimpleCrossAttnDownBlock2D", + ), + "downsample_padding": 1, + "dual_cross_attention": False, + "flip_sin_to_cos": True, + "freq_shift": 0, + "in_channels": 9, + "layers_per_block": 3, + "mid_block_scale_factor": 1, + "mid_block_type": "UNetMidBlock2DSimpleCrossAttn", + "norm_eps": 1e-05, + "norm_num_groups": 32, + "only_cross_attention": False, + "out_channels": 8, + "resnet_time_scale_shift": "scale_shift", + "sample_size": 64, + "up_block_types": ( + "SimpleCrossAttnUpBlock2D", + "SimpleCrossAttnUpBlock2D", + "SimpleCrossAttnUpBlock2D", + "ResnetUpsampleBlock2D", + ), + "upcast_attention": False, + "use_linear_projection": False, +} + +def inpaint_unet_model_from_original_config(): + model = UNet2DConditionModel(**INPAINT_UNET_CONFIG) + + return model + + +def inpaint_unet_original_checkpoint_to_diffusers_checkpoint(model, checkpoint): + diffusers_checkpoint = {} + + num_head_channels = UNET_CONFIG["attention_head_dim"] + + diffusers_checkpoint.update(unet_time_embeddings(checkpoint)) + diffusers_checkpoint.update(unet_conv_in(checkpoint)) + + # .input_blocks -> .down_blocks + + original_down_block_idx = 1 + + for diffusers_down_block_idx in range(len(model.down_blocks)): + checkpoint_update, num_original_down_blocks = unet_downblock_to_diffusers_checkpoint( + model, + checkpoint, + diffusers_down_block_idx=diffusers_down_block_idx, + original_down_block_idx=original_down_block_idx, + num_head_channels=num_head_channels, + ) + + original_down_block_idx += num_original_down_blocks + + diffusers_checkpoint.update(checkpoint_update) + + # done .input_blocks -> .down_blocks + + diffusers_checkpoint.update( + unet_midblock_to_diffusers_checkpoint( + model, + checkpoint, + num_head_channels=num_head_channels, + ) + ) + + # .output_blocks -> .up_blocks + + original_up_block_idx = 0 + + for diffusers_up_block_idx in range(len(model.up_blocks)): + checkpoint_update, num_original_up_blocks = unet_upblock_to_diffusers_checkpoint( + model, + checkpoint, + diffusers_up_block_idx=diffusers_up_block_idx, + original_up_block_idx=original_up_block_idx, + num_head_channels=num_head_channels, + ) + + original_up_block_idx += num_original_up_blocks + + diffusers_checkpoint.update(checkpoint_update) + + # done .output_blocks -> .up_blocks + + diffusers_checkpoint.update(unet_conv_norm_out(checkpoint)) + diffusers_checkpoint.update(unet_conv_out(checkpoint)) + + return diffusers_checkpoint + + +# done inpaint unet + # text proj TEXT_PROJ_CONFIG = {} @@ -762,6 +872,35 @@ def text2img(*, args, checkpoint_map_location): return unet_model, text_proj_model +def inpaint_text2img(*, args, checkpoint_map_location): + print("loading inpaint text2img") + + inpaint_text2img_checkpoint = torch.load(args.inpaint_text2img_checkpoint_path, map_location=checkpoint_map_location) + + inpaint_unet_model = inpaint_unet_model_from_original_config() + + inpaint_unet_diffusers_checkpoint = inpaint_unet_original_checkpoint_to_diffusers_checkpoint(inpaint_unet_model, inpaint_text2img_checkpoint) + + # text proj interlude + + # The original decoder implementation includes a set of parameters that are used + # for creating the `encoder_hidden_states` which are what the U-net is conditioned + # on. 
The diffusers conditional unet directly takes the encoder_hidden_states. We pull + # the parameters into the KandinskyTextProjModel class + text_proj_model = text_proj_from_original_config() + + text_proj_checkpoint = text_proj_original_checkpoint_to_diffusers_checkpoint(inpaint_text2img_checkpoint) + + load_checkpoint_to_model(text_proj_checkpoint, text_proj_model, strict=True) + + del inpaint_text2img_checkpoint + + load_checkpoint_to_model(inpaint_unet_diffusers_checkpoint, inpaint_unet_model, strict=True) + + print("done loading inpaint text2img") + + return inpaint_unet_model, text_proj_model + def load_checkpoint_to_model(checkpoint, model, strict=False): with tempfile.NamedTemporaryFile() as file: @@ -795,6 +934,13 @@ def load_checkpoint_to_model(checkpoint, model, strict=False): required=True, help="Path to the text2img checkpoint to convert.", ) + parser.add_argument( + "--inpaint_text2img_checkpoint_path", + default=None, + type=str, + required=True, + help="Path to the inpaint text2img checkpoint to convert.", + ) parser.add_argument( "--checkpoint_load_device", default="cpu", @@ -829,5 +975,9 @@ def load_checkpoint_to_model(checkpoint, model, strict=False): unet_model, text_proj_model = text2img(args=args, checkpoint_map_location=checkpoint_map_location) unet_model.save_pretrained(f"{args.dump_path}/unet") text_proj_model.save_pretrained(f"{args.dump_path}/text_proj") + elif args.debug == "inpaint_text2img": + inpaint_unet_model, inpaint_text_proj_model = inpaint_text2img(args=args, checkpoint_map_location=checkpoint_map_location) + inpaint_unet_model.save_pretrained(f"{args.dump_path}/inpaint_unet") + inpaint_text_proj_model.save_pretrained(f"{args.dump_path}/inpaint_text_proj") else: raise ValueError(f"unknown debug value : {args.debug}") diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 265c263881ca..b4551d0e2b7d 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -130,6 +130,7 @@ IFSuperResolutionPipeline, KandinskyPipeline, KandinskyPriorPipeline, + KandinskyInpaintPipeline, LDMTextToImagePipeline, PaintByExamplePipeline, SemanticStableDiffusionPipeline, diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index da4733b07e1c..eaea3a89c6ce 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -52,7 +52,7 @@ IFPipeline, IFSuperResolutionPipeline, ) - from .kandinsky import KandinskyPipeline, KandinskyPriorPipeline + from .kandinsky import KandinskyPipeline, KandinskyPriorPipeline, KandinskyInpaintPipeline from .latent_diffusion import LDMTextToImagePipeline from .paint_by_example import PaintByExamplePipeline from .semantic_stable_diffusion import SemanticStableDiffusionPipeline diff --git a/src/diffusers/pipelines/kandinsky/__init__.py b/src/diffusers/pipelines/kandinsky/__init__.py index 60fc387404ba..93731dbdbf3d 100644 --- a/src/diffusers/pipelines/kandinsky/__init__.py +++ b/src/diffusers/pipelines/kandinsky/__init__.py @@ -14,5 +14,6 @@ else: from .pipeline_kandinsky import KandinskyPipeline from .pipeline_kandinsky_prior import KandinskyPriorPipeline + from .pipeline_kandinsky_inpaint import KandinskyInpaintPipeline from .text_encoder import MultilingualCLIP from .text_proj import KandinskyTextProjModel diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py index 1dc00c61bb0d..46bf3775d862 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py 
+++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py @@ -14,7 +14,13 @@ from typing import List, Optional, Union +import PIL +from PIL import Image + +import numpy as np import torch +import torch.nn.functional as F + from transformers import ( XLMRobertaTokenizerFast, ) @@ -400,14 +406,13 @@ def __call__( " `pipeline.unet` or your `mask_image` or `image` input." ) - # expand the latents if we are doing classifier free guidance - latents = torch.cat([latents] * 2) if do_classifier_free_guidance else latents - - for i, t in enumerate(self.progress_bar(timesteps_tensor)): - latent_model_input = + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = torch.cat([latent_model_input, masked_image, mask_image], dim=1) + noise_pred = self.unet( - sample=latents, # [2, 4, 96, 96] + sample=latent_model_input, # [2, 9, 96, 96] timestep=t, encoder_hidden_states=text_encoder_hidden_states, class_labels=additive_clip_time_embeddings, @@ -441,6 +446,6 @@ def __call__( batch_size=batch_size, ).prev_sample - _, latents = latents.chunk(2) + _, latents = latents.chunk(2) return latents From 0272da90d90bf4461803b12a79b1b7961c5d8e38 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Fri, 12 May 2023 08:38:46 +0000 Subject: [PATCH 036/182] add post-processing to inpaint --- .../kandinsky/pipeline_kandinsky_inpaint.py | 20 ++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py index 46bf3775d862..b41d087ee184 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py @@ -27,6 +27,7 @@ from ...models import UNet2DConditionModel from ...pipelines import DiffusionPipeline +from ...pipelines.pipeline_utils import ImagePipelineOutput from ...schedulers import UnCLIPScheduler from ...utils import ( is_accelerate_available, @@ -382,11 +383,11 @@ def __call__( #num_channels_latents = self.image_encoder.config.z_channels # get h, w for latents - height, width = get_new_h_w(height, width) + sample_height, sample_width = get_new_h_w(height, width) # create initial latent latents = self.prepare_latents( - (batch_size, num_channels_latents, height, width), + (batch_size, num_channels_latents, sample_height, sample_width), text_encoder_hidden_states.dtype, device, generator, @@ -448,4 +449,17 @@ def __call__( _, latents = latents.chunk(2) - return latents + # post-processing + image = self.image_encoder.decode(latents) + + image = image * 0.5 + 0.5 + image = image.clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + if output_type == "pil": + image = self.numpy_to_pil(image) + + if not return_dict: + return (image,) + + return ImagePipelineOutput(images=image) From e74c1733c2eedc7032d4ea07b35216d30394cb17 Mon Sep 17 00:00:00 2001 From: Ayush Mangal <43698245+ayushtues@users.noreply.github.com> Date: Sat, 13 May 2023 02:27:15 +0530 Subject: [PATCH 037/182] [WIP] Add Kandinsky decoder (#3330) * Add movq Co-authored-by: ayushmangal Co-authored-by: YiYi Xu --- scripts/convert_kandinsky_to_diffusers.py | 420 +++++++++++++++++- src/diffusers/models/attention.py | 21 + src/diffusers/models/resnet.py | 14 +- src/diffusers/models/unet_2d_blocks.py | 51 ++- src/diffusers/models/vae.py | 29 +- src/diffusers/models/vq_model.py | 7 +- 
.../pipelines/kandinsky/pipeline_kandinsky.py | 32 +- 7 files changed, 539 insertions(+), 35 deletions(-) diff --git a/scripts/convert_kandinsky_to_diffusers.py b/scripts/convert_kandinsky_to_diffusers.py index 7e22f0559619..0462772e3140 100644 --- a/scripts/convert_kandinsky_to_diffusers.py +++ b/scripts/convert_kandinsky_to_diffusers.py @@ -1,11 +1,13 @@ import argparse import tempfile +import os import torch from accelerate import load_checkpoint_and_dispatch from diffusers import UNet2DConditionModel from diffusers.models.prior_transformer import PriorTransformer +from diffusers.models.vq_model import VQModel from diffusers.pipelines.kandinsky.text_proj import KandinskyTextProjModel @@ -901,15 +903,413 @@ def inpaint_text2img(*, args, checkpoint_map_location): return inpaint_unet_model, text_proj_model +# movq + +MOVQ_CONFIG ={ + "in_channels":3, + "out_channels":3, + "latent_channels":4, + "down_block_types":("DownEncoderBlock2D", "DownEncoderBlock2D", "DownEncoderBlock2D", "AttnDownEncoderBlock2D"), + "up_block_types":("AttnUpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D"), + "num_vq_embeddings":16384, + "block_out_channels":(128, 256, 256, 512), + "vq_embed_dim":4, + "layers_per_block":2, + "norm_type":"spatial" + } + + +def movq_model_from_original_config(): + movq = VQModel(**MOVQ_CONFIG ) + return movq + +def movq_encoder_to_diffusers_checkpoint(model, checkpoint): + diffusers_checkpoint = {} + + # conv_in + diffusers_checkpoint.update( + { + "encoder.conv_in.weight": checkpoint["encoder.conv_in.weight"], + "encoder.conv_in.bias": checkpoint["encoder.conv_in.bias"], + } + ) + + # down_blocks + for down_block_idx, down_block in enumerate(model.encoder.down_blocks): + diffusers_down_block_prefix = f"encoder.down_blocks.{down_block_idx}" + down_block_prefix = f"encoder.down.{down_block_idx}" + + # resnets + for resnet_idx, resnet in enumerate(down_block.resnets): + diffusers_resnet_prefix = f"{diffusers_down_block_prefix}.resnets.{resnet_idx}" + resnet_prefix = f"{down_block_prefix}.block.{resnet_idx}" + + diffusers_checkpoint.update( + movq_resnet_to_diffusers_checkpoint( + resnet, checkpoint, diffusers_resnet_prefix=diffusers_resnet_prefix, resnet_prefix=resnet_prefix + ) + ) + + # downsample + + # do not include the downsample when on the last down block + # There is no downsample on the last down block + if down_block_idx != len(model.encoder.down_blocks) - 1: + # There's a single downsample in the original checkpoint but a list of downsamples + # in the diffusers model. 
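# A few concrete examples of the original-to-diffusers key renaming these MoVQ
# conversion helpers build (attention weights are additionally squeezed from 1x1
# convs to linear layers via [:, :, 0, 0]); names assume MOVQ_CONFIG above:
movq_key_examples = {
    "encoder.down.0.block.1.conv1.weight": "encoder.down_blocks.0.resnets.1.conv1.weight",
    "encoder.down.0.downsample.conv.weight": "encoder.down_blocks.0.downsamplers.0.conv.weight",
    "encoder.mid.attn_1.q.weight": "encoder.mid_block.attentions.0.query.weight",
    "encoder.mid.block_1.conv1.weight": "encoder.mid_block.resnets.0.conv1.weight",
}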
+ diffusers_downsample_prefix = f"{diffusers_down_block_prefix}.downsamplers.0.conv" + downsample_prefix = f"{down_block_prefix}.downsample.conv" + diffusers_checkpoint.update( + { + f"{diffusers_downsample_prefix}.weight": checkpoint[f"{downsample_prefix}.weight"], + f"{diffusers_downsample_prefix}.bias": checkpoint[f"{downsample_prefix}.bias"], + } + ) + + # attentions + + if hasattr(down_block, "attentions"): + for attention_idx, _ in enumerate(down_block.attentions): + diffusers_attention_prefix = f"{diffusers_down_block_prefix}.attentions.{attention_idx}" + attention_prefix = f"{down_block_prefix}.attn.{attention_idx}" + diffusers_checkpoint.update( + movq_attention_to_diffusers_checkpoint( + checkpoint, + diffusers_attention_prefix=diffusers_attention_prefix, + attention_prefix=attention_prefix, + ) + ) + + # mid block + + # mid block attentions + + # There is a single hardcoded attention block in the middle of the VQ-diffusion encoder + diffusers_attention_prefix = "encoder.mid_block.attentions.0" + attention_prefix = "encoder.mid.attn_1" + diffusers_checkpoint.update( + movq_attention_to_diffusers_checkpoint( + checkpoint, diffusers_attention_prefix=diffusers_attention_prefix, attention_prefix=attention_prefix + ) + ) + + # mid block resnets + + for diffusers_resnet_idx, resnet in enumerate(model.encoder.mid_block.resnets): + diffusers_resnet_prefix = f"encoder.mid_block.resnets.{diffusers_resnet_idx}" + + # the hardcoded prefixes to `block_` are 1 and 2 + orig_resnet_idx = diffusers_resnet_idx + 1 + # There are two hardcoded resnets in the middle of the VQ-diffusion encoder + resnet_prefix = f"encoder.mid.block_{orig_resnet_idx}" + + diffusers_checkpoint.update( + movq_resnet_to_diffusers_checkpoint( + resnet, checkpoint, diffusers_resnet_prefix=diffusers_resnet_prefix, resnet_prefix=resnet_prefix + ) + ) + + diffusers_checkpoint.update( + { + # conv_norm_out + "encoder.conv_norm_out.weight": checkpoint["encoder.norm_out.weight"], + "encoder.conv_norm_out.bias": checkpoint["encoder.norm_out.bias"], + # conv_out + "encoder.conv_out.weight": checkpoint["encoder.conv_out.weight"], + "encoder.conv_out.bias": checkpoint["encoder.conv_out.bias"], + } + ) + + return diffusers_checkpoint + + +def movq_decoder_to_diffusers_checkpoint(model, checkpoint): + diffusers_checkpoint = {} + + # conv in + diffusers_checkpoint.update( + { + "decoder.conv_in.weight": checkpoint["decoder.conv_in.weight"], + "decoder.conv_in.bias": checkpoint["decoder.conv_in.bias"], + } + ) + + # up_blocks + + for diffusers_up_block_idx, up_block in enumerate(model.decoder.up_blocks): + # up_blocks are stored in reverse order in the VQ-diffusion checkpoint + orig_up_block_idx = len(model.decoder.up_blocks) - 1 - diffusers_up_block_idx + + diffusers_up_block_prefix = f"decoder.up_blocks.{diffusers_up_block_idx}" + up_block_prefix = f"decoder.up.{orig_up_block_idx}" + + # resnets + for resnet_idx, resnet in enumerate(up_block.resnets): + diffusers_resnet_prefix = f"{diffusers_up_block_prefix}.resnets.{resnet_idx}" + resnet_prefix = f"{up_block_prefix}.block.{resnet_idx}" + + diffusers_checkpoint.update( + movq_resnet_to_diffusers_checkpoint_spatial_norm( + resnet, checkpoint, diffusers_resnet_prefix=diffusers_resnet_prefix, resnet_prefix=resnet_prefix + ) + ) + + # upsample + + # there is no up sample on the last up block + if diffusers_up_block_idx != len(model.decoder.up_blocks) - 1: + # There's a single upsample in the VQ-diffusion checkpoint but a list of downsamples + # in the diffusers model. 
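# The decoder loop above walks the diffusers up_blocks forward while indexing the
# original checkpoint in reverse; with the four up blocks from MOVQ_CONFIG the
# pairing is:
num_up_blocks = 4
pairs = [(i, num_up_blocks - 1 - i) for i in range(num_up_blocks)]
assert pairs == [(0, 3), (1, 2), (2, 1), (3, 0)]  # decoder.up_blocks.<i>  <-  decoder.up.<j>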
+ diffusers_downsample_prefix = f"{diffusers_up_block_prefix}.upsamplers.0.conv" + downsample_prefix = f"{up_block_prefix}.upsample.conv" + diffusers_checkpoint.update( + { + f"{diffusers_downsample_prefix}.weight": checkpoint[f"{downsample_prefix}.weight"], + f"{diffusers_downsample_prefix}.bias": checkpoint[f"{downsample_prefix}.bias"], + } + ) + + # attentions + + if hasattr(up_block, "attentions"): + for attention_idx, _ in enumerate(up_block.attentions): + diffusers_attention_prefix = f"{diffusers_up_block_prefix}.attentions.{attention_idx}" + attention_prefix = f"{up_block_prefix}.attn.{attention_idx}" + diffusers_checkpoint.update( + movq_attention_to_diffusers_checkpoint_spatial_norm( + checkpoint, + diffusers_attention_prefix=diffusers_attention_prefix, + attention_prefix=attention_prefix, + ) + ) + + # mid block + + # mid block attentions + + # There is a single hardcoded attention block in the middle of the VQ-diffusion decoder + diffusers_attention_prefix = "decoder.mid_block.attentions.0" + attention_prefix = "decoder.mid.attn_1" + diffusers_checkpoint.update( + movq_attention_to_diffusers_checkpoint_spatial_norm( + checkpoint, diffusers_attention_prefix=diffusers_attention_prefix, attention_prefix=attention_prefix + ) + ) + + # mid block resnets + + for diffusers_resnet_idx, resnet in enumerate(model.encoder.mid_block.resnets): + diffusers_resnet_prefix = f"decoder.mid_block.resnets.{diffusers_resnet_idx}" + + # the hardcoded prefixes to `block_` are 1 and 2 + orig_resnet_idx = diffusers_resnet_idx + 1 + # There are two hardcoded resnets in the middle of the VQ-diffusion decoder + resnet_prefix = f"decoder.mid.block_{orig_resnet_idx}" + + diffusers_checkpoint.update( + movq_resnet_to_diffusers_checkpoint_spatial_norm( + resnet, checkpoint, diffusers_resnet_prefix=diffusers_resnet_prefix, resnet_prefix=resnet_prefix + ) + ) + + diffusers_checkpoint.update( + { + # conv_norm_out + "decoder.conv_norm_out.norm_layer.weight": checkpoint["decoder.norm_out.norm_layer.weight"], + "decoder.conv_norm_out.norm_layer.bias": checkpoint["decoder.norm_out.norm_layer.bias"], + "decoder.conv_norm_out.conv_y.weight": checkpoint["decoder.norm_out.conv_y.weight"], + "decoder.conv_norm_out.conv_y.bias": checkpoint["decoder.norm_out.conv_y.bias"], + "decoder.conv_norm_out.conv_b.weight": checkpoint["decoder.norm_out.conv_b.weight"], + "decoder.conv_norm_out.conv_b.bias": checkpoint["decoder.norm_out.conv_b.bias"], + # conv_out + "decoder.conv_out.weight": checkpoint["decoder.conv_out.weight"], + "decoder.conv_out.bias": checkpoint["decoder.conv_out.bias"], + } + ) + + return diffusers_checkpoint + + +def movq_resnet_to_diffusers_checkpoint(resnet, checkpoint, *, diffusers_resnet_prefix, resnet_prefix): + rv = { + # norm1 + f"{diffusers_resnet_prefix}.norm1.weight": checkpoint[f"{resnet_prefix}.norm1.weight"], + f"{diffusers_resnet_prefix}.norm1.bias": checkpoint[f"{resnet_prefix}.norm1.bias"], + # conv1 + f"{diffusers_resnet_prefix}.conv1.weight": checkpoint[f"{resnet_prefix}.conv1.weight"], + f"{diffusers_resnet_prefix}.conv1.bias": checkpoint[f"{resnet_prefix}.conv1.bias"], + # norm2 + f"{diffusers_resnet_prefix}.norm2.weight": checkpoint[f"{resnet_prefix}.norm2.weight"], + f"{diffusers_resnet_prefix}.norm2.bias": checkpoint[f"{resnet_prefix}.norm2.bias"], + # conv2 + f"{diffusers_resnet_prefix}.conv2.weight": checkpoint[f"{resnet_prefix}.conv2.weight"], + f"{diffusers_resnet_prefix}.conv2.bias": checkpoint[f"{resnet_prefix}.conv2.bias"], + } + + if resnet.conv_shortcut is not None: + rv.update( 
+ { + f"{diffusers_resnet_prefix}.conv_shortcut.weight": checkpoint[f"{resnet_prefix}.nin_shortcut.weight"], + f"{diffusers_resnet_prefix}.conv_shortcut.bias": checkpoint[f"{resnet_prefix}.nin_shortcut.bias"], + } + ) + + return rv + +def movq_resnet_to_diffusers_checkpoint_spatial_norm(resnet, checkpoint, *, diffusers_resnet_prefix, resnet_prefix): + rv = { + # norm1 + f"{diffusers_resnet_prefix}.norm1.norm_layer.weight": checkpoint[f"{resnet_prefix}.norm1.norm_layer.weight"], + f"{diffusers_resnet_prefix}.norm1.norm_layer.bias": checkpoint[f"{resnet_prefix}.norm1.norm_layer.bias"], + f"{diffusers_resnet_prefix}.norm1.conv_y.weight": checkpoint[f"{resnet_prefix}.norm1.conv_y.weight"], + f"{diffusers_resnet_prefix}.norm1.conv_y.bias": checkpoint[f"{resnet_prefix}.norm1.conv_y.bias"], + f"{diffusers_resnet_prefix}.norm1.conv_b.weight": checkpoint[f"{resnet_prefix}.norm1.conv_b.weight"], + f"{diffusers_resnet_prefix}.norm1.conv_b.bias": checkpoint[f"{resnet_prefix}.norm1.conv_b.bias"], + # conv1 + f"{diffusers_resnet_prefix}.conv1.weight": checkpoint[f"{resnet_prefix}.conv1.weight"], + f"{diffusers_resnet_prefix}.conv1.bias": checkpoint[f"{resnet_prefix}.conv1.bias"], + # norm2 + f"{diffusers_resnet_prefix}.norm2.norm_layer.weight": checkpoint[f"{resnet_prefix}.norm2.norm_layer.weight"], + f"{diffusers_resnet_prefix}.norm2.norm_layer.bias": checkpoint[f"{resnet_prefix}.norm2.norm_layer.bias"], + f"{diffusers_resnet_prefix}.norm2.conv_y.weight": checkpoint[f"{resnet_prefix}.norm2.conv_y.weight"], + f"{diffusers_resnet_prefix}.norm2.conv_y.bias": checkpoint[f"{resnet_prefix}.norm2.conv_y.bias"], + f"{diffusers_resnet_prefix}.norm2.conv_b.weight": checkpoint[f"{resnet_prefix}.norm2.conv_b.weight"], + f"{diffusers_resnet_prefix}.norm2.conv_b.bias": checkpoint[f"{resnet_prefix}.norm2.conv_b.bias"], + # conv2 + f"{diffusers_resnet_prefix}.conv2.weight": checkpoint[f"{resnet_prefix}.conv2.weight"], + f"{diffusers_resnet_prefix}.conv2.bias": checkpoint[f"{resnet_prefix}.conv2.bias"], + } + + if resnet.conv_shortcut is not None: + rv.update( + { + f"{diffusers_resnet_prefix}.conv_shortcut.weight": checkpoint[f"{resnet_prefix}.nin_shortcut.weight"], + f"{diffusers_resnet_prefix}.conv_shortcut.bias": checkpoint[f"{resnet_prefix}.nin_shortcut.bias"], + } + ) + + return rv + + + +def movq_attention_to_diffusers_checkpoint(checkpoint, *, diffusers_attention_prefix, attention_prefix): + return { + # norm + f"{diffusers_attention_prefix}.norm.weight": checkpoint[f"{attention_prefix}.norm.weight"], + f"{diffusers_attention_prefix}.norm.bias": checkpoint[f"{attention_prefix}.norm.bias"], + # query + f"{diffusers_attention_prefix}.query.weight": checkpoint[f"{attention_prefix}.q.weight"][:, :, 0, 0], + f"{diffusers_attention_prefix}.query.bias": checkpoint[f"{attention_prefix}.q.bias"], + # key + f"{diffusers_attention_prefix}.key.weight": checkpoint[f"{attention_prefix}.k.weight"][:, :, 0, 0], + f"{diffusers_attention_prefix}.key.bias": checkpoint[f"{attention_prefix}.k.bias"], + # value + f"{diffusers_attention_prefix}.value.weight": checkpoint[f"{attention_prefix}.v.weight"][:, :, 0, 0], + f"{diffusers_attention_prefix}.value.bias": checkpoint[f"{attention_prefix}.v.bias"], + # proj_attn + f"{diffusers_attention_prefix}.proj_attn.weight": checkpoint[f"{attention_prefix}.proj_out.weight"][ + :, :, 0, 0 + ], + f"{diffusers_attention_prefix}.proj_attn.bias": checkpoint[f"{attention_prefix}.proj_out.bias"], + } + +def movq_attention_to_diffusers_checkpoint_spatial_norm(checkpoint, *, diffusers_attention_prefix, 
attention_prefix): + return { + # norm + f"{diffusers_attention_prefix}.norm.norm_layer.weight": checkpoint[f"{attention_prefix}.norm.norm_layer.weight"], + f"{diffusers_attention_prefix}.norm.norm_layer.bias": checkpoint[f"{attention_prefix}.norm.norm_layer.bias"], + f"{diffusers_attention_prefix}.norm.conv_y.weight": checkpoint[f"{attention_prefix}.norm.conv_y.weight"], + f"{diffusers_attention_prefix}.norm.conv_y.bias": checkpoint[f"{attention_prefix}.norm.conv_y.bias"], + f"{diffusers_attention_prefix}.norm.conv_b.weight": checkpoint[f"{attention_prefix}.norm.conv_b.weight"], + f"{diffusers_attention_prefix}.norm.conv_b.bias": checkpoint[f"{attention_prefix}.norm.conv_b.bias"], + # query + f"{diffusers_attention_prefix}.attention.to_q.weight": checkpoint[f"{attention_prefix}.q.weight"][:, :, 0, 0], + f"{diffusers_attention_prefix}.attention.to_q.bias": checkpoint[f"{attention_prefix}.q.bias"], + # key + f"{diffusers_attention_prefix}.attention.to_k.weight": checkpoint[f"{attention_prefix}.k.weight"][:, :, 0, 0], + f"{diffusers_attention_prefix}.attention.to_k.bias": checkpoint[f"{attention_prefix}.k.bias"], + # value + f"{diffusers_attention_prefix}.attention.to_v.weight": checkpoint[f"{attention_prefix}.v.weight"][:, :, 0, 0], + f"{diffusers_attention_prefix}.attention.to_v.bias": checkpoint[f"{attention_prefix}.v.bias"], + # proj_attn + f"{diffusers_attention_prefix}.attention.to_out.0.weight": checkpoint[f"{attention_prefix}.proj_out.weight"][ + :, :, 0, 0 + ], + f"{diffusers_attention_prefix}.attention.to_out.0.bias": checkpoint[f"{attention_prefix}.proj_out.bias"], + } + + + + + +def movq_original_checkpoint_to_diffusers_checkpoint(model, checkpoint): + diffusers_checkpoint = {} + diffusers_checkpoint.update(movq_encoder_to_diffusers_checkpoint(model, checkpoint)) + + + # quant_conv + + diffusers_checkpoint.update( + { + "quant_conv.weight": checkpoint["quant_conv.weight"], + "quant_conv.bias": checkpoint["quant_conv.bias"], + } + ) + + # quantize + diffusers_checkpoint.update({"quantize.embedding.weight": checkpoint["quantize.embedding.weight"]}) + + # post_quant_conv + diffusers_checkpoint.update( + { + "post_quant_conv.weight": checkpoint["post_quant_conv.weight"], + "post_quant_conv.bias": checkpoint["post_quant_conv.bias"], + } + ) + + # decoder + diffusers_checkpoint.update(movq_decoder_to_diffusers_checkpoint(model, checkpoint)) + + + + for keys in diffusers_checkpoint.keys(): + print(keys) + + return diffusers_checkpoint + + + + + +def movq(*, args, checkpoint_map_location): + print("loading movq") + + movq_checkpoint = torch.load(args.movq_checkpoint_path, map_location=checkpoint_map_location) + + movq_model = movq_model_from_original_config() + + movq_diffusers_checkpoint = movq_original_checkpoint_to_diffusers_checkpoint( + movq_model, movq_checkpoint + ) + + del movq_checkpoint + + load_checkpoint_to_model(movq_diffusers_checkpoint, movq_model, strict=True) + + print("done loading movq") + + return movq_model + def load_checkpoint_to_model(checkpoint, model, strict=False): - with tempfile.NamedTemporaryFile() as file: + with tempfile.NamedTemporaryFile(delete=False) as file: torch.save(checkpoint, file.name) del checkpoint if strict: model.load_state_dict(torch.load(file.name), strict=True) else: load_checkpoint_and_dispatch(model, file.name, device_map="auto") + os.remove(file.name) if __name__ == "__main__": @@ -921,24 +1321,31 @@ def load_checkpoint_to_model(checkpoint, model, strict=False): "--prior_checkpoint_path", default=None, type=str, - required=True, + 
required=False, help="Path to the prior checkpoint to convert.", ) parser.add_argument( - "--clip_stat_path", default=None, type=str, required=True, help="Path to the clip stats checkpoint to convert." + "--clip_stat_path", default=None, type=str, required=False, help="Path to the clip stats checkpoint to convert." ) parser.add_argument( "--text2img_checkpoint_path", default=None, type=str, - required=True, + required=False, + help="Path to the text2img checkpoint to convert.", + ) + parser.add_argument( + "--movq_checkpoint_path", + default=None, + type=str, + required=False, help="Path to the text2img checkpoint to convert.", ) parser.add_argument( "--inpaint_text2img_checkpoint_path", default=None, type=str, - required=True, + required=False, help="Path to the inpaint text2img checkpoint to convert.", ) parser.add_argument( @@ -979,5 +1386,8 @@ def load_checkpoint_to_model(checkpoint, model, strict=False): inpaint_unet_model, inpaint_text_proj_model = inpaint_text2img(args=args, checkpoint_map_location=checkpoint_map_location) inpaint_unet_model.save_pretrained(f"{args.dump_path}/inpaint_unet") inpaint_text_proj_model.save_pretrained(f"{args.dump_path}/inpaint_text_proj") + elif args.debug == 'decoder': + decoder = movq(args=args, checkpoint_map_location=checkpoint_map_location) + decoder.save_pretrained(f"{args.dump_path}/decoder") else: raise ValueError(f"unknown debug value : {args.debug}") diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py index 0b313b83d360..2b31fa9e2f38 100644 --- a/src/diffusers/models/attention.py +++ b/src/diffusers/models/attention.py @@ -369,3 +369,24 @@ def forward(self, x, emb): x = F.group_norm(x, self.num_groups, eps=self.eps) x = x * (1 + scale) + shift return x + +class SpatialNorm(nn.Module): + """ + Spatially conditioned normalization as defined in https://arxiv.org/abs/2209.09002 + """ + def __init__( + self, + f_channels, + zq_channels, + ): + super().__init__() + self.norm_layer = nn.GroupNorm(num_channels=f_channels,num_groups=32,eps=1e-6,affine=True) + self.conv_y = nn.Conv2d(zq_channels, f_channels, kernel_size=1, stride=1, padding=0) + self.conv_b = nn.Conv2d(zq_channels, f_channels, kernel_size=1, stride=1, padding=0) + + def forward(self, f, zq): + f_size = f.shape[-2:] + zq = F.interpolate(zq, size=f_size, mode="nearest") + norm_f = self.norm_layer(f) + new_f = norm_f * self.conv_y(zq) + self.conv_b(zq) + return new_f \ No newline at end of file diff --git a/src/diffusers/models/resnet.py b/src/diffusers/models/resnet.py index d9d539959c09..83bec9a52593 100644 --- a/src/diffusers/models/resnet.py +++ b/src/diffusers/models/resnet.py @@ -20,7 +20,7 @@ import torch.nn as nn import torch.nn.functional as F -from .attention import AdaGroupNorm +from .attention import AdaGroupNorm, SpatialNorm class Upsample1D(nn.Module): @@ -460,7 +460,7 @@ def __init__( eps=1e-6, non_linearity="swish", skip_time_act=False, - time_embedding_norm="default", # default, scale_shift, ada_group + time_embedding_norm="default", # default, scale_shift, ada_group, spatial kernel=None, output_scale_factor=1.0, use_in_shortcut=None, @@ -487,6 +487,8 @@ def __init__( if self.time_embedding_norm == "ada_group": self.norm1 = AdaGroupNorm(temb_channels, in_channels, groups, eps=eps) + elif self.time_embedding_norm == "spatial": + self.norm1 = SpatialNorm(in_channels, temb_channels) else: self.norm1 = torch.nn.GroupNorm(num_groups=groups, num_channels=in_channels, eps=eps, affine=True) @@ -497,7 +499,7 @@ def __init__( self.time_emb_proj = 
torch.nn.Linear(temb_channels, out_channels) elif self.time_embedding_norm == "scale_shift": self.time_emb_proj = torch.nn.Linear(temb_channels, 2 * out_channels) - elif self.time_embedding_norm == "ada_group": + elif self.time_embedding_norm == "ada_group" or self.time_embedding_norm == "spatial": self.time_emb_proj = None else: raise ValueError(f"unknown time_embedding_norm : {self.time_embedding_norm} ") @@ -506,6 +508,8 @@ def __init__( if self.time_embedding_norm == "ada_group": self.norm2 = AdaGroupNorm(temb_channels, out_channels, groups_out, eps=eps) + elif self.time_embedding_norm == "spatial": + self.norm2 = SpatialNorm(out_channels, temb_channels) else: self.norm2 = torch.nn.GroupNorm(num_groups=groups_out, num_channels=out_channels, eps=eps, affine=True) @@ -551,7 +555,7 @@ def __init__( def forward(self, input_tensor, temb): hidden_states = input_tensor - if self.time_embedding_norm == "ada_group": + if self.time_embedding_norm == "ada_group" or self.time_embedding_norm == "spatial": hidden_states = self.norm1(hidden_states, temb) else: hidden_states = self.norm1(hidden_states) @@ -579,7 +583,7 @@ def forward(self, input_tensor, temb): if temb is not None and self.time_embedding_norm == "default": hidden_states = hidden_states + temb - if self.time_embedding_norm == "ada_group": + if self.time_embedding_norm == "ada_group" or self.time_embedding_norm == "spatial": hidden_states = self.norm2(hidden_states, temb) else: hidden_states = self.norm2(hidden_states) diff --git a/src/diffusers/models/unet_2d_blocks.py b/src/diffusers/models/unet_2d_blocks.py index 0004f074c563..f91132a61397 100644 --- a/src/diffusers/models/unet_2d_blocks.py +++ b/src/diffusers/models/unet_2d_blocks.py @@ -18,7 +18,7 @@ import torch.nn.functional as F from torch import nn -from .attention import AdaGroupNorm +from .attention import AdaGroupNorm, AttentionBlock, SpatialNorm from .attention_processor import Attention, AttnAddedKVProcessor, AttnAddedKVProcessor2_0 from .dual_transformer_2d import DualTransformer2DModel from .resnet import Downsample2D, FirDownsample2D, FirUpsample2D, KDownsample2D, KUpsample2D, ResnetBlock2D, Upsample2D @@ -348,6 +348,7 @@ def get_up_block( resnet_act_fn=resnet_act_fn, resnet_groups=resnet_groups, resnet_time_scale_shift=resnet_time_scale_shift, + temb_channels=temb_channels ) elif up_block_type == "AttnUpDecoderBlock2D": return AttnUpDecoderBlock2D( @@ -360,6 +361,7 @@ def get_up_block( resnet_groups=resnet_groups, attn_num_head_channels=attn_num_head_channels, resnet_time_scale_shift=resnet_time_scale_shift, + temb_channels=temb_channels ) elif up_block_type == "KUpBlock2D": return KUpBlock2D( @@ -406,7 +408,6 @@ def __init__( super().__init__() resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32) self.add_attention = add_attention - # there is always at least one resnet resnets = [ ResnetBlock2D( @@ -439,7 +440,6 @@ def __init__( upcast_softmax=True, _from_deprecated_attn_block=True, ) - ) else: attentions.append(None) @@ -465,7 +465,8 @@ def forward(self, hidden_states, temb=None): hidden_states = self.resnets[0](hidden_states, temb) for attn, resnet in zip(self.attentions, self.resnets[1:]): if attn is not None: - hidden_states = attn(hidden_states) + hidden_states = attn(hidden_states, temb) + hidden_states = resnet(hidden_states, temb) return hidden_states @@ -1971,6 +1972,30 @@ def custom_forward(*inputs): return hidden_states +class MOVQAttention(nn.Module): + def __init__(self, query_dim, temb_channels, 
attn_num_head_channels): + super().__init__() + + self.norm = SpatialNorm(query_dim, temb_channels) + num_heads = query_dim // attn_num_head_channels if attn_num_head_channels is not None else 1 + dim_head = attn_num_head_channels if attn_num_head_channels is not None else query_dim + self.attention = Attention( + query_dim=query_dim, + heads=num_heads, + dim_head=dim_head, + bias=True + ) + + def forward(self, hidden_states, temb): + residual = hidden_states + hidden_states = self.norm(hidden_states, temb).view(hidden_states.shape[0], hidden_states.shape[1], -1) + hidden_states = self.attention(hidden_states.transpose(1, 2), None, None).transpose(1, 2) + hidden_states = hidden_states.view(residual.shape) + hidden_states = hidden_states + residual + return hidden_states + + + class UpDecoderBlock2D(nn.Module): def __init__( self, @@ -1985,6 +2010,7 @@ def __init__( resnet_pre_norm: bool = True, output_scale_factor=1.0, add_upsample=True, + temb_channels=None ): super().__init__() resnets = [] @@ -1996,7 +2022,7 @@ def __init__( ResnetBlock2D( in_channels=input_channels, out_channels=out_channels, - temb_channels=None, + temb_channels=temb_channels, eps=resnet_eps, groups=resnet_groups, dropout=dropout, @@ -2014,9 +2040,9 @@ def __init__( else: self.upsamplers = None - def forward(self, hidden_states): + def forward(self, hidden_states, temb=None): for resnet in self.resnets: - hidden_states = resnet(hidden_states, temb=None) + hidden_states = resnet(hidden_states, temb=temb) if self.upsamplers is not None: for upsampler in self.upsamplers: @@ -2040,6 +2066,7 @@ def __init__( attn_num_head_channels=1, output_scale_factor=1.0, add_upsample=True, + temb_channels=None ): super().__init__() resnets = [] @@ -2052,7 +2079,7 @@ def __init__( ResnetBlock2D( in_channels=input_channels, out_channels=out_channels, - temb_channels=None, + temb_channels=temb_channels, eps=resnet_eps, groups=resnet_groups, dropout=dropout, @@ -2075,7 +2102,6 @@ def __init__( upcast_softmax=True, _from_deprecated_attn_block=True, ) - ) self.attentions = nn.ModuleList(attentions) self.resnets = nn.ModuleList(resnets) @@ -2085,10 +2111,10 @@ def __init__( else: self.upsamplers = None - def forward(self, hidden_states): + def forward(self, hidden_states, temb=None): for resnet, attn in zip(self.resnets, self.attentions): - hidden_states = resnet(hidden_states, temb=None) - hidden_states = attn(hidden_states) + hidden_states = resnet(hidden_states, temb=temb) + hidden_states = attn(hidden_states, temb) if self.upsamplers is not None: for upsampler in self.upsamplers: @@ -2847,3 +2873,4 @@ def forward( hidden_states = attn_output + hidden_states return hidden_states + diff --git a/src/diffusers/models/vae.py b/src/diffusers/models/vae.py index 400c3030af90..776203042e9b 100644 --- a/src/diffusers/models/vae.py +++ b/src/diffusers/models/vae.py @@ -20,7 +20,7 @@ from ..utils import BaseOutput, randn_tensor from .unet_2d_blocks import UNetMidBlock2D, get_down_block, get_up_block - +from .attention import SpatialNorm @dataclass class DecoderOutput(BaseOutput): @@ -149,6 +149,7 @@ def __init__( layers_per_block=2, norm_num_groups=32, act_fn="silu", + norm_type="default", # default, spatial ): super().__init__() self.layers_per_block = layers_per_block @@ -164,16 +165,19 @@ def __init__( self.mid_block = None self.up_blocks = nn.ModuleList([]) + + temb_channels = in_channels if norm_type == "spatial" else None + # mid self.mid_block = UNetMidBlock2D( in_channels=block_out_channels[-1], resnet_eps=1e-6, resnet_act_fn=act_fn, 
output_scale_factor=1, - resnet_time_scale_shift="default", + resnet_time_scale_shift=norm_type, attn_num_head_channels=None, resnet_groups=norm_num_groups, - temb_channels=None, + temb_channels=temb_channels, ) # up @@ -196,19 +200,23 @@ def __init__( resnet_act_fn=act_fn, resnet_groups=norm_num_groups, attn_num_head_channels=None, - temb_channels=None, + temb_channels=temb_channels, + resnet_time_scale_shift=norm_type, ) self.up_blocks.append(up_block) prev_output_channel = output_channel # out - self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=1e-6) + if norm_type == "spatial": + self.conv_norm_out = SpatialNorm(block_out_channels[0], temb_channels) + else: + self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=1e-6) self.conv_act = nn.SiLU() self.conv_out = nn.Conv2d(block_out_channels[0], out_channels, 3, padding=1) self.gradient_checkpointing = False - def forward(self, z): + def forward(self, z, zq=None): sample = z sample = self.conv_in(sample) @@ -230,15 +238,18 @@ def custom_forward(*inputs): sample = torch.utils.checkpoint.checkpoint(create_custom_forward(up_block), sample) else: # middle - sample = self.mid_block(sample) + sample = self.mid_block(sample, zq) sample = sample.to(upscale_dtype) # up for up_block in self.up_blocks: - sample = up_block(sample) + sample = up_block(sample, zq) # post-process - sample = self.conv_norm_out(sample) + if zq is None: + sample = self.conv_norm_out(sample) + else: + sample = self.conv_norm_out(sample, zq) sample = self.conv_act(sample) sample = self.conv_out(sample) diff --git a/src/diffusers/models/vq_model.py b/src/diffusers/models/vq_model.py index 65f734dccb2d..040447ba82c8 100644 --- a/src/diffusers/models/vq_model.py +++ b/src/diffusers/models/vq_model.py @@ -82,9 +82,11 @@ def __init__( norm_num_groups: int = 32, vq_embed_dim: Optional[int] = None, scaling_factor: float = 0.18215, + norm_type: str = "default" ): super().__init__() + # pass init params to Encoder self.encoder = Encoder( in_channels=in_channels, @@ -112,6 +114,7 @@ def __init__( layers_per_block=layers_per_block, act_fn=act_fn, norm_num_groups=norm_num_groups, + norm_type=norm_type, ) def encode(self, x: torch.FloatTensor, return_dict: bool = True) -> VQEncoderOutput: @@ -131,8 +134,8 @@ def decode( quant, emb_loss, info = self.quantize(h) else: quant = h - quant = self.post_quant_conv(quant) - dec = self.decoder(quant) + quant2 = self.post_quant_conv(quant) + dec = self.decoder(quant2, quant if self.config.norm_type == "spatial" else None) if not return_dict: return (dec,) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py index d988f38506ea..b0d8b4b429a1 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py @@ -19,7 +19,7 @@ XLMRobertaTokenizerFast, ) -from ...models import UNet2DConditionModel +from ...models import UNet2DConditionModel, VQModel from ...pipelines import DiffusionPipeline from ...schedulers import UnCLIPScheduler from ...utils import ( @@ -30,6 +30,7 @@ ) from .text_encoder import MultilingualCLIP from .text_proj import KandinskyTextProjModel +from PIL import Image logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -44,6 +45,21 @@ def get_new_h_w(h, w): new_w += 1 return new_h * 8, new_w * 8 +def process_images(batch): + scaled = ( + ((batch + 1) * 127.5) + .round() + .clamp(0, 
255) + .to(torch.uint8) + .to("cpu") + .permute(0, 2, 3, 1) + .numpy() + ) + images = [] + for i in range(scaled.shape[0]): + images.append(Image.fromarray(scaled[i])) + return images + class KandinskyPipeline(DiffusionPipeline): """ @@ -63,6 +79,8 @@ class KandinskyPipeline(DiffusionPipeline): Conditional U-Net architecture to denoise the image embedding. text_proj ([`KandinskyTextProjModel`]): Utility class to prepare and combine the embeddings before they are passed to the decoder. + decoder ([`VQModel`]): + Decoder to generate the image from the latents. """ def __init__( @@ -72,6 +90,7 @@ def __init__( text_proj: KandinskyTextProjModel, unet: UNet2DConditionModel, scheduler: UnCLIPScheduler, + decoder: VQModel ): super().__init__() @@ -81,6 +100,7 @@ def __init__( text_proj=text_proj, unet=unet, scheduler=scheduler, + decoder=decoder, ) def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): @@ -94,6 +114,13 @@ def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): latents = latents * scheduler.init_noise_sigma return latents + def get_image(self, latents): + images = self.decoder.decode(latents, force_not_quantize=True)["sample"] + images = process_images(images) + return images + + + def _encode_prompt( self, prompt, @@ -371,4 +398,5 @@ def __call__( _, latents = latents.chunk(2) - return latents + images = self.get_image(latents) + return images From f0a74977fb909357221a5d25ac623adfe69c8567 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Fri, 12 May 2023 20:59:38 +0000 Subject: [PATCH 038/182] Revert "[WIP] Add Kandinsky decoder (#3330)" This reverts commit e74c1733c2eedc7032d4ea07b35216d30394cb17. --- scripts/convert_kandinsky_to_diffusers.py | 420 +----------------- src/diffusers/models/attention.py | 21 - src/diffusers/models/resnet.py | 14 +- src/diffusers/models/unet_2d_blocks.py | 51 +-- src/diffusers/models/vae.py | 29 +- src/diffusers/models/vq_model.py | 7 +- .../pipelines/kandinsky/pipeline_kandinsky.py | 32 +- 7 files changed, 35 insertions(+), 539 deletions(-) diff --git a/scripts/convert_kandinsky_to_diffusers.py b/scripts/convert_kandinsky_to_diffusers.py index 0462772e3140..7e22f0559619 100644 --- a/scripts/convert_kandinsky_to_diffusers.py +++ b/scripts/convert_kandinsky_to_diffusers.py @@ -1,13 +1,11 @@ import argparse import tempfile -import os import torch from accelerate import load_checkpoint_and_dispatch from diffusers import UNet2DConditionModel from diffusers.models.prior_transformer import PriorTransformer -from diffusers.models.vq_model import VQModel from diffusers.pipelines.kandinsky.text_proj import KandinskyTextProjModel @@ -903,413 +901,15 @@ def inpaint_text2img(*, args, checkpoint_map_location): return inpaint_unet_model, text_proj_model -# movq - -MOVQ_CONFIG ={ - "in_channels":3, - "out_channels":3, - "latent_channels":4, - "down_block_types":("DownEncoderBlock2D", "DownEncoderBlock2D", "DownEncoderBlock2D", "AttnDownEncoderBlock2D"), - "up_block_types":("AttnUpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D"), - "num_vq_embeddings":16384, - "block_out_channels":(128, 256, 256, 512), - "vq_embed_dim":4, - "layers_per_block":2, - "norm_type":"spatial" - } - - -def movq_model_from_original_config(): - movq = VQModel(**MOVQ_CONFIG ) - return movq - -def movq_encoder_to_diffusers_checkpoint(model, checkpoint): - diffusers_checkpoint = {} - - # conv_in - diffusers_checkpoint.update( - { - "encoder.conv_in.weight": checkpoint["encoder.conv_in.weight"], - "encoder.conv_in.bias": 
checkpoint["encoder.conv_in.bias"], - } - ) - - # down_blocks - for down_block_idx, down_block in enumerate(model.encoder.down_blocks): - diffusers_down_block_prefix = f"encoder.down_blocks.{down_block_idx}" - down_block_prefix = f"encoder.down.{down_block_idx}" - - # resnets - for resnet_idx, resnet in enumerate(down_block.resnets): - diffusers_resnet_prefix = f"{diffusers_down_block_prefix}.resnets.{resnet_idx}" - resnet_prefix = f"{down_block_prefix}.block.{resnet_idx}" - - diffusers_checkpoint.update( - movq_resnet_to_diffusers_checkpoint( - resnet, checkpoint, diffusers_resnet_prefix=diffusers_resnet_prefix, resnet_prefix=resnet_prefix - ) - ) - - # downsample - - # do not include the downsample when on the last down block - # There is no downsample on the last down block - if down_block_idx != len(model.encoder.down_blocks) - 1: - # There's a single downsample in the original checkpoint but a list of downsamples - # in the diffusers model. - diffusers_downsample_prefix = f"{diffusers_down_block_prefix}.downsamplers.0.conv" - downsample_prefix = f"{down_block_prefix}.downsample.conv" - diffusers_checkpoint.update( - { - f"{diffusers_downsample_prefix}.weight": checkpoint[f"{downsample_prefix}.weight"], - f"{diffusers_downsample_prefix}.bias": checkpoint[f"{downsample_prefix}.bias"], - } - ) - - # attentions - - if hasattr(down_block, "attentions"): - for attention_idx, _ in enumerate(down_block.attentions): - diffusers_attention_prefix = f"{diffusers_down_block_prefix}.attentions.{attention_idx}" - attention_prefix = f"{down_block_prefix}.attn.{attention_idx}" - diffusers_checkpoint.update( - movq_attention_to_diffusers_checkpoint( - checkpoint, - diffusers_attention_prefix=diffusers_attention_prefix, - attention_prefix=attention_prefix, - ) - ) - - # mid block - - # mid block attentions - - # There is a single hardcoded attention block in the middle of the VQ-diffusion encoder - diffusers_attention_prefix = "encoder.mid_block.attentions.0" - attention_prefix = "encoder.mid.attn_1" - diffusers_checkpoint.update( - movq_attention_to_diffusers_checkpoint( - checkpoint, diffusers_attention_prefix=diffusers_attention_prefix, attention_prefix=attention_prefix - ) - ) - - # mid block resnets - - for diffusers_resnet_idx, resnet in enumerate(model.encoder.mid_block.resnets): - diffusers_resnet_prefix = f"encoder.mid_block.resnets.{diffusers_resnet_idx}" - - # the hardcoded prefixes to `block_` are 1 and 2 - orig_resnet_idx = diffusers_resnet_idx + 1 - # There are two hardcoded resnets in the middle of the VQ-diffusion encoder - resnet_prefix = f"encoder.mid.block_{orig_resnet_idx}" - - diffusers_checkpoint.update( - movq_resnet_to_diffusers_checkpoint( - resnet, checkpoint, diffusers_resnet_prefix=diffusers_resnet_prefix, resnet_prefix=resnet_prefix - ) - ) - - diffusers_checkpoint.update( - { - # conv_norm_out - "encoder.conv_norm_out.weight": checkpoint["encoder.norm_out.weight"], - "encoder.conv_norm_out.bias": checkpoint["encoder.norm_out.bias"], - # conv_out - "encoder.conv_out.weight": checkpoint["encoder.conv_out.weight"], - "encoder.conv_out.bias": checkpoint["encoder.conv_out.bias"], - } - ) - - return diffusers_checkpoint - - -def movq_decoder_to_diffusers_checkpoint(model, checkpoint): - diffusers_checkpoint = {} - - # conv in - diffusers_checkpoint.update( - { - "decoder.conv_in.weight": checkpoint["decoder.conv_in.weight"], - "decoder.conv_in.bias": checkpoint["decoder.conv_in.bias"], - } - ) - - # up_blocks - - for diffusers_up_block_idx, up_block in 
enumerate(model.decoder.up_blocks): - # up_blocks are stored in reverse order in the VQ-diffusion checkpoint - orig_up_block_idx = len(model.decoder.up_blocks) - 1 - diffusers_up_block_idx - - diffusers_up_block_prefix = f"decoder.up_blocks.{diffusers_up_block_idx}" - up_block_prefix = f"decoder.up.{orig_up_block_idx}" - - # resnets - for resnet_idx, resnet in enumerate(up_block.resnets): - diffusers_resnet_prefix = f"{diffusers_up_block_prefix}.resnets.{resnet_idx}" - resnet_prefix = f"{up_block_prefix}.block.{resnet_idx}" - - diffusers_checkpoint.update( - movq_resnet_to_diffusers_checkpoint_spatial_norm( - resnet, checkpoint, diffusers_resnet_prefix=diffusers_resnet_prefix, resnet_prefix=resnet_prefix - ) - ) - - # upsample - - # there is no up sample on the last up block - if diffusers_up_block_idx != len(model.decoder.up_blocks) - 1: - # There's a single upsample in the VQ-diffusion checkpoint but a list of downsamples - # in the diffusers model. - diffusers_downsample_prefix = f"{diffusers_up_block_prefix}.upsamplers.0.conv" - downsample_prefix = f"{up_block_prefix}.upsample.conv" - diffusers_checkpoint.update( - { - f"{diffusers_downsample_prefix}.weight": checkpoint[f"{downsample_prefix}.weight"], - f"{diffusers_downsample_prefix}.bias": checkpoint[f"{downsample_prefix}.bias"], - } - ) - - # attentions - - if hasattr(up_block, "attentions"): - for attention_idx, _ in enumerate(up_block.attentions): - diffusers_attention_prefix = f"{diffusers_up_block_prefix}.attentions.{attention_idx}" - attention_prefix = f"{up_block_prefix}.attn.{attention_idx}" - diffusers_checkpoint.update( - movq_attention_to_diffusers_checkpoint_spatial_norm( - checkpoint, - diffusers_attention_prefix=diffusers_attention_prefix, - attention_prefix=attention_prefix, - ) - ) - - # mid block - - # mid block attentions - - # There is a single hardcoded attention block in the middle of the VQ-diffusion decoder - diffusers_attention_prefix = "decoder.mid_block.attentions.0" - attention_prefix = "decoder.mid.attn_1" - diffusers_checkpoint.update( - movq_attention_to_diffusers_checkpoint_spatial_norm( - checkpoint, diffusers_attention_prefix=diffusers_attention_prefix, attention_prefix=attention_prefix - ) - ) - - # mid block resnets - - for diffusers_resnet_idx, resnet in enumerate(model.encoder.mid_block.resnets): - diffusers_resnet_prefix = f"decoder.mid_block.resnets.{diffusers_resnet_idx}" - - # the hardcoded prefixes to `block_` are 1 and 2 - orig_resnet_idx = diffusers_resnet_idx + 1 - # There are two hardcoded resnets in the middle of the VQ-diffusion decoder - resnet_prefix = f"decoder.mid.block_{orig_resnet_idx}" - - diffusers_checkpoint.update( - movq_resnet_to_diffusers_checkpoint_spatial_norm( - resnet, checkpoint, diffusers_resnet_prefix=diffusers_resnet_prefix, resnet_prefix=resnet_prefix - ) - ) - - diffusers_checkpoint.update( - { - # conv_norm_out - "decoder.conv_norm_out.norm_layer.weight": checkpoint["decoder.norm_out.norm_layer.weight"], - "decoder.conv_norm_out.norm_layer.bias": checkpoint["decoder.norm_out.norm_layer.bias"], - "decoder.conv_norm_out.conv_y.weight": checkpoint["decoder.norm_out.conv_y.weight"], - "decoder.conv_norm_out.conv_y.bias": checkpoint["decoder.norm_out.conv_y.bias"], - "decoder.conv_norm_out.conv_b.weight": checkpoint["decoder.norm_out.conv_b.weight"], - "decoder.conv_norm_out.conv_b.bias": checkpoint["decoder.norm_out.conv_b.bias"], - # conv_out - "decoder.conv_out.weight": checkpoint["decoder.conv_out.weight"], - "decoder.conv_out.bias": 
checkpoint["decoder.conv_out.bias"], - } - ) - - return diffusers_checkpoint - - -def movq_resnet_to_diffusers_checkpoint(resnet, checkpoint, *, diffusers_resnet_prefix, resnet_prefix): - rv = { - # norm1 - f"{diffusers_resnet_prefix}.norm1.weight": checkpoint[f"{resnet_prefix}.norm1.weight"], - f"{diffusers_resnet_prefix}.norm1.bias": checkpoint[f"{resnet_prefix}.norm1.bias"], - # conv1 - f"{diffusers_resnet_prefix}.conv1.weight": checkpoint[f"{resnet_prefix}.conv1.weight"], - f"{diffusers_resnet_prefix}.conv1.bias": checkpoint[f"{resnet_prefix}.conv1.bias"], - # norm2 - f"{diffusers_resnet_prefix}.norm2.weight": checkpoint[f"{resnet_prefix}.norm2.weight"], - f"{diffusers_resnet_prefix}.norm2.bias": checkpoint[f"{resnet_prefix}.norm2.bias"], - # conv2 - f"{diffusers_resnet_prefix}.conv2.weight": checkpoint[f"{resnet_prefix}.conv2.weight"], - f"{diffusers_resnet_prefix}.conv2.bias": checkpoint[f"{resnet_prefix}.conv2.bias"], - } - - if resnet.conv_shortcut is not None: - rv.update( - { - f"{diffusers_resnet_prefix}.conv_shortcut.weight": checkpoint[f"{resnet_prefix}.nin_shortcut.weight"], - f"{diffusers_resnet_prefix}.conv_shortcut.bias": checkpoint[f"{resnet_prefix}.nin_shortcut.bias"], - } - ) - - return rv - -def movq_resnet_to_diffusers_checkpoint_spatial_norm(resnet, checkpoint, *, diffusers_resnet_prefix, resnet_prefix): - rv = { - # norm1 - f"{diffusers_resnet_prefix}.norm1.norm_layer.weight": checkpoint[f"{resnet_prefix}.norm1.norm_layer.weight"], - f"{diffusers_resnet_prefix}.norm1.norm_layer.bias": checkpoint[f"{resnet_prefix}.norm1.norm_layer.bias"], - f"{diffusers_resnet_prefix}.norm1.conv_y.weight": checkpoint[f"{resnet_prefix}.norm1.conv_y.weight"], - f"{diffusers_resnet_prefix}.norm1.conv_y.bias": checkpoint[f"{resnet_prefix}.norm1.conv_y.bias"], - f"{diffusers_resnet_prefix}.norm1.conv_b.weight": checkpoint[f"{resnet_prefix}.norm1.conv_b.weight"], - f"{diffusers_resnet_prefix}.norm1.conv_b.bias": checkpoint[f"{resnet_prefix}.norm1.conv_b.bias"], - # conv1 - f"{diffusers_resnet_prefix}.conv1.weight": checkpoint[f"{resnet_prefix}.conv1.weight"], - f"{diffusers_resnet_prefix}.conv1.bias": checkpoint[f"{resnet_prefix}.conv1.bias"], - # norm2 - f"{diffusers_resnet_prefix}.norm2.norm_layer.weight": checkpoint[f"{resnet_prefix}.norm2.norm_layer.weight"], - f"{diffusers_resnet_prefix}.norm2.norm_layer.bias": checkpoint[f"{resnet_prefix}.norm2.norm_layer.bias"], - f"{diffusers_resnet_prefix}.norm2.conv_y.weight": checkpoint[f"{resnet_prefix}.norm2.conv_y.weight"], - f"{diffusers_resnet_prefix}.norm2.conv_y.bias": checkpoint[f"{resnet_prefix}.norm2.conv_y.bias"], - f"{diffusers_resnet_prefix}.norm2.conv_b.weight": checkpoint[f"{resnet_prefix}.norm2.conv_b.weight"], - f"{diffusers_resnet_prefix}.norm2.conv_b.bias": checkpoint[f"{resnet_prefix}.norm2.conv_b.bias"], - # conv2 - f"{diffusers_resnet_prefix}.conv2.weight": checkpoint[f"{resnet_prefix}.conv2.weight"], - f"{diffusers_resnet_prefix}.conv2.bias": checkpoint[f"{resnet_prefix}.conv2.bias"], - } - - if resnet.conv_shortcut is not None: - rv.update( - { - f"{diffusers_resnet_prefix}.conv_shortcut.weight": checkpoint[f"{resnet_prefix}.nin_shortcut.weight"], - f"{diffusers_resnet_prefix}.conv_shortcut.bias": checkpoint[f"{resnet_prefix}.nin_shortcut.bias"], - } - ) - - return rv - - - -def movq_attention_to_diffusers_checkpoint(checkpoint, *, diffusers_attention_prefix, attention_prefix): - return { - # norm - f"{diffusers_attention_prefix}.norm.weight": checkpoint[f"{attention_prefix}.norm.weight"], - 
f"{diffusers_attention_prefix}.norm.bias": checkpoint[f"{attention_prefix}.norm.bias"], - # query - f"{diffusers_attention_prefix}.query.weight": checkpoint[f"{attention_prefix}.q.weight"][:, :, 0, 0], - f"{diffusers_attention_prefix}.query.bias": checkpoint[f"{attention_prefix}.q.bias"], - # key - f"{diffusers_attention_prefix}.key.weight": checkpoint[f"{attention_prefix}.k.weight"][:, :, 0, 0], - f"{diffusers_attention_prefix}.key.bias": checkpoint[f"{attention_prefix}.k.bias"], - # value - f"{diffusers_attention_prefix}.value.weight": checkpoint[f"{attention_prefix}.v.weight"][:, :, 0, 0], - f"{diffusers_attention_prefix}.value.bias": checkpoint[f"{attention_prefix}.v.bias"], - # proj_attn - f"{diffusers_attention_prefix}.proj_attn.weight": checkpoint[f"{attention_prefix}.proj_out.weight"][ - :, :, 0, 0 - ], - f"{diffusers_attention_prefix}.proj_attn.bias": checkpoint[f"{attention_prefix}.proj_out.bias"], - } - -def movq_attention_to_diffusers_checkpoint_spatial_norm(checkpoint, *, diffusers_attention_prefix, attention_prefix): - return { - # norm - f"{diffusers_attention_prefix}.norm.norm_layer.weight": checkpoint[f"{attention_prefix}.norm.norm_layer.weight"], - f"{diffusers_attention_prefix}.norm.norm_layer.bias": checkpoint[f"{attention_prefix}.norm.norm_layer.bias"], - f"{diffusers_attention_prefix}.norm.conv_y.weight": checkpoint[f"{attention_prefix}.norm.conv_y.weight"], - f"{diffusers_attention_prefix}.norm.conv_y.bias": checkpoint[f"{attention_prefix}.norm.conv_y.bias"], - f"{diffusers_attention_prefix}.norm.conv_b.weight": checkpoint[f"{attention_prefix}.norm.conv_b.weight"], - f"{diffusers_attention_prefix}.norm.conv_b.bias": checkpoint[f"{attention_prefix}.norm.conv_b.bias"], - # query - f"{diffusers_attention_prefix}.attention.to_q.weight": checkpoint[f"{attention_prefix}.q.weight"][:, :, 0, 0], - f"{diffusers_attention_prefix}.attention.to_q.bias": checkpoint[f"{attention_prefix}.q.bias"], - # key - f"{diffusers_attention_prefix}.attention.to_k.weight": checkpoint[f"{attention_prefix}.k.weight"][:, :, 0, 0], - f"{diffusers_attention_prefix}.attention.to_k.bias": checkpoint[f"{attention_prefix}.k.bias"], - # value - f"{diffusers_attention_prefix}.attention.to_v.weight": checkpoint[f"{attention_prefix}.v.weight"][:, :, 0, 0], - f"{diffusers_attention_prefix}.attention.to_v.bias": checkpoint[f"{attention_prefix}.v.bias"], - # proj_attn - f"{diffusers_attention_prefix}.attention.to_out.0.weight": checkpoint[f"{attention_prefix}.proj_out.weight"][ - :, :, 0, 0 - ], - f"{diffusers_attention_prefix}.attention.to_out.0.bias": checkpoint[f"{attention_prefix}.proj_out.bias"], - } - - - - - -def movq_original_checkpoint_to_diffusers_checkpoint(model, checkpoint): - diffusers_checkpoint = {} - diffusers_checkpoint.update(movq_encoder_to_diffusers_checkpoint(model, checkpoint)) - - - # quant_conv - - diffusers_checkpoint.update( - { - "quant_conv.weight": checkpoint["quant_conv.weight"], - "quant_conv.bias": checkpoint["quant_conv.bias"], - } - ) - - # quantize - diffusers_checkpoint.update({"quantize.embedding.weight": checkpoint["quantize.embedding.weight"]}) - - # post_quant_conv - diffusers_checkpoint.update( - { - "post_quant_conv.weight": checkpoint["post_quant_conv.weight"], - "post_quant_conv.bias": checkpoint["post_quant_conv.bias"], - } - ) - - # decoder - diffusers_checkpoint.update(movq_decoder_to_diffusers_checkpoint(model, checkpoint)) - - - - for keys in diffusers_checkpoint.keys(): - print(keys) - - return diffusers_checkpoint - - - - - -def movq(*, args, 
checkpoint_map_location): - print("loading movq") - - movq_checkpoint = torch.load(args.movq_checkpoint_path, map_location=checkpoint_map_location) - - movq_model = movq_model_from_original_config() - - movq_diffusers_checkpoint = movq_original_checkpoint_to_diffusers_checkpoint( - movq_model, movq_checkpoint - ) - - del movq_checkpoint - - load_checkpoint_to_model(movq_diffusers_checkpoint, movq_model, strict=True) - - print("done loading movq") - - return movq_model - def load_checkpoint_to_model(checkpoint, model, strict=False): - with tempfile.NamedTemporaryFile(delete=False) as file: + with tempfile.NamedTemporaryFile() as file: torch.save(checkpoint, file.name) del checkpoint if strict: model.load_state_dict(torch.load(file.name), strict=True) else: load_checkpoint_and_dispatch(model, file.name, device_map="auto") - os.remove(file.name) if __name__ == "__main__": @@ -1321,31 +921,24 @@ def load_checkpoint_to_model(checkpoint, model, strict=False): "--prior_checkpoint_path", default=None, type=str, - required=False, + required=True, help="Path to the prior checkpoint to convert.", ) parser.add_argument( - "--clip_stat_path", default=None, type=str, required=False, help="Path to the clip stats checkpoint to convert." + "--clip_stat_path", default=None, type=str, required=True, help="Path to the clip stats checkpoint to convert." ) parser.add_argument( "--text2img_checkpoint_path", default=None, type=str, - required=False, - help="Path to the text2img checkpoint to convert.", - ) - parser.add_argument( - "--movq_checkpoint_path", - default=None, - type=str, - required=False, + required=True, help="Path to the text2img checkpoint to convert.", ) parser.add_argument( "--inpaint_text2img_checkpoint_path", default=None, type=str, - required=False, + required=True, help="Path to the inpaint text2img checkpoint to convert.", ) parser.add_argument( @@ -1386,8 +979,5 @@ def load_checkpoint_to_model(checkpoint, model, strict=False): inpaint_unet_model, inpaint_text_proj_model = inpaint_text2img(args=args, checkpoint_map_location=checkpoint_map_location) inpaint_unet_model.save_pretrained(f"{args.dump_path}/inpaint_unet") inpaint_text_proj_model.save_pretrained(f"{args.dump_path}/inpaint_text_proj") - elif args.debug == 'decoder': - decoder = movq(args=args, checkpoint_map_location=checkpoint_map_location) - decoder.save_pretrained(f"{args.dump_path}/decoder") else: raise ValueError(f"unknown debug value : {args.debug}") diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py index 2b31fa9e2f38..0b313b83d360 100644 --- a/src/diffusers/models/attention.py +++ b/src/diffusers/models/attention.py @@ -369,24 +369,3 @@ def forward(self, x, emb): x = F.group_norm(x, self.num_groups, eps=self.eps) x = x * (1 + scale) + shift return x - -class SpatialNorm(nn.Module): - """ - Spatially conditioned normalization as defined in https://arxiv.org/abs/2209.09002 - """ - def __init__( - self, - f_channels, - zq_channels, - ): - super().__init__() - self.norm_layer = nn.GroupNorm(num_channels=f_channels,num_groups=32,eps=1e-6,affine=True) - self.conv_y = nn.Conv2d(zq_channels, f_channels, kernel_size=1, stride=1, padding=0) - self.conv_b = nn.Conv2d(zq_channels, f_channels, kernel_size=1, stride=1, padding=0) - - def forward(self, f, zq): - f_size = f.shape[-2:] - zq = F.interpolate(zq, size=f_size, mode="nearest") - norm_f = self.norm_layer(f) - new_f = norm_f * self.conv_y(zq) + self.conv_b(zq) - return new_f \ No newline at end of file diff --git a/src/diffusers/models/resnet.py 
b/src/diffusers/models/resnet.py index 83bec9a52593..d9d539959c09 100644 --- a/src/diffusers/models/resnet.py +++ b/src/diffusers/models/resnet.py @@ -20,7 +20,7 @@ import torch.nn as nn import torch.nn.functional as F -from .attention import AdaGroupNorm, SpatialNorm +from .attention import AdaGroupNorm class Upsample1D(nn.Module): @@ -460,7 +460,7 @@ def __init__( eps=1e-6, non_linearity="swish", skip_time_act=False, - time_embedding_norm="default", # default, scale_shift, ada_group, spatial + time_embedding_norm="default", # default, scale_shift, ada_group kernel=None, output_scale_factor=1.0, use_in_shortcut=None, @@ -487,8 +487,6 @@ def __init__( if self.time_embedding_norm == "ada_group": self.norm1 = AdaGroupNorm(temb_channels, in_channels, groups, eps=eps) - elif self.time_embedding_norm == "spatial": - self.norm1 = SpatialNorm(in_channels, temb_channels) else: self.norm1 = torch.nn.GroupNorm(num_groups=groups, num_channels=in_channels, eps=eps, affine=True) @@ -499,7 +497,7 @@ def __init__( self.time_emb_proj = torch.nn.Linear(temb_channels, out_channels) elif self.time_embedding_norm == "scale_shift": self.time_emb_proj = torch.nn.Linear(temb_channels, 2 * out_channels) - elif self.time_embedding_norm == "ada_group" or self.time_embedding_norm == "spatial": + elif self.time_embedding_norm == "ada_group": self.time_emb_proj = None else: raise ValueError(f"unknown time_embedding_norm : {self.time_embedding_norm} ") @@ -508,8 +506,6 @@ def __init__( if self.time_embedding_norm == "ada_group": self.norm2 = AdaGroupNorm(temb_channels, out_channels, groups_out, eps=eps) - elif self.time_embedding_norm == "spatial": - self.norm2 = SpatialNorm(out_channels, temb_channels) else: self.norm2 = torch.nn.GroupNorm(num_groups=groups_out, num_channels=out_channels, eps=eps, affine=True) @@ -555,7 +551,7 @@ def __init__( def forward(self, input_tensor, temb): hidden_states = input_tensor - if self.time_embedding_norm == "ada_group" or self.time_embedding_norm == "spatial": + if self.time_embedding_norm == "ada_group": hidden_states = self.norm1(hidden_states, temb) else: hidden_states = self.norm1(hidden_states) @@ -583,7 +579,7 @@ def forward(self, input_tensor, temb): if temb is not None and self.time_embedding_norm == "default": hidden_states = hidden_states + temb - if self.time_embedding_norm == "ada_group" or self.time_embedding_norm == "spatial": + if self.time_embedding_norm == "ada_group": hidden_states = self.norm2(hidden_states, temb) else: hidden_states = self.norm2(hidden_states) diff --git a/src/diffusers/models/unet_2d_blocks.py b/src/diffusers/models/unet_2d_blocks.py index f91132a61397..0004f074c563 100644 --- a/src/diffusers/models/unet_2d_blocks.py +++ b/src/diffusers/models/unet_2d_blocks.py @@ -18,7 +18,7 @@ import torch.nn.functional as F from torch import nn -from .attention import AdaGroupNorm, AttentionBlock, SpatialNorm +from .attention import AdaGroupNorm from .attention_processor import Attention, AttnAddedKVProcessor, AttnAddedKVProcessor2_0 from .dual_transformer_2d import DualTransformer2DModel from .resnet import Downsample2D, FirDownsample2D, FirUpsample2D, KDownsample2D, KUpsample2D, ResnetBlock2D, Upsample2D @@ -348,7 +348,6 @@ def get_up_block( resnet_act_fn=resnet_act_fn, resnet_groups=resnet_groups, resnet_time_scale_shift=resnet_time_scale_shift, - temb_channels=temb_channels ) elif up_block_type == "AttnUpDecoderBlock2D": return AttnUpDecoderBlock2D( @@ -361,7 +360,6 @@ def get_up_block( resnet_groups=resnet_groups, 
attn_num_head_channels=attn_num_head_channels, resnet_time_scale_shift=resnet_time_scale_shift, - temb_channels=temb_channels ) elif up_block_type == "KUpBlock2D": return KUpBlock2D( @@ -408,6 +406,7 @@ def __init__( super().__init__() resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32) self.add_attention = add_attention + # there is always at least one resnet resnets = [ ResnetBlock2D( @@ -440,6 +439,7 @@ def __init__( upcast_softmax=True, _from_deprecated_attn_block=True, ) + ) else: attentions.append(None) @@ -465,8 +465,7 @@ def forward(self, hidden_states, temb=None): hidden_states = self.resnets[0](hidden_states, temb) for attn, resnet in zip(self.attentions, self.resnets[1:]): if attn is not None: - hidden_states = attn(hidden_states, temb) - + hidden_states = attn(hidden_states) hidden_states = resnet(hidden_states, temb) return hidden_states @@ -1972,30 +1971,6 @@ def custom_forward(*inputs): return hidden_states -class MOVQAttention(nn.Module): - def __init__(self, query_dim, temb_channels, attn_num_head_channels): - super().__init__() - - self.norm = SpatialNorm(query_dim, temb_channels) - num_heads = query_dim // attn_num_head_channels if attn_num_head_channels is not None else 1 - dim_head = attn_num_head_channels if attn_num_head_channels is not None else query_dim - self.attention = Attention( - query_dim=query_dim, - heads=num_heads, - dim_head=dim_head, - bias=True - ) - - def forward(self, hidden_states, temb): - residual = hidden_states - hidden_states = self.norm(hidden_states, temb).view(hidden_states.shape[0], hidden_states.shape[1], -1) - hidden_states = self.attention(hidden_states.transpose(1, 2), None, None).transpose(1, 2) - hidden_states = hidden_states.view(residual.shape) - hidden_states = hidden_states + residual - return hidden_states - - - class UpDecoderBlock2D(nn.Module): def __init__( self, @@ -2010,7 +1985,6 @@ def __init__( resnet_pre_norm: bool = True, output_scale_factor=1.0, add_upsample=True, - temb_channels=None ): super().__init__() resnets = [] @@ -2022,7 +1996,7 @@ def __init__( ResnetBlock2D( in_channels=input_channels, out_channels=out_channels, - temb_channels=temb_channels, + temb_channels=None, eps=resnet_eps, groups=resnet_groups, dropout=dropout, @@ -2040,9 +2014,9 @@ def __init__( else: self.upsamplers = None - def forward(self, hidden_states, temb=None): + def forward(self, hidden_states): for resnet in self.resnets: - hidden_states = resnet(hidden_states, temb=temb) + hidden_states = resnet(hidden_states, temb=None) if self.upsamplers is not None: for upsampler in self.upsamplers: @@ -2066,7 +2040,6 @@ def __init__( attn_num_head_channels=1, output_scale_factor=1.0, add_upsample=True, - temb_channels=None ): super().__init__() resnets = [] @@ -2079,7 +2052,7 @@ def __init__( ResnetBlock2D( in_channels=input_channels, out_channels=out_channels, - temb_channels=temb_channels, + temb_channels=None, eps=resnet_eps, groups=resnet_groups, dropout=dropout, @@ -2102,6 +2075,7 @@ def __init__( upcast_softmax=True, _from_deprecated_attn_block=True, ) + ) self.attentions = nn.ModuleList(attentions) self.resnets = nn.ModuleList(resnets) @@ -2111,10 +2085,10 @@ def __init__( else: self.upsamplers = None - def forward(self, hidden_states, temb=None): + def forward(self, hidden_states): for resnet, attn in zip(self.resnets, self.attentions): - hidden_states = resnet(hidden_states, temb=temb) - hidden_states = attn(hidden_states, temb) + hidden_states = resnet(hidden_states, temb=None) + hidden_states = 
attn(hidden_states) if self.upsamplers is not None: for upsampler in self.upsamplers: @@ -2873,4 +2847,3 @@ def forward( hidden_states = attn_output + hidden_states return hidden_states - diff --git a/src/diffusers/models/vae.py b/src/diffusers/models/vae.py index 776203042e9b..400c3030af90 100644 --- a/src/diffusers/models/vae.py +++ b/src/diffusers/models/vae.py @@ -20,7 +20,7 @@ from ..utils import BaseOutput, randn_tensor from .unet_2d_blocks import UNetMidBlock2D, get_down_block, get_up_block -from .attention import SpatialNorm + @dataclass class DecoderOutput(BaseOutput): @@ -149,7 +149,6 @@ def __init__( layers_per_block=2, norm_num_groups=32, act_fn="silu", - norm_type="default", # default, spatial ): super().__init__() self.layers_per_block = layers_per_block @@ -165,19 +164,16 @@ def __init__( self.mid_block = None self.up_blocks = nn.ModuleList([]) - - temb_channels = in_channels if norm_type == "spatial" else None - # mid self.mid_block = UNetMidBlock2D( in_channels=block_out_channels[-1], resnet_eps=1e-6, resnet_act_fn=act_fn, output_scale_factor=1, - resnet_time_scale_shift=norm_type, + resnet_time_scale_shift="default", attn_num_head_channels=None, resnet_groups=norm_num_groups, - temb_channels=temb_channels, + temb_channels=None, ) # up @@ -200,23 +196,19 @@ def __init__( resnet_act_fn=act_fn, resnet_groups=norm_num_groups, attn_num_head_channels=None, - temb_channels=temb_channels, - resnet_time_scale_shift=norm_type, + temb_channels=None, ) self.up_blocks.append(up_block) prev_output_channel = output_channel # out - if norm_type == "spatial": - self.conv_norm_out = SpatialNorm(block_out_channels[0], temb_channels) - else: - self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=1e-6) + self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=1e-6) self.conv_act = nn.SiLU() self.conv_out = nn.Conv2d(block_out_channels[0], out_channels, 3, padding=1) self.gradient_checkpointing = False - def forward(self, z, zq=None): + def forward(self, z): sample = z sample = self.conv_in(sample) @@ -238,18 +230,15 @@ def custom_forward(*inputs): sample = torch.utils.checkpoint.checkpoint(create_custom_forward(up_block), sample) else: # middle - sample = self.mid_block(sample, zq) + sample = self.mid_block(sample) sample = sample.to(upscale_dtype) # up for up_block in self.up_blocks: - sample = up_block(sample, zq) + sample = up_block(sample) # post-process - if zq is None: - sample = self.conv_norm_out(sample) - else: - sample = self.conv_norm_out(sample, zq) + sample = self.conv_norm_out(sample) sample = self.conv_act(sample) sample = self.conv_out(sample) diff --git a/src/diffusers/models/vq_model.py b/src/diffusers/models/vq_model.py index 040447ba82c8..65f734dccb2d 100644 --- a/src/diffusers/models/vq_model.py +++ b/src/diffusers/models/vq_model.py @@ -82,11 +82,9 @@ def __init__( norm_num_groups: int = 32, vq_embed_dim: Optional[int] = None, scaling_factor: float = 0.18215, - norm_type: str = "default" ): super().__init__() - # pass init params to Encoder self.encoder = Encoder( in_channels=in_channels, @@ -114,7 +112,6 @@ def __init__( layers_per_block=layers_per_block, act_fn=act_fn, norm_num_groups=norm_num_groups, - norm_type=norm_type, ) def encode(self, x: torch.FloatTensor, return_dict: bool = True) -> VQEncoderOutput: @@ -134,8 +131,8 @@ def decode( quant, emb_loss, info = self.quantize(h) else: quant = h - quant2 = self.post_quant_conv(quant) - dec = self.decoder(quant2, quant if 
self.config.norm_type == "spatial" else None) + quant = self.post_quant_conv(quant) + dec = self.decoder(quant) if not return_dict: return (dec,) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py index b0d8b4b429a1..d988f38506ea 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py @@ -19,7 +19,7 @@ XLMRobertaTokenizerFast, ) -from ...models import UNet2DConditionModel, VQModel +from ...models import UNet2DConditionModel from ...pipelines import DiffusionPipeline from ...schedulers import UnCLIPScheduler from ...utils import ( @@ -30,7 +30,6 @@ ) from .text_encoder import MultilingualCLIP from .text_proj import KandinskyTextProjModel -from PIL import Image logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -45,21 +44,6 @@ def get_new_h_w(h, w): new_w += 1 return new_h * 8, new_w * 8 -def process_images(batch): - scaled = ( - ((batch + 1) * 127.5) - .round() - .clamp(0, 255) - .to(torch.uint8) - .to("cpu") - .permute(0, 2, 3, 1) - .numpy() - ) - images = [] - for i in range(scaled.shape[0]): - images.append(Image.fromarray(scaled[i])) - return images - class KandinskyPipeline(DiffusionPipeline): """ @@ -79,8 +63,6 @@ class KandinskyPipeline(DiffusionPipeline): Conditional U-Net architecture to denoise the image embedding. text_proj ([`KandinskyTextProjModel`]): Utility class to prepare and combine the embeddings before they are passed to the decoder. - decoder ([`VQModel`]): - Decoder to generate the image from the latents. """ def __init__( @@ -90,7 +72,6 @@ def __init__( text_proj: KandinskyTextProjModel, unet: UNet2DConditionModel, scheduler: UnCLIPScheduler, - decoder: VQModel ): super().__init__() @@ -100,7 +81,6 @@ def __init__( text_proj=text_proj, unet=unet, scheduler=scheduler, - decoder=decoder, ) def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): @@ -114,13 +94,6 @@ def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): latents = latents * scheduler.init_noise_sigma return latents - def get_image(self, latents): - images = self.decoder.decode(latents, force_not_quantize=True)["sample"] - images = process_images(images) - return images - - - def _encode_prompt( self, prompt, @@ -398,5 +371,4 @@ def __call__( _, latents = latents.chunk(2) - images = self.get_image(latents) - return images + return latents From ce965b91dcb20ea8a7bcdda5f4434053e749870d Mon Sep 17 00:00:00 2001 From: ayushmangal Date: Fri, 12 May 2023 21:15:56 +0000 Subject: [PATCH 039/182] ayusha's PR to add MoVQ --- scripts/convert_kandinsky_to_diffusers.py | 420 +++++++++++++++++++++- src/diffusers/models/attention.py | 21 ++ src/diffusers/models/resnet.py | 14 +- src/diffusers/models/vae.py | 29 +- src/diffusers/models/vq_model.py | 7 +- 5 files changed, 470 insertions(+), 21 deletions(-) diff --git a/scripts/convert_kandinsky_to_diffusers.py b/scripts/convert_kandinsky_to_diffusers.py index 7e22f0559619..0462772e3140 100644 --- a/scripts/convert_kandinsky_to_diffusers.py +++ b/scripts/convert_kandinsky_to_diffusers.py @@ -1,11 +1,13 @@ import argparse import tempfile +import os import torch from accelerate import load_checkpoint_and_dispatch from diffusers import UNet2DConditionModel from diffusers.models.prior_transformer import PriorTransformer +from diffusers.models.vq_model import VQModel from diffusers.pipelines.kandinsky.text_proj import KandinskyTextProjModel @@ -901,15 +903,413 @@ 
def inpaint_text2img(*, args, checkpoint_map_location): return inpaint_unet_model, text_proj_model +# movq + +MOVQ_CONFIG ={ + "in_channels":3, + "out_channels":3, + "latent_channels":4, + "down_block_types":("DownEncoderBlock2D", "DownEncoderBlock2D", "DownEncoderBlock2D", "AttnDownEncoderBlock2D"), + "up_block_types":("AttnUpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D"), + "num_vq_embeddings":16384, + "block_out_channels":(128, 256, 256, 512), + "vq_embed_dim":4, + "layers_per_block":2, + "norm_type":"spatial" + } + + +def movq_model_from_original_config(): + movq = VQModel(**MOVQ_CONFIG ) + return movq + +def movq_encoder_to_diffusers_checkpoint(model, checkpoint): + diffusers_checkpoint = {} + + # conv_in + diffusers_checkpoint.update( + { + "encoder.conv_in.weight": checkpoint["encoder.conv_in.weight"], + "encoder.conv_in.bias": checkpoint["encoder.conv_in.bias"], + } + ) + + # down_blocks + for down_block_idx, down_block in enumerate(model.encoder.down_blocks): + diffusers_down_block_prefix = f"encoder.down_blocks.{down_block_idx}" + down_block_prefix = f"encoder.down.{down_block_idx}" + + # resnets + for resnet_idx, resnet in enumerate(down_block.resnets): + diffusers_resnet_prefix = f"{diffusers_down_block_prefix}.resnets.{resnet_idx}" + resnet_prefix = f"{down_block_prefix}.block.{resnet_idx}" + + diffusers_checkpoint.update( + movq_resnet_to_diffusers_checkpoint( + resnet, checkpoint, diffusers_resnet_prefix=diffusers_resnet_prefix, resnet_prefix=resnet_prefix + ) + ) + + # downsample + + # do not include the downsample when on the last down block + # There is no downsample on the last down block + if down_block_idx != len(model.encoder.down_blocks) - 1: + # There's a single downsample in the original checkpoint but a list of downsamples + # in the diffusers model. 
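# --- Illustrative sketch, not part of the patch: the MoVQ conversion code above (and the
# --- downsample mapping just below) is essentially a prefix rename from the original MoVQ key
# --- layout to the diffusers layout, e.g.
# --- "encoder.down.2.downsample.conv.weight" -> "encoder.down_blocks.2.downsamplers.0.conv.weight".
# --- A minimal standalone version of that idea; the prefixes in the example are illustrative only.
def rename_prefix(checkpoint, old_prefix, new_prefix):
    # Copy every tensor stored under old_prefix to the same relative name under new_prefix.
    return {
        new_prefix + key[len(old_prefix):]: tensor
        for key, tensor in checkpoint.items()
        if key.startswith(old_prefix)
    }
# Example usage (hypothetical keys):
# rename_prefix(ckpt, "encoder.down.2.downsample.conv", "encoder.down_blocks.2.downsamplers.0.conv")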
+ diffusers_downsample_prefix = f"{diffusers_down_block_prefix}.downsamplers.0.conv" + downsample_prefix = f"{down_block_prefix}.downsample.conv" + diffusers_checkpoint.update( + { + f"{diffusers_downsample_prefix}.weight": checkpoint[f"{downsample_prefix}.weight"], + f"{diffusers_downsample_prefix}.bias": checkpoint[f"{downsample_prefix}.bias"], + } + ) + + # attentions + + if hasattr(down_block, "attentions"): + for attention_idx, _ in enumerate(down_block.attentions): + diffusers_attention_prefix = f"{diffusers_down_block_prefix}.attentions.{attention_idx}" + attention_prefix = f"{down_block_prefix}.attn.{attention_idx}" + diffusers_checkpoint.update( + movq_attention_to_diffusers_checkpoint( + checkpoint, + diffusers_attention_prefix=diffusers_attention_prefix, + attention_prefix=attention_prefix, + ) + ) + + # mid block + + # mid block attentions + + # There is a single hardcoded attention block in the middle of the VQ-diffusion encoder + diffusers_attention_prefix = "encoder.mid_block.attentions.0" + attention_prefix = "encoder.mid.attn_1" + diffusers_checkpoint.update( + movq_attention_to_diffusers_checkpoint( + checkpoint, diffusers_attention_prefix=diffusers_attention_prefix, attention_prefix=attention_prefix + ) + ) + + # mid block resnets + + for diffusers_resnet_idx, resnet in enumerate(model.encoder.mid_block.resnets): + diffusers_resnet_prefix = f"encoder.mid_block.resnets.{diffusers_resnet_idx}" + + # the hardcoded prefixes to `block_` are 1 and 2 + orig_resnet_idx = diffusers_resnet_idx + 1 + # There are two hardcoded resnets in the middle of the VQ-diffusion encoder + resnet_prefix = f"encoder.mid.block_{orig_resnet_idx}" + + diffusers_checkpoint.update( + movq_resnet_to_diffusers_checkpoint( + resnet, checkpoint, diffusers_resnet_prefix=diffusers_resnet_prefix, resnet_prefix=resnet_prefix + ) + ) + + diffusers_checkpoint.update( + { + # conv_norm_out + "encoder.conv_norm_out.weight": checkpoint["encoder.norm_out.weight"], + "encoder.conv_norm_out.bias": checkpoint["encoder.norm_out.bias"], + # conv_out + "encoder.conv_out.weight": checkpoint["encoder.conv_out.weight"], + "encoder.conv_out.bias": checkpoint["encoder.conv_out.bias"], + } + ) + + return diffusers_checkpoint + + +def movq_decoder_to_diffusers_checkpoint(model, checkpoint): + diffusers_checkpoint = {} + + # conv in + diffusers_checkpoint.update( + { + "decoder.conv_in.weight": checkpoint["decoder.conv_in.weight"], + "decoder.conv_in.bias": checkpoint["decoder.conv_in.bias"], + } + ) + + # up_blocks + + for diffusers_up_block_idx, up_block in enumerate(model.decoder.up_blocks): + # up_blocks are stored in reverse order in the VQ-diffusion checkpoint + orig_up_block_idx = len(model.decoder.up_blocks) - 1 - diffusers_up_block_idx + + diffusers_up_block_prefix = f"decoder.up_blocks.{diffusers_up_block_idx}" + up_block_prefix = f"decoder.up.{orig_up_block_idx}" + + # resnets + for resnet_idx, resnet in enumerate(up_block.resnets): + diffusers_resnet_prefix = f"{diffusers_up_block_prefix}.resnets.{resnet_idx}" + resnet_prefix = f"{up_block_prefix}.block.{resnet_idx}" + + diffusers_checkpoint.update( + movq_resnet_to_diffusers_checkpoint_spatial_norm( + resnet, checkpoint, diffusers_resnet_prefix=diffusers_resnet_prefix, resnet_prefix=resnet_prefix + ) + ) + + # upsample + + # there is no up sample on the last up block + if diffusers_up_block_idx != len(model.decoder.up_blocks) - 1: + # There's a single upsample in the VQ-diffusion checkpoint but a list of downsamples + # in the diffusers model. 
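# --- Illustrative sketch, not part of the patch: as noted above, diffusers enumerates decoder up
# --- blocks in the opposite order from the original "decoder.up.{i}" modules, so diffusers index i
# --- reads from original index (n - 1 - i). With the 4 up blocks implied by MOVQ_CONFIG above
# --- (assumption: one up block per entry of block_out_channels):
num_up_blocks = 4
index_map = {i: num_up_blocks - 1 - i for i in range(num_up_blocks)}
assert index_map == {0: 3, 1: 2, 2: 1, 3: 0}
# i.e. diffusers "decoder.up_blocks.0" is filled from the original "decoder.up.3", and the
# upsampler keys copied just below follow the same reversed index.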
+ diffusers_downsample_prefix = f"{diffusers_up_block_prefix}.upsamplers.0.conv" + downsample_prefix = f"{up_block_prefix}.upsample.conv" + diffusers_checkpoint.update( + { + f"{diffusers_downsample_prefix}.weight": checkpoint[f"{downsample_prefix}.weight"], + f"{diffusers_downsample_prefix}.bias": checkpoint[f"{downsample_prefix}.bias"], + } + ) + + # attentions + + if hasattr(up_block, "attentions"): + for attention_idx, _ in enumerate(up_block.attentions): + diffusers_attention_prefix = f"{diffusers_up_block_prefix}.attentions.{attention_idx}" + attention_prefix = f"{up_block_prefix}.attn.{attention_idx}" + diffusers_checkpoint.update( + movq_attention_to_diffusers_checkpoint_spatial_norm( + checkpoint, + diffusers_attention_prefix=diffusers_attention_prefix, + attention_prefix=attention_prefix, + ) + ) + + # mid block + + # mid block attentions + + # There is a single hardcoded attention block in the middle of the VQ-diffusion decoder + diffusers_attention_prefix = "decoder.mid_block.attentions.0" + attention_prefix = "decoder.mid.attn_1" + diffusers_checkpoint.update( + movq_attention_to_diffusers_checkpoint_spatial_norm( + checkpoint, diffusers_attention_prefix=diffusers_attention_prefix, attention_prefix=attention_prefix + ) + ) + + # mid block resnets + + for diffusers_resnet_idx, resnet in enumerate(model.encoder.mid_block.resnets): + diffusers_resnet_prefix = f"decoder.mid_block.resnets.{diffusers_resnet_idx}" + + # the hardcoded prefixes to `block_` are 1 and 2 + orig_resnet_idx = diffusers_resnet_idx + 1 + # There are two hardcoded resnets in the middle of the VQ-diffusion decoder + resnet_prefix = f"decoder.mid.block_{orig_resnet_idx}" + + diffusers_checkpoint.update( + movq_resnet_to_diffusers_checkpoint_spatial_norm( + resnet, checkpoint, diffusers_resnet_prefix=diffusers_resnet_prefix, resnet_prefix=resnet_prefix + ) + ) + + diffusers_checkpoint.update( + { + # conv_norm_out + "decoder.conv_norm_out.norm_layer.weight": checkpoint["decoder.norm_out.norm_layer.weight"], + "decoder.conv_norm_out.norm_layer.bias": checkpoint["decoder.norm_out.norm_layer.bias"], + "decoder.conv_norm_out.conv_y.weight": checkpoint["decoder.norm_out.conv_y.weight"], + "decoder.conv_norm_out.conv_y.bias": checkpoint["decoder.norm_out.conv_y.bias"], + "decoder.conv_norm_out.conv_b.weight": checkpoint["decoder.norm_out.conv_b.weight"], + "decoder.conv_norm_out.conv_b.bias": checkpoint["decoder.norm_out.conv_b.bias"], + # conv_out + "decoder.conv_out.weight": checkpoint["decoder.conv_out.weight"], + "decoder.conv_out.bias": checkpoint["decoder.conv_out.bias"], + } + ) + + return diffusers_checkpoint + + +def movq_resnet_to_diffusers_checkpoint(resnet, checkpoint, *, diffusers_resnet_prefix, resnet_prefix): + rv = { + # norm1 + f"{diffusers_resnet_prefix}.norm1.weight": checkpoint[f"{resnet_prefix}.norm1.weight"], + f"{diffusers_resnet_prefix}.norm1.bias": checkpoint[f"{resnet_prefix}.norm1.bias"], + # conv1 + f"{diffusers_resnet_prefix}.conv1.weight": checkpoint[f"{resnet_prefix}.conv1.weight"], + f"{diffusers_resnet_prefix}.conv1.bias": checkpoint[f"{resnet_prefix}.conv1.bias"], + # norm2 + f"{diffusers_resnet_prefix}.norm2.weight": checkpoint[f"{resnet_prefix}.norm2.weight"], + f"{diffusers_resnet_prefix}.norm2.bias": checkpoint[f"{resnet_prefix}.norm2.bias"], + # conv2 + f"{diffusers_resnet_prefix}.conv2.weight": checkpoint[f"{resnet_prefix}.conv2.weight"], + f"{diffusers_resnet_prefix}.conv2.bias": checkpoint[f"{resnet_prefix}.conv2.bias"], + } + + if resnet.conv_shortcut is not None: + rv.update( 
+ { + f"{diffusers_resnet_prefix}.conv_shortcut.weight": checkpoint[f"{resnet_prefix}.nin_shortcut.weight"], + f"{diffusers_resnet_prefix}.conv_shortcut.bias": checkpoint[f"{resnet_prefix}.nin_shortcut.bias"], + } + ) + + return rv + +def movq_resnet_to_diffusers_checkpoint_spatial_norm(resnet, checkpoint, *, diffusers_resnet_prefix, resnet_prefix): + rv = { + # norm1 + f"{diffusers_resnet_prefix}.norm1.norm_layer.weight": checkpoint[f"{resnet_prefix}.norm1.norm_layer.weight"], + f"{diffusers_resnet_prefix}.norm1.norm_layer.bias": checkpoint[f"{resnet_prefix}.norm1.norm_layer.bias"], + f"{diffusers_resnet_prefix}.norm1.conv_y.weight": checkpoint[f"{resnet_prefix}.norm1.conv_y.weight"], + f"{diffusers_resnet_prefix}.norm1.conv_y.bias": checkpoint[f"{resnet_prefix}.norm1.conv_y.bias"], + f"{diffusers_resnet_prefix}.norm1.conv_b.weight": checkpoint[f"{resnet_prefix}.norm1.conv_b.weight"], + f"{diffusers_resnet_prefix}.norm1.conv_b.bias": checkpoint[f"{resnet_prefix}.norm1.conv_b.bias"], + # conv1 + f"{diffusers_resnet_prefix}.conv1.weight": checkpoint[f"{resnet_prefix}.conv1.weight"], + f"{diffusers_resnet_prefix}.conv1.bias": checkpoint[f"{resnet_prefix}.conv1.bias"], + # norm2 + f"{diffusers_resnet_prefix}.norm2.norm_layer.weight": checkpoint[f"{resnet_prefix}.norm2.norm_layer.weight"], + f"{diffusers_resnet_prefix}.norm2.norm_layer.bias": checkpoint[f"{resnet_prefix}.norm2.norm_layer.bias"], + f"{diffusers_resnet_prefix}.norm2.conv_y.weight": checkpoint[f"{resnet_prefix}.norm2.conv_y.weight"], + f"{diffusers_resnet_prefix}.norm2.conv_y.bias": checkpoint[f"{resnet_prefix}.norm2.conv_y.bias"], + f"{diffusers_resnet_prefix}.norm2.conv_b.weight": checkpoint[f"{resnet_prefix}.norm2.conv_b.weight"], + f"{diffusers_resnet_prefix}.norm2.conv_b.bias": checkpoint[f"{resnet_prefix}.norm2.conv_b.bias"], + # conv2 + f"{diffusers_resnet_prefix}.conv2.weight": checkpoint[f"{resnet_prefix}.conv2.weight"], + f"{diffusers_resnet_prefix}.conv2.bias": checkpoint[f"{resnet_prefix}.conv2.bias"], + } + + if resnet.conv_shortcut is not None: + rv.update( + { + f"{diffusers_resnet_prefix}.conv_shortcut.weight": checkpoint[f"{resnet_prefix}.nin_shortcut.weight"], + f"{diffusers_resnet_prefix}.conv_shortcut.bias": checkpoint[f"{resnet_prefix}.nin_shortcut.bias"], + } + ) + + return rv + + + +def movq_attention_to_diffusers_checkpoint(checkpoint, *, diffusers_attention_prefix, attention_prefix): + return { + # norm + f"{diffusers_attention_prefix}.norm.weight": checkpoint[f"{attention_prefix}.norm.weight"], + f"{diffusers_attention_prefix}.norm.bias": checkpoint[f"{attention_prefix}.norm.bias"], + # query + f"{diffusers_attention_prefix}.query.weight": checkpoint[f"{attention_prefix}.q.weight"][:, :, 0, 0], + f"{diffusers_attention_prefix}.query.bias": checkpoint[f"{attention_prefix}.q.bias"], + # key + f"{diffusers_attention_prefix}.key.weight": checkpoint[f"{attention_prefix}.k.weight"][:, :, 0, 0], + f"{diffusers_attention_prefix}.key.bias": checkpoint[f"{attention_prefix}.k.bias"], + # value + f"{diffusers_attention_prefix}.value.weight": checkpoint[f"{attention_prefix}.v.weight"][:, :, 0, 0], + f"{diffusers_attention_prefix}.value.bias": checkpoint[f"{attention_prefix}.v.bias"], + # proj_attn + f"{diffusers_attention_prefix}.proj_attn.weight": checkpoint[f"{attention_prefix}.proj_out.weight"][ + :, :, 0, 0 + ], + f"{diffusers_attention_prefix}.proj_attn.bias": checkpoint[f"{attention_prefix}.proj_out.bias"], + } + +def movq_attention_to_diffusers_checkpoint_spatial_norm(checkpoint, *, diffusers_attention_prefix, 
attention_prefix): + return { + # norm + f"{diffusers_attention_prefix}.norm.norm_layer.weight": checkpoint[f"{attention_prefix}.norm.norm_layer.weight"], + f"{diffusers_attention_prefix}.norm.norm_layer.bias": checkpoint[f"{attention_prefix}.norm.norm_layer.bias"], + f"{diffusers_attention_prefix}.norm.conv_y.weight": checkpoint[f"{attention_prefix}.norm.conv_y.weight"], + f"{diffusers_attention_prefix}.norm.conv_y.bias": checkpoint[f"{attention_prefix}.norm.conv_y.bias"], + f"{diffusers_attention_prefix}.norm.conv_b.weight": checkpoint[f"{attention_prefix}.norm.conv_b.weight"], + f"{diffusers_attention_prefix}.norm.conv_b.bias": checkpoint[f"{attention_prefix}.norm.conv_b.bias"], + # query + f"{diffusers_attention_prefix}.attention.to_q.weight": checkpoint[f"{attention_prefix}.q.weight"][:, :, 0, 0], + f"{diffusers_attention_prefix}.attention.to_q.bias": checkpoint[f"{attention_prefix}.q.bias"], + # key + f"{diffusers_attention_prefix}.attention.to_k.weight": checkpoint[f"{attention_prefix}.k.weight"][:, :, 0, 0], + f"{diffusers_attention_prefix}.attention.to_k.bias": checkpoint[f"{attention_prefix}.k.bias"], + # value + f"{diffusers_attention_prefix}.attention.to_v.weight": checkpoint[f"{attention_prefix}.v.weight"][:, :, 0, 0], + f"{diffusers_attention_prefix}.attention.to_v.bias": checkpoint[f"{attention_prefix}.v.bias"], + # proj_attn + f"{diffusers_attention_prefix}.attention.to_out.0.weight": checkpoint[f"{attention_prefix}.proj_out.weight"][ + :, :, 0, 0 + ], + f"{diffusers_attention_prefix}.attention.to_out.0.bias": checkpoint[f"{attention_prefix}.proj_out.bias"], + } + + + + + +def movq_original_checkpoint_to_diffusers_checkpoint(model, checkpoint): + diffusers_checkpoint = {} + diffusers_checkpoint.update(movq_encoder_to_diffusers_checkpoint(model, checkpoint)) + + + # quant_conv + + diffusers_checkpoint.update( + { + "quant_conv.weight": checkpoint["quant_conv.weight"], + "quant_conv.bias": checkpoint["quant_conv.bias"], + } + ) + + # quantize + diffusers_checkpoint.update({"quantize.embedding.weight": checkpoint["quantize.embedding.weight"]}) + + # post_quant_conv + diffusers_checkpoint.update( + { + "post_quant_conv.weight": checkpoint["post_quant_conv.weight"], + "post_quant_conv.bias": checkpoint["post_quant_conv.bias"], + } + ) + + # decoder + diffusers_checkpoint.update(movq_decoder_to_diffusers_checkpoint(model, checkpoint)) + + + + for keys in diffusers_checkpoint.keys(): + print(keys) + + return diffusers_checkpoint + + + + + +def movq(*, args, checkpoint_map_location): + print("loading movq") + + movq_checkpoint = torch.load(args.movq_checkpoint_path, map_location=checkpoint_map_location) + + movq_model = movq_model_from_original_config() + + movq_diffusers_checkpoint = movq_original_checkpoint_to_diffusers_checkpoint( + movq_model, movq_checkpoint + ) + + del movq_checkpoint + + load_checkpoint_to_model(movq_diffusers_checkpoint, movq_model, strict=True) + + print("done loading movq") + + return movq_model + def load_checkpoint_to_model(checkpoint, model, strict=False): - with tempfile.NamedTemporaryFile() as file: + with tempfile.NamedTemporaryFile(delete=False) as file: torch.save(checkpoint, file.name) del checkpoint if strict: model.load_state_dict(torch.load(file.name), strict=True) else: load_checkpoint_and_dispatch(model, file.name, device_map="auto") + os.remove(file.name) if __name__ == "__main__": @@ -921,24 +1321,31 @@ def load_checkpoint_to_model(checkpoint, model, strict=False): "--prior_checkpoint_path", default=None, type=str, - required=True, + 
required=False, help="Path to the prior checkpoint to convert.", ) parser.add_argument( - "--clip_stat_path", default=None, type=str, required=True, help="Path to the clip stats checkpoint to convert." + "--clip_stat_path", default=None, type=str, required=False, help="Path to the clip stats checkpoint to convert." ) parser.add_argument( "--text2img_checkpoint_path", default=None, type=str, - required=True, + required=False, + help="Path to the text2img checkpoint to convert.", + ) + parser.add_argument( + "--movq_checkpoint_path", + default=None, + type=str, + required=False, help="Path to the text2img checkpoint to convert.", ) parser.add_argument( "--inpaint_text2img_checkpoint_path", default=None, type=str, - required=True, + required=False, help="Path to the inpaint text2img checkpoint to convert.", ) parser.add_argument( @@ -979,5 +1386,8 @@ def load_checkpoint_to_model(checkpoint, model, strict=False): inpaint_unet_model, inpaint_text_proj_model = inpaint_text2img(args=args, checkpoint_map_location=checkpoint_map_location) inpaint_unet_model.save_pretrained(f"{args.dump_path}/inpaint_unet") inpaint_text_proj_model.save_pretrained(f"{args.dump_path}/inpaint_text_proj") + elif args.debug == 'decoder': + decoder = movq(args=args, checkpoint_map_location=checkpoint_map_location) + decoder.save_pretrained(f"{args.dump_path}/decoder") else: raise ValueError(f"unknown debug value : {args.debug}") diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py index 0b313b83d360..2b31fa9e2f38 100644 --- a/src/diffusers/models/attention.py +++ b/src/diffusers/models/attention.py @@ -369,3 +369,24 @@ def forward(self, x, emb): x = F.group_norm(x, self.num_groups, eps=self.eps) x = x * (1 + scale) + shift return x + +class SpatialNorm(nn.Module): + """ + Spatially conditioned normalization as defined in https://arxiv.org/abs/2209.09002 + """ + def __init__( + self, + f_channels, + zq_channels, + ): + super().__init__() + self.norm_layer = nn.GroupNorm(num_channels=f_channels,num_groups=32,eps=1e-6,affine=True) + self.conv_y = nn.Conv2d(zq_channels, f_channels, kernel_size=1, stride=1, padding=0) + self.conv_b = nn.Conv2d(zq_channels, f_channels, kernel_size=1, stride=1, padding=0) + + def forward(self, f, zq): + f_size = f.shape[-2:] + zq = F.interpolate(zq, size=f_size, mode="nearest") + norm_f = self.norm_layer(f) + new_f = norm_f * self.conv_y(zq) + self.conv_b(zq) + return new_f \ No newline at end of file diff --git a/src/diffusers/models/resnet.py b/src/diffusers/models/resnet.py index d9d539959c09..83bec9a52593 100644 --- a/src/diffusers/models/resnet.py +++ b/src/diffusers/models/resnet.py @@ -20,7 +20,7 @@ import torch.nn as nn import torch.nn.functional as F -from .attention import AdaGroupNorm +from .attention import AdaGroupNorm, SpatialNorm class Upsample1D(nn.Module): @@ -460,7 +460,7 @@ def __init__( eps=1e-6, non_linearity="swish", skip_time_act=False, - time_embedding_norm="default", # default, scale_shift, ada_group + time_embedding_norm="default", # default, scale_shift, ada_group, spatial kernel=None, output_scale_factor=1.0, use_in_shortcut=None, @@ -487,6 +487,8 @@ def __init__( if self.time_embedding_norm == "ada_group": self.norm1 = AdaGroupNorm(temb_channels, in_channels, groups, eps=eps) + elif self.time_embedding_norm == "spatial": + self.norm1 = SpatialNorm(in_channels, temb_channels) else: self.norm1 = torch.nn.GroupNorm(num_groups=groups, num_channels=in_channels, eps=eps, affine=True) @@ -497,7 +499,7 @@ def __init__( self.time_emb_proj = 
torch.nn.Linear(temb_channels, out_channels) elif self.time_embedding_norm == "scale_shift": self.time_emb_proj = torch.nn.Linear(temb_channels, 2 * out_channels) - elif self.time_embedding_norm == "ada_group": + elif self.time_embedding_norm == "ada_group" or self.time_embedding_norm == "spatial": self.time_emb_proj = None else: raise ValueError(f"unknown time_embedding_norm : {self.time_embedding_norm} ") @@ -506,6 +508,8 @@ def __init__( if self.time_embedding_norm == "ada_group": self.norm2 = AdaGroupNorm(temb_channels, out_channels, groups_out, eps=eps) + elif self.time_embedding_norm == "spatial": + self.norm2 = SpatialNorm(out_channels, temb_channels) else: self.norm2 = torch.nn.GroupNorm(num_groups=groups_out, num_channels=out_channels, eps=eps, affine=True) @@ -551,7 +555,7 @@ def __init__( def forward(self, input_tensor, temb): hidden_states = input_tensor - if self.time_embedding_norm == "ada_group": + if self.time_embedding_norm == "ada_group" or self.time_embedding_norm == "spatial": hidden_states = self.norm1(hidden_states, temb) else: hidden_states = self.norm1(hidden_states) @@ -579,7 +583,7 @@ def forward(self, input_tensor, temb): if temb is not None and self.time_embedding_norm == "default": hidden_states = hidden_states + temb - if self.time_embedding_norm == "ada_group": + if self.time_embedding_norm == "ada_group" or self.time_embedding_norm == "spatial": hidden_states = self.norm2(hidden_states, temb) else: hidden_states = self.norm2(hidden_states) diff --git a/src/diffusers/models/vae.py b/src/diffusers/models/vae.py index 400c3030af90..776203042e9b 100644 --- a/src/diffusers/models/vae.py +++ b/src/diffusers/models/vae.py @@ -20,7 +20,7 @@ from ..utils import BaseOutput, randn_tensor from .unet_2d_blocks import UNetMidBlock2D, get_down_block, get_up_block - +from .attention import SpatialNorm @dataclass class DecoderOutput(BaseOutput): @@ -149,6 +149,7 @@ def __init__( layers_per_block=2, norm_num_groups=32, act_fn="silu", + norm_type="default", # default, spatial ): super().__init__() self.layers_per_block = layers_per_block @@ -164,16 +165,19 @@ def __init__( self.mid_block = None self.up_blocks = nn.ModuleList([]) + + temb_channels = in_channels if norm_type == "spatial" else None + # mid self.mid_block = UNetMidBlock2D( in_channels=block_out_channels[-1], resnet_eps=1e-6, resnet_act_fn=act_fn, output_scale_factor=1, - resnet_time_scale_shift="default", + resnet_time_scale_shift=norm_type, attn_num_head_channels=None, resnet_groups=norm_num_groups, - temb_channels=None, + temb_channels=temb_channels, ) # up @@ -196,19 +200,23 @@ def __init__( resnet_act_fn=act_fn, resnet_groups=norm_num_groups, attn_num_head_channels=None, - temb_channels=None, + temb_channels=temb_channels, + resnet_time_scale_shift=norm_type, ) self.up_blocks.append(up_block) prev_output_channel = output_channel # out - self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=1e-6) + if norm_type == "spatial": + self.conv_norm_out = SpatialNorm(block_out_channels[0], temb_channels) + else: + self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=1e-6) self.conv_act = nn.SiLU() self.conv_out = nn.Conv2d(block_out_channels[0], out_channels, 3, padding=1) self.gradient_checkpointing = False - def forward(self, z): + def forward(self, z, zq=None): sample = z sample = self.conv_in(sample) @@ -230,15 +238,18 @@ def custom_forward(*inputs): sample = 
torch.utils.checkpoint.checkpoint(create_custom_forward(up_block), sample) else: # middle - sample = self.mid_block(sample) + sample = self.mid_block(sample, zq) sample = sample.to(upscale_dtype) # up for up_block in self.up_blocks: - sample = up_block(sample) + sample = up_block(sample, zq) # post-process - sample = self.conv_norm_out(sample) + if zq is None: + sample = self.conv_norm_out(sample) + else: + sample = self.conv_norm_out(sample, zq) sample = self.conv_act(sample) sample = self.conv_out(sample) diff --git a/src/diffusers/models/vq_model.py b/src/diffusers/models/vq_model.py index 65f734dccb2d..040447ba82c8 100644 --- a/src/diffusers/models/vq_model.py +++ b/src/diffusers/models/vq_model.py @@ -82,9 +82,11 @@ def __init__( norm_num_groups: int = 32, vq_embed_dim: Optional[int] = None, scaling_factor: float = 0.18215, + norm_type: str = "default" ): super().__init__() + # pass init params to Encoder self.encoder = Encoder( in_channels=in_channels, @@ -112,6 +114,7 @@ def __init__( layers_per_block=layers_per_block, act_fn=act_fn, norm_num_groups=norm_num_groups, + norm_type=norm_type, ) def encode(self, x: torch.FloatTensor, return_dict: bool = True) -> VQEncoderOutput: @@ -131,8 +134,8 @@ def decode( quant, emb_loss, info = self.quantize(h) else: quant = h - quant = self.post_quant_conv(quant) - dec = self.decoder(quant) + quant2 = self.post_quant_conv(quant) + dec = self.decoder(quant2, quant if self.config.norm_type == "spatial" else None) if not return_dict: return (dec,) From 4f5f38084d99dfabf3986ac9928475d276025722 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 13 May 2023 02:57:11 +0000 Subject: [PATCH 040/182] adapt attention_processor for movq --- scripts/convert_kandinsky_to_diffusers.py | 125 +++++++++--------- src/diffusers/__init__.py | 2 +- src/diffusers/models/attention.py | 23 +--- src/diffusers/models/attention_processor.py | 38 +++++- src/diffusers/models/resnet.py | 3 +- src/diffusers/models/unet_2d_blocks.py | 30 +++-- src/diffusers/models/vae.py | 10 +- src/diffusers/models/vq_model.py | 5 +- src/diffusers/pipelines/__init__.py | 2 +- src/diffusers/pipelines/kandinsky/__init__.py | 2 +- .../pipelines/kandinsky/pipeline_kandinsky.py | 3 +- .../kandinsky/pipeline_kandinsky_inpaint.py | 28 ++-- .../kandinsky/pipeline_kandinsky_prior.py | 4 +- 13 files changed, 148 insertions(+), 127 deletions(-) diff --git a/scripts/convert_kandinsky_to_diffusers.py b/scripts/convert_kandinsky_to_diffusers.py index 0462772e3140..37613369ca92 100644 --- a/scripts/convert_kandinsky_to_diffusers.py +++ b/scripts/convert_kandinsky_to_diffusers.py @@ -1,6 +1,6 @@ import argparse -import tempfile import os +import tempfile import torch from accelerate import load_checkpoint_and_dispatch @@ -26,8 +26,9 @@ --clip_stat_path /home/yiyi_huggingface_co/Kandinsky-2/checkpoints_Kandinsky_2.1/ViT-L-14_stats.th \ --text2img_checkpoint_path /home/yiyi_huggingface_co/Kandinsky-2/checkpoints_Kandinsky_2.1/decoder_fp16.ckpt \ --inpaint_text2img_checkpoint_path /home/yiyi_huggingface_co/Kandinsky-2/checkpoints_Kandinsky_2.1/inpainting_fp16.ckpt \ - --dump_path /home/yiyi_huggingface_co/model_repo/Kandinsky-inpaint \ - --debug inpaint_text2img + --movq_checkpoint_path /home/yiyi_huggingface_co/Kandinsky-2/checkpoints_Kandinsky_2.1/movq_final.ckpt \ + --dump_path /home/yiyi_huggingface_co/dump \ + --debug decoder ``` """ @@ -259,6 +260,7 @@ def prior_ff_to_diffusers(checkpoint, *, diffusers_ff_prefix, original_ff_prefix "use_linear_projection": False, } + def unet_model_from_original_config(): 
model = UNet2DConditionModel(**UNET_CONFIG) @@ -369,6 +371,7 @@ def unet_original_checkpoint_to_diffusers_checkpoint(model, checkpoint): "use_linear_projection": False, } + def inpaint_unet_model_from_original_config(): model = UNet2DConditionModel(**INPAINT_UNET_CONFIG) @@ -874,14 +877,19 @@ def text2img(*, args, checkpoint_map_location): return unet_model, text_proj_model + def inpaint_text2img(*, args, checkpoint_map_location): print("loading inpaint text2img") - inpaint_text2img_checkpoint = torch.load(args.inpaint_text2img_checkpoint_path, map_location=checkpoint_map_location) + inpaint_text2img_checkpoint = torch.load( + args.inpaint_text2img_checkpoint_path, map_location=checkpoint_map_location + ) inpaint_unet_model = inpaint_unet_model_from_original_config() - inpaint_unet_diffusers_checkpoint = inpaint_unet_original_checkpoint_to_diffusers_checkpoint(inpaint_unet_model, inpaint_text2img_checkpoint) + inpaint_unet_diffusers_checkpoint = inpaint_unet_original_checkpoint_to_diffusers_checkpoint( + inpaint_unet_model, inpaint_text2img_checkpoint + ) # text proj interlude @@ -903,26 +911,28 @@ def inpaint_text2img(*, args, checkpoint_map_location): return inpaint_unet_model, text_proj_model + # movq -MOVQ_CONFIG ={ - "in_channels":3, - "out_channels":3, - "latent_channels":4, - "down_block_types":("DownEncoderBlock2D", "DownEncoderBlock2D", "DownEncoderBlock2D", "AttnDownEncoderBlock2D"), - "up_block_types":("AttnUpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D"), - "num_vq_embeddings":16384, - "block_out_channels":(128, 256, 256, 512), - "vq_embed_dim":4, - "layers_per_block":2, - "norm_type":"spatial" - } +MOVQ_CONFIG = { + "in_channels": 3, + "out_channels": 3, + "latent_channels": 4, + "down_block_types": ("DownEncoderBlock2D", "DownEncoderBlock2D", "DownEncoderBlock2D", "AttnDownEncoderBlock2D"), + "up_block_types": ("AttnUpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D"), + "num_vq_embeddings": 16384, + "block_out_channels": (128, 256, 256, 512), + "vq_embed_dim": 4, + "layers_per_block": 2, + "norm_type": "spatial", +} def movq_model_from_original_config(): - movq = VQModel(**MOVQ_CONFIG ) + movq = VQModel(**MOVQ_CONFIG) return movq + def movq_encoder_to_diffusers_checkpoint(model, checkpoint): diffusers_checkpoint = {} @@ -1156,6 +1166,7 @@ def movq_resnet_to_diffusers_checkpoint(resnet, checkpoint, *, diffusers_resnet_ return rv + def movq_resnet_to_diffusers_checkpoint_spatial_norm(resnet, checkpoint, *, diffusers_resnet_prefix, resnet_prefix): rv = { # norm1 @@ -1191,62 +1202,58 @@ def movq_resnet_to_diffusers_checkpoint_spatial_norm(resnet, checkpoint, *, diff return rv - def movq_attention_to_diffusers_checkpoint(checkpoint, *, diffusers_attention_prefix, attention_prefix): return { # norm - f"{diffusers_attention_prefix}.norm.weight": checkpoint[f"{attention_prefix}.norm.weight"], - f"{diffusers_attention_prefix}.norm.bias": checkpoint[f"{attention_prefix}.norm.bias"], + f"{diffusers_attention_prefix}.group_norm.weight": checkpoint[f"{attention_prefix}.norm.weight"], + f"{diffusers_attention_prefix}.group_norm.bias": checkpoint[f"{attention_prefix}.norm.bias"], # query - f"{diffusers_attention_prefix}.query.weight": checkpoint[f"{attention_prefix}.q.weight"][:, :, 0, 0], - f"{diffusers_attention_prefix}.query.bias": checkpoint[f"{attention_prefix}.q.bias"], + f"{diffusers_attention_prefix}.to_q.weight": checkpoint[f"{attention_prefix}.q.weight"][:, :, 0, 0], + f"{diffusers_attention_prefix}.to_q.bias": 
checkpoint[f"{attention_prefix}.q.bias"], # key - f"{diffusers_attention_prefix}.key.weight": checkpoint[f"{attention_prefix}.k.weight"][:, :, 0, 0], - f"{diffusers_attention_prefix}.key.bias": checkpoint[f"{attention_prefix}.k.bias"], + f"{diffusers_attention_prefix}.to_k.weight": checkpoint[f"{attention_prefix}.k.weight"][:, :, 0, 0], + f"{diffusers_attention_prefix}.to_k.bias": checkpoint[f"{attention_prefix}.k.bias"], # value - f"{diffusers_attention_prefix}.value.weight": checkpoint[f"{attention_prefix}.v.weight"][:, :, 0, 0], - f"{diffusers_attention_prefix}.value.bias": checkpoint[f"{attention_prefix}.v.bias"], + f"{diffusers_attention_prefix}.to_v.weight": checkpoint[f"{attention_prefix}.v.weight"][:, :, 0, 0], + f"{diffusers_attention_prefix}.to_v.bias": checkpoint[f"{attention_prefix}.v.bias"], # proj_attn - f"{diffusers_attention_prefix}.proj_attn.weight": checkpoint[f"{attention_prefix}.proj_out.weight"][ + f"{diffusers_attention_prefix}.to_out.0.weight": checkpoint[f"{attention_prefix}.proj_out.weight"][ :, :, 0, 0 ], - f"{diffusers_attention_prefix}.proj_attn.bias": checkpoint[f"{attention_prefix}.proj_out.bias"], + f"{diffusers_attention_prefix}.to_out.0.bias": checkpoint[f"{attention_prefix}.proj_out.bias"], } + def movq_attention_to_diffusers_checkpoint_spatial_norm(checkpoint, *, diffusers_attention_prefix, attention_prefix): return { # norm - f"{diffusers_attention_prefix}.norm.norm_layer.weight": checkpoint[f"{attention_prefix}.norm.norm_layer.weight"], - f"{diffusers_attention_prefix}.norm.norm_layer.bias": checkpoint[f"{attention_prefix}.norm.norm_layer.bias"], - f"{diffusers_attention_prefix}.norm.conv_y.weight": checkpoint[f"{attention_prefix}.norm.conv_y.weight"], - f"{diffusers_attention_prefix}.norm.conv_y.bias": checkpoint[f"{attention_prefix}.norm.conv_y.bias"], - f"{diffusers_attention_prefix}.norm.conv_b.weight": checkpoint[f"{attention_prefix}.norm.conv_b.weight"], - f"{diffusers_attention_prefix}.norm.conv_b.bias": checkpoint[f"{attention_prefix}.norm.conv_b.bias"], + f"{diffusers_attention_prefix}.spatial_norm.norm_layer.weight": checkpoint[f"{attention_prefix}.norm.norm_layer.weight"], + f"{diffusers_attention_prefix}.spatial_norm.norm_layer.bias": checkpoint[f"{attention_prefix}.norm.norm_layer.bias"], + f"{diffusers_attention_prefix}.spatial_norm.conv_y.weight": checkpoint[f"{attention_prefix}.norm.conv_y.weight"], + f"{diffusers_attention_prefix}.spatial_norm.conv_y.bias": checkpoint[f"{attention_prefix}.norm.conv_y.bias"], + f"{diffusers_attention_prefix}.spatial_norm.conv_b.weight": checkpoint[f"{attention_prefix}.norm.conv_b.weight"], + f"{diffusers_attention_prefix}.spatial_norm.conv_b.bias": checkpoint[f"{attention_prefix}.norm.conv_b.bias"], # query - f"{diffusers_attention_prefix}.attention.to_q.weight": checkpoint[f"{attention_prefix}.q.weight"][:, :, 0, 0], - f"{diffusers_attention_prefix}.attention.to_q.bias": checkpoint[f"{attention_prefix}.q.bias"], + f"{diffusers_attention_prefix}.to_q.weight": checkpoint[f"{attention_prefix}.q.weight"][:, :, 0, 0], + f"{diffusers_attention_prefix}.to_q.bias": checkpoint[f"{attention_prefix}.q.bias"], # key - f"{diffusers_attention_prefix}.attention.to_k.weight": checkpoint[f"{attention_prefix}.k.weight"][:, :, 0, 0], - f"{diffusers_attention_prefix}.attention.to_k.bias": checkpoint[f"{attention_prefix}.k.bias"], + f"{diffusers_attention_prefix}.to_k.weight": checkpoint[f"{attention_prefix}.k.weight"][:, :, 0, 0], + f"{diffusers_attention_prefix}.to_k.bias": checkpoint[f"{attention_prefix}.k.bias"], # 
value - f"{diffusers_attention_prefix}.attention.to_v.weight": checkpoint[f"{attention_prefix}.v.weight"][:, :, 0, 0], - f"{diffusers_attention_prefix}.attention.to_v.bias": checkpoint[f"{attention_prefix}.v.bias"], + f"{diffusers_attention_prefix}.to_v.weight": checkpoint[f"{attention_prefix}.v.weight"][:, :, 0, 0], + f"{diffusers_attention_prefix}.to_v.bias": checkpoint[f"{attention_prefix}.v.bias"], # proj_attn - f"{diffusers_attention_prefix}.attention.to_out.0.weight": checkpoint[f"{attention_prefix}.proj_out.weight"][ + f"{diffusers_attention_prefix}.to_out.0.weight": checkpoint[f"{attention_prefix}.proj_out.weight"][ :, :, 0, 0 ], - f"{diffusers_attention_prefix}.attention.to_out.0.bias": checkpoint[f"{attention_prefix}.proj_out.bias"], + f"{diffusers_attention_prefix}.to_out.0.bias": checkpoint[f"{attention_prefix}.proj_out.bias"], } - - - def movq_original_checkpoint_to_diffusers_checkpoint(model, checkpoint): diffusers_checkpoint = {} diffusers_checkpoint.update(movq_encoder_to_diffusers_checkpoint(model, checkpoint)) - # quant_conv diffusers_checkpoint.update( @@ -1270,17 +1277,9 @@ def movq_original_checkpoint_to_diffusers_checkpoint(model, checkpoint): # decoder diffusers_checkpoint.update(movq_decoder_to_diffusers_checkpoint(model, checkpoint)) - - - for keys in diffusers_checkpoint.keys(): - print(keys) - return diffusers_checkpoint - - - def movq(*, args, checkpoint_map_location): print("loading movq") @@ -1288,9 +1287,7 @@ def movq(*, args, checkpoint_map_location): movq_model = movq_model_from_original_config() - movq_diffusers_checkpoint = movq_original_checkpoint_to_diffusers_checkpoint( - movq_model, movq_checkpoint - ) + movq_diffusers_checkpoint = movq_original_checkpoint_to_diffusers_checkpoint(movq_model, movq_checkpoint) del movq_checkpoint @@ -1325,7 +1322,11 @@ def load_checkpoint_to_model(checkpoint, model, strict=False): help="Path to the prior checkpoint to convert.", ) parser.add_argument( - "--clip_stat_path", default=None, type=str, required=False, help="Path to the clip stats checkpoint to convert." 
+ "--clip_stat_path", + default=None, + type=str, + required=False, + help="Path to the clip stats checkpoint to convert.", ) parser.add_argument( "--text2img_checkpoint_path", @@ -1383,10 +1384,12 @@ def load_checkpoint_to_model(checkpoint, model, strict=False): unet_model.save_pretrained(f"{args.dump_path}/unet") text_proj_model.save_pretrained(f"{args.dump_path}/text_proj") elif args.debug == "inpaint_text2img": - inpaint_unet_model, inpaint_text_proj_model = inpaint_text2img(args=args, checkpoint_map_location=checkpoint_map_location) + inpaint_unet_model, inpaint_text_proj_model = inpaint_text2img( + args=args, checkpoint_map_location=checkpoint_map_location + ) inpaint_unet_model.save_pretrained(f"{args.dump_path}/inpaint_unet") inpaint_text_proj_model.save_pretrained(f"{args.dump_path}/inpaint_text_proj") - elif args.debug == 'decoder': + elif args.debug == "decoder": decoder = movq(args=args, checkpoint_map_location=checkpoint_map_location) decoder.save_pretrained(f"{args.dump_path}/decoder") else: diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index b4551d0e2b7d..53e70a96928e 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -128,9 +128,9 @@ IFInpaintingSuperResolutionPipeline, IFPipeline, IFSuperResolutionPipeline, + KandinskyInpaintPipeline, KandinskyPipeline, KandinskyPriorPipeline, - KandinskyInpaintPipeline, LDMTextToImagePipeline, PaintByExamplePipeline, SemanticStableDiffusionPipeline, diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py index 2b31fa9e2f38..d83d1c7ecbd1 100644 --- a/src/diffusers/models/attention.py +++ b/src/diffusers/models/attention.py @@ -368,25 +368,4 @@ def forward(self, x, emb): x = F.group_norm(x, self.num_groups, eps=self.eps) x = x * (1 + scale) + shift - return x - -class SpatialNorm(nn.Module): - """ - Spatially conditioned normalization as defined in https://arxiv.org/abs/2209.09002 - """ - def __init__( - self, - f_channels, - zq_channels, - ): - super().__init__() - self.norm_layer = nn.GroupNorm(num_channels=f_channels,num_groups=32,eps=1e-6,affine=True) - self.conv_y = nn.Conv2d(zq_channels, f_channels, kernel_size=1, stride=1, padding=0) - self.conv_b = nn.Conv2d(zq_channels, f_channels, kernel_size=1, stride=1, padding=0) - - def forward(self, f, zq): - f_size = f.shape[-2:] - zq = F.interpolate(zq, size=f_size, mode="nearest") - norm_f = self.norm_layer(f) - new_f = norm_f * self.conv_y(zq) + self.conv_b(zq) - return new_f \ No newline at end of file + return x \ No newline at end of file diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py index f88400da0333..050ecfac19a2 100644 --- a/src/diffusers/models/attention_processor.py +++ b/src/diffusers/models/attention_processor.py @@ -62,6 +62,7 @@ def __init__( cross_attention_norm_num_groups: int = 32, added_kv_proj_dim: Optional[int] = None, norm_num_groups: Optional[int] = None, + spatial_norm_dim: Optional[int] = None, out_bias: bool = True, scale_qk: bool = True, only_cross_attention: bool = False, @@ -105,6 +106,11 @@ def __init__( else: self.group_norm = None + if spatial_norm_dim is not None: + self.spatial_norm = SpatialNorm(f_channels=query_dim, zq_channels=spatial_norm_dim) + else: + self.spatial_norm = None + if cross_attention_norm is None: self.norm_cross = None elif cross_attention_norm == "layer_norm": @@ -292,7 +298,9 @@ def set_processor(self, processor: "AttnProcessor"): self.processor = processor - def forward(self, hidden_states, 
encoder_hidden_states=None, attention_mask=None, **cross_attention_kwargs): + def forward( + self, hidden_states, encoder_hidden_states=None, attention_mask=None, vq_emb=None, **cross_attention_kwargs + ): # The `Attention` class can call different attention processors / attention functions # here we simply pass along all tensors to the selected processor class # For standard processors that are defined here, `**cross_attention_kwargs` is empty @@ -301,6 +309,7 @@ def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None hidden_states, encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask, + vq_emb=vq_emb, **cross_attention_kwargs, ) @@ -416,9 +425,13 @@ def __call__( hidden_states, encoder_hidden_states=None, attention_mask=None, + vq_emb=None, ): residual = hidden_states + if attn.spatial_norm is not None: + hidden_states = attn.spatial_norm(hidden_states, vq_emb) + input_ndim = hidden_states.ndim if input_ndim == 4: @@ -1241,3 +1254,26 @@ def __call__(self, attn: "Attention", hidden_states, encoder_hidden_states=None, CustomDiffusionAttnProcessor, CustomDiffusionXFormersAttnProcessor, ] + + +class SpatialNorm(nn.Module): + """ + Spatially conditioned normalization as defined in https://arxiv.org/abs/2209.09002 + """ + + def __init__( + self, + f_channels, + zq_channels, + ): + super().__init__() + self.norm_layer = nn.GroupNorm(num_channels=f_channels, num_groups=32, eps=1e-6, affine=True) + self.conv_y = nn.Conv2d(zq_channels, f_channels, kernel_size=1, stride=1, padding=0) + self.conv_b = nn.Conv2d(zq_channels, f_channels, kernel_size=1, stride=1, padding=0) + + def forward(self, f, zq): + f_size = f.shape[-2:] + zq = F.interpolate(zq, size=f_size, mode="nearest") + norm_f = self.norm_layer(f) + new_f = norm_f * self.conv_y(zq) + self.conv_b(zq) + return new_f \ No newline at end of file diff --git a/src/diffusers/models/resnet.py b/src/diffusers/models/resnet.py index 83bec9a52593..a5e1559c5c2f 100644 --- a/src/diffusers/models/resnet.py +++ b/src/diffusers/models/resnet.py @@ -20,7 +20,8 @@ import torch.nn as nn import torch.nn.functional as F -from .attention import AdaGroupNorm, SpatialNorm +from .attention import AdaGroupNorm +from .attention_processor import SpatialNorm class Upsample1D(nn.Module): diff --git a/src/diffusers/models/unet_2d_blocks.py b/src/diffusers/models/unet_2d_blocks.py index 0004f074c563..fe409474b77a 100644 --- a/src/diffusers/models/unet_2d_blocks.py +++ b/src/diffusers/models/unet_2d_blocks.py @@ -348,6 +348,7 @@ def get_up_block( resnet_act_fn=resnet_act_fn, resnet_groups=resnet_groups, resnet_time_scale_shift=resnet_time_scale_shift, + temb_channels=temb_channels, ) elif up_block_type == "AttnUpDecoderBlock2D": return AttnUpDecoderBlock2D( @@ -360,6 +361,7 @@ def get_up_block( resnet_groups=resnet_groups, attn_num_head_channels=attn_num_head_channels, resnet_time_scale_shift=resnet_time_scale_shift, + temb_channels=temb_channels, ) elif up_block_type == "KUpBlock2D": return KUpBlock2D( @@ -395,7 +397,7 @@ def __init__( dropout: float = 0.0, num_layers: int = 1, resnet_eps: float = 1e-6, - resnet_time_scale_shift: str = "default", + resnet_time_scale_shift: str = "default", # default, spatial resnet_act_fn: str = "swish", resnet_groups: int = 32, resnet_pre_norm: bool = True, @@ -433,7 +435,8 @@ def __init__( dim_head=attn_num_head_channels if attn_num_head_channels is not None else in_channels, rescale_output_factor=output_scale_factor, eps=resnet_eps, - norm_num_groups=resnet_groups, + 
norm_num_groups=resnet_groups if resnet_time_scale_shift == "default" else None, + spatial_norm_dim=temb_channels if resnet_time_scale_shift == "spatial" else None, residual_connection=True, bias=True, upcast_softmax=True, @@ -465,7 +468,7 @@ def forward(self, hidden_states, temb=None): hidden_states = self.resnets[0](hidden_states, temb) for attn, resnet in zip(self.attentions, self.resnets[1:]): if attn is not None: - hidden_states = attn(hidden_states) + hidden_states = attn(hidden_states, vq_emb=temb) hidden_states = resnet(hidden_states, temb) return hidden_states @@ -1979,12 +1982,13 @@ def __init__( dropout: float = 0.0, num_layers: int = 1, resnet_eps: float = 1e-6, - resnet_time_scale_shift: str = "default", + resnet_time_scale_shift: str = "default", # default, spatial resnet_act_fn: str = "swish", resnet_groups: int = 32, resnet_pre_norm: bool = True, output_scale_factor=1.0, add_upsample=True, + temb_channels=None, ): super().__init__() resnets = [] @@ -1996,7 +2000,7 @@ def __init__( ResnetBlock2D( in_channels=input_channels, out_channels=out_channels, - temb_channels=None, + temb_channels=temb_channels, eps=resnet_eps, groups=resnet_groups, dropout=dropout, @@ -2014,9 +2018,9 @@ def __init__( else: self.upsamplers = None - def forward(self, hidden_states): + def forward(self, hidden_states, temb=None): for resnet in self.resnets: - hidden_states = resnet(hidden_states, temb=None) + hidden_states = resnet(hidden_states, temb=temb) if self.upsamplers is not None: for upsampler in self.upsamplers: @@ -2040,6 +2044,7 @@ def __init__( attn_num_head_channels=1, output_scale_factor=1.0, add_upsample=True, + temb_channels=None, ): super().__init__() resnets = [] @@ -2052,7 +2057,7 @@ def __init__( ResnetBlock2D( in_channels=input_channels, out_channels=out_channels, - temb_channels=None, + temb_channels=temb_channels, eps=resnet_eps, groups=resnet_groups, dropout=dropout, @@ -2069,7 +2074,8 @@ def __init__( dim_head=attn_num_head_channels if attn_num_head_channels is not None else out_channels, rescale_output_factor=output_scale_factor, eps=resnet_eps, - norm_num_groups=resnet_groups, + norm_num_groups=resnet_groups if resnet_time_scale_shift == "default" else None, + spatial_norm_dim=temb_channels if resnet_time_scale_shift == "spatial" else None, residual_connection=True, bias=True, upcast_softmax=True, @@ -2085,10 +2091,10 @@ def __init__( else: self.upsamplers = None - def forward(self, hidden_states): + def forward(self, hidden_states, temb=None): for resnet, attn in zip(self.resnets, self.attentions): - hidden_states = resnet(hidden_states, temb=None) - hidden_states = attn(hidden_states) + hidden_states = resnet(hidden_states, temb=temb) + hidden_states = attn(hidden_states, vq_emb=temb) if self.upsamplers is not None: for upsampler in self.upsamplers: diff --git a/src/diffusers/models/vae.py b/src/diffusers/models/vae.py index 776203042e9b..869ffa2a9be4 100644 --- a/src/diffusers/models/vae.py +++ b/src/diffusers/models/vae.py @@ -19,8 +19,9 @@ import torch.nn as nn from ..utils import BaseOutput, randn_tensor +from .attention_processor import SpatialNorm from .unet_2d_blocks import UNetMidBlock2D, get_down_block, get_up_block -from .attention import SpatialNorm + @dataclass class DecoderOutput(BaseOutput): @@ -149,7 +150,7 @@ def __init__( layers_per_block=2, norm_num_groups=32, act_fn="silu", - norm_type="default", # default, spatial + norm_type="default", # default, spatial ): super().__init__() self.layers_per_block = layers_per_block @@ -165,7 +166,6 @@ def __init__( 
self.mid_block = None self.up_blocks = nn.ModuleList([]) - temb_channels = in_channels if norm_type == "spatial" else None # mid @@ -230,12 +230,12 @@ def custom_forward(*inputs): return custom_forward # middle - sample = torch.utils.checkpoint.checkpoint(create_custom_forward(self.mid_block), sample) + sample = torch.utils.checkpoint.checkpoint(create_custom_forward(self.mid_block), sample, zq) sample = sample.to(upscale_dtype) # up for up_block in self.up_blocks: - sample = torch.utils.checkpoint.checkpoint(create_custom_forward(up_block), sample) + sample = torch.utils.checkpoint.checkpoint(create_custom_forward(up_block), sample, zq) else: # middle sample = self.mid_block(sample, zq) diff --git a/src/diffusers/models/vq_model.py b/src/diffusers/models/vq_model.py index 040447ba82c8..ce5439925318 100644 --- a/src/diffusers/models/vq_model.py +++ b/src/diffusers/models/vq_model.py @@ -82,11 +82,10 @@ def __init__( norm_num_groups: int = 32, vq_embed_dim: Optional[int] = None, scaling_factor: float = 0.18215, - norm_type: str = "default" + norm_type: str = "default", # default, spatial ): super().__init__() - # pass init params to Encoder self.encoder = Encoder( in_channels=in_channels, @@ -134,7 +133,7 @@ def decode( quant, emb_loss, info = self.quantize(h) else: quant = h - quant2 = self.post_quant_conv(quant) + quant2 = self.post_quant_conv(quant) dec = self.decoder(quant2, quant if self.config.norm_type == "spatial" else None) if not return_dict: diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index eaea3a89c6ce..ba906e22300d 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -52,7 +52,7 @@ IFPipeline, IFSuperResolutionPipeline, ) - from .kandinsky import KandinskyPipeline, KandinskyPriorPipeline, KandinskyInpaintPipeline + from .kandinsky import KandinskyInpaintPipeline, KandinskyPipeline, KandinskyPriorPipeline from .latent_diffusion import LDMTextToImagePipeline from .paint_by_example import PaintByExamplePipeline from .semantic_stable_diffusion import SemanticStableDiffusionPipeline diff --git a/src/diffusers/pipelines/kandinsky/__init__.py b/src/diffusers/pipelines/kandinsky/__init__.py index 93731dbdbf3d..0242d9ae5edf 100644 --- a/src/diffusers/pipelines/kandinsky/__init__.py +++ b/src/diffusers/pipelines/kandinsky/__init__.py @@ -13,7 +13,7 @@ from ...utils.dummy_torch_and_transformers_objects import KandinskyPipeline, KandinskyPriorPipeline else: from .pipeline_kandinsky import KandinskyPipeline - from .pipeline_kandinsky_prior import KandinskyPriorPipeline from .pipeline_kandinsky_inpaint import KandinskyInpaintPipeline + from .pipeline_kandinsky_prior import KandinskyPriorPipeline from .text_encoder import MultilingualCLIP from .text_proj import KandinskyTextProjModel diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py index d988f38506ea..fd11f24f85b0 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py @@ -329,11 +329,10 @@ def __call__( self.scheduler, ) - for i, t in enumerate(self.progress_bar(timesteps_tensor)): # expand the latents if we are doing classifier free guidance latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents - + noise_pred = self.unet( sample=latent_model_input, # [2, 4, 96, 96] timestep=t, diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py 
b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py index b41d087ee184..50530dbcbd6c 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py @@ -12,15 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. +from copy import deepcopy from typing import List, Optional, Union -import PIL -from PIL import Image - import numpy as np +import PIL import torch import torch.nn.functional as F - +from PIL import Image from transformers import ( XLMRobertaTokenizerFast, ) @@ -37,11 +36,11 @@ ) from .text_encoder import MultilingualCLIP from .text_proj import KandinskyTextProjModel -from copy import deepcopy logger = logging.get_logger(__name__) # pylint: disable=invalid-name + def get_new_h_w(h, w): new_h = h // 64 if h % 64 != 0: @@ -51,6 +50,7 @@ def get_new_h_w(h, w): new_w += 1 return new_h * 8, new_w * 8 + def prepare_mask(mask): mask = mask.float()[0] old_mask = deepcopy(mask) @@ -105,7 +105,7 @@ class KandinskyInpaintPipeline(DiffusionPipeline): def __init__( self, text_encoder: MultilingualCLIP, - #image_encoder: MOVQ # TO_DO add this later + # image_encoder: MOVQ # TO_DO add this later tokenizer: XLMRobertaTokenizerFast, text_proj: KandinskyTextProjModel, unet: UNet2DConditionModel, @@ -344,20 +344,20 @@ def __call__( prompt_embeds, text_encoder_hidden_states, _ = self._encode_prompt( prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt ) - + image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to(device) - + text_encoder_hidden_states, additive_clip_time_embeddings = self.text_proj( image_embeddings=image_embeds, prompt_embeds=prompt_embeds, text_encoder_hidden_states=text_encoder_hidden_states, ) - # preprocess image and mask + # preprocess image and mask ## Encode the image image = prepare_image(image, width, height).to(device) image = self.image_encoder.encode(image) - + ## prepared mask mask_image = torch.from_numpy(mask_image).unsqueeze(0).unsqueeze(0) image_shape = tuple(image.shape[-2:]) @@ -367,7 +367,7 @@ def __call__( mode="nearest", ) mask_image = prepare_mask(mask_image).to(device) - + ## apply mask on image masked_image = image * mask_image @@ -380,11 +380,11 @@ def __call__( # YiYi's TO-DO: hard-code to be 4, need to set it to be the z_channels in MoVQ encoder's config once it's added num_channels_latents = 4 - #num_channels_latents = self.image_encoder.config.z_channels - + # num_channels_latents = self.image_encoder.config.z_channels + # get h, w for latents sample_height, sample_width = get_new_h_w(height, width) - + # create initial latent latents = self.prepare_latents( (batch_size, num_channels_latents, sample_height, sample_width), diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py index 8ef7b369d096..29c978455af1 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py @@ -183,9 +183,7 @@ def _encode_prompt( untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids): - removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] - ) + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, 
self.tokenizer.model_max_length - 1 : -1]) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" f" {self.tokenizer.model_max_length} tokens: {removed_text}" From 32e114407b3c9a748a00d4f72cd2b51cbfb4c949 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 13 May 2023 09:37:59 +0000 Subject: [PATCH 041/182] add movq module --- src/diffusers/models/attention_processor.py | 3 +-- .../pipelines/kandinsky/pipeline_kandinsky.py | 24 +++++++++++++++++-- .../kandinsky/pipeline_kandinsky_inpaint.py | 13 ++++++---- 3 files changed, 31 insertions(+), 9 deletions(-) diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py index 050ecfac19a2..06e4a193bba7 100644 --- a/src/diffusers/models/attention_processor.py +++ b/src/diffusers/models/attention_processor.py @@ -299,7 +299,7 @@ def set_processor(self, processor: "AttnProcessor"): self.processor = processor def forward( - self, hidden_states, encoder_hidden_states=None, attention_mask=None, vq_emb=None, **cross_attention_kwargs + self, hidden_states, encoder_hidden_states=None, attention_mask=None, **cross_attention_kwargs ): # The `Attention` class can call different attention processors / attention functions # here we simply pass along all tensors to the selected processor class @@ -309,7 +309,6 @@ def forward( hidden_states, encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask, - vq_emb=vq_emb, **cross_attention_kwargs, ) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py index fd11f24f85b0..9ce71637713a 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py @@ -19,8 +19,9 @@ XLMRobertaTokenizerFast, ) -from ...models import UNet2DConditionModel +from ...models import UNet2DConditionModel, VQModel from ...pipelines import DiffusionPipeline +from ...pipelines.pipeline_utils import ImagePipelineOutput from ...schedulers import UnCLIPScheduler from ...utils import ( is_accelerate_available, @@ -63,6 +64,8 @@ class KandinskyPipeline(DiffusionPipeline): Conditional U-Net architecture to denoise the image embedding. text_proj ([`KandinskyTextProjModel`]): Utility class to prepare and combine the embeddings before they are passed to the decoder. + movq ([`VQModel`]): + MoVQ Decoder to generate the image from the latents. 
""" def __init__( @@ -72,6 +75,7 @@ def __init__( text_proj: KandinskyTextProjModel, unet: UNet2DConditionModel, scheduler: UnCLIPScheduler, + movq: VQModel ): super().__init__() @@ -81,6 +85,7 @@ def __init__( text_proj=text_proj, unet=unet, scheduler=scheduler, + movq=movq, ) def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): @@ -370,4 +375,19 @@ def __call__( _, latents = latents.chunk(2) - return latents + + # post-processing + image = self.movq.decode(latents,force_not_quantize=True)["sample"] + + image = image * 0.5 + 0.5 + image = image.clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + if output_type == "pil": + image = self.numpy_to_pil(image) + + if not return_dict: + return (image,) + + return ImagePipelineOutput(images=image) + diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py index 50530dbcbd6c..7faf2f9d7f10 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py @@ -24,7 +24,7 @@ XLMRobertaTokenizerFast, ) -from ...models import UNet2DConditionModel +from ...models import UNet2DConditionModel, VQModel from ...pipelines import DiffusionPipeline from ...pipelines.pipeline_utils import ImagePipelineOutput from ...schedulers import UnCLIPScheduler @@ -100,12 +100,14 @@ class KandinskyInpaintPipeline(DiffusionPipeline): Conditional U-Net architecture to denoise the image embedding. text_proj ([`KandinskyTextProjModel`]): Utility class to prepare and combine the embeddings before they are passed to the decoder. + movq ([`VQModel`]): + MoVQ image encoder and decoder """ def __init__( self, text_encoder: MultilingualCLIP, - # image_encoder: MOVQ # TO_DO add this later + movq: VQModel, tokenizer: XLMRobertaTokenizerFast, text_proj: KandinskyTextProjModel, unet: UNet2DConditionModel, @@ -115,6 +117,7 @@ def __init__( self.register_modules( text_encoder=text_encoder, + movq=movq, tokenizer=tokenizer, text_proj=text_proj, unet=unet, @@ -356,7 +359,7 @@ def __call__( # preprocess image and mask ## Encode the image image = prepare_image(image, width, height).to(device) - image = self.image_encoder.encode(image) + image = self.movq.encode(image)["latents"] ## prepared mask mask_image = torch.from_numpy(mask_image).unsqueeze(0).unsqueeze(0) @@ -380,7 +383,7 @@ def __call__( # YiYi's TO-DO: hard-code to be 4, need to set it to be the z_channels in MoVQ encoder's config once it's added num_channels_latents = 4 - # num_channels_latents = self.image_encoder.config.z_channels + # num_channels_latents = self.movq.config.z_channels # get h, w for latents sample_height, sample_width = get_new_h_w(height, width) @@ -450,7 +453,7 @@ def __call__( _, latents = latents.chunk(2) # post-processing - image = self.image_encoder.decode(latents) + image = self.movq.decode(latents,force_not_quantize=True)["sample"] image = image * 0.5 + 0.5 image = image.clamp(0, 1) From 0ba8a62dd369845a7219c19d7278a5364d0e4077 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 13 May 2023 09:39:04 +0000 Subject: [PATCH 042/182] make style --- scripts/convert_kandinsky_to_diffusers.py | 24 +++++++++++-------- src/diffusers/models/attention.py | 2 +- src/diffusers/models/attention_processor.py | 6 ++--- src/diffusers/models/vq_model.py | 2 +- .../pipelines/kandinsky/pipeline_kandinsky.py | 6 ++--- .../kandinsky/pipeline_kandinsky_inpaint.py | 2 +- .../dummy_torch_and_transformers_objects.py | 15 
++++++++++++ 7 files changed, 36 insertions(+), 21 deletions(-) diff --git a/scripts/convert_kandinsky_to_diffusers.py b/scripts/convert_kandinsky_to_diffusers.py index 37613369ca92..de9879f7f03b 100644 --- a/scripts/convert_kandinsky_to_diffusers.py +++ b/scripts/convert_kandinsky_to_diffusers.py @@ -1217,9 +1217,7 @@ def movq_attention_to_diffusers_checkpoint(checkpoint, *, diffusers_attention_pr f"{diffusers_attention_prefix}.to_v.weight": checkpoint[f"{attention_prefix}.v.weight"][:, :, 0, 0], f"{diffusers_attention_prefix}.to_v.bias": checkpoint[f"{attention_prefix}.v.bias"], # proj_attn - f"{diffusers_attention_prefix}.to_out.0.weight": checkpoint[f"{attention_prefix}.proj_out.weight"][ - :, :, 0, 0 - ], + f"{diffusers_attention_prefix}.to_out.0.weight": checkpoint[f"{attention_prefix}.proj_out.weight"][:, :, 0, 0], f"{diffusers_attention_prefix}.to_out.0.bias": checkpoint[f"{attention_prefix}.proj_out.bias"], } @@ -1227,11 +1225,19 @@ def movq_attention_to_diffusers_checkpoint(checkpoint, *, diffusers_attention_pr def movq_attention_to_diffusers_checkpoint_spatial_norm(checkpoint, *, diffusers_attention_prefix, attention_prefix): return { # norm - f"{diffusers_attention_prefix}.spatial_norm.norm_layer.weight": checkpoint[f"{attention_prefix}.norm.norm_layer.weight"], - f"{diffusers_attention_prefix}.spatial_norm.norm_layer.bias": checkpoint[f"{attention_prefix}.norm.norm_layer.bias"], - f"{diffusers_attention_prefix}.spatial_norm.conv_y.weight": checkpoint[f"{attention_prefix}.norm.conv_y.weight"], + f"{diffusers_attention_prefix}.spatial_norm.norm_layer.weight": checkpoint[ + f"{attention_prefix}.norm.norm_layer.weight" + ], + f"{diffusers_attention_prefix}.spatial_norm.norm_layer.bias": checkpoint[ + f"{attention_prefix}.norm.norm_layer.bias" + ], + f"{diffusers_attention_prefix}.spatial_norm.conv_y.weight": checkpoint[ + f"{attention_prefix}.norm.conv_y.weight" + ], f"{diffusers_attention_prefix}.spatial_norm.conv_y.bias": checkpoint[f"{attention_prefix}.norm.conv_y.bias"], - f"{diffusers_attention_prefix}.spatial_norm.conv_b.weight": checkpoint[f"{attention_prefix}.norm.conv_b.weight"], + f"{diffusers_attention_prefix}.spatial_norm.conv_b.weight": checkpoint[ + f"{attention_prefix}.norm.conv_b.weight" + ], f"{diffusers_attention_prefix}.spatial_norm.conv_b.bias": checkpoint[f"{attention_prefix}.norm.conv_b.bias"], # query f"{diffusers_attention_prefix}.to_q.weight": checkpoint[f"{attention_prefix}.q.weight"][:, :, 0, 0], @@ -1243,9 +1249,7 @@ def movq_attention_to_diffusers_checkpoint_spatial_norm(checkpoint, *, diffusers f"{diffusers_attention_prefix}.to_v.weight": checkpoint[f"{attention_prefix}.v.weight"][:, :, 0, 0], f"{diffusers_attention_prefix}.to_v.bias": checkpoint[f"{attention_prefix}.v.bias"], # proj_attn - f"{diffusers_attention_prefix}.to_out.0.weight": checkpoint[f"{attention_prefix}.proj_out.weight"][ - :, :, 0, 0 - ], + f"{diffusers_attention_prefix}.to_out.0.weight": checkpoint[f"{attention_prefix}.proj_out.weight"][:, :, 0, 0], f"{diffusers_attention_prefix}.to_out.0.bias": checkpoint[f"{attention_prefix}.proj_out.bias"], } diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py index d83d1c7ecbd1..0b313b83d360 100644 --- a/src/diffusers/models/attention.py +++ b/src/diffusers/models/attention.py @@ -368,4 +368,4 @@ def forward(self, x, emb): x = F.group_norm(x, self.num_groups, eps=self.eps) x = x * (1 + scale) + shift - return x \ No newline at end of file + return x diff --git a/src/diffusers/models/attention_processor.py 
b/src/diffusers/models/attention_processor.py index 06e4a193bba7..bf2f9cea91d4 100644 --- a/src/diffusers/models/attention_processor.py +++ b/src/diffusers/models/attention_processor.py @@ -298,9 +298,7 @@ def set_processor(self, processor: "AttnProcessor"): self.processor = processor - def forward( - self, hidden_states, encoder_hidden_states=None, attention_mask=None, **cross_attention_kwargs - ): + def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None, **cross_attention_kwargs): # The `Attention` class can call different attention processors / attention functions # here we simply pass along all tensors to the selected processor class # For standard processors that are defined here, `**cross_attention_kwargs` is empty @@ -1275,4 +1273,4 @@ def forward(self, f, zq): zq = F.interpolate(zq, size=f_size, mode="nearest") norm_f = self.norm_layer(f) new_f = norm_f * self.conv_y(zq) + self.conv_b(zq) - return new_f \ No newline at end of file + return new_f diff --git a/src/diffusers/models/vq_model.py b/src/diffusers/models/vq_model.py index ce5439925318..53a3ecc2b2d2 100644 --- a/src/diffusers/models/vq_model.py +++ b/src/diffusers/models/vq_model.py @@ -82,7 +82,7 @@ def __init__( norm_num_groups: int = 32, vq_embed_dim: Optional[int] = None, scaling_factor: float = 0.18215, - norm_type: str = "default", # default, spatial + norm_type: str = "default", # default, spatial ): super().__init__() diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py index 9ce71637713a..7392d36b2e15 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py @@ -75,7 +75,7 @@ def __init__( text_proj: KandinskyTextProjModel, unet: UNet2DConditionModel, scheduler: UnCLIPScheduler, - movq: VQModel + movq: VQModel, ): super().__init__() @@ -375,9 +375,8 @@ def __call__( _, latents = latents.chunk(2) - # post-processing - image = self.movq.decode(latents,force_not_quantize=True)["sample"] + image = self.movq.decode(latents, force_not_quantize=True)["sample"] image = image * 0.5 + 0.5 image = image.clamp(0, 1) @@ -390,4 +389,3 @@ def __call__( return (image,) return ImagePipelineOutput(images=image) - diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py index 7faf2f9d7f10..a9ab4a67e904 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py @@ -453,7 +453,7 @@ def __call__( _, latents = latents.chunk(2) # post-processing - image = self.movq.decode(latents,force_not_quantize=True)["sample"] + image = self.movq.decode(latents, force_not_quantize=True)["sample"] image = image * 0.5 + 0.5 image = image.clamp(0, 1) diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py index 55d057ffc6c0..cdd61dcf2ac5 100644 --- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py +++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -152,6 +152,21 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "transformers"]) +class KandinskyInpaintPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + 
requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + class KandinskyPipeline(metaclass=DummyObject): _backends = ["torch", "transformers"] From dd218a7df8553048be7682059fb085fba361c3bc Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 13 May 2023 19:42:53 +0000 Subject: [PATCH 043/182] add vq_emb and spatial_norm step bo attnprocessor2.0 --- src/diffusers/models/attention_processor.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py index bf2f9cea91d4..a490bd2bfd8b 100644 --- a/src/diffusers/models/attention_processor.py +++ b/src/diffusers/models/attention_processor.py @@ -879,9 +879,12 @@ def __init__(self): if not hasattr(F, "scaled_dot_product_attention"): raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.") - def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None): + def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None, vq_emb=None,): residual = hidden_states + if attn.spatial_norm is not None: + hidden_states = attn.spatial_norm(hidden_states, vq_emb) + input_ndim = hidden_states.ndim if input_ndim == 4: From 3adde2c0301a3ab4f9dcd0a1157d5cacfdadbb50 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 15 May 2023 16:22:24 +0000 Subject: [PATCH 044/182] image_embeds and negative_image_embeds arguments are required --- src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py index 7392d36b2e15..d31b52a5bdc7 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py @@ -279,6 +279,8 @@ def _execution_device(self): def __call__( self, prompt: Union[str, List[str]], + image_embeds: torch.FloatTensor, + negative_image_embeds: torch.FloatTensor, height: int = 512, width: int = 512, num_inference_steps: int = 100, @@ -287,8 +289,6 @@ def __call__( negative_prompt: Optional[Union[str, List[str]]] = None, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, latents: Optional[torch.FloatTensor] = None, - image_embeds: Optional[torch.FloatTensor] = None, - negative_image_embeds: Optional[torch.FloatTensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, ): From 3699be5db0b2b11560bf21212382287320a38fd8 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 15 May 2023 16:24:04 +0000 Subject: [PATCH 045/182] add tests --- tests/pipelines/kandinsky/__init__.py | 0 tests/pipelines/kandinsky/test_kandinsky.py | 250 ++++++++++++++++++++ 2 files changed, 250 insertions(+) create mode 100644 tests/pipelines/kandinsky/__init__.py create mode 100644 tests/pipelines/kandinsky/test_kandinsky.py diff --git a/tests/pipelines/kandinsky/__init__.py b/tests/pipelines/kandinsky/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/pipelines/kandinsky/test_kandinsky.py b/tests/pipelines/kandinsky/test_kandinsky.py new file mode 100644 index 000000000000..aa7daab32598 --- /dev/null +++ b/tests/pipelines/kandinsky/test_kandinsky.py @@ -0,0 +1,250 @@ +# coding=utf-8 +# Copyright 2023 HuggingFace Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import gc +import random +import unittest + +import numpy as np +import torch + +from transformers import XLMRobertaTokenizerFast, PretrainedConfig + +from diffusers import PriorTransformer, KandinskyPipeline, UnCLIPScheduler, UNet2DConditionModel, VQModel +from diffusers.pipelines.kandinsky.text_proj import KandinskyTextProjModel +from diffusers.pipelines.kandinsky.text_encoder import MultilingualCLIP +from diffusers.utils import floats_tensor, load_numpy, nightly, slow, torch_device +from diffusers.utils.testing_utils import require_torch_gpu, skip_mps + +from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS +from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference + + +torch.backends.cuda.matmul.allow_tf32 = False +torch.use_deterministic_algorithms(True) + + +class KandinskyPipelineFastTests(PipelineTesterMixin, unittest.TestCase): + pipeline_class = KandinskyPipeline + params = [ + "prompt", + "image_embeds", + "negative_image_embeds", + ] + batch_params = TEXT_TO_IMAGE_BATCH_PARAMS + required_optional_params = [ + "generator", + "height", + "width", + "latents", + "guidance_scale", + "negative_prompt", + "num_inference_steps", + "return_dict", + "guidance_scale", + "num_images_per_prompt", + "output_type", + "return_dict" + ] + test_xformers_attention = False + + @property + def text_embedder_hidden_size(self): + return 32 + + @property + def time_input_dim(self): + return 32 + + @property + def block_out_channels_0(self): + return self.time_input_dim + + @property + def time_embed_dim(self): + return self.time_input_dim * 4 + + @property + def cross_attention_dim(self): + return 100 + + # YiYi's TO-DO: add a tiny tokenizer? 
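With `image_embeds` and `negative_image_embeds` now required arguments of `KandinskyPipeline.__call__` (PATCH 044), callers are expected to provide the image embeddings up front, typically taken from a prior model. A rough usage sketch; the checkpoint id and embedding width are hypothetical stand-ins, and only the argument names and the `ImagePipelineOutput` return come from this series:

    import torch
    from diffusers import KandinskyPipeline

    pipe = KandinskyPipeline.from_pretrained("YiYiXu/Kandinsky")  # hypothetical repo id
    # pipe.enable_sequential_cpu_offload()  # optional, needs `pip install accelerate`

    emb_dim = 768  # assumed width; the real value depends on the CLIP image encoder
    image_embeds = torch.randn(1, emb_dim)          # stand-in for a prior's output
    negative_image_embeds = torch.zeros(1, emb_dim)

    image = pipe(
        "a portrait of a horse",
        image_embeds=image_embeds,
        negative_image_embeds=negative_image_embeds,
        height=512,
        width=512,
        num_inference_steps=50,
    ).images[0]
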
+ @property + def dummy_tokenizer(self): + tokenizer = XLMRobertaTokenizerFast.from_pretrained("YiYiXu/Kandinsky", subfolder="tokenizer") + return tokenizer + + @property + def dummy_text_encoder(self): + torch.manual_seed(0) + config = PretrainedConfig( + modelBase="YiYiXu/tiny-random-mclip-base", + numDims=100, + transformerDimensions=32) + + return MultilingualCLIP(config) + + @property + def dummy_text_proj(self): + torch.manual_seed(0) + + model_kwargs = { + "clip_embeddings_dim": self.cross_attention_dim, + "time_embed_dim": self.time_embed_dim, + "clip_extra_context_tokens":2, + "cross_attention_dim": self.cross_attention_dim, + "clip_text_encoder_hidden_states_dim": self.text_embedder_hidden_size, + } + + model = KandinskyTextProjModel(**model_kwargs) + return model + + @property + def dummy_unet(self): + torch.manual_seed(0) + + model_kwargs = { + "in_channels": 4, + # Out channels is double in channels because predicts mean and variance + "out_channels": 8, + "down_block_types": ("ResnetDownsampleBlock2D", "SimpleCrossAttnDownBlock2D"), + "up_block_types": ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"), + "mid_block_type": "UNetMidBlock2DSimpleCrossAttn", + "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2), + "layers_per_block": 1, + "cross_attention_dim": self.cross_attention_dim, + "attention_head_dim": 4, + "resnet_time_scale_shift": "scale_shift", + "class_embed_type": "identity", + } + + model = UNet2DConditionModel(**model_kwargs) + return model + + @property + def dummy_movq_kwargs(self): + return { + "block_out_channels": [32,64], + "down_block_types": ["DownEncoderBlock2D","AttnDownEncoderBlock2D"], + "in_channels": 3, + "latent_channels": 4, + "layers_per_block": 1, + "norm_num_groups": 8, + "norm_type": "spatial", + "num_vq_embeddings": 12, + "out_channels": 3, + "up_block_types": ["AttnUpDecoderBlock2D","UpDecoderBlock2D",], + "vq_embed_dim": 4 + } + + @property + def dummy_movq(self): + # seeded differently to get different unet than `self.dummy_super_res_first` + torch.manual_seed(1) + + model = VQModel(**self.dummy_movq_kwargs) + return model + + def get_dummy_components(self): + text_encoder = self.dummy_text_encoder + tokenizer = self.dummy_tokenizer + unet = self.dummy_unet + text_proj = self.dummy_text_proj + movq = self.dummy_movq + + scheduler = UnCLIPScheduler( + clip_sample = True, + clip_sample_range = 2.0, + sample_min_value=1.0, + sample_max_value= None, + num_train_timesteps= 1000, + prediction_type="epsilon", + variance_type= "learned_range", + thresholding= True, + beta_schedule= "linear", + beta_start= 0.00085, + beta_end=0.012 + ) + + components = { + "text_proj": text_proj, + "text_encoder": text_encoder, + "tokenizer": tokenizer, + "unet": unet, + "scheduler": scheduler, + "movq": movq + } + + return components + + def get_dummy_inputs(self, device, seed=0): + image_embeds = floats_tensor((1, self.cross_attention_dim)).to(device) + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) + inputs = { + "prompt": "horse", + "image_embeds": image_embeds, + "negative_image_embeds":image_embeds, + "generator": generator, + "num_inference_steps": 2, + "output_type": "np", + } + return inputs + + def test_kandinsky(self): + device = "cpu" + + components = self.get_dummy_components() + + pipe = self.pipeline_class(**components) + pipe = pipe.to(device) + + pipe.set_progress_bar_config(disable=None) + + output = 
pipe(**self.get_dummy_inputs(device)) + image = output.images + + image_from_tuple = pipe( + **self.get_dummy_inputs(device), + return_dict=False, + )[0] + + image_slice = image[0, -3:, -3:, -1] + image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] + + print(f"image.shape {image.shape}") + + assert image.shape == (1, 64, 64, 3) + + expected_slice = np.array( + [ + 0.9997, + 0.9988, + 0.0028, + 0.9997, + 0.9984, + 0.9965, + 0.0029, + 0.9986, + 0.0025, + ] + ) + print(image_slice.flatten()) + print(image_from_tuple_slice.flatten()) + + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 From 913ffdeea47eb5df4a2c9b201364570645fdc85c Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 15 May 2023 18:51:00 +0000 Subject: [PATCH 046/182] fix --- tests/pipelines/kandinsky/test_kandinsky.py | 39 +++++++++++---------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/tests/pipelines/kandinsky/test_kandinsky.py b/tests/pipelines/kandinsky/test_kandinsky.py index aa7daab32598..8d5433ea26b3 100644 --- a/tests/pipelines/kandinsky/test_kandinsky.py +++ b/tests/pipelines/kandinsky/test_kandinsky.py @@ -62,7 +62,7 @@ class KandinskyPipelineFastTests(PipelineTesterMixin, unittest.TestCase): @property def text_embedder_hidden_size(self): - return 32 + return 1024 @property def time_input_dim(self): @@ -86,13 +86,23 @@ def dummy_tokenizer(self): tokenizer = XLMRobertaTokenizerFast.from_pretrained("YiYiXu/Kandinsky", subfolder="tokenizer") return tokenizer + # @property + # def dummy_text_encoder(self): + # torch.manual_seed(0) + # config = PretrainedConfig( + # modelBase="YiYiXu/tiny-random-mclip-base", + # numDims=100, + # transformerDimensions=32) + + # return MultilingualCLIP(config) + @property def dummy_text_encoder(self): torch.manual_seed(0) config = PretrainedConfig( - modelBase="YiYiXu/tiny-random-mclip-base", - numDims=100, - transformerDimensions=32) + modelBase="xlm-roberta-large", + numDims=self.cross_attention_dim, + transformerDimensions=1024) return MultilingualCLIP(config) @@ -151,9 +161,7 @@ def dummy_movq_kwargs(self): @property def dummy_movq(self): - # seeded differently to get different unet than `self.dummy_super_res_first` - torch.manual_seed(1) - + torch.manual_seed(0) model = VQModel(**self.dummy_movq_kwargs) return model @@ -190,7 +198,8 @@ def get_dummy_components(self): return components def get_dummy_inputs(self, device, seed=0): - image_embeds = floats_tensor((1, self.cross_attention_dim)).to(device) + image_embeds = floats_tensor((1, self.cross_attention_dim), rng=random.Random(seed)).to(device) + negative_image_embeds = floats_tensor((1, self.cross_attention_dim), rng=random.Random(seed+1)).to(device) if str(device).startswith("mps"): generator = torch.manual_seed(seed) else: @@ -200,6 +209,8 @@ def get_dummy_inputs(self, device, seed=0): "image_embeds": image_embeds, "negative_image_embeds":image_embeds, "generator": generator, + "height": 256, + "width":256, "num_inference_steps": 2, "output_type": "np", } @@ -231,17 +242,7 @@ def test_kandinsky(self): assert image.shape == (1, 64, 64, 3) expected_slice = np.array( - [ - 0.9997, - 0.9988, - 0.0028, - 0.9997, - 0.9984, - 0.9965, - 0.0029, - 0.9986, - 0.0025, - ] + [0.5208529, 0.4821977, 0.44796965, 0.5479469, 0.54242486, 0.45028442, 0.42460358, 0.46456948, 0.48675597] ) print(image_slice.flatten()) print(image_from_tuple_slice.flatten()) From a27267f85595ee7157f4054600733221360064bf Mon Sep 17 00:00:00 2001 From: 
yiyixuxu Date: Mon, 15 May 2023 20:32:13 +0000 Subject: [PATCH 047/182] fix image_embeds dtype --- src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py index d31b52a5bdc7..f630fd0bb574 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py @@ -309,7 +309,7 @@ def __call__( prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt ) - image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to(device) + image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to(dtype=prompt_embeds.dtype, device=device) text_encoder_hidden_states, additive_clip_time_embeddings = self.text_proj( image_embeddings=image_embeds, From 333697a5b4f13f2ad33e1720f524c45a37e29d6c Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 15 May 2023 21:19:34 +0000 Subject: [PATCH 048/182] fix cpu offload --- .../pipelines/kandinsky/pipeline_kandinsky.py | 39 +++++++++---------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py index f630fd0bb574..0f2d993de77e 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py @@ -119,9 +119,7 @@ def _encode_prompt( return_tensors="pt", ) - text_input_ids = text_inputs.input_ids.to(device) - text_mask = text_inputs.attention_mask.to(device) - + text_input_ids = text_inputs.input_ids untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids): @@ -130,7 +128,9 @@ def _encode_prompt( "The following part of your input was truncated because CLIP can only handle sequences up to" f" {self.tokenizer.model_max_length} tokens: {removed_text}" ) - text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length] + + text_input_ids = text_input_ids.to(device) + text_mask = text_inputs.attention_mask.to(device) prompt_embeds, text_encoder_hidden_states = self.text_encoder( input_ids=text_input_ids, attention_mask=text_mask @@ -201,31 +201,28 @@ def _encode_prompt( return prompt_embeds, text_encoder_hidden_states, text_mask - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_sequential_cpu_offload def enable_sequential_cpu_offload(self, gpu_id=0): r""" - Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet, - text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a - `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called. - Note that offloading happens on a submodule basis. Memory savings are higher than with - `enable_model_cpu_offload`, but performance is lower. + Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the pipeline's + models have their state dicts saved to CPU and then are moved to a `torch.device('meta') and loaded to GPU only + when their specific submodule has its `forward` method called. 
""" - if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"): + if is_accelerate_available(): from accelerate import cpu_offload else: - raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher") + raise ImportError("Please install accelerate via `pip install accelerate`") device = torch.device(f"cuda:{gpu_id}") - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]: - cpu_offload(cpu_offloaded_model, device) - - if self.safety_checker is not None: - cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True) + models = [ + self.unet, + self.text_proj, + self.text_encoder, + self.movq, + ] + for cpu_offloaded_model in models: + if cpu_offloaded_model is not None: + cpu_offload(cpu_offloaded_model, device) # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_model_cpu_offload def enable_model_cpu_offload(self, gpu_id=0): From 1080a131c63aa7d2f818d54a1b2b4ffabac4dcf7 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 15 May 2023 22:35:26 +0000 Subject: [PATCH 049/182] fix batch consistent --- .../pipelines/kandinsky/pipeline_kandinsky.py | 12 ++++++++++-- tests/pipelines/kandinsky/test_kandinsky.py | 7 ++++++- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py index 0f2d993de77e..300b3c189d62 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py @@ -299,12 +299,20 @@ def __call__( device = self._execution_device batch_size = batch_size * num_images_per_prompt - do_classifier_free_guidance = guidance_scale > 1.0 prompt_embeds, text_encoder_hidden_states, _ = self._encode_prompt( prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt ) + + if isinstance(image_embeds, list): + image_embeds = torch.cat(image_embeds, dim=0) + if isinstance(negative_image_embeds, list): + negative_image_embeds = torch.cat(negative_image_embeds, dim=0) + + if do_classifier_free_guidance: + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + negative_image_embeds = negative_image_embeds.repeat_interleave(num_images_per_prompt, dim=0) image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to(dtype=prompt_embeds.dtype, device=device) @@ -364,7 +372,7 @@ def __call__( latents = self.scheduler.step( noise_pred, t, - latents, + latent_model_input, prev_timestep=prev_timestep, generator=generator, batch_size=batch_size, diff --git a/tests/pipelines/kandinsky/test_kandinsky.py b/tests/pipelines/kandinsky/test_kandinsky.py index 8d5433ea26b3..d5c307c8087f 100644 --- a/tests/pipelines/kandinsky/test_kandinsky.py +++ b/tests/pipelines/kandinsky/test_kandinsky.py @@ -43,7 +43,12 @@ class KandinskyPipelineFastTests(PipelineTesterMixin, unittest.TestCase): "image_embeds", "negative_image_embeds", ] - batch_params = TEXT_TO_IMAGE_BATCH_PARAMS + batch_params = [ + "prompt", + "negative_prompt", + "image_embeds", + "negative_image_embeds" + ] required_optional_params = [ "generator", "height", From 6aec2d086741b9495cb08924c38c6c1a20a982cd Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 15 May 2023 22:37:01 +0000 Subject: [PATCH 050/182] remove print 
line --- tests/pipelines/kandinsky/test_kandinsky.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/pipelines/kandinsky/test_kandinsky.py b/tests/pipelines/kandinsky/test_kandinsky.py index d5c307c8087f..4a6f0e1ab5b9 100644 --- a/tests/pipelines/kandinsky/test_kandinsky.py +++ b/tests/pipelines/kandinsky/test_kandinsky.py @@ -249,8 +249,6 @@ def test_kandinsky(self): expected_slice = np.array( [0.5208529, 0.4821977, 0.44796965, 0.5479469, 0.54242486, 0.45028442, 0.42460358, 0.46456948, 0.48675597] ) - print(image_slice.flatten()) - print(image_from_tuple_slice.flatten()) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 From 9fbcff133568465bb780ff1b56a4d237c3fb032a Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 15 May 2023 23:11:33 +0000 Subject: [PATCH 051/182] fix inpaint pipeline --- .../kandinsky/pipeline_kandinsky_inpaint.py | 61 ++++++++++--------- tests/pipelines/kandinsky/test_kandinsky.py | 2 +- 2 files changed, 34 insertions(+), 29 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py index a9ab4a67e904..f60f28969198 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py @@ -155,9 +155,7 @@ def _encode_prompt( return_tensors="pt", ) - text_input_ids = text_inputs.input_ids.to(device) - text_mask = text_inputs.attention_mask.to(device) - + text_input_ids = text_inputs.input_ids untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids): @@ -166,7 +164,9 @@ def _encode_prompt( "The following part of your input was truncated because CLIP can only handle sequences up to" f" {self.tokenizer.model_max_length} tokens: {removed_text}" ) - text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length] + + text_input_ids = text_input_ids.to(device) + text_mask = text_inputs.attention_mask.to(device) prompt_embeds, text_encoder_hidden_states = self.text_encoder( input_ids=text_input_ids, attention_mask=text_mask @@ -237,31 +237,28 @@ def _encode_prompt( return prompt_embeds, text_encoder_hidden_states, text_mask - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_sequential_cpu_offload def enable_sequential_cpu_offload(self, gpu_id=0): r""" - Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet, - text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a - `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called. - Note that offloading happens on a submodule basis. Memory savings are higher than with - `enable_model_cpu_offload`, but performance is lower. + Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the pipeline's + models have their state dicts saved to CPU and then are moved to a `torch.device('meta') and loaded to GPU only + when their specific submodule has its `forward` method called. 
""" - if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"): + if is_accelerate_available(): from accelerate import cpu_offload else: - raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher") + raise ImportError("Please install accelerate via `pip install accelerate`") device = torch.device(f"cuda:{gpu_id}") - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]: - cpu_offload(cpu_offloaded_model, device) - - if self.safety_checker is not None: - cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True) + models = [ + self.unet, + self.text_proj, + self.text_encoder, + self.movq, + ] + for cpu_offloaded_model in models: + if cpu_offloaded_model is not None: + cpu_offload(cpu_offloaded_model, device) # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_model_cpu_offload def enable_model_cpu_offload(self, gpu_id=0): @@ -315,8 +312,10 @@ def _execution_device(self): def __call__( self, prompt: Union[str, List[str]], - image: Union[torch.FloatTensor, PIL.Image.Image] = None, - mask_image: Union[torch.FloatTensor, PIL.Image.Image] = None, + image: Union[torch.FloatTensor, PIL.Image.Image], + mask_image: Union[torch.FloatTensor, PIL.Image.Image], + image_embeds: torch.FloatTensor, + negative_image_embeds: torch.FloatTensor, height: int = 512, width: int = 512, num_inference_steps: int = 100, @@ -325,8 +324,6 @@ def __call__( negative_prompt: Optional[Union[str, List[str]]] = None, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, latents: Optional[torch.FloatTensor] = None, - image_embeds: Optional[torch.FloatTensor] = None, - negative_image_embeds: Optional[torch.FloatTensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, ): @@ -341,14 +338,22 @@ def __call__( device = self._execution_device batch_size = batch_size * num_images_per_prompt - do_classifier_free_guidance = guidance_scale > 1.0 prompt_embeds, text_encoder_hidden_states, _ = self._encode_prompt( prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt ) - image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to(device) + if isinstance(image_embeds, list): + image_embeds = torch.cat(image_embeds, dim=0) + if isinstance(negative_image_embeds, list): + negative_image_embeds = torch.cat(negative_image_embeds, dim=0) + + if do_classifier_free_guidance: + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + negative_image_embeds = negative_image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + + image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to(dtype=prompt_embeds.dtype, device=device) text_encoder_hidden_states, additive_clip_time_embeddings = self.text_proj( image_embeddings=image_embeds, @@ -444,7 +449,7 @@ def __call__( latents = self.scheduler.step( noise_pred, t, - latents, + torch.cat([latents] * 2) if do_classifier_free_guidance else latents, prev_timestep=prev_timestep, generator=generator, batch_size=batch_size, diff --git a/tests/pipelines/kandinsky/test_kandinsky.py b/tests/pipelines/kandinsky/test_kandinsky.py index 4a6f0e1ab5b9..336f56daffd5 100644 --- a/tests/pipelines/kandinsky/test_kandinsky.py +++ b/tests/pipelines/kandinsky/test_kandinsky.py @@ -251,4 
+251,4 @@ def test_kandinsky(self): ) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 + assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 \ No newline at end of file From 7170530de61eb81fc7bc53115f2652782c8e11cb Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 16 May 2023 00:49:47 +0000 Subject: [PATCH 052/182] refactor image and mask pre-processing --- .../kandinsky/pipeline_kandinsky_inpaint.py | 140 +++++++++++++++--- 1 file changed, 119 insertions(+), 21 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py index f60f28969198..7a4cece97fc8 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py @@ -73,13 +73,121 @@ def prepare_mask(mask): return mask.unsqueeze(0) -def prepare_image(pil_image, w=512, h=512): - pil_image = pil_image.resize((w, h), resample=Image.BICUBIC, reducing_gap=1) - arr = np.array(pil_image.convert("RGB")) - arr = arr.astype(np.float32) / 127.5 - 1 - arr = np.transpose(arr, [2, 0, 1]) - image = torch.from_numpy(arr).unsqueeze(0) - return image +def prepare_mask_and_masked_image(image, mask, height, width): + r""" + Prepares a pair (image, mask) to be consumed by the Kandinsky inpaint pipeline. This means that those inputs will be + converted to ``torch.Tensor`` with shapes ``batch x channels x height x width`` where ``channels`` is ``3`` for the + ``image`` and ``1`` for the ``mask``. + + The ``image`` will be converted to ``torch.float32`` and normalized to be in ``[-1, 1]``. The ``mask`` will be + binarized (``mask > 0.5``) and cast to ``torch.float32`` too. + + Args: + image (Union[np.array, PIL.Image, torch.Tensor]): The image to inpaint. + It can be a ``PIL.Image``, or a ``height x width x 3`` ``np.array`` or a ``channels x height x width`` + ``torch.Tensor`` or a ``batch x channels x height x width`` ``torch.Tensor``. + mask (_type_): The mask to apply to the image, i.e. regions to inpaint. + It can be a ``PIL.Image``, or a ``height x width`` ``np.array`` or a ``1 x height x width`` + ``torch.Tensor`` or a ``batch x 1 x height x width`` ``torch.Tensor``. + + + Raises: + ValueError: ``torch.Tensor`` images should be in the ``[-1, 1]`` range. ValueError: ``torch.Tensor`` mask + should be in the ``[0, 1]`` range. ValueError: ``mask`` and ``image`` should have the same spatial dimensions. + TypeError: ``mask`` is a ``torch.Tensor`` but ``image`` is not + (ot the other way around). + + Returns: + tuple[torch.Tensor]: The pair (mask, masked_image) as ``torch.Tensor`` with 4 + dimensions: ``batch x channels x height x width``. 
+ """ + + if image is None: + raise ValueError("`image` input cannot be undefined.") + + if mask is None: + raise ValueError("`mask_image` input cannot be undefined.") + + if isinstance(image, torch.Tensor): + if not isinstance(mask, torch.Tensor): + raise TypeError(f"`image` is a torch.Tensor but `mask` (type: {type(mask)} is not") + + # Batch single image + if image.ndim == 3: + assert image.shape[0] == 3, "Image outside a batch should be of shape (3, H, W)" + image = image.unsqueeze(0) + + # Batch and add channel dim for single mask + if mask.ndim == 2: + mask = mask.unsqueeze(0).unsqueeze(0) + + # Batch single mask or add channel dim + if mask.ndim == 3: + # Single batched mask, no channel dim or single mask not batched but channel dim + if mask.shape[0] == 1: + mask = mask.unsqueeze(0) + + # Batched masks no channel dim + else: + mask = mask.unsqueeze(1) + + assert image.ndim == 4 and mask.ndim == 4, "Image and Mask must have 4 dimensions" + assert image.shape[-2:] == mask.shape[-2:], "Image and Mask must have the same spatial dimensions" + assert image.shape[0] == mask.shape[0], "Image and Mask must have the same batch size" + + # Check image is in [-1, 1] + if image.min() < -1 or image.max() > 1: + raise ValueError("Image should be in [-1, 1] range") + + # Check mask is in [0, 1] + if mask.min() < 0 or mask.max() > 1: + raise ValueError("Mask should be in [0, 1] range") + + # Binarize mask + mask[mask < 0.5] = 0 + mask[mask >= 0.5] = 1 + + # Image as float32 + image = image.to(dtype=torch.float32) + elif isinstance(mask, torch.Tensor): + raise TypeError(f"`mask` is a torch.Tensor but `image` (type: {type(image)} is not") + else: + # preprocess image + if isinstance(image, (PIL.Image.Image, np.ndarray)): + image = [image] + + if isinstance(image, list) and isinstance(image[0], PIL.Image.Image): + # resize all images w.r.t passed height an width + image = [i.resize((width, height), resample=Image.BICUBIC, reducing_gap=1)for i in image] + image = [np.array(i.convert("RGB"))[None, :] for i in image] + image = np.concatenate(image, axis=0) + elif isinstance(image, list) and isinstance(image[0], np.ndarray): + image = np.concatenate([i[None, :] for i in image], axis=0) + + image = image.transpose(0, 3, 1, 2) + image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0 + + # preprocess mask + if isinstance(mask, (PIL.Image.Image, np.ndarray)): + mask = [mask] + + if isinstance(mask, list) and isinstance(mask[0], PIL.Image.Image): + mask = np.concatenate([np.array(m.convert("L"))[None, None, :] for m in mask], axis=0) + mask = mask.astype(np.float32) / 255.0 + elif isinstance(mask, list) and isinstance(mask[0], np.ndarray): + mask = np.concatenate([m[None, None, :] for m in mask], axis=0) + + mask[mask < 0.5] = 0 + mask[mask >= 0.5] = 1 + mask = torch.from_numpy(mask) + + image_shape = tuple(image.shape[-2:]) + mask = F.interpolate(mask,image_shape,mode="nearest",) + mask = prepare_mask(mask) + + masked_image = image * (mask < 0.5) + + return mask, masked_image class KandinskyInpaintPipeline(DiffusionPipeline): @@ -363,21 +471,11 @@ def __call__( # preprocess image and mask ## Encode the image - image = prepare_image(image, width, height).to(device) - image = self.movq.encode(image)["latents"] - ## prepared mask - mask_image = torch.from_numpy(mask_image).unsqueeze(0).unsqueeze(0) - image_shape = tuple(image.shape[-2:]) - mask_image = F.interpolate( - mask_image, - image_shape, - mode="nearest", - ) - mask_image = prepare_mask(mask_image).to(device) - - ## apply mask on image - 
masked_image = image * mask_image + mask_image, masked_image = prepare_mask_and_masked_image(image, mask_image, height, width) + + image = image.to(device) + image = self.movq.encode(image)["latents"] if do_classifier_free_guidance: mask_image = mask_image.repeat(2, 1, 1, 1) From 14bb0e963c6229e33f2ca5adf3b5bc93560dd013 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 16 May 2023 01:04:32 +0000 Subject: [PATCH 053/182] fix --- .../kandinsky/pipeline_kandinsky_inpaint.py | 23 ++++++++++++------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py index 7a4cece97fc8..c27f6f972eb2 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py @@ -172,6 +172,7 @@ def prepare_mask_and_masked_image(image, mask, height, width): mask = [mask] if isinstance(mask, list) and isinstance(mask[0], PIL.Image.Image): + mask = [i.resize((width, height), resample=PIL.Image.LANCZOS) for i in mask] mask = np.concatenate([np.array(m.convert("L"))[None, None, :] for m in mask], axis=0) mask = mask.astype(np.float32) / 255.0 elif isinstance(mask, list) and isinstance(mask[0], np.ndarray): @@ -181,13 +182,7 @@ def prepare_mask_and_masked_image(image, mask, height, width): mask[mask >= 0.5] = 1 mask = torch.from_numpy(mask) - image_shape = tuple(image.shape[-2:]) - mask = F.interpolate(mask,image_shape,mode="nearest",) - mask = prepare_mask(mask) - - masked_image = image * (mask < 0.5) - - return mask, masked_image + return mask, image class KandinskyInpaintPipeline(DiffusionPipeline): @@ -472,10 +467,22 @@ def __call__( # preprocess image and mask ## Encode the image - mask_image, masked_image = prepare_mask_and_masked_image(image, mask_image, height, width) + mask_image, image = prepare_mask_and_masked_image(image, mask_image, height, width) image = image.to(device) image = self.movq.encode(image)["latents"] + + mask_image = mask_image.to(device) + + image_shape = tuple(image.shape[-2:]) + mask_image = F.interpolate( + mask_image, + image_shape, + mode="nearest", + ) + mask_image = prepare_mask(mask_image).to(device) + # apply mask on image + masked_image = image * mask_image if do_classifier_free_guidance: mask_image = mask_image.repeat(2, 1, 1, 1) From bde29e58aed95aaf0e7e2b56aab8da4687f54d76 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 16 May 2023 02:02:33 +0000 Subject: [PATCH 054/182] refactor adding movq_scale_factor --- .../pipelines/kandinsky/pipeline_kandinsky.py | 15 ++++++++------- .../kandinsky/pipeline_kandinsky_inpaint.py | 17 +++++++++-------- tests/pipelines/kandinsky/test_kandinsky.py | 4 ++-- 3 files changed, 19 insertions(+), 17 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py index 300b3c189d62..42801cac8bfd 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py @@ -36,14 +36,14 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name -def get_new_h_w(h, w): - new_h = h // 64 - if h % 64 != 0: +def get_new_h_w(h, w, scale_factor=8): + new_h = h // scale_factor ** 2 + if h % scale_factor ** 2 != 0: new_h += 1 - new_w = w // 64 - if w % 64 != 0: + new_w = w // scale_factor ** 2 + if w % scale_factor ** 2 != 0: new_w += 1 - return new_h * 8, new_w * 8 + return new_h * scale_factor, new_w * 
scale_factor class KandinskyPipeline(DiffusionPipeline): @@ -87,6 +87,7 @@ def __init__( scheduler=scheduler, movq=movq, ) + self.movq_scale_factor = 2 ** (len(self.movq.config.block_out_channels)-1) def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): if latents is None: @@ -327,7 +328,7 @@ def __call__( num_channels_latents = self.unet.config.in_channels - height, width = get_new_h_w(height, width) + height, width = get_new_h_w(height, width, self.movq_scale_factor) # create initial latent latents = self.prepare_latents( diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py index c27f6f972eb2..a6c1d8ad68bc 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py @@ -41,14 +41,14 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name -def get_new_h_w(h, w): - new_h = h // 64 - if h % 64 != 0: +def get_new_h_w(h, w, scale_factor=8): + new_h = h // scale_factor ** 2 + if h % scale_factor ** 2 != 0: new_h += 1 - new_w = w // 64 - if w % 64 != 0: + new_w = w // scale_factor ** 2 + if w % scale_factor ** 2 != 0: new_w += 1 - return new_h * 8, new_w * 8 + return new_h * scale_factor, new_w * scale_factor def prepare_mask(mask): @@ -226,6 +226,7 @@ def __init__( unet=unet, scheduler=scheduler, ) + self.movq_scale_factor = 2 ** (len(self.movq.config.block_out_channels)-1) def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): if latents is None: @@ -471,7 +472,7 @@ def __call__( image = image.to(device) image = self.movq.encode(image)["latents"] - + mask_image = mask_image.to(device) image_shape = tuple(image.shape[-2:]) @@ -496,7 +497,7 @@ def __call__( # num_channels_latents = self.movq.config.z_channels # get h, w for latents - sample_height, sample_width = get_new_h_w(height, width) + sample_height, sample_width = get_new_h_w(height, width, self.movq_scale_factor) # create initial latent latents = self.prepare_latents( diff --git a/tests/pipelines/kandinsky/test_kandinsky.py b/tests/pipelines/kandinsky/test_kandinsky.py index 336f56daffd5..807b9658e829 100644 --- a/tests/pipelines/kandinsky/test_kandinsky.py +++ b/tests/pipelines/kandinsky/test_kandinsky.py @@ -214,8 +214,8 @@ def get_dummy_inputs(self, device, seed=0): "image_embeds": image_embeds, "negative_image_embeds":image_embeds, "generator": generator, - "height": 256, - "width":256, + "height": 64, + "width":64, "num_inference_steps": 2, "output_type": "np", } From 16dd10c1da1653307c303e0edf44da2f59cac3f2 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 16 May 2023 02:54:42 +0000 Subject: [PATCH 055/182] fix mask dtype --- .../pipelines/kandinsky/pipeline_kandinsky_inpaint.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py index a6c1d8ad68bc..7f165dd4ac69 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py @@ -52,7 +52,7 @@ def get_new_h_w(h, w, scale_factor=8): def prepare_mask(mask): - mask = mask.float()[0] + mask = mask[0] old_mask = deepcopy(mask) for i in range(mask.shape[1]): for j in range(mask.shape[2]): @@ -470,10 +470,10 @@ def __call__( mask_image, image = prepare_mask_and_masked_image(image, mask_image, height, width) - image = 
image.to(device) + image = image.to(dtype=prompt_embeds.dtype, device=device) image = self.movq.encode(image)["latents"] - mask_image = mask_image.to(device) + mask_image = mask_image.to(dtype=prompt_embeds.dtype, device=device) image_shape = tuple(image.shape[-2:]) mask_image = F.interpolate( @@ -481,7 +481,7 @@ def __call__( image_shape, mode="nearest", ) - mask_image = prepare_mask(mask_image).to(device) + mask_image = prepare_mask(mask_image) # apply mask on image masked_image = image * mask_image From b160f899e8568ff27f5dbda40516934dc85c7e5e Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 16 May 2023 03:25:12 +0000 Subject: [PATCH 056/182] fix mask shape --- .../kandinsky/pipeline_kandinsky_inpaint.py | 46 +++++++++++-------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py index 7f165dd4ac69..ad24d5ec35b1 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py @@ -51,26 +51,30 @@ def get_new_h_w(h, w, scale_factor=8): return new_h * scale_factor, new_w * scale_factor -def prepare_mask(mask): - mask = mask[0] - old_mask = deepcopy(mask) - for i in range(mask.shape[1]): - for j in range(mask.shape[2]): - if old_mask[0][i][j] == 1: - continue - if i != 0: - mask[:, i - 1, j] = 0 - if j != 0: - mask[:, i, j - 1] = 0 - if i != 0 and j != 0: - mask[:, i - 1, j - 1] = 0 - if i != mask.shape[1] - 1: - mask[:, i + 1, j] = 0 - if j != mask.shape[2] - 1: - mask[:, i, j + 1] = 0 - if i != mask.shape[1] - 1 and j != mask.shape[2] - 1: - mask[:, i + 1, j + 1] = 0 - return mask.unsqueeze(0) +def prepare_mask(masks): + prepared_masks = [] + for mask in masks: + old_mask = deepcopy(mask) + for i in range(mask.shape[1]): + for j in range(mask.shape[2]): + if old_mask[0][i][j] == 1: + continue + if i != 0: + mask[:, i - 1, j] = 0 + if j != 0: + mask[:, i, j - 1] = 0 + if i != 0 and j != 0: + mask[:, i - 1, j - 1] = 0 + if i != mask.shape[1] - 1: + mask[:, i + 1, j] = 0 + if j != mask.shape[2] - 1: + mask[:, i, j + 1] = 0 + if i != mask.shape[1] - 1 and j != mask.shape[2] - 1: + mask[:, i + 1, j + 1] = 0 + prepared_masks.append(mask) + return torch.stack(prepared_masks, dim=0) + + def prepare_mask_and_masked_image(image, mask, height, width): @@ -485,6 +489,8 @@ def __call__( # apply mask on image masked_image = image * mask_image + mask_image = mask_image.repeat_interleave(num_images_per_prompt, dim=0) + masked_image = masked_image.repeat_interleave(num_images_per_prompt, dim=0) if do_classifier_free_guidance: mask_image = mask_image.repeat(2, 1, 1, 1) masked_image = masked_image.repeat(2, 1, 1, 1) From 383c934083837b5d66d01eac7f2a42d5e107df35 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 16 May 2023 03:28:41 +0000 Subject: [PATCH 057/182] add inpaint test --- .../kandinsky/test_kandinsky_inpaint.py | 268 ++++++++++++++++++ 1 file changed, 268 insertions(+) create mode 100644 tests/pipelines/kandinsky/test_kandinsky_inpaint.py diff --git a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py new file mode 100644 index 000000000000..c8c36cd96e96 --- /dev/null +++ b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py @@ -0,0 +1,268 @@ +# coding=utf-8 +# Copyright 2023 HuggingFace Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import gc +import random +import unittest + +import numpy as np +import torch +from PIL import Image + +from transformers import XLMRobertaTokenizerFast, PretrainedConfig + +from diffusers import KandinskyInpaintPipeline, UnCLIPScheduler, UNet2DConditionModel, VQModel +from diffusers.pipelines.kandinsky.text_proj import KandinskyTextProjModel +from diffusers.pipelines.kandinsky.text_encoder import MultilingualCLIP +from diffusers.utils import floats_tensor, load_numpy, nightly, slow, torch_device +from diffusers.utils.testing_utils import require_torch_gpu, skip_mps + +from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS +from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference + + +torch.backends.cuda.matmul.allow_tf32 = False +torch.use_deterministic_algorithms(True) + + +class KandinskyInpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase): + pipeline_class = KandinskyInpaintPipeline + params = [ + "prompt", + "image_embeds", + "negative_image_embeds", + "image", + "mask_image" + ] + batch_params = [ + "prompt", + "negative_prompt", + "image_embeds", + "negative_image_embeds", + "image", + "mask_image", + ] + required_optional_params = [ + "generator", + "height", + "width", + "latents", + "guidance_scale", + "negative_prompt", + "num_inference_steps", + "return_dict", + "guidance_scale", + "num_images_per_prompt", + "output_type", + "return_dict" + ] + test_xformers_attention = False + + @property + def text_embedder_hidden_size(self): + return 1024 + + @property + def time_input_dim(self): + return 32 + + @property + def block_out_channels_0(self): + return self.time_input_dim + + @property + def time_embed_dim(self): + return self.time_input_dim * 4 + + @property + def cross_attention_dim(self): + return 100 + + # YiYi's TO-DO: add a tiny tokenizer? 
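The inpainting variant follows the same pattern but additionally takes an init image and a mask, which `prepare_mask_and_masked_image` later converts to tensors. A rough sketch mirroring the required arguments of `KandinskyInpaintPipeline.__call__`; the checkpoint id, file name and embedding width are hypothetical:

    import numpy as np
    import torch
    from PIL import Image
    from diffusers import KandinskyInpaintPipeline

    pipe = KandinskyInpaintPipeline.from_pretrained("YiYiXu/Kandinsky-inpaint")  # hypothetical

    init_image = Image.open("horse.png").convert("RGB")   # hypothetical input image
    mask = np.ones((512, 512), dtype=np.float32)          # binary mask, thresholded at 0.5 internally
    mask[:256, :256] = 0

    emb_dim = 768  # assumed width; depends on the CLIP image encoder
    image_embeds = torch.randn(1, emb_dim)                # stand-in for a prior's output
    negative_image_embeds = torch.zeros(1, emb_dim)

    image = pipe(
        "a portrait of a horse",
        image=init_image,
        mask_image=mask,
        image_embeds=image_embeds,
        negative_image_embeds=negative_image_embeds,
        height=512,
        width=512,
        num_inference_steps=50,
    ).images[0]
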
+ @property + def dummy_tokenizer(self): + tokenizer = XLMRobertaTokenizerFast.from_pretrained("YiYiXu/Kandinsky", subfolder="tokenizer") + return tokenizer + + # @property + # def dummy_text_encoder(self): + # torch.manual_seed(0) + # config = PretrainedConfig( + # modelBase="YiYiXu/tiny-random-mclip-base", + # numDims=100, + # transformerDimensions=32) + + # return MultilingualCLIP(config) + + @property + def dummy_text_encoder(self): + torch.manual_seed(0) + config = PretrainedConfig( + modelBase="xlm-roberta-large", + numDims=self.cross_attention_dim, + transformerDimensions=1024) + + return MultilingualCLIP(config) + + @property + def dummy_text_proj(self): + torch.manual_seed(0) + + model_kwargs = { + "clip_embeddings_dim": self.cross_attention_dim, + "time_embed_dim": self.time_embed_dim, + "clip_extra_context_tokens":2, + "cross_attention_dim": self.cross_attention_dim, + "clip_text_encoder_hidden_states_dim": self.text_embedder_hidden_size, + } + + model = KandinskyTextProjModel(**model_kwargs) + return model + + @property + def dummy_unet(self): + torch.manual_seed(0) + + model_kwargs = { + "in_channels": 9, + # Out channels is double in channels because predicts mean and variance + "out_channels": 8, + "down_block_types": ("ResnetDownsampleBlock2D", "SimpleCrossAttnDownBlock2D"), + "up_block_types": ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"), + "mid_block_type": "UNetMidBlock2DSimpleCrossAttn", + "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2), + "layers_per_block": 1, + "cross_attention_dim": self.cross_attention_dim, + "attention_head_dim": 4, + "resnet_time_scale_shift": "scale_shift", + "class_embed_type": "identity", + } + + model = UNet2DConditionModel(**model_kwargs) + return model + + @property + def dummy_movq_kwargs(self): + return { + "block_out_channels": [32,64], + "down_block_types": ["DownEncoderBlock2D","AttnDownEncoderBlock2D"], + "in_channels": 3, + "latent_channels": 4, + "layers_per_block": 1, + "norm_num_groups": 8, + "norm_type": "spatial", + "num_vq_embeddings": 12, + "out_channels": 3, + "up_block_types": ["AttnUpDecoderBlock2D","UpDecoderBlock2D",], + "vq_embed_dim": 4 + } + + @property + def dummy_movq(self): + torch.manual_seed(0) + model = VQModel(**self.dummy_movq_kwargs) + return model + + def get_dummy_components(self): + text_encoder = self.dummy_text_encoder + tokenizer = self.dummy_tokenizer + unet = self.dummy_unet + text_proj = self.dummy_text_proj + movq = self.dummy_movq + + scheduler = UnCLIPScheduler( + clip_sample = True, + clip_sample_range = 2.0, + sample_min_value=1.0, + sample_max_value= None, + num_train_timesteps= 1000, + prediction_type="epsilon", + variance_type= "learned_range", + thresholding= True, + beta_schedule= "linear", + beta_start= 0.00085, + beta_end=0.012 + ) + + components = { + "text_proj": text_proj, + "text_encoder": text_encoder, + "tokenizer": tokenizer, + "unet": unet, + "scheduler": scheduler, + "movq": movq + } + + return components + + def get_dummy_inputs(self, device, seed=0): + image_embeds = floats_tensor((1, self.cross_attention_dim), rng=random.Random(seed)).to(device) + negative_image_embeds = floats_tensor((1, self.cross_attention_dim), rng=random.Random(seed+1)).to(device) + # create init_image + image = floats_tensor((1, 3, 64, 64), rng=random.Random(seed)).to(device) + image = image.cpu().permute(0, 2, 3, 1)[0] + init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((256, 256)) + # create mask + mask = np.ones((64, 64), dtype=np.float32) + 
mask[:32,:32] = 0 + + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) + inputs = { + "prompt": "horse", + "image": init_image, + "mask_image": mask, + "image_embeds": image_embeds, + "negative_image_embeds":image_embeds, + "generator": generator, + "height": 64, + "width":64, + "num_inference_steps": 2, + "output_type": "np", + } + return inputs + + def test_kandinsky_inpaint(self): + device = "cpu" + + components = self.get_dummy_components() + + pipe = self.pipeline_class(**components) + pipe = pipe.to(device) + + pipe.set_progress_bar_config(disable=None) + + output = pipe(**self.get_dummy_inputs(device)) + image = output.images + + image_from_tuple = pipe( + **self.get_dummy_inputs(device), + return_dict=False, + )[0] + + image_slice = image[0, -3:, -3:, -1] + image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] + + print(f"image.shape {image.shape}") + + assert image.shape == (1, 64, 64, 3) + + expected_slice = np.array( + [0.52034867, 0.4924194, 0.44671825, 0.5747229, 0.574834, 0.45885202, 0.41398984, 0.4793774, 0.50443137]) + + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 \ No newline at end of file From 793e408fc686915c35aa563fd42aac8274f1f60f Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 16 May 2023 03:31:46 +0000 Subject: [PATCH 058/182] make style --- src/diffusers/models/attention_processor.py | 11 ++- .../pipelines/kandinsky/pipeline_kandinsky.py | 16 ++-- .../kandinsky/pipeline_kandinsky_inpaint.py | 28 +++--- tests/pipelines/kandinsky/test_kandinsky.py | 81 ++++++++---------- .../kandinsky/test_kandinsky_inpaint.py | 85 +++++++++---------- 5 files changed, 108 insertions(+), 113 deletions(-) diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py index a490bd2bfd8b..1dad3b81c61f 100644 --- a/src/diffusers/models/attention_processor.py +++ b/src/diffusers/models/attention_processor.py @@ -879,12 +879,19 @@ def __init__(self): if not hasattr(F, "scaled_dot_product_attention"): raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.") - def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None, vq_emb=None,): + def __call__( + self, + attn: Attention, + hidden_states, + encoder_hidden_states=None, + attention_mask=None, + vq_emb=None, + ): residual = hidden_states if attn.spatial_norm is not None: hidden_states = attn.spatial_norm(hidden_states, vq_emb) - + input_ndim = hidden_states.ndim if input_ndim == 4: diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py index 42801cac8bfd..3ff4c1287cd7 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py @@ -37,11 +37,11 @@ def get_new_h_w(h, w, scale_factor=8): - new_h = h // scale_factor ** 2 - if h % scale_factor ** 2 != 0: + new_h = h // scale_factor**2 + if h % scale_factor**2 != 0: new_h += 1 - new_w = w // scale_factor ** 2 - if w % scale_factor ** 2 != 0: + new_w = w // scale_factor**2 + if w % scale_factor**2 != 0: new_w += 1 return new_h * scale_factor, new_w * scale_factor @@ -87,7 +87,7 @@ def __init__( scheduler=scheduler, movq=movq, ) - self.movq_scale_factor = 2 ** (len(self.movq.config.block_out_channels)-1) + self.movq_scale_factor = 2 ** 
(len(self.movq.config.block_out_channels) - 1) def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): if latents is None: @@ -305,7 +305,7 @@ def __call__( prompt_embeds, text_encoder_hidden_states, _ = self._encode_prompt( prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt ) - + if isinstance(image_embeds, list): image_embeds = torch.cat(image_embeds, dim=0) if isinstance(negative_image_embeds, list): @@ -315,7 +315,9 @@ def __call__( image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) negative_image_embeds = negative_image_embeds.repeat_interleave(num_images_per_prompt, dim=0) - image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to(dtype=prompt_embeds.dtype, device=device) + image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to( + dtype=prompt_embeds.dtype, device=device + ) text_encoder_hidden_states, additive_clip_time_embeddings = self.text_proj( image_embeddings=image_embeds, diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py index ad24d5ec35b1..da7646edfc75 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py @@ -42,11 +42,11 @@ def get_new_h_w(h, w, scale_factor=8): - new_h = h // scale_factor ** 2 - if h % scale_factor ** 2 != 0: + new_h = h // scale_factor**2 + if h % scale_factor**2 != 0: new_h += 1 - new_w = w // scale_factor ** 2 - if w % scale_factor ** 2 != 0: + new_w = w // scale_factor**2 + if w % scale_factor**2 != 0: new_w += 1 return new_h * scale_factor, new_w * scale_factor @@ -73,15 +73,13 @@ def prepare_mask(masks): mask[:, i + 1, j + 1] = 0 prepared_masks.append(mask) return torch.stack(prepared_masks, dim=0) - - def prepare_mask_and_masked_image(image, mask, height, width): r""" - Prepares a pair (image, mask) to be consumed by the Kandinsky inpaint pipeline. This means that those inputs will be - converted to ``torch.Tensor`` with shapes ``batch x channels x height x width`` where ``channels`` is ``3`` for the - ``image`` and ``1`` for the ``mask``. + Prepares a pair (image, mask) to be consumed by the Kandinsky inpaint pipeline. This means that those inputs will + be converted to ``torch.Tensor`` with shapes ``batch x channels x height x width`` where ``channels`` is ``3`` for + the ``image`` and ``1`` for the ``mask``. The ``image`` will be converted to ``torch.float32`` and normalized to be in ``[-1, 1]``. The ``mask`` will be binarized (``mask > 0.5``) and cast to ``torch.float32`` too. 
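`prepare_mask_and_masked_image` accepts PIL images, numpy arrays or tensors for both inputs, resizes PIL images to the requested resolution, normalizes the image to `[-1, 1]` and binarizes the mask at 0.5. A small check of that contract, mirroring the fast-test inputs (the file name is hypothetical):

    import numpy as np
    from PIL import Image
    from diffusers.pipelines.kandinsky.pipeline_kandinsky_inpaint import prepare_mask_and_masked_image

    init_image = Image.open("horse.png").convert("RGB")  # hypothetical input image
    mask = np.ones((64, 64), dtype=np.float32)
    mask[:32, :32] = 0

    mask_t, image_t = prepare_mask_and_masked_image(init_image, mask, height=64, width=64)
    # image_t: (1, 3, 64, 64) float32 in [-1, 1]
    # mask_t:  (1, 1, 64, 64) float32 with values in {0., 1.}
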
@@ -162,7 +160,7 @@ def prepare_mask_and_masked_image(image, mask, height, width): if isinstance(image, list) and isinstance(image[0], PIL.Image.Image): # resize all images w.r.t passed height an width - image = [i.resize((width, height), resample=Image.BICUBIC, reducing_gap=1)for i in image] + image = [i.resize((width, height), resample=Image.BICUBIC, reducing_gap=1) for i in image] image = [np.array(i.convert("RGB"))[None, :] for i in image] image = np.concatenate(image, axis=0) elif isinstance(image, list) and isinstance(image[0], np.ndarray): @@ -230,7 +228,7 @@ def __init__( unet=unet, scheduler=scheduler, ) - self.movq_scale_factor = 2 ** (len(self.movq.config.block_out_channels)-1) + self.movq_scale_factor = 2 ** (len(self.movq.config.block_out_channels) - 1) def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): if latents is None: @@ -461,7 +459,9 @@ def __call__( image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) negative_image_embeds = negative_image_embeds.repeat_interleave(num_images_per_prompt, dim=0) - image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to(dtype=prompt_embeds.dtype, device=device) + image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to( + dtype=prompt_embeds.dtype, device=device + ) text_encoder_hidden_states, additive_clip_time_embeddings = self.text_proj( image_embeddings=image_embeds, @@ -473,7 +473,7 @@ def __call__( ## Encode the image mask_image, image = prepare_mask_and_masked_image(image, mask_image, height, width) - + image = image.to(dtype=prompt_embeds.dtype, device=device) image = self.movq.encode(image)["latents"] @@ -487,7 +487,7 @@ def __call__( ) mask_image = prepare_mask(mask_image) # apply mask on image - masked_image = image * mask_image + masked_image = image * mask_image mask_image = mask_image.repeat_interleave(num_images_per_prompt, dim=0) masked_image = masked_image.repeat_interleave(num_images_per_prompt, dim=0) diff --git a/tests/pipelines/kandinsky/test_kandinsky.py b/tests/pipelines/kandinsky/test_kandinsky.py index 807b9658e829..12d089b9ce12 100644 --- a/tests/pipelines/kandinsky/test_kandinsky.py +++ b/tests/pipelines/kandinsky/test_kandinsky.py @@ -13,23 +13,19 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import gc import random import unittest import numpy as np import torch +from transformers import PretrainedConfig, XLMRobertaTokenizerFast -from transformers import XLMRobertaTokenizerFast, PretrainedConfig - -from diffusers import PriorTransformer, KandinskyPipeline, UnCLIPScheduler, UNet2DConditionModel, VQModel -from diffusers.pipelines.kandinsky.text_proj import KandinskyTextProjModel +from diffusers import KandinskyPipeline, UnCLIPScheduler, UNet2DConditionModel, VQModel from diffusers.pipelines.kandinsky.text_encoder import MultilingualCLIP -from diffusers.utils import floats_tensor, load_numpy, nightly, slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu, skip_mps +from diffusers.pipelines.kandinsky.text_proj import KandinskyTextProjModel +from diffusers.utils import floats_tensor -from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference +from ..test_pipelines_common import PipelineTesterMixin torch.backends.cuda.matmul.allow_tf32 = False @@ -43,12 +39,7 @@ class KandinskyPipelineFastTests(PipelineTesterMixin, unittest.TestCase): "image_embeds", "negative_image_embeds", ] - batch_params = [ - "prompt", - "negative_prompt", - "image_embeds", - "negative_image_embeds" - ] + batch_params = ["prompt", "negative_prompt", "image_embeds", "negative_image_embeds"] required_optional_params = [ "generator", "height", @@ -61,7 +52,7 @@ class KandinskyPipelineFastTests(PipelineTesterMixin, unittest.TestCase): "guidance_scale", "num_images_per_prompt", "output_type", - "return_dict" + "return_dict", ] test_xformers_attention = False @@ -84,13 +75,13 @@ def time_embed_dim(self): @property def cross_attention_dim(self): return 100 - + # YiYi's TO-DO: add a tiny tokenizer? 
@property def dummy_tokenizer(self): tokenizer = XLMRobertaTokenizerFast.from_pretrained("YiYiXu/Kandinsky", subfolder="tokenizer") return tokenizer - + # @property # def dummy_text_encoder(self): # torch.manual_seed(0) @@ -100,14 +91,13 @@ def dummy_tokenizer(self): # transformerDimensions=32) # return MultilingualCLIP(config) - + @property def dummy_text_encoder(self): torch.manual_seed(0) config = PretrainedConfig( - modelBase="xlm-roberta-large", - numDims=self.cross_attention_dim, - transformerDimensions=1024) + modelBase="xlm-roberta-large", numDims=self.cross_attention_dim, transformerDimensions=1024 + ) return MultilingualCLIP(config) @@ -118,9 +108,9 @@ def dummy_text_proj(self): model_kwargs = { "clip_embeddings_dim": self.cross_attention_dim, "time_embed_dim": self.time_embed_dim, - "clip_extra_context_tokens":2, + "clip_extra_context_tokens": 2, "cross_attention_dim": self.cross_attention_dim, - "clip_text_encoder_hidden_states_dim": self.text_embedder_hidden_size, + "clip_text_encoder_hidden_states_dim": self.text_embedder_hidden_size, } model = KandinskyTextProjModel(**model_kwargs) @@ -151,8 +141,8 @@ def dummy_unet(self): @property def dummy_movq_kwargs(self): return { - "block_out_channels": [32,64], - "down_block_types": ["DownEncoderBlock2D","AttnDownEncoderBlock2D"], + "block_out_channels": [32, 64], + "down_block_types": ["DownEncoderBlock2D", "AttnDownEncoderBlock2D"], "in_channels": 3, "latent_channels": 4, "layers_per_block": 1, @@ -160,9 +150,12 @@ def dummy_movq_kwargs(self): "norm_type": "spatial", "num_vq_embeddings": 12, "out_channels": 3, - "up_block_types": ["AttnUpDecoderBlock2D","UpDecoderBlock2D",], - "vq_embed_dim": 4 - } + "up_block_types": [ + "AttnUpDecoderBlock2D", + "UpDecoderBlock2D", + ], + "vq_embed_dim": 4, + } @property def dummy_movq(self): @@ -178,17 +171,17 @@ def get_dummy_components(self): movq = self.dummy_movq scheduler = UnCLIPScheduler( - clip_sample = True, - clip_sample_range = 2.0, + clip_sample=True, + clip_sample_range=2.0, sample_min_value=1.0, - sample_max_value= None, - num_train_timesteps= 1000, + sample_max_value=None, + num_train_timesteps=1000, prediction_type="epsilon", - variance_type= "learned_range", - thresholding= True, - beta_schedule= "linear", - beta_start= 0.00085, - beta_end=0.012 + variance_type="learned_range", + thresholding=True, + beta_schedule="linear", + beta_start=0.00085, + beta_end=0.012, ) components = { @@ -197,14 +190,14 @@ def get_dummy_components(self): "tokenizer": tokenizer, "unet": unet, "scheduler": scheduler, - "movq": movq + "movq": movq, } return components def get_dummy_inputs(self, device, seed=0): image_embeds = floats_tensor((1, self.cross_attention_dim), rng=random.Random(seed)).to(device) - negative_image_embeds = floats_tensor((1, self.cross_attention_dim), rng=random.Random(seed+1)).to(device) + floats_tensor((1, self.cross_attention_dim), rng=random.Random(seed + 1)).to(device) if str(device).startswith("mps"): generator = torch.manual_seed(seed) else: @@ -212,10 +205,10 @@ def get_dummy_inputs(self, device, seed=0): inputs = { "prompt": "horse", "image_embeds": image_embeds, - "negative_image_embeds":image_embeds, + "negative_image_embeds": image_embeds, "generator": generator, "height": 64, - "width":64, + "width": 64, "num_inference_steps": 2, "output_type": "np", } @@ -241,7 +234,7 @@ def test_kandinsky(self): image_slice = image[0, -3:, -3:, -1] image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - + print(f"image.shape {image.shape}") assert image.shape == (1, 64, 64, 3) 
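The expected shape in the assertion above follows directly from the dummy movq configuration; a quick arithmetic check:

    # dummy_movq_kwargs: block_out_channels = [32, 64]
    # movq_scale_factor = 2 ** (len(block_out_channels) - 1) = 2
    # get_new_h_w(64, 64, scale_factor=2): 64 // 2**2 = 16 -> (16 * 2, 16 * 2) = (32, 32) latents
    # decoding upsamples by the same factor, so the output is a 64 x 64 RGB image: (1, 64, 64, 3)
    # (with the real default scale_factor=8 the same formula maps 512 x 512 to 64 x 64 latents)
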
@@ -251,4 +244,4 @@ def test_kandinsky(self): ) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 \ No newline at end of file + assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 diff --git a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py index c8c36cd96e96..a5f9e29f1d7b 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py +++ b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py @@ -13,24 +13,20 @@ # See the License for the specific language governing permissions and # limitations under the License. -import gc import random import unittest import numpy as np import torch from PIL import Image - -from transformers import XLMRobertaTokenizerFast, PretrainedConfig +from transformers import PretrainedConfig, XLMRobertaTokenizerFast from diffusers import KandinskyInpaintPipeline, UnCLIPScheduler, UNet2DConditionModel, VQModel -from diffusers.pipelines.kandinsky.text_proj import KandinskyTextProjModel from diffusers.pipelines.kandinsky.text_encoder import MultilingualCLIP -from diffusers.utils import floats_tensor, load_numpy, nightly, slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu, skip_mps +from diffusers.pipelines.kandinsky.text_proj import KandinskyTextProjModel +from diffusers.utils import floats_tensor -from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference +from ..test_pipelines_common import PipelineTesterMixin torch.backends.cuda.matmul.allow_tf32 = False @@ -39,13 +35,7 @@ class KandinskyInpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = KandinskyInpaintPipeline - params = [ - "prompt", - "image_embeds", - "negative_image_embeds", - "image", - "mask_image" - ] + params = ["prompt", "image_embeds", "negative_image_embeds", "image", "mask_image"] batch_params = [ "prompt", "negative_prompt", @@ -66,7 +56,7 @@ class KandinskyInpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase): "guidance_scale", "num_images_per_prompt", "output_type", - "return_dict" + "return_dict", ] test_xformers_attention = False @@ -89,13 +79,13 @@ def time_embed_dim(self): @property def cross_attention_dim(self): return 100 - + # YiYi's TO-DO: add a tiny tokenizer? 
@property def dummy_tokenizer(self): tokenizer = XLMRobertaTokenizerFast.from_pretrained("YiYiXu/Kandinsky", subfolder="tokenizer") return tokenizer - + # @property # def dummy_text_encoder(self): # torch.manual_seed(0) @@ -105,14 +95,13 @@ def dummy_tokenizer(self): # transformerDimensions=32) # return MultilingualCLIP(config) - + @property def dummy_text_encoder(self): torch.manual_seed(0) config = PretrainedConfig( - modelBase="xlm-roberta-large", - numDims=self.cross_attention_dim, - transformerDimensions=1024) + modelBase="xlm-roberta-large", numDims=self.cross_attention_dim, transformerDimensions=1024 + ) return MultilingualCLIP(config) @@ -123,9 +112,9 @@ def dummy_text_proj(self): model_kwargs = { "clip_embeddings_dim": self.cross_attention_dim, "time_embed_dim": self.time_embed_dim, - "clip_extra_context_tokens":2, + "clip_extra_context_tokens": 2, "cross_attention_dim": self.cross_attention_dim, - "clip_text_encoder_hidden_states_dim": self.text_embedder_hidden_size, + "clip_text_encoder_hidden_states_dim": self.text_embedder_hidden_size, } model = KandinskyTextProjModel(**model_kwargs) @@ -156,8 +145,8 @@ def dummy_unet(self): @property def dummy_movq_kwargs(self): return { - "block_out_channels": [32,64], - "down_block_types": ["DownEncoderBlock2D","AttnDownEncoderBlock2D"], + "block_out_channels": [32, 64], + "down_block_types": ["DownEncoderBlock2D", "AttnDownEncoderBlock2D"], "in_channels": 3, "latent_channels": 4, "layers_per_block": 1, @@ -165,9 +154,12 @@ def dummy_movq_kwargs(self): "norm_type": "spatial", "num_vq_embeddings": 12, "out_channels": 3, - "up_block_types": ["AttnUpDecoderBlock2D","UpDecoderBlock2D",], - "vq_embed_dim": 4 - } + "up_block_types": [ + "AttnUpDecoderBlock2D", + "UpDecoderBlock2D", + ], + "vq_embed_dim": 4, + } @property def dummy_movq(self): @@ -183,17 +175,17 @@ def get_dummy_components(self): movq = self.dummy_movq scheduler = UnCLIPScheduler( - clip_sample = True, - clip_sample_range = 2.0, + clip_sample=True, + clip_sample_range=2.0, sample_min_value=1.0, - sample_max_value= None, - num_train_timesteps= 1000, + sample_max_value=None, + num_train_timesteps=1000, prediction_type="epsilon", - variance_type= "learned_range", - thresholding= True, - beta_schedule= "linear", - beta_start= 0.00085, - beta_end=0.012 + variance_type="learned_range", + thresholding=True, + beta_schedule="linear", + beta_start=0.00085, + beta_end=0.012, ) components = { @@ -202,21 +194,21 @@ def get_dummy_components(self): "tokenizer": tokenizer, "unet": unet, "scheduler": scheduler, - "movq": movq + "movq": movq, } return components def get_dummy_inputs(self, device, seed=0): image_embeds = floats_tensor((1, self.cross_attention_dim), rng=random.Random(seed)).to(device) - negative_image_embeds = floats_tensor((1, self.cross_attention_dim), rng=random.Random(seed+1)).to(device) + floats_tensor((1, self.cross_attention_dim), rng=random.Random(seed + 1)).to(device) # create init_image image = floats_tensor((1, 3, 64, 64), rng=random.Random(seed)).to(device) image = image.cpu().permute(0, 2, 3, 1)[0] init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((256, 256)) # create mask mask = np.ones((64, 64), dtype=np.float32) - mask[:32,:32] = 0 + mask[:32, :32] = 0 if str(device).startswith("mps"): generator = torch.manual_seed(seed) @@ -227,10 +219,10 @@ def get_dummy_inputs(self, device, seed=0): "image": init_image, "mask_image": mask, "image_embeds": image_embeds, - "negative_image_embeds":image_embeds, + "negative_image_embeds": image_embeds, "generator": 
generator, "height": 64, - "width":64, + "width": 64, "num_inference_steps": 2, "output_type": "np", } @@ -256,13 +248,14 @@ def test_kandinsky_inpaint(self): image_slice = image[0, -3:, -3:, -1] image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - + print(f"image.shape {image.shape}") assert image.shape == (1, 64, 64, 3) expected_slice = np.array( - [0.52034867, 0.4924194, 0.44671825, 0.5747229, 0.574834, 0.45885202, 0.41398984, 0.4793774, 0.50443137]) + [0.52034867, 0.4924194, 0.44671825, 0.5747229, 0.574834, 0.45885202, 0.41398984, 0.4793774, 0.50443137] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 \ No newline at end of file + assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 From a63c2723dbed101c953acb30eeaad12301e974e0 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 16 May 2023 04:04:56 +0000 Subject: [PATCH 059/182] prior_num_inference_steps -> num_inference_steps --- src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py index 29c978455af1..50469e06b37d 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py @@ -262,7 +262,7 @@ def __call__( self, prompt, num_images_per_prompt: int = 1, - prior_num_inference_steps: int = 5, + num_inference_steps: int = 5, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, prior_latents: Optional[torch.FloatTensor] = None, negative_prompt: Optional[Union[str, List[str]]] = None, @@ -291,7 +291,7 @@ def __call__( ) # prior - self.scheduler.set_timesteps(prior_num_inference_steps, device=device) + self.scheduler.set_timesteps(num_inference_steps, device=device) prior_timesteps_tensor = self.scheduler.timesteps embedding_dim = self.prior.config.embedding_dim From a1ddc2947ef8bd4fc8c89d3ec99903700b21c01e Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 16 May 2023 04:09:17 +0000 Subject: [PATCH 060/182] clean up prior pipeline args --- .../kandinsky/pipeline_kandinsky_prior.py | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py index 50469e06b37d..c8e592de7e1e 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py @@ -264,9 +264,9 @@ def __call__( num_images_per_prompt: int = 1, num_inference_steps: int = 5, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - prior_latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.FloatTensor] = None, negative_prompt: Optional[Union[str, List[str]]] = None, - prior_guidance_scale: float = 4.0, + guidance_scale: float = 4.0, output_type: Optional[str] = "pt", return_dict: bool = True, ): @@ -285,7 +285,7 @@ def __call__( image_embeddings = self.create_zero_img_emb(batch_size=batch_size, device=device) else: - do_classifier_free_guidance = prior_guidance_scale > 1.0 + do_classifier_free_guidance = guidance_scale > 1.0 prompt_embeds, text_encoder_hidden_states, text_mask = self._encode_prompt( prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt ) @@ 
-296,18 +296,18 @@ def __call__( embedding_dim = self.prior.config.embedding_dim - prior_latents = self.prepare_latents( + latents = self.prepare_latents( (batch_size, embedding_dim), prompt_embeds.dtype, device, generator, - prior_latents, + latents, self.scheduler, ) for i, t in enumerate(self.progress_bar(prior_timesteps_tensor)): # expand the latents if we are doing classifier free guidance - latent_model_input = torch.cat([prior_latents] * 2) if do_classifier_free_guidance else prior_latents + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents predicted_image_embedding = self.prior( latent_model_input, @@ -321,7 +321,7 @@ def __call__( predicted_image_embedding_uncond, predicted_image_embedding_text = predicted_image_embedding.chunk( 2 ) - predicted_image_embedding = predicted_image_embedding_uncond + prior_guidance_scale * ( + predicted_image_embedding = predicted_image_embedding_uncond + guidance_scale * ( predicted_image_embedding_text - predicted_image_embedding_uncond ) @@ -330,16 +330,16 @@ def __call__( else: prev_timestep = prior_timesteps_tensor[i + 1] - prior_latents = self.scheduler.step( + latents = self.scheduler.step( predicted_image_embedding, timestep=t, - sample=prior_latents, + sample=latents, generator=generator, prev_timestep=prev_timestep, ).prev_sample - prior_latents = self.prior.post_process_latents(prior_latents) + latents = self.prior.post_process_latents(latents) - image_embeddings = prior_latents + image_embeddings = latents return image_embeddings From 9aa256afd61f972f34048aa98b47fd5f843af5ca Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 16 May 2023 04:29:43 +0000 Subject: [PATCH 061/182] remove the hardcoded clip image size --- src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py index c8e592de7e1e..9f70de1b572e 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py @@ -82,7 +82,7 @@ def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): return latents def create_zero_img_emb(self, batch_size, device): - zero_img = torch.zeros(1, 3, 224, 224).to(device=device) + zero_img = torch.zeros(1, 3, self.image_encoder.config.image_size, self.image_encoder.config.image_size).to(device=device) zero_image_emb = self.image_encoder(zero_img)["image_embeds"] zero_image_emb = zero_image_emb.repeat(batch_size, 1) return zero_image_emb From fed91165ac9577e531792292958ef4d00734553a Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 16 May 2023 04:54:47 +0000 Subject: [PATCH 062/182] output --- .../pipelines/kandinsky/pipeline_kandinsky_prior.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py index 9f70de1b572e..603c216ee7e2 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py @@ -19,6 +19,7 @@ from ...models import PriorTransformer from ...pipelines import DiffusionPipeline +from ...pipelines.pipeline_utils import ImagePipelineOutput from ...schedulers import UnCLIPScheduler from ...utils import ( is_accelerate_available, @@ -267,7 +268,7 @@ def __call__( latents: 
Optional[torch.FloatTensor] = None, negative_prompt: Optional[Union[str, List[str]]] = None, guidance_scale: float = 4.0, - output_type: Optional[str] = "pt", + output_type: Optional[str] = "pt", # pt only return_dict: bool = True, ): if isinstance(prompt, str): @@ -342,4 +343,7 @@ def __call__( image_embeddings = latents - return image_embeddings + if not return_dict: + return (image_embeddings, ) + + return ImagePipelineOutput(images=image_embeddings) From 7ce5c05c3d4ca3c9da142701d91c91ba21d67013 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 16 May 2023 08:22:05 +0000 Subject: [PATCH 063/182] add tests for prior --- .../kandinsky/pipeline_kandinsky_prior.py | 68 ++---- .../kandinsky/test_kandinsky_prior.py | 228 ++++++++++++++++++ 2 files changed, 248 insertions(+), 48 deletions(-) create mode 100644 tests/pipelines/kandinsky/test_kandinsky_prior.py diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py index 603c216ee7e2..5826261eb9aa 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py @@ -88,60 +88,26 @@ def create_zero_img_emb(self, batch_size, device): zero_image_emb = zero_image_emb.repeat(batch_size, 1) return zero_image_emb - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_sequential_cpu_offload def enable_sequential_cpu_offload(self, gpu_id=0): r""" - Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet, - text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a - `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called. - Note that offloading happens on a submodule basis. Memory savings are higher than with - `enable_model_cpu_offload`, but performance is lower. + Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the pipeline's + models have their state dicts saved to CPU and then are moved to a `torch.device('meta') and loaded to GPU only + when their specific submodule has its `forward` method called. """ - if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"): + if is_accelerate_available(): from accelerate import cpu_offload else: - raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher") + raise ImportError("Please install accelerate via `pip install accelerate`") device = torch.device(f"cuda:{gpu_id}") - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]: - cpu_offload(cpu_offloaded_model, device) - - if self.safety_checker is not None: - cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True) - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_model_cpu_offload - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared - to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` - method is called, and the model remains in GPU until the next model runs. 
Memory savings are lower than with - `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - hook = None - for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]: - _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - - if self.safety_checker is not None: - _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) - - # We'll offload the last model manually. - self.final_offload_hook = hook + models = [ + self.image_encoder, + self.text_encoder, + ] + for cpu_offloaded_model in models: + if cpu_offloaded_model is not None: + cpu_offload(cpu_offloaded_model, device) @property def _execution_device(self): @@ -150,9 +116,9 @@ def _execution_device(self): `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module hooks. """ - if self.device != torch.device("meta") or not hasattr(self.decoder, "_hf_hook"): + if self.device != torch.device("meta") or not hasattr(self.text_encoder, "_hf_hook"): return self.device - for module in self.decoder.modules(): + for module in self.text_encoder.modules(): if ( hasattr(module, "_hf_hook") and hasattr(module._hf_hook, "execution_device") @@ -343,6 +309,12 @@ def __call__( image_embeddings = latents + # YiYi's notes: + ## Prior Pipeline should always return a tensor that can be used in text2img/img2img/inpainting pipelines + ## However need np type for testing purpose + if output_type == 'np': + image_embeddings = image_embeddings.cpu().numpy() + if not return_dict: return (image_embeddings, ) diff --git a/tests/pipelines/kandinsky/test_kandinsky_prior.py b/tests/pipelines/kandinsky/test_kandinsky_prior.py new file mode 100644 index 000000000000..cfbf76f9f2bb --- /dev/null +++ b/tests/pipelines/kandinsky/test_kandinsky_prior.py @@ -0,0 +1,228 @@ +# coding=utf-8 +# Copyright 2023 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import gc +import unittest + +import numpy as np +import torch +from torch import nn +from transformers import CLIPTextConfig, CLIPVisionConfig, CLIPTokenizer, CLIPTextModelWithProjection, CLIPVisionModelWithProjection + +from diffusers import PriorTransformer, KandinskyPriorPipeline, UnCLIPScheduler, UNet2DConditionModel +from diffusers.utils import load_numpy, nightly, slow, torch_device +from diffusers.utils.testing_utils import require_torch_gpu, skip_mps + +from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS +from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference + + +torch.backends.cuda.matmul.allow_tf32 = False +torch.use_deterministic_algorithms(True) + + +class KandinskyPriorPipelineFastTests(PipelineTesterMixin, unittest.TestCase): + pipeline_class = KandinskyPriorPipeline + params = ["prompt"] + batch_params = [ + "prompt", + "negative_prompt" + ] + required_optional_params = [ + "num_images_per_prompt", + "generator", + "num_inference_steps", + "latents", + "negative_prompt", + "guidance_scale", + "output_type", + "return_dict" + ] + test_xformers_attention = False + + @property + def text_embedder_hidden_size(self): + return 32 + + @property + def time_input_dim(self): + return 32 + + @property + def block_out_channels_0(self): + return self.time_input_dim + + @property + def time_embed_dim(self): + return self.time_input_dim * 4 + + @property + def cross_attention_dim(self): + return 100 + + @property + def dummy_tokenizer(self): + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") + return tokenizer + + @property + def dummy_text_encoder(self): + torch.manual_seed(0) + config = CLIPTextConfig( + bos_token_id=0, + eos_token_id=2, + hidden_size=self.text_embedder_hidden_size, + projection_dim=self.text_embedder_hidden_size, + intermediate_size=37, + layer_norm_eps=1e-05, + num_attention_heads=4, + num_hidden_layers=5, + pad_token_id=1, + vocab_size=1000, + ) + return CLIPTextModelWithProjection(config) + + @property + def dummy_prior(self): + torch.manual_seed(0) + + model_kwargs = { + "num_attention_heads": 2, + "attention_head_dim": 12, + "embedding_dim": self.text_embedder_hidden_size, + "num_layers": 1, + } + + model = PriorTransformer(**model_kwargs) + # clip_std and clip_mean is initialized to be 0 so PriorTransformer.post_process_latents will always return 0 - set clip_std to be 1 so it won't return 0 + model.clip_std = nn.Parameter(torch.ones(model.clip_std.shape)) + return model + + @property + def dummy_image_encoder(self): + torch.manual_seed(0) + config = CLIPVisionConfig( + hidden_size=self.text_embedder_hidden_size, + image_size=224, + projection_dim=self.text_embedder_hidden_size, + intermediate_size=37, + num_attention_heads=4, + num_channels=3, + num_hidden_layers=5, + patch_size=14, + ) + + model = CLIPVisionModelWithProjection(config) + return model + + def get_dummy_components(self): + prior = self.dummy_prior + image_encoder = self.dummy_image_encoder + text_encoder = self.dummy_text_encoder + tokenizer = self.dummy_tokenizer + + scheduler = UnCLIPScheduler( + variance_type="fixed_small_log", + prediction_type="sample", + num_train_timesteps=1000, + clip_sample=True, + clip_sample_range=10.0, + ) + + components = { + "prior": prior, + "image_encoder": image_encoder, + "text_encoder": text_encoder, + "tokenizer": tokenizer, + "scheduler": scheduler, + } + + return components + + def get_dummy_inputs(self, device, seed=0): + if str(device).startswith("mps"): + 
generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) + inputs = { + "prompt": "horse", + "generator": generator, + "num_inference_steps": 2, + "output_type": "np", + } + return inputs + + def test_kandinsky_prior(self): + device = "cpu" + + components = self.get_dummy_components() + + pipe = self.pipeline_class(**components) + pipe = pipe.to(device) + + pipe.set_progress_bar_config(disable=None) + + output = pipe(**self.get_dummy_inputs(device)) + image = output.images + + image_from_tuple = pipe( + **self.get_dummy_inputs(device), + return_dict=False, + )[0] + + image_slice = image[0, -10:] + image_from_tuple_slice = image_from_tuple[0, -10:] + + assert image.shape == (1, 32) + + expected_slice = np.array( + [ + -0.0532, + 1.7120, + 0.3656, + -1.0852, + -0.8946, + -1.1756, + 0.4348, + 0.2482, + 0.5146, + -0.1156 + ] + ) + + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 + + @skip_mps + def test_inference_batch_single_identical(self): + test_max_difference = torch_device == "cpu" + relax_max_difference = True + test_mean_pixel_difference = False + + self._test_inference_batch_single_identical( + test_max_difference=test_max_difference, + relax_max_difference=relax_max_difference, + test_mean_pixel_difference=test_mean_pixel_difference, + ) + + @skip_mps + def test_attention_slicing_forward_pass(self): + test_max_difference = torch_device == "cpu" + test_mean_pixel_difference = False + + self._test_attention_slicing_forward_pass( + test_max_difference=test_max_difference, + test_mean_pixel_difference=test_mean_pixel_difference, + ) \ No newline at end of file From 795411d6fd709e0c5beb29a563c3f746a03b6638 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 16 May 2023 09:14:57 +0000 Subject: [PATCH 064/182] add slow test for text2img --- .../kandinsky/pipeline_kandinsky_prior.py | 2 +- tests/pipelines/kandinsky/test_kandinsky.py | 51 +++++++++++++++++-- 2 files changed, 49 insertions(+), 4 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py index 5826261eb9aa..84df5e30349c 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py @@ -83,7 +83,7 @@ def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): return latents def create_zero_img_emb(self, batch_size, device): - zero_img = torch.zeros(1, 3, self.image_encoder.config.image_size, self.image_encoder.config.image_size).to(device=device) + zero_img = torch.zeros(1, 3, self.image_encoder.config.image_size, self.image_encoder.config.image_size).to(device=device, dtype=self.image_encoder.dtype) zero_image_emb = self.image_encoder(zero_img)["image_embeds"] zero_image_emb = zero_image_emb.repeat(batch_size, 1) return zero_image_emb diff --git a/tests/pipelines/kandinsky/test_kandinsky.py b/tests/pipelines/kandinsky/test_kandinsky.py index 12d089b9ce12..7ba229428bd7 100644 --- a/tests/pipelines/kandinsky/test_kandinsky.py +++ b/tests/pipelines/kandinsky/test_kandinsky.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import gc import random import unittest @@ -20,12 +21,13 @@ import torch from transformers import PretrainedConfig, XLMRobertaTokenizerFast -from diffusers import KandinskyPipeline, UnCLIPScheduler, UNet2DConditionModel, VQModel +from diffusers import KandinskyPipeline, KandinskyPriorPipeline, UnCLIPScheduler, UNet2DConditionModel, VQModel from diffusers.pipelines.kandinsky.text_encoder import MultilingualCLIP from diffusers.pipelines.kandinsky.text_proj import KandinskyTextProjModel -from diffusers.utils import floats_tensor +from diffusers.utils import floats_tensor, slow, nightly, load_numpy, torch_device +from diffusers.utils.testing_utils import require_torch_gpu, skip_mps -from ..test_pipelines_common import PipelineTesterMixin +from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference torch.backends.cuda.matmul.allow_tf32 = False @@ -245,3 +247,46 @@ def test_kandinsky(self): assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 + + +@slow +@require_torch_gpu +class KandinskyPipelineIntegrationTests(unittest.TestCase): + def tearDown(self): + # clean up the VRAM after each test + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def test_kandinsky_text2img(self): + expected_image = load_numpy( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + "/kandinsky/kandinsky_text2img_cat_fp16.npy" + ) + pipe_prior = KandinskyPriorPipeline.from_pretrained("YiYiXu/Kandinsky-prior", torch_dtype=torch.float16) + pipe_prior.to(torch_device) + + pipeline = KandinskyPipeline.from_pretrained("YiYiXu/Kandinsky", torch_dtype=torch.float16) + pipeline = pipeline.to(torch_device) + pipeline.set_progress_bar_config(disable=None) + + prompt= "red cat, 4k photo" + + generator = torch.Generator(device="cpu").manual_seed(0) + image_emb = pipe_prior(prompt, generator=generator,).images + zero_image_emb = pipe_prior("").images + + output = pipeline( + prompt, + image_embeds=image_emb, + negative_image_embeds=zero_image_emb, + generator=generator, + num_inference_steps=100, + output_type="np", + ) + + image = output.images[0] + + assert image.shape == (512, 512, 3) + + assert_mean_pixel_difference(image, expected_image) \ No newline at end of file From faa0145f798be3fde6ebff8d0a553b9fb9128682 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 16 May 2023 09:41:28 +0000 Subject: [PATCH 065/182] more tests --- .../kandinsky/test_kandinsky_inpaint.py | 62 ++++++++++++++++++- 1 file changed, 59 insertions(+), 3 deletions(-) diff --git a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py index a5f9e29f1d7b..5b6a8defbb21 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py +++ b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import gc import random import unittest @@ -21,12 +22,13 @@ from PIL import Image from transformers import PretrainedConfig, XLMRobertaTokenizerFast -from diffusers import KandinskyInpaintPipeline, UnCLIPScheduler, UNet2DConditionModel, VQModel +from diffusers import KandinskyInpaintPipeline, KandinskyPriorPipeline, UnCLIPScheduler, UNet2DConditionModel, VQModel from diffusers.pipelines.kandinsky.text_encoder import MultilingualCLIP from diffusers.pipelines.kandinsky.text_proj import KandinskyTextProjModel -from diffusers.utils import floats_tensor +from diffusers.utils import floats_tensor, load_image, load_numpy, nightly, slow, torch_device +from diffusers.utils.testing_utils import require_torch_gpu, skip_mps -from ..test_pipelines_common import PipelineTesterMixin +from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference torch.backends.cuda.matmul.allow_tf32 = False @@ -259,3 +261,57 @@ def test_kandinsky_inpaint(self): assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 + +@slow +@require_torch_gpu +class KandinskyInpaintPipelineIntegrationTests(unittest.TestCase): + def tearDown(self): + # clean up the VRAM after each test + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def test_kandinsky_inpaint(self): + expected_image = load_numpy( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + "/kandinsky/kandinsky_inpaint_cat_with_hat_fp16.npy" + ) + + init_image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + "/kandinsky/cat.png" + ) + mask = np.ones((768, 768), dtype=np.float32) + mask[:250,250:-250] = 0 + + prompt="a hat" + + pipe_prior = KandinskyPriorPipeline.from_pretrained("YiYiXu/Kandinsky-prior", torch_dtype=torch.float16) + pipe_prior.to(torch_device) + + pipeline = KandinskyInpaintPipeline.from_pretrained("YiYiXu/Kandinsky-inpaint", torch_dtype=torch.float16) + pipeline = pipeline.to(torch_device) + pipeline.set_progress_bar_config(disable=None) + + generator = torch.Generator(device="cpu").manual_seed(0) + image_emb = pipe_prior(prompt, generator=generator,).images + zero_image_emb = pipe_prior("").images + + output = pipeline( + prompt, + image=init_image, + mask_image=mask, + image_embeds=image_emb, + negative_image_embeds=zero_image_emb, + generator=generator, + num_inference_steps=100, + height=768, + width=768, + output_type="np", + ) + + image = output.images[0] + + assert image.shape == (768, 768, 3) + + assert_mean_pixel_difference(image, expected_image) \ No newline at end of file From 95efab204007a853c20547be9685ef9af9e6357f Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Tue, 16 May 2023 12:09:23 -1000 Subject: [PATCH 066/182] Update src/diffusers/pipelines/kandinsky/text_encoder.py Co-authored-by: Patrick von Platen --- src/diffusers/pipelines/kandinsky/text_encoder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/kandinsky/text_encoder.py b/src/diffusers/pipelines/kandinsky/text_encoder.py index 4906f95d387b..aaf1ff3e80f6 100644 --- a/src/diffusers/pipelines/kandinsky/text_encoder.py +++ b/src/diffusers/pipelines/kandinsky/text_encoder.py @@ -17,7 +17,7 @@ class MultilingualCLIP(PreTrainedModel): def __init__(self, config, *args, **kwargs): super().__init__(config, *args, **kwargs) - self.transformer = AutoModel.from_pretrained(config.modelBase, cache_dir=kwargs.get("cache_dir")) + self.transformer 
= XLMRobertaModel(config) self.LinearTransformation = torch.nn.Linear( in_features=config.transformerDimensions, out_features=config.numDims ) From 8eea5a095fcb9664c97f5b93fddea7f160ae7ab5 Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Tue, 16 May 2023 12:09:35 -1000 Subject: [PATCH 067/182] Update src/diffusers/pipelines/kandinsky/text_encoder.py Co-authored-by: Patrick von Platen --- src/diffusers/pipelines/kandinsky/text_encoder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/kandinsky/text_encoder.py b/src/diffusers/pipelines/kandinsky/text_encoder.py index aaf1ff3e80f6..de2d3620e02a 100644 --- a/src/diffusers/pipelines/kandinsky/text_encoder.py +++ b/src/diffusers/pipelines/kandinsky/text_encoder.py @@ -2,7 +2,7 @@ from transformers import AutoModel, PretrainedConfig, PreTrainedModel -class MCLIPConfig(PretrainedConfig): +class MCLIPConfig(XLMRobertaConfig): model_type = "M-CLIP" def __init__(self, modelBase="xlm-roberta-large", transformerDimSize=1024, imageDimSize=768, **kwargs): From c5c2135c986868763d5e322c71dac0af270edb1d Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Tue, 16 May 2023 12:09:45 -1000 Subject: [PATCH 068/182] Update src/diffusers/pipelines/kandinsky/text_encoder.py Co-authored-by: Patrick von Platen --- src/diffusers/pipelines/kandinsky/text_encoder.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/diffusers/pipelines/kandinsky/text_encoder.py b/src/diffusers/pipelines/kandinsky/text_encoder.py index de2d3620e02a..d32d7ad12f81 100644 --- a/src/diffusers/pipelines/kandinsky/text_encoder.py +++ b/src/diffusers/pipelines/kandinsky/text_encoder.py @@ -8,7 +8,6 @@ class MCLIPConfig(XLMRobertaConfig): def __init__(self, modelBase="xlm-roberta-large", transformerDimSize=1024, imageDimSize=768, **kwargs): self.transformerDimensions = transformerDimSize self.numDims = imageDimSize - self.modelBase = modelBase super().__init__(**kwargs) From e26113970093980c0894d41a5bcba2f50d5ea1a1 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 17 May 2023 00:38:39 +0000 Subject: [PATCH 069/182] fix --- src/diffusers/pipelines/kandinsky/text_encoder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/text_encoder.py b/src/diffusers/pipelines/kandinsky/text_encoder.py index d32d7ad12f81..a40aaa1f5b37 100644 --- a/src/diffusers/pipelines/kandinsky/text_encoder.py +++ b/src/diffusers/pipelines/kandinsky/text_encoder.py @@ -1,11 +1,11 @@ import torch -from transformers import AutoModel, PretrainedConfig, PreTrainedModel +from transformers import XLMRobertaModel, XLMRobertaConfig, PreTrainedModel class MCLIPConfig(XLMRobertaConfig): model_type = "M-CLIP" - def __init__(self, modelBase="xlm-roberta-large", transformerDimSize=1024, imageDimSize=768, **kwargs): + def __init__(self,transformerDimSize=1024, imageDimSize=768, **kwargs): self.transformerDimensions = transformerDimSize self.numDims = imageDimSize super().__init__(**kwargs) From ff0fe4b9d7852e4fb1c1eae4d5f795f213e11c4d Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 17 May 2023 00:40:03 +0000 Subject: [PATCH 070/182] make style --- .../kandinsky/pipeline_kandinsky_prior.py | 13 +++--- .../pipelines/kandinsky/text_encoder.py | 4 +- tests/pipelines/kandinsky/test_kandinsky.py | 13 +++--- .../kandinsky/test_kandinsky_inpaint.py | 23 ++++++----- .../kandinsky/test_kandinsky_prior.py | 40 +++++++------------ 5 files changed, 45 insertions(+), 48 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py 
b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py index 84df5e30349c..022497676913 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py @@ -23,7 +23,6 @@ from ...schedulers import UnCLIPScheduler from ...utils import ( is_accelerate_available, - is_accelerate_version, logging, randn_tensor, ) @@ -83,7 +82,9 @@ def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): return latents def create_zero_img_emb(self, batch_size, device): - zero_img = torch.zeros(1, 3, self.image_encoder.config.image_size, self.image_encoder.config.image_size).to(device=device, dtype=self.image_encoder.dtype) + zero_img = torch.zeros(1, 3, self.image_encoder.config.image_size, self.image_encoder.config.image_size).to( + device=device, dtype=self.image_encoder.dtype + ) zero_image_emb = self.image_encoder(zero_img)["image_embeds"] zero_image_emb = zero_image_emb.repeat(batch_size, 1) return zero_image_emb @@ -234,7 +235,7 @@ def __call__( latents: Optional[torch.FloatTensor] = None, negative_prompt: Optional[Union[str, List[str]]] = None, guidance_scale: float = 4.0, - output_type: Optional[str] = "pt", # pt only + output_type: Optional[str] = "pt", # pt only return_dict: bool = True, ): if isinstance(prompt, str): @@ -309,13 +310,13 @@ def __call__( image_embeddings = latents - # YiYi's notes: + # YiYi's notes: ## Prior Pipeline should always return a tensor that can be used in text2img/img2img/inpainting pipelines ## However need np type for testing purpose - if output_type == 'np': + if output_type == "np": image_embeddings = image_embeddings.cpu().numpy() if not return_dict: - return (image_embeddings, ) + return (image_embeddings,) return ImagePipelineOutput(images=image_embeddings) diff --git a/src/diffusers/pipelines/kandinsky/text_encoder.py b/src/diffusers/pipelines/kandinsky/text_encoder.py index a40aaa1f5b37..516abca45354 100644 --- a/src/diffusers/pipelines/kandinsky/text_encoder.py +++ b/src/diffusers/pipelines/kandinsky/text_encoder.py @@ -1,11 +1,11 @@ import torch -from transformers import XLMRobertaModel, XLMRobertaConfig, PreTrainedModel +from transformers import PreTrainedModel, XLMRobertaConfig, XLMRobertaModel class MCLIPConfig(XLMRobertaConfig): model_type = "M-CLIP" - def __init__(self,transformerDimSize=1024, imageDimSize=768, **kwargs): + def __init__(self, transformerDimSize=1024, imageDimSize=768, **kwargs): self.transformerDimensions = transformerDimSize self.numDims = imageDimSize super().__init__(**kwargs) diff --git a/tests/pipelines/kandinsky/test_kandinsky.py b/tests/pipelines/kandinsky/test_kandinsky.py index 7ba229428bd7..fd24eacec344 100644 --- a/tests/pipelines/kandinsky/test_kandinsky.py +++ b/tests/pipelines/kandinsky/test_kandinsky.py @@ -24,8 +24,8 @@ from diffusers import KandinskyPipeline, KandinskyPriorPipeline, UnCLIPScheduler, UNet2DConditionModel, VQModel from diffusers.pipelines.kandinsky.text_encoder import MultilingualCLIP from diffusers.pipelines.kandinsky.text_proj import KandinskyTextProjModel -from diffusers.utils import floats_tensor, slow, nightly, load_numpy, torch_device -from diffusers.utils.testing_utils import require_torch_gpu, skip_mps +from diffusers.utils import floats_tensor, load_numpy, slow, torch_device +from diffusers.utils.testing_utils import require_torch_gpu from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference @@ -270,10 +270,13 @@ def test_kandinsky_text2img(self): pipeline = 
pipeline.to(torch_device) pipeline.set_progress_bar_config(disable=None) - prompt= "red cat, 4k photo" + prompt = "red cat, 4k photo" generator = torch.Generator(device="cpu").manual_seed(0) - image_emb = pipe_prior(prompt, generator=generator,).images + image_emb = pipe_prior( + prompt, + generator=generator, + ).images zero_image_emb = pipe_prior("").images output = pipeline( @@ -289,4 +292,4 @@ def test_kandinsky_text2img(self): assert image.shape == (512, 512, 3) - assert_mean_pixel_difference(image, expected_image) \ No newline at end of file + assert_mean_pixel_difference(image, expected_image) diff --git a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py index 5b6a8defbb21..b90a3066d91e 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py +++ b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py @@ -25,8 +25,8 @@ from diffusers import KandinskyInpaintPipeline, KandinskyPriorPipeline, UnCLIPScheduler, UNet2DConditionModel, VQModel from diffusers.pipelines.kandinsky.text_encoder import MultilingualCLIP from diffusers.pipelines.kandinsky.text_proj import KandinskyTextProjModel -from diffusers.utils import floats_tensor, load_image, load_numpy, nightly, slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu, skip_mps +from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device +from diffusers.utils.testing_utils import require_torch_gpu from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference @@ -262,6 +262,7 @@ def test_kandinsky_inpaint(self): assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 + @slow @require_torch_gpu class KandinskyInpaintPipelineIntegrationTests(unittest.TestCase): @@ -278,13 +279,12 @@ def test_kandinsky_inpaint(self): ) init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/kandinsky/cat.png" + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "/kandinsky/cat.png" ) mask = np.ones((768, 768), dtype=np.float32) - mask[:250,250:-250] = 0 - - prompt="a hat" + mask[:250, 250:-250] = 0 + + prompt = "a hat" pipe_prior = KandinskyPriorPipeline.from_pretrained("YiYiXu/Kandinsky-prior", torch_dtype=torch.float16) pipe_prior.to(torch_device) @@ -294,9 +294,12 @@ def test_kandinsky_inpaint(self): pipeline.set_progress_bar_config(disable=None) generator = torch.Generator(device="cpu").manual_seed(0) - image_emb = pipe_prior(prompt, generator=generator,).images + image_emb = pipe_prior( + prompt, + generator=generator, + ).images zero_image_emb = pipe_prior("").images - + output = pipeline( prompt, image=init_image, @@ -314,4 +317,4 @@ def test_kandinsky_inpaint(self): assert image.shape == (768, 768, 3) - assert_mean_pixel_difference(image, expected_image) \ No newline at end of file + assert_mean_pixel_difference(image, expected_image) diff --git a/tests/pipelines/kandinsky/test_kandinsky_prior.py b/tests/pipelines/kandinsky/test_kandinsky_prior.py index cfbf76f9f2bb..3eabd2b0f0d1 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_prior.py +++ b/tests/pipelines/kandinsky/test_kandinsky_prior.py @@ -13,20 +13,24 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import gc import unittest import numpy as np import torch from torch import nn -from transformers import CLIPTextConfig, CLIPVisionConfig, CLIPTokenizer, CLIPTextModelWithProjection, CLIPVisionModelWithProjection +from transformers import ( + CLIPTextConfig, + CLIPTextModelWithProjection, + CLIPTokenizer, + CLIPVisionConfig, + CLIPVisionModelWithProjection, +) -from diffusers import PriorTransformer, KandinskyPriorPipeline, UnCLIPScheduler, UNet2DConditionModel -from diffusers.utils import load_numpy, nightly, slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu, skip_mps +from diffusers import KandinskyPriorPipeline, PriorTransformer, UnCLIPScheduler +from diffusers.utils import torch_device +from diffusers.utils.testing_utils import skip_mps -from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference +from ..test_pipelines_common import PipelineTesterMixin torch.backends.cuda.matmul.allow_tf32 = False @@ -36,10 +40,7 @@ class KandinskyPriorPipelineFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = KandinskyPriorPipeline params = ["prompt"] - batch_params = [ - "prompt", - "negative_prompt" - ] + batch_params = ["prompt", "negative_prompt"] required_optional_params = [ "num_images_per_prompt", "generator", @@ -48,7 +49,7 @@ class KandinskyPriorPipelineFastTests(PipelineTesterMixin, unittest.TestCase): "negative_prompt", "guidance_scale", "output_type", - "return_dict" + "return_dict", ] test_xformers_attention = False @@ -188,18 +189,7 @@ def test_kandinsky_prior(self): assert image.shape == (1, 32) expected_slice = np.array( - [ - -0.0532, - 1.7120, - 0.3656, - -1.0852, - -0.8946, - -1.1756, - 0.4348, - 0.2482, - 0.5146, - -0.1156 - ] + [-0.0532, 1.7120, 0.3656, -1.0852, -0.8946, -1.1756, 0.4348, 0.2482, 0.5146, -0.1156] ) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 @@ -225,4 +215,4 @@ def test_attention_slicing_forward_pass(self): self._test_attention_slicing_forward_pass( test_max_difference=test_max_difference, test_mean_pixel_difference=test_mean_pixel_difference, - ) \ No newline at end of file + ) From 3c3e6796b134337cc48f3dde16fb66ea82db61f5 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 17 May 2023 07:49:06 +0000 Subject: [PATCH 071/182] fix tokenizer max_length + fix fast tests for text2img --- .../pipelines/kandinsky/pipeline_kandinsky.py | 4 +- .../kandinsky/pipeline_kandinsky_inpaint.py | 4 +- tests/pipelines/kandinsky/test_kandinsky.py | 42 +++++++++---------- 3 files changed, 23 insertions(+), 27 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py index 3ff4c1287cd7..a76a5256df25 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py @@ -113,8 +113,8 @@ def _encode_prompt( text_inputs = self.tokenizer( prompt, padding="max_length", - max_length=self.tokenizer.model_max_length, truncation=True, + max_length=77, return_attention_mask=True, add_special_tokens=True, return_tensors="pt", @@ -164,7 +164,7 @@ def _encode_prompt( uncond_input = self.tokenizer( uncond_tokens, padding="max_length", - max_length=self.tokenizer.model_max_length, + max_length=77, truncation=True, return_attention_mask=True, add_special_tokens=True, diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py 
b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py index da7646edfc75..3029313db729 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py @@ -254,7 +254,7 @@ def _encode_prompt( text_inputs = self.tokenizer( prompt, padding="max_length", - max_length=self.tokenizer.model_max_length, + max_length=77, truncation=True, return_attention_mask=True, add_special_tokens=True, @@ -305,7 +305,7 @@ def _encode_prompt( uncond_input = self.tokenizer( uncond_tokens, padding="max_length", - max_length=self.tokenizer.model_max_length, + max_length=77, truncation=True, return_attention_mask=True, add_special_tokens=True, diff --git a/tests/pipelines/kandinsky/test_kandinsky.py b/tests/pipelines/kandinsky/test_kandinsky.py index fd24eacec344..580748183efd 100644 --- a/tests/pipelines/kandinsky/test_kandinsky.py +++ b/tests/pipelines/kandinsky/test_kandinsky.py @@ -19,10 +19,10 @@ import numpy as np import torch -from transformers import PretrainedConfig, XLMRobertaTokenizerFast +from transformers import XLMRobertaTokenizer from diffusers import KandinskyPipeline, KandinskyPriorPipeline, UnCLIPScheduler, UNet2DConditionModel, VQModel -from diffusers.pipelines.kandinsky.text_encoder import MultilingualCLIP +from diffusers.pipelines.kandinsky.text_encoder import MCLIPConfig, MultilingualCLIP from diffusers.pipelines.kandinsky.text_proj import KandinskyTextProjModel from diffusers.utils import floats_tensor, load_numpy, slow, torch_device from diffusers.utils.testing_utils import require_torch_gpu @@ -60,7 +60,7 @@ class KandinskyPipelineFastTests(PipelineTesterMixin, unittest.TestCase): @property def text_embedder_hidden_size(self): - return 1024 + return 32 @property def time_input_dim(self): @@ -81,27 +81,26 @@ def cross_attention_dim(self): # YiYi's TO-DO: add a tiny tokenizer? 
@property def dummy_tokenizer(self): - tokenizer = XLMRobertaTokenizerFast.from_pretrained("YiYiXu/Kandinsky", subfolder="tokenizer") + tokenizer = XLMRobertaTokenizer.from_pretrained("YiYiXu/Kandinsky", subfolder="tokenizer") return tokenizer - # @property - # def dummy_text_encoder(self): - # torch.manual_seed(0) - # config = PretrainedConfig( - # modelBase="YiYiXu/tiny-random-mclip-base", - # numDims=100, - # transformerDimensions=32) - - # return MultilingualCLIP(config) - @property def dummy_text_encoder(self): torch.manual_seed(0) - config = PretrainedConfig( - modelBase="xlm-roberta-large", numDims=self.cross_attention_dim, transformerDimensions=1024 + config = MCLIPConfig( + numDims=self.cross_attention_dim, + transformerDimensions=self.text_embedder_hidden_size, + hidden_size=self.text_embedder_hidden_size, + intermediate_size=37, + num_attention_heads=4, + num_hidden_layers=5, + vocab_size=250002, ) + + text_encoder = MultilingualCLIP(config) + text_encoder = text_encoder.eval() - return MultilingualCLIP(config) + return text_encoder @property def dummy_text_proj(self): @@ -194,12 +193,11 @@ def get_dummy_components(self): "scheduler": scheduler, "movq": movq, } - return components def get_dummy_inputs(self, device, seed=0): image_embeds = floats_tensor((1, self.cross_attention_dim), rng=random.Random(seed)).to(device) - floats_tensor((1, self.cross_attention_dim), rng=random.Random(seed + 1)).to(device) + negative_image_embeds = floats_tensor((1, self.cross_attention_dim), rng=random.Random(seed + 1)).to(device) if str(device).startswith("mps"): generator = torch.manual_seed(seed) else: @@ -207,7 +205,7 @@ def get_dummy_inputs(self, device, seed=0): inputs = { "prompt": "horse", "image_embeds": image_embeds, - "negative_image_embeds": image_embeds, + "negative_image_embeds": negative_image_embeds, "generator": generator, "height": 64, "width": 64, @@ -237,12 +235,10 @@ def test_kandinsky(self): image_slice = image[0, -3:, -3:, -1] image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - print(f"image.shape {image.shape}") - assert image.shape == (1, 64, 64, 3) expected_slice = np.array( - [0.5208529, 0.4821977, 0.44796965, 0.5479469, 0.54242486, 0.45028442, 0.42460358, 0.46456948, 0.48675597] + [0.50759643, 0.50876284, 0.4554392, 0.5594512, 0.53785735, 0.44757918, 0.4388101, 0.46746832, 0.4886209] ) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 From 3e8d31aa613cf47e2925e094039ee19b4637db97 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 17 May 2023 07:58:42 +0000 Subject: [PATCH 072/182] fix inpaint fast test --- .../kandinsky/test_kandinsky_inpaint.py | 40 +++++++++---------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py index b90a3066d91e..35d5cacf8756 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py +++ b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py @@ -20,10 +20,10 @@ import numpy as np import torch from PIL import Image -from transformers import PretrainedConfig, XLMRobertaTokenizerFast +from transformers import XLMRobertaTokenizer from diffusers import KandinskyInpaintPipeline, KandinskyPriorPipeline, UnCLIPScheduler, UNet2DConditionModel, VQModel -from diffusers.pipelines.kandinsky.text_encoder import MultilingualCLIP +from diffusers.pipelines.kandinsky.text_encoder import MCLIPConfig, MultilingualCLIP from diffusers.pipelines.kandinsky.text_proj import KandinskyTextProjModel from diffusers.utils 
import floats_tensor, load_image, load_numpy, slow, torch_device from diffusers.utils.testing_utils import require_torch_gpu @@ -64,7 +64,7 @@ class KandinskyInpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase): @property def text_embedder_hidden_size(self): - return 1024 + return 32 @property def time_input_dim(self): @@ -82,30 +82,28 @@ def time_embed_dim(self): def cross_attention_dim(self): return 100 - # YiYi's TO-DO: add a tiny tokenizer? @property def dummy_tokenizer(self): - tokenizer = XLMRobertaTokenizerFast.from_pretrained("YiYiXu/Kandinsky", subfolder="tokenizer") + tokenizer = XLMRobertaTokenizer.from_pretrained("YiYiXu/Kandinsky", subfolder="tokenizer") return tokenizer - # @property - # def dummy_text_encoder(self): - # torch.manual_seed(0) - # config = PretrainedConfig( - # modelBase="YiYiXu/tiny-random-mclip-base", - # numDims=100, - # transformerDimensions=32) - - # return MultilingualCLIP(config) - @property def dummy_text_encoder(self): torch.manual_seed(0) - config = PretrainedConfig( - modelBase="xlm-roberta-large", numDims=self.cross_attention_dim, transformerDimensions=1024 + config = MCLIPConfig( + numDims=self.cross_attention_dim, + transformerDimensions=self.text_embedder_hidden_size, + hidden_size=self.text_embedder_hidden_size, + intermediate_size=37, + num_attention_heads=4, + num_hidden_layers=5, + vocab_size=250002, ) + + text_encoder = MultilingualCLIP(config) + text_encoder = text_encoder.eval() - return MultilingualCLIP(config) + return text_encoder @property def dummy_text_proj(self): @@ -203,7 +201,7 @@ def get_dummy_components(self): def get_dummy_inputs(self, device, seed=0): image_embeds = floats_tensor((1, self.cross_attention_dim), rng=random.Random(seed)).to(device) - floats_tensor((1, self.cross_attention_dim), rng=random.Random(seed + 1)).to(device) + negative_image_embeds = floats_tensor((1, self.cross_attention_dim), rng=random.Random(seed + 1)).to(device) # create init_image image = floats_tensor((1, 3, 64, 64), rng=random.Random(seed)).to(device) image = image.cpu().permute(0, 2, 3, 1)[0] @@ -221,7 +219,7 @@ def get_dummy_inputs(self, device, seed=0): "image": init_image, "mask_image": mask, "image_embeds": image_embeds, - "negative_image_embeds": image_embeds, + "negative_image_embeds": negative_image_embeds, "generator": generator, "height": 64, "width": 64, @@ -256,7 +254,7 @@ def test_kandinsky_inpaint(self): assert image.shape == (1, 64, 64, 3) expected_slice = np.array( - [0.52034867, 0.4924194, 0.44671825, 0.5747229, 0.574834, 0.45885202, 0.41398984, 0.4793774, 0.50443137] + [0.5069735, 0.5303574, 0.47324282, 0.57705986, 0.57984686, 0.44895405, 0.42856842, 0.4831331, 0.5052104 ] ) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 From 6557f287818eda04872826d89199288cba9aef59 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 17 May 2023 07:59:37 +0000 Subject: [PATCH 073/182] make style --- tests/pipelines/kandinsky/test_kandinsky.py | 8 ++++---- tests/pipelines/kandinsky/test_kandinsky_inpaint.py | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/pipelines/kandinsky/test_kandinsky.py b/tests/pipelines/kandinsky/test_kandinsky.py index 580748183efd..73f667b36b85 100644 --- a/tests/pipelines/kandinsky/test_kandinsky.py +++ b/tests/pipelines/kandinsky/test_kandinsky.py @@ -88,7 +88,7 @@ def dummy_tokenizer(self): def dummy_text_encoder(self): torch.manual_seed(0) config = MCLIPConfig( - numDims=self.cross_attention_dim, + numDims=self.cross_attention_dim, 
transformerDimensions=self.text_embedder_hidden_size, hidden_size=self.text_embedder_hidden_size, intermediate_size=37, @@ -96,11 +96,11 @@ def dummy_text_encoder(self): num_hidden_layers=5, vocab_size=250002, ) - + text_encoder = MultilingualCLIP(config) text_encoder = text_encoder.eval() - return text_encoder + return text_encoder @property def dummy_text_proj(self): @@ -238,7 +238,7 @@ def test_kandinsky(self): assert image.shape == (1, 64, 64, 3) expected_slice = np.array( - [0.50759643, 0.50876284, 0.4554392, 0.5594512, 0.53785735, 0.44757918, 0.4388101, 0.46746832, 0.4886209] + [0.50759643, 0.50876284, 0.4554392, 0.5594512, 0.53785735, 0.44757918, 0.4388101, 0.46746832, 0.4886209] ) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 diff --git a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py index 35d5cacf8756..923b17d7657a 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py +++ b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py @@ -91,7 +91,7 @@ def dummy_tokenizer(self): def dummy_text_encoder(self): torch.manual_seed(0) config = MCLIPConfig( - numDims=self.cross_attention_dim, + numDims=self.cross_attention_dim, transformerDimensions=self.text_embedder_hidden_size, hidden_size=self.text_embedder_hidden_size, intermediate_size=37, @@ -99,11 +99,11 @@ def dummy_text_encoder(self): num_hidden_layers=5, vocab_size=250002, ) - + text_encoder = MultilingualCLIP(config) text_encoder = text_encoder.eval() - return text_encoder + return text_encoder @property def dummy_text_proj(self): @@ -254,7 +254,7 @@ def test_kandinsky_inpaint(self): assert image.shape == (1, 64, 64, 3) expected_slice = np.array( - [0.5069735, 0.5303574, 0.47324282, 0.57705986, 0.57984686, 0.44895405, 0.42856842, 0.4831331, 0.5052104 ] + [0.5069735, 0.5303574, 0.47324282, 0.57705986, 0.57984686, 0.44895405, 0.42856842, 0.4831331, 0.5052104] ) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 From da35ddfc8e01c7a4d071a1cfae46eb4c0818b199 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 17 May 2023 08:23:24 +0000 Subject: [PATCH 074/182] fix --- .../pipelines/kandinsky/pipeline_kandinsky.py | 6 +++--- .../pipelines/kandinsky/pipeline_kandinsky_inpaint.py | 10 ++++------ 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py index a76a5256df25..e7f3bb407d68 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py @@ -16,7 +16,7 @@ import torch from transformers import ( - XLMRobertaTokenizerFast, + XLMRobertaTokenizer, ) from ...models import UNet2DConditionModel, VQModel @@ -56,7 +56,7 @@ class KandinskyPipeline(DiffusionPipeline): Args: text_encoder ([`MultilingualCLIP`]): Frozen text-encoder. - tokenizer ([`XLMRobertaTokenizerFast`]): + tokenizer ([`XLMRobertaTokenizer`]): Tokenizer of class scheduler ([`UnCLIPScheduler`]): A scheduler to be used in combination with `unet` to generate image latents. 
@@ -71,7 +71,7 @@ class KandinskyPipeline(DiffusionPipeline): def __init__( self, text_encoder: MultilingualCLIP, - tokenizer: XLMRobertaTokenizerFast, + tokenizer: XLMRobertaTokenizer, text_proj: KandinskyTextProjModel, unet: UNet2DConditionModel, scheduler: UnCLIPScheduler, diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py index 3029313db729..efdb9bf794b8 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py @@ -21,7 +21,7 @@ import torch.nn.functional as F from PIL import Image from transformers import ( - XLMRobertaTokenizerFast, + XLMRobertaTokenizer, ) from ...models import UNet2DConditionModel, VQModel @@ -197,7 +197,7 @@ class KandinskyInpaintPipeline(DiffusionPipeline): Args: text_encoder ([`MultilingualCLIP`]): Frozen text-encoder. - tokenizer ([`XLMRobertaTokenizerFast`]): + tokenizer ([`XLMRobertaTokenizer`]): Tokenizer of class scheduler ([`UnCLIPScheduler`]): A scheduler to be used in combination with `unet` to generate image latents. @@ -213,7 +213,7 @@ def __init__( self, text_encoder: MultilingualCLIP, movq: VQModel, - tokenizer: XLMRobertaTokenizerFast, + tokenizer: XLMRobertaTokenizer, text_proj: KandinskyTextProjModel, unet: UNet2DConditionModel, scheduler: UnCLIPScheduler, @@ -498,9 +498,7 @@ def __call__( self.scheduler.set_timesteps(num_inference_steps, device=device) timesteps_tensor = self.scheduler.timesteps - # YiYi's TO-DO: hard-code to be 4, need to set it to be the z_channels in MoVQ encoder's config once it's added - num_channels_latents = 4 - # num_channels_latents = self.movq.config.z_channels + num_channels_latents = self.movq.config.latent_channels # get h, w for latents sample_height, sample_width = get_new_h_w(height, width, self.movq_scale_factor) From f05c3ac67576de7efdd05f44b5e597f9e864fdbd Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 17 May 2023 08:43:09 +0000 Subject: [PATCH 075/182] add doc --- docs/source/en/_toctree.yml | 2 ++ docs/source/en/api/pipelines/kandinsky.mdx | 40 ++++++++++++++++++++++ 2 files changed, 42 insertions(+) create mode 100644 docs/source/en/api/pipelines/kandinsky.mdx diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 246b467d8b04..24e11b44ac4d 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -222,6 +222,8 @@ title: Text-to-Video Zero - local: api/pipelines/unclip title: UnCLIP + - local: api/pipelines/kandinsky + title: Kandinsky - local: api/pipelines/latent_diffusion_uncond title: Unconditional Latent Diffusion - local: api/pipelines/versatile_diffusion diff --git a/docs/source/en/api/pipelines/kandinsky.mdx b/docs/source/en/api/pipelines/kandinsky.mdx new file mode 100644 index 000000000000..a49212dfd733 --- /dev/null +++ b/docs/source/en/api/pipelines/kandinsky.mdx @@ -0,0 +1,40 @@ + + +# Kandinsky + +## Overview + +Kandinsky 2.1 inherits best practicies from Dall-E 2 and Latent diffusion, while introducing some new ideas. + +As text and image encoder it uses CLIP model and diffusion image prior (mapping) between latent spaces of CLIP modalities. This approach increases the visual performance of the model and unveils new horizons in blending images and text-guided image manipulation. 
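+
+In `diffusers` this translates into a two-stage call: a prior pipeline that maps the prompt to CLIP image embeddings, followed by a decoder pipeline that turns those embeddings into an image. A minimal sketch of that flow (the checkpoint ids below are the interim repos referenced in the tests and may change; the prompt and resolution are illustrative):
+
+```python
+import torch
+
+from diffusers import KandinskyPipeline, KandinskyPriorPipeline
+
+# first stage: map the text prompt to a CLIP image embedding with the diffusion prior
+pipe_prior = KandinskyPriorPipeline.from_pretrained("YiYiXu/Kandinsky-prior", torch_dtype=torch.float16)
+pipe_prior.to("cuda")
+
+prompt = "red cat, 4k photo"
+generator = torch.Generator(device="cpu").manual_seed(0)
+image_emb = pipe_prior(prompt, generator=generator).images
+zero_image_emb = pipe_prior("").images  # unconditional embedding for classifier-free guidance
+
+# second stage: decode the image embedding into pixels
+pipe = KandinskyPipeline.from_pretrained("YiYiXu/Kandinsky", torch_dtype=torch.float16)
+pipe.to("cuda")
+
+image = pipe(
+    prompt,
+    image_embeds=image_emb,
+    negative_image_embeds=zero_image_emb,
+    height=768,
+    width=768,
+    num_inference_steps=100,
+    generator=generator,
+).images[0]
+image.save("cat.png")
+```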
+ +The Kandinsky model in diffusers comes from ai-forever and the original codebase can be found [here](https://github.com/ai-forever/Kandinsky-2) + +## Available Pipelines: + +| Pipeline | Tasks | Colab +|---|---|:---:| +| [pipeline_kandinsky.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py) | *Text-to-Image Generation* | - | +| [pipeline_kandinsky_inpaint.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py) | *Image-Guided Image Generation* | - | + + +## KandinskyPipeline + +[[autodoc]] KandinskyPipeline + - all + - __call__ + +[[autodoc]] KandinskyInpaintPipeline + - all + - __call__ + +[[autodoc]] KandinskyPriorPipeline + - all + - __call__ From 89319f83593505344de98787c17df748ddb417b8 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 17 May 2023 08:45:07 +0000 Subject: [PATCH 076/182] doc --- docs/source/en/_toctree.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 24e11b44ac4d..c9270c83c9ae 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -160,6 +160,8 @@ title: DiT - local: api/pipelines/if title: IF + - local: api/pipelines/kandinsky + title: Kandinsky - local: api/pipelines/latent_diffusion title: Latent Diffusion - local: api/pipelines/paint_by_example @@ -222,8 +224,6 @@ title: Text-to-Video Zero - local: api/pipelines/unclip title: UnCLIP - - local: api/pipelines/kandinsky - title: Kandinsky - local: api/pipelines/latent_diffusion_uncond title: Unconditional Latent Diffusion - local: api/pipelines/versatile_diffusion From 88efed523a78c85312fc2397b1130ba56b37dcd9 Mon Sep 17 00:00:00 2001 From: Ayush Mangal <43698245+ayushtues@users.noreply.github.com> Date: Wed, 17 May 2023 22:58:55 +0530 Subject: [PATCH 077/182] [WIP] Add img2img (#3426) * Add img2img --------- Co-authored-by: ayushmangal --- src/diffusers/__init__.py | 1 + src/diffusers/pipelines/__init__.py | 2 +- src/diffusers/pipelines/kandinsky/__init__.py | 1 + .../pipelines/kandinsky/pipeline_kandinsky.py | 8 +- .../kandinsky/pipeline_kandinsky_img2img.py | 420 ++++++++++++++++++ .../dummy_torch_and_transformers_objects.py | 17 + .../kandinsky/test_kandinsky_img2img.py | 308 +++++++++++++ 7 files changed, 751 insertions(+), 6 deletions(-) create mode 100644 src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py create mode 100644 tests/pipelines/kandinsky/test_kandinsky_img2img.py diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 53e70a96928e..d2f1afd29f08 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -129,6 +129,7 @@ IFPipeline, IFSuperResolutionPipeline, KandinskyInpaintPipeline, + KandinskyImg2ImgPipeline, KandinskyPipeline, KandinskyPriorPipeline, LDMTextToImagePipeline, diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index ba906e22300d..03a4d9c7d371 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -52,7 +52,7 @@ IFPipeline, IFSuperResolutionPipeline, ) - from .kandinsky import KandinskyInpaintPipeline, KandinskyPipeline, KandinskyPriorPipeline + from .kandinsky import KandinskyInpaintPipeline, KandinskyPipeline, KandinskyPriorPipeline, KandinskyImg2ImgPipeline from .latent_diffusion import LDMTextToImagePipeline from .paint_by_example import PaintByExamplePipeline from .semantic_stable_diffusion import 
SemanticStableDiffusionPipeline diff --git a/src/diffusers/pipelines/kandinsky/__init__.py b/src/diffusers/pipelines/kandinsky/__init__.py index 0242d9ae5edf..199ef0a998a7 100644 --- a/src/diffusers/pipelines/kandinsky/__init__.py +++ b/src/diffusers/pipelines/kandinsky/__init__.py @@ -14,6 +14,7 @@ else: from .pipeline_kandinsky import KandinskyPipeline from .pipeline_kandinsky_inpaint import KandinskyInpaintPipeline + from .pipeline_kandinsky_img2img import KandinskyImg2ImgPipeline from .pipeline_kandinsky_prior import KandinskyPriorPipeline from .text_encoder import MultilingualCLIP from .text_proj import KandinskyTextProjModel diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py index e7f3bb407d68..73687f0b8ec4 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py @@ -22,7 +22,7 @@ from ...models import UNet2DConditionModel, VQModel from ...pipelines import DiffusionPipeline from ...pipelines.pipeline_utils import ImagePipelineOutput -from ...schedulers import UnCLIPScheduler +from ...schedulers import DDPMScheduler from ...utils import ( is_accelerate_available, is_accelerate_version, @@ -58,7 +58,7 @@ class KandinskyPipeline(DiffusionPipeline): Frozen text-encoder. tokenizer ([`XLMRobertaTokenizer`]): Tokenizer of class - scheduler ([`UnCLIPScheduler`]): + scheduler ([`DDPMScheduler`]): A scheduler to be used in combination with `unet` to generate image latents. unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the image embedding. @@ -74,7 +74,7 @@ def __init__( tokenizer: XLMRobertaTokenizer, text_proj: KandinskyTextProjModel, unet: UNet2DConditionModel, - scheduler: UnCLIPScheduler, + scheduler: DDPMScheduler, movq: VQModel, ): super().__init__() @@ -376,9 +376,7 @@ def __call__( noise_pred, t, latent_model_input, - prev_timestep=prev_timestep, generator=generator, - batch_size=batch_size, ).prev_sample _, latents = latents.chunk(2) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py new file mode 100644 index 000000000000..5147b9b6b4d3 --- /dev/null +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py @@ -0,0 +1,420 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
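+
+"""
+Rough usage sketch for the image-to-image pipeline defined in this module. The
+checkpoint ids and the init-image URL below are the interim placeholders used in
+the tests and may change; `strength` controls how much noise is added to the init
+image (higher values preserve less of the original):
+
+    import torch
+
+    from diffusers import KandinskyImg2ImgPipeline, KandinskyPriorPipeline
+    from diffusers.utils import load_image
+
+    pipe_prior = KandinskyPriorPipeline.from_pretrained(
+        "YiYiXu/Kandinsky-prior", torch_dtype=torch.float16
+    ).to("cuda")
+    pipe = KandinskyImg2ImgPipeline.from_pretrained(
+        "ayushtues/test-kandinsky-img2img", torch_dtype=torch.float16
+    ).to("cuda")
+
+    prompt = "A red cartoon frog, 4k"
+    image_emb = pipe_prior(prompt).images
+    zero_image_emb = pipe_prior("").images
+
+    init_image = load_image(
+        "https://user-images.githubusercontent.com/43698245/238191310-a99fe3cf-c2ee-417e-94f1-c1829a0ae0a3.png"
+    )
+
+    image = pipe(
+        prompt,
+        image=init_image,
+        image_embeds=image_emb,
+        negative_image_embeds=zero_image_emb,
+        height=768,
+        width=768,
+        num_inference_steps=100,
+        strength=0.2,
+    ).images[0]
+"""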
+ +from typing import List, Optional, Union + +import torch +from transformers import ( + XLMRobertaTokenizerFast, +) + +from ...models import UNet2DConditionModel, VQModel +from ...pipelines import DiffusionPipeline +from ...pipelines.pipeline_utils import ImagePipelineOutput +from ...schedulers import DDPMScheduler +from ...utils import ( + is_accelerate_available, + is_accelerate_version, + logging, + randn_tensor, +) +from .text_encoder import MultilingualCLIP +from .text_proj import KandinskyTextProjModel +import PIL +from PIL import Image +import numpy as np + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +def get_new_h_w(h, w, scale_factor=8): + new_h = h // scale_factor**2 + if h % scale_factor**2 != 0: + new_h += 1 + new_w = w // scale_factor**2 + if w % scale_factor**2 != 0: + new_w += 1 + return new_h * scale_factor, new_w * scale_factor + +def prepare_image(pil_image, w=512, h=512): + pil_image = pil_image.resize((w, h), resample=Image.BICUBIC, reducing_gap=1) + arr = np.array(pil_image.convert("RGB")) + arr = arr.astype(np.float32) / 127.5 - 1 + arr = np.transpose(arr, [2, 0, 1]) + image = torch.from_numpy(arr).unsqueeze(0) + return image + +class KandinskyImg2ImgPipeline(DiffusionPipeline): + """ + Pipeline for image-to-image generation using Kandinsky + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + text_encoder ([`MultilingualCLIP`]): + Frozen text-encoder. + tokenizer ([`XLMRobertaTokenizerFast`]): + Tokenizer of class + scheduler ([`DDPMScheduler`]): + A scheduler to be used in combination with `unet` to generate image latents. + unet ([`UNet2DConditionModel`]): + Conditional U-Net architecture to denoise the image embedding. + text_proj ([`KandinskyTextProjModel`]): + Utility class to prepare and combine the embeddings before they are passed to the decoder. 
+ """ + + def __init__( + self, + text_encoder: MultilingualCLIP, + tokenizer: XLMRobertaTokenizerFast, + text_proj: KandinskyTextProjModel, + unet: UNet2DConditionModel, + scheduler: DDPMScheduler, + movq: VQModel + ): + super().__init__() + + self.register_modules( + text_encoder=text_encoder, + tokenizer=tokenizer, + text_proj=text_proj, + unet=unet, + scheduler=scheduler, + movq=movq + ) + self.movq_scale_factor = 2 ** (len(self.movq.config.block_out_channels) - 1) + + def get_timesteps(self, num_inference_steps, strength, device): + # get the original timestep using init_timestep + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = self.scheduler.timesteps[t_start:] + + return timesteps, num_inference_steps - t_start + + def prepare_latents(self, latents, latent_timestep, shape, dtype, device, generator, scheduler): + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + if latents.shape != shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") + latents = latents.to(device) + + latents = latents * scheduler.init_noise_sigma + + shape = latents.shape + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + + # get latents + latents = self.scheduler.add_noise(latents, noise, latent_timestep) + return latents + + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + ): + batch_size = len(prompt) if isinstance(prompt, list) else 1 + # get prompt text embeddings + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_attention_mask=True, + add_special_tokens=True, + return_tensors="pt", + ) + + text_input_ids = text_inputs.input_ids.to(device) + text_mask = text_inputs.attention_mask.to(device) + + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length] + + prompt_embeds, text_encoder_hidden_states = self.text_encoder( + input_ids=text_input_ids, attention_mask=text_mask + ) + + prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0) + text_encoder_hidden_states = text_encoder_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + text_mask = text_mask.repeat_interleave(num_images_per_prompt, dim=0) + + if do_classifier_free_guidance: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. 
Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_attention_mask=True, + add_special_tokens=True, + return_tensors="pt", + ) + uncond_text_input_ids = uncond_input.input_ids.to(device) + uncond_text_mask = uncond_input.attention_mask.to(device) + + negative_prompt_embeds, uncond_text_encoder_hidden_states = self.text_encoder( + input_ids=uncond_text_input_ids, attention_mask=uncond_text_mask + ) + + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + + seq_len = negative_prompt_embeds.shape[1] + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len) + + seq_len = uncond_text_encoder_hidden_states.shape[1] + uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.repeat(1, num_images_per_prompt, 1) + uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.view( + batch_size * num_images_per_prompt, seq_len, -1 + ) + uncond_text_mask = uncond_text_mask.repeat_interleave(num_images_per_prompt, dim=0) + + # done duplicates + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + text_encoder_hidden_states = torch.cat([uncond_text_encoder_hidden_states, text_encoder_hidden_states]) + + text_mask = torch.cat([uncond_text_mask, text_mask]) + + return prompt_embeds, text_encoder_hidden_states, text_mask + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_sequential_cpu_offload + def enable_sequential_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet, + text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a + `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called. + Note that offloading happens on a submodule basis. Memory savings are higher than with + `enable_model_cpu_offload`, but performance is lower. + """ + if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"): + from accelerate import cpu_offload + else: + raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]: + cpu_offload(cpu_offloaded_model, device) + + if self.safety_checker is not None: + cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_model_cpu_offload + def enable_model_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. 
Compared + to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` + method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with + `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. + """ + if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): + from accelerate import cpu_offload_with_hook + else: + raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + hook = None + for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]: + _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) + + if self.safety_checker is not None: + _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) + + # We'll offload the last model manually. + self.final_offload_hook = hook + + @property + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device + def _execution_device(self): + r""" + Returns the device on which the pipeline's models will be executed. After calling + `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module + hooks. + """ + if not hasattr(self.unet, "_hf_hook"): + return self.device + for module in self.unet.modules(): + if ( + hasattr(module, "_hf_hook") + and hasattr(module._hf_hook, "execution_device") + and module._hf_hook.execution_device is not None + ): + return torch.device(module._hf_hook.execution_device) + return self.device + + @torch.no_grad() + def __call__( + self, + prompt: Union[str, List[str]], + image: Union[torch.FloatTensor, PIL.Image.Image] = None, + height: int = 512, + width: int = 512, + num_inference_steps: int = 100, + strength: float = 0.75, + guidance_scale: float = 4.0, + num_images_per_prompt: int = 1, + negative_prompt: Optional[Union[str, List[str]]] = None, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + image_embeds: Optional[torch.FloatTensor] = None, + negative_image_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + ): + if isinstance(prompt, str): + batch_size = 1 + elif isinstance(prompt, list): + batch_size = len(prompt) + else: + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + device = self._execution_device + + batch_size = batch_size * num_images_per_prompt + + do_classifier_free_guidance = guidance_scale > 1.0 + + prompt_embeds, text_encoder_hidden_states, _ = self._encode_prompt( + prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt + ) + + image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to(device) + + text_encoder_hidden_states, additive_clip_time_embeddings = self.text_proj( + image_embeddings=image_embeds, + prompt_embeds=prompt_embeds, + text_encoder_hidden_states=text_encoder_hidden_states, + ) + + image = prepare_image(image, width, height).to(device) + latents = self.movq.encode(image)["latents"] + + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps_tensor, num_inference_steps = 
self.get_timesteps(num_inference_steps, strength, device) + latent_timestep = timesteps_tensor[:1].repeat(batch_size * num_images_per_prompt) + + num_channels_latents = self.movq.config.latent_channels + + height, width = get_new_h_w(height, width, self.movq_scale_factor) + + # create initial latent + latents = self.prepare_latents( + latents, + latent_timestep, + (batch_size, num_channels_latents, height, width), + text_encoder_hidden_states.dtype, + device, + generator, + self.scheduler, + ) + + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps_tensor): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + + noise_pred = self.unet( + sample=latent_model_input, # [2, 4, 96, 96] + timestep=t, + encoder_hidden_states=text_encoder_hidden_states, + class_labels=additive_clip_time_embeddings, + ).sample + + # YiYi Notes: CFG is currently implemented exactly as original repo as a baseline, + # i.e. we apply cfg to predicted noise, and take predicted variance as it is (uncond + cond) + # this means the our latent shape is batch_size *2 instad batch_size + + if do_classifier_free_guidance: + noise_pred, variance_pred = noise_pred.split(latents.shape[1], dim=1) + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + variance_pred_uncond, variance_pred_text = variance_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = torch.cat([noise_pred] * 2) + variance_pred = torch.cat([variance_pred_uncond, variance_pred_text]) + noise_pred = torch.cat([noise_pred, variance_pred], dim=1) + + if i + 1 == timesteps_tensor.shape[0]: + prev_timestep = None + else: + prev_timestep = timesteps_tensor[i + 1] + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step( + noise_pred, + t, + latents, + generator=generator, + ).prev_sample + + _, latents = latents.chunk(2) + + # post-processing + image = self.movq.decode(latents, force_not_quantize=True)["sample"] + + image = image * 0.5 + 0.5 + image = image.clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + if output_type == "pil": + image = self.numpy_to_pil(image) + + if not return_dict: + return (image,) + + return ImagePipelineOutput(images=image) + diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py index cdd61dcf2ac5..1f8bd6a3d2cb 100644 --- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py +++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -152,6 +152,23 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "transformers"]) + +class KandinskyImg2ImgPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + + class KandinskyInpaintPipeline(metaclass=DummyObject): _backends = ["torch", "transformers"] diff --git a/tests/pipelines/kandinsky/test_kandinsky_img2img.py b/tests/pipelines/kandinsky/test_kandinsky_img2img.py new file mode 100644 index 000000000000..a93e0d3a32ea --- /dev/null +++ 
b/tests/pipelines/kandinsky/test_kandinsky_img2img.py @@ -0,0 +1,308 @@ +# coding=utf-8 +# Copyright 2023 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import gc +import random +import unittest + +import numpy as np +import torch +from PIL import Image +from transformers import XLMRobertaTokenizer + +from diffusers import KandinskyImg2ImgPipeline, KandinskyPriorPipeline, DDPMScheduler, UNet2DConditionModel, VQModel, KandinskyImg2ImgPipeline +from diffusers.pipelines.kandinsky.text_encoder import MCLIPConfig, MultilingualCLIP +from diffusers.pipelines.kandinsky.text_proj import KandinskyTextProjModel +from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device +from diffusers.utils.testing_utils import require_torch_gpu + +from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference + + +torch.backends.cuda.matmul.allow_tf32 = False +torch.use_deterministic_algorithms(True) + + +class KandinskyImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase): + pipeline_class = KandinskyImg2ImgPipeline + params = ["prompt", "image_embeds", "negative_image_embeds", "image"] + batch_params = [ + "prompt", + "negative_prompt", + "image_embeds", + "negative_image_embeds", + "image", + ] + required_optional_params = [ + "generator", + "height", + "width", + "strength", + "guidance_scale", + "negative_prompt", + "num_inference_steps", + "return_dict", + "guidance_scale", + "num_images_per_prompt", + "output_type", + "return_dict", + ] + test_xformers_attention = False + + @property + def text_embedder_hidden_size(self): + return 32 + + @property + def time_input_dim(self): + return 32 + + @property + def block_out_channels_0(self): + return self.time_input_dim + + @property + def time_embed_dim(self): + return self.time_input_dim * 4 + + @property + def cross_attention_dim(self): + return 100 + + @property + def dummy_tokenizer(self): + tokenizer = XLMRobertaTokenizer.from_pretrained("YiYiXu/Kandinsky", subfolder="tokenizer") + return tokenizer + + @property + def dummy_text_encoder(self): + torch.manual_seed(0) + config = MCLIPConfig( + numDims=self.cross_attention_dim, + transformerDimensions=self.text_embedder_hidden_size, + hidden_size=self.text_embedder_hidden_size, + intermediate_size=37, + num_attention_heads=4, + num_hidden_layers=5, + vocab_size=250002, + ) + + text_encoder = MultilingualCLIP(config) + text_encoder = text_encoder.eval() + + return text_encoder + + @property + def dummy_text_proj(self): + torch.manual_seed(0) + + model_kwargs = { + "clip_embeddings_dim": self.cross_attention_dim, + "time_embed_dim": self.time_embed_dim, + "clip_extra_context_tokens": 2, + "cross_attention_dim": self.cross_attention_dim, + "clip_text_encoder_hidden_states_dim": self.text_embedder_hidden_size, + } + + model = KandinskyTextProjModel(**model_kwargs) + return model + + @property + def dummy_unet(self): + torch.manual_seed(0) + + model_kwargs = { + "in_channels": 4, + # Out channels is double in channels because predicts mean 
and variance + "out_channels": 8, + "down_block_types": ("ResnetDownsampleBlock2D", "SimpleCrossAttnDownBlock2D"), + "up_block_types": ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"), + "mid_block_type": "UNetMidBlock2DSimpleCrossAttn", + "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2), + "layers_per_block": 1, + "cross_attention_dim": self.cross_attention_dim, + "attention_head_dim": 4, + "resnet_time_scale_shift": "scale_shift", + "class_embed_type": "identity", + } + + model = UNet2DConditionModel(**model_kwargs) + return model + + @property + def dummy_movq_kwargs(self): + return { + "block_out_channels": [32, 64], + "down_block_types": ["DownEncoderBlock2D", "AttnDownEncoderBlock2D"], + "in_channels": 3, + "latent_channels": 4, + "layers_per_block": 1, + "norm_num_groups": 8, + "norm_type": "spatial", + "num_vq_embeddings": 12, + "out_channels": 3, + "up_block_types": [ + "AttnUpDecoderBlock2D", + "UpDecoderBlock2D", + ], + "vq_embed_dim": 4, + } + + @property + def dummy_movq(self): + torch.manual_seed(0) + model = VQModel(**self.dummy_movq_kwargs) + return model + + def get_dummy_components(self): + text_encoder = self.dummy_text_encoder + tokenizer = self.dummy_tokenizer + unet = self.dummy_unet + text_proj = self.dummy_text_proj + movq = self.dummy_movq + + ddpm_config = { + "clip_sample": True, + "clip_sample_range": 2.0, + "sample_max_value": None, + "num_train_timesteps": 1000, + "prediction_type": "epsilon", + "variance_type": "learned_range", + "thresholding": True, + "beta_schedule": "linear", + "beta_start": 0.00085, + "beta_end":0.012 + } + + scheduler = DDPMScheduler(**ddpm_config) + + components = { + "text_proj": text_proj, + "text_encoder": text_encoder, + "tokenizer": tokenizer, + "unet": unet, + "scheduler": scheduler, + "movq": movq, + } + + return components + + def get_dummy_inputs(self, device, seed=0): + image_embeds = floats_tensor((1, self.cross_attention_dim), rng=random.Random(seed)).to(device) + negative_image_embeds = floats_tensor((1, self.cross_attention_dim), rng=random.Random(seed + 1)).to(device) + # create init_image + image = floats_tensor((1, 3, 64, 64), rng=random.Random(seed)).to(device) + image = image.cpu().permute(0, 2, 3, 1)[0] + init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((256, 256)) + + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) + inputs = { + "prompt": "horse", + "image": init_image, + "image_embeds": image_embeds, + "negative_image_embeds": negative_image_embeds, + "generator": generator, + "height": 64, + "width": 64, + "num_inference_steps": 2, + "output_type": "np", + } + return inputs + + def test_kandinsky_img2img(self): + device = "cpu" + + components = self.get_dummy_components() + + pipe = self.pipeline_class(**components) + pipe = pipe.to(device) + + pipe.set_progress_bar_config(disable=None) + + output = pipe(**self.get_dummy_inputs(device)) + image = output.images + + image_from_tuple = pipe( + **self.get_dummy_inputs(device), + return_dict=False, + )[0] + + image_slice = image[0, -3:, -3:, -1] + image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] + + print(f"image.shape {image.shape}") + + assert image.shape == (1, 64, 64, 3) + + expected_slice = np.array([0.6635241 , 0.6152489 , 0.5687914 , 0.57371366, 0.53458804, 0.47828954, 0.5454488 , 0.51518494, 0.49540082]) + + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + assert 
np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 + + +@slow +@require_torch_gpu +class KandinskyImg2ImgPipelineIntegrationTests(unittest.TestCase): + def tearDown(self): + # clean up the VRAM after each test + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def test_kandinsky_img2img(self): + expected_image = load_numpy( + "https://user-images.githubusercontent.com/43698245/238954026-6c3e3da6-ef18-4d78-b521-6386e6922444.png" + ) + + init_image = load_image( + "https://user-images.githubusercontent.com/43698245/238191310-a99fe3cf-c2ee-417e-94f1-c1829a0ae0a3.png" + ) + prompt = "A red cartoon frog, 4k" + + pipe_prior = KandinskyPriorPipeline.from_pretrained("YiYiXu/Kandinsky-prior", torch_dtype=torch.float16) + pipe_prior.to(torch_device) + + pipeline = KandinskyImg2ImgPipeline.from_pretrained("ayushtues/test-kandinsky-img2img", torch_dtype=torch.float16) + pipeline = pipeline.to(torch_device) + pipeline.set_progress_bar_config(disable=None) + + generator = torch.Generator(device="cpu").manual_seed(0) + image_emb = pipe_prior( + prompt, + generator=generator, + ).images + zero_image_emb = pipe_prior("").images + + output = pipeline( + prompt, + image=init_image, + image_embeds=image_emb, + negative_image_embeds=zero_image_emb, + generator=generator, + num_inference_steps=100, + height=768, + width=768, + strength=0.2, + output_type="np", + ) + + image = output.images[0] + + assert image.shape == (768, 768, 3) + + assert_mean_pixel_difference(image, expected_image) From a05cc4fa218e7e63eca87300418a036866c933d8 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 17 May 2023 17:30:18 +0000 Subject: [PATCH 078/182] Revert "[WIP] Add img2img (#3426)" This reverts commit 88efed523a78c85312fc2397b1130ba56b37dcd9. --- src/diffusers/__init__.py | 1 - src/diffusers/pipelines/__init__.py | 2 +- src/diffusers/pipelines/kandinsky/__init__.py | 1 - .../pipelines/kandinsky/pipeline_kandinsky.py | 8 +- .../kandinsky/pipeline_kandinsky_img2img.py | 420 ------------------ .../dummy_torch_and_transformers_objects.py | 17 - .../kandinsky/test_kandinsky_img2img.py | 308 ------------- 7 files changed, 6 insertions(+), 751 deletions(-) delete mode 100644 src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py delete mode 100644 tests/pipelines/kandinsky/test_kandinsky_img2img.py diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index d2f1afd29f08..53e70a96928e 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -129,7 +129,6 @@ IFPipeline, IFSuperResolutionPipeline, KandinskyInpaintPipeline, - KandinskyImg2ImgPipeline, KandinskyPipeline, KandinskyPriorPipeline, LDMTextToImagePipeline, diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index 03a4d9c7d371..ba906e22300d 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -52,7 +52,7 @@ IFPipeline, IFSuperResolutionPipeline, ) - from .kandinsky import KandinskyInpaintPipeline, KandinskyPipeline, KandinskyPriorPipeline, KandinskyImg2ImgPipeline + from .kandinsky import KandinskyInpaintPipeline, KandinskyPipeline, KandinskyPriorPipeline from .latent_diffusion import LDMTextToImagePipeline from .paint_by_example import PaintByExamplePipeline from .semantic_stable_diffusion import SemanticStableDiffusionPipeline diff --git a/src/diffusers/pipelines/kandinsky/__init__.py b/src/diffusers/pipelines/kandinsky/__init__.py index 199ef0a998a7..0242d9ae5edf 100644 --- a/src/diffusers/pipelines/kandinsky/__init__.py +++ 
b/src/diffusers/pipelines/kandinsky/__init__.py @@ -14,7 +14,6 @@ else: from .pipeline_kandinsky import KandinskyPipeline from .pipeline_kandinsky_inpaint import KandinskyInpaintPipeline - from .pipeline_kandinsky_img2img import KandinskyImg2ImgPipeline from .pipeline_kandinsky_prior import KandinskyPriorPipeline from .text_encoder import MultilingualCLIP from .text_proj import KandinskyTextProjModel diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py index 73687f0b8ec4..e7f3bb407d68 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py @@ -22,7 +22,7 @@ from ...models import UNet2DConditionModel, VQModel from ...pipelines import DiffusionPipeline from ...pipelines.pipeline_utils import ImagePipelineOutput -from ...schedulers import DDPMScheduler +from ...schedulers import UnCLIPScheduler from ...utils import ( is_accelerate_available, is_accelerate_version, @@ -58,7 +58,7 @@ class KandinskyPipeline(DiffusionPipeline): Frozen text-encoder. tokenizer ([`XLMRobertaTokenizer`]): Tokenizer of class - scheduler ([`DDPMScheduler`]): + scheduler ([`UnCLIPScheduler`]): A scheduler to be used in combination with `unet` to generate image latents. unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the image embedding. @@ -74,7 +74,7 @@ def __init__( tokenizer: XLMRobertaTokenizer, text_proj: KandinskyTextProjModel, unet: UNet2DConditionModel, - scheduler: DDPMScheduler, + scheduler: UnCLIPScheduler, movq: VQModel, ): super().__init__() @@ -376,7 +376,9 @@ def __call__( noise_pred, t, latent_model_input, + prev_timestep=prev_timestep, generator=generator, + batch_size=batch_size, ).prev_sample _, latents = latents.chunk(2) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py deleted file mode 100644 index 5147b9b6b4d3..000000000000 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +++ /dev/null @@ -1,420 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from typing import List, Optional, Union - -import torch -from transformers import ( - XLMRobertaTokenizerFast, -) - -from ...models import UNet2DConditionModel, VQModel -from ...pipelines import DiffusionPipeline -from ...pipelines.pipeline_utils import ImagePipelineOutput -from ...schedulers import DDPMScheduler -from ...utils import ( - is_accelerate_available, - is_accelerate_version, - logging, - randn_tensor, -) -from .text_encoder import MultilingualCLIP -from .text_proj import KandinskyTextProjModel -import PIL -from PIL import Image -import numpy as np - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -def get_new_h_w(h, w, scale_factor=8): - new_h = h // scale_factor**2 - if h % scale_factor**2 != 0: - new_h += 1 - new_w = w // scale_factor**2 - if w % scale_factor**2 != 0: - new_w += 1 - return new_h * scale_factor, new_w * scale_factor - -def prepare_image(pil_image, w=512, h=512): - pil_image = pil_image.resize((w, h), resample=Image.BICUBIC, reducing_gap=1) - arr = np.array(pil_image.convert("RGB")) - arr = arr.astype(np.float32) / 127.5 - 1 - arr = np.transpose(arr, [2, 0, 1]) - image = torch.from_numpy(arr).unsqueeze(0) - return image - -class KandinskyImg2ImgPipeline(DiffusionPipeline): - """ - Pipeline for image-to-image generation using Kandinsky - - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) - - Args: - text_encoder ([`MultilingualCLIP`]): - Frozen text-encoder. - tokenizer ([`XLMRobertaTokenizerFast`]): - Tokenizer of class - scheduler ([`DDPMScheduler`]): - A scheduler to be used in combination with `unet` to generate image latents. - unet ([`UNet2DConditionModel`]): - Conditional U-Net architecture to denoise the image embedding. - text_proj ([`KandinskyTextProjModel`]): - Utility class to prepare and combine the embeddings before they are passed to the decoder. 
- """ - - def __init__( - self, - text_encoder: MultilingualCLIP, - tokenizer: XLMRobertaTokenizerFast, - text_proj: KandinskyTextProjModel, - unet: UNet2DConditionModel, - scheduler: DDPMScheduler, - movq: VQModel - ): - super().__init__() - - self.register_modules( - text_encoder=text_encoder, - tokenizer=tokenizer, - text_proj=text_proj, - unet=unet, - scheduler=scheduler, - movq=movq - ) - self.movq_scale_factor = 2 ** (len(self.movq.config.block_out_channels) - 1) - - def get_timesteps(self, num_inference_steps, strength, device): - # get the original timestep using init_timestep - init_timestep = min(int(num_inference_steps * strength), num_inference_steps) - - t_start = max(num_inference_steps - init_timestep, 0) - timesteps = self.scheduler.timesteps[t_start:] - - return timesteps, num_inference_steps - t_start - - def prepare_latents(self, latents, latent_timestep, shape, dtype, device, generator, scheduler): - if latents is None: - latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) - else: - if latents.shape != shape: - raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") - latents = latents.to(device) - - latents = latents * scheduler.init_noise_sigma - - shape = latents.shape - noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) - - # get latents - latents = self.scheduler.add_noise(latents, noise, latent_timestep) - return latents - - def _encode_prompt( - self, - prompt, - device, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - ): - batch_size = len(prompt) if isinstance(prompt, list) else 1 - # get prompt text embeddings - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_attention_mask=True, - add_special_tokens=True, - return_tensors="pt", - ) - - text_input_ids = text_inputs.input_ids.to(device) - text_mask = text_inputs.attention_mask.to(device) - - untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids): - removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]) - logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}" - ) - text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length] - - prompt_embeds, text_encoder_hidden_states = self.text_encoder( - input_ids=text_input_ids, attention_mask=text_mask - ) - - prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0) - text_encoder_hidden_states = text_encoder_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) - text_mask = text_mask.repeat_interleave(num_images_per_prompt, dim=0) - - if do_classifier_free_guidance: - uncond_tokens: List[str] - if negative_prompt is None: - uncond_tokens = [""] * batch_size - elif type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. 
Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." - ) - else: - uncond_tokens = negative_prompt - - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_attention_mask=True, - add_special_tokens=True, - return_tensors="pt", - ) - uncond_text_input_ids = uncond_input.input_ids.to(device) - uncond_text_mask = uncond_input.attention_mask.to(device) - - negative_prompt_embeds, uncond_text_encoder_hidden_states = self.text_encoder( - input_ids=uncond_text_input_ids, attention_mask=uncond_text_mask - ) - - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - - seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt) - negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len) - - seq_len = uncond_text_encoder_hidden_states.shape[1] - uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.repeat(1, num_images_per_prompt, 1) - uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.view( - batch_size * num_images_per_prompt, seq_len, -1 - ) - uncond_text_mask = uncond_text_mask.repeat_interleave(num_images_per_prompt, dim=0) - - # done duplicates - - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) - text_encoder_hidden_states = torch.cat([uncond_text_encoder_hidden_states, text_encoder_hidden_states]) - - text_mask = torch.cat([uncond_text_mask, text_mask]) - - return prompt_embeds, text_encoder_hidden_states, text_mask - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_sequential_cpu_offload - def enable_sequential_cpu_offload(self, gpu_id=0): - r""" - Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet, - text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a - `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called. - Note that offloading happens on a submodule basis. Memory savings are higher than with - `enable_model_cpu_offload`, but performance is lower. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"): - from accelerate import cpu_offload - else: - raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher") - - device = torch.device(f"cuda:{gpu_id}") - - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]: - cpu_offload(cpu_offloaded_model, device) - - if self.safety_checker is not None: - cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True) - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_model_cpu_offload - def enable_model_cpu_offload(self, gpu_id=0): - r""" - Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. 
Compared - to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` - method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with - `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. - """ - if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): - from accelerate import cpu_offload_with_hook - else: - raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") - - device = torch.device(f"cuda:{gpu_id}") - - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - - hook = None - for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]: - _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - - if self.safety_checker is not None: - _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) - - # We'll offload the last model manually. - self.final_offload_hook = hook - - @property - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device - def _execution_device(self): - r""" - Returns the device on which the pipeline's models will be executed. After calling - `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module - hooks. - """ - if not hasattr(self.unet, "_hf_hook"): - return self.device - for module in self.unet.modules(): - if ( - hasattr(module, "_hf_hook") - and hasattr(module._hf_hook, "execution_device") - and module._hf_hook.execution_device is not None - ): - return torch.device(module._hf_hook.execution_device) - return self.device - - @torch.no_grad() - def __call__( - self, - prompt: Union[str, List[str]], - image: Union[torch.FloatTensor, PIL.Image.Image] = None, - height: int = 512, - width: int = 512, - num_inference_steps: int = 100, - strength: float = 0.75, - guidance_scale: float = 4.0, - num_images_per_prompt: int = 1, - negative_prompt: Optional[Union[str, List[str]]] = None, - generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - image_embeds: Optional[torch.FloatTensor] = None, - negative_image_embeds: Optional[torch.FloatTensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - ): - if isinstance(prompt, str): - batch_size = 1 - elif isinstance(prompt, list): - batch_size = len(prompt) - else: - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - device = self._execution_device - - batch_size = batch_size * num_images_per_prompt - - do_classifier_free_guidance = guidance_scale > 1.0 - - prompt_embeds, text_encoder_hidden_states, _ = self._encode_prompt( - prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt - ) - - image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to(device) - - text_encoder_hidden_states, additive_clip_time_embeddings = self.text_proj( - image_embeddings=image_embeds, - prompt_embeds=prompt_embeds, - text_encoder_hidden_states=text_encoder_hidden_states, - ) - - image = prepare_image(image, width, height).to(device) - latents = self.movq.encode(image)["latents"] - - self.scheduler.set_timesteps(num_inference_steps, device=device) - timesteps_tensor, num_inference_steps = 
self.get_timesteps(num_inference_steps, strength, device) - latent_timestep = timesteps_tensor[:1].repeat(batch_size * num_images_per_prompt) - - num_channels_latents = self.movq.config.latent_channels - - height, width = get_new_h_w(height, width, self.movq_scale_factor) - - # create initial latent - latents = self.prepare_latents( - latents, - latent_timestep, - (batch_size, num_channels_latents, height, width), - text_encoder_hidden_states.dtype, - device, - generator, - self.scheduler, - ) - - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps_tensor): - # expand the latents if we are doing classifier free guidance - latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents - - noise_pred = self.unet( - sample=latent_model_input, # [2, 4, 96, 96] - timestep=t, - encoder_hidden_states=text_encoder_hidden_states, - class_labels=additive_clip_time_embeddings, - ).sample - - # YiYi Notes: CFG is currently implemented exactly as original repo as a baseline, - # i.e. we apply cfg to predicted noise, and take predicted variance as it is (uncond + cond) - # this means the our latent shape is batch_size *2 instad batch_size - - if do_classifier_free_guidance: - noise_pred, variance_pred = noise_pred.split(latents.shape[1], dim=1) - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - variance_pred_uncond, variance_pred_text = variance_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - noise_pred = torch.cat([noise_pred] * 2) - variance_pred = torch.cat([variance_pred_uncond, variance_pred_text]) - noise_pred = torch.cat([noise_pred, variance_pred], dim=1) - - if i + 1 == timesteps_tensor.shape[0]: - prev_timestep = None - else: - prev_timestep = timesteps_tensor[i + 1] - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step( - noise_pred, - t, - latents, - generator=generator, - ).prev_sample - - _, latents = latents.chunk(2) - - # post-processing - image = self.movq.decode(latents, force_not_quantize=True)["sample"] - - image = image * 0.5 + 0.5 - image = image.clamp(0, 1) - image = image.cpu().permute(0, 2, 3, 1).float().numpy() - - if output_type == "pil": - image = self.numpy_to_pil(image) - - if not return_dict: - return (image,) - - return ImagePipelineOutput(images=image) - diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py index 1f8bd6a3d2cb..cdd61dcf2ac5 100644 --- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py +++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -152,23 +152,6 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "transformers"]) - -class KandinskyImg2ImgPipeline(metaclass=DummyObject): - _backends = ["torch", "transformers"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch", "transformers"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["torch", "transformers"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["torch", "transformers"]) - - - class KandinskyInpaintPipeline(metaclass=DummyObject): _backends = ["torch", "transformers"] diff --git a/tests/pipelines/kandinsky/test_kandinsky_img2img.py b/tests/pipelines/kandinsky/test_kandinsky_img2img.py deleted file mode 100644 index a93e0d3a32ea..000000000000 --- 
a/tests/pipelines/kandinsky/test_kandinsky_img2img.py +++ /dev/null @@ -1,308 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import random -import unittest - -import numpy as np -import torch -from PIL import Image -from transformers import XLMRobertaTokenizer - -from diffusers import KandinskyImg2ImgPipeline, KandinskyPriorPipeline, DDPMScheduler, UNet2DConditionModel, VQModel, KandinskyImg2ImgPipeline -from diffusers.pipelines.kandinsky.text_encoder import MCLIPConfig, MultilingualCLIP -from diffusers.pipelines.kandinsky.text_proj import KandinskyTextProjModel -from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu - -from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference - - -torch.backends.cuda.matmul.allow_tf32 = False -torch.use_deterministic_algorithms(True) - - -class KandinskyImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = KandinskyImg2ImgPipeline - params = ["prompt", "image_embeds", "negative_image_embeds", "image"] - batch_params = [ - "prompt", - "negative_prompt", - "image_embeds", - "negative_image_embeds", - "image", - ] - required_optional_params = [ - "generator", - "height", - "width", - "strength", - "guidance_scale", - "negative_prompt", - "num_inference_steps", - "return_dict", - "guidance_scale", - "num_images_per_prompt", - "output_type", - "return_dict", - ] - test_xformers_attention = False - - @property - def text_embedder_hidden_size(self): - return 32 - - @property - def time_input_dim(self): - return 32 - - @property - def block_out_channels_0(self): - return self.time_input_dim - - @property - def time_embed_dim(self): - return self.time_input_dim * 4 - - @property - def cross_attention_dim(self): - return 100 - - @property - def dummy_tokenizer(self): - tokenizer = XLMRobertaTokenizer.from_pretrained("YiYiXu/Kandinsky", subfolder="tokenizer") - return tokenizer - - @property - def dummy_text_encoder(self): - torch.manual_seed(0) - config = MCLIPConfig( - numDims=self.cross_attention_dim, - transformerDimensions=self.text_embedder_hidden_size, - hidden_size=self.text_embedder_hidden_size, - intermediate_size=37, - num_attention_heads=4, - num_hidden_layers=5, - vocab_size=250002, - ) - - text_encoder = MultilingualCLIP(config) - text_encoder = text_encoder.eval() - - return text_encoder - - @property - def dummy_text_proj(self): - torch.manual_seed(0) - - model_kwargs = { - "clip_embeddings_dim": self.cross_attention_dim, - "time_embed_dim": self.time_embed_dim, - "clip_extra_context_tokens": 2, - "cross_attention_dim": self.cross_attention_dim, - "clip_text_encoder_hidden_states_dim": self.text_embedder_hidden_size, - } - - model = KandinskyTextProjModel(**model_kwargs) - return model - - @property - def dummy_unet(self): - torch.manual_seed(0) - - model_kwargs = { - "in_channels": 4, - # Out channels is double in channels because 
predicts mean and variance - "out_channels": 8, - "down_block_types": ("ResnetDownsampleBlock2D", "SimpleCrossAttnDownBlock2D"), - "up_block_types": ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"), - "mid_block_type": "UNetMidBlock2DSimpleCrossAttn", - "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2), - "layers_per_block": 1, - "cross_attention_dim": self.cross_attention_dim, - "attention_head_dim": 4, - "resnet_time_scale_shift": "scale_shift", - "class_embed_type": "identity", - } - - model = UNet2DConditionModel(**model_kwargs) - return model - - @property - def dummy_movq_kwargs(self): - return { - "block_out_channels": [32, 64], - "down_block_types": ["DownEncoderBlock2D", "AttnDownEncoderBlock2D"], - "in_channels": 3, - "latent_channels": 4, - "layers_per_block": 1, - "norm_num_groups": 8, - "norm_type": "spatial", - "num_vq_embeddings": 12, - "out_channels": 3, - "up_block_types": [ - "AttnUpDecoderBlock2D", - "UpDecoderBlock2D", - ], - "vq_embed_dim": 4, - } - - @property - def dummy_movq(self): - torch.manual_seed(0) - model = VQModel(**self.dummy_movq_kwargs) - return model - - def get_dummy_components(self): - text_encoder = self.dummy_text_encoder - tokenizer = self.dummy_tokenizer - unet = self.dummy_unet - text_proj = self.dummy_text_proj - movq = self.dummy_movq - - ddpm_config = { - "clip_sample": True, - "clip_sample_range": 2.0, - "sample_max_value": None, - "num_train_timesteps": 1000, - "prediction_type": "epsilon", - "variance_type": "learned_range", - "thresholding": True, - "beta_schedule": "linear", - "beta_start": 0.00085, - "beta_end":0.012 - } - - scheduler = DDPMScheduler(**ddpm_config) - - components = { - "text_proj": text_proj, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "unet": unet, - "scheduler": scheduler, - "movq": movq, - } - - return components - - def get_dummy_inputs(self, device, seed=0): - image_embeds = floats_tensor((1, self.cross_attention_dim), rng=random.Random(seed)).to(device) - negative_image_embeds = floats_tensor((1, self.cross_attention_dim), rng=random.Random(seed + 1)).to(device) - # create init_image - image = floats_tensor((1, 3, 64, 64), rng=random.Random(seed)).to(device) - image = image.cpu().permute(0, 2, 3, 1)[0] - init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((256, 256)) - - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "prompt": "horse", - "image": init_image, - "image_embeds": image_embeds, - "negative_image_embeds": negative_image_embeds, - "generator": generator, - "height": 64, - "width": 64, - "num_inference_steps": 2, - "output_type": "np", - } - return inputs - - def test_kandinsky_img2img(self): - device = "cpu" - - components = self.get_dummy_components() - - pipe = self.pipeline_class(**components) - pipe = pipe.to(device) - - pipe.set_progress_bar_config(disable=None) - - output = pipe(**self.get_dummy_inputs(device)) - image = output.images - - image_from_tuple = pipe( - **self.get_dummy_inputs(device), - return_dict=False, - )[0] - - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - - print(f"image.shape {image.shape}") - - assert image.shape == (1, 64, 64, 3) - - expected_slice = np.array([0.6635241 , 0.6152489 , 0.5687914 , 0.57371366, 0.53458804, 0.47828954, 0.5454488 , 0.51518494, 0.49540082]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - assert 
np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 - - -@slow -@require_torch_gpu -class KandinskyImg2ImgPipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_kandinsky_img2img(self): - expected_image = load_numpy( - "https://user-images.githubusercontent.com/43698245/238954026-6c3e3da6-ef18-4d78-b521-6386e6922444.png" - ) - - init_image = load_image( - "https://user-images.githubusercontent.com/43698245/238191310-a99fe3cf-c2ee-417e-94f1-c1829a0ae0a3.png" - ) - prompt = "A red cartoon frog, 4k" - - pipe_prior = KandinskyPriorPipeline.from_pretrained("YiYiXu/Kandinsky-prior", torch_dtype=torch.float16) - pipe_prior.to(torch_device) - - pipeline = KandinskyImg2ImgPipeline.from_pretrained("ayushtues/test-kandinsky-img2img", torch_dtype=torch.float16) - pipeline = pipeline.to(torch_device) - pipeline.set_progress_bar_config(disable=None) - - generator = torch.Generator(device="cpu").manual_seed(0) - image_emb = pipe_prior( - prompt, - generator=generator, - ).images - zero_image_emb = pipe_prior("").images - - output = pipeline( - prompt, - image=init_image, - image_embeds=image_emb, - negative_image_embeds=zero_image_emb, - generator=generator, - num_inference_steps=100, - height=768, - width=768, - strength=0.2, - output_type="np", - ) - - image = output.images[0] - - assert image.shape == (768, 768, 3) - - assert_mean_pixel_difference(image, expected_image) From 5b0736cdcc89ecd2a87e5878d6c9f1b72f29cb79 Mon Sep 17 00:00:00 2001 From: ayushmangal Date: Wed, 17 May 2023 17:37:44 +0000 Subject: [PATCH 079/182] ayush's PR to add img2img #3426 --- src/diffusers/__init__.py | 1 + src/diffusers/pipelines/__init__.py | 2 +- src/diffusers/pipelines/kandinsky/__init__.py | 1 + .../kandinsky/pipeline_kandinsky_img2img.py | 420 ++++++++++++++++++ .../dummy_torch_and_transformers_objects.py | 17 + 5 files changed, 440 insertions(+), 1 deletion(-) create mode 100644 src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 53e70a96928e..d2f1afd29f08 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -129,6 +129,7 @@ IFPipeline, IFSuperResolutionPipeline, KandinskyInpaintPipeline, + KandinskyImg2ImgPipeline, KandinskyPipeline, KandinskyPriorPipeline, LDMTextToImagePipeline, diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index ba906e22300d..03a4d9c7d371 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -52,7 +52,7 @@ IFPipeline, IFSuperResolutionPipeline, ) - from .kandinsky import KandinskyInpaintPipeline, KandinskyPipeline, KandinskyPriorPipeline + from .kandinsky import KandinskyInpaintPipeline, KandinskyPipeline, KandinskyPriorPipeline, KandinskyImg2ImgPipeline from .latent_diffusion import LDMTextToImagePipeline from .paint_by_example import PaintByExamplePipeline from .semantic_stable_diffusion import SemanticStableDiffusionPipeline diff --git a/src/diffusers/pipelines/kandinsky/__init__.py b/src/diffusers/pipelines/kandinsky/__init__.py index 0242d9ae5edf..199ef0a998a7 100644 --- a/src/diffusers/pipelines/kandinsky/__init__.py +++ b/src/diffusers/pipelines/kandinsky/__init__.py @@ -14,6 +14,7 @@ else: from .pipeline_kandinsky import KandinskyPipeline from .pipeline_kandinsky_inpaint import KandinskyInpaintPipeline + from .pipeline_kandinsky_img2img import 
KandinskyImg2ImgPipeline from .pipeline_kandinsky_prior import KandinskyPriorPipeline from .text_encoder import MultilingualCLIP from .text_proj import KandinskyTextProjModel diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py new file mode 100644 index 000000000000..5147b9b6b4d3 --- /dev/null +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py @@ -0,0 +1,420 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Optional, Union + +import torch +from transformers import ( + XLMRobertaTokenizerFast, +) + +from ...models import UNet2DConditionModel, VQModel +from ...pipelines import DiffusionPipeline +from ...pipelines.pipeline_utils import ImagePipelineOutput +from ...schedulers import DDPMScheduler +from ...utils import ( + is_accelerate_available, + is_accelerate_version, + logging, + randn_tensor, +) +from .text_encoder import MultilingualCLIP +from .text_proj import KandinskyTextProjModel +import PIL +from PIL import Image +import numpy as np + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +def get_new_h_w(h, w, scale_factor=8): + new_h = h // scale_factor**2 + if h % scale_factor**2 != 0: + new_h += 1 + new_w = w // scale_factor**2 + if w % scale_factor**2 != 0: + new_w += 1 + return new_h * scale_factor, new_w * scale_factor + +def prepare_image(pil_image, w=512, h=512): + pil_image = pil_image.resize((w, h), resample=Image.BICUBIC, reducing_gap=1) + arr = np.array(pil_image.convert("RGB")) + arr = arr.astype(np.float32) / 127.5 - 1 + arr = np.transpose(arr, [2, 0, 1]) + image = torch.from_numpy(arr).unsqueeze(0) + return image + +class KandinskyImg2ImgPipeline(DiffusionPipeline): + """ + Pipeline for image-to-image generation using Kandinsky + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + text_encoder ([`MultilingualCLIP`]): + Frozen text-encoder. + tokenizer ([`XLMRobertaTokenizerFast`]): + Tokenizer of class + scheduler ([`DDPMScheduler`]): + A scheduler to be used in combination with `unet` to generate image latents. + unet ([`UNet2DConditionModel`]): + Conditional U-Net architecture to denoise the image embedding. + text_proj ([`KandinskyTextProjModel`]): + Utility class to prepare and combine the embeddings before they are passed to the decoder. 
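Note on the two module-level helpers added above: `get_new_h_w` returns the latent-grid size the UNet will see (the requested pixel size divided by the MoVQ scale factor, rounded up), not a padded pixel size, and `prepare_image` converts a PIL image into a `[-1, 1]`-scaled NCHW tensor. The following is a minimal, self-contained sanity check of both, assuming the default scale factor of 8; the helpers are re-stated so the snippet runs on its own.

import numpy as np
import torch
from PIL import Image

def get_new_h_w(h, w, scale_factor=8):
    # same arithmetic as the helper above: ceil-divide by scale_factor**2, then re-scale
    new_h = h // scale_factor**2 + int(h % scale_factor**2 != 0)
    new_w = w // scale_factor**2 + int(w % scale_factor**2 != 0)
    return new_h * scale_factor, new_w * scale_factor

print(get_new_h_w(768, 768))  # (96, 96) -> latent grid for a 768x768 request
print(get_new_h_w(512, 512))  # (64, 64)

def prepare_image(pil_image, w=512, h=512):
    pil_image = pil_image.resize((w, h), resample=Image.BICUBIC, reducing_gap=1)
    arr = np.array(pil_image.convert("RGB")).astype(np.float32) / 127.5 - 1
    return torch.from_numpy(np.transpose(arr, [2, 0, 1])).unsqueeze(0)

dummy = Image.fromarray(np.zeros((300, 200, 3), dtype=np.uint8))
print(prepare_image(dummy).shape)  # torch.Size([1, 3, 512, 512]), values scaled to [-1, 1]

In `__call__` the pair returned by `get_new_h_w` is assigned back to `height`/`width`, so from that point on those names refer to latent dimensions rather than pixels, which is why the example UNet input shape is `[2, 4, 96, 96]` for a 768x768 request.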
+ """ + + def __init__( + self, + text_encoder: MultilingualCLIP, + tokenizer: XLMRobertaTokenizerFast, + text_proj: KandinskyTextProjModel, + unet: UNet2DConditionModel, + scheduler: DDPMScheduler, + movq: VQModel + ): + super().__init__() + + self.register_modules( + text_encoder=text_encoder, + tokenizer=tokenizer, + text_proj=text_proj, + unet=unet, + scheduler=scheduler, + movq=movq + ) + self.movq_scale_factor = 2 ** (len(self.movq.config.block_out_channels) - 1) + + def get_timesteps(self, num_inference_steps, strength, device): + # get the original timestep using init_timestep + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = self.scheduler.timesteps[t_start:] + + return timesteps, num_inference_steps - t_start + + def prepare_latents(self, latents, latent_timestep, shape, dtype, device, generator, scheduler): + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + if latents.shape != shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") + latents = latents.to(device) + + latents = latents * scheduler.init_noise_sigma + + shape = latents.shape + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + + # get latents + latents = self.scheduler.add_noise(latents, noise, latent_timestep) + return latents + + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + ): + batch_size = len(prompt) if isinstance(prompt, list) else 1 + # get prompt text embeddings + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_attention_mask=True, + add_special_tokens=True, + return_tensors="pt", + ) + + text_input_ids = text_inputs.input_ids.to(device) + text_mask = text_inputs.attention_mask.to(device) + + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length] + + prompt_embeds, text_encoder_hidden_states = self.text_encoder( + input_ids=text_input_ids, attention_mask=text_mask + ) + + prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0) + text_encoder_hidden_states = text_encoder_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + text_mask = text_mask.repeat_interleave(num_images_per_prompt, dim=0) + + if do_classifier_free_guidance: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. 
Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_attention_mask=True, + add_special_tokens=True, + return_tensors="pt", + ) + uncond_text_input_ids = uncond_input.input_ids.to(device) + uncond_text_mask = uncond_input.attention_mask.to(device) + + negative_prompt_embeds, uncond_text_encoder_hidden_states = self.text_encoder( + input_ids=uncond_text_input_ids, attention_mask=uncond_text_mask + ) + + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + + seq_len = negative_prompt_embeds.shape[1] + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len) + + seq_len = uncond_text_encoder_hidden_states.shape[1] + uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.repeat(1, num_images_per_prompt, 1) + uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.view( + batch_size * num_images_per_prompt, seq_len, -1 + ) + uncond_text_mask = uncond_text_mask.repeat_interleave(num_images_per_prompt, dim=0) + + # done duplicates + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + text_encoder_hidden_states = torch.cat([uncond_text_encoder_hidden_states, text_encoder_hidden_states]) + + text_mask = torch.cat([uncond_text_mask, text_mask]) + + return prompt_embeds, text_encoder_hidden_states, text_mask + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_sequential_cpu_offload + def enable_sequential_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet, + text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a + `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called. + Note that offloading happens on a submodule basis. Memory savings are higher than with + `enable_model_cpu_offload`, but performance is lower. + """ + if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"): + from accelerate import cpu_offload + else: + raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]: + cpu_offload(cpu_offloaded_model, device) + + if self.safety_checker is not None: + cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_model_cpu_offload + def enable_model_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. 
Compared + to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` + method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with + `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. + """ + if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): + from accelerate import cpu_offload_with_hook + else: + raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + hook = None + for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]: + _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) + + if self.safety_checker is not None: + _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) + + # We'll offload the last model manually. + self.final_offload_hook = hook + + @property + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device + def _execution_device(self): + r""" + Returns the device on which the pipeline's models will be executed. After calling + `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module + hooks. + """ + if not hasattr(self.unet, "_hf_hook"): + return self.device + for module in self.unet.modules(): + if ( + hasattr(module, "_hf_hook") + and hasattr(module._hf_hook, "execution_device") + and module._hf_hook.execution_device is not None + ): + return torch.device(module._hf_hook.execution_device) + return self.device + + @torch.no_grad() + def __call__( + self, + prompt: Union[str, List[str]], + image: Union[torch.FloatTensor, PIL.Image.Image] = None, + height: int = 512, + width: int = 512, + num_inference_steps: int = 100, + strength: float = 0.75, + guidance_scale: float = 4.0, + num_images_per_prompt: int = 1, + negative_prompt: Optional[Union[str, List[str]]] = None, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + image_embeds: Optional[torch.FloatTensor] = None, + negative_image_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + ): + if isinstance(prompt, str): + batch_size = 1 + elif isinstance(prompt, list): + batch_size = len(prompt) + else: + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + device = self._execution_device + + batch_size = batch_size * num_images_per_prompt + + do_classifier_free_guidance = guidance_scale > 1.0 + + prompt_embeds, text_encoder_hidden_states, _ = self._encode_prompt( + prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt + ) + + image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to(device) + + text_encoder_hidden_states, additive_clip_time_embeddings = self.text_proj( + image_embeddings=image_embeds, + prompt_embeds=prompt_embeds, + text_encoder_hidden_states=text_encoder_hidden_states, + ) + + image = prepare_image(image, width, height).to(device) + latents = self.movq.encode(image)["latents"] + + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps_tensor, num_inference_steps = 
self.get_timesteps(num_inference_steps, strength, device) + latent_timestep = timesteps_tensor[:1].repeat(batch_size * num_images_per_prompt) + + num_channels_latents = self.movq.config.latent_channels + + height, width = get_new_h_w(height, width, self.movq_scale_factor) + + # create initial latent + latents = self.prepare_latents( + latents, + latent_timestep, + (batch_size, num_channels_latents, height, width), + text_encoder_hidden_states.dtype, + device, + generator, + self.scheduler, + ) + + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps_tensor): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + + noise_pred = self.unet( + sample=latent_model_input, # [2, 4, 96, 96] + timestep=t, + encoder_hidden_states=text_encoder_hidden_states, + class_labels=additive_clip_time_embeddings, + ).sample + + # YiYi Notes: CFG is currently implemented exactly as original repo as a baseline, + # i.e. we apply cfg to predicted noise, and take predicted variance as it is (uncond + cond) + # this means the our latent shape is batch_size *2 instad batch_size + + if do_classifier_free_guidance: + noise_pred, variance_pred = noise_pred.split(latents.shape[1], dim=1) + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + variance_pred_uncond, variance_pred_text = variance_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = torch.cat([noise_pred] * 2) + variance_pred = torch.cat([variance_pred_uncond, variance_pred_text]) + noise_pred = torch.cat([noise_pred, variance_pred], dim=1) + + if i + 1 == timesteps_tensor.shape[0]: + prev_timestep = None + else: + prev_timestep = timesteps_tensor[i + 1] + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step( + noise_pred, + t, + latents, + generator=generator, + ).prev_sample + + _, latents = latents.chunk(2) + + # post-processing + image = self.movq.decode(latents, force_not_quantize=True)["sample"] + + image = image * 0.5 + 0.5 + image = image.clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + if output_type == "pil": + image = self.numpy_to_pil(image) + + if not return_dict: + return (image,) + + return ImagePipelineOutput(images=image) + diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py index cdd61dcf2ac5..1f8bd6a3d2cb 100644 --- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py +++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -152,6 +152,23 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "transformers"]) + +class KandinskyImg2ImgPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + + class KandinskyInpaintPipeline(metaclass=DummyObject): _backends = ["torch", "transformers"] From 012f87f21ce648bf4f5eb754d588daba6413e5ae Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 18 May 2023 17:50:02 +0000 Subject: [PATCH 080/182] img2img with ddim --- .../kandinsky/pipeline_kandinsky_img2img.py | 106 
+++++++++++++----- 1 file changed, 80 insertions(+), 26 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py index 5147b9b6b4d3..c8e6a883cb38 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py @@ -16,13 +16,13 @@ import torch from transformers import ( - XLMRobertaTokenizerFast, + XLMRobertaTokenizer, ) from ...models import UNet2DConditionModel, VQModel from ...pipelines import DiffusionPipeline from ...pipelines.pipeline_utils import ImagePipelineOutput -from ...schedulers import DDPMScheduler +from ...schedulers import DDIMScheduler from ...utils import ( is_accelerate_available, is_accelerate_version, @@ -65,24 +65,26 @@ class KandinskyImg2ImgPipeline(DiffusionPipeline): Args: text_encoder ([`MultilingualCLIP`]): Frozen text-encoder. - tokenizer ([`XLMRobertaTokenizerFast`]): + tokenizer ([`XLMRobertaTokenizer`]): Tokenizer of class - scheduler ([`DDPMScheduler`]): + scheduler ([`DDIMScheduler`]): A scheduler to be used in combination with `unet` to generate image latents. unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the image embedding. text_proj ([`KandinskyTextProjModel`]): Utility class to prepare and combine the embeddings before they are passed to the decoder. + movq ([`VQModel`]): + MoVQ image encoder and decoder """ def __init__( self, text_encoder: MultilingualCLIP, - tokenizer: XLMRobertaTokenizerFast, + movq: VQModel, + tokenizer: XLMRobertaTokenizer, text_proj: KandinskyTextProjModel, unet: UNet2DConditionModel, - scheduler: DDPMScheduler, - movq: VQModel + scheduler: DDIMScheduler, ): super().__init__() @@ -95,10 +97,10 @@ def __init__( movq=movq ) self.movq_scale_factor = 2 ** (len(self.movq.config.block_out_channels) - 1) - + def get_timesteps(self, num_inference_steps, strength, device): # get the original timestep using init_timestep - init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) t_start = max(num_inference_steps - init_timestep, 0) timesteps = self.scheduler.timesteps[t_start:] @@ -114,12 +116,19 @@ def prepare_latents(self, latents, latent_timestep, shape, dtype, device, genera latents = latents.to(device) latents = latents * scheduler.init_noise_sigma - + shape = latents.shape - noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + # YiYi notes: put this back after done testing + #noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + # YiYi notes: testing only (create noise to match original) + torch.manual_seed(0) + noise = torch.randn_like(latents) + print(f" noise added :{noise.shape},{noise.sum()},{noise[0,0,0,:5]} ") # get latents - latents = self.scheduler.add_noise(latents, noise, latent_timestep) + # YiYi notes: we use a hard coded add_noise method here because it use a different beta schedule for adding noise >=< + # latents = self.scheduler.add_noise(latents, noise, latent_timestep) + latents = self.add_noise(latents, noise, latent_timestep) return latents def _encode_prompt( @@ -135,7 +144,7 @@ def _encode_prompt( text_inputs = self.tokenizer( prompt, padding="max_length", - max_length=self.tokenizer.model_max_length, + max_length=77, truncation=True, return_attention_mask=True, add_special_tokens=True, @@ -186,7 +195,7 @@ def _encode_prompt( uncond_input = 
self.tokenizer( uncond_tokens, padding="max_length", - max_length=self.tokenizer.model_max_length, + max_length=77, truncation=True, return_attention_mask=True, add_special_tokens=True, @@ -297,22 +306,51 @@ def _execution_device(self): ): return torch.device(module._hf_hook.execution_device) return self.device + + # YiYi's notes: Hard code this method here for now because the kandinsky repo use a different beta schedule for add noise + def add_noise( + self, + original_samples: torch.FloatTensor, + noise: torch.FloatTensor, + timesteps: torch.IntTensor, + ) -> torch.FloatTensor: + # Make sure alphas_cumprod and timestep have same device and dtype as original_samples + + betas = torch.linspace(0.0001, 0.02, 1000, dtype=torch.float32) + alphas = 1.0 - betas + alphas_cumprod = torch.cumprod(alphas, dim=0) + alphas_cumprod = alphas_cumprod.to(device=original_samples.device, dtype=original_samples.dtype) + timesteps = timesteps.to(original_samples.device) + + sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 + sqrt_alpha_prod = sqrt_alpha_prod.flatten() + while len(sqrt_alpha_prod.shape) < len(original_samples.shape): + sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) + + sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() + while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape): + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) + + noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise + + return noisy_samples @torch.no_grad() def __call__( self, prompt: Union[str, List[str]], - image: Union[torch.FloatTensor, PIL.Image.Image] = None, + image: Union[torch.FloatTensor, PIL.Image.Image], + image_embeds: torch.FloatTensor, + negative_image_embeds: torch.FloatTensor, height: int = 512, width: int = 512, num_inference_steps: int = 100, strength: float = 0.75, - guidance_scale: float = 4.0, + guidance_scale: float = 7.0, num_images_per_prompt: int = 1, negative_prompt: Optional[Union[str, List[str]]] = None, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - image_embeds: Optional[torch.FloatTensor] = None, - negative_image_embeds: Optional[torch.FloatTensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, ): @@ -341,14 +379,31 @@ def __call__( text_encoder_hidden_states=text_encoder_hidden_states, ) - image = prepare_image(image, width, height).to(device) + image = prepare_image(image, width, height).to(dtype=prompt_embeds.dtype, device=device) latents = self.movq.encode(image)["latents"] + print(f"encoded image latents: {latents.shape},{latents.sum()}") + self.scheduler.set_timesteps(num_inference_steps, device=device) + # YiYi's notes: add 1 to match original ddim steps + # (Notes from kandinsky repo: add one to get the final alpha values right (the ones from first scale to data during sampling)) + self.scheduler.timesteps = self.scheduler.timesteps + 1 timesteps_tensor, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device) - latent_timestep = timesteps_tensor[:1].repeat(batch_size * num_images_per_prompt) - num_channels_latents = self.movq.config.latent_channels + # YiYi's notes the timestep for add_noise is calculated different in original repo: + #latent_timestep = timesteps_tensor[:1].repeat(batch_size * num_images_per_prompt) + + latent_timestep = int(self.scheduler.config.num_train_timesteps * strength) - 2 + # YiYi's notes: above formular is taken from the original 
repo + ## note that diffusers's strength arg is same as 1-stregth in the original and because we use init() here + ## we use -2 instead of -1 + latent_timestep = torch.tensor( + [latent_timestep] * (batch_size * num_images_per_prompt), + dtype=timesteps_tensor.dtype, + device=device ) + print(f" latent_timestep for add_noise :{latent_timestep}") + + num_channels_latents = self.unet.config.in_channels height, width = get_new_h_w(height, width, self.movq_scale_factor) @@ -362,7 +417,8 @@ def __call__( generator, self.scheduler, ) - + # yiyi testing only - create a generator here to overwrite + generator = torch.Generator(device='cuda').manual_seed(0) with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps_tensor): # expand the latents if we are doing classifier free guidance @@ -380,13 +436,11 @@ def __call__( # this means the our latent shape is batch_size *2 instad batch_size if do_classifier_free_guidance: - noise_pred, variance_pred = noise_pred.split(latents.shape[1], dim=1) + noise_pred, _ = noise_pred.split(latents.shape[1], dim=1) noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - variance_pred_uncond, variance_pred_text = variance_pred.chunk(2) noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) noise_pred = torch.cat([noise_pred] * 2) - variance_pred = torch.cat([variance_pred_uncond, variance_pred_text]) - noise_pred = torch.cat([noise_pred, variance_pred], dim=1) + if i + 1 == timesteps_tensor.shape[0]: prev_timestep = None From af57beaaf79124cafe25e317305eb8cda58d0690 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 18 May 2023 19:07:06 +0000 Subject: [PATCH 081/182] remove a manual random seed --- .../kandinsky/pipeline_kandinsky_img2img.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py index c8e6a883cb38..d5aa46abb401 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py @@ -118,12 +118,7 @@ def prepare_latents(self, latents, latent_timestep, shape, dtype, device, genera latents = latents * scheduler.init_noise_sigma shape = latents.shape - # YiYi notes: put this back after done testing - #noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) - # YiYi notes: testing only (create noise to match original) - torch.manual_seed(0) - noise = torch.randn_like(latents) - print(f" noise added :{noise.shape},{noise.sum()},{noise[0,0,0,:5]} ") + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) # get latents # YiYi notes: we use a hard coded add_noise method here because it use a different beta schedule for adding noise >=< @@ -382,8 +377,6 @@ def __call__( image = prepare_image(image, width, height).to(dtype=prompt_embeds.dtype, device=device) latents = self.movq.encode(image)["latents"] - print(f"encoded image latents: {latents.shape},{latents.sum()}") - self.scheduler.set_timesteps(num_inference_steps, device=device) # YiYi's notes: add 1 to match original ddim steps # (Notes from kandinsky repo: add one to get the final alpha values right (the ones from first scale to data during sampling)) @@ -401,7 +394,6 @@ def __call__( [latent_timestep] * (batch_size * num_images_per_prompt), dtype=timesteps_tensor.dtype, device=device ) - print(f" latent_timestep for add_noise :{latent_timestep}") 
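A short illustration of the two timestep computations involved here, since they operate on different scales: `get_timesteps` trims the *inference* schedule by `strength`, while the hard-coded `latent_timestep` above indexes the training schedule directly. The numbers below are illustrative only; `num_train_timesteps=1000` is the value used by the DDIM config and the hard-coded `add_noise` beta schedule in this PR.

# Illustrative arithmetic only.
num_train_timesteps, num_inference_steps, strength = 1000, 100, 0.3

# get_timesteps(): keep only the last `strength` fraction of the inference schedule
init_timestep = min(int(num_inference_steps * strength), num_inference_steps)  # 30
t_start = max(num_inference_steps - init_timestep, 0)                          # 70
# -> the denoising loop runs over scheduler.timesteps[70:], i.e. 30 steps

# hard-coded noising timestep used for add_noise(), in training-timestep units
latent_timestep = int(num_train_timesteps * strength) - 2                      # 298
print(init_timestep, t_start, latent_timestep)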
num_channels_latents = self.unet.config.in_channels @@ -418,7 +410,7 @@ def __call__( self.scheduler, ) # yiyi testing only - create a generator here to overwrite - generator = torch.Generator(device='cuda').manual_seed(0) + #generator = torch.Generator(device='cuda').manual_seed(0) with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps_tensor): # expand the latents if we are doing classifier free guidance From 2754aa49ef3a0f0d1cb92086dfe202972248e352 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 18 May 2023 19:16:55 +0000 Subject: [PATCH 082/182] fix --- .../kandinsky/pipeline_kandinsky_img2img.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py index d5aa46abb401..b6098ed65506 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py @@ -121,7 +121,8 @@ def prepare_latents(self, latents, latent_timestep, shape, dtype, device, genera noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) # get latents - # YiYi notes: we use a hard coded add_noise method here because it use a different beta schedule for adding noise >=< + # YiYi notes: I created a add_noise method on the pipeline to overwrite the one in schedule because + ## it use a different beta schedule for adding noise vs sampling # latents = self.scheduler.add_noise(latents, noise, latent_timestep) latents = self.add_noise(latents, noise, latent_timestep) return latents @@ -383,13 +384,10 @@ def __call__( self.scheduler.timesteps = self.scheduler.timesteps + 1 timesteps_tensor, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device) - # YiYi's notes the timestep for add_noise is calculated different in original repo: - #latent_timestep = timesteps_tensor[:1].repeat(batch_size * num_images_per_prompt) - + # YiYi's notes + # the timestep for add_noise is calculated different in original repo (this formular is taken from the original repo) latent_timestep = int(self.scheduler.config.num_train_timesteps * strength) - 2 - # YiYi's notes: above formular is taken from the original repo - ## note that diffusers's strength arg is same as 1-stregth in the original and because we use init() here - ## we use -2 instead of -1 + latent_timestep = torch.tensor( [latent_timestep] * (batch_size * num_images_per_prompt), dtype=timesteps_tensor.dtype, @@ -409,8 +407,7 @@ def __call__( generator, self.scheduler, ) - # yiyi testing only - create a generator here to overwrite - #generator = torch.Generator(device='cuda').manual_seed(0) + with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps_tensor): # expand the latents if we are doing classifier free guidance From d27257ebf65f4b57f9e4b755fbc859675b5b095e Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 18 May 2023 19:20:04 +0000 Subject: [PATCH 083/182] make batch_size latents, instead of x2 --- .../pipelines/kandinsky/pipeline_kandinsky_img2img.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py index b6098ed65506..6b2fdbf457da 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py @@ -428,8 
+428,6 @@ def __call__( noise_pred, _ = noise_pred.split(latents.shape[1], dim=1) noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - noise_pred = torch.cat([noise_pred] * 2) - if i + 1 == timesteps_tensor.shape[0]: prev_timestep = None @@ -444,8 +442,6 @@ def __call__( generator=generator, ).prev_sample - _, latents = latents.chunk(2) - # post-processing image = self.movq.decode(latents, force_not_quantize=True)["sample"] From 4b5538eaa9928d3bd758041014354ce1ec789439 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 18 May 2023 19:32:59 +0000 Subject: [PATCH 084/182] more clean up --- .../kandinsky/pipeline_kandinsky_img2img.py | 118 +++++++++--------- 1 file changed, 61 insertions(+), 57 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py index 6b2fdbf457da..fe129e9fd876 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py @@ -147,9 +147,7 @@ def _encode_prompt( return_tensors="pt", ) - text_input_ids = text_inputs.input_ids.to(device) - text_mask = text_inputs.attention_mask.to(device) - + text_input_ids = text_inputs.input_ids untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids): @@ -158,7 +156,9 @@ def _encode_prompt( "The following part of your input was truncated because CLIP can only handle sequences up to" f" {self.tokenizer.model_max_length} tokens: {removed_text}" ) - text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length] + + text_input_ids = text_input_ids.to(device) + text_mask = text_inputs.attention_mask.to(device) prompt_embeds, text_encoder_hidden_states = self.text_encoder( input_ids=text_input_ids, attention_mask=text_mask @@ -229,31 +229,29 @@ def _encode_prompt( return prompt_embeds, text_encoder_hidden_states, text_mask - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_sequential_cpu_offload def enable_sequential_cpu_offload(self, gpu_id=0): r""" - Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet, - text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a - `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called. - Note that offloading happens on a submodule basis. Memory savings are higher than with - `enable_model_cpu_offload`, but performance is lower. + Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the pipeline's + models have their state dicts saved to CPU and then are moved to a `torch.device('meta') and loaded to GPU only + when their specific submodule has its `forward` method called. 
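To make the "make batch_size latents, instead of x2" change above concrete: previously the guided noise was re-doubled and concatenated with the learned variance, so the scheduler stepped a `2 * batch_size` latent that had to be `chunk(2)`-ed after the loop; now only the noise half of the UNet output is guided and the variance is discarded, so the latents stay at `batch_size`. A minimal shape sketch follows; the 96x96 grid, the 4 latent channels and `guidance_scale` are illustrative values, not taken from a specific run.

import torch

guidance_scale, C = 7.0, 4
unet_out = torch.randn(2, 2 * C, 96, 96)             # uncond+cond batch: noise + learned variance

# before: guide the noise, keep the variance, re-double the batch
noise, variance = unet_out.split(C, dim=1)
uncond, text = noise.chunk(2)
guided = uncond + guidance_scale * (text - uncond)    # (1, 4, 96, 96)
before = torch.cat([torch.cat([guided] * 2), variance], dim=1)   # (2, 8, 96, 96), chunked later

# after: keep only the guided noise; latents stay at batch_size
noise, _ = unet_out.split(C, dim=1)
uncond, text = noise.chunk(2)
after = uncond + guidance_scale * (text - uncond)     # (1, 4, 96, 96)
print(before.shape, after.shape)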
""" - if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"): + if is_accelerate_available(): from accelerate import cpu_offload else: - raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher") + raise ImportError("Please install accelerate via `pip install accelerate`") device = torch.device(f"cuda:{gpu_id}") - if self.device.type != "cpu": - self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + models = [ + self.unet, + self.text_proj, + self.text_encoder, + self.movq, + ] + for cpu_offloaded_model in models: + if cpu_offloaded_model is not None: + cpu_offload(cpu_offloaded_model, device) - for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]: - cpu_offload(cpu_offloaded_model, device) - - if self.safety_checker is not None: - cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True) # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_model_cpu_offload def enable_model_cpu_offload(self, gpu_id=0): @@ -367,7 +365,18 @@ def __call__( prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt ) - image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to(device) + if isinstance(image_embeds, list): + image_embeds = torch.cat(image_embeds, dim=0) + if isinstance(negative_image_embeds, list): + negative_image_embeds = torch.cat(negative_image_embeds, dim=0) + + if do_classifier_free_guidance: + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + negative_image_embeds = negative_image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + + image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to( + dtype=prompt_embeds.dtype, device=device + ) text_encoder_hidden_states, additive_clip_time_embeddings = self.text_proj( image_embeddings=image_embeds, @@ -379,8 +388,9 @@ def __call__( latents = self.movq.encode(image)["latents"] self.scheduler.set_timesteps(num_inference_steps, device=device) - # YiYi's notes: add 1 to match original ddim steps - # (Notes from kandinsky repo: add one to get the final alpha values right (the ones from first scale to data during sampling)) + + # YiYi's Notes: This step is taken from the origianl Kandinsky repo + # add one to get the final alpha values right (the ones from first scale to data during sampling)) self.scheduler.timesteps = self.scheduler.timesteps + 1 timesteps_tensor, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device) @@ -408,39 +418,34 @@ def __call__( self.scheduler, ) - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps_tensor): - # expand the latents if we are doing classifier free guidance - latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents - - noise_pred = self.unet( - sample=latent_model_input, # [2, 4, 96, 96] - timestep=t, - encoder_hidden_states=text_encoder_hidden_states, - class_labels=additive_clip_time_embeddings, - ).sample - - # YiYi Notes: CFG is currently implemented exactly as original repo as a baseline, - # i.e. 
we apply cfg to predicted noise, and take predicted variance as it is (uncond + cond) - # this means the our latent shape is batch_size *2 instad batch_size - - if do_classifier_free_guidance: - noise_pred, _ = noise_pred.split(latents.shape[1], dim=1) - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - if i + 1 == timesteps_tensor.shape[0]: - prev_timestep = None - else: - prev_timestep = timesteps_tensor[i + 1] - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step( - noise_pred, - t, - latents, - generator=generator, - ).prev_sample + for i, t in enumerate(self.progress_bar(timesteps_tensor)): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + + noise_pred = self.unet( + sample=latent_model_input, + timestep=t, + encoder_hidden_states=text_encoder_hidden_states, + class_labels=additive_clip_time_embeddings, + ).sample + + if do_classifier_free_guidance: + noise_pred, _ = noise_pred.split(latents.shape[1], dim=1) + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + if i + 1 == timesteps_tensor.shape[0]: + prev_timestep = None + else: + prev_timestep = timesteps_tensor[i + 1] + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step( + noise_pred, + t, + latents, + generator=generator, + ).prev_sample # post-processing image = self.movq.decode(latents, force_not_quantize=True)["sample"] @@ -456,4 +461,3 @@ def __call__( return (image,) return ImagePipelineOutput(images=image) - From 52ae7d9b634fea310a2e80c9a871a25e7af8b103 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 18 May 2023 20:55:07 +0000 Subject: [PATCH 085/182] fix image pre-processing --- .../kandinsky/pipeline_kandinsky_img2img.py | 28 ++++++++++++++----- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py index fe129e9fd876..938b50b2722d 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py @@ -347,7 +347,8 @@ def __call__( generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - ): + ): + # 2. Define call parameters if isinstance(prompt, str): batch_size = 1 elif isinstance(prompt, list): @@ -360,11 +361,12 @@ def __call__( batch_size = batch_size * num_images_per_prompt do_classifier_free_guidance = guidance_scale > 1.0 - + + # 3. get text and image encoding prompt_embeds, text_encoder_hidden_states, _ = self._encode_prompt( prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt ) - + if isinstance(image_embeds, list): image_embeds = torch.cat(image_embeds, dim=0) if isinstance(negative_image_embeds, list): @@ -383,10 +385,21 @@ def __call__( prompt_embeds=prompt_embeds, text_encoder_hidden_states=text_encoder_hidden_states, ) + + # 4. pre-processing initial image + if not isinstance(image, list): + image = [image] + if not all(isinstance(i, (PIL.Image.Image, torch.Tensor)) for i in image): + raise ValueError( + f"Input is in incorrect format: {[type(i) for i in image]}. 
Currently, we only support PIL image and pytorch tensor" + ) + + image = torch.cat([prepare_image(i, width, height) for i in image], dim=0) + image = image.to(dtype=prompt_embeds.dtype, device=device) - image = prepare_image(image, width, height).to(dtype=prompt_embeds.dtype, device=device) latents = self.movq.encode(image)["latents"] + # 5. set timesteps self.scheduler.set_timesteps(num_inference_steps, device=device) # YiYi's Notes: This step is taken from the origianl Kandinsky repo @@ -407,7 +420,7 @@ def __call__( height, width = get_new_h_w(height, width, self.movq_scale_factor) - # create initial latent + # 6. Create initial latent latents = self.prepare_latents( latents, latent_timestep, @@ -417,7 +430,8 @@ def __call__( generator, self.scheduler, ) - + + # 7. Denoising loop for i, t in enumerate(self.progress_bar(timesteps_tensor)): # expand the latents if we are doing classifier free guidance latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents @@ -447,7 +461,7 @@ def __call__( generator=generator, ).prev_sample - # post-processing + # 8. post-processing image = self.movq.decode(latents, force_not_quantize=True)["sample"] image = image * 0.5 + 0.5 From 01a05e55cae181c94ed0a176ae71704b19fa5105 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 18 May 2023 21:33:40 +0000 Subject: [PATCH 086/182] fix batching --- .../pipelines/kandinsky/pipeline_kandinsky_img2img.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py index 938b50b2722d..ba5f105ec0c5 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py @@ -398,6 +398,7 @@ def __call__( image = image.to(dtype=prompt_embeds.dtype, device=device) latents = self.movq.encode(image)["latents"] + latents = latents.repeat_interleave(num_images_per_prompt, dim=0) # 5. set timesteps self.scheduler.set_timesteps(num_inference_steps, device=device) @@ -412,7 +413,7 @@ def __call__( latent_timestep = int(self.scheduler.config.num_train_timesteps * strength) - 2 latent_timestep = torch.tensor( - [latent_timestep] * (batch_size * num_images_per_prompt), + [latent_timestep] * batch_size, dtype=timesteps_tensor.dtype, device=device ) From bbf54dccf3e3ba8ff6938e7740ce2757310bcc66 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 18 May 2023 22:09:44 +0000 Subject: [PATCH 087/182] add slow test for img2img --- .../kandinsky/test_kandinsky_img2img.py | 324 ++++++++++++++++++ 1 file changed, 324 insertions(+) create mode 100644 tests/pipelines/kandinsky/test_kandinsky_img2img.py diff --git a/tests/pipelines/kandinsky/test_kandinsky_img2img.py b/tests/pipelines/kandinsky/test_kandinsky_img2img.py new file mode 100644 index 000000000000..bd4ae655a37c --- /dev/null +++ b/tests/pipelines/kandinsky/test_kandinsky_img2img.py @@ -0,0 +1,324 @@ +# coding=utf-8 +# Copyright 2023 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
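Returning to the "fix batching" hunk above: `repeat_interleave(num_images_per_prompt, dim=0)` duplicates each encoded image latent in place so it stays aligned with the prompt embeddings, which `_encode_prompt` expands the same way. A tiny illustration of the ordering it produces:

import torch

latents = torch.arange(2).float().view(2, 1, 1, 1)    # stand-in for two encoded images
num_images_per_prompt = 3
expanded = latents.repeat_interleave(num_images_per_prompt, dim=0)
print(expanded.flatten())  # tensor([0., 0., 0., 1., 1., 1.]) -- per-image grouping preserved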
+# See the License for the specific language governing permissions and +# limitations under the License. + +import gc +import random +import unittest + +import numpy as np +import torch +from PIL import Image +from transformers import XLMRobertaTokenizer + +from diffusers import KandinskyImg2ImgPipeline, KandinskyPriorPipeline, DDIMScheduler, UNet2DConditionModel, VQModel +from diffusers.pipelines.kandinsky.text_encoder import MCLIPConfig, MultilingualCLIP +from diffusers.pipelines.kandinsky.text_proj import KandinskyTextProjModel +from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device +from diffusers.utils.testing_utils import require_torch_gpu + +from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference + + +torch.backends.cuda.matmul.allow_tf32 = False +torch.use_deterministic_algorithms(True) + + +class KandinskyImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase): + pipeline_class = KandinskyImg2ImgPipeline + params = ["prompt", "image_embeds", "negative_image_embeds", "image"] + batch_params = [ + "prompt", + "negative_prompt", + "image_embeds", + "negative_image_embeds", + "image", + ] + required_optional_params = [ + "generator", + "height", + "width", + "strength", + "guidance_scale", + "negative_prompt", + "num_inference_steps", + "return_dict", + "guidance_scale", + "num_images_per_prompt", + "output_type", + "return_dict", + ] + test_xformers_attention = False + + @property + def text_embedder_hidden_size(self): + return 32 + + @property + def time_input_dim(self): + return 32 + + @property + def block_out_channels_0(self): + return self.time_input_dim + + @property + def time_embed_dim(self): + return self.time_input_dim * 4 + + @property + def cross_attention_dim(self): + return 100 + + @property + def dummy_tokenizer(self): + tokenizer = XLMRobertaTokenizer.from_pretrained("YiYiXu/Kandinsky", subfolder="tokenizer") + return tokenizer + + @property + def dummy_text_encoder(self): + torch.manual_seed(0) + config = MCLIPConfig( + numDims=self.cross_attention_dim, + transformerDimensions=self.text_embedder_hidden_size, + hidden_size=self.text_embedder_hidden_size, + intermediate_size=37, + num_attention_heads=4, + num_hidden_layers=5, + vocab_size=250002, + ) + + text_encoder = MultilingualCLIP(config) + text_encoder = text_encoder.eval() + + return text_encoder + + @property + def dummy_text_proj(self): + torch.manual_seed(0) + + model_kwargs = { + "clip_embeddings_dim": self.cross_attention_dim, + "time_embed_dim": self.time_embed_dim, + "clip_extra_context_tokens": 2, + "cross_attention_dim": self.cross_attention_dim, + "clip_text_encoder_hidden_states_dim": self.text_embedder_hidden_size, + } + + model = KandinskyTextProjModel(**model_kwargs) + return model + + @property + def dummy_unet(self): + torch.manual_seed(0) + + model_kwargs = { + "in_channels": 4, + # Out channels is double in channels because predicts mean and variance + "out_channels": 8, + "down_block_types": ("ResnetDownsampleBlock2D", "SimpleCrossAttnDownBlock2D"), + "up_block_types": ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"), + "mid_block_type": "UNetMidBlock2DSimpleCrossAttn", + "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2), + "layers_per_block": 1, + "cross_attention_dim": self.cross_attention_dim, + "attention_head_dim": 4, + "resnet_time_scale_shift": "scale_shift", + "class_embed_type": "identity", + } + + model = UNet2DConditionModel(**model_kwargs) + return model + + @property + def 
dummy_movq_kwargs(self): + return { + "block_out_channels": [32, 64], + "down_block_types": ["DownEncoderBlock2D", "AttnDownEncoderBlock2D"], + "in_channels": 3, + "latent_channels": 4, + "layers_per_block": 1, + "norm_num_groups": 8, + "norm_type": "spatial", + "num_vq_embeddings": 12, + "out_channels": 3, + "up_block_types": [ + "AttnUpDecoderBlock2D", + "UpDecoderBlock2D", + ], + "vq_embed_dim": 4, + } + + @property + def dummy_movq(self): + torch.manual_seed(0) + model = VQModel(**self.dummy_movq_kwargs) + return model + + def get_dummy_components(self): + text_encoder = self.dummy_text_encoder + tokenizer = self.dummy_tokenizer + unet = self.dummy_unet + text_proj = self.dummy_text_proj + movq = self.dummy_movq + + ddim_config = { + "num_train_timesteps": 1000, + "beta_schedule": "linear", + "beta_start": 0.00085, + "beta_end":0.012, + "clip_sample" : False, + "set_alpha_to_one" : False, # not sure what this does, so set to default value for now + "steps_offset" : 0, + "prediction_type" : "epsilon", + "thresholding" : False, + } + + scheduler = DDIMScheduler(**ddim_config) + + components = { + "text_proj": text_proj, + "text_encoder": text_encoder, + "tokenizer": tokenizer, + "unet": unet, + "scheduler": scheduler, + "movq": movq, + } + + return components + + def get_dummy_inputs(self, device, seed=0): + image_embeds = floats_tensor((1, self.cross_attention_dim), rng=random.Random(seed)).to(device) + negative_image_embeds = floats_tensor((1, self.cross_attention_dim), rng=random.Random(seed + 1)).to(device) + # create init_image + image = floats_tensor((1, 3, 64, 64), rng=random.Random(seed)).to(device) + image = image.cpu().permute(0, 2, 3, 1)[0] + init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((256, 256)) + + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) + inputs = { + "prompt": "horse", + "image": init_image, + "image_embeds": image_embeds, + "negative_image_embeds": negative_image_embeds, + "generator": generator, + "height": 64, + "width": 64, + "num_inference_steps": 2, + "output_type": "np", + } + return inputs + + def test_kandinsky_img2img(self): + device = "cpu" + + components = self.get_dummy_components() + + pipe = self.pipeline_class(**components) + pipe = pipe.to(device) + + pipe.set_progress_bar_config(disable=None) + + output = pipe(**self.get_dummy_inputs(device)) + image = output.images + + image_from_tuple = pipe( + **self.get_dummy_inputs(device), + return_dict=False, + )[0] + + image_slice = image[0, -3:, -3:, -1] + image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] + + print(f"image.shape {image.shape}") + + assert image.shape == (1, 64, 64, 3) + + expected_slice = np.array( + [0.43521464, 0.668655, 0.41744298, 0.6815478, 0.44146872, 0.4427491, 0.50876176, 0.37860417, 0.5109416 ] + ) + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 + + +@slow +@require_torch_gpu +class KandinskyImg2ImgPipelineIntegrationTests(unittest.TestCase): + def tearDown(self): + # clean up the VRAM after each test + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def test_kandinsky_img2img(self): + expected_image = load_numpy( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + "kandinsky_img2img_frog.npy" + ) + + init_image = load_image( + 
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "/kandinsky/cat.png" + ) + prompt = "A red cartoon frog, 4k" + + pipe_prior = KandinskyPriorPipeline.from_pretrained("YiYiXu/Kandinsky-prior", torch_dtype=torch.float16) + pipe_prior.to(torch_device) + + pipeline = KandinskyImg2ImgPipeline.from_pretrained("YiYiXu/Kandinsky", torch_dtype=torch.float16) + + ddim_config = { + "num_train_timesteps": 1000, + "beta_schedule": "linear", + "beta_start": 0.00085, + "beta_end":0.012, + "clip_sample" : False, + "set_alpha_to_one" : False, # not sure what this does, so set to default value for now + "steps_offset" : 0, + "prediction_type" : "epsilon", + "thresholding" : False, + } + + ddim_scheduler = DDIMScheduler(**ddim_config) + pipeline.scheduler = ddim_scheduler + pipeline = pipeline.to(torch_device) + pipeline.set_progress_bar_config(disable=None) + + generator = torch.Generator(device="cpu").manual_seed(0) + image_emb = pipe_prior( + prompt, + generator=generator, + ).images + zero_image_emb = pipe_prior("").images + + output = pipeline( + prompt, + image=init_image, + image_embeds=image_emb, + negative_image_embeds=zero_image_emb, + generator=generator, + num_inference_steps=100, + height=768, + width=768, + strength=0.2, + output_type="np", + ) + + image = output.images[0] + + assert image.shape == (768, 768, 3) + + assert_mean_pixel_difference(image, expected_image) From 66ea0627b129a8b702f6565eb88a81d3ac9a246e Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 18 May 2023 22:14:05 +0000 Subject: [PATCH 088/182] fix slow tests --- tests/pipelines/kandinsky/test_kandinsky_img2img.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/pipelines/kandinsky/test_kandinsky_img2img.py b/tests/pipelines/kandinsky/test_kandinsky_img2img.py index bd4ae655a37c..ae95fef02356 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_img2img.py +++ b/tests/pipelines/kandinsky/test_kandinsky_img2img.py @@ -265,10 +265,10 @@ def tearDown(self): torch.cuda.empty_cache() def test_kandinsky_img2img(self): - expected_image = load_numpy( + expected_image = load_numpy( "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "kandinsky_img2img_frog.npy" - ) + "/kandinsky/kandinsky_img2img_frog.npy" + ) init_image = load_image( "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "/kandinsky/cat.png" From f6e0d2fae57cb6e5e85f9991feb7b2d3b67295a8 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 18 May 2023 22:15:31 +0000 Subject: [PATCH 089/182] make style --- src/diffusers/__init__.py | 2 +- src/diffusers/pipelines/__init__.py | 7 ++- src/diffusers/pipelines/kandinsky/__init__.py | 2 +- .../kandinsky/pipeline_kandinsky_img2img.py | 55 +++++++++---------- .../dummy_torch_and_transformers_objects.py | 2 - .../kandinsky/test_kandinsky_img2img.py | 32 +++++------ 6 files changed, 51 insertions(+), 49 deletions(-) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index d2f1afd29f08..323b3be53557 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -128,8 +128,8 @@ IFInpaintingSuperResolutionPipeline, IFPipeline, IFSuperResolutionPipeline, - KandinskyInpaintPipeline, KandinskyImg2ImgPipeline, + KandinskyInpaintPipeline, KandinskyPipeline, KandinskyPriorPipeline, LDMTextToImagePipeline, diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index 03a4d9c7d371..9e9824cb8855 100644 --- a/src/diffusers/pipelines/__init__.py +++ 
b/src/diffusers/pipelines/__init__.py @@ -52,7 +52,12 @@ IFPipeline, IFSuperResolutionPipeline, ) - from .kandinsky import KandinskyInpaintPipeline, KandinskyPipeline, KandinskyPriorPipeline, KandinskyImg2ImgPipeline + from .kandinsky import ( + KandinskyImg2ImgPipeline, + KandinskyInpaintPipeline, + KandinskyPipeline, + KandinskyPriorPipeline, + ) from .latent_diffusion import LDMTextToImagePipeline from .paint_by_example import PaintByExamplePipeline from .semantic_stable_diffusion import SemanticStableDiffusionPipeline diff --git a/src/diffusers/pipelines/kandinsky/__init__.py b/src/diffusers/pipelines/kandinsky/__init__.py index 199ef0a998a7..49b13dafe990 100644 --- a/src/diffusers/pipelines/kandinsky/__init__.py +++ b/src/diffusers/pipelines/kandinsky/__init__.py @@ -13,8 +13,8 @@ from ...utils.dummy_torch_and_transformers_objects import KandinskyPipeline, KandinskyPriorPipeline else: from .pipeline_kandinsky import KandinskyPipeline - from .pipeline_kandinsky_inpaint import KandinskyInpaintPipeline from .pipeline_kandinsky_img2img import KandinskyImg2ImgPipeline + from .pipeline_kandinsky_inpaint import KandinskyInpaintPipeline from .pipeline_kandinsky_prior import KandinskyPriorPipeline from .text_encoder import MultilingualCLIP from .text_proj import KandinskyTextProjModel diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py index ba5f105ec0c5..a5ea501cd874 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py @@ -14,7 +14,10 @@ from typing import List, Optional, Union +import numpy as np +import PIL import torch +from PIL import Image from transformers import ( XLMRobertaTokenizer, ) @@ -31,9 +34,7 @@ ) from .text_encoder import MultilingualCLIP from .text_proj import KandinskyTextProjModel -import PIL -from PIL import Image -import numpy as np + logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -47,6 +48,7 @@ def get_new_h_w(h, w, scale_factor=8): new_w += 1 return new_h * scale_factor, new_w * scale_factor + def prepare_image(pil_image, w=512, h=512): pil_image = pil_image.resize((w, h), resample=Image.BICUBIC, reducing_gap=1) arr = np.array(pil_image.convert("RGB")) @@ -55,6 +57,7 @@ def prepare_image(pil_image, w=512, h=512): image = torch.from_numpy(arr).unsqueeze(0) return image + class KandinskyImg2ImgPipeline(DiffusionPipeline): """ Pipeline for image-to-image generation using Kandinsky @@ -94,13 +97,13 @@ def __init__( text_proj=text_proj, unet=unet, scheduler=scheduler, - movq=movq + movq=movq, ) self.movq_scale_factor = 2 ** (len(self.movq.config.block_out_channels) - 1) - + def get_timesteps(self, num_inference_steps, strength, device): # get the original timestep using init_timestep - init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) t_start = max(num_inference_steps - init_timestep, 0) timesteps = self.scheduler.timesteps[t_start:] @@ -114,15 +117,15 @@ def prepare_latents(self, latents, latent_timestep, shape, dtype, device, genera if latents.shape != shape: raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") latents = latents.to(device) - + latents = latents * scheduler.init_noise_sigma shape = latents.shape noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) # get latents - # YiYi notes: I created a 
add_noise method on the pipeline to overwrite the one in schedule because
-        ## it use a different beta schedule for adding noise vs sampling
+        # YiYi notes: I created an add_noise method on the pipeline to overwrite the one in the scheduler because
+        ## it uses a different beta schedule for adding noise vs sampling
         # latents = self.scheduler.add_noise(latents, noise, latent_timestep)
         latents = self.add_noise(latents, noise, latent_timestep)
         return latents
@@ -252,7 +255,6 @@ def enable_sequential_cpu_offload(self, gpu_id=0):
             if cpu_offloaded_model is not None:
                 cpu_offload(cpu_offloaded_model, device)
 
-
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_model_cpu_offload
     def enable_model_cpu_offload(self, gpu_id=0):
         r"""
@@ -300,7 +302,7 @@ def _execution_device(self):
             ):
                 return torch.device(module._hf_hook.execution_device)
         return self.device
-    
+
     # YiYi's notes: Hard code this method here for now because the kandinsky repo uses a different beta schedule for add noise
     def add_noise(
         self,
@@ -347,7 +349,7 @@ def __call__(
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
-    ):    
+    ):
         # 2. Define call parameters
         if isinstance(prompt, str):
             batch_size = 1
@@ -361,12 +363,12 @@ def __call__(
         batch_size = batch_size * num_images_per_prompt
 
         do_classifier_free_guidance = guidance_scale > 1.0
-        
+
         # 3. get text and image encoding
         prompt_embeds, text_encoder_hidden_states, _ = self._encode_prompt(
             prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
         )
-        
+
         if isinstance(image_embeds, list):
             image_embeds = torch.cat(image_embeds, dim=0)
         if isinstance(negative_image_embeds, list):
@@ -385,8 +387,8 @@ def __call__(
             prompt_embeds=prompt_embeds,
             text_encoder_hidden_states=text_encoder_hidden_states,
         )
-        
-        # 4. pre-processing initial image 
+
+        # 4. pre-processing initial image
         if not isinstance(image, list):
             image = [image]
         if not all(isinstance(i, (PIL.Image.Image, torch.Tensor)) for i in image):
@@ -402,20 +404,17 @@ def __call__(
 
         # 5. set timesteps
         self.scheduler.set_timesteps(num_inference_steps, device=device)
-        
-        # YiYi's Notes: This step is taken from the origianl Kandinsky repo 
+
+        # YiYi's Notes: This step is taken from the original Kandinsky repo
         # add one to get the final alpha values right (the ones from first scale to data during sampling))
         self.scheduler.timesteps = self.scheduler.timesteps + 1
 
         timesteps_tensor, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
 
-        # YiYi's notes 
+        # YiYi's notes
         # the timestep for add_noise is calculated differently in the original repo (this formula is taken from the original repo)
         latent_timestep = int(self.scheduler.config.num_train_timesteps * strength) - 2
-        latent_timestep = torch.tensor(
-            [latent_timestep] * batch_size,
-            dtype=timesteps_tensor.dtype,
-            device=device )
+        latent_timestep = torch.tensor([latent_timestep] * batch_size, dtype=timesteps_tensor.dtype, device=device)
 
         num_channels_latents = self.unet.config.in_channels
 
@@ -428,15 +427,15 @@ def __call__(
             (batch_size, num_channels_latents, height, width),
             text_encoder_hidden_states.dtype,
             device,
-            generator, 
+            generator,
             self.scheduler,
         )
-        
+
        # 7. 
Denoising loop for i, t in enumerate(self.progress_bar(timesteps_tensor)): # expand the latents if we are doing classifier free guidance latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents - + noise_pred = self.unet( sample=latent_model_input, timestep=t, @@ -450,9 +449,9 @@ def __call__( noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) if i + 1 == timesteps_tensor.shape[0]: - prev_timestep = None + pass else: - prev_timestep = timesteps_tensor[i + 1] + timesteps_tensor[i + 1] # compute the previous noisy sample x_t -> x_t-1 latents = self.scheduler.step( diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py index 1f8bd6a3d2cb..f9fec508b28a 100644 --- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py +++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -152,7 +152,6 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "transformers"]) - class KandinskyImg2ImgPipeline(metaclass=DummyObject): _backends = ["torch", "transformers"] @@ -168,7 +167,6 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "transformers"]) - class KandinskyInpaintPipeline(metaclass=DummyObject): _backends = ["torch", "transformers"] diff --git a/tests/pipelines/kandinsky/test_kandinsky_img2img.py b/tests/pipelines/kandinsky/test_kandinsky_img2img.py index ae95fef02356..6855a012f630 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_img2img.py +++ b/tests/pipelines/kandinsky/test_kandinsky_img2img.py @@ -22,7 +22,7 @@ from PIL import Image from transformers import XLMRobertaTokenizer -from diffusers import KandinskyImg2ImgPipeline, KandinskyPriorPipeline, DDIMScheduler, UNet2DConditionModel, VQModel +from diffusers import DDIMScheduler, KandinskyImg2ImgPipeline, KandinskyPriorPipeline, UNet2DConditionModel, VQModel from diffusers.pipelines.kandinsky.text_encoder import MCLIPConfig, MultilingualCLIP from diffusers.pipelines.kandinsky.text_proj import KandinskyTextProjModel from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device @@ -175,14 +175,14 @@ def get_dummy_components(self): ddim_config = { "num_train_timesteps": 1000, - "beta_schedule": "linear", + "beta_schedule": "linear", "beta_start": 0.00085, - "beta_end":0.012, - "clip_sample" : False, - "set_alpha_to_one" : False, # not sure what this does, so set to default value for now - "steps_offset" : 0, - "prediction_type" : "epsilon", - "thresholding" : False, + "beta_end": 0.012, + "clip_sample": False, + "set_alpha_to_one": False, # not sure what this does, so set to default value for now + "steps_offset": 0, + "prediction_type": "epsilon", + "thresholding": False, } scheduler = DDIMScheduler(**ddim_config) @@ -249,7 +249,7 @@ def test_kandinsky_img2img(self): assert image.shape == (1, 64, 64, 3) expected_slice = np.array( - [0.43521464, 0.668655, 0.41744298, 0.6815478, 0.44146872, 0.4427491, 0.50876176, 0.37860417, 0.5109416 ] + [0.43521464, 0.668655, 0.41744298, 0.6815478, 0.44146872, 0.4427491, 0.50876176, 0.37860417, 0.5109416] ) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 @@ -282,14 +282,14 @@ def test_kandinsky_img2img(self): ddim_config = { "num_train_timesteps": 1000, - "beta_schedule": "linear", + "beta_schedule": "linear", "beta_start": 0.00085, - "beta_end":0.012, - "clip_sample" : 
False, - "set_alpha_to_one" : False, # not sure what this does, so set to default value for now - "steps_offset" : 0, - "prediction_type" : "epsilon", - "thresholding" : False, + "beta_end": 0.012, + "clip_sample": False, + "set_alpha_to_one": False, # not sure what this does, so set to default value for now + "steps_offset": 0, + "prediction_type": "epsilon", + "thresholding": False, } ddim_scheduler = DDIMScheduler(**ddim_config) From 94dc73b0060b2224cdd92283b64be4f1818e4b37 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 18 May 2023 22:29:13 +0000 Subject: [PATCH 090/182] make style --- src/diffusers/models/vae.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/diffusers/models/vae.py b/src/diffusers/models/vae.py index 9e8014c3101e..e7d45ff606cf 100644 --- a/src/diffusers/models/vae.py +++ b/src/diffusers/models/vae.py @@ -18,8 +18,8 @@ import torch import torch.nn as nn -from .attention_processor import SpatialNorm from ..utils import BaseOutput, is_torch_version, randn_tensor +from .attention_processor import SpatialNorm from .unet_2d_blocks import UNetMidBlock2D, get_down_block, get_up_block @@ -238,7 +238,6 @@ def custom_forward(*inputs): return custom_forward - if is_torch_version(">=", "1.11.0"): # middle sample = torch.utils.checkpoint.checkpoint( From 3f5c86bad2d3c6b342ede4bfa95e53b645efefcc Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 18 May 2023 23:46:36 +0000 Subject: [PATCH 091/182] add doc --- docs/source/en/api/pipelines/kandinsky.mdx | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/source/en/api/pipelines/kandinsky.mdx b/docs/source/en/api/pipelines/kandinsky.mdx index a49212dfd733..808d2012a0ae 100644 --- a/docs/source/en/api/pipelines/kandinsky.mdx +++ b/docs/source/en/api/pipelines/kandinsky.mdx @@ -23,6 +23,7 @@ The Kandinsky model in diffusers comes from ai-forever and the original codebase |---|---|:---:| | [pipeline_kandinsky.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py) | *Text-to-Image Generation* | - | | [pipeline_kandinsky_inpaint.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py) | *Image-Guided Image Generation* | - | +| [pipeline_kandinsky_img2img.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py) | *Image-Guided Image Generation* | - | ## KandinskyPipeline @@ -35,6 +36,10 @@ The Kandinsky model in diffusers comes from ai-forever and the original codebase - all - __call__ +[[autodoc]] KandinskyImg2ImgPipeline + - all + - __call__ + [[autodoc]] KandinskyPriorPipeline - all - __call__ From 40247e80ad5e553f026834bf2e60cdc9737e4932 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Fri, 19 May 2023 11:55:22 +0200 Subject: [PATCH 092/182] Refactor Kadinsky --- src/diffusers/models/embeddings.py | 39 ++++++++++++++ src/diffusers/models/unet_2d_condition.py | 54 +++++++++++++++++-- .../pipelines/kandinsky/pipeline_kandinsky.py | 13 ++--- 3 files changed, 95 insertions(+), 11 deletions(-) diff --git a/src/diffusers/models/embeddings.py b/src/diffusers/models/embeddings.py index fa88bce305e6..b3b270921fde 100644 --- a/src/diffusers/models/embeddings.py +++ b/src/diffusers/models/embeddings.py @@ -360,6 +360,27 @@ def forward(self, labels, force_drop_ids=None): return embeddings +class TextImageProjection(nn.Module): + def __init__(self, text_embed_dim: int = 1024, image_embed_dim: int = 768, cross_attention_dim: int = 
768, num_image_text_embeds: int = 10): + super().__init__() + + self.num_image_text_embeds = num_image_text_embeds + self.image_embeds = nn.Linear(image_embed_dim, self.num_image_text_embeds * cross_attention_dim) + self.text_proj = nn.Linear(text_embed_dim, cross_attention_dim) + + def forward(self, text_embeds: torch.FloatTensor, image_embeds: torch.FloatTensor): + batch_size = text_embeds.shape[0] + + # image + image_text_embeds = self.image_embeds(image_embeds) + image_text_embeds = image_text_embeds.reshape(batch_size, self.num_image_text_embeds, -1) + + # text + text_embeds = self.text_proj(text_embeds) + + return torch.cat([image_text_embeds, text_embeds], dim=1) + + class CombinedTimestepLabelEmbeddings(nn.Module): def __init__(self, num_classes, embedding_dim, class_dropout_prob=0.1): super().__init__() @@ -395,6 +416,24 @@ def forward(self, hidden_states): return hidden_states +class TextImageTimeEmbedding(nn.Module): + def __init__(self, text_embed_dim: int = 768, image_embed_dim: int = 768, time_embed_dim: int = 1536): + super().__init__() + self.text_proj = nn.Linear(text_embed_dim, time_embed_dim) + self.text_norm = nn.LayerNorm(time_embed_dim) + self.image_proj = nn.Linear(image_embed_dim, time_embed_dim) + + def forward(self, text_embeds: torch.FloatTensor, image_embeds: torch.FloatTensor): + # text + time_text_embeds = self.text_proj(text_embeds) + time_text_embeds = self.text_norm(time_text_embeds) + + # image + time_image_embeds = self.image_proj(image_embeds) + + return time_image_embeds + time_text_embeds + + class AttentionPooling(nn.Module): # Copied from https://github.com/deep-floyd/IF/blob/2f91391f27dd3c468bf174be5805b4cc92980c0b/deepfloyd_if/model/nn.py#L54 diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py index 2a4c9fd72c1b..ea7877523a28 100644 --- a/src/diffusers/models/unet_2d_condition.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -23,7 +23,7 @@ from ..loaders import UNet2DConditionLoadersMixin from ..utils import BaseOutput, logging from .attention_processor import AttentionProcessor, AttnProcessor -from .embeddings import GaussianFourierProjection, TextTimeEmbedding, TimestepEmbedding, Timesteps +from .embeddings import GaussianFourierProjection, TextTimeEmbedding, TimestepEmbedding, Timesteps, TextImageTimeEmbedding, TextImageProjection from .modeling_utils import ModelMixin from .unet_2d_blocks import ( CrossAttnDownBlock2D, @@ -90,7 +90,9 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin) cross_attention_dim (`int` or `Tuple[int]`, *optional*, defaults to 1280): The dimension of the cross attention features. encoder_hid_dim (`int`, *optional*, defaults to None): - If given, `encoder_hidden_states` will be projected from this dimension to `cross_attention_dim`. + If `encoder_hid_dim_type` is defined, `encoder_hidden_states` will be projected from `encoder_hid_dim` dimension to `cross_attention_dim`. + encoder_hid_dim_type (`str`, *optional*, defaults to None): + If given, the `encoder_hidden_states` and potentially other embeddings will be down-projected to text embeddings of dimension `cross_attention` according to `encoder_hid_dim_type`. attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads. resnet_time_scale_shift (`str`, *optional*, defaults to `"default"`): Time scale shift config for resnet blocks, see [`~models.resnet.ResnetBlock2D`]. Choose from `default` or `scale_shift`. 
@@ -156,6 +158,7 @@ def __init__( norm_eps: float = 1e-5, cross_attention_dim: Union[int, Tuple[int]] = 1280, encoder_hid_dim: Optional[int] = None, + encoder_hid_dim_type: Optional[str] = None, attention_head_dim: Union[int, Tuple[int]] = 8, dual_cross_attention: bool = False, use_linear_projection: bool = False, @@ -247,8 +250,23 @@ def __init__( cond_proj_dim=time_cond_proj_dim, ) - if encoder_hid_dim is not None: + if encoder_hid_dim_type is None and encoder_hid_dim is not None: + encoder_hid_dim_type = "text_proj" + logger.info("encoder_hid_dim_type defaults to 'text_proj' as `encoder_hid_dim` is defined.") + + if encoder_hid_dim is None and encoder_hid_dim_type is not None: + raise ValueError(f"`encoder_hid_dim` has to be defined when `encoder_hid_dim_type` is set to {encoder_hid_dim_type}.") + + if encoder_hid_dim_type == "text_proj": self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim) + elif encoder_hid_dim_type == "text_image_proj": + # image_embed_dim DOESN'T have to be `cross_attention_dim`. To not clutter the __init__ too much + # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use + # case when `addition_embed_type == "text_image_proj"` (Kadinsky 2.1)` + self.encoder_hid_proj = TextImageProjection(text_embed_dim=encoder_hid_dim, image_embed_dim=cross_attention_dim, cross_attention_dim=cross_attention_dim) + + elif encoder_hid_dim_type is not None: + raise ValueError(f"encoder_hid_dim_type: {encoder_hid_dim_type} must be None, 'text_proj' or 'text_image_proj'.") else: self.encoder_hid_proj = None @@ -290,8 +308,13 @@ def __init__( self.add_embedding = TextTimeEmbedding( text_time_embedding_from_dim, time_embed_dim, num_heads=addition_embed_type_num_heads ) + elif addition_embed_type == "text_image": + # text_embed_dim and image_embed_dim DON'T have to be `cross_attention_dim`. To not clutter the __init__ too much + # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use + # case when `addition_embed_type == "text_image"` (Kadinsky 2.1)` + self.add_embedding = TextImageTimeEmbedding(text_embed_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, time_embed_dim=time_embed_dim) elif addition_embed_type is not None: - raise ValueError(f"addition_embed_type: {addition_embed_type} must be None or 'text'.") + raise ValueError(f"addition_embed_type: {addition_embed_type} must be None, 'text' or 'text_image'.") if time_embedding_act_fn is None: self.time_embed_act = None @@ -616,6 +639,7 @@ def forward( timestep_cond: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, + added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None, down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None, mid_block_additional_residual: Optional[torch.Tensor] = None, return_dict: bool = True, @@ -631,6 +655,9 @@ def forward( A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under `self.processor` in [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). + added_cond_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified includes additonal conditions that can be used for additonal time embeddings or + encoder hidden states projections. See the configurations `encoder_hid_dim_type` and `addition_embed_type` for more information. 
Returns: [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`: @@ -707,12 +734,29 @@ def forward( if self.config.addition_embed_type == "text": aug_emb = self.add_embedding(encoder_hidden_states) emb = emb + aug_emb + elif self.config.addition_embed_type == "text_image": + # Kadinsky 2.1 - style + if "image_embeds" not in added_cond_kwargs: + raise ValueError(f"{self.__class__} has the config param `addition_embed_type` set to 'text_image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`") + + image_embs = added_cond_kwargs.get("image_embeds") + text_embs = added_cond_kwargs.get("text_embeds", encoder_hidden_states) + + aug_emb = self.add_embedding(text_embs, image_embs) + emb = emb + aug_emb if self.time_embed_act is not None: emb = self.time_embed_act(emb) - if self.encoder_hid_proj is not None: + if self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_proj": encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states) + elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_image_proj": + # Kadinsky 2.1 - style + if "image_embeds" not in added_cond_kwargs: + raise ValueError(f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'text_image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`") + + image_embeds = added_cond_kwargs.get("image_embeds") + encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states, image_embeds) # 2. pre-process sample = self.conv_in(sample) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py index e7f3bb407d68..5067317a4181 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py @@ -319,11 +319,11 @@ def __call__( dtype=prompt_embeds.dtype, device=device ) - text_encoder_hidden_states, additive_clip_time_embeddings = self.text_proj( - image_embeddings=image_embeds, - prompt_embeds=prompt_embeds, - text_encoder_hidden_states=text_encoder_hidden_states, - ) + # text_encoder_hidden_states, additive_clip_time_embeddings = self.text_proj( + # image_embeddings=image_embeds, + # prompt_embeds=prompt_embeds, + # text_encoder_hidden_states=text_encoder_hidden_states, + # ) self.scheduler.set_timesteps(num_inference_steps, device=device) timesteps_tensor = self.scheduler.timesteps @@ -346,11 +346,12 @@ def __call__( # expand the latents if we are doing classifier free guidance latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + added_cond_kwargs = {"text_embeds": prompt_embeds, "image_embeds": image_embeds} noise_pred = self.unet( sample=latent_model_input, # [2, 4, 96, 96] timestep=t, encoder_hidden_states=text_encoder_hidden_states, - class_labels=additive_clip_time_embeddings, + added_cond_kwargs=added_cond_kwargs, ).sample # YiYi Notes: CFG is currently implemented exactly as original repo as a baseline, From 413982dae47831cb01bd54d2ba580d5e16eae9e5 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Fri, 19 May 2023 13:53:15 +0200 Subject: [PATCH 093/182] Remove text_proj --- .../pipelines/kandinsky/pipeline_kandinsky.py | 12 ------------ .../kandinsky/pipeline_kandinsky_img2img.py | 15 ++------------- .../kandinsky/pipeline_kandinsky_inpaint.py | 15 ++------------- 3 files changed, 4 insertions(+), 38 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py 
b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py index 5067317a4181..268024335940 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py @@ -30,7 +30,6 @@ randn_tensor, ) from .text_encoder import MultilingualCLIP -from .text_proj import KandinskyTextProjModel logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -62,8 +61,6 @@ class KandinskyPipeline(DiffusionPipeline): A scheduler to be used in combination with `unet` to generate image latents. unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the image embedding. - text_proj ([`KandinskyTextProjModel`]): - Utility class to prepare and combine the embeddings before they are passed to the decoder. movq ([`VQModel`]): MoVQ Decoder to generate the image from the latents. """ @@ -72,7 +69,6 @@ def __init__( self, text_encoder: MultilingualCLIP, tokenizer: XLMRobertaTokenizer, - text_proj: KandinskyTextProjModel, unet: UNet2DConditionModel, scheduler: UnCLIPScheduler, movq: VQModel, @@ -82,7 +78,6 @@ def __init__( self.register_modules( text_encoder=text_encoder, tokenizer=tokenizer, - text_proj=text_proj, unet=unet, scheduler=scheduler, movq=movq, @@ -217,7 +212,6 @@ def enable_sequential_cpu_offload(self, gpu_id=0): models = [ self.unet, - self.text_proj, self.text_encoder, self.movq, ] @@ -319,12 +313,6 @@ def __call__( dtype=prompt_embeds.dtype, device=device ) - # text_encoder_hidden_states, additive_clip_time_embeddings = self.text_proj( - # image_embeddings=image_embeds, - # prompt_embeds=prompt_embeds, - # text_encoder_hidden_states=text_encoder_hidden_states, - # ) - self.scheduler.set_timesteps(num_inference_steps, device=device) timesteps_tensor = self.scheduler.timesteps diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py index a5ea501cd874..7ce5df279ae7 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py @@ -33,7 +33,6 @@ randn_tensor, ) from .text_encoder import MultilingualCLIP -from .text_proj import KandinskyTextProjModel logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -74,8 +73,6 @@ class KandinskyImg2ImgPipeline(DiffusionPipeline): A scheduler to be used in combination with `unet` to generate image latents. unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the image embedding. - text_proj ([`KandinskyTextProjModel`]): - Utility class to prepare and combine the embeddings before they are passed to the decoder. movq ([`VQModel`]): MoVQ image encoder and decoder """ @@ -85,7 +82,6 @@ def __init__( text_encoder: MultilingualCLIP, movq: VQModel, tokenizer: XLMRobertaTokenizer, - text_proj: KandinskyTextProjModel, unet: UNet2DConditionModel, scheduler: DDIMScheduler, ): @@ -94,7 +90,6 @@ def __init__( self.register_modules( text_encoder=text_encoder, tokenizer=tokenizer, - text_proj=text_proj, unet=unet, scheduler=scheduler, movq=movq, @@ -247,7 +242,6 @@ def enable_sequential_cpu_offload(self, gpu_id=0): models = [ self.unet, - self.text_proj, self.text_encoder, self.movq, ] @@ -382,12 +376,6 @@ def __call__( dtype=prompt_embeds.dtype, device=device ) - text_encoder_hidden_states, additive_clip_time_embeddings = self.text_proj( - image_embeddings=image_embeds, - prompt_embeds=prompt_embeds, - text_encoder_hidden_states=text_encoder_hidden_states, - ) - # 4. 
pre-processing initial image if not isinstance(image, list): image = [image] @@ -436,11 +424,12 @@ def __call__( # expand the latents if we are doing classifier free guidance latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + added_cond_kwargs = {"text_embeds": prompt_embeds, "image_embeds": image_embeds} noise_pred = self.unet( sample=latent_model_input, timestep=t, encoder_hidden_states=text_encoder_hidden_states, - class_labels=additive_clip_time_embeddings, + added_cond_kwargs=added_cond_kwargs, ).sample if do_classifier_free_guidance: diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py index efdb9bf794b8..a98e63b7f063 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py @@ -35,7 +35,6 @@ randn_tensor, ) from .text_encoder import MultilingualCLIP -from .text_proj import KandinskyTextProjModel logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -203,8 +202,6 @@ class KandinskyInpaintPipeline(DiffusionPipeline): A scheduler to be used in combination with `unet` to generate image latents. unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the image embedding. - text_proj ([`KandinskyTextProjModel`]): - Utility class to prepare and combine the embeddings before they are passed to the decoder. movq ([`VQModel`]): MoVQ image encoder and decoder """ @@ -214,7 +211,6 @@ def __init__( text_encoder: MultilingualCLIP, movq: VQModel, tokenizer: XLMRobertaTokenizer, - text_proj: KandinskyTextProjModel, unet: UNet2DConditionModel, scheduler: UnCLIPScheduler, ): @@ -224,7 +220,6 @@ def __init__( text_encoder=text_encoder, movq=movq, tokenizer=tokenizer, - text_proj=text_proj, unet=unet, scheduler=scheduler, ) @@ -358,7 +353,6 @@ def enable_sequential_cpu_offload(self, gpu_id=0): models = [ self.unet, - self.text_proj, self.text_encoder, self.movq, ] @@ -463,12 +457,6 @@ def __call__( dtype=prompt_embeds.dtype, device=device ) - text_encoder_hidden_states, additive_clip_time_embeddings = self.text_proj( - image_embeddings=image_embeds, - prompt_embeds=prompt_embeds, - text_encoder_hidden_states=text_encoder_hidden_states, - ) - # preprocess image and mask ## Encode the image @@ -530,11 +518,12 @@ def __call__( latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents latent_model_input = torch.cat([latent_model_input, masked_image, mask_image], dim=1) + added_cond_kwargs = {"text_embeds": prompt_embeds, "image_embeds": image_embeds} noise_pred = self.unet( sample=latent_model_input, # [2, 9, 96, 96] timestep=t, encoder_hidden_states=text_encoder_hidden_states, - class_labels=additive_clip_time_embeddings, + added_cond_kwargs=added_cond_kwargs, ).sample # YiYi Notes: CFG is currently implemented exactly as original repo as a baseline, From 49f2ef7f281270c1f8979faa88982baccb19b079 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Fri, 19 May 2023 14:25:57 +0200 Subject: [PATCH 094/182] Remove text proj class --- src/diffusers/models/embeddings.py | 8 ++- src/diffusers/models/unet_2d_condition.py | 46 +++++++++--- src/diffusers/pipelines/kandinsky/__init__.py | 1 - .../pipelines/kandinsky/text_proj.py | 71 ------------------- 4 files changed, 42 insertions(+), 84 deletions(-) delete mode 100644 src/diffusers/pipelines/kandinsky/text_proj.py diff --git a/src/diffusers/models/embeddings.py 
b/src/diffusers/models/embeddings.py index b3b270921fde..0f9615f95e6d 100644 --- a/src/diffusers/models/embeddings.py +++ b/src/diffusers/models/embeddings.py @@ -361,7 +361,13 @@ def forward(self, labels, force_drop_ids=None): class TextImageProjection(nn.Module): - def __init__(self, text_embed_dim: int = 1024, image_embed_dim: int = 768, cross_attention_dim: int = 768, num_image_text_embeds: int = 10): + def __init__( + self, + text_embed_dim: int = 1024, + image_embed_dim: int = 768, + cross_attention_dim: int = 768, + num_image_text_embeds: int = 10, + ): super().__init__() self.num_image_text_embeds = num_image_text_embeds diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py index ea7877523a28..92e00bc557f6 100644 --- a/src/diffusers/models/unet_2d_condition.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -23,7 +23,14 @@ from ..loaders import UNet2DConditionLoadersMixin from ..utils import BaseOutput, logging from .attention_processor import AttentionProcessor, AttnProcessor -from .embeddings import GaussianFourierProjection, TextTimeEmbedding, TimestepEmbedding, Timesteps, TextImageTimeEmbedding, TextImageProjection +from .embeddings import ( + GaussianFourierProjection, + TextImageProjection, + TextImageTimeEmbedding, + TextTimeEmbedding, + TimestepEmbedding, + Timesteps, +) from .modeling_utils import ModelMixin from .unet_2d_blocks import ( CrossAttnDownBlock2D, @@ -90,9 +97,11 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin) cross_attention_dim (`int` or `Tuple[int]`, *optional*, defaults to 1280): The dimension of the cross attention features. encoder_hid_dim (`int`, *optional*, defaults to None): - If `encoder_hid_dim_type` is defined, `encoder_hidden_states` will be projected from `encoder_hid_dim` dimension to `cross_attention_dim`. + If `encoder_hid_dim_type` is defined, `encoder_hidden_states` will be projected from `encoder_hid_dim` + dimension to `cross_attention_dim`. encoder_hid_dim_type (`str`, *optional*, defaults to None): - If given, the `encoder_hidden_states` and potentially other embeddings will be down-projected to text embeddings of dimension `cross_attention` according to `encoder_hid_dim_type`. + If given, the `encoder_hidden_states` and potentially other embeddings will be down-projected to text + embeddings of dimension `cross_attention` according to `encoder_hid_dim_type`. attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads. resnet_time_scale_shift (`str`, *optional*, defaults to `"default"`): Time scale shift config for resnet blocks, see [`~models.resnet.ResnetBlock2D`]. Choose from `default` or `scale_shift`. @@ -255,7 +264,9 @@ def __init__( logger.info("encoder_hid_dim_type defaults to 'text_proj' as `encoder_hid_dim` is defined.") if encoder_hid_dim is None and encoder_hid_dim_type is not None: - raise ValueError(f"`encoder_hid_dim` has to be defined when `encoder_hid_dim_type` is set to {encoder_hid_dim_type}.") + raise ValueError( + f"`encoder_hid_dim` has to be defined when `encoder_hid_dim_type` is set to {encoder_hid_dim_type}." + ) if encoder_hid_dim_type == "text_proj": self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim) @@ -263,10 +274,16 @@ def __init__( # image_embed_dim DOESN'T have to be `cross_attention_dim`. 
To not clutter the __init__ too much
             # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
             # case when `addition_embed_type == "text_image_proj"` (Kadinsky 2.1)`
-            self.encoder_hid_proj = TextImageProjection(text_embed_dim=encoder_hid_dim, image_embed_dim=cross_attention_dim, cross_attention_dim=cross_attention_dim)
+            self.encoder_hid_proj = TextImageProjection(
+                text_embed_dim=encoder_hid_dim,
+                image_embed_dim=cross_attention_dim,
+                cross_attention_dim=cross_attention_dim,
+            )
 
         elif encoder_hid_dim_type is not None:
-            raise ValueError(f"encoder_hid_dim_type: {encoder_hid_dim_type} must be None, 'text_proj' or 'text_image_proj'.")
+            raise ValueError(
+                f"encoder_hid_dim_type: {encoder_hid_dim_type} must be None, 'text_proj' or 'text_image_proj'."
+            )
         else:
             self.encoder_hid_proj = None
 
@@ -312,7 +329,9 @@ def __init__(
             # text_embed_dim and image_embed_dim DON'T have to be `cross_attention_dim`. To not clutter the __init__ too much
             # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
             # case when `addition_embed_type == "text_image"` (Kadinsky 2.1)`
-            self.add_embedding = TextImageTimeEmbedding(text_embed_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, time_embed_dim=time_embed_dim)
+            self.add_embedding = TextImageTimeEmbedding(
+                text_embed_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, time_embed_dim=time_embed_dim
+            )
         elif addition_embed_type is not None:
             raise ValueError(f"addition_embed_type: {addition_embed_type} must be None, 'text' or 'text_image'.")
 
@@ -656,8 +675,9 @@ def forward(
                 `self.processor` in
                 [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
             added_cond_kwargs (`dict`, *optional*):
-                A kwargs dictionary that if specified includes additonal conditions that can be used for additonal time embeddings or
-                encoder hidden states projections. See the configurations `encoder_hid_dim_type` and `addition_embed_type` for more information.
+                A kwargs dictionary that if specified includes additional conditions that can be used for additional time
+                embeddings or encoder hidden states projections. See the configurations `encoder_hid_dim_type` and
+                `addition_embed_type` for more information.
 
         Returns:
             [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
@@ -737,7 +757,9 @@ def forward(
         elif self.config.addition_embed_type == "text_image":
             # Kadinsky 2.1 - style
             if "image_embeds" not in added_cond_kwargs:
-                raise ValueError(f"{self.__class__} has the config param `addition_embed_type` set to 'text_image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`")
+                raise ValueError(
+                    f"{self.__class__} has the config param `addition_embed_type` set to 'text_image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
+                )
 
             image_embs = added_cond_kwargs.get("image_embeds")
             text_embs = added_cond_kwargs.get("text_embeds", encoder_hidden_states)
@@ -753,7 +775,9 @@ def forward(
         elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_image_proj":
             # Kadinsky 2.1 - style
             if "image_embeds" not in added_cond_kwargs:
-                raise ValueError(f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'text_image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`")
+                raise ValueError(
+                    f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'text_image_proj' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
+                )
 
             image_embeds = added_cond_kwargs.get("image_embeds")
             encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states, image_embeds)
diff --git a/src/diffusers/pipelines/kandinsky/__init__.py b/src/diffusers/pipelines/kandinsky/__init__.py
index 49b13dafe990..c8eecba0c7f2 100644
--- a/src/diffusers/pipelines/kandinsky/__init__.py
+++ b/src/diffusers/pipelines/kandinsky/__init__.py
@@ -17,4 +17,3 @@
     from .pipeline_kandinsky_inpaint import KandinskyInpaintPipeline
     from .pipeline_kandinsky_prior import KandinskyPriorPipeline
     from .text_encoder import MultilingualCLIP
-    from .text_proj import KandinskyTextProjModel
diff --git a/src/diffusers/pipelines/kandinsky/text_proj.py b/src/diffusers/pipelines/kandinsky/text_proj.py
deleted file mode 100644
index ab985ebbd056..000000000000
--- a/src/diffusers/pipelines/kandinsky/text_proj.py
+++ /dev/null
@@ -1,71 +0,0 @@
-# Copyright 2023 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-from torch import nn
-
-from ...configuration_utils import ConfigMixin, register_to_config
-from ...models import ModelMixin
-
-
-class KandinskyTextProjModel(ModelMixin, ConfigMixin):
-    """
-    Utility class for Kandingsky text embeddings. Used to combine the image and text embeddings into a format usable by
-    the unet diffusion model. 
- """ - - @register_to_config - def __init__( - self, - *, - clip_extra_context_tokens: int = 10, - clip_text_encoder_hidden_states_dim: int = 1024, - clip_embeddings_dim: int = 768, - time_embed_dim: int = 1536, - cross_attention_dim: int = 768, - ): - super().__init__() - - # parameters for additional clip time embeddings - self.embedding_proj = nn.Linear(clip_embeddings_dim, time_embed_dim) - self.embedding_norm = nn.LayerNorm(time_embed_dim) - self.clip_image_embeddings_project_to_time_embeddings = nn.Linear(clip_embeddings_dim, time_embed_dim) - - # parameters for encoder hidden states - self.clip_extra_context_tokens = clip_extra_context_tokens - self.clip_extra_context_tokens_proj = nn.Linear( - clip_embeddings_dim, self.clip_extra_context_tokens * cross_attention_dim - ) - self.encoder_hidden_states_proj = nn.Linear(clip_text_encoder_hidden_states_dim, cross_attention_dim) - - def forward(self, *, image_embeddings, prompt_embeds, text_encoder_hidden_states): - # The image embeddings batch size and the text embeddings batch size are equal - assert image_embeddings.shape[0] == prompt_embeds.shape[0] == text_encoder_hidden_states.shape[0] - - batch_size = prompt_embeds.shape[0] - - # project text and image embeddings to add to the existing timestep embedding - time_projected_prompt_embeds = self.embedding_proj(prompt_embeds) - time_projected_prompt_embeds = self.embedding_norm(time_projected_prompt_embeds) - time_projected_image_embeddings = self.clip_image_embeddings_project_to_time_embeddings(image_embeddings) - additive_clip_time_embeddings = time_projected_image_embeddings + time_projected_prompt_embeds - - # extra tokens of context that are concatenated to the sequence of outputs from the GLIDE text encoder" - clip_extra_context_tokens = self.clip_extra_context_tokens_proj(image_embeddings) - clip_extra_context_tokens = clip_extra_context_tokens.reshape(batch_size, self.clip_extra_context_tokens, -1) - - text_encoder_hidden_states = self.encoder_hidden_states_proj(text_encoder_hidden_states) - text_encoder_hidden_states = torch.cat([clip_extra_context_tokens, text_encoder_hidden_states], dim=1) - - return text_encoder_hidden_states, additive_clip_time_embeddings From ed3d92e715bfc215bfff199fe379214b177a442a Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Fri, 19 May 2023 14:01:22 -1000 Subject: [PATCH 095/182] Apply suggestions from code review Co-authored-by: Patrick von Platen --- src/diffusers/models/attention_processor.py | 8 ++++---- src/diffusers/models/unet_2d_blocks.py | 4 ++-- src/diffusers/models/vae.py | 4 ++-- src/diffusers/models/vq_model.py | 2 +- .../kandinsky/pipeline_kandinsky_prior.py | 15 ++++++++------- tests/pipelines/kandinsky/test_kandinsky.py | 4 ++-- .../pipelines/kandinsky/test_kandinsky_img2img.py | 2 +- .../pipelines/kandinsky/test_kandinsky_inpaint.py | 2 +- 8 files changed, 21 insertions(+), 20 deletions(-) diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py index 409d2decbfe9..a17712b69b00 100644 --- a/src/diffusers/models/attention_processor.py +++ b/src/diffusers/models/attention_processor.py @@ -425,12 +425,12 @@ def __call__( hidden_states, encoder_hidden_states=None, attention_mask=None, - vq_emb=None, + temb=None, ): residual = hidden_states if attn.spatial_norm is not None: - hidden_states = attn.spatial_norm(hidden_states, vq_emb) + hidden_states = attn.spatial_norm(hidden_states, temb) input_ndim = hidden_states.ndim @@ -888,12 +888,12 @@ def __call__( hidden_states, 
encoder_hidden_states=None, attention_mask=None, - vq_emb=None, + temb=None, ): residual = hidden_states if attn.spatial_norm is not None: - hidden_states = attn.spatial_norm(hidden_states, vq_emb) + hidden_states = attn.spatial_norm(hidden_states, temb) input_ndim = hidden_states.ndim diff --git a/src/diffusers/models/unet_2d_blocks.py b/src/diffusers/models/unet_2d_blocks.py index eb9731f68c0f..03e2d7a24cb8 100644 --- a/src/diffusers/models/unet_2d_blocks.py +++ b/src/diffusers/models/unet_2d_blocks.py @@ -469,7 +469,7 @@ def forward(self, hidden_states, temb=None): hidden_states = self.resnets[0](hidden_states, temb) for attn, resnet in zip(self.attentions, self.resnets[1:]): if attn is not None: - hidden_states = attn(hidden_states, vq_emb=temb) + hidden_states = attn(hidden_states, temb=temb) hidden_states = resnet(hidden_states, temb) return hidden_states @@ -2183,7 +2183,7 @@ def __init__( def forward(self, hidden_states, temb=None): for resnet, attn in zip(self.resnets, self.attentions): hidden_states = resnet(hidden_states, temb=temb) - hidden_states = attn(hidden_states, vq_emb=temb) + hidden_states = attn(hidden_states, temb=temb) if self.upsamplers is not None: for upsampler in self.upsamplers: diff --git a/src/diffusers/models/vae.py b/src/diffusers/models/vae.py index e7d45ff606cf..4e20a2797321 100644 --- a/src/diffusers/models/vae.py +++ b/src/diffusers/models/vae.py @@ -183,7 +183,7 @@ def __init__( resnet_eps=1e-6, resnet_act_fn=act_fn, output_scale_factor=1, - resnet_time_scale_shift=norm_type, + resnet_time_scale_shift="default" if norm_type == "group" else norm_type, attn_num_head_channels=None, resnet_groups=norm_num_groups, temb_channels=temb_channels, @@ -225,7 +225,7 @@ def __init__( self.gradient_checkpointing = False - def forward(self, z, zq=None): + def forward(self, z, latent_embeds=None): sample = z sample = self.conv_in(sample) diff --git a/src/diffusers/models/vq_model.py b/src/diffusers/models/vq_model.py index 53a3ecc2b2d2..73158294ee6e 100644 --- a/src/diffusers/models/vq_model.py +++ b/src/diffusers/models/vq_model.py @@ -82,7 +82,7 @@ def __init__( norm_num_groups: int = 32, vq_embed_dim: Optional[int] = None, scaling_factor: float = 0.18215, - norm_type: str = "default", # default, spatial + norm_type: str = "group", # group, spatial ): super().__init__() diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py index 022497676913..6abdd00db6a5 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py @@ -81,7 +81,8 @@ def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): latents = latents * scheduler.init_noise_sigma return latents - def create_zero_img_emb(self, batch_size, device): + def get_zero_embed(self, batch_size=1, device=None): + device = device or self.device zero_img = torch.zeros(1, 3, self.image_encoder.config.image_size, self.image_encoder.config.image_size).to( device=device, dtype=self.image_encoder.dtype ) @@ -228,7 +229,7 @@ def _encode_prompt( @torch.no_grad() def __call__( self, - prompt, + prompt: Union[str, List[str]], num_images_per_prompt: int = 1, num_inference_steps: int = 5, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, @@ -249,10 +250,6 @@ def __call__( batch_size = batch_size * num_images_per_prompt - if prompt == "" or prompt[0] == "": - image_embeddings = self.create_zero_img_emb(batch_size=batch_size, 
device=device) - - else: do_classifier_free_guidance = guidance_scale > 1.0 prompt_embeds, text_encoder_hidden_states, text_mask = self._encode_prompt( prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt @@ -309,14 +306,18 @@ def __call__( latents = self.prior.post_process_latents(latents) image_embeddings = latents + zero_embeds = self.get_zero_embed(latents.shape[0], device=latents.device) # YiYi's notes: ## Prior Pipeline should always return a tensor that can be used in text2img/img2img/inpainting pipelines ## However need np type for testing purpose if output_type == "np": image_embeddings = image_embeddings.cpu().numpy() + zero_embeds = zero_embeds.cpu().numpy() + elif output_type != "pt": + raise ValueError(f"output_type={output_type} is not supported. Only 'pt' or 'np' is supported.") if not return_dict: return (image_embeddings,) - return ImagePipelineOutput(images=image_embeddings) + return ImagePipelineOutput(images=image_embeddings, zero_embeds=zeros_embeds) diff --git a/tests/pipelines/kandinsky/test_kandinsky.py b/tests/pipelines/kandinsky/test_kandinsky.py index 73f667b36b85..22160590b920 100644 --- a/tests/pipelines/kandinsky/test_kandinsky.py +++ b/tests/pipelines/kandinsky/test_kandinsky.py @@ -81,7 +81,7 @@ def cross_attention_dim(self): # YiYi's TO-DO: add a tiny tokenizer? @property def dummy_tokenizer(self): - tokenizer = XLMRobertaTokenizer.from_pretrained("YiYiXu/Kandinsky", subfolder="tokenizer") + tokenizer = XLMRobertaTokenizerFast.from_pretrained("YiYiXu/tiny-random-mclip-base") return tokenizer @property @@ -94,7 +94,7 @@ def dummy_text_encoder(self): intermediate_size=37, num_attention_heads=4, num_hidden_layers=5, - vocab_size=250002, + vocab_size=1005, ) text_encoder = MultilingualCLIP(config) diff --git a/tests/pipelines/kandinsky/test_kandinsky_img2img.py b/tests/pipelines/kandinsky/test_kandinsky_img2img.py index 6855a012f630..0883cf7a025d 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_img2img.py +++ b/tests/pipelines/kandinsky/test_kandinsky_img2img.py @@ -96,7 +96,7 @@ def dummy_text_encoder(self): intermediate_size=37, num_attention_heads=4, num_hidden_layers=5, - vocab_size=250002, + vocab_size=1005, ) text_encoder = MultilingualCLIP(config) diff --git a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py index 923b17d7657a..d7e06ce7a786 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py +++ b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py @@ -97,7 +97,7 @@ def dummy_text_encoder(self): intermediate_size=37, num_attention_heads=4, num_hidden_layers=5, - vocab_size=250002, + vocab_size=1005, ) text_encoder = MultilingualCLIP(config) From 38fb834f64ba1d768f054c7fdf3ff8f7bce28f9a Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 20 May 2023 01:44:10 +0000 Subject: [PATCH 096/182] fix --- src/diffusers/models/vae.py | 16 +- .../kandinsky/pipeline_kandinsky_prior.py | 150 ++++++++++-------- 2 files changed, 91 insertions(+), 75 deletions(-) diff --git a/src/diffusers/models/vae.py b/src/diffusers/models/vae.py index 4e20a2797321..3e105a14fdf1 100644 --- a/src/diffusers/models/vae.py +++ b/src/diffusers/models/vae.py @@ -241,37 +241,37 @@ def custom_forward(*inputs): if is_torch_version(">=", "1.11.0"): # middle sample = torch.utils.checkpoint.checkpoint( - create_custom_forward(self.mid_block), sample, zq, use_reentrant=False + create_custom_forward(self.mid_block), sample, latent_embeds, use_reentrant=False ) sample = sample.to(upscale_dtype) # 
up for up_block in self.up_blocks: sample = torch.utils.checkpoint.checkpoint( - create_custom_forward(up_block), sample, zq, use_reentrant=False + create_custom_forward(up_block), sample, latent_embeds, use_reentrant=False ) else: # middle - sample = torch.utils.checkpoint.checkpoint(create_custom_forward(self.mid_block), sample, zq) + sample = torch.utils.checkpoint.checkpoint(create_custom_forward(self.mid_block), sample, latent_embeds) sample = sample.to(upscale_dtype) # up for up_block in self.up_blocks: - sample = torch.utils.checkpoint.checkpoint(create_custom_forward(up_block), sample, zq) + sample = torch.utils.checkpoint.checkpoint(create_custom_forward(up_block), sample, latent_embeds) else: # middle - sample = self.mid_block(sample, zq) + sample = self.mid_block(sample, latent_embeds) sample = sample.to(upscale_dtype) # up for up_block in self.up_blocks: - sample = up_block(sample, zq) + sample = up_block(sample, latent_embeds) # post-process - if zq is None: + if latent_embeds is None: sample = self.conv_norm_out(sample) else: - sample = self.conv_norm_out(sample, zq) + sample = self.conv_norm_out(sample, latent_embeds) sample = self.conv_act(sample) sample = self.conv_out(sample) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py index 6abdd00db6a5..bc6f38c4211e 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py @@ -13,8 +13,10 @@ # limitations under the License. from typing import List, Optional, Union +from dataclasses import dataclass import torch +import numpy as np from transformers import CLIPTextModelWithProjection, CLIPTokenizer, CLIPVisionModelWithProjection from ...models import PriorTransformer @@ -22,6 +24,7 @@ from ...pipelines.pipeline_utils import ImagePipelineOutput from ...schedulers import UnCLIPScheduler from ...utils import ( + BaseOutput, is_accelerate_available, logging, randn_tensor, @@ -30,6 +33,20 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name +@dataclass +class KandinskyPriorPipelineOutput(BaseOutput): + """ + Output class for KandinskyPriorPipeline. 
+ + Args: + images (`torch.FloatTensor`) + clip image embeddings for text prompt + zero_embeds (`List[PIL.Image.Image]` or `np.ndarray`) + clip image embeddings for unconditional tokens + """ + + images: Union[torch.FloatTensor, np.ndarray] + zero_embeds: Union[torch.FloatTensor, np.ndarray] class KandinskyPriorPipeline(DiffusionPipeline): """ @@ -250,74 +267,73 @@ def __call__( batch_size = batch_size * num_images_per_prompt - do_classifier_free_guidance = guidance_scale > 1.0 - prompt_embeds, text_encoder_hidden_states, text_mask = self._encode_prompt( - prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt - ) + do_classifier_free_guidance = guidance_scale > 1.0 + prompt_embeds, text_encoder_hidden_states, text_mask = self._encode_prompt( + prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt + ) - # prior - self.scheduler.set_timesteps(num_inference_steps, device=device) - prior_timesteps_tensor = self.scheduler.timesteps + # prior + self.scheduler.set_timesteps(num_inference_steps, device=device) + prior_timesteps_tensor = self.scheduler.timesteps - embedding_dim = self.prior.config.embedding_dim + embedding_dim = self.prior.config.embedding_dim - latents = self.prepare_latents( - (batch_size, embedding_dim), - prompt_embeds.dtype, - device, - generator, - latents, - self.scheduler, - ) + latents = self.prepare_latents( + (batch_size, embedding_dim), + prompt_embeds.dtype, + device, + generator, + latents, + self.scheduler, + ) + + for i, t in enumerate(self.progress_bar(prior_timesteps_tensor)): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + + predicted_image_embedding = self.prior( + latent_model_input, + timestep=t, + proj_embedding=prompt_embeds, + encoder_hidden_states=text_encoder_hidden_states, + attention_mask=text_mask, + ).predicted_image_embedding + + if do_classifier_free_guidance: + predicted_image_embedding_uncond, predicted_image_embedding_text = predicted_image_embedding.chunk( + 2 + ) + predicted_image_embedding = predicted_image_embedding_uncond + guidance_scale * ( + predicted_image_embedding_text - predicted_image_embedding_uncond + ) + + if i + 1 == prior_timesteps_tensor.shape[0]: + prev_timestep = None + else: + prev_timestep = prior_timesteps_tensor[i + 1] + + latents = self.scheduler.step( + predicted_image_embedding, + timestep=t, + sample=latents, + generator=generator, + prev_timestep=prev_timestep, + ).prev_sample + + latents = self.prior.post_process_latents(latents) + + image_embeddings = latents + zero_embeds = self.get_zero_embed(latents.shape[0], device=latents.device) + + ## Prior Pipeline should always return a tensor that can be used in text2img/img2img/inpainting pipelines + ## However need np type for testing purpose + if output_type == "np": + image_embeddings = image_embeddings.cpu().numpy() + zero_embeds = zero_embeds.cpu().numpy() + elif output_type != "pt": + raise ValueError(f"output_type={output_type} is not supported. 
Only 'pt' or 'np' is supported.") + + if not return_dict: + return (image_embeddings,) - for i, t in enumerate(self.progress_bar(prior_timesteps_tensor)): - # expand the latents if we are doing classifier free guidance - latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents - - predicted_image_embedding = self.prior( - latent_model_input, - timestep=t, - proj_embedding=prompt_embeds, - encoder_hidden_states=text_encoder_hidden_states, - attention_mask=text_mask, - ).predicted_image_embedding - - if do_classifier_free_guidance: - predicted_image_embedding_uncond, predicted_image_embedding_text = predicted_image_embedding.chunk( - 2 - ) - predicted_image_embedding = predicted_image_embedding_uncond + guidance_scale * ( - predicted_image_embedding_text - predicted_image_embedding_uncond - ) - - if i + 1 == prior_timesteps_tensor.shape[0]: - prev_timestep = None - else: - prev_timestep = prior_timesteps_tensor[i + 1] - - latents = self.scheduler.step( - predicted_image_embedding, - timestep=t, - sample=latents, - generator=generator, - prev_timestep=prev_timestep, - ).prev_sample - - latents = self.prior.post_process_latents(latents) - - image_embeddings = latents - zero_embeds = self.get_zero_embed(latents.shape[0], device=latents.device) - - # YiYi's notes: - ## Prior Pipeline should always return a tensor that can be used in text2img/img2img/inpainting pipelines - ## However need np type for testing purpose - if output_type == "np": - image_embeddings = image_embeddings.cpu().numpy() - zero_embeds = zero_embeds.cpu().numpy() - elif output_type != "pt": - raise ValueError(f"output_type={output_type} is not supported. Only 'pt' or 'np' is supported.") - - if not return_dict: - return (image_embeddings,) - - return ImagePipelineOutput(images=image_embeddings, zero_embeds=zeros_embeds) + return KandinskyPriorPipelineOutput(images=image_embeddings, zero_embeds=zero_embeds) From 0d28355a516bf5895a63c0812bca69566ed7c276 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 20 May 2023 03:22:40 +0000 Subject: [PATCH 097/182] add interpolate method --- .../kandinsky/pipeline_kandinsky_prior.py | 81 +++++++++++++++++++ 1 file changed, 81 insertions(+) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py index bc6f38c4211e..38673461a929 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py @@ -15,6 +15,8 @@ from typing import List, Optional, Union from dataclasses import dataclass +import PIL + import torch import numpy as np from transformers import CLIPTextModelWithProjection, CLIPTokenizer, CLIPVisionModelWithProjection @@ -30,9 +32,25 @@ randn_tensor, ) +from torchvision import transforms logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +def _convert_image_to_rgb(image): + return image.convert("RGB") + +image_transforms = transforms.Compose( + [ + transforms.Resize(224, interpolation=transforms.InterpolationMode.BICUBIC), + transforms.CenterCrop(224), + _convert_image_to_rgb, + transforms.ToTensor(), + transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)), + ] + ) + + @dataclass class KandinskyPriorPipelineOutput(BaseOutput): """ @@ -87,6 +105,69 @@ def __init__( image_encoder=image_encoder, ) + def interpolate( + self, + images_and_prompts: List[Union[str, PIL.Image.Image, torch.FloatTensor]], + weights: List[float], + 
num_images_per_prompt: int = 1, + num_inference_steps: int = 5, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + negative_prior_prompt: Optional[Union[str, List[str]]] = None, + negative_prompt: Union[str, List[str]] = "", + guidance_scale: float = 4.0, + output_type: Optional[str] = "pt", # pt only + return_dict: bool = True, + device = None, + ): + device = device or self.device + + if len(images_and_prompts) != len(weights): + raise ValueError(f"`images_and_prompts` contains {len(images_and_prompts)} items and `weights` contains {len(weights)} items - they should be lists of same length") + + image_embeddings = [] + for cond, weight in zip(images_and_prompts, weights): + if isinstance(cond, str): + # this is for testing only, normally we should pass it as argument + generator = torch.Generator(device='cuda').manual_seed(0) + + image_emb = self.__call__( + cond, + num_inference_steps=num_inference_steps, + num_images_per_prompt=num_images_per_prompt, + generator=generator, + latents=latents, + negative_prompt=negative_prior_prompt, + guidance_scale=guidance_scale, + ).images + + elif isinstance(cond, (PIL.Image.Image, torch.Tensor)): + + if isinstance(cond, PIL.Image.Image): + cond = image_transforms(cond).unsqueeze(0).to(dtype = self.image_encoder.dtype, device=device) + + image_emb = self.image_encoder(cond)["image_embeds"] + + else: + raise ValueError(f"`images_and_prompts` can only contains elements to be of type `str`, `PIL.Image.Image` or `torch.Tensor` but is {type(cond)}") + + image_embeddings.append(image_emb * weight) + + image_emb = torch.cat(image_embeddings).sum(dim=0, keepdim=True) + + out_zero = self.__call__( + negative_prompt, + num_inference_steps=num_inference_steps, + num_images_per_prompt=num_images_per_prompt, + generator=generator, + latents=latents, + negative_prompt=negative_prior_prompt, + guidance_scale=guidance_scale, + ) + zero_image_emb = out_zero.zero_embeds if negative_prompt == "" else out_zero.images + + return image_emb, zero_image_emb + def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): if latents is None: latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) From 419e466b594568a831c463fa67df94d0e2edcfd6 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 20 May 2023 03:27:45 +0000 Subject: [PATCH 098/182] make style --- src/diffusers/models/vae.py | 4 +- .../kandinsky/pipeline_kandinsky_prior.py | 69 ++++++++++--------- tests/pipelines/kandinsky/test_kandinsky.py | 2 +- 3 files changed, 39 insertions(+), 36 deletions(-) diff --git a/src/diffusers/models/vae.py b/src/diffusers/models/vae.py index 3e105a14fdf1..88f22a16f41d 100644 --- a/src/diffusers/models/vae.py +++ b/src/diffusers/models/vae.py @@ -252,7 +252,9 @@ def custom_forward(*inputs): ) else: # middle - sample = torch.utils.checkpoint.checkpoint(create_custom_forward(self.mid_block), sample, latent_embeds) + sample = torch.utils.checkpoint.checkpoint( + create_custom_forward(self.mid_block), sample, latent_embeds + ) sample = sample.to(upscale_dtype) # up diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py index 38673461a929..c8a4093176af 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py @@ -12,18 +12,17 @@ # See the License for the specific language governing permissions and # 
limitations under the License. -from typing import List, Optional, Union from dataclasses import dataclass +from typing import List, Optional, Union +import numpy as np import PIL - import torch -import numpy as np +from torchvision import transforms from transformers import CLIPTextModelWithProjection, CLIPTokenizer, CLIPVisionModelWithProjection from ...models import PriorTransformer from ...pipelines import DiffusionPipeline -from ...pipelines.pipeline_utils import ImagePipelineOutput from ...schedulers import UnCLIPScheduler from ...utils import ( BaseOutput, @@ -32,7 +31,6 @@ randn_tensor, ) -from torchvision import transforms logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -40,15 +38,16 @@ def _convert_image_to_rgb(image): return image.convert("RGB") + image_transforms = transforms.Compose( - [ - transforms.Resize(224, interpolation=transforms.InterpolationMode.BICUBIC), - transforms.CenterCrop(224), - _convert_image_to_rgb, - transforms.ToTensor(), - transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)), - ] - ) + [ + transforms.Resize(224, interpolation=transforms.InterpolationMode.BICUBIC), + transforms.CenterCrop(224), + _convert_image_to_rgb, + transforms.ToTensor(), + transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)), + ] +) @dataclass @@ -66,6 +65,7 @@ class KandinskyPriorPipelineOutput(BaseOutput): images: Union[torch.FloatTensor, np.ndarray] zero_embeds: Union[torch.FloatTensor, np.ndarray] + class KandinskyPriorPipeline(DiffusionPipeline): """ Pipeline for generate image prior for Kandinsky @@ -106,9 +106,9 @@ def __init__( ) def interpolate( - self, - images_and_prompts: List[Union[str, PIL.Image.Image, torch.FloatTensor]], - weights: List[float], + self, + images_and_prompts: List[Union[str, PIL.Image.Image, torch.FloatTensor]], + weights: List[float], num_images_per_prompt: int = 1, num_inference_steps: int = 5, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, @@ -118,39 +118,42 @@ def interpolate( guidance_scale: float = 4.0, output_type: Optional[str] = "pt", # pt only return_dict: bool = True, - device = None, + device=None, ): device = device or self.device if len(images_and_prompts) != len(weights): - raise ValueError(f"`images_and_prompts` contains {len(images_and_prompts)} items and `weights` contains {len(weights)} items - they should be lists of same length") - + raise ValueError( + f"`images_and_prompts` contains {len(images_and_prompts)} items and `weights` contains {len(weights)} items - they should be lists of same length" + ) + image_embeddings = [] for cond, weight in zip(images_and_prompts, weights): if isinstance(cond, str): - # this is for testing only, normally we should pass it as argument - generator = torch.Generator(device='cuda').manual_seed(0) + # this is for testing only, normally we should pass it as argument + generator = torch.Generator(device="cuda").manual_seed(0) image_emb = self.__call__( - cond, + cond, num_inference_steps=num_inference_steps, num_images_per_prompt=num_images_per_prompt, generator=generator, latents=latents, negative_prompt=negative_prior_prompt, guidance_scale=guidance_scale, - ).images - + ).images + elif isinstance(cond, (PIL.Image.Image, torch.Tensor)): - if isinstance(cond, PIL.Image.Image): - cond = image_transforms(cond).unsqueeze(0).to(dtype = self.image_encoder.dtype, device=device) - + cond = image_transforms(cond).unsqueeze(0).to(dtype=self.image_encoder.dtype, 
device=device) + image_emb = self.image_encoder(cond)["image_embeds"] - + else: - raise ValueError(f"`images_and_prompts` can only contains elements to be of type `str`, `PIL.Image.Image` or `torch.Tensor` but is {type(cond)}") - + raise ValueError( + f"`images_and_prompts` can only contains elements to be of type `str`, `PIL.Image.Image` or `torch.Tensor` but is {type(cond)}" + ) + image_embeddings.append(image_emb * weight) image_emb = torch.cat(image_embeddings).sum(dim=0, keepdim=True) @@ -163,7 +166,7 @@ def interpolate( latents=latents, negative_prompt=negative_prior_prompt, guidance_scale=guidance_scale, - ) + ) zero_image_emb = out_zero.zero_embeds if negative_prompt == "" else out_zero.images return image_emb, zero_image_emb @@ -381,9 +384,7 @@ def __call__( ).predicted_image_embedding if do_classifier_free_guidance: - predicted_image_embedding_uncond, predicted_image_embedding_text = predicted_image_embedding.chunk( - 2 - ) + predicted_image_embedding_uncond, predicted_image_embedding_text = predicted_image_embedding.chunk(2) predicted_image_embedding = predicted_image_embedding_uncond + guidance_scale * ( predicted_image_embedding_text - predicted_image_embedding_uncond ) diff --git a/tests/pipelines/kandinsky/test_kandinsky.py b/tests/pipelines/kandinsky/test_kandinsky.py index 22160590b920..9d31c767b5c4 100644 --- a/tests/pipelines/kandinsky/test_kandinsky.py +++ b/tests/pipelines/kandinsky/test_kandinsky.py @@ -19,7 +19,7 @@ import numpy as np import torch -from transformers import XLMRobertaTokenizer +from transformers import XLMRobertaTokenizerFast from diffusers import KandinskyPipeline, KandinskyPriorPipeline, UnCLIPScheduler, UNet2DConditionModel, VQModel from diffusers.pipelines.kandinsky.text_encoder import MCLIPConfig, MultilingualCLIP From 52d694419fc3b8de12c2c0b198d85d28a2d2c819 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 20 May 2023 03:41:02 +0000 Subject: [PATCH 099/182] fix --- src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py index c8a4093176af..c710e03d16f9 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py @@ -130,8 +130,6 @@ def interpolate( image_embeddings = [] for cond, weight in zip(images_and_prompts, weights): if isinstance(cond, str): - # this is for testing only, normally we should pass it as argument - generator = torch.Generator(device="cuda").manual_seed(0) image_emb = self.__call__( cond, From bd65d6416cefbb20c8eba88442d1d36e41d51c51 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 20 May 2023 06:29:55 +0000 Subject: [PATCH 100/182] add docstring and post-processing to text2img --- .../pipelines/kandinsky/pipeline_kandinsky.py | 99 +++++++++++++++++-- .../kandinsky/pipeline_kandinsky_prior.py | 1 - 2 files changed, 93 insertions(+), 7 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py index 268024335940..cadb548249ee 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py @@ -28,12 +28,44 @@ is_accelerate_version, logging, randn_tensor, + replace_example_docstring, ) from .text_encoder import MultilingualCLIP logger = logging.get_logger(__name__) # pylint: disable=invalid-name +EXAMPLE_DOC_STRING = """ 
+ Examples: + ```py + >>> from diffusers import KandinskyPipeline, KandinskyPriorPipeline + >>> import torch + + >>> pipe_prior = KandinskyPriorPipeline.from_pretrained("YiYiXu/Kandinsky-prior") + >>> pipe_prior.to("cuda") + + >>> out = pipe_prior(prompt, generator=generator,) + >>> image_emb = out.images + >>> zero_image_emb = out.zero_embeds + + >>> pipe = KandinskyPipeline.from_pretrained("YiYiXu/Kandinsky") + >>> pipe.to("cuda) + + >>> prompt= "red cat, 4k photo" + + >>> image = pipe( + ... prompt, + ... image_embeds=image_emb, + ... negative_image_embeds =zero_image_emb, + ... height=768, + ... width=768, + ... num_inference_steps=100, + ... ).images + + >>> image[0].save("cat.png") + ``` +""" + def get_new_h_w(h, w, scale_factor=8): new_h = h // scale_factor**2 @@ -268,11 +300,12 @@ def _execution_device(self): return self.device @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( self, prompt: Union[str, List[str]], - image_embeds: torch.FloatTensor, - negative_image_embeds: torch.FloatTensor, + image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], + negative_image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], height: int = 512, width: int = 512, num_inference_steps: int = 100, @@ -284,6 +317,53 @@ def __call__( output_type: Optional[str] = "pil", return_dict: bool = True, ): + """ + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`): + The prompt or prompts to guide the image generation. + image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + The clip image embeddings for text prompt, that will be used to condition the image generation. + negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + The clip image embeddings for negative text prompt, will be used to condition the image generation. + height (`int`, *optional*, defaults to 512): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to 512): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. 
If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between: + `"pil"` (`PIL.Image.Image`), `"np"` (`np.array`) or `"pt"` (`torch.Tensor`). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. + + Examples: + + Returns: + [`~pipelines.ImagePipelineOutput`] or `tuple` + """ + if isinstance(prompt, str): batch_size = 1 elif isinstance(prompt, list): @@ -375,13 +455,20 @@ def __call__( # post-processing image = self.movq.decode(latents, force_not_quantize=True)["sample"] - image = image * 0.5 + 0.5 - image = image.clamp(0, 1) - image = image.cpu().permute(0, 2, 3, 1).float().numpy() + if output_type not in ["pt", "np", "pil"]: + raise ValueError( + f"the output_type {output_type} is not supported. Currently we only support: " + "`pil`, `np`, `pt`" + ) + + if output_type in ['np', 'pil']: + image = image * 0.5 + 0.5 + image = image.clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() if output_type == "pil": image = self.numpy_to_pil(image) - + if not return_dict: return (image,) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py index c710e03d16f9..9f6d5ceab343 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py @@ -130,7 +130,6 @@ def interpolate( image_embeddings = [] for cond, weight in zip(images_and_prompts, weights): if isinstance(cond, str): - image_emb = self.__call__( cond, num_inference_steps=num_inference_steps, From 6bc0aea928ae7ffe3c4c8dc1dcfbc123585c2c5f Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 20 May 2023 20:45:59 +0000 Subject: [PATCH 101/182] fix doc string for inpaint --- .../kandinsky/pipeline_kandinsky_inpaint.py | 116 +++++++++++++++++- 1 file changed, 111 insertions(+), 5 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py index a98e63b7f063..650c4db999eb 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py @@ -33,12 +33,54 @@ is_accelerate_version, logging, randn_tensor, + replace_example_docstring, ) from .text_encoder import MultilingualCLIP logger = logging.get_logger(__name__) # pylint: disable=invalid-name +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import KandinskyInpaintPipeline, KandinskyPriorPipeline + >>> from diffusers.utils import load_image + >>> import torch + + >>> pipe_prior = KandinskyPriorPipeline.from_pretrained("YiYiXu/Kandinsky-prior") + >>> pipe_prior.to("cuda") + + >>> out = pipe_prior(prompt) + >>> image_emb = out.images + >>> zero_image_emb = out.zero_embeds + + >>> pipe = KandinskyInpaintPipeline.from_pretrained("YiYiXu/Kandinsky-inpaint") + >>> pipe.to("cuda) + + >>> prompt= "red cat, 4k photo" + >>> init_image = load_image( + ... "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + ... "/kandinsky/cat.png") + + >>> mask = np.ones((768, 768), dtype=np.float32) + >>> mask[:250,250:-250] = 0 + + >>> out = pipe( + ... prompt, + ... image=init_image, + ... mask_image=mask, + ... image_embeds=image_emb, + ... 
negative_image_embeds =zero_image_emb, + ... height=768, + ... width=768, + ... num_inference_steps=150, + ... ) + + >>> image = out.images[0] + >>> image.save("cat_with_hat.png") + ``` +""" + def get_new_h_w(h, w, scale_factor=8): new_h = h // scale_factor**2 @@ -76,7 +118,7 @@ def prepare_mask(masks): def prepare_mask_and_masked_image(image, mask, height, width): r""" - Prepares a pair (image, mask) to be consumed by the Kandinsky inpaint pipeline. This means that those inputs will + Prepares a pair (mask, image) to be consumed by the Kandinsky inpaint pipeline. This means that those inputs will be converted to ``torch.Tensor`` with shapes ``batch x channels x height x width`` where ``channels`` is ``3`` for the ``image`` and ``1`` for the ``mask``. @@ -90,7 +132,11 @@ def prepare_mask_and_masked_image(image, mask, height, width): mask (_type_): The mask to apply to the image, i.e. regions to inpaint. It can be a ``PIL.Image``, or a ``height x width`` ``np.array`` or a ``1 x height x width`` ``torch.Tensor`` or a ``batch x 1 x height x width`` ``torch.Tensor``. - + height (`int`, *optional*, defaults to 512): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to 512): + The width in pixels of the generated image. + Raises: ValueError: ``torch.Tensor`` images should be in the ``[-1, 1]`` range. ValueError: ``torch.Tensor`` mask @@ -99,7 +145,7 @@ def prepare_mask_and_masked_image(image, mask, height, width): (ot the other way around). Returns: - tuple[torch.Tensor]: The pair (mask, masked_image) as ``torch.Tensor`` with 4 + tuple[torch.Tensor]: The pair (mask, image) as ``torch.Tensor`` with 4 dimensions: ``batch x channels x height x width``. """ @@ -188,7 +234,7 @@ def prepare_mask_and_masked_image(image, mask, height, width): class KandinskyInpaintPipeline(DiffusionPipeline): """ - Pipeline for text-to-image generation using Kandinsky + Pipeline for text-guided image inpainting using Kandinsky2.1 This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) @@ -409,11 +455,12 @@ def _execution_device(self): return self.device @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( self, prompt: Union[str, List[str]], image: Union[torch.FloatTensor, PIL.Image.Image], - mask_image: Union[torch.FloatTensor, PIL.Image.Image], + mask_image: Union[torch.FloatTensor, PIL.Image.Image, np.ndarray], image_embeds: torch.FloatTensor, negative_image_embeds: torch.FloatTensor, height: int = 512, @@ -427,6 +474,65 @@ def __call__( output_type: Optional[str] = "pil", return_dict: bool = True, ): + + """ + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`): + The prompt or prompts to guide the image generation. + image (`torch.FloatTensor`, `PIL.Image.Image` or `np.ndarray`): + `Image`, or tensor representing an image batch, that will be used as the starting point for the + process. + mask_image (`PIL.Image.Image`,`torch.FloatTensor` or `np.ndarray`): + `Image`, or a tensor representing an image batch, to mask `image`. White pixels in the mask will be + repainted, while black pixels will be preserved. 
You can pass a pytorch tensor as mask only if + the image you passed is a pytorch tensor, and it should contain one color channel (L) instead of 3, + so the expected shape would be either `(B, 1, H, W,)`, `(B, H, W)`, `(1, H, W)` or `(H, W)` + If image is an PIL image or numpy array, mask should also be a either PIL image or numpy array. + If it is a PIL image, it will be converted to a single channel (luminance) before use. If it is a nummpy array, + the expected shape is `(H, W)`. + image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + The clip image embeddings for text prompt, that will be used to condition the image generation. + negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + The clip image embeddings for negative text prompt, will be used to condition the image generation. + height (`int`, *optional*, defaults to 512): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to 512): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between: + `"pil"` (`PIL.Image.Image`), `"np"` (`np.array`) or `"pt"` (`torch.Tensor`). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. 
+ + Examples: + + Returns: + [`~pipelines.ImagePipelineOutput`] or `tuple` + """ + # Define call parameters if isinstance(prompt, str): batch_size = 1 From 332cb27e7fb501aea6c67023de00871f166cdd95 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 20 May 2023 22:58:17 +0000 Subject: [PATCH 102/182] more docstring fix --- .../kandinsky/pipeline_kandinsky_img2img.py | 120 ++++++++++++++++-- .../kandinsky/pipeline_kandinsky_inpaint.py | 23 ++-- 2 files changed, 121 insertions(+), 22 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py index 7ce5df279ae7..9bad8d650f4f 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py @@ -37,6 +37,41 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import KandinskyImg2ImgPipeline, KandinskyPriorPipeline + >>> import torch + + >>> pipe_prior = KandinskyPriorPipeline.from_pretrained("YiYiXu/Kandinsky-prior") + >>> pipe_prior.to("cuda") + + >>> prompt= "A red cartoon frog, 4k" + >>> image_emb, zero_image_emb = pipe_prior(prompt, generator=generator, return_dict=False) + + >>> pipe = KandinskyImg2ImgPipeline.from_pretrained("YiYiXu/Kandinsky-img2img") + >>> pipe.to("cuda) + + >>> init_image = load_image( + ... "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + ... "/kandinsky/frog.png" + ... ) + + >>> image = pipe( + ... prompt, + ... image=init_image, + ... image_embeds=image_emb, + ... negative_image_embeds =zero_image_emb, + ... height=768, + ... width=768, + ... num_inference_steps=100, + ... strength=0.2, + ... ).images + + >>> image[0].save("cat_with_hat.png") + ``` +""" + def get_new_h_w(h, w, scale_factor=8): new_h = h // scale_factor**2 @@ -327,16 +362,17 @@ def add_noise( return noisy_samples @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( self, prompt: Union[str, List[str]], - image: Union[torch.FloatTensor, PIL.Image.Image], + image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]], image_embeds: torch.FloatTensor, negative_image_embeds: torch.FloatTensor, height: int = 512, width: int = 512, num_inference_steps: int = 100, - strength: float = 0.75, + strength: float = 0.3, guidance_scale: float = 7.0, num_images_per_prompt: int = 1, negative_prompt: Optional[Union[str, List[str]]] = None, @@ -344,7 +380,58 @@ def __call__( output_type: Optional[str] = "pil", return_dict: bool = True, ): - # 2. Define call parameters + """ + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`): + The prompt or prompts to guide the image generation. + image (`torch.FloatTensor`, `PIL.Image.Image`): + `Image`, or tensor representing an image batch, that will be used as the starting point for the + process. + image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + The clip image embeddings for text prompt, that will be used to condition the image generation. + negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + The clip image embeddings for negative text prompt, will be used to condition the image generation. + height (`int`, *optional*, defaults to 512): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to 512): + The width in pixels of the generated image. 
+ num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + strength (`float`, *optional*, defaults to 0.3): + Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` + will be used as a starting point, adding more noise to it the larger the `strength`. The number of + denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will + be maximum and the denoising process will run for the full number of iterations specified in + `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. + guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between: + `"pil"` (`PIL.Image.Image`), `"np"` (`np.array`) or `"pt"` (`torch.Tensor`). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. + + Examples: + + Returns: + [`~pipelines.ImagePipelineOutput`] or `tuple` + """ + # 1. Define call parameters if isinstance(prompt, str): batch_size = 1 elif isinstance(prompt, list): @@ -358,7 +445,7 @@ def __call__( do_classifier_free_guidance = guidance_scale > 1.0 - # 3. get text and image encoding + # 2. get text and image embeddings prompt_embeds, text_encoder_hidden_states, _ = self._encode_prompt( prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt ) @@ -376,7 +463,7 @@ def __call__( dtype=prompt_embeds.dtype, device=device ) - # 4. pre-processing initial image + # 3. pre-processing initial image if not isinstance(image, list): image = [image] if not all(isinstance(i, (PIL.Image.Image, torch.Tensor)) for i in image): @@ -390,7 +477,7 @@ def __call__( latents = self.movq.encode(image)["latents"] latents = latents.repeat_interleave(num_images_per_prompt, dim=0) - # 5. set timesteps + # 4. set timesteps self.scheduler.set_timesteps(num_inference_steps, device=device) # YiYi's Notes: This step is taken from the origianl Kandinsky repo @@ -408,7 +495,7 @@ def __call__( height, width = get_new_h_w(height, width, self.movq_scale_factor) - # 6. Create initial latent + # 5. Create initial latent latents = self.prepare_latents( latents, latent_timestep, @@ -419,7 +506,7 @@ def __call__( self.scheduler, ) - # 7. Denoising loop + # 6. 
Denoising loop for i, t in enumerate(self.progress_bar(timesteps_tensor)): # expand the latents if we are doing classifier free guidance latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents @@ -450,16 +537,23 @@ def __call__( generator=generator, ).prev_sample - # 8. post-processing + # 7. post-processing image = self.movq.decode(latents, force_not_quantize=True)["sample"] - image = image * 0.5 + 0.5 - image = image.clamp(0, 1) - image = image.cpu().permute(0, 2, 3, 1).float().numpy() + if output_type not in ["pt", "np", "pil"]: + raise ValueError( + f"the output_type {output_type} is not supported. Currently we only support: " + "`pil`, `np`, `pt`" + ) + + if output_type in ['np', 'pil']: + image = image * 0.5 + 0.5 + image = image.clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() if output_type == "pil": image = self.numpy_to_pil(image) - + if not return_dict: return (image,) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py index 650c4db999eb..2bc2416b3938 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py @@ -46,18 +46,16 @@ >>> from diffusers import KandinskyInpaintPipeline, KandinskyPriorPipeline >>> from diffusers.utils import load_image >>> import torch - + >>> pipe_prior = KandinskyPriorPipeline.from_pretrained("YiYiXu/Kandinsky-prior") >>> pipe_prior.to("cuda") - >>> out = pipe_prior(prompt) - >>> image_emb = out.images - >>> zero_image_emb = out.zero_embeds + >>> prompt= "red cat, 4k photo" + >>> image_emb, zero_image_emb = pipe_prior(prompt, return_dict=False) >>> pipe = KandinskyInpaintPipeline.from_pretrained("YiYiXu/Kandinsky-inpaint") >>> pipe.to("cuda) - >>> prompt= "red cat, 4k photo" >>> init_image = load_image( ... "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" ... "/kandinsky/cat.png") @@ -665,13 +663,20 @@ def __call__( # post-processing image = self.movq.decode(latents, force_not_quantize=True)["sample"] - image = image * 0.5 + 0.5 - image = image.clamp(0, 1) - image = image.cpu().permute(0, 2, 3, 1).float().numpy() + if output_type not in ["pt", "np", "pil"]: + raise ValueError( + f"the output_type {output_type} is not supported. Currently we only support: " + "`pil`, `np`, `pt`" + ) + + if output_type in ['np', 'pil']: + image = image * 0.5 + 0.5 + image = image.clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() if output_type == "pil": image = self.numpy_to_pil(image) - + if not return_dict: return (image,) From 214af0797effa87fe85efeeca0098fa8c84c62f9 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 20 May 2023 22:59:29 +0000 Subject: [PATCH 103/182] return a tuple in prior --- src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py index 9f6d5ceab343..bda9a531e2e0 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py @@ -413,6 +413,6 @@ def __call__( raise ValueError(f"output_type={output_type} is not supported. 
Only 'pt' or 'np' is supported.") if not return_dict: - return (image_embeddings,) + return (image_embeddings, zero_embeds) return KandinskyPriorPipelineOutput(images=image_embeddings, zero_embeds=zero_embeds) From d7c8c2ec6b091a8799ca14dc5068d7cb7a37b329 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sun, 21 May 2023 01:56:44 +0000 Subject: [PATCH 104/182] clean up --- docs/source/en/api/pipelines/kandinsky.mdx | 1 + .../pipelines/kandinsky/pipeline_kandinsky.py | 11 +- .../kandinsky/pipeline_kandinsky_img2img.py | 21 +-- .../kandinsky/pipeline_kandinsky_inpaint.py | 16 +- .../kandinsky/pipeline_kandinsky_prior.py | 165 ++++++++++++++++-- 5 files changed, 177 insertions(+), 37 deletions(-) diff --git a/docs/source/en/api/pipelines/kandinsky.mdx b/docs/source/en/api/pipelines/kandinsky.mdx index 808d2012a0ae..43537a342a1d 100644 --- a/docs/source/en/api/pipelines/kandinsky.mdx +++ b/docs/source/en/api/pipelines/kandinsky.mdx @@ -43,3 +43,4 @@ The Kandinsky model in diffusers comes from ai-forever and the original codebase [[autodoc]] KandinskyPriorPipeline - all - __call__ + - interpolate diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py index cadb548249ee..5022dff574bf 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py @@ -43,15 +43,14 @@ >>> pipe_prior = KandinskyPriorPipeline.from_pretrained("YiYiXu/Kandinsky-prior") >>> pipe_prior.to("cuda") - - >>> out = pipe_prior(prompt, generator=generator,) + + >>> prompt= "red cat, 4k photo" + >>> out = pipe_prior(prompt) >>> image_emb = out.images >>> zero_image_emb = out.zero_embeds >>> pipe = KandinskyPipeline.from_pretrained("YiYiXu/Kandinsky") - >>> pipe.to("cuda) - - >>> prompt= "red cat, 4k photo" + >>> pipe.to("cuda") >>> image = pipe( ... prompt, @@ -422,7 +421,7 @@ def __call__( added_cond_kwargs=added_cond_kwargs, ).sample - # YiYi Notes: CFG is currently implemented exactly as original repo as a baseline, + # CFG is currently implemented exactly as original repo as a baseline, # i.e. 
we apply cfg to predicted noise, and take predicted variance as it is (uncond + cond) # this means the our latent shape is batch_size *2 instad batch_size diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py index 9bad8d650f4f..5e909b52160b 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py @@ -27,6 +27,7 @@ from ...pipelines.pipeline_utils import ImagePipelineOutput from ...schedulers import DDIMScheduler from ...utils import ( + replace_example_docstring, is_accelerate_available, is_accelerate_version, logging, @@ -41,16 +42,17 @@ Examples: ```py >>> from diffusers import KandinskyImg2ImgPipeline, KandinskyPriorPipeline + >>> from diffusers.utils import load_image >>> import torch - >>> pipe_prior = KandinskyPriorPipeline.from_pretrained("YiYiXu/Kandinsky-prior") + >>> pipe_prior = KandinskyPriorPipeline.from_pretrained("YiYiXu/Kandinsky-prior", torch_dtype=torch.float16) >>> pipe_prior.to("cuda") >>> prompt= "A red cartoon frog, 4k" - >>> image_emb, zero_image_emb = pipe_prior(prompt, generator=generator, return_dict=False) + >>> image_emb, zero_image_emb = pipe_prior(prompt, return_dict=False) - >>> pipe = KandinskyImg2ImgPipeline.from_pretrained("YiYiXu/Kandinsky-img2img") - >>> pipe.to("cuda) + >>> pipe = KandinskyImg2ImgPipeline.from_pretrained("YiYiXu/Kandinsky-img2img", torch_dtype=torch.float16) + >>> pipe.to("cuda") >>> init_image = load_image( ... "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" @@ -68,7 +70,7 @@ ... strength=0.2, ... ).images - >>> image[0].save("cat_with_hat.png") + >>> image[0].save("red_frog.png") ``` """ @@ -153,10 +155,6 @@ def prepare_latents(self, latents, latent_timestep, shape, dtype, device, genera shape = latents.shape noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) - # get latents - # YiYi notes: I created a add_noise method on the pipeline to overwrite the one in schedule because - ## it use a different beta schedule for adding noise vs sampling - # latents = self.scheduler.add_noise(latents, noise, latent_timestep) latents = self.add_noise(latents, noise, latent_timestep) return latents @@ -332,15 +330,14 @@ def _execution_device(self): return torch.device(module._hf_hook.execution_device) return self.device - # YiYi's notes: Hard code this method here for now because the kandinsky repo use a different beta schedule for add noise + # add_noise method to overwrite the one in schedule because it use a different beta schedule for adding noise vs sampling def add_noise( self, original_samples: torch.FloatTensor, noise: torch.FloatTensor, timesteps: torch.IntTensor, ) -> torch.FloatTensor: - # Make sure alphas_cumprod and timestep have same device and dtype as original_samples - + betas = torch.linspace(0.0001, 0.02, 1000, dtype=torch.float32) alphas = 1.0 - betas alphas_cumprod = torch.cumprod(alphas, dim=0) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py index 2bc2416b3938..0b62a45450c8 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py @@ -46,15 +46,16 @@ >>> from diffusers import KandinskyInpaintPipeline, KandinskyPriorPipeline >>> from diffusers.utils import load_image >>> import torch + >>> import numpy as np - 
>>> pipe_prior = KandinskyPriorPipeline.from_pretrained("YiYiXu/Kandinsky-prior") + >>> pipe_prior = KandinskyPriorPipeline.from_pretrained("YiYiXu/Kandinsky-prior", torch_dtype=torch.float16) >>> pipe_prior.to("cuda") - >>> prompt= "red cat, 4k photo" + >>> prompt= "a hat" >>> image_emb, zero_image_emb = pipe_prior(prompt, return_dict=False) - >>> pipe = KandinskyInpaintPipeline.from_pretrained("YiYiXu/Kandinsky-inpaint") - >>> pipe.to("cuda) + >>> pipe = KandinskyInpaintPipeline.from_pretrained("YiYiXu/Kandinsky-inpaint", torch_dtype=torch.float16) + >>> pipe.to("cuda") >>> init_image = load_image( ... "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" @@ -562,8 +563,6 @@ def __call__( ) # preprocess image and mask - ## Encode the image - mask_image, image = prepare_mask_and_masked_image(image, mask_image, height, width) image = image.to(dtype=prompt_embeds.dtype, device=device) @@ -578,7 +577,6 @@ def __call__( mode="nearest", ) mask_image = prepare_mask(mask_image) - # apply mask on image masked_image = image * mask_image mask_image = mask_image.repeat_interleave(num_images_per_prompt, dim=0) @@ -624,13 +622,13 @@ def __call__( added_cond_kwargs = {"text_embeds": prompt_embeds, "image_embeds": image_embeds} noise_pred = self.unet( - sample=latent_model_input, # [2, 9, 96, 96] + sample=latent_model_input, timestep=t, encoder_hidden_states=text_encoder_hidden_states, added_cond_kwargs=added_cond_kwargs, ).sample - # YiYi Notes: CFG is currently implemented exactly as original repo as a baseline, + # CFG is currently implemented exactly as original repo as a baseline, # i.e. we apply cfg to predicted noise, and take predicted variance as it is (uncond + cond) # this means the our latent shape is batch_size *2 instad batch_size diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py index bda9a531e2e0..b2c10fe93789 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py @@ -29,16 +29,84 @@ is_accelerate_available, logging, randn_tensor, + replace_example_docstring, ) logger = logging.get_logger(__name__) # pylint: disable=invalid-name +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import KandinskyPipeline, KandinskyPriorPipeline + >>> import torch + + >>> pipe_prior = KandinskyPriorPipeline.from_pretrained("YiYiXu/Kandinsky-prior") + >>> pipe_prior.to("cuda") + + >>> prompt= "red cat, 4k photo" + >>> out = pipe_prior(prompt) + >>> image_emb = out.images + >>> zero_image_emb = out.zero_embeds + + >>> pipe = KandinskyPipeline.from_pretrained("YiYiXu/Kandinsky") + >>> pipe.to("cuda") + + >>> image = pipe( + ... prompt, + ... image_embeds=image_emb, + ... negative_image_embeds =zero_image_emb, + ... height=768, + ... width=768, + ... num_inference_steps=100, + ... ).images + + >>> image[0].save("cat.png") + ``` +""" + +EXAMPLE_INTERPOLATE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import KandinskyPriorPipeline, KandinskyPipeline + >>> from diffusers.utils import load_image + >>> import PIL + + >>> import torch + >>> from torchvision import transforms + + >>> pipe_prior = KandinskyPriorPipeline.from_pretrained("YiYiXu/Kandinsky-prior", torch_dtype=torch.float16) + >>> pipe_prior.to("cuda") + + >>> img1 = load_image("https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + ... 
"/kandinsky/cat.png") + + >>> img2 = load_image("https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + ... "/kandinsky/starry_night.jpeg") + + >>> images_texts = ["a cat", img1, img2 ] + >>> weights = [0.3,0.3,0.4] + >>> image_emb, zero_image_emb = pipe_prior.interpolate(images_texts, weights) + + >>> pipe = KandinskyPipeline.from_pretrained("YiYiXu/Kandinsky", torch_dtype=torch.float16) + >>> pipe.to("cuda") + + >>> image = pipe( + ... "", + ... image_embeds=image_emb, + ... negative_image_embeds =zero_image_emb, + ... height=768, + ... width=768, + ... num_inference_steps=150 + ... ).images[0] + + >>> image.save("starry_cat.png") + ``` +""" def _convert_image_to_rgb(image): return image.convert("RGB") - image_transforms = transforms.Compose( [ transforms.Resize(224, interpolation=transforms.InterpolationMode.BICUBIC), @@ -68,7 +136,7 @@ class KandinskyPriorPipelineOutput(BaseOutput): class KandinskyPriorPipeline(DiffusionPipeline): """ - Pipeline for generate image prior for Kandinsky + Pipeline for generating image prior for Kandinsky This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) @@ -104,7 +172,9 @@ def __init__( scheduler=scheduler, image_encoder=image_encoder, ) - + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_INTERPOLATE_DOC_STRING) def interpolate( self, images_and_prompts: List[Union[str, PIL.Image.Image, torch.FloatTensor]], @@ -113,13 +183,50 @@ def interpolate( num_inference_steps: int = 5, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, latents: Optional[torch.FloatTensor] = None, - negative_prior_prompt: Optional[Union[str, List[str]]] = None, - negative_prompt: Union[str, List[str]] = "", + negative_prior_prompt: Optional[str] = None, + negative_prompt: Union[str] = "", guidance_scale: float = 4.0, - output_type: Optional[str] = "pt", # pt only - return_dict: bool = True, device=None, ): + """ + Function invoked when using the prior pipeline for interpolation. + + Args: + images_and_prompts (`List[Union[str, PIL.Image.Image, torch.FloatTensor]]`): + list of prompts and images to guide the image generation. + weights: (`List[float]`): + list of weights for each condition in `images_and_prompts` + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + negative_prior_prompt (`str`, *optional*): + The prompt not to guide the prior diffusion process. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + negative_prompt (`str` or `List[str]`, *optional*): + The prompt not to guide the image generation. 
Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + + Examples: + + Returns: + `tuple` + """ + device = device or self.device if len(images_and_prompts) != len(weights): @@ -325,6 +432,7 @@ def _encode_prompt( return prompt_embeds, text_encoder_hidden_states, text_mask @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( self, prompt: Union[str, List[str]], @@ -337,6 +445,45 @@ def __call__( output_type: Optional[str] = "pt", # pt only return_dict: bool = True, ): + """ + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`): + The prompt or prompts to guide the image generation. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + output_type (`str`, *optional*, defaults to `"pt"`): + The output format of the generate image. Choose between: + `"np"` (`np.array`) or `"pt"` (`torch.Tensor`). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. 
+ + Examples: + + Returns: + [`KandinskyPriorPipelineOutput`] or `tuple` + """ + if isinstance(prompt, str): batch_size = 1 elif isinstance(prompt, list): @@ -404,9 +551,7 @@ def __call__( image_embeddings = latents zero_embeds = self.get_zero_embed(latents.shape[0], device=latents.device) - ## Prior Pipeline should always return a tensor that can be used in text2img/img2img/inpainting pipelines - ## However need np type for testing purpose - if output_type == "np": + if output_type == "np": image_embeddings = image_embeddings.cpu().numpy() zero_embeds = zero_embeds.cpu().numpy() elif output_type != "pt": From 506e0d1b5da61fa98353eae03dbe59c29fc20ce5 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sun, 21 May 2023 01:58:35 +0000 Subject: [PATCH 105/182] fix copies --- .../versatile_diffusion/modeling_text_unet.py | 71 +++++++++++++++++-- 1 file changed, 67 insertions(+), 4 deletions(-) diff --git a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py index 7aaa0e49e1da..858d7f77117c 100644 --- a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py +++ b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py @@ -182,7 +182,11 @@ class UNetFlatConditionModel(ModelMixin, ConfigMixin): cross_attention_dim (`int` or `Tuple[int]`, *optional*, defaults to 1280): The dimension of the cross attention features. encoder_hid_dim (`int`, *optional*, defaults to None): - If given, `encoder_hidden_states` will be projected from this dimension to `cross_attention_dim`. + If `encoder_hid_dim_type` is defined, `encoder_hidden_states` will be projected from `encoder_hid_dim` + dimension to `cross_attention_dim`. + encoder_hid_dim_type (`str`, *optional*, defaults to None): + If given, the `encoder_hidden_states` and potentially other embeddings will be down-projected to text + embeddings of dimension `cross_attention` according to `encoder_hid_dim_type`. attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads. resnet_time_scale_shift (`str`, *optional*, defaults to `"default"`): Time scale shift config for resnet blocks, see [`~models.resnet.ResnetBlockFlat`]. Choose from `default` or `scale_shift`. @@ -253,6 +257,7 @@ def __init__( norm_eps: float = 1e-5, cross_attention_dim: Union[int, Tuple[int]] = 1280, encoder_hid_dim: Optional[int] = None, + encoder_hid_dim_type: Optional[str] = None, attention_head_dim: Union[int, Tuple[int]] = 8, dual_cross_attention: bool = False, use_linear_projection: bool = False, @@ -350,8 +355,31 @@ def __init__( cond_proj_dim=time_cond_proj_dim, ) - if encoder_hid_dim is not None: + if encoder_hid_dim_type is None and encoder_hid_dim is not None: + encoder_hid_dim_type = "text_proj" + logger.info("encoder_hid_dim_type defaults to 'text_proj' as `encoder_hid_dim` is defined.") + + if encoder_hid_dim is None and encoder_hid_dim_type is not None: + raise ValueError( + f"`encoder_hid_dim` has to be defined when `encoder_hid_dim_type` is set to {encoder_hid_dim_type}." + ) + + if encoder_hid_dim_type == "text_proj": self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim) + elif encoder_hid_dim_type == "text_image_proj": + # image_embed_dim DOESN'T have to be `cross_attention_dim`. 
To not clutter the __init__ too much + # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use + # case when `addition_embed_type == "text_image_proj"` (Kadinsky 2.1)` + self.encoder_hid_proj = TextImageProjection( + text_embed_dim=encoder_hid_dim, + image_embed_dim=cross_attention_dim, + cross_attention_dim=cross_attention_dim, + ) + + elif encoder_hid_dim_type is not None: + raise ValueError( + f"encoder_hid_dim_type: {encoder_hid_dim_type} must be None, 'text_proj' or 'text_image_proj'." + ) else: self.encoder_hid_proj = None @@ -393,8 +421,15 @@ def __init__( self.add_embedding = TextTimeEmbedding( text_time_embedding_from_dim, time_embed_dim, num_heads=addition_embed_type_num_heads ) + elif addition_embed_type == "text_image": + # text_embed_dim and image_embed_dim DON'T have to be `cross_attention_dim`. To not clutter the __init__ too much + # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use + # case when `addition_embed_type == "text_image"` (Kadinsky 2.1)` + self.add_embedding = TextImageTimeEmbedding( + text_embed_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, time_embed_dim=time_embed_dim + ) elif addition_embed_type is not None: - raise ValueError(f"addition_embed_type: {addition_embed_type} must be None or 'text'.") + raise ValueError(f"addition_embed_type: {addition_embed_type} must be None, 'text' or 'text_image'.") if time_embedding_act_fn is None: self.time_embed_act = None @@ -719,6 +754,7 @@ def forward( timestep_cond: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, + added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None, down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None, mid_block_additional_residual: Optional[torch.Tensor] = None, return_dict: bool = True, @@ -734,6 +770,10 @@ def forward( A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under `self.processor` in [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). + added_cond_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified includes additonal conditions that can be used for additonal time + embeddings or encoder hidden states projections. See the configurations `encoder_hid_dim_type` and + `addition_embed_type` for more information. 
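A sketch of how the two new conditioning paths fit together at call time, mirroring the tiny configuration used in the tests later in this series (all dimensions, sequence lengths and tensor values are illustrative assumptions):

```py
import torch

from diffusers import UNet2DConditionModel


# `encoder_hid_dim_type="text_image_proj"` projects the text hidden states together with a CLIP
# image embedding into the cross-attention states; `addition_embed_type="text_image"` folds pooled
# text/image embeddings into the time embedding (Kandinsky 2.1 style).
unet = UNet2DConditionModel(
    in_channels=4,
    out_channels=8,
    addition_embed_type="text_image",
    down_block_types=("ResnetDownsampleBlock2D", "SimpleCrossAttnDownBlock2D"),
    up_block_types=("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"),
    mid_block_type="UNetMidBlock2DSimpleCrossAttn",
    block_out_channels=(32, 64),
    layers_per_block=1,
    encoder_hid_dim=32,
    encoder_hid_dim_type="text_image_proj",
    cross_attention_dim=100,
    attention_head_dim=4,
    resnet_time_scale_shift="scale_shift",
)

sample = torch.randn(1, 4, 16, 16)
encoder_hidden_states = torch.randn(1, 77, 32)  # text hidden states, last dim == encoder_hid_dim
added_cond_kwargs = {
    "image_embeds": torch.randn(1, 100),  # CLIP image embedding coming from the prior
    "text_embeds": torch.randn(1, 100),  # pooled text embedding
}
noise_pred = unet(
    sample, timestep=1, encoder_hidden_states=encoder_hidden_states, added_cond_kwargs=added_cond_kwargs
).sample  # shape (1, 8, 16, 16)
```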
Returns: [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`: @@ -810,12 +850,35 @@ def forward( if self.config.addition_embed_type == "text": aug_emb = self.add_embedding(encoder_hidden_states) emb = emb + aug_emb + elif self.config.addition_embed_type == "text_image": + # Kadinsky 2.1 - style + if "image_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'text_image' which requires" + " the keyword argument `image_embeds` to be passed in `added_cond_kwargs`" + ) + + image_embs = added_cond_kwargs.get("image_embeds") + text_embs = added_cond_kwargs.get("text_embeds", encoder_hidden_states) + + aug_emb = self.add_embedding(text_embs, image_embs) + emb = emb + aug_emb if self.time_embed_act is not None: emb = self.time_embed_act(emb) - if self.encoder_hid_proj is not None: + if self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_proj": encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states) + elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_image_proj": + # Kadinsky 2.1 - style + if "image_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'text_image_proj' which" + " requires the keyword argument `image_embeds` to be passed in `added_conditions`" + ) + + image_embeds = added_cond_kwargs.get("image_embeds") + encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states, image_embeds) # 2. pre-process sample = self.conv_in(sample) From e301190f73e612c6cbb4727390f05648977e7cb7 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sun, 21 May 2023 02:03:32 +0000 Subject: [PATCH 106/182] make style --- .../pipelines/kandinsky/pipeline_kandinsky.py | 33 +++++---- .../kandinsky/pipeline_kandinsky_img2img.py | 42 ++++++----- .../kandinsky/pipeline_kandinsky_inpaint.py | 45 ++++++------ .../kandinsky/pipeline_kandinsky_prior.py | 70 ++++++++++--------- .../versatile_diffusion/modeling_text_unet.py | 9 ++- 5 files changed, 104 insertions(+), 95 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py index 5022dff574bf..657854b231a1 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py @@ -43,8 +43,8 @@ >>> pipe_prior = KandinskyPriorPipeline.from_pretrained("YiYiXu/Kandinsky-prior") >>> pipe_prior.to("cuda") - - >>> prompt= "red cat, 4k photo" + + >>> prompt = "red cat, 4k photo" >>> out = pipe_prior(prompt) >>> image_emb = out.images >>> zero_image_emb = out.zero_embeds @@ -53,12 +53,12 @@ >>> pipe.to("cuda") >>> image = pipe( - ... prompt, - ... image_embeds=image_emb, - ... negative_image_embeds =zero_image_emb, - ... height=768, - ... width=768, - ... num_inference_steps=100, + ... prompt, + ... image_embeds=image_emb, + ... negative_image_embeds=zero_image_emb, + ... height=768, + ... width=768, + ... num_inference_steps=100, ... ).images >>> image[0].save("cat.png") @@ -321,7 +321,7 @@ def __call__( Args: prompt (`str` or `List[str]`): - The prompt or prompts to guide the image generation. + The prompt or prompts to guide the image generation. image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): The clip image embeddings for text prompt, that will be used to condition the image generation. 
negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): @@ -342,8 +342,8 @@ def __call__( num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). + The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored + if `guidance_scale` is less than `1`). generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. @@ -352,8 +352,8 @@ def __call__( generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between: - `"pil"` (`PIL.Image.Image`), `"np"` (`np.array`) or `"pt"` (`torch.Tensor`). + The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"` + (`np.array`) or `"pt"` (`torch.Tensor`). return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. @@ -456,18 +456,17 @@ def __call__( if output_type not in ["pt", "np", "pil"]: raise ValueError( - f"the output_type {output_type} is not supported. Currently we only support: " - "`pil`, `np`, `pt`" + f"the output_type {output_type} is not supported. Currently we only support: " "`pil`, `np`, `pt`" ) - if output_type in ['np', 'pil']: + if output_type in ["np", "pil"]: image = image * 0.5 + 0.5 image = image.clamp(0, 1) image = image.cpu().permute(0, 2, 3, 1).float().numpy() if output_type == "pil": image = self.numpy_to_pil(image) - + if not return_dict: return (image,) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py index 5e909b52160b..64218b59b6a4 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py @@ -27,11 +27,11 @@ from ...pipelines.pipeline_utils import ImagePipelineOutput from ...schedulers import DDIMScheduler from ...utils import ( - replace_example_docstring, is_accelerate_available, is_accelerate_version, logging, randn_tensor, + replace_example_docstring, ) from .text_encoder import MultilingualCLIP @@ -47,27 +47,27 @@ >>> pipe_prior = KandinskyPriorPipeline.from_pretrained("YiYiXu/Kandinsky-prior", torch_dtype=torch.float16) >>> pipe_prior.to("cuda") - - >>> prompt= "A red cartoon frog, 4k" + + >>> prompt = "A red cartoon frog, 4k" >>> image_emb, zero_image_emb = pipe_prior(prompt, return_dict=False) >>> pipe = KandinskyImg2ImgPipeline.from_pretrained("YiYiXu/Kandinsky-img2img", torch_dtype=torch.float16) >>> pipe.to("cuda") >>> init_image = load_image( - ... "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + ... "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" ... "/kandinsky/frog.png" ... ) >>> image = pipe( - ... prompt, - ... image=init_image, - ... image_embeds=image_emb, - ... negative_image_embeds =zero_image_emb, - ... height=768, - ... width=768, - ... num_inference_steps=100, - ... 
strength=0.2, + ... prompt, + ... image=init_image, + ... image_embeds=image_emb, + ... negative_image_embeds=zero_image_emb, + ... height=768, + ... width=768, + ... num_inference_steps=100, + ... strength=0.2, ... ).images >>> image[0].save("red_frog.png") @@ -337,7 +337,6 @@ def add_noise( noise: torch.FloatTensor, timesteps: torch.IntTensor, ) -> torch.FloatTensor: - betas = torch.linspace(0.0001, 0.02, 1000, dtype=torch.float32) alphas = 1.0 - betas alphas_cumprod = torch.cumprod(alphas, dim=0) @@ -382,7 +381,7 @@ def __call__( Args: prompt (`str` or `List[str]`): - The prompt or prompts to guide the image generation. + The prompt or prompts to guide the image generation. image (`torch.FloatTensor`, `PIL.Image.Image`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. @@ -412,14 +411,14 @@ def __call__( num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). + The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored + if `guidance_scale` is less than `1`). generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between: - `"pil"` (`PIL.Image.Image`), `"np"` (`np.array`) or `"pt"` (`torch.Tensor`). + The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"` + (`np.array`) or `"pt"` (`torch.Tensor`). return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. @@ -539,18 +538,17 @@ def __call__( if output_type not in ["pt", "np", "pil"]: raise ValueError( - f"the output_type {output_type} is not supported. Currently we only support: " - "`pil`, `np`, `pt`" + f"the output_type {output_type} is not supported. Currently we only support: " "`pil`, `np`, `pt`" ) - if output_type in ['np', 'pil']: + if output_type in ["np", "pil"]: image = image * 0.5 + 0.5 image = image.clamp(0, 1) image = image.cpu().permute(0, 2, 3, 1).float().numpy() if output_type == "pil": image = self.numpy_to_pil(image) - + if not return_dict: return (image,) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py index 0b62a45450c8..1409cabba558 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py @@ -47,29 +47,30 @@ >>> from diffusers.utils import load_image >>> import torch >>> import numpy as np - + >>> pipe_prior = KandinskyPriorPipeline.from_pretrained("YiYiXu/Kandinsky-prior", torch_dtype=torch.float16) >>> pipe_prior.to("cuda") - >>> prompt= "a hat" + >>> prompt = "a hat" >>> image_emb, zero_image_emb = pipe_prior(prompt, return_dict=False) >>> pipe = KandinskyInpaintPipeline.from_pretrained("YiYiXu/Kandinsky-inpaint", torch_dtype=torch.float16) >>> pipe.to("cuda") >>> init_image = load_image( - ... "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - ... "/kandinsky/cat.png") + ... 
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + ... "/kandinsky/cat.png" + ... ) >>> mask = np.ones((768, 768), dtype=np.float32) - >>> mask[:250,250:-250] = 0 + >>> mask[:250, 250:-250] = 0 >>> out = pipe( ... prompt, ... image=init_image, ... mask_image=mask, ... image_embeds=image_emb, - ... negative_image_embeds =zero_image_emb, + ... negative_image_embeds=zero_image_emb, ... height=768, ... width=768, ... num_inference_steps=150, @@ -135,7 +136,7 @@ def prepare_mask_and_masked_image(image, mask, height, width): The height in pixels of the generated image. width (`int`, *optional*, defaults to 512): The width in pixels of the generated image. - + Raises: ValueError: ``torch.Tensor`` images should be in the ``[-1, 1]`` range. ValueError: ``torch.Tensor`` mask @@ -473,24 +474,23 @@ def __call__( output_type: Optional[str] = "pil", return_dict: bool = True, ): - """ Function invoked when calling the pipeline for generation. Args: prompt (`str` or `List[str]`): - The prompt or prompts to guide the image generation. + The prompt or prompts to guide the image generation. image (`torch.FloatTensor`, `PIL.Image.Image` or `np.ndarray`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. mask_image (`PIL.Image.Image`,`torch.FloatTensor` or `np.ndarray`): `Image`, or a tensor representing an image batch, to mask `image`. White pixels in the mask will be - repainted, while black pixels will be preserved. You can pass a pytorch tensor as mask only if - the image you passed is a pytorch tensor, and it should contain one color channel (L) instead of 3, - so the expected shape would be either `(B, 1, H, W,)`, `(B, H, W)`, `(1, H, W)` or `(H, W)` - If image is an PIL image or numpy array, mask should also be a either PIL image or numpy array. - If it is a PIL image, it will be converted to a single channel (luminance) before use. If it is a nummpy array, - the expected shape is `(H, W)`. + repainted, while black pixels will be preserved. You can pass a pytorch tensor as mask only if the + image you passed is a pytorch tensor, and it should contain one color channel (L) instead of 3, so the + expected shape would be either `(B, 1, H, W,)`, `(B, H, W)`, `(1, H, W)` or `(H, W)` If image is an PIL + image or numpy array, mask should also be a either PIL image or numpy array. If it is a PIL image, it + will be converted to a single channel (luminance) before use. If it is a nummpy array, the expected + shape is `(H, W)`. image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): The clip image embeddings for text prompt, that will be used to condition the image generation. negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): @@ -511,8 +511,8 @@ def __call__( num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). + The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored + if `guidance_scale` is less than `1`). generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. @@ -521,8 +521,8 @@ def __call__( generation. 
Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between: - `"pil"` (`PIL.Image.Image`), `"np"` (`np.array`) or `"pt"` (`torch.Tensor`). + The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"` + (`np.array`) or `"pt"` (`torch.Tensor`). return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. @@ -663,18 +663,17 @@ def __call__( if output_type not in ["pt", "np", "pil"]: raise ValueError( - f"the output_type {output_type} is not supported. Currently we only support: " - "`pil`, `np`, `pt`" + f"the output_type {output_type} is not supported. Currently we only support: " "`pil`, `np`, `pt`" ) - if output_type in ['np', 'pil']: + if output_type in ["np", "pil"]: image = image * 0.5 + 0.5 image = image.clamp(0, 1) image = image.cpu().permute(0, 2, 3, 1).float().numpy() if output_type == "pil": image = self.numpy_to_pil(image) - + if not return_dict: return (image,) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py index b2c10fe93789..c0b25026ddc8 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py @@ -43,8 +43,8 @@ >>> pipe_prior = KandinskyPriorPipeline.from_pretrained("YiYiXu/Kandinsky-prior") >>> pipe_prior.to("cuda") - - >>> prompt= "red cat, 4k photo" + + >>> prompt = "red cat, 4k photo" >>> out = pipe_prior(prompt) >>> image_emb = out.images >>> zero_image_emb = out.zero_embeds @@ -53,12 +53,12 @@ >>> pipe.to("cuda") >>> image = pipe( - ... prompt, - ... image_embeds=image_emb, - ... negative_image_embeds =zero_image_emb, - ... height=768, - ... width=768, - ... num_inference_steps=100, + ... prompt, + ... image_embeds=image_emb, + ... negative_image_embeds=zero_image_emb, + ... height=768, + ... width=768, + ... num_inference_steps=100, ... ).images >>> image[0].save("cat.png") @@ -78,35 +78,41 @@ >>> pipe_prior = KandinskyPriorPipeline.from_pretrained("YiYiXu/Kandinsky-prior", torch_dtype=torch.float16) >>> pipe_prior.to("cuda") - >>> img1 = load_image("https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - ... "/kandinsky/cat.png") + >>> img1 = load_image( + ... "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + ... "/kandinsky/cat.png" + ... ) - >>> img2 = load_image("https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - ... "/kandinsky/starry_night.jpeg") + >>> img2 = load_image( + ... "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + ... "/kandinsky/starry_night.jpeg" + ... ) - >>> images_texts = ["a cat", img1, img2 ] - >>> weights = [0.3,0.3,0.4] + >>> images_texts = ["a cat", img1, img2] + >>> weights = [0.3, 0.3, 0.4] >>> image_emb, zero_image_emb = pipe_prior.interpolate(images_texts, weights) >>> pipe = KandinskyPipeline.from_pretrained("YiYiXu/Kandinsky", torch_dtype=torch.float16) >>> pipe.to("cuda") >>> image = pipe( - ... "", - ... image_embeds=image_emb, - ... negative_image_embeds =zero_image_emb, - ... height=768, - ... width=768, - ... num_inference_steps=150 + ... "", + ... image_embeds=image_emb, + ... 
negative_image_embeds=zero_image_emb, + ... height=768, + ... width=768, + ... num_inference_steps=150, ... ).images[0] >>> image.save("starry_cat.png") ``` """ + def _convert_image_to_rgb(image): return image.convert("RGB") + image_transforms = transforms.Compose( [ transforms.Resize(224, interpolation=transforms.InterpolationMode.BICUBIC), @@ -172,7 +178,7 @@ def __init__( scheduler=scheduler, image_encoder=image_encoder, ) - + @torch.no_grad() @replace_example_docstring(EXAMPLE_INTERPOLATE_DOC_STRING) def interpolate( @@ -193,7 +199,7 @@ def interpolate( Args: images_and_prompts (`List[Union[str, PIL.Image.Image, torch.FloatTensor]]`): - list of prompts and images to guide the image generation. + list of prompts and images to guide the image generation. weights: (`List[float]`): list of weights for each condition in `images_and_prompts` num_images_per_prompt (`int`, *optional*, defaults to 1): @@ -209,11 +215,11 @@ def interpolate( generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. negative_prior_prompt (`str`, *optional*): - The prompt not to guide the prior diffusion process. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). + The prompt not to guide the prior diffusion process. Ignored when not using guidance (i.e., ignored if + `guidance_scale` is less than `1`). negative_prompt (`str` or `List[str]`, *optional*): - The prompt not to guide the image generation. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). + The prompt not to guide the image generation. Ignored when not using guidance (i.e., ignored if + `guidance_scale` is less than `1`). guidance_scale (`float`, *optional*, defaults to 4.0): Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). `guidance_scale` is defined as `w` of equation 2. of [Imagen @@ -450,7 +456,7 @@ def __call__( Args: prompt (`str` or `List[str]`): - The prompt or prompts to guide the image generation. + The prompt or prompts to guide the image generation. num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. num_inference_steps (`int`, *optional*, defaults to 100): @@ -464,8 +470,8 @@ def __call__( generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). + The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored + if `guidance_scale` is less than `1`). guidance_scale (`float`, *optional*, defaults to 4.0): Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). `guidance_scale` is defined as `w` of equation 2. of [Imagen @@ -473,8 +479,8 @@ def __call__( 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, usually at the expense of lower image quality. output_type (`str`, *optional*, defaults to `"pt"`): - The output format of the generate image. Choose between: - `"np"` (`np.array`) or `"pt"` (`torch.Tensor`). + The output format of the generate image. 
Choose between: `"np"` (`np.array`) or `"pt"` + (`torch.Tensor`). return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. @@ -551,7 +557,7 @@ def __call__( image_embeddings = latents zero_embeds = self.get_zero_embed(latents.shape[0], device=latents.device) - if output_type == "np": + if output_type == "np": image_embeddings = image_embeddings.cpu().numpy() zero_embeds = zero_embeds.cpu().numpy() elif output_type != "pt": diff --git a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py index 858d7f77117c..bfa4352d48ae 100644 --- a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py +++ b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py @@ -15,7 +15,14 @@ AttnProcessor, ) from ...models.dual_transformer_2d import DualTransformer2DModel -from ...models.embeddings import GaussianFourierProjection, TextTimeEmbedding, TimestepEmbedding, Timesteps +from ...models.embeddings import ( + GaussianFourierProjection, + TextImageProjection, + TextImageTimeEmbedding, + TextTimeEmbedding, + TimestepEmbedding, + Timesteps, +) from ...models.transformer_2d import Transformer2DModel from ...models.unet_2d_condition import UNet2DConditionOutput from ...utils import is_torch_version, logging From 056e3044495dbe8f33479d2e0f5509c7aea6aa7b Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Sat, 20 May 2023 16:04:48 -1000 Subject: [PATCH 107/182] Update src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py Co-authored-by: Patrick von Platen --- src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py index 1409cabba558..f3dfae26e2fc 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py @@ -426,7 +426,7 @@ def enable_model_cpu_offload(self, gpu_id=0): torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) hook = None - for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]: + for cpu_offloaded_model in [self.text_encoder, self.unet, self.movq]: _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) if self.safety_checker is not None: From b691b6361dad34dc3f1dd44eb0208b734cee7265 Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Sat, 20 May 2023 16:07:09 -1000 Subject: [PATCH 108/182] Update src/diffusers/models/vae.py Co-authored-by: Patrick von Platen --- src/diffusers/models/vae.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/models/vae.py b/src/diffusers/models/vae.py index 88f22a16f41d..dd4af0efcfd9 100644 --- a/src/diffusers/models/vae.py +++ b/src/diffusers/models/vae.py @@ -159,7 +159,7 @@ def __init__( layers_per_block=2, norm_num_groups=32, act_fn="silu", - norm_type="default", # default, spatial + norm_type="group", # group, spatial ): super().__init__() self.layers_per_block = layers_per_block From 58779426596645392d419cc3c2da0434a1b46a1c Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Sat, 20 May 2023 16:07:31 -1000 Subject: [PATCH 109/182] Update src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py Co-authored-by: Patrick von Platen --- 
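Several patches in this block replace `self.vae` with `self.movq` in `enable_model_cpu_offload`, since the Kandinsky pipelines decode latents with a MoVQ autoencoder registered as `movq` rather than a VAE. A usage sketch of the affected method (not part of the patch; the checkpoint id is the one used in the docstring examples above, and `accelerate` must be installed):

```py
import torch

from diffusers import KandinskyInpaintPipeline


pipe = KandinskyInpaintPipeline.from_pretrained("YiYiXu/Kandinsky-inpaint", torch_dtype=torch.float16)
# hooks text_encoder, unet and movq so each sub-model is moved to the GPU only while it runs
pipe.enable_model_cpu_offload()
```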
src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py index 64218b59b6a4..0c456226745c 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py @@ -302,7 +302,7 @@ def enable_model_cpu_offload(self, gpu_id=0): torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) hook = None - for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]: + for cpu_offloaded_model in [self.text_encoder, self.unet, self.movq]: _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) if self.safety_checker is not None: From 0fac55311f2b23cf30c7c76a3e02839d4a258297 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sun, 21 May 2023 03:40:00 +0000 Subject: [PATCH 110/182] fix copies + testing removing batch_size --- src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py | 5 ++--- .../pipelines/kandinsky/pipeline_kandinsky_img2img.py | 1 - .../pipelines/kandinsky/pipeline_kandinsky_inpaint.py | 3 +-- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py index 657854b231a1..cf97081a8169 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py @@ -250,7 +250,6 @@ def enable_sequential_cpu_offload(self, gpu_id=0): if cpu_offloaded_model is not None: cpu_offload(cpu_offloaded_model, device) - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_model_cpu_offload def enable_model_cpu_offload(self, gpu_id=0): r""" Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared @@ -270,7 +269,7 @@ def enable_model_cpu_offload(self, gpu_id=0): torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) hook = None - for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]: + for cpu_offloaded_model in [self.text_encoder, self.unet, self.movq]: _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) if self.safety_checker is not None: @@ -446,7 +445,7 @@ def __call__( latent_model_input, prev_timestep=prev_timestep, generator=generator, - batch_size=batch_size, + #batch_size=batch_size, ).prev_sample _, latents = latents.chunk(2) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py index 0c456226745c..18615203d64f 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py @@ -282,7 +282,6 @@ def enable_sequential_cpu_offload(self, gpu_id=0): if cpu_offloaded_model is not None: cpu_offload(cpu_offloaded_model, device) - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_model_cpu_offload def enable_model_cpu_offload(self, gpu_id=0): r""" Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. 
Compared diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py index f3dfae26e2fc..cbdeb1146768 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py @@ -406,7 +406,6 @@ def enable_sequential_cpu_offload(self, gpu_id=0): if cpu_offloaded_model is not None: cpu_offload(cpu_offloaded_model, device) - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_model_cpu_offload def enable_model_cpu_offload(self, gpu_id=0): r""" Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared @@ -653,7 +652,7 @@ def __call__( torch.cat([latents] * 2) if do_classifier_free_guidance else latents, prev_timestep=prev_timestep, generator=generator, - batch_size=batch_size, + #batch_size=batch_size, ).prev_sample _, latents = latents.chunk(2) From 440a2796001cbe2dbb796e968077733009cb1708 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sun, 21 May 2023 03:46:59 +0000 Subject: [PATCH 111/182] update post-processing for prior pipeline --- .../pipelines/kandinsky/pipeline_kandinsky_prior.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py index c0b25026ddc8..6eccbec6d64f 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py @@ -25,6 +25,7 @@ from ...pipelines import DiffusionPipeline from ...schedulers import UnCLIPScheduler from ...utils import ( + deprecate, BaseOutput, is_accelerate_available, logging, @@ -556,12 +557,18 @@ def __call__( image_embeddings = latents zero_embeds = self.get_zero_embed(latents.shape[0], device=latents.device) + + if output_type not in [ "pt", "np"]: + deprecation_message = ( + f"the output_type {output_type} is outdated and has been set to `np`. Please make sure to set it to one of these instead: " + "`np`, `pt``" + ) + deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False) + output_type = "np" if output_type == "np": image_embeddings = image_embeddings.cpu().numpy() zero_embeds = zero_embeds.cpu().numpy() - elif output_type != "pt": - raise ValueError(f"output_type={output_type} is not supported. 
Only 'pt' or 'np' is supported.") if not return_dict: return (image_embeddings, zero_embeds) From 74a6be88ee35310d6bf0e8ee81dd06a83f15c30b Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sun, 21 May 2023 03:50:32 +0000 Subject: [PATCH 112/182] update tests (specify steps and cfg arguments) --- tests/pipelines/kandinsky/test_kandinsky_prior.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/pipelines/kandinsky/test_kandinsky_prior.py b/tests/pipelines/kandinsky/test_kandinsky_prior.py index 3eabd2b0f0d1..cdc946f1b744 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_prior.py +++ b/tests/pipelines/kandinsky/test_kandinsky_prior.py @@ -159,7 +159,9 @@ def get_dummy_inputs(self, device, seed=0): generator = torch.Generator(device=device).manual_seed(seed) inputs = { "prompt": "horse", + "num_inference_steps": 5, "generator": generator, + "guidance_scale": 4.0, "num_inference_steps": 2, "output_type": "np", } From 6e3e6af446deb66c6d48cad60111634a3279de30 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sun, 21 May 2023 04:36:54 +0000 Subject: [PATCH 113/182] add back batch_size --- .../pipelines/kandinsky/pipeline_kandinsky.py | 10 +++++-- .../kandinsky/pipeline_kandinsky_inpaint.py | 2 +- tests/pipelines/kandinsky/test_kandinsky.py | 29 +++++-------------- .../kandinsky/test_kandinsky_prior.py | 1 - 4 files changed, 15 insertions(+), 27 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py index cf97081a8169..6e75113d4a95 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py @@ -24,6 +24,7 @@ from ...pipelines.pipeline_utils import ImagePipelineOutput from ...schedulers import UnCLIPScheduler from ...utils import ( + deprecate, is_accelerate_available, is_accelerate_version, logging, @@ -445,7 +446,7 @@ def __call__( latent_model_input, prev_timestep=prev_timestep, generator=generator, - #batch_size=batch_size, + batch_size=batch_size, ).prev_sample _, latents = latents.chunk(2) @@ -454,9 +455,12 @@ def __call__( image = self.movq.decode(latents, force_not_quantize=True)["sample"] if output_type not in ["pt", "np", "pil"]: - raise ValueError( - f"the output_type {output_type} is not supported. Currently we only support: " "`pil`, `np`, `pt`" + deprecation_message = ( + f"the output_type {output_type} is outdated and has been set to `np`. 
Please make sure to set it to one of these instead: " + "`np`, `pt`, `pil` " ) + deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False) + output_type = "np" if output_type in ["np", "pil"]: image = image * 0.5 + 0.5 diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py index cbdeb1146768..2e334fb9473d 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py @@ -652,7 +652,7 @@ def __call__( torch.cat([latents] * 2) if do_classifier_free_guidance else latents, prev_timestep=prev_timestep, generator=generator, - #batch_size=batch_size, + batch_size=batch_size, ).prev_sample _, latents = latents.chunk(2) diff --git a/tests/pipelines/kandinsky/test_kandinsky.py b/tests/pipelines/kandinsky/test_kandinsky.py index 9d31c767b5c4..8d85eaa7b969 100644 --- a/tests/pipelines/kandinsky/test_kandinsky.py +++ b/tests/pipelines/kandinsky/test_kandinsky.py @@ -23,7 +23,6 @@ from diffusers import KandinskyPipeline, KandinskyPriorPipeline, UnCLIPScheduler, UNet2DConditionModel, VQModel from diffusers.pipelines.kandinsky.text_encoder import MCLIPConfig, MultilingualCLIP -from diffusers.pipelines.kandinsky.text_proj import KandinskyTextProjModel from diffusers.utils import floats_tensor, load_numpy, slow, torch_device from diffusers.utils.testing_utils import require_torch_gpu @@ -78,7 +77,6 @@ def time_embed_dim(self): def cross_attention_dim(self): return 100 - # YiYi's TO-DO: add a tiny tokenizer? @property def dummy_tokenizer(self): tokenizer = XLMRobertaTokenizerFast.from_pretrained("YiYiXu/tiny-random-mclip-base") @@ -102,21 +100,6 @@ def dummy_text_encoder(self): return text_encoder - @property - def dummy_text_proj(self): - torch.manual_seed(0) - - model_kwargs = { - "clip_embeddings_dim": self.cross_attention_dim, - "time_embed_dim": self.time_embed_dim, - "clip_extra_context_tokens": 2, - "cross_attention_dim": self.cross_attention_dim, - "clip_text_encoder_hidden_states_dim": self.text_embedder_hidden_size, - } - - model = KandinskyTextProjModel(**model_kwargs) - return model - @property def dummy_unet(self): torch.manual_seed(0) @@ -125,15 +108,18 @@ def dummy_unet(self): "in_channels": 4, # Out channels is double in channels because predicts mean and variance "out_channels": 8, + "addition_embed_type": "text_image", "down_block_types": ("ResnetDownsampleBlock2D", "SimpleCrossAttnDownBlock2D"), "up_block_types": ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"), "mid_block_type": "UNetMidBlock2DSimpleCrossAttn", "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2), "layers_per_block": 1, + "encoder_hid_dim": self.text_embedder_hidden_size, + "encoder_hid_dim_type": "text_image_proj", "cross_attention_dim": self.cross_attention_dim, "attention_head_dim": 4, "resnet_time_scale_shift": "scale_shift", - "class_embed_type": "identity", + "class_embed_type": None } model = UNet2DConditionModel(**model_kwargs) @@ -168,7 +154,6 @@ def get_dummy_components(self): text_encoder = self.dummy_text_encoder tokenizer = self.dummy_tokenizer unet = self.dummy_unet - text_proj = self.dummy_text_proj movq = self.dummy_movq scheduler = UnCLIPScheduler( @@ -186,7 +171,6 @@ def get_dummy_components(self): ) components = { - "text_proj": text_proj, "text_encoder": text_encoder, "tokenizer": tokenizer, "unet": unet, @@ -209,6 +193,7 @@ def get_dummy_inputs(self, device, seed=0): 
"generator": generator, "height": 64, "width": 64, + "guidance_scale": 4.0, "num_inference_steps": 2, "output_type": "np", } @@ -241,8 +226,8 @@ def test_kandinsky(self): [0.50759643, 0.50876284, 0.4554392, 0.5594512, 0.53785735, 0.44757918, 0.4388101, 0.46746832, 0.4886209] ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2, f" expected_slice {expected_slice}, but got {image_slice.flatten()}" + assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2, f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}" @slow diff --git a/tests/pipelines/kandinsky/test_kandinsky_prior.py b/tests/pipelines/kandinsky/test_kandinsky_prior.py index cdc946f1b744..534da34262e3 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_prior.py +++ b/tests/pipelines/kandinsky/test_kandinsky_prior.py @@ -159,7 +159,6 @@ def get_dummy_inputs(self, device, seed=0): generator = torch.Generator(device=device).manual_seed(seed) inputs = { "prompt": "horse", - "num_inference_steps": 5, "generator": generator, "guidance_scale": 4.0, "num_inference_steps": 2, From 71fa2bf086e9d19a8cdd5c0a03597cde62e64446 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sun, 21 May 2023 05:39:59 +0000 Subject: [PATCH 114/182] update tests --- .../pipelines/kandinsky/pipeline_kandinsky.py | 2 +- .../kandinsky/pipeline_kandinsky_img2img.py | 20 +++++----- .../kandinsky/pipeline_kandinsky_inpaint.py | 8 +++- tests/pipelines/kandinsky/test_kandinsky.py | 2 +- .../kandinsky/test_kandinsky_img2img.py | 37 ++++++------------- .../kandinsky/test_kandinsky_inpaint.py | 32 ++++++---------- 6 files changed, 40 insertions(+), 61 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py index 6e75113d4a95..3cd06d9ef280 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py @@ -415,7 +415,7 @@ def __call__( added_cond_kwargs = {"text_embeds": prompt_embeds, "image_embeds": image_embeds} noise_pred = self.unet( - sample=latent_model_input, # [2, 4, 96, 96] + sample=latent_model_input, timestep=t, encoder_hidden_states=text_encoder_hidden_states, added_cond_kwargs=added_cond_kwargs, diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py index 18615203d64f..1a83b402e620 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py @@ -27,6 +27,7 @@ from ...pipelines.pipeline_utils import ImagePipelineOutput from ...schedulers import DDIMScheduler from ...utils import ( + deprecate, is_accelerate_available, is_accelerate_version, logging, @@ -475,13 +476,12 @@ def __call__( # 4. 
set timesteps self.scheduler.set_timesteps(num_inference_steps, device=device) - # YiYi's Notes: This step is taken from the origianl Kandinsky repo - # add one to get the final alpha values right (the ones from first scale to data during sampling)) + # This step is taken from the origianl Kandinsky repo + # - add one to get the final alpha values right (the ones from first scale to data during sampling)) self.scheduler.timesteps = self.scheduler.timesteps + 1 timesteps_tensor, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device) - # YiYi's notes - # the timestep for add_noise is calculated different in original repo (this formular is taken from the original repo) + # the formular to calculate timestep for add_noise is taken from the original kandinsky repo latent_timestep = int(self.scheduler.config.num_train_timesteps * strength) - 2 latent_timestep = torch.tensor([latent_timestep] * batch_size, dtype=timesteps_tensor.dtype, device=device) @@ -519,11 +519,6 @@ def __call__( noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - if i + 1 == timesteps_tensor.shape[0]: - pass - else: - timesteps_tensor[i + 1] - # compute the previous noisy sample x_t -> x_t-1 latents = self.scheduler.step( noise_pred, @@ -536,9 +531,12 @@ def __call__( image = self.movq.decode(latents, force_not_quantize=True)["sample"] if output_type not in ["pt", "np", "pil"]: - raise ValueError( - f"the output_type {output_type} is not supported. Currently we only support: " "`pil`, `np`, `pt`" + deprecation_message = ( + f"the output_type {output_type} is outdated and has been set to `np`. Please make sure to set it to one of these instead: " + "`np`, `pt`, `pil` " ) + deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False) + output_type = "np" if output_type in ["np", "pil"]: image = image * 0.5 + 0.5 diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py index 2e334fb9473d..2f9f1f94fb42 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py @@ -29,6 +29,7 @@ from ...pipelines.pipeline_utils import ImagePipelineOutput from ...schedulers import UnCLIPScheduler from ...utils import ( + deprecate, is_accelerate_available, is_accelerate_version, logging, @@ -661,9 +662,12 @@ def __call__( image = self.movq.decode(latents, force_not_quantize=True)["sample"] if output_type not in ["pt", "np", "pil"]: - raise ValueError( - f"the output_type {output_type} is not supported. Currently we only support: " "`pil`, `np`, `pt`" + deprecation_message = ( + f"the output_type {output_type} is outdated and has been set to `np`. 
Please make sure to set it to one of these instead: " + "`np`, `pt`, `pil` " ) + deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False) + output_type = "np" if output_type in ["np", "pil"]: image = image * 0.5 + 0.5 diff --git a/tests/pipelines/kandinsky/test_kandinsky.py b/tests/pipelines/kandinsky/test_kandinsky.py index 8d85eaa7b969..961dcaa980e8 100644 --- a/tests/pipelines/kandinsky/test_kandinsky.py +++ b/tests/pipelines/kandinsky/test_kandinsky.py @@ -223,7 +223,7 @@ def test_kandinsky(self): assert image.shape == (1, 64, 64, 3) expected_slice = np.array( - [0.50759643, 0.50876284, 0.4554392, 0.5594512, 0.53785735, 0.44757918, 0.4388101, 0.46746832, 0.4886209] + [0.4532004, 0.5363492, 0.48854294, 0.55743736, 0.572249, 0.45844495, 0.43908486, 0.46844718, 0.5048713 ] ) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2, f" expected_slice {expected_slice}, but got {image_slice.flatten()}" diff --git a/tests/pipelines/kandinsky/test_kandinsky_img2img.py b/tests/pipelines/kandinsky/test_kandinsky_img2img.py index 0883cf7a025d..5655df60d92c 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_img2img.py +++ b/tests/pipelines/kandinsky/test_kandinsky_img2img.py @@ -20,11 +20,10 @@ import numpy as np import torch from PIL import Image -from transformers import XLMRobertaTokenizer +from transformers import XLMRobertaTokenizerFast from diffusers import DDIMScheduler, KandinskyImg2ImgPipeline, KandinskyPriorPipeline, UNet2DConditionModel, VQModel from diffusers.pipelines.kandinsky.text_encoder import MCLIPConfig, MultilingualCLIP -from diffusers.pipelines.kandinsky.text_proj import KandinskyTextProjModel from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device from diffusers.utils.testing_utils import require_torch_gpu @@ -83,7 +82,7 @@ def cross_attention_dim(self): @property def dummy_tokenizer(self): - tokenizer = XLMRobertaTokenizer.from_pretrained("YiYiXu/Kandinsky", subfolder="tokenizer") + tokenizer = XLMRobertaTokenizerFast.from_pretrained("YiYiXu/tiny-random-mclip-base") return tokenizer @property @@ -104,21 +103,6 @@ def dummy_text_encoder(self): return text_encoder - @property - def dummy_text_proj(self): - torch.manual_seed(0) - - model_kwargs = { - "clip_embeddings_dim": self.cross_attention_dim, - "time_embed_dim": self.time_embed_dim, - "clip_extra_context_tokens": 2, - "cross_attention_dim": self.cross_attention_dim, - "clip_text_encoder_hidden_states_dim": self.text_embedder_hidden_size, - } - - model = KandinskyTextProjModel(**model_kwargs) - return model - @property def dummy_unet(self): torch.manual_seed(0) @@ -127,15 +111,18 @@ def dummy_unet(self): "in_channels": 4, # Out channels is double in channels because predicts mean and variance "out_channels": 8, + "addition_embed_type": "text_image", "down_block_types": ("ResnetDownsampleBlock2D", "SimpleCrossAttnDownBlock2D"), "up_block_types": ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"), "mid_block_type": "UNetMidBlock2DSimpleCrossAttn", "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2), "layers_per_block": 1, + "encoder_hid_dim": self.text_embedder_hidden_size, + "encoder_hid_dim_type": "text_image_proj", "cross_attention_dim": self.cross_attention_dim, "attention_head_dim": 4, "resnet_time_scale_shift": "scale_shift", - "class_embed_type": "identity", + "class_embed_type": None } model = UNet2DConditionModel(**model_kwargs) @@ -170,7 +157,6 @@ def get_dummy_components(self): text_encoder = 
self.dummy_text_encoder tokenizer = self.dummy_tokenizer unet = self.dummy_unet - text_proj = self.dummy_text_proj movq = self.dummy_movq ddim_config = { @@ -188,7 +174,6 @@ def get_dummy_components(self): scheduler = DDIMScheduler(**ddim_config) components = { - "text_proj": text_proj, "text_encoder": text_encoder, "tokenizer": tokenizer, "unet": unet, @@ -218,7 +203,9 @@ def get_dummy_inputs(self, device, seed=0): "generator": generator, "height": 64, "width": 64, - "num_inference_steps": 2, + "num_inference_steps": 10, + "guidance_scale": 7., + "strength": 0.2, "output_type": "np", } return inputs @@ -249,10 +236,10 @@ def test_kandinsky_img2img(self): assert image.shape == (1, 64, 64, 3) expected_slice = np.array( - [0.43521464, 0.668655, 0.41744298, 0.6815478, 0.44146872, 0.4427491, 0.50876176, 0.37860417, 0.5109416] + [0.61474943, 0.6073539, 0.43308544, 0.5928269, 0.47493595, 0.46755973, 0.4613838, 0.45368797, 0.50119233] ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2, f" expected_slice {expected_slice}, but got {image_slice.flatten()}" + assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2, f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}" @slow diff --git a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py index d7e06ce7a786..fbcecbf56e7c 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py +++ b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py @@ -20,11 +20,10 @@ import numpy as np import torch from PIL import Image -from transformers import XLMRobertaTokenizer +from transformers import XLMRobertaTokenizerFast from diffusers import KandinskyInpaintPipeline, KandinskyPriorPipeline, UnCLIPScheduler, UNet2DConditionModel, VQModel from diffusers.pipelines.kandinsky.text_encoder import MCLIPConfig, MultilingualCLIP -from diffusers.pipelines.kandinsky.text_proj import KandinskyTextProjModel from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device from diffusers.utils.testing_utils import require_torch_gpu @@ -84,7 +83,7 @@ def cross_attention_dim(self): @property def dummy_tokenizer(self): - tokenizer = XLMRobertaTokenizer.from_pretrained("YiYiXu/Kandinsky", subfolder="tokenizer") + tokenizer = XLMRobertaTokenizerFast.from_pretrained("YiYiXu/tiny-random-mclip-base") return tokenizer @property @@ -105,18 +104,6 @@ def dummy_text_encoder(self): return text_encoder - @property - def dummy_text_proj(self): - torch.manual_seed(0) - - model_kwargs = { - "clip_embeddings_dim": self.cross_attention_dim, - "time_embed_dim": self.time_embed_dim, - "clip_extra_context_tokens": 2, - "cross_attention_dim": self.cross_attention_dim, - "clip_text_encoder_hidden_states_dim": self.text_embedder_hidden_size, - } - model = KandinskyTextProjModel(**model_kwargs) return model @@ -128,17 +115,21 @@ def dummy_unet(self): "in_channels": 9, # Out channels is double in channels because predicts mean and variance "out_channels": 8, + "addition_embed_type": "text_image", "down_block_types": ("ResnetDownsampleBlock2D", "SimpleCrossAttnDownBlock2D"), "up_block_types": ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"), "mid_block_type": "UNetMidBlock2DSimpleCrossAttn", "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2), "layers_per_block": 1, + 
"encoder_hid_dim": self.text_embedder_hidden_size, + "encoder_hid_dim_type": "text_image_proj", "cross_attention_dim": self.cross_attention_dim, "attention_head_dim": 4, "resnet_time_scale_shift": "scale_shift", - "class_embed_type": "identity", + "class_embed_type": None } + model = UNet2DConditionModel(**model_kwargs) return model @@ -171,7 +162,6 @@ def get_dummy_components(self): text_encoder = self.dummy_text_encoder tokenizer = self.dummy_tokenizer unet = self.dummy_unet - text_proj = self.dummy_text_proj movq = self.dummy_movq scheduler = UnCLIPScheduler( @@ -189,7 +179,6 @@ def get_dummy_components(self): ) components = { - "text_proj": text_proj, "text_encoder": text_encoder, "tokenizer": tokenizer, "unet": unet, @@ -224,6 +213,7 @@ def get_dummy_inputs(self, device, seed=0): "height": 64, "width": 64, "num_inference_steps": 2, + "guidance_scale": 4.0, "output_type": "np", } return inputs @@ -254,11 +244,11 @@ def test_kandinsky_inpaint(self): assert image.shape == (1, 64, 64, 3) expected_slice = np.array( - [0.5069735, 0.5303574, 0.47324282, 0.57705986, 0.57984686, 0.44895405, 0.42856842, 0.4831331, 0.5052104] + [0.6187187, 0.53577256, 0.48749307, 0.5421068, 0.5214845, 0.40533125, 0.40913218, 0.48657694, 0.48048347] ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2, f" expected_slice {expected_slice}, but got {image_slice.flatten()}" + assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2, f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}" @slow From 550be36eff6aa3d292eabf9afe2f423efa533708 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sun, 21 May 2023 05:43:04 +0000 Subject: [PATCH 115/182] make style --- .../pipelines/kandinsky/pipeline_kandinsky.py | 2 +- .../kandinsky/pipeline_kandinsky_prior.py | 6 +++--- tests/pipelines/kandinsky/test_kandinsky.py | 12 ++++++++---- .../kandinsky/test_kandinsky_img2img.py | 12 ++++++++---- .../kandinsky/test_kandinsky_inpaint.py | 16 ++++++++-------- 5 files changed, 28 insertions(+), 20 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py index 3cd06d9ef280..0894c6b74121 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py @@ -415,7 +415,7 @@ def __call__( added_cond_kwargs = {"text_embeds": prompt_embeds, "image_embeds": image_embeds} noise_pred = self.unet( - sample=latent_model_input, + sample=latent_model_input, timestep=t, encoder_hidden_states=text_encoder_hidden_states, added_cond_kwargs=added_cond_kwargs, diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py index 6eccbec6d64f..adb609ab324a 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py @@ -25,8 +25,8 @@ from ...pipelines import DiffusionPipeline from ...schedulers import UnCLIPScheduler from ...utils import ( - deprecate, BaseOutput, + deprecate, is_accelerate_available, logging, randn_tensor, @@ -557,8 +557,8 @@ def __call__( image_embeddings = latents zero_embeds = self.get_zero_embed(latents.shape[0], device=latents.device) - - if output_type not in [ "pt", "np"]: + + if output_type not in ["pt", "np"]: 
deprecation_message = ( f"the output_type {output_type} is outdated and has been set to `np`. Please make sure to set it to one of these instead: " "`np`, `pt``" diff --git a/tests/pipelines/kandinsky/test_kandinsky.py b/tests/pipelines/kandinsky/test_kandinsky.py index 961dcaa980e8..e7615d4be168 100644 --- a/tests/pipelines/kandinsky/test_kandinsky.py +++ b/tests/pipelines/kandinsky/test_kandinsky.py @@ -119,7 +119,7 @@ def dummy_unet(self): "cross_attention_dim": self.cross_attention_dim, "attention_head_dim": 4, "resnet_time_scale_shift": "scale_shift", - "class_embed_type": None + "class_embed_type": None, } model = UNet2DConditionModel(**model_kwargs) @@ -223,11 +223,15 @@ def test_kandinsky(self): assert image.shape == (1, 64, 64, 3) expected_slice = np.array( - [0.4532004, 0.5363492, 0.48854294, 0.55743736, 0.572249, 0.45844495, 0.43908486, 0.46844718, 0.5048713 ] + [0.4532004, 0.5363492, 0.48854294, 0.55743736, 0.572249, 0.45844495, 0.43908486, 0.46844718, 0.5048713] ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2, f" expected_slice {expected_slice}, but got {image_slice.flatten()}" - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2, f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}" + assert ( + np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + ), f" expected_slice {expected_slice}, but got {image_slice.flatten()}" + assert ( + np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 + ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}" @slow diff --git a/tests/pipelines/kandinsky/test_kandinsky_img2img.py b/tests/pipelines/kandinsky/test_kandinsky_img2img.py index 5655df60d92c..e4a84f7ffea5 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_img2img.py +++ b/tests/pipelines/kandinsky/test_kandinsky_img2img.py @@ -122,7 +122,7 @@ def dummy_unet(self): "cross_attention_dim": self.cross_attention_dim, "attention_head_dim": 4, "resnet_time_scale_shift": "scale_shift", - "class_embed_type": None + "class_embed_type": None, } model = UNet2DConditionModel(**model_kwargs) @@ -204,7 +204,7 @@ def get_dummy_inputs(self, device, seed=0): "height": 64, "width": 64, "num_inference_steps": 10, - "guidance_scale": 7., + "guidance_scale": 7.0, "strength": 0.2, "output_type": "np", } @@ -238,8 +238,12 @@ def test_kandinsky_img2img(self): expected_slice = np.array( [0.61474943, 0.6073539, 0.43308544, 0.5928269, 0.47493595, 0.46755973, 0.4613838, 0.45368797, 0.50119233] ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2, f" expected_slice {expected_slice}, but got {image_slice.flatten()}" - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2, f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}" + assert ( + np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + ), f" expected_slice {expected_slice}, but got {image_slice.flatten()}" + assert ( + np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 + ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}" @slow diff --git a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py index fbcecbf56e7c..ab311a9ea3a6 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py +++ b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py @@ -104,9 +104,6 @@ def dummy_text_encoder(self): return text_encoder - model = 
KandinskyTextProjModel(**model_kwargs) - return model - @property def dummy_unet(self): torch.manual_seed(0) @@ -126,10 +123,9 @@ def dummy_unet(self): "cross_attention_dim": self.cross_attention_dim, "attention_head_dim": 4, "resnet_time_scale_shift": "scale_shift", - "class_embed_type": None + "class_embed_type": None, } - model = UNet2DConditionModel(**model_kwargs) return model @@ -244,11 +240,15 @@ def test_kandinsky_inpaint(self): assert image.shape == (1, 64, 64, 3) expected_slice = np.array( - [0.6187187, 0.53577256, 0.48749307, 0.5421068, 0.5214845, 0.40533125, 0.40913218, 0.48657694, 0.48048347] + [0.6187187, 0.53577256, 0.48749307, 0.5421068, 0.5214845, 0.40533125, 0.40913218, 0.48657694, 0.48048347] ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2, f" expected_slice {expected_slice}, but got {image_slice.flatten()}" - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2, f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}" + assert ( + np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + ), f" expected_slice {expected_slice}, but got {image_slice.flatten()}" + assert ( + np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 + ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}" @slow From 391adbffc9c2031d8a7fd8de1a45f72ce18cfadf Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Mon, 22 May 2023 07:00:47 -1000 Subject: [PATCH 116/182] Update src/diffusers/pipelines/kandinsky/text_encoder.py Co-authored-by: Patrick von Platen --- src/diffusers/pipelines/kandinsky/text_encoder.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/text_encoder.py b/src/diffusers/pipelines/kandinsky/text_encoder.py index 516abca45354..caa0029f00ca 100644 --- a/src/diffusers/pipelines/kandinsky/text_encoder.py +++ b/src/diffusers/pipelines/kandinsky/text_encoder.py @@ -25,8 +25,3 @@ def forward(self, input_ids, attention_mask): embs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)[0] embs2 = (embs * attention_mask.unsqueeze(2)).sum(dim=1) / attention_mask.sum(dim=1)[:, None] return self.LinearTransformation(embs2), embs - - @classmethod - def _load_state_dict_into_model(cls, model, state_dict, pretrained_model_name_or_path, _fast_init=True): - model.load_state_dict(state_dict) - return model, [], [], [] From 67e09da548ab19f0dd376e3d299b1ecef319664e Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Mon, 22 May 2023 07:02:31 -1000 Subject: [PATCH 117/182] Update tests/pipelines/kandinsky/test_kandinsky_img2img.py Co-authored-by: Patrick von Platen --- tests/pipelines/kandinsky/test_kandinsky_img2img.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/pipelines/kandinsky/test_kandinsky_img2img.py b/tests/pipelines/kandinsky/test_kandinsky_img2img.py index e4a84f7ffea5..ac4e14f4b456 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_img2img.py +++ b/tests/pipelines/kandinsky/test_kandinsky_img2img.py @@ -231,7 +231,6 @@ def test_kandinsky_img2img(self): image_slice = image[0, -3:, -3:, -1] image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - print(f"image.shape {image.shape}") assert image.shape == (1, 64, 64, 3) From 4257cc7021df69dc9561864a3f638109da39274d Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Mon, 22 May 2023 07:03:10 -1000 Subject: [PATCH 118/182] Update tests/pipelines/kandinsky/test_kandinsky_inpaint.py Co-authored-by: Patrick von Platen --- tests/pipelines/kandinsky/test_kandinsky_inpaint.py | 3 
+-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py index ab311a9ea3a6..719b61a84b42 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py +++ b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py @@ -30,8 +30,7 @@ from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference -torch.backends.cuda.matmul.allow_tf32 = False -torch.use_deterministic_algorithms(True) +enable_full_determinism() class KandinskyInpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase): From 003902c20ce353744bfdf22a31978dd8231c0ab8 Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Mon, 22 May 2023 07:03:32 -1000 Subject: [PATCH 119/182] Update tests/pipelines/kandinsky/test_kandinsky_inpaint.py Co-authored-by: Patrick von Platen --- tests/pipelines/kandinsky/test_kandinsky_inpaint.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py index 719b61a84b42..e62898333b89 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py +++ b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py @@ -25,7 +25,7 @@ from diffusers import KandinskyInpaintPipeline, KandinskyPriorPipeline, UnCLIPScheduler, UNet2DConditionModel, VQModel from diffusers.pipelines.kandinsky.text_encoder import MCLIPConfig, MultilingualCLIP from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu +from diffusers.utils.testing_utils import require_torch_gpu, enable_full_determinism from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference From f1aa660eab918904fa6579539ae0552b3b418308 Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Mon, 22 May 2023 07:04:25 -1000 Subject: [PATCH 120/182] Update src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py Co-authored-by: Patrick von Platen --- .../pipelines/kandinsky/pipeline_kandinsky_prior.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py index adb609ab324a..82ae5458e595 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py @@ -559,12 +559,7 @@ def __call__( zero_embeds = self.get_zero_embed(latents.shape[0], device=latents.device) if output_type not in ["pt", "np"]: - deprecation_message = ( - f"the output_type {output_type} is outdated and has been set to `np`. 
Please make sure to set it to one of these instead: " - "`np`, `pt``" - ) - deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False) - output_type = "np" + raise ValueError(f"Only the output types "pt" and "np" are supported not `output_type={output_type}`") if output_type == "np": image_embeddings = image_embeddings.cpu().numpy() From e2cb50f65ca7f5ef82df6d0c478ba3d28bb86c3f Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 22 May 2023 20:47:26 +0000 Subject: [PATCH 121/182] fix --- src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py index 82ae5458e595..cb3d24b51d90 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py @@ -559,7 +559,7 @@ def __call__( zero_embeds = self.get_zero_embed(latents.shape[0], device=latents.device) if output_type not in ["pt", "np"]: - raise ValueError(f"Only the output types "pt" and "np" are supported not `output_type={output_type}`") + raise ValueError(f"Only the output types `pt` and `np` are supported not output_type={output_type}") if output_type == "np": image_embeddings = image_embeddings.cpu().numpy() From 8f7b7e421f52b66fc44b785cd84b5213945e9b29 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 22 May 2023 21:10:10 +0000 Subject: [PATCH 122/182] refactor the cfg --- .../pipelines/kandinsky/pipeline_kandinsky.py | 11 ++--------- src/diffusers/schedulers/scheduling_unclip.py | 19 +++---------------- 2 files changed, 5 insertions(+), 25 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py index 0894c6b74121..5c12f58ff6ae 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py @@ -421,18 +421,12 @@ def __call__( added_cond_kwargs=added_cond_kwargs, ).sample - # CFG is currently implemented exactly as original repo as a baseline, - # i.e. 
we apply cfg to predicted noise, and take predicted variance as it is (uncond + cond) - # this means the our latent shape is batch_size *2 instad batch_size - if do_classifier_free_guidance: noise_pred, variance_pred = noise_pred.split(latents.shape[1], dim=1) noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - variance_pred_uncond, variance_pred_text = variance_pred.chunk(2) + _, variance_pred_text = variance_pred.chunk(2) noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - noise_pred = torch.cat([noise_pred] * 2) - variance_pred = torch.cat([variance_pred_uncond, variance_pred_text]) - noise_pred = torch.cat([noise_pred, variance_pred], dim=1) + noise_pred = torch.cat([noise_pred, variance_pred_text], dim=1) if i + 1 == timesteps_tensor.shape[0]: prev_timestep = None @@ -446,7 +440,6 @@ def __call__( latent_model_input, prev_timestep=prev_timestep, generator=generator, - batch_size=batch_size, ).prev_sample _, latents = latents.chunk(2) diff --git a/src/diffusers/schedulers/scheduling_unclip.py b/src/diffusers/schedulers/scheduling_unclip.py index a7e327a36850..411ec1393d47 100644 --- a/src/diffusers/schedulers/scheduling_unclip.py +++ b/src/diffusers/schedulers/scheduling_unclip.py @@ -261,8 +261,6 @@ def step( prev_timestep: Optional[int] = None, generator=None, return_dict: bool = True, - # YiYi's TO-DO: batch_size argument for testing, remove this later - batch_size: Optional[int] = None, ) -> Union[UnCLIPSchedulerOutput, Tuple]: """ Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion @@ -340,20 +338,9 @@ def step( # 6. Add noise variance = 0 if t > 0: - if batch_size is not None: - assert batch_size * 2 == model_output.shape[0] - variance_noise = randn_tensor( - (batch_size, *model_output.shape[1:]), - dtype=model_output.dtype, - generator=generator, - device=model_output.device, - ) - - variance_noise = torch.cat([variance_noise, variance_noise], dim=0) - else: - variance_noise = randn_tensor( - model_output.shape, dtype=model_output.dtype, generator=generator, device=model_output.device - ) + variance_noise = randn_tensor( + model_output.shape, dtype=model_output.dtype, generator=generator, device=model_output.device + ) variance = self._get_variance( t, From 325b1ebb5d2aa617f95b3cb4ff86c59d4f10d8dc Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 22 May 2023 22:10:56 +0000 Subject: [PATCH 123/182] fix --- src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py index 5c12f58ff6ae..5e2e98f35d08 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py @@ -437,13 +437,11 @@ def __call__( latents = self.scheduler.step( noise_pred, t, - latent_model_input, + latents, prev_timestep=prev_timestep, generator=generator, ).prev_sample - _, latents = latents.chunk(2) - # post-processing image = self.movq.decode(latents, force_not_quantize=True)["sample"] From 1c5ad4bb28c2fb6d52133cc0d1c0c562de5bd378 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 23 May 2023 01:36:44 +0000 Subject: [PATCH 124/182] refactor for ddim --- src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py 
b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py index 5e2e98f35d08..d730be8cd1fd 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py @@ -427,6 +427,11 @@ def __call__( _, variance_pred_text = variance_pred.chunk(2) noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) noise_pred = torch.cat([noise_pred, variance_pred_text], dim=1) + + + + if not (hasattr(self.scheduler.config, "variance_type") and self.scheduler.config.variance_type in ["learned", "learned_range"]): + noise_pred, _ = noise_pred.split(latents.shape[1], dim=1) if i + 1 == timesteps_tensor.shape[0]: prev_timestep = None @@ -438,10 +443,10 @@ def __call__( noise_pred, t, latents, - prev_timestep=prev_timestep, + # YiYi notes: remove this to test ddim + #prev_timestep=prev_timestep, generator=generator, ).prev_sample - # post-processing image = self.movq.decode(latents, force_not_quantize=True)["sample"] From ed240729479611f8278f3adb25234ba7fcce922c Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 23 May 2023 01:42:55 +0000 Subject: [PATCH 125/182] add a note about ddpm scheduler --- src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py index d730be8cd1fd..f125a9e17cb0 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py @@ -443,7 +443,8 @@ def __call__( noise_pred, t, latents, - # YiYi notes: remove this to test ddim + # YiYi notes: only reason this pipeline can't work with unclip scheduler is that can't pass down this argument + # need to use DDPM scheduler instead #prev_timestep=prev_timestep, generator=generator, ).prev_sample From 8e0a37c50194c75070ab7d6ae817dcdf6d79a414 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 23 May 2023 03:03:38 +0000 Subject: [PATCH 126/182] unclip scheduler -> ddim --- src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py index f125a9e17cb0..cb371492bf56 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py @@ -22,7 +22,7 @@ from ...models import UNet2DConditionModel, VQModel from ...pipelines import DiffusionPipeline from ...pipelines.pipeline_utils import ImagePipelineOutput -from ...schedulers import UnCLIPScheduler +from ...schedulers import DDIMScheduler from ...utils import ( deprecate, is_accelerate_available, @@ -89,7 +89,7 @@ class KandinskyPipeline(DiffusionPipeline): Frozen text-encoder. tokenizer ([`XLMRobertaTokenizer`]): Tokenizer of class - scheduler ([`UnCLIPScheduler`]): + scheduler ([`DDIMScheduler`]): A scheduler to be used in combination with `unet` to generate image latents. unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the image embedding. 
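For readers following the guidance refactor in the patches above, the recombination the diffs converge on can be exercised in isolation. The sketch below uses dummy tensors with illustrative shapes (one latent with 4 channels, a UNet output that stacks noise and learned variance for the unconditional/conditional pair); it mirrors the lines added to `pipeline_kandinsky.py` rather than defining any new API.

```python
import torch

# Classifier-free guidance with a learned-variance UNet, as refactored above:
# guidance is applied to the noise half only; the conditional variance is kept.
guidance_scale = 4.0
latents = torch.randn(1, 4, 64, 64)   # current latents (batch of 1, 4 channels)
unet_out = torch.randn(2, 8, 64, 64)  # [uncond, cond] stacked on dim 0,
                                      # noise + variance stacked on dim 1

noise_pred, variance_pred = unet_out.split(latents.shape[1], dim=1)
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
_, variance_pred_text = variance_pred.chunk(2)
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
noise_pred = torch.cat([noise_pred, variance_pred_text], dim=1)  # shape (1, 8, 64, 64)

# With a scheduler that does not model the variance (e.g. DDIM), the variance
# half is dropped again before scheduler.step, as the same patches do:
# noise_pred, _ = noise_pred.split(latents.shape[1], dim=1)
```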
@@ -102,7 +102,7 @@ def __init__( text_encoder: MultilingualCLIP, tokenizer: XLMRobertaTokenizer, unet: UNet2DConditionModel, - scheduler: UnCLIPScheduler, + scheduler: DDIMScheduler, movq: VQModel, ): super().__init__() From 8906756e4268ed4662547358915f05b7f39f601c Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 23 May 2023 03:04:33 +0000 Subject: [PATCH 127/182] refactor the cfg in inpaint --- .../kandinsky/pipeline_kandinsky_inpaint.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py index 2f9f1f94fb42..02c4c219b442 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py @@ -628,18 +628,15 @@ def __call__( added_cond_kwargs=added_cond_kwargs, ).sample - # CFG is currently implemented exactly as original repo as a baseline, - # i.e. we apply cfg to predicted noise, and take predicted variance as it is (uncond + cond) - # this means the our latent shape is batch_size *2 instad batch_size - if do_classifier_free_guidance: noise_pred, variance_pred = noise_pred.split(latents.shape[1], dim=1) noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - variance_pred_uncond, variance_pred_text = variance_pred.chunk(2) + _, variance_pred_text = variance_pred.chunk(2) noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - noise_pred = torch.cat([noise_pred] * 2) - variance_pred = torch.cat([variance_pred_uncond, variance_pred_text]) - noise_pred = torch.cat([noise_pred, variance_pred], dim=1) + noise_pred = torch.cat([noise_pred, variance_pred_text], dim=1) + + if not (hasattr(self.scheduler.config, "variance_type") and self.scheduler.config.variance_type in ["learned", "learned_range"]): + noise_pred, _ = noise_pred.split(latents.shape[1], dim=1) if i + 1 == timesteps_tensor.shape[0]: prev_timestep = None @@ -650,14 +647,11 @@ def __call__( latents = self.scheduler.step( noise_pred, t, - torch.cat([latents] * 2) if do_classifier_free_guidance else latents, + latents, prev_timestep=prev_timestep, generator=generator, - batch_size=batch_size, ).prev_sample - _, latents = latents.chunk(2) - # post-processing image = self.movq.decode(latents, force_not_quantize=True)["sample"] From 67c3381f6e52d0ed23fa8cd73cb1e87d44120719 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 23 May 2023 03:05:08 +0000 Subject: [PATCH 128/182] scheduler step_offset =1 for img2img --- .../pipelines/kandinsky/pipeline_kandinsky_img2img.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py index 1a83b402e620..8b2f4cb35748 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py @@ -476,9 +476,6 @@ def __call__( # 4. 
set timesteps self.scheduler.set_timesteps(num_inference_steps, device=device) - # This step is taken from the origianl Kandinsky repo - # - add one to get the final alpha values right (the ones from first scale to data during sampling)) - self.scheduler.timesteps = self.scheduler.timesteps + 1 timesteps_tensor, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device) # the formular to calculate timestep for add_noise is taken from the original kandinsky repo From 6d9b649cb40e6f46a6b1327402082d8948a9d04c Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 23 May 2023 03:18:48 +0000 Subject: [PATCH 129/182] update test_inference_batch_consistent test: output_type ='np' --- tests/pipelines/test_pipelines_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py index 3984ed76edce..3ddfd35defb7 100644 --- a/tests/pipelines/test_pipelines_common.py +++ b/tests/pipelines/test_pipelines_common.py @@ -287,7 +287,7 @@ def _test_inference_batch_consistent( for arg in additional_params_copy_to_batched_inputs: batched_inputs[arg] = inputs[arg] - batched_inputs["output_type"] = None + batched_inputs["output_type"] = "np" if self.pipeline_class.__name__ == "DanceDiffusionPipeline": batched_inputs.pop("output_type") From 976f47a21cd9583ff9cdbfb4094f6dc3cac6125b Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 23 May 2023 03:30:32 +0000 Subject: [PATCH 130/182] update tests --- tests/pipelines/kandinsky/test_kandinsky.py | 27 +++++++++---------- .../kandinsky/test_kandinsky_img2img.py | 5 ++-- .../kandinsky/test_kandinsky_prior.py | 5 ++-- 3 files changed, 16 insertions(+), 21 deletions(-) diff --git a/tests/pipelines/kandinsky/test_kandinsky.py b/tests/pipelines/kandinsky/test_kandinsky.py index e7615d4be168..4386d0e69b2e 100644 --- a/tests/pipelines/kandinsky/test_kandinsky.py +++ b/tests/pipelines/kandinsky/test_kandinsky.py @@ -21,16 +21,15 @@ import torch from transformers import XLMRobertaTokenizerFast -from diffusers import KandinskyPipeline, KandinskyPriorPipeline, UnCLIPScheduler, UNet2DConditionModel, VQModel +from diffusers import KandinskyPipeline, KandinskyPriorPipeline, DDIMScheduler, UNet2DConditionModel, VQModel from diffusers.pipelines.kandinsky.text_encoder import MCLIPConfig, MultilingualCLIP from diffusers.utils import floats_tensor, load_numpy, slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu +from diffusers.utils.testing_utils import require_torch_gpu, enable_full_determinism from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference -torch.backends.cuda.matmul.allow_tf32 = False -torch.use_deterministic_algorithms(True) +enable_full_determinism() class KandinskyPipelineFastTests(PipelineTesterMixin, unittest.TestCase): @@ -156,18 +155,16 @@ def get_dummy_components(self): unet = self.dummy_unet movq = self.dummy_movq - scheduler = UnCLIPScheduler( - clip_sample=True, - clip_sample_range=2.0, - sample_min_value=1.0, - sample_max_value=None, + scheduler = DDIMScheduler( num_train_timesteps=1000, - prediction_type="epsilon", - variance_type="learned_range", - thresholding=True, - beta_schedule="linear", - beta_start=0.00085, + beta_schedule= "linear", + beta_start= 0.00085, beta_end=0.012, + clip_sample=False, + set_alpha_to_one=False, + steps_offset=1, + prediction_type= "epsilon", + thresholding=False, ) components = { @@ -223,7 +220,7 @@ def test_kandinsky(self): assert image.shape == (1, 64, 64, 3) 
expected_slice = np.array( - [0.4532004, 0.5363492, 0.48854294, 0.55743736, 0.572249, 0.45844495, 0.43908486, 0.46844718, 0.5048713] + [0.328663, 1., 0.23216873, 1. , 0.92717564, 0.4639046, 0.96894777, 0.31713378, 0.6293953] ) assert ( diff --git a/tests/pipelines/kandinsky/test_kandinsky_img2img.py b/tests/pipelines/kandinsky/test_kandinsky_img2img.py index ac4e14f4b456..dc74306225aa 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_img2img.py +++ b/tests/pipelines/kandinsky/test_kandinsky_img2img.py @@ -25,13 +25,12 @@ from diffusers import DDIMScheduler, KandinskyImg2ImgPipeline, KandinskyPriorPipeline, UNet2DConditionModel, VQModel from diffusers.pipelines.kandinsky.text_encoder import MCLIPConfig, MultilingualCLIP from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu +from diffusers.utils.testing_utils import require_torch_gpu, enable_full_determinism from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference -torch.backends.cuda.matmul.allow_tf32 = False -torch.use_deterministic_algorithms(True) +enable_full_determinism() class KandinskyImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase): diff --git a/tests/pipelines/kandinsky/test_kandinsky_prior.py b/tests/pipelines/kandinsky/test_kandinsky_prior.py index 534da34262e3..ebdebd559cec 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_prior.py +++ b/tests/pipelines/kandinsky/test_kandinsky_prior.py @@ -28,13 +28,12 @@ from diffusers import KandinskyPriorPipeline, PriorTransformer, UnCLIPScheduler from diffusers.utils import torch_device -from diffusers.utils.testing_utils import skip_mps +from diffusers.utils.testing_utils import skip_mps, enable_full_determinism from ..test_pipelines_common import PipelineTesterMixin -torch.backends.cuda.matmul.allow_tf32 = False -torch.use_deterministic_algorithms(True) +enable_full_determinism() class KandinskyPriorPipelineFastTests(PipelineTesterMixin, unittest.TestCase): From c19cb182da171acd39df1462c408ae75f562172e Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 23 May 2023 03:51:14 +0000 Subject: [PATCH 131/182] update valueerror --- src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py | 7 +------ .../pipelines/kandinsky/pipeline_kandinsky_img2img.py | 7 +------ .../pipelines/kandinsky/pipeline_kandinsky_inpaint.py | 7 +------ 3 files changed, 3 insertions(+), 18 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py index cb371492bf56..69a951473ca0 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py @@ -452,12 +452,7 @@ def __call__( image = self.movq.decode(latents, force_not_quantize=True)["sample"] if output_type not in ["pt", "np", "pil"]: - deprecation_message = ( - f"the output_type {output_type} is outdated and has been set to `np`. 
Please make sure to set it to one of these instead: " - "`np`, `pt`, `pil` " - ) - deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False) - output_type = "np" + raise ValueError(f"Only the output types `pt`, `pil` and `np` are supported not output_type={output_type}") if output_type in ["np", "pil"]: image = image * 0.5 + 0.5 diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py index 8b2f4cb35748..ccdc794a4a0f 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py @@ -528,12 +528,7 @@ def __call__( image = self.movq.decode(latents, force_not_quantize=True)["sample"] if output_type not in ["pt", "np", "pil"]: - deprecation_message = ( - f"the output_type {output_type} is outdated and has been set to `np`. Please make sure to set it to one of these instead: " - "`np`, `pt`, `pil` " - ) - deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False) - output_type = "np" + raise ValueError(f"Only the output types `pt`, `pil` and `np` are supported not output_type={output_type}") if output_type in ["np", "pil"]: image = image * 0.5 + 0.5 diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py index 02c4c219b442..582d4be8029b 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py @@ -656,12 +656,7 @@ def __call__( image = self.movq.decode(latents, force_not_quantize=True)["sample"] if output_type not in ["pt", "np", "pil"]: - deprecation_message = ( - f"the output_type {output_type} is outdated and has been set to `np`. 
Please make sure to set it to one of these instead: " - "`np`, `pt`, `pil` " - ) - deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False) - output_type = "np" + raise ValueError(f"Only the output types `pt`, `pil` and `np` are supported not output_type={output_type}") if output_type in ["np", "pil"]: image = image * 0.5 + 0.5 From e3964ce0b3044998f707c89a63ab6bea53483ace Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 23 May 2023 03:55:04 +0000 Subject: [PATCH 132/182] style --- .../pipelines/kandinsky/pipeline_kandinsky.py | 16 ++++++++-------- .../kandinsky/pipeline_kandinsky_img2img.py | 1 - .../kandinsky/pipeline_kandinsky_inpaint.py | 6 ++++-- .../kandinsky/pipeline_kandinsky_prior.py | 1 - tests/pipelines/kandinsky/test_kandinsky.py | 14 +++++++------- .../kandinsky/test_kandinsky_img2img.py | 3 +-- .../kandinsky/test_kandinsky_inpaint.py | 2 +- .../pipelines/kandinsky/test_kandinsky_prior.py | 2 +- 8 files changed, 22 insertions(+), 23 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py index 69a951473ca0..c838c34a9031 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py @@ -24,7 +24,6 @@ from ...pipelines.pipeline_utils import ImagePipelineOutput from ...schedulers import DDIMScheduler from ...utils import ( - deprecate, is_accelerate_available, is_accelerate_version, logging, @@ -427,16 +426,17 @@ def __call__( _, variance_pred_text = variance_pred.chunk(2) noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) noise_pred = torch.cat([noise_pred, variance_pred_text], dim=1) - - - if not (hasattr(self.scheduler.config, "variance_type") and self.scheduler.config.variance_type in ["learned", "learned_range"]): + if not ( + hasattr(self.scheduler.config, "variance_type") + and self.scheduler.config.variance_type in ["learned", "learned_range"] + ): noise_pred, _ = noise_pred.split(latents.shape[1], dim=1) if i + 1 == timesteps_tensor.shape[0]: - prev_timestep = None + pass else: - prev_timestep = timesteps_tensor[i + 1] + timesteps_tensor[i + 1] # compute the previous noisy sample x_t -> x_t-1 latents = self.scheduler.step( @@ -444,8 +444,8 @@ def __call__( t, latents, # YiYi notes: only reason this pipeline can't work with unclip scheduler is that can't pass down this argument - # need to use DDPM scheduler instead - #prev_timestep=prev_timestep, + # need to use DDPM scheduler instead + # prev_timestep=prev_timestep, generator=generator, ).prev_sample # post-processing diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py index ccdc794a4a0f..d9ff023ba6cb 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py @@ -27,7 +27,6 @@ from ...pipelines.pipeline_utils import ImagePipelineOutput from ...schedulers import DDIMScheduler from ...utils import ( - deprecate, is_accelerate_available, is_accelerate_version, logging, diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py index 582d4be8029b..1dfad0fd1cf4 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py @@ -29,7 +29,6 @@ from 
...pipelines.pipeline_utils import ImagePipelineOutput from ...schedulers import UnCLIPScheduler from ...utils import ( - deprecate, is_accelerate_available, is_accelerate_version, logging, @@ -635,7 +634,10 @@ def __call__( noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) noise_pred = torch.cat([noise_pred, variance_pred_text], dim=1) - if not (hasattr(self.scheduler.config, "variance_type") and self.scheduler.config.variance_type in ["learned", "learned_range"]): + if not ( + hasattr(self.scheduler.config, "variance_type") + and self.scheduler.config.variance_type in ["learned", "learned_range"] + ): noise_pred, _ = noise_pred.split(latents.shape[1], dim=1) if i + 1 == timesteps_tensor.shape[0]: diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py index cb3d24b51d90..ffc2bf4afa7c 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py @@ -26,7 +26,6 @@ from ...schedulers import UnCLIPScheduler from ...utils import ( BaseOutput, - deprecate, is_accelerate_available, logging, randn_tensor, diff --git a/tests/pipelines/kandinsky/test_kandinsky.py b/tests/pipelines/kandinsky/test_kandinsky.py index 4386d0e69b2e..7f8bd5d307b1 100644 --- a/tests/pipelines/kandinsky/test_kandinsky.py +++ b/tests/pipelines/kandinsky/test_kandinsky.py @@ -21,10 +21,10 @@ import torch from transformers import XLMRobertaTokenizerFast -from diffusers import KandinskyPipeline, KandinskyPriorPipeline, DDIMScheduler, UNet2DConditionModel, VQModel +from diffusers import DDIMScheduler, KandinskyPipeline, KandinskyPriorPipeline, UNet2DConditionModel, VQModel from diffusers.pipelines.kandinsky.text_encoder import MCLIPConfig, MultilingualCLIP from diffusers.utils import floats_tensor, load_numpy, slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu, enable_full_determinism +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference @@ -157,13 +157,13 @@ def get_dummy_components(self): scheduler = DDIMScheduler( num_train_timesteps=1000, - beta_schedule= "linear", - beta_start= 0.00085, + beta_schedule="linear", + beta_start=0.00085, beta_end=0.012, clip_sample=False, - set_alpha_to_one=False, + set_alpha_to_one=False, steps_offset=1, - prediction_type= "epsilon", + prediction_type="epsilon", thresholding=False, ) @@ -220,7 +220,7 @@ def test_kandinsky(self): assert image.shape == (1, 64, 64, 3) expected_slice = np.array( - [0.328663, 1., 0.23216873, 1. 
, 0.92717564, 0.4639046, 0.96894777, 0.31713378, 0.6293953] + [0.328663, 1.0, 0.23216873, 1.0, 0.92717564, 0.4639046, 0.96894777, 0.31713378, 0.6293953] ) assert ( diff --git a/tests/pipelines/kandinsky/test_kandinsky_img2img.py b/tests/pipelines/kandinsky/test_kandinsky_img2img.py index dc74306225aa..06eb12657049 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_img2img.py +++ b/tests/pipelines/kandinsky/test_kandinsky_img2img.py @@ -25,7 +25,7 @@ from diffusers import DDIMScheduler, KandinskyImg2ImgPipeline, KandinskyPriorPipeline, UNet2DConditionModel, VQModel from diffusers.pipelines.kandinsky.text_encoder import MCLIPConfig, MultilingualCLIP from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu, enable_full_determinism +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference @@ -230,7 +230,6 @@ def test_kandinsky_img2img(self): image_slice = image[0, -3:, -3:, -1] image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - assert image.shape == (1, 64, 64, 3) expected_slice = np.array( diff --git a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py index e62898333b89..afc013e48453 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py +++ b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py @@ -25,7 +25,7 @@ from diffusers import KandinskyInpaintPipeline, KandinskyPriorPipeline, UnCLIPScheduler, UNet2DConditionModel, VQModel from diffusers.pipelines.kandinsky.text_encoder import MCLIPConfig, MultilingualCLIP from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu, enable_full_determinism +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference diff --git a/tests/pipelines/kandinsky/test_kandinsky_prior.py b/tests/pipelines/kandinsky/test_kandinsky_prior.py index ebdebd559cec..abd3b1c714fa 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_prior.py +++ b/tests/pipelines/kandinsky/test_kandinsky_prior.py @@ -28,7 +28,7 @@ from diffusers import KandinskyPriorPipeline, PriorTransformer, UnCLIPScheduler from diffusers.utils import torch_device -from diffusers.utils.testing_utils import skip_mps, enable_full_determinism +from diffusers.utils.testing_utils import enable_full_determinism, skip_mps from ..test_pipelines_common import PipelineTesterMixin From 89f0cefaef029d2256278bda5c52429cc2d4a7df Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 23 May 2023 21:57:40 +0000 Subject: [PATCH 133/182] update doc --- docs/source/en/api/pipelines/kandinsky.mdx | 256 +++++++++++++++++++++ 1 file changed, 256 insertions(+) diff --git a/docs/source/en/api/pipelines/kandinsky.mdx b/docs/source/en/api/pipelines/kandinsky.mdx index 43537a342a1d..5e1776390483 100644 --- a/docs/source/en/api/pipelines/kandinsky.mdx +++ b/docs/source/en/api/pipelines/kandinsky.mdx @@ -25,6 +25,262 @@ The Kandinsky model in diffusers comes from ai-forever and the original codebase | [pipeline_kandinsky_inpaint.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py) | *Image-Guided Image Generation* | - | | 
[pipeline_kandinsky_img2img.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py) | *Image-Guided Image Generation* | - | +## Usage example + +In the following, we will walk you through some cool examples of using Kandinsky pipelines to create incredible artwork beyond imagination. + +### Text-to-Image Generation + +For text-to-image generation, we need to use both [`KandinskyPriorPipeline] and [`KandinskyPipeline`]. The first step is to create CLIP image embedding conditioned on a text prompt. Let's throw a fun prompt at Kandinsky to see what it will come up with :) + +```py +prompt = 'A alien cheeseburger creature eating itself, claymation, cinematic, moody lighting' +negative_prompt = 'low quality, bad quality' +``` + +You should create the CLIP image embeddings for both your `prompt` and `negavie_prompt`. Also, the prior pipeline we use to create these CLIP image embeddings is a diffusion model itself, so you can use `negative_prompt`, `guidance_scale`, and `num_inference_steps` arguments to guide this process, just like how you would normally do with all other pipelines in diffusers. + +```py +from diffusers import KandinskyPipeline, KandinskyPriorPipeline +import torch + +# # create prior +pipe_prior = KandinskyPriorPipeline.from_pretrained("YiYiXu/Kandinsky-prior", torch_dtype=torch.float16) +pipe_prior.to("cuda") + +generator = torch.Generator(device='cuda').manual_seed(12) +image_emb = pipe_prior( + prompt, + guidance_scale = 1., + num_inference_steps=25, + generator=generator, + negative_prompt = negative_prompt).images + +zero_image_emb = pipe_prior( + negative_prompt, + guidance_scale = 1., + num_inference_steps=25, + generator=generator, + negative_prompt = negative_prompt).images +``` + +Once we create the image embedding, we can use [`KandinskyPipeline`] to generate images. + +```py +from PIL import Image + +def image_grid(imgs, rows, cols): + assert len(imgs) == rows * cols + + w, h = imgs[0].size + grid = Image.new("RGB", size=(cols * w, rows * h)) + grid_w, grid_h = grid.size + + for i, img in enumerate(imgs): + grid.paste(img, box=(i % cols * w, i // cols * h)) + return grid + + +# create diffuser pipeline +pipe = KandinskyPipeline.from_pretrained("YiYiXu/Kandinsky", torch_dtype=torch.float16) +pipe.to("cuda") + +images = pipe( + prompt, + image_embeds=image_emb, + negative_image_embeds =zero_image_emb, + num_images_per_prompt=2, + height=768, + width=768, + num_inference_steps=100, + guidance_scale=4.0, + generator=generator, + ).images +``` + +One cheeseburger monster coming up! Enjoy! + +![img](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/kandinsky-docs/cheeseburger.png) + +Kandinsky model works really extremely well with creative prompts. Here is some of the amazing art we created using exact same process but different prompts. 
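One small aside on the text-to-image walkthrough above: the `image_grid` helper it defines is never actually called. Assuming `images` holds the two samples produced with `num_images_per_prompt=2`, a minimal usage sketch (the output filename is illustrative) would be:

```python
# Tile the two generated samples with the image_grid helper defined above.
grid = image_grid(images, rows=1, cols=2)
grid.save("cheeseburger_grid.png")  # hypothetical output path
```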
+ +```py +prompt = "bird eye view shot of a full body woman with cyan light orange magenta makeup, digital art, long braided hair her face separated by makeup in the style of yin Yang surrealism, symmetrical face, real image, contrasting tone, pastel gradient background" +``` +![img](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/kandinsky-docs/hair.png) + +```py +prompt = "A car exploding into colorful dust" +``` +![img](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/kandinsky-docs/dusts.png) + +```py +prompt = "editorial photography of an organic, almost liquid smoke style armchair" +``` +![img](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/kandinsky-docs/smokechair.png) + +```py +prompt = "birds eye view of a quilted paper style alien planet landscape, vibrant colours, Cinematic lighting" +``` +![img](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/kandinsky-docs/alienplanet.png) + + + +### Text Guided Image-to-Image Generation + +The same Kandinsky model weights can be used for text-guided image-to-image translation. In this case, just make sure to load the weights using the [`KandinskyImg2ImgPipeline`] pipeline. + +**Note**: You can also directly move the weights of the text-to-image pipelines to the image-to-image pipelines +without loading them twice by making use of the [`~DiffusionPipeline.components()`] function as explained [here](#converting-between-different-pipelines). + +Let's download an image. + +```py +from PIL import Image +import requests +from io import BytesIO + +# download image +url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg" +response = requests.get(url) +original_image = Image.open(BytesIO(response.content)).convert("RGB") +original_image = original_image.resize((768, 512)) +``` + +![img](https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg) + +```py +import torch +from diffusers import KandinskyImg2ImgPipeline, KandinskyPriorPipeline + + +## create prior +pipe_prior = KandinskyPriorPipeline.from_pretrained("YiYiXu/Kandinsky-prior", torch_dtype=torch.float16) +pipe_prior.to("cuda") + +## create img2img pipeline +pipe = KandinskyImg2ImgPipeline.from_pretrained("YiYiXu/Kandinsky", torch_dtype=torch.float16) +pipe.to("cuda") + +prompt = "A fantasy landscape, Cinematic lighting" +negative_prompt = 'low quality, bad quality' + + +generator = torch.Generator(device='cuda').manual_seed(30) +image_emb = pipe_prior( + prompt, + guidance_scale = 4., + num_inference_steps=25, + generator=generator, + negative_prompt = negative_prompt).images + +zero_image_emb = pipe_prior( + negative_prompt, + guidance_scale = 4., + num_inference_steps=25, + generator=generator, + negative_prompt = negative_prompt).images + +out = pipe( + prompt, + image=original_image, + image_embeds=image_emb, + negative_image_embeds =zero_image_emb, + height=768, + width=768, + num_inference_steps=500, + strength=0.3, +) + +out.images[0].save("fantasy_land.png) +``` + +![img](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/kandinsky-docs/img2img_fantasyland.png) + + +### Text Guided Inpainting Generation + +You can use [`KandinskyInpaintPipeline`] to edit images. In this example, we will add a hat to the portrait of a cat. 
+ +```py +from diffusers import KandinskyInpaintPipeline, KandinskyPriorPipeline +from diffusers.utils import load_image +import torch +import numpy as np + +pipe_prior = KandinskyPriorPipeline.from_pretrained("YiYiXu/Kandinsky-prior", torch_dtype=torch.float16) +pipe_prior.to("cuda") + +prompt= "a hat" +image_emb, zero_image_emb = pipe_prior(prompt, return_dict=False) + +pipe = KandinskyInpaintPipeline.from_pretrained("YiYiXu/Kandinsky-inpaint", torch_dtype=torch.float16) +pipe.to("cuda") + +init_image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + "/kandinsky/cat.png") + +mask = np.ones((768, 768), dtype=np.float32) +mask[:250,250:-250] = 0 + +out = pipe( + prompt, + image=init_image, + mask_image=mask, + image_embeds=image_emb, + negative_image_embeds =zero_image_emb, + height=768, + width=768, + num_inference_steps=150, + ) + +image = out.images[0] +image.save("cat_with_hat.png") +``` +![img](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/kandinsky-docs/inpaint_cat_hat.png) + +### Interpolate + +The [`KandinskyPriorPipeline`] also comes with a cool utility function that will allow you to interpolate the latent space of different images and texts super easily. Here is an example of how you can create an Impressionist-style portrait for your pet based on "The Starry Night". + +```py + +from diffusers import KandinskyPriorPipeline, KandinskyPipeline +from diffusers.utils import load_image +import PIL + +import torch +from torchvision import transforms + +pipe_prior = KandinskyPriorPipeline.from_pretrained("YiYiXu/Kandinsky-prior", torch_dtype=torch.float16) +pipe_prior.to("cuda") + +img1 = load_image("https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + "/kandinsky/cat.png") + +img2 = load_image("https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + "/kandinsky/starry_night.jpeg") + +images_texts = ["a cat", img1, img2 ] +weights = [0.3,0.3,0.4] +image_emb, zero_image_emb = pipe_prior.interpolate(images_texts, weights) + +pipe = KandinskyPipeline.from_pretrained("YiYiXu/Kandinsky", torch_dtype=torch.float16) +pipe.to("cuda") + +image = pipe( + "", + image_embeds=image_emb, + negative_image_embeds =zero_image_emb, + height=768, + width=768, + num_inference_steps=150 ).images[0] + +image.save("starry_cat.png") +``` +![img](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/kandinsky-docs/starry_cat.png) + ## KandinskyPipeline From acfa8d316d5bca8c9771fab64c38fa0f4a27f978 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 23 May 2023 22:06:33 +0000 Subject: [PATCH 134/182] fix --- src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py index c838c34a9031..c4661822d15a 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py @@ -433,11 +433,6 @@ def __call__( ): noise_pred, _ = noise_pred.split(latents.shape[1], dim=1) - if i + 1 == timesteps_tensor.shape[0]: - pass - else: - timesteps_tensor[i + 1] - # compute the previous noisy sample x_t -> x_t-1 latents = self.scheduler.step( noise_pred, From 7d5635da74b2b77402151aa37e726cfdb411b620 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 23 May 2023 22:11:32 +0000 Subject: [PATCH 135/182] make style --- 
docs/source/en/api/pipelines/kandinsky.mdx | 145 +++++++++------------ 1 file changed, 64 insertions(+), 81 deletions(-) diff --git a/docs/source/en/api/pipelines/kandinsky.mdx b/docs/source/en/api/pipelines/kandinsky.mdx index 5e1776390483..83ed842c40b6 100644 --- a/docs/source/en/api/pipelines/kandinsky.mdx +++ b/docs/source/en/api/pipelines/kandinsky.mdx @@ -33,42 +33,37 @@ In the following, we will walk you through some cool examples of using Kandinsky For text-to-image generation, we need to use both [`KandinskyPriorPipeline] and [`KandinskyPipeline`]. The first step is to create CLIP image embedding conditioned on a text prompt. Let's throw a fun prompt at Kandinsky to see what it will come up with :) -```py -prompt = 'A alien cheeseburger creature eating itself, claymation, cinematic, moody lighting' -negative_prompt = 'low quality, bad quality' +```python +prompt = "A alien cheeseburger creature eating itself, claymation, cinematic, moody lighting" +negative_prompt = "low quality, bad quality" ``` You should create the CLIP image embeddings for both your `prompt` and `negavie_prompt`. Also, the prior pipeline we use to create these CLIP image embeddings is a diffusion model itself, so you can use `negative_prompt`, `guidance_scale`, and `num_inference_steps` arguments to guide this process, just like how you would normally do with all other pipelines in diffusers. -```py +```python from diffusers import KandinskyPipeline, KandinskyPriorPipeline import torch -# # create prior +# create prior pipe_prior = KandinskyPriorPipeline.from_pretrained("YiYiXu/Kandinsky-prior", torch_dtype=torch.float16) pipe_prior.to("cuda") -generator = torch.Generator(device='cuda').manual_seed(12) +generator = torch.Generator(device="cuda").manual_seed(12) image_emb = pipe_prior( - prompt, - guidance_scale = 1., - num_inference_steps=25, - generator=generator, - negative_prompt = negative_prompt).images + prompt, guidance_scale=1.0, num_inference_steps=25, generator=generator, negative_prompt=negative_prompt +).images zero_image_emb = pipe_prior( - negative_prompt, - guidance_scale = 1., - num_inference_steps=25, - generator=generator, - negative_prompt = negative_prompt).images + negative_prompt, guidance_scale=1.0, num_inference_steps=25, generator=generator, negative_prompt=negative_prompt +).images ``` Once we create the image embedding, we can use [`KandinskyPipeline`] to generate images. -```py +```python from PIL import Image + def image_grid(imgs, rows, cols): assert len(imgs) == rows * cols @@ -86,16 +81,16 @@ pipe = KandinskyPipeline.from_pretrained("YiYiXu/Kandinsky", torch_dtype=torch.f pipe.to("cuda") images = pipe( - prompt, - image_embeds=image_emb, - negative_image_embeds =zero_image_emb, - num_images_per_prompt=2, - height=768, - width=768, - num_inference_steps=100, - guidance_scale=4.0, - generator=generator, - ).images + prompt, + image_embeds=image_emb, + negative_image_embeds=zero_image_emb, + num_images_per_prompt=2, + height=768, + width=768, + num_inference_steps=100, + guidance_scale=4.0, + generator=generator, +).images ``` One cheeseburger monster coming up! Enjoy! @@ -104,28 +99,27 @@ One cheeseburger monster coming up! Enjoy! Kandinsky model works really extremely well with creative prompts. Here is some of the amazing art we created using exact same process but different prompts. 
-```py +```python prompt = "bird eye view shot of a full body woman with cyan light orange magenta makeup, digital art, long braided hair her face separated by makeup in the style of yin Yang surrealism, symmetrical face, real image, contrasting tone, pastel gradient background" ``` ![img](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/kandinsky-docs/hair.png) -```py +```python prompt = "A car exploding into colorful dust" ``` ![img](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/kandinsky-docs/dusts.png) -```py +```python prompt = "editorial photography of an organic, almost liquid smoke style armchair" ``` ![img](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/kandinsky-docs/smokechair.png) -```py +```python prompt = "birds eye view of a quilted paper style alien planet landscape, vibrant colours, Cinematic lighting" ``` ![img](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/kandinsky-docs/alienplanet.png) - ### Text Guided Image-to-Image Generation The same Kandinsky model weights can be used for text-guided image-to-image translation. In this case, just make sure to load the weights using the [`KandinskyImg2ImgPipeline`] pipeline. @@ -135,7 +129,7 @@ without loading them twice by making use of the [`~DiffusionPipeline.components( Let's download an image. -```py +```python from PIL import Image import requests from io import BytesIO @@ -149,50 +143,42 @@ original_image = original_image.resize((768, 512)) ![img](https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg) -```py +```python import torch from diffusers import KandinskyImg2ImgPipeline, KandinskyPriorPipeline - -## create prior +# create prior pipe_prior = KandinskyPriorPipeline.from_pretrained("YiYiXu/Kandinsky-prior", torch_dtype=torch.float16) pipe_prior.to("cuda") -## create img2img pipeline +# create img2img pipeline pipe = KandinskyImg2ImgPipeline.from_pretrained("YiYiXu/Kandinsky", torch_dtype=torch.float16) pipe.to("cuda") prompt = "A fantasy landscape, Cinematic lighting" -negative_prompt = 'low quality, bad quality' - +negative_prompt = "low quality, bad quality" -generator = torch.Generator(device='cuda').manual_seed(30) +generator = torch.Generator(device="cuda").manual_seed(30) image_emb = pipe_prior( - prompt, - guidance_scale = 4., - num_inference_steps=25, - generator=generator, - negative_prompt = negative_prompt).images + prompt, guidance_scale=4.0, num_inference_steps=25, generator=generator, negative_prompt=negative_prompt +).images zero_image_emb = pipe_prior( - negative_prompt, - guidance_scale = 4., - num_inference_steps=25, - generator=generator, - negative_prompt = negative_prompt).images + negative_prompt, guidance_scale=4.0, num_inference_steps=25, generator=generator, negative_prompt=negative_prompt +).images out = pipe( - prompt, - image=original_image, - image_embeds=image_emb, - negative_image_embeds =zero_image_emb, - height=768, - width=768, - num_inference_steps=500, - strength=0.3, + prompt, + image=original_image, + image_embeds=image_emb, + negative_image_embeds=zero_image_emb, + height=768, + width=768, + num_inference_steps=500, + strength=0.3, ) -out.images[0].save("fantasy_land.png) +out.images[0].save("fantasy_land.png") ``` ![img](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/kandinsky-docs/img2img_fantasyland.png) @@ 
-202,38 +188,38 @@ out.images[0].save("fantasy_land.png) You can use [`KandinskyInpaintPipeline`] to edit images. In this example, we will add a hat to the portrait of a cat. -```py +```python from diffusers import KandinskyInpaintPipeline, KandinskyPriorPipeline from diffusers.utils import load_image import torch import numpy as np - + pipe_prior = KandinskyPriorPipeline.from_pretrained("YiYiXu/Kandinsky-prior", torch_dtype=torch.float16) pipe_prior.to("cuda") -prompt= "a hat" +prompt = "a hat" image_emb, zero_image_emb = pipe_prior(prompt, return_dict=False) pipe = KandinskyInpaintPipeline.from_pretrained("YiYiXu/Kandinsky-inpaint", torch_dtype=torch.float16) pipe.to("cuda") init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/kandinsky/cat.png") + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "/kandinsky/cat.png" +) mask = np.ones((768, 768), dtype=np.float32) -mask[:250,250:-250] = 0 +mask[:250, 250:-250] = 0 out = pipe( prompt, image=init_image, mask_image=mask, image_embeds=image_emb, - negative_image_embeds =zero_image_emb, + negative_image_embeds=zero_image_emb, height=768, width=768, num_inference_steps=150, - ) +) image = out.images[0] image.save("cat_with_hat.png") @@ -244,8 +230,7 @@ image.save("cat_with_hat.png") The [`KandinskyPriorPipeline`] also comes with a cool utility function that will allow you to interpolate the latent space of different images and texts super easily. Here is an example of how you can create an Impressionist-style portrait for your pet based on "The Starry Night". -```py - +```python from diffusers import KandinskyPriorPipeline, KandinskyPipeline from diffusers.utils import load_image import PIL @@ -256,26 +241,24 @@ from torchvision import transforms pipe_prior = KandinskyPriorPipeline.from_pretrained("YiYiXu/Kandinsky-prior", torch_dtype=torch.float16) pipe_prior.to("cuda") -img1 = load_image("https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/kandinsky/cat.png") +img1 = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "/kandinsky/cat.png" +) -img2 = load_image("https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/kandinsky/starry_night.jpeg") +img2 = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "/kandinsky/starry_night.jpeg" +) -images_texts = ["a cat", img1, img2 ] -weights = [0.3,0.3,0.4] +images_texts = ["a cat", img1, img2] +weights = [0.3, 0.3, 0.4] image_emb, zero_image_emb = pipe_prior.interpolate(images_texts, weights) pipe = KandinskyPipeline.from_pretrained("YiYiXu/Kandinsky", torch_dtype=torch.float16) pipe.to("cuda") image = pipe( - "", - image_embeds=image_emb, - negative_image_embeds =zero_image_emb, - height=768, - width=768, - num_inference_steps=150 ).images[0] + "", image_embeds=image_emb, negative_image_embeds=zero_image_emb, height=768, width=768, num_inference_steps=150 +).images[0] image.save("starry_cat.png") ``` From bec700fc057d77d8eaa31b3de8fe3322d7f6e653 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 24 May 2023 01:08:13 +0000 Subject: [PATCH 136/182] slow test --- tests/pipelines/kandinsky/test_kandinsky.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/pipelines/kandinsky/test_kandinsky.py b/tests/pipelines/kandinsky/test_kandinsky.py index 7f8bd5d307b1..5cb750dcce01 100644 --- 
a/tests/pipelines/kandinsky/test_kandinsky.py +++ b/tests/pipelines/kandinsky/test_kandinsky.py @@ -245,6 +245,7 @@ def test_kandinsky_text2img(self): "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "/kandinsky/kandinsky_text2img_cat_fp16.npy" ) + pipe_prior = KandinskyPriorPipeline.from_pretrained("YiYiXu/Kandinsky-prior", torch_dtype=torch.float16) pipe_prior.to(torch_device) @@ -254,13 +255,15 @@ def test_kandinsky_text2img(self): prompt = "red cat, 4k photo" - generator = torch.Generator(device="cpu").manual_seed(0) + generator = torch.Generator(device="cuda").manual_seed(0) image_emb = pipe_prior( prompt, generator=generator, + num_inference_steps=5, ).images zero_image_emb = pipe_prior("").images - + + generator = torch.Generator(device="cuda").manual_seed(0) output = pipeline( prompt, image_embeds=image_emb, From 7fea921bb2fc243fe5a28302ad26c5ad37bbd582 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 24 May 2023 01:09:24 +0000 Subject: [PATCH 137/182] style --- tests/pipelines/kandinsky/test_kandinsky.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pipelines/kandinsky/test_kandinsky.py b/tests/pipelines/kandinsky/test_kandinsky.py index 5cb750dcce01..407f6efb49e2 100644 --- a/tests/pipelines/kandinsky/test_kandinsky.py +++ b/tests/pipelines/kandinsky/test_kandinsky.py @@ -262,7 +262,7 @@ def test_kandinsky_text2img(self): num_inference_steps=5, ).images zero_image_emb = pipe_prior("").images - + generator = torch.Generator(device="cuda").manual_seed(0) output = pipeline( prompt, From a74035212a84f9ad112716ca98966ba2184538a9 Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Tue, 23 May 2023 16:25:21 -1000 Subject: [PATCH 138/182] Update docs/source/en/api/pipelines/kandinsky.mdx Co-authored-by: Sayak Paul --- docs/source/en/api/pipelines/kandinsky.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/api/pipelines/kandinsky.mdx b/docs/source/en/api/pipelines/kandinsky.mdx index 83ed842c40b6..ad21c18a9ab9 100644 --- a/docs/source/en/api/pipelines/kandinsky.mdx +++ b/docs/source/en/api/pipelines/kandinsky.mdx @@ -11,7 +11,7 @@ specific language governing permissions and limitations under the License. ## Overview -Kandinsky 2.1 inherits best practicies from Dall-E 2 and Latent diffusion, while introducing some new ideas. +Kandinsky 2.1 inherits best practices from [DALL-E 2](https://arxiv.org/abs/2204.06125) and [Latent Diffusion](https://huggingface.co/docs/diffusers/api/pipelines/latent_diffusion), while introducing some new ideas. As text and image encoder it uses CLIP model and diffusion image prior (mapping) between latent spaces of CLIP modalities. This approach increases the visual performance of the model and unveils new horizons in blending images and text-guided image manipulation. From 457845ed3665710cd971dda11fe9bfa77a819e79 Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Tue, 23 May 2023 16:25:54 -1000 Subject: [PATCH 139/182] Update docs/source/en/api/pipelines/kandinsky.mdx Co-authored-by: Sayak Paul --- docs/source/en/api/pipelines/kandinsky.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/api/pipelines/kandinsky.mdx b/docs/source/en/api/pipelines/kandinsky.mdx index ad21c18a9ab9..9db5a569bcf1 100644 --- a/docs/source/en/api/pipelines/kandinsky.mdx +++ b/docs/source/en/api/pipelines/kandinsky.mdx @@ -13,7 +13,7 @@ specific language governing permissions and limitations under the License. 
Kandinsky 2.1 inherits best practices from [DALL-E 2](https://arxiv.org/abs/2204.06125) and [Latent Diffusion](https://huggingface.co/docs/diffusers/api/pipelines/latent_diffusion), while introducing some new ideas. -As text and image encoder it uses CLIP model and diffusion image prior (mapping) between latent spaces of CLIP modalities. This approach increases the visual performance of the model and unveils new horizons in blending images and text-guided image manipulation. +It uses [CLIP](https://huggingface.co/docs/transformers/model_doc/clip) for encoding images and text, and a diffusion image prior (mapping) between latent spaces of CLIP modalities. This approach enhances the visual performance of the model and unveils new horizons in blending images and text-guided image manipulation. The Kandinsky model in diffusers comes from ai-forever and the original codebase can be found [here](https://github.com/ai-forever/Kandinsky-2) From ae9887fa2ccaf1b0fcf65a7d22bfea217d5a8995 Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Tue, 23 May 2023 16:26:32 -1000 Subject: [PATCH 140/182] Update docs/source/en/api/pipelines/kandinsky.mdx Co-authored-by: Sayak Paul --- docs/source/en/api/pipelines/kandinsky.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/api/pipelines/kandinsky.mdx b/docs/source/en/api/pipelines/kandinsky.mdx index 9db5a569bcf1..2ac91d651ddb 100644 --- a/docs/source/en/api/pipelines/kandinsky.mdx +++ b/docs/source/en/api/pipelines/kandinsky.mdx @@ -27,7 +27,7 @@ The Kandinsky model in diffusers comes from ai-forever and the original codebase ## Usage example -In the following, we will walk you through some cool examples of using Kandinsky pipelines to create incredible artwork beyond imagination. +In the following, we will walk you through some cool examples of using the Kandinsky pipelines to create some visually aesthetic artwork. ### Text-to-Image Generation From 5d5fc8b467a5411b57354e77897d624e5e0fab6b Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Tue, 23 May 2023 16:26:53 -1000 Subject: [PATCH 141/182] Update docs/source/en/api/pipelines/kandinsky.mdx Co-authored-by: Sayak Paul --- docs/source/en/api/pipelines/kandinsky.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/api/pipelines/kandinsky.mdx b/docs/source/en/api/pipelines/kandinsky.mdx index 2ac91d651ddb..b3984f60f2d0 100644 --- a/docs/source/en/api/pipelines/kandinsky.mdx +++ b/docs/source/en/api/pipelines/kandinsky.mdx @@ -31,7 +31,7 @@ In the following, we will walk you through some cool examples of using the Kandi ### Text-to-Image Generation -For text-to-image generation, we need to use both [`KandinskyPriorPipeline] and [`KandinskyPipeline`]. The first step is to create CLIP image embedding conditioned on a text prompt. Let's throw a fun prompt at Kandinsky to see what it will come up with :) +For text-to-image generation, we need to use both [`KandinskyPriorPipeline`] and [`KandinskyPipeline`]. The first step is to create CLIP image embedding conditioned on a text prompt. 
Let's throw a fun prompt at Kandinsky to see what it comes up with :) ```python prompt = "A alien cheeseburger creature eating itself, claymation, cinematic, moody lighting" From 08d123d40eaf2100c63fd69878208811338aedb8 Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Tue, 23 May 2023 16:28:39 -1000 Subject: [PATCH 142/182] Update docs/source/en/api/pipelines/kandinsky.mdx Co-authored-by: Sayak Paul --- docs/source/en/api/pipelines/kandinsky.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/api/pipelines/kandinsky.mdx b/docs/source/en/api/pipelines/kandinsky.mdx index b3984f60f2d0..122e3ebec32d 100644 --- a/docs/source/en/api/pipelines/kandinsky.mdx +++ b/docs/source/en/api/pipelines/kandinsky.mdx @@ -97,7 +97,7 @@ One cheeseburger monster coming up! Enjoy! ![img](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/kandinsky-docs/cheeseburger.png) -Kandinsky model works really extremely well with creative prompts. Here is some of the amazing art we created using exact same process but different prompts. +The Kandinsky model works extremely well with creative prompts. Here is some of the amazing art we created using the exact same process but with different prompts. ```python prompt = "bird eye view shot of a full body woman with cyan light orange magenta makeup, digital art, long braided hair her face separated by makeup in the style of yin Yang surrealism, symmetrical face, real image, contrasting tone, pastel gradient background" From 10f47aa7ac095eecb6f5b42a88f730d4208a9075 Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Tue, 23 May 2023 16:28:59 -1000 Subject: [PATCH 143/182] Update docs/source/en/api/pipelines/kandinsky.mdx Co-authored-by: Sayak Paul --- docs/source/en/api/pipelines/kandinsky.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/api/pipelines/kandinsky.mdx b/docs/source/en/api/pipelines/kandinsky.mdx index 122e3ebec32d..b7e3575fdf8b 100644 --- a/docs/source/en/api/pipelines/kandinsky.mdx +++ b/docs/source/en/api/pipelines/kandinsky.mdx @@ -125,7 +125,7 @@ prompt = "birds eye view of a quilted paper style alien planet landscape, vibran The same Kandinsky model weights can be used for text-guided image-to-image translation. In this case, just make sure to load the weights using the [`KandinskyImg2ImgPipeline`] pipeline. **Note**: You can also directly move the weights of the text-to-image pipelines to the image-to-image pipelines -without loading them twice by making use of the [`~DiffusionPipeline.components()`] function as explained [here](#converting-between-different-pipelines). +without loading them twice by making use of the [`~DiffusionPipeline.components`] function as explained [here](#converting-between-different-pipelines). Let's download an image. 
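The `components` utility referenced in the documentation change above is worth a quick illustration. Below is a minimal sketch of the weight-sharing pattern it enables: reuse the modules of an already-loaded text-to-image pipeline to build the image-to-image pipeline without loading the checkpoint a second time. The hub ID follows the surrounding examples, and the sketch assumes both pipelines register the same set of modules (text encoder, tokenizer, UNet, scheduler, MoVQ).

```python
import torch

from diffusers import KandinskyImg2ImgPipeline, KandinskyPipeline

# Load the text-to-image pipeline once ...
pipe_t2i = KandinskyPipeline.from_pretrained("YiYiXu/Kandinsky", torch_dtype=torch.float16)
pipe_t2i.to("cuda")

# ... then construct the image-to-image pipeline from the same in-memory modules,
# avoiding a second download/load of the weights.
pipe_i2i = KandinskyImg2ImgPipeline(**pipe_t2i.components)
```

Because `components` hands back the already-instantiated modules, the second pipeline reuses them in place rather than allocating another copy of the weights.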
From 5a12a12b5417c4d7bf830d703e9fb86b5950c2cf Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 24 May 2023 03:55:29 +0000 Subject: [PATCH 144/182] update img2img slow test --- .../kandinsky/test_kandinsky_img2img.py | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/tests/pipelines/kandinsky/test_kandinsky_img2img.py b/tests/pipelines/kandinsky/test_kandinsky_img2img.py index 06eb12657049..e198c4001c3f 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_img2img.py +++ b/tests/pipelines/kandinsky/test_kandinsky_img2img.py @@ -267,22 +267,8 @@ def test_kandinsky_img2img(self): pipe_prior.to(torch_device) pipeline = KandinskyImg2ImgPipeline.from_pretrained("YiYiXu/Kandinsky", torch_dtype=torch.float16) - - ddim_config = { - "num_train_timesteps": 1000, - "beta_schedule": "linear", - "beta_start": 0.00085, - "beta_end": 0.012, - "clip_sample": False, - "set_alpha_to_one": False, # not sure what this does, so set to default value for now - "steps_offset": 0, - "prediction_type": "epsilon", - "thresholding": False, - } - - ddim_scheduler = DDIMScheduler(**ddim_config) - pipeline.scheduler = ddim_scheduler pipeline = pipeline.to(torch_device) + pipeline.set_progress_bar_config(disable=None) generator = torch.Generator(device="cpu").manual_seed(0) From 19c6f3e791b9a42b5ed4e4b0f3e38aafb18002f3 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 24 May 2023 03:56:02 +0000 Subject: [PATCH 145/182] style --- tests/pipelines/kandinsky/test_kandinsky_img2img.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pipelines/kandinsky/test_kandinsky_img2img.py b/tests/pipelines/kandinsky/test_kandinsky_img2img.py index e198c4001c3f..48bccb471aa0 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_img2img.py +++ b/tests/pipelines/kandinsky/test_kandinsky_img2img.py @@ -268,7 +268,7 @@ def test_kandinsky_img2img(self): pipeline = KandinskyImg2ImgPipeline.from_pretrained("YiYiXu/Kandinsky", torch_dtype=torch.float16) pipeline = pipeline.to(torch_device) - + pipeline.set_progress_bar_config(disable=None) generator = torch.Generator(device="cpu").manual_seed(0) From c0957cfdbc115bcb7bd792ac2304b4be1735bab7 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 24 May 2023 14:22:54 +0000 Subject: [PATCH 146/182] refactor inpaintpipeline to use ddim --- .../kandinsky/pipeline_kandinsky_inpaint.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py index 1dfad0fd1cf4..fde20110045b 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py @@ -27,7 +27,7 @@ from ...models import UNet2DConditionModel, VQModel from ...pipelines import DiffusionPipeline from ...pipelines.pipeline_utils import ImagePipelineOutput -from ...schedulers import UnCLIPScheduler +from ...schedulers import DDIMScheduler from ...utils import ( is_accelerate_available, is_accelerate_version, @@ -73,7 +73,7 @@ ... negative_image_embeds=zero_image_emb, ... height=768, ... width=768, - ... num_inference_steps=150, + ... num_inference_steps=50, ... ) >>> image = out.images[0] @@ -244,7 +244,7 @@ class KandinskyInpaintPipeline(DiffusionPipeline): Frozen text-encoder. 
tokenizer ([`XLMRobertaTokenizer`]): Tokenizer of class - scheduler ([`UnCLIPScheduler`]): + scheduler ([`DDIMScheduler`]): A scheduler to be used in combination with `unet` to generate image latents. unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the image embedding. @@ -258,7 +258,7 @@ def __init__( movq: VQModel, tokenizer: XLMRobertaTokenizer, unet: UNet2DConditionModel, - scheduler: UnCLIPScheduler, + scheduler: DDIMScheduler, ): super().__init__() @@ -640,17 +640,11 @@ def __call__( ): noise_pred, _ = noise_pred.split(latents.shape[1], dim=1) - if i + 1 == timesteps_tensor.shape[0]: - prev_timestep = None - else: - prev_timestep = timesteps_tensor[i + 1] - # compute the previous noisy sample x_t -> x_t-1 latents = self.scheduler.step( noise_pred, t, latents, - prev_timestep=prev_timestep, generator=generator, ).prev_sample From fd94d171e61c7015a5298b3513a339bdfd37d20c Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 24 May 2023 15:18:01 +0000 Subject: [PATCH 147/182] update tests for inpaint --- tests/pipelines/kandinsky/test_kandinsky_inpaint.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py index afc013e48453..e6333996fbe4 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py +++ b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py @@ -239,7 +239,7 @@ def test_kandinsky_inpaint(self): assert image.shape == (1, 64, 64, 3) expected_slice = np.array( - [0.6187187, 0.53577256, 0.48749307, 0.5421068, 0.5214845, 0.40533125, 0.40913218, 0.48657694, 0.48048347] + [0.52027917, 0.5919095, 0.4203929, 0.566414, 0.52620435, 0.4773464, 0.3785732, 0.36906868, 0.49189737] ) assert ( From 72ca4b161909bc87859af9af087231152f64c7ec Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 24 May 2023 15:18:47 +0000 Subject: [PATCH 148/182] make style --- tests/pipelines/kandinsky/test_kandinsky_inpaint.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py index e6333996fbe4..b2f4f54b7e73 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py +++ b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py @@ -239,7 +239,7 @@ def test_kandinsky_inpaint(self): assert image.shape == (1, 64, 64, 3) expected_slice = np.array( - [0.52027917, 0.5919095, 0.4203929, 0.566414, 0.52620435, 0.4773464, 0.3785732, 0.36906868, 0.49189737] + [0.52027917, 0.5919095, 0.4203929, 0.566414, 0.52620435, 0.4773464, 0.3785732, 0.36906868, 0.49189737] ) assert ( From f75796c47990f9bab2adc2b611c080fdb59a1550 Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Wed, 24 May 2023 05:22:07 -1000 Subject: [PATCH 149/182] Update docs/source/en/api/pipelines/kandinsky.mdx Co-authored-by: Patrick von Platen --- docs/source/en/api/pipelines/kandinsky.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/api/pipelines/kandinsky.mdx b/docs/source/en/api/pipelines/kandinsky.mdx index b7e3575fdf8b..ee42b6d9f255 100644 --- a/docs/source/en/api/pipelines/kandinsky.mdx +++ b/docs/source/en/api/pipelines/kandinsky.mdx @@ -31,7 +31,7 @@ In the following, we will walk you through some cool examples of using the Kandi ### Text-to-Image Generation -For text-to-image generation, we need to use both [`KandinskyPriorPipeline`] and [`KandinskyPipeline`]. The first step is to create CLIP image embedding conditioned on a text prompt. 
Let's throw a fun prompt at Kandinsky to see what it comes up with :) +For text-to-image generation, we need to use both [`KandinskyPriorPipeline`] and [`KandinskyPipeline`]. The first step is to encode text prompts with CLIP and then diffuse the CLIP text embeddings to CLIP image embeddings, as first proposed in [DALL-E 2](https://cdn.openai.com/papers/dall-e-2.pdf). Let's throw a fun prompt at Kandinsky to see what it comes up with :) ```python prompt = "A alien cheeseburger creature eating itself, claymation, cinematic, moody lighting" From ae00c840e7dd408247c95523e5e6c42eae52c83d Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Wed, 24 May 2023 05:22:51 -1000 Subject: [PATCH 150/182] Update docs/source/en/api/pipelines/kandinsky.mdx Co-authored-by: Patrick von Platen --- docs/source/en/api/pipelines/kandinsky.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/api/pipelines/kandinsky.mdx b/docs/source/en/api/pipelines/kandinsky.mdx index ee42b6d9f255..d807eea61581 100644 --- a/docs/source/en/api/pipelines/kandinsky.mdx +++ b/docs/source/en/api/pipelines/kandinsky.mdx @@ -38,7 +38,7 @@ prompt = "A alien cheeseburger creature eating itself, claymation, cinematic, mo negative_prompt = "low quality, bad quality" ``` -You should create the CLIP image embeddings for both your `prompt` and `negavie_prompt`. Also, the prior pipeline we use to create these CLIP image embeddings is a diffusion model itself, so you can use `negative_prompt`, `guidance_scale`, and `num_inference_steps` arguments to guide this process, just like how you would normally do with all other pipelines in diffusers. +We will pass both the `prompt` and `negative_prompt` to our prior diffusion pipeline. In contrast to other diffusion pipelines, such as Stable Diffusion, the `prompt` and `negative_prompt` shall be passed separately so that we can retrieve a CLIP image embedding for each prompt input. You can use `guidance_scale`, and `num_inference_steps` arguments to guide this process, just like how you would normally do with all other pipelines in diffusers. ```python from diffusers import KandinskyPipeline, KandinskyPriorPipeline From bab4c51e33f80c767e7d43952fc2d9ad340d1469 Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Wed, 24 May 2023 05:23:29 -1000 Subject: [PATCH 151/182] Update docs/source/en/api/pipelines/kandinsky.mdx Co-authored-by: Patrick von Platen --- docs/source/en/api/pipelines/kandinsky.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/api/pipelines/kandinsky.mdx b/docs/source/en/api/pipelines/kandinsky.mdx index d807eea61581..df31c082cee9 100644 --- a/docs/source/en/api/pipelines/kandinsky.mdx +++ b/docs/source/en/api/pipelines/kandinsky.mdx @@ -41,7 +41,7 @@ negative_prompt = "low quality, bad quality" We will pass both the `prompt` and `negative_prompt` to our prior diffusion pipeline. In contrast to other diffusion pipelines, such as Stable Diffusion, the `prompt` and `negative_prompt` shall be passed separately so that we can retrieve a CLIP image embedding for each prompt input. You can use `guidance_scale`, and `num_inference_steps` arguments to guide this process, just like how you would normally do with all other pipelines in diffusers. 
```python -from diffusers import KandinskyPipeline, KandinskyPriorPipeline +from diffusers import KandinskyPriorPipeline import torch # create prior From 7b0a3fca40212dca8a2295382c7ebeac9f67f3bc Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Wed, 24 May 2023 05:23:38 -1000 Subject: [PATCH 152/182] Update docs/source/en/api/pipelines/kandinsky.mdx Co-authored-by: Patrick von Platen --- docs/source/en/api/pipelines/kandinsky.mdx | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/en/api/pipelines/kandinsky.mdx b/docs/source/en/api/pipelines/kandinsky.mdx index df31c082cee9..2c29169e5c77 100644 --- a/docs/source/en/api/pipelines/kandinsky.mdx +++ b/docs/source/en/api/pipelines/kandinsky.mdx @@ -62,6 +62,7 @@ Once we create the image embedding, we can use [`KandinskyPipeline`] to generate ```python from PIL import Image +from diffusers import KandinskyPipeline def image_grid(imgs, rows, cols): From ab410079da4cdf7200795ac3249219055a3fe4b6 Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Wed, 24 May 2023 05:23:50 -1000 Subject: [PATCH 153/182] Update docs/source/en/api/pipelines/kandinsky.mdx Co-authored-by: Patrick von Platen --- docs/source/en/api/pipelines/kandinsky.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/api/pipelines/kandinsky.mdx b/docs/source/en/api/pipelines/kandinsky.mdx index 2c29169e5c77..5eec6822c2e7 100644 --- a/docs/source/en/api/pipelines/kandinsky.mdx +++ b/docs/source/en/api/pipelines/kandinsky.mdx @@ -98,7 +98,7 @@ One cheeseburger monster coming up! Enjoy! ![img](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/kandinsky-docs/cheeseburger.png) -The Kandinsky model works extremely well with creative prompts. Here is some of the amazing art we created using the exact same process but with different prompts. +The Kandinsky model works extremely well with creative prompts. Here is some of the amazing art that can be created using the exact same process but with different prompts. 
```python prompt = "bird eye view shot of a full body woman with cyan light orange magenta makeup, digital art, long braided hair her face separated by makeup in the style of yin Yang surrealism, symmetrical face, real image, contrasting tone, pastel gradient background" From 416c922b0a96fc3866bb2c1c71dbe9c7656dd698 Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Wed, 24 May 2023 05:24:09 -1000 Subject: [PATCH 154/182] Update docs/source/en/api/pipelines/kandinsky.mdx Co-authored-by: Patrick von Platen --- docs/source/en/api/pipelines/kandinsky.mdx | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/en/api/pipelines/kandinsky.mdx b/docs/source/en/api/pipelines/kandinsky.mdx index 5eec6822c2e7..5c7221b27748 100644 --- a/docs/source/en/api/pipelines/kandinsky.mdx +++ b/docs/source/en/api/pipelines/kandinsky.mdx @@ -209,6 +209,7 @@ init_image = load_image( ) mask = np.ones((768, 768), dtype=np.float32) +# Let's mask out an area above the cat's head mask[:250, 250:-250] = 0 out = pipe( From 2e2c0ba44c0374da1a397139175df616c3ab8ca2 Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Wed, 24 May 2023 05:24:36 -1000 Subject: [PATCH 155/182] Update docs/source/en/api/pipelines/kandinsky.mdx Co-authored-by: Patrick von Platen --- docs/source/en/api/pipelines/kandinsky.mdx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/en/api/pipelines/kandinsky.mdx b/docs/source/en/api/pipelines/kandinsky.mdx index 5c7221b27748..975417f0e032 100644 --- a/docs/source/en/api/pipelines/kandinsky.mdx +++ b/docs/source/en/api/pipelines/kandinsky.mdx @@ -273,6 +273,8 @@ image.save("starry_cat.png") - all - __call__ +## KandinskyInpaintPipeline + [[autodoc]] KandinskyInpaintPipeline - all - __call__ From 47852a9c1e425cb377d26ada8114eaa3f33ded51 Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Wed, 24 May 2023 05:24:54 -1000 Subject: [PATCH 156/182] Update docs/source/en/api/pipelines/kandinsky.mdx Co-authored-by: Patrick von Platen --- docs/source/en/api/pipelines/kandinsky.mdx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/en/api/pipelines/kandinsky.mdx b/docs/source/en/api/pipelines/kandinsky.mdx index 975417f0e032..8d167c73dedc 100644 --- a/docs/source/en/api/pipelines/kandinsky.mdx +++ b/docs/source/en/api/pipelines/kandinsky.mdx @@ -278,6 +278,8 @@ image.save("starry_cat.png") [[autodoc]] KandinskyInpaintPipeline - all - __call__ + +## KandinskyImg2ImgPipeline [[autodoc]] KandinskyImg2ImgPipeline - all From f9fcf471ac76620e9b3dc7009319b93fe3a367eb Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Wed, 24 May 2023 05:25:11 -1000 Subject: [PATCH 157/182] Update docs/source/en/api/pipelines/kandinsky.mdx Co-authored-by: Patrick von Platen --- docs/source/en/api/pipelines/kandinsky.mdx | 4 ---- 1 file changed, 4 deletions(-) diff --git a/docs/source/en/api/pipelines/kandinsky.mdx b/docs/source/en/api/pipelines/kandinsky.mdx index 8d167c73dedc..d4c6f58e1b61 100644 --- a/docs/source/en/api/pipelines/kandinsky.mdx +++ b/docs/source/en/api/pipelines/kandinsky.mdx @@ -285,7 +285,3 @@ image.save("starry_cat.png") - all - __call__ -[[autodoc]] KandinskyPriorPipeline - - all - - __call__ - - interpolate From e719e609e349c8e44ff3220e85f1ad0769a98194 Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Wed, 24 May 2023 05:25:32 -1000 Subject: [PATCH 158/182] Update docs/source/en/api/pipelines/kandinsky.mdx Co-authored-by: Patrick von Platen --- docs/source/en/api/pipelines/kandinsky.mdx | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/source/en/api/pipelines/kandinsky.mdx 
b/docs/source/en/api/pipelines/kandinsky.mdx index d4c6f58e1b61..b2359899531f 100644 --- a/docs/source/en/api/pipelines/kandinsky.mdx +++ b/docs/source/en/api/pipelines/kandinsky.mdx @@ -267,6 +267,13 @@ image.save("starry_cat.png") ![img](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/kandinsky-docs/starry_cat.png) +## KandinskyPriorPipeline + +[[autodoc]] KandinskyPriorPipeline + - all + - __call__ + - interpolate + ## KandinskyPipeline [[autodoc]] KandinskyPipeline From 2228f2711d2b4803dd59c43fdbd3fde80ced41c8 Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Wed, 24 May 2023 05:25:58 -1000 Subject: [PATCH 159/182] Update src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py Co-authored-by: Patrick von Platen --- src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py index c4661822d15a..fa38730cdd5f 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py @@ -41,7 +41,7 @@ >>> from diffusers import KandinskyPipeline, KandinskyPriorPipeline >>> import torch - >>> pipe_prior = KandinskyPriorPipeline.from_pretrained("YiYiXu/Kandinsky-prior") + >>> pipe_prior = KandinskyPriorPipeline.from_pretrained("kandinsky-community/Kandinsky-prior") >>> pipe_prior.to("cuda") >>> prompt = "red cat, 4k photo" From 9939627a4317cc006afa71c08f1f8c763b25433a Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Wed, 24 May 2023 05:26:27 -1000 Subject: [PATCH 160/182] Update src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py Co-authored-by: Patrick von Platen --- src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py index ffc2bf4afa7c..4d2c16e4c495 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py @@ -18,7 +18,6 @@ import numpy as np import PIL import torch -from torchvision import transforms from transformers import CLIPTextModelWithProjection, CLIPTokenizer, CLIPVisionModelWithProjection from ...models import PriorTransformer From 7ee49a34c305be96ad352a4e778dc07fc888e337 Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Wed, 24 May 2023 05:27:38 -1000 Subject: [PATCH 161/182] Update src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py Co-authored-by: Patrick von Platen --- src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py index 4d2c16e4c495..43579c5b8008 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py @@ -229,7 +229,7 @@ def interpolate( Examples: Returns: - `tuple` + [`KandinskyPriorPipelineOutput`] or `tuple` """ device = device or self.device From a9802aac3b6492c04cab33569ba07247470d2639 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 24 May 2023 17:00:42 +0000 Subject: [PATCH 162/182] remove torchvision transform --- .../kandinsky/pipeline_kandinsky_prior.py | 22 +++++-------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git 
a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py index 43579c5b8008..0dc69a5de8f4 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py @@ -18,7 +18,7 @@ import numpy as np import PIL import torch -from transformers import CLIPTextModelWithProjection, CLIPTokenizer, CLIPVisionModelWithProjection +from transformers import CLIPTextModelWithProjection, CLIPTokenizer, CLIPVisionModelWithProjection, CLIPImageProcessor from ...models import PriorTransformer from ...pipelines import DiffusionPipeline @@ -108,21 +108,6 @@ """ -def _convert_image_to_rgb(image): - return image.convert("RGB") - - -image_transforms = transforms.Compose( - [ - transforms.Resize(224, interpolation=transforms.InterpolationMode.BICUBIC), - transforms.CenterCrop(224), - _convert_image_to_rgb, - transforms.ToTensor(), - transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)), - ] -) - - @dataclass class KandinskyPriorPipelineOutput(BaseOutput): """ @@ -167,6 +152,7 @@ def __init__( text_encoder: CLIPTextModelWithProjection, tokenizer: CLIPTokenizer, scheduler: UnCLIPScheduler, + image_processor: CLIPImageProcessor ): super().__init__() @@ -176,6 +162,7 @@ def __init__( tokenizer=tokenizer, scheduler=scheduler, image_encoder=image_encoder, + image_processor=image_processor, ) @torch.no_grad() @@ -254,7 +241,8 @@ def interpolate( elif isinstance(cond, (PIL.Image.Image, torch.Tensor)): if isinstance(cond, PIL.Image.Image): - cond = image_transforms(cond).unsqueeze(0).to(dtype=self.image_encoder.dtype, device=device) + cond = self.image_processor( + cond, return_tensors='pt').pixel_values[0].unsqueeze(0).to(dtype=self.image_encoder.dtype, device=device) image_emb = self.image_encoder(cond)["image_embeds"] From ac747883bd626a8cc266b53d9b760047802b066f Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 24 May 2023 18:02:45 +0000 Subject: [PATCH 163/182] fix test for prior --- .../kandinsky/test_kandinsky_prior.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tests/pipelines/kandinsky/test_kandinsky_prior.py b/tests/pipelines/kandinsky/test_kandinsky_prior.py index abd3b1c714fa..1cdb3ed7f50a 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_prior.py +++ b/tests/pipelines/kandinsky/test_kandinsky_prior.py @@ -24,6 +24,7 @@ CLIPTokenizer, CLIPVisionConfig, CLIPVisionModelWithProjection, + CLIPImageProcessor ) from diffusers import KandinskyPriorPipeline, PriorTransformer, UnCLIPScheduler @@ -126,12 +127,28 @@ def dummy_image_encoder(self): model = CLIPVisionModelWithProjection(config) return model + + @property + def dummy_image_processor(self): + image_processor = CLIPImageProcessor( + crop_size = 224, + do_center_crop=True, + do_normalize=True, + do_resize=True, + image_mean=[0.48145466,0.4578275,0.40821073], + image_std= [0.26862954,0.26130258,0.27577711], + resample=3, + size=224,) + + return image_processor + def get_dummy_components(self): prior = self.dummy_prior image_encoder = self.dummy_image_encoder text_encoder = self.dummy_text_encoder tokenizer = self.dummy_tokenizer + image_processor=self.dummy_image_processor scheduler = UnCLIPScheduler( variance_type="fixed_small_log", @@ -147,6 +164,7 @@ def get_dummy_components(self): "text_encoder": text_encoder, "tokenizer": tokenizer, "scheduler": scheduler, + "image_processor": image_processor, } return components From 
87faa117360de6bfc990c5e63ecc22fa18b7efcf Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 24 May 2023 18:03:51 +0000 Subject: [PATCH 164/182] style --- .../kandinsky/pipeline_kandinsky_prior.py | 12 ++++++++---- .../kandinsky/test_kandinsky_prior.py | 18 +++++++++--------- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py index 0dc69a5de8f4..8cf35e2cba94 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py @@ -18,7 +18,7 @@ import numpy as np import PIL import torch -from transformers import CLIPTextModelWithProjection, CLIPTokenizer, CLIPVisionModelWithProjection, CLIPImageProcessor +from transformers import CLIPImageProcessor, CLIPTextModelWithProjection, CLIPTokenizer, CLIPVisionModelWithProjection from ...models import PriorTransformer from ...pipelines import DiffusionPipeline @@ -152,7 +152,7 @@ def __init__( text_encoder: CLIPTextModelWithProjection, tokenizer: CLIPTokenizer, scheduler: UnCLIPScheduler, - image_processor: CLIPImageProcessor + image_processor: CLIPImageProcessor, ): super().__init__() @@ -241,8 +241,12 @@ def interpolate( elif isinstance(cond, (PIL.Image.Image, torch.Tensor)): if isinstance(cond, PIL.Image.Image): - cond = self.image_processor( - cond, return_tensors='pt').pixel_values[0].unsqueeze(0).to(dtype=self.image_encoder.dtype, device=device) + cond = ( + self.image_processor(cond, return_tensors="pt") + .pixel_values[0] + .unsqueeze(0) + .to(dtype=self.image_encoder.dtype, device=device) + ) image_emb = self.image_encoder(cond)["image_embeds"] diff --git a/tests/pipelines/kandinsky/test_kandinsky_prior.py b/tests/pipelines/kandinsky/test_kandinsky_prior.py index 1cdb3ed7f50a..5ed1f2ac984d 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_prior.py +++ b/tests/pipelines/kandinsky/test_kandinsky_prior.py @@ -19,12 +19,12 @@ import torch from torch import nn from transformers import ( + CLIPImageProcessor, CLIPTextConfig, CLIPTextModelWithProjection, CLIPTokenizer, CLIPVisionConfig, CLIPVisionModelWithProjection, - CLIPImageProcessor ) from diffusers import KandinskyPriorPipeline, PriorTransformer, UnCLIPScheduler @@ -127,28 +127,28 @@ def dummy_image_encoder(self): model = CLIPVisionModelWithProjection(config) return model - + @property def dummy_image_processor(self): image_processor = CLIPImageProcessor( - crop_size = 224, + crop_size=224, do_center_crop=True, do_normalize=True, do_resize=True, - image_mean=[0.48145466,0.4578275,0.40821073], - image_std= [0.26862954,0.26130258,0.27577711], + image_mean=[0.48145466, 0.4578275, 0.40821073], + image_std=[0.26862954, 0.26130258, 0.27577711], resample=3, - size=224,) - - return image_processor + size=224, + ) + return image_processor def get_dummy_components(self): prior = self.dummy_prior image_encoder = self.dummy_image_encoder text_encoder = self.dummy_text_encoder tokenizer = self.dummy_tokenizer - image_processor=self.dummy_image_processor + image_processor = self.dummy_image_processor scheduler = UnCLIPScheduler( variance_type="fixed_small_log", From 2e073eab3552a3fabbe7d3abcd0c62f20d761303 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 24 May 2023 18:56:43 +0000 Subject: [PATCH 165/182] revert unclip scheduler --- src/diffusers/schedulers/scheduling_unclip.py | 51 +------------------ 1 file changed, 1 insertion(+), 50 deletions(-) diff --git 
a/src/diffusers/schedulers/scheduling_unclip.py b/src/diffusers/schedulers/scheduling_unclip.py index 411ec1393d47..ee1146904d04 100644 --- a/src/diffusers/schedulers/scheduling_unclip.py +++ b/src/diffusers/schedulers/scheduling_unclip.py @@ -100,15 +100,6 @@ class UnCLIPScheduler(SchedulerMixin, ConfigMixin): prediction_type (`str`, default `epsilon`, optional): prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion process) or `sample` (directly predicting the noisy sample`) - thresholding (`bool`, default `False`): - whether to use the "dynamic thresholding" method (introduced by Imagen, https://arxiv.org/abs/2205.11487). - Note that the thresholding method is unsuitable for latent-space diffusion models (such as - stable-diffusion). - dynamic_thresholding_ratio (`float`, default `0.995`): - the ratio for the dynamic thresholding method. Default is `0.995`, the same as Imagen - (https://arxiv.org/abs/2205.11487). Valid only when `thresholding=True`. - sample_max_value (`float`, default `1.0`): - the threshold value for dynamic thresholding. Valid only when `thresholding=True`. """ @register_to_config @@ -212,47 +203,10 @@ def _get_variance(self, t, prev_timestep=None, predicted_variance=None, variance max_log = beta.log() frac = (predicted_variance + 1) / 2 - # this is log variance variance = frac * max_log + (1 - frac) * min_log return variance - def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: - """ - "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the - prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by - s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing - pixels from saturation at each step. We find that dynamic thresholding results in significantly better - photorealism as well as better image-text alignment, especially when using very large guidance weights." - - https://arxiv.org/abs/2205.11487 - """ - dtype = sample.dtype - batch_size, channels, height, width = sample.shape - - if dtype not in (torch.float32, torch.float64): - sample = sample.float() # upcast for quantile calculation, and clamp not implemented for cpu half - - # Flatten sample for doing quantile calculation along each image - sample = sample.reshape(batch_size, channels * height * width) - - abs_sample = sample.abs() # "a certain percentile absolute pixel value" - - s = torch.quantile(abs_sample, self.config.dynamic_thresholding_ratio, dim=1) - s = torch.clamp( - s, - min=self.config.sample_min_value, - max=self.config.sample_max_value, - ) # When clamped to min=1, equivalent to standard clipping to [-1, 1] - - s = s.unsqueeze(1) # (batch_size, 1) because clamp will broadcast along dim=0 - sample = torch.clamp(sample, -s, s) / s # "we threshold xt0 to the range [-s, s] and then divide by s" - - sample = sample.reshape(batch_size, channels, height, width) - sample = sample.to(dtype) - - return sample - def step( self, model_output: torch.FloatTensor, @@ -317,15 +271,12 @@ def step( " for the UnCLIPScheduler." ) - # 3. Clip/threhold "predicted x_0" + # 3. Clip "predicted x_0" if self.config.clip_sample: pred_original_sample = torch.clamp( pred_original_sample, -self.config.clip_sample_range, self.config.clip_sample_range ) - if self.config.thresholding: - pred_original_sample = self._threshold_sample(pred_original_sample) - # 4. 
Compute coefficients for pred_original_sample x_0 and current sample x_t # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf pred_original_sample_coeff = (alpha_prod_t_prev ** (0.5) * beta) / beta_prod_t From dab1a1f8bc5b67912260f5dbe4fa2a5329daeeaa Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 24 May 2023 18:59:06 +0000 Subject: [PATCH 166/182] more --- src/diffusers/schedulers/scheduling_unclip.py | 20 +++++-------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/src/diffusers/schedulers/scheduling_unclip.py b/src/diffusers/schedulers/scheduling_unclip.py index ee1146904d04..d44edcb1812a 100644 --- a/src/diffusers/schedulers/scheduling_unclip.py +++ b/src/diffusers/schedulers/scheduling_unclip.py @@ -109,23 +109,13 @@ def __init__( variance_type: str = "fixed_small_log", clip_sample: bool = True, clip_sample_range: Optional[float] = 1.0, - thresholding: bool = False, - dynamic_thresholding_ratio: float = 0.995, - sample_min_value: Optional[float] = None, - sample_max_value: Optional[float] = 1.0, prediction_type: str = "epsilon", - beta_schedule: str = "squaredcos_cap_v2", # "linear" - beta_start: float = 0.0001, - beta_end: float = 0.02, + beta_schedule: str = "squaredcos_cap_v2", ): - if beta_schedule == "squaredcos_cap_v2": - self.betas = betas_for_alpha_bar(num_train_timesteps) - elif beta_schedule == "linear": - # Linear schedule from Ho et al, extended to work for any number of diffusion steps. - scale = 1000 / num_train_timesteps - self.betas = torch.linspace(beta_start * scale, beta_end * scale, num_train_timesteps, dtype=torch.float64) - else: - raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") + if beta_schedule != "squaredcos_cap_v2": + raise ValueError("UnCLIPScheduler only supports `beta_schedule`: 'squaredcos_cap_v2'") + + self.betas = betas_for_alpha_bar(num_train_timesteps) self.alphas = 1.0 - self.betas self.alphas_cumprod = torch.cumprod(self.alphas, dim=0) From 410704e9fbcd77ebd793d7d8045eb332c9266ecb Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 24 May 2023 19:11:31 +0000 Subject: [PATCH 167/182] fix inpaint test - update scheduler --- .../kandinsky/test_kandinsky_inpaint.py | 23 ++++++++----------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py index b2f4f54b7e73..ae4b420e834f 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py +++ b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py @@ -22,7 +22,7 @@ from PIL import Image from transformers import XLMRobertaTokenizerFast -from diffusers import KandinskyInpaintPipeline, KandinskyPriorPipeline, UnCLIPScheduler, UNet2DConditionModel, VQModel +from diffusers import KandinskyInpaintPipeline, KandinskyPriorPipeline, DDIMScheduler, UNet2DConditionModel, VQModel from diffusers.pipelines.kandinsky.text_encoder import MCLIPConfig, MultilingualCLIP from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu @@ -159,19 +159,16 @@ def get_dummy_components(self): unet = self.dummy_unet movq = self.dummy_movq - scheduler = UnCLIPScheduler( - clip_sample=True, - clip_sample_range=2.0, - sample_min_value=1.0, - sample_max_value=None, + scheduler = DDIMScheduler( num_train_timesteps=1000, - prediction_type="epsilon", - variance_type="learned_range", - thresholding=True, - beta_schedule="linear", - 
beta_start=0.00085, + beta_schedule= "linear", + beta_start= 0.00085, beta_end=0.012, - ) + clip_sample=False, + set_alpha_to_one=False, + steps_offset= 1, + prediction_type= "epsilon", + thresholding=False) components = { "text_encoder": text_encoder, @@ -239,7 +236,7 @@ def test_kandinsky_inpaint(self): assert image.shape == (1, 64, 64, 3) expected_slice = np.array( - [0.52027917, 0.5919095, 0.4203929, 0.566414, 0.52620435, 0.4773464, 0.3785732, 0.36906868, 0.49189737] + [0.8326919, 0.73790467, 0.20918581, 0.9309612, 0.5511791, 0.43713328, 0.5513321, 0.49922934, 0.59497786] ) assert ( From eb064398173f9514d9a6a9e524f58eb882b6c2de Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 24 May 2023 19:12:10 +0000 Subject: [PATCH 168/182] make style --- .../kandinsky/test_kandinsky_inpaint.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py index ae4b420e834f..60d847ee1d08 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py +++ b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py @@ -22,7 +22,7 @@ from PIL import Image from transformers import XLMRobertaTokenizerFast -from diffusers import KandinskyInpaintPipeline, KandinskyPriorPipeline, DDIMScheduler, UNet2DConditionModel, VQModel +from diffusers import DDIMScheduler, KandinskyInpaintPipeline, KandinskyPriorPipeline, UNet2DConditionModel, VQModel from diffusers.pipelines.kandinsky.text_encoder import MCLIPConfig, MultilingualCLIP from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu @@ -161,14 +161,15 @@ def get_dummy_components(self): scheduler = DDIMScheduler( num_train_timesteps=1000, - beta_schedule= "linear", - beta_start= 0.00085, + beta_schedule="linear", + beta_start=0.00085, beta_end=0.012, clip_sample=False, - set_alpha_to_one=False, - steps_offset= 1, - prediction_type= "epsilon", - thresholding=False) + set_alpha_to_one=False, + steps_offset=1, + prediction_type="epsilon", + thresholding=False, + ) components = { "text_encoder": text_encoder, @@ -236,7 +237,7 @@ def test_kandinsky_inpaint(self): assert image.shape == (1, 64, 64, 3) expected_slice = np.array( - [0.8326919, 0.73790467, 0.20918581, 0.9309612, 0.5511791, 0.43713328, 0.5513321, 0.49922934, 0.59497786] + [0.8326919, 0.73790467, 0.20918581, 0.9309612, 0.5511791, 0.43713328, 0.5513321, 0.49922934, 0.59497786] ) assert ( From b945f8f2e44531519a0198e1592599bf8fc1f445 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 24 May 2023 19:40:58 +0000 Subject: [PATCH 169/182] change checkpoint name for prior --- docs/source/en/api/pipelines/kandinsky.mdx | 8 ++++---- .../pipelines/kandinsky/pipeline_kandinsky_img2img.py | 2 +- .../pipelines/kandinsky/pipeline_kandinsky_inpaint.py | 2 +- .../pipelines/kandinsky/pipeline_kandinsky_prior.py | 4 ++-- tests/pipelines/kandinsky/test_kandinsky.py | 2 +- tests/pipelines/kandinsky/test_kandinsky_img2img.py | 2 +- tests/pipelines/kandinsky/test_kandinsky_inpaint.py | 2 +- 7 files changed, 11 insertions(+), 11 deletions(-) diff --git a/docs/source/en/api/pipelines/kandinsky.mdx b/docs/source/en/api/pipelines/kandinsky.mdx index b2359899531f..30bb1ee233b0 100644 --- a/docs/source/en/api/pipelines/kandinsky.mdx +++ b/docs/source/en/api/pipelines/kandinsky.mdx @@ -45,7 +45,7 @@ from diffusers import KandinskyPriorPipeline import torch # create prior -pipe_prior = 
KandinskyPriorPipeline.from_pretrained("YiYiXu/Kandinsky-prior", torch_dtype=torch.float16) +pipe_prior = KandinskyPriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16) pipe_prior.to("cuda") generator = torch.Generator(device="cuda").manual_seed(12) @@ -149,7 +149,7 @@ import torch from diffusers import KandinskyImg2ImgPipeline, KandinskyPriorPipeline # create prior -pipe_prior = KandinskyPriorPipeline.from_pretrained("YiYiXu/Kandinsky-prior", torch_dtype=torch.float16) +pipe_prior = KandinskyPriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16) pipe_prior.to("cuda") # create img2img pipeline @@ -195,7 +195,7 @@ from diffusers.utils import load_image import torch import numpy as np -pipe_prior = KandinskyPriorPipeline.from_pretrained("YiYiXu/Kandinsky-prior", torch_dtype=torch.float16) +pipe_prior = KandinskyPriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16) pipe_prior.to("cuda") prompt = "a hat" @@ -240,7 +240,7 @@ import PIL import torch from torchvision import transforms -pipe_prior = KandinskyPriorPipeline.from_pretrained("YiYiXu/Kandinsky-prior", torch_dtype=torch.float16) +pipe_prior = KandinskyPriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16) pipe_prior.to("cuda") img1 = load_image( diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py index d9ff023ba6cb..39ee3b1e40c5 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py @@ -45,7 +45,7 @@ >>> from diffusers.utils import load_image >>> import torch - >>> pipe_prior = KandinskyPriorPipeline.from_pretrained("YiYiXu/Kandinsky-prior", torch_dtype=torch.float16) + >>> pipe_prior = KandinskyPriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16) >>> pipe_prior.to("cuda") >>> prompt = "A red cartoon frog, 4k" diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py index fde20110045b..0f5840136f39 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py @@ -48,7 +48,7 @@ >>> import torch >>> import numpy as np - >>> pipe_prior = KandinskyPriorPipeline.from_pretrained("YiYiXu/Kandinsky-prior", torch_dtype=torch.float16) + >>> pipe_prior = KandinskyPriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16) >>> pipe_prior.to("cuda") >>> prompt = "a hat" diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py index 8cf35e2cba94..bdd5ec65c6bb 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py @@ -40,7 +40,7 @@ >>> from diffusers import KandinskyPipeline, KandinskyPriorPipeline >>> import torch - >>> pipe_prior = KandinskyPriorPipeline.from_pretrained("YiYiXu/Kandinsky-prior") + >>> pipe_prior = KandinskyPriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-1-prior") >>> pipe_prior.to("cuda") >>> prompt = "red cat, 4k photo" @@ -74,7 +74,7 @@ >>> import torch >>> from torchvision import transforms - >>> pipe_prior = 
KandinskyPriorPipeline.from_pretrained("YiYiXu/Kandinsky-prior", torch_dtype=torch.float16) + >>> pipe_prior = KandinskyPriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16) >>> pipe_prior.to("cuda") >>> img1 = load_image( diff --git a/tests/pipelines/kandinsky/test_kandinsky.py b/tests/pipelines/kandinsky/test_kandinsky.py index 407f6efb49e2..c29a4090187a 100644 --- a/tests/pipelines/kandinsky/test_kandinsky.py +++ b/tests/pipelines/kandinsky/test_kandinsky.py @@ -246,7 +246,7 @@ def test_kandinsky_text2img(self): "/kandinsky/kandinsky_text2img_cat_fp16.npy" ) - pipe_prior = KandinskyPriorPipeline.from_pretrained("YiYiXu/Kandinsky-prior", torch_dtype=torch.float16) + pipe_prior = KandinskyPriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16) pipe_prior.to(torch_device) pipeline = KandinskyPipeline.from_pretrained("YiYiXu/Kandinsky", torch_dtype=torch.float16) diff --git a/tests/pipelines/kandinsky/test_kandinsky_img2img.py b/tests/pipelines/kandinsky/test_kandinsky_img2img.py index 48bccb471aa0..43ec7961f182 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_img2img.py +++ b/tests/pipelines/kandinsky/test_kandinsky_img2img.py @@ -263,7 +263,7 @@ def test_kandinsky_img2img(self): ) prompt = "A red cartoon frog, 4k" - pipe_prior = KandinskyPriorPipeline.from_pretrained("YiYiXu/Kandinsky-prior", torch_dtype=torch.float16) + pipe_prior = KandinskyPriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16) pipe_prior.to(torch_device) pipeline = KandinskyImg2ImgPipeline.from_pretrained("YiYiXu/Kandinsky", torch_dtype=torch.float16) diff --git a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py index 60d847ee1d08..26083a11f912 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py +++ b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py @@ -271,7 +271,7 @@ def test_kandinsky_inpaint(self): prompt = "a hat" - pipe_prior = KandinskyPriorPipeline.from_pretrained("YiYiXu/Kandinsky-prior", torch_dtype=torch.float16) + pipe_prior = KandinskyPriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16) pipe_prior.to(torch_device) pipeline = KandinskyInpaintPipeline.from_pretrained("YiYiXu/Kandinsky-inpaint", torch_dtype=torch.float16) From 4145c051fd15468ccb7b575c946d2315fb8ec50d Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 24 May 2023 19:52:35 +0000 Subject: [PATCH 170/182] update ckpt name for text2img --- docs/source/en/api/pipelines/kandinsky.mdx | 6 +++--- src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py | 2 +- .../pipelines/kandinsky/pipeline_kandinsky_img2img.py | 2 +- .../pipelines/kandinsky/pipeline_kandinsky_prior.py | 4 ++-- tests/pipelines/kandinsky/test_kandinsky.py | 2 +- tests/pipelines/kandinsky/test_kandinsky_img2img.py | 2 +- 6 files changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/source/en/api/pipelines/kandinsky.mdx b/docs/source/en/api/pipelines/kandinsky.mdx index 30bb1ee233b0..fb315267f24e 100644 --- a/docs/source/en/api/pipelines/kandinsky.mdx +++ b/docs/source/en/api/pipelines/kandinsky.mdx @@ -78,7 +78,7 @@ def image_grid(imgs, rows, cols): # create diffuser pipeline -pipe = KandinskyPipeline.from_pretrained("YiYiXu/Kandinsky", torch_dtype=torch.float16) +pipe = KandinskyPipeline.from_pretrained("kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16) pipe.to("cuda") images = pipe( @@ -153,7 +153,7 @@ 
pipe_prior = KandinskyPriorPipeline.from_pretrained("kandinsky-community/kandins pipe_prior.to("cuda") # create img2img pipeline -pipe = KandinskyImg2ImgPipeline.from_pretrained("YiYiXu/Kandinsky", torch_dtype=torch.float16) +pipe = KandinskyImg2ImgPipeline.from_pretrained("kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16) pipe.to("cuda") prompt = "A fantasy landscape, Cinematic lighting" @@ -255,7 +255,7 @@ images_texts = ["a cat", img1, img2] weights = [0.3, 0.3, 0.4] image_emb, zero_image_emb = pipe_prior.interpolate(images_texts, weights) -pipe = KandinskyPipeline.from_pretrained("YiYiXu/Kandinsky", torch_dtype=torch.float16) +pipe = KandinskyPipeline.from_pretrained("kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16) pipe.to("cuda") image = pipe( diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py index fa38730cdd5f..29545bd88dc2 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py @@ -49,7 +49,7 @@ >>> image_emb = out.images >>> zero_image_emb = out.zero_embeds - >>> pipe = KandinskyPipeline.from_pretrained("YiYiXu/Kandinsky") + >>> pipe = KandinskyPipeline.from_pretrained("kandinsky-community/kandinsky-2-1") >>> pipe.to("cuda") >>> image = pipe( diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py index 39ee3b1e40c5..d161f28101ec 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py @@ -51,7 +51,7 @@ >>> prompt = "A red cartoon frog, 4k" >>> image_emb, zero_image_emb = pipe_prior(prompt, return_dict=False) - >>> pipe = KandinskyImg2ImgPipeline.from_pretrained("YiYiXu/Kandinsky-img2img", torch_dtype=torch.float16) + >>> pipe = KandinskyImg2ImgPipeline.from_pretrained("kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16) >>> pipe.to("cuda") >>> init_image = load_image( diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py index bdd5ec65c6bb..6aa7681c2af5 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py @@ -48,7 +48,7 @@ >>> image_emb = out.images >>> zero_image_emb = out.zero_embeds - >>> pipe = KandinskyPipeline.from_pretrained("YiYiXu/Kandinsky") + >>> pipe = KandinskyPipeline.from_pretrained("kandinsky-community/kandinsky-2-1") >>> pipe.to("cuda") >>> image = pipe( @@ -91,7 +91,7 @@ >>> weights = [0.3, 0.3, 0.4] >>> image_emb, zero_image_emb = pipe_prior.interpolate(images_texts, weights) - >>> pipe = KandinskyPipeline.from_pretrained("YiYiXu/Kandinsky", torch_dtype=torch.float16) + >>> pipe = KandinskyPipeline.from_pretrained("kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16) >>> pipe.to("cuda") >>> image = pipe( diff --git a/tests/pipelines/kandinsky/test_kandinsky.py b/tests/pipelines/kandinsky/test_kandinsky.py index c29a4090187a..fbaaf53743f1 100644 --- a/tests/pipelines/kandinsky/test_kandinsky.py +++ b/tests/pipelines/kandinsky/test_kandinsky.py @@ -249,7 +249,7 @@ def test_kandinsky_text2img(self): pipe_prior = KandinskyPriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16) pipe_prior.to(torch_device) - pipeline = KandinskyPipeline.from_pretrained("YiYiXu/Kandinsky", 
torch_dtype=torch.float16) + pipeline = KandinskyPipeline.from_pretrained("kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16) pipeline = pipeline.to(torch_device) pipeline.set_progress_bar_config(disable=None) diff --git a/tests/pipelines/kandinsky/test_kandinsky_img2img.py b/tests/pipelines/kandinsky/test_kandinsky_img2img.py index 43ec7961f182..8516a991d4f4 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_img2img.py +++ b/tests/pipelines/kandinsky/test_kandinsky_img2img.py @@ -266,7 +266,7 @@ def test_kandinsky_img2img(self): pipe_prior = KandinskyPriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16) pipe_prior.to(torch_device) - pipeline = KandinskyImg2ImgPipeline.from_pretrained("YiYiXu/Kandinsky", torch_dtype=torch.float16) + pipeline = KandinskyImg2ImgPipeline.from_pretrained("kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16) pipeline = pipeline.to(torch_device) pipeline.set_progress_bar_config(disable=None) From f83dddb9bd98a1629a01c9634a78381d8832d3c0 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 24 May 2023 20:00:54 +0000 Subject: [PATCH 171/182] update ckpt for inpaint --- docs/source/en/api/pipelines/kandinsky.mdx | 2 +- src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py | 2 +- tests/pipelines/kandinsky/test_kandinsky_inpaint.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/en/api/pipelines/kandinsky.mdx b/docs/source/en/api/pipelines/kandinsky.mdx index fb315267f24e..bed351ecfbae 100644 --- a/docs/source/en/api/pipelines/kandinsky.mdx +++ b/docs/source/en/api/pipelines/kandinsky.mdx @@ -201,7 +201,7 @@ pipe_prior.to("cuda") prompt = "a hat" image_emb, zero_image_emb = pipe_prior(prompt, return_dict=False) -pipe = KandinskyInpaintPipeline.from_pretrained("YiYiXu/Kandinsky-inpaint", torch_dtype=torch.float16) +pipe = KandinskyInpaintPipeline.from_pretrained("kandinsky-community/kandinsky-2-1-inpaint", torch_dtype=torch.float16) pipe.to("cuda") init_image = load_image( diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py index 0f5840136f39..df0e7bdf1e92 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py @@ -54,7 +54,7 @@ >>> prompt = "a hat" >>> image_emb, zero_image_emb = pipe_prior(prompt, return_dict=False) - >>> pipe = KandinskyInpaintPipeline.from_pretrained("YiYiXu/Kandinsky-inpaint", torch_dtype=torch.float16) + >>> pipe = KandinskyInpaintPipeline.from_pretrained("kandinsky-community/kandinsky-2-1-inpaint", torch_dtype=torch.float16) >>> pipe.to("cuda") >>> init_image = load_image( diff --git a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py index 26083a11f912..cde6fb50d5fe 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py +++ b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py @@ -274,7 +274,7 @@ def test_kandinsky_inpaint(self): pipe_prior = KandinskyPriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16) pipe_prior.to(torch_device) - pipeline = KandinskyInpaintPipeline.from_pretrained("YiYiXu/Kandinsky-inpaint", torch_dtype=torch.float16) + pipeline = KandinskyInpaintPipeline.from_pretrained("kandinsky-community/kandinsky-2-1-inpaint", torch_dtype=torch.float16) pipeline = pipeline.to(torch_device) pipeline.set_progress_bar_config(disable=None) 
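With the hub IDs above moved to the `kandinsky-community` organization, the two-stage flow (prior, then text-to-image) can be sketched end to end as follows. The call signatures mirror the documented examples in the patches above; fp16 on CUDA and the step count are assumptions for illustration.

```python
import torch

from diffusers import KandinskyPipeline, KandinskyPriorPipeline

# Stage 1: diffuse CLIP text embeddings into a CLIP image embedding.
pipe_prior = KandinskyPriorPipeline.from_pretrained(
    "kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16
)
pipe_prior.to("cuda")

prompt = "red cat, 4k photo"
image_emb, zero_image_emb = pipe_prior(prompt, return_dict=False)

# Stage 2: decode the image embedding into pixels with the text-to-image pipeline.
pipe = KandinskyPipeline.from_pretrained("kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16)
pipe.to("cuda")

image = pipe(
    prompt,
    image_embeds=image_emb,
    negative_image_embeds=zero_image_emb,
    height=768,
    width=768,
    num_inference_steps=100,
).images[0]
image.save("red_cat.png")
```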
From 93f48485f43060ed3a5ef50e22e896b7bcde1b2a Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 24 May 2023 20:02:38 +0000 Subject: [PATCH 172/182] style --- docs/source/en/api/pipelines/kandinsky.mdx | 16 ++++++++++++---- .../kandinsky/pipeline_kandinsky_img2img.py | 8 ++++++-- .../kandinsky/pipeline_kandinsky_inpaint.py | 8 ++++++-- .../kandinsky/pipeline_kandinsky_prior.py | 4 +++- tests/pipelines/kandinsky/test_kandinsky.py | 4 +++- .../kandinsky/test_kandinsky_img2img.py | 8 ++++++-- .../kandinsky/test_kandinsky_inpaint.py | 8 ++++++-- 7 files changed, 42 insertions(+), 14 deletions(-) diff --git a/docs/source/en/api/pipelines/kandinsky.mdx b/docs/source/en/api/pipelines/kandinsky.mdx index bed351ecfbae..def5fc65a2ab 100644 --- a/docs/source/en/api/pipelines/kandinsky.mdx +++ b/docs/source/en/api/pipelines/kandinsky.mdx @@ -45,7 +45,9 @@ from diffusers import KandinskyPriorPipeline import torch # create prior -pipe_prior = KandinskyPriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16) +pipe_prior = KandinskyPriorPipeline.from_pretrained( + "kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16 +) pipe_prior.to("cuda") generator = torch.Generator(device="cuda").manual_seed(12) @@ -149,7 +151,9 @@ import torch from diffusers import KandinskyImg2ImgPipeline, KandinskyPriorPipeline # create prior -pipe_prior = KandinskyPriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16) +pipe_prior = KandinskyPriorPipeline.from_pretrained( + "kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16 +) pipe_prior.to("cuda") # create img2img pipeline @@ -195,7 +199,9 @@ from diffusers.utils import load_image import torch import numpy as np -pipe_prior = KandinskyPriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16) +pipe_prior = KandinskyPriorPipeline.from_pretrained( + "kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16 +) pipe_prior.to("cuda") prompt = "a hat" @@ -240,7 +246,9 @@ import PIL import torch from torchvision import transforms -pipe_prior = KandinskyPriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16) +pipe_prior = KandinskyPriorPipeline.from_pretrained( + "kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16 +) pipe_prior.to("cuda") img1 = load_image( diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py index d161f28101ec..470fa606af1a 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py @@ -45,13 +45,17 @@ >>> from diffusers.utils import load_image >>> import torch - >>> pipe_prior = KandinskyPriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16) + >>> pipe_prior = KandinskyPriorPipeline.from_pretrained( + ... "kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16 + ... ) >>> pipe_prior.to("cuda") >>> prompt = "A red cartoon frog, 4k" >>> image_emb, zero_image_emb = pipe_prior(prompt, return_dict=False) - >>> pipe = KandinskyImg2ImgPipeline.from_pretrained("kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16) + >>> pipe = KandinskyImg2ImgPipeline.from_pretrained( + ... "kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16 + ... 
) >>> pipe.to("cuda") >>> init_image = load_image( diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py index df0e7bdf1e92..cc9a35e580b3 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py @@ -48,13 +48,17 @@ >>> import torch >>> import numpy as np - >>> pipe_prior = KandinskyPriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16) + >>> pipe_prior = KandinskyPriorPipeline.from_pretrained( + ... "kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16 + ... ) >>> pipe_prior.to("cuda") >>> prompt = "a hat" >>> image_emb, zero_image_emb = pipe_prior(prompt, return_dict=False) - >>> pipe = KandinskyInpaintPipeline.from_pretrained("kandinsky-community/kandinsky-2-1-inpaint", torch_dtype=torch.float16) + >>> pipe = KandinskyInpaintPipeline.from_pretrained( + ... "kandinsky-community/kandinsky-2-1-inpaint", torch_dtype=torch.float16 + ... ) >>> pipe.to("cuda") >>> init_image = load_image( diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py index 6aa7681c2af5..8c1b2c992f39 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py @@ -74,7 +74,9 @@ >>> import torch >>> from torchvision import transforms - >>> pipe_prior = KandinskyPriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16) + >>> pipe_prior = KandinskyPriorPipeline.from_pretrained( + ... "kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16 + ... 
) >>> pipe_prior.to("cuda") >>> img1 = load_image( diff --git a/tests/pipelines/kandinsky/test_kandinsky.py b/tests/pipelines/kandinsky/test_kandinsky.py index fbaaf53743f1..3854519786dd 100644 --- a/tests/pipelines/kandinsky/test_kandinsky.py +++ b/tests/pipelines/kandinsky/test_kandinsky.py @@ -246,7 +246,9 @@ def test_kandinsky_text2img(self): "/kandinsky/kandinsky_text2img_cat_fp16.npy" ) - pipe_prior = KandinskyPriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16) + pipe_prior = KandinskyPriorPipeline.from_pretrained( + "kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16 + ) pipe_prior.to(torch_device) pipeline = KandinskyPipeline.from_pretrained("kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16) diff --git a/tests/pipelines/kandinsky/test_kandinsky_img2img.py b/tests/pipelines/kandinsky/test_kandinsky_img2img.py index 8516a991d4f4..11b3eb06f814 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_img2img.py +++ b/tests/pipelines/kandinsky/test_kandinsky_img2img.py @@ -263,10 +263,14 @@ def test_kandinsky_img2img(self): ) prompt = "A red cartoon frog, 4k" - pipe_prior = KandinskyPriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16) + pipe_prior = KandinskyPriorPipeline.from_pretrained( + "kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16 + ) pipe_prior.to(torch_device) - pipeline = KandinskyImg2ImgPipeline.from_pretrained("kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16) + pipeline = KandinskyImg2ImgPipeline.from_pretrained( + "kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16 + ) pipeline = pipeline.to(torch_device) pipeline.set_progress_bar_config(disable=None) diff --git a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py index cde6fb50d5fe..975b0dc964ce 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py +++ b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py @@ -271,10 +271,14 @@ def test_kandinsky_inpaint(self): prompt = "a hat" - pipe_prior = KandinskyPriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16) + pipe_prior = KandinskyPriorPipeline.from_pretrained( + "kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16 + ) pipe_prior.to(torch_device) - pipeline = KandinskyInpaintPipeline.from_pretrained("kandinsky-community/kandinsky-2-1-inpaint", torch_dtype=torch.float16) + pipeline = KandinskyInpaintPipeline.from_pretrained( + "kandinsky-community/kandinsky-2-1-inpaint", torch_dtype=torch.float16 + ) pipeline = pipeline.to(torch_device) pipeline.set_progress_bar_config(disable=None) From 8d972bf982b8d377df9e1838ae72f08c3a98c3f8 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 24 May 2023 20:17:00 +0000 Subject: [PATCH 173/182] explain interpolate --- docs/source/en/api/pipelines/kandinsky.mdx | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/source/en/api/pipelines/kandinsky.mdx b/docs/source/en/api/pipelines/kandinsky.mdx index def5fc65a2ab..1cbf9d00efe9 100644 --- a/docs/source/en/api/pipelines/kandinsky.mdx +++ b/docs/source/en/api/pipelines/kandinsky.mdx @@ -238,6 +238,8 @@ image.save("cat_with_hat.png") The [`KandinskyPriorPipeline`] also comes with a cool utility function that will allow you to interpolate the latent space of different images and texts super easily. 
Here is an example of how you can create an Impressionist-style portrait for your pet based on "The Starry Night". +Note that you can interpolate between texts and images - in the example below, we pass a text prompt "a cat" and two images to the `interpolate` function, along with a `weights` variable containing the corresponding weights for each condition we interpolate. + + ```python from diffusers import KandinskyPriorPipeline, KandinskyPipeline from diffusers.utils import load_image @@ -259,7 +261,9 @@ img2 = load_image( "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "/kandinsky/starry_night.jpeg" ) +# add all the conditions we want to interpolate, can be either text or image images_texts = ["a cat", img1, img2] +# specify the weights for each condition in images_texts weights = [0.3, 0.3, 0.4] image_emb, zero_image_emb = pipe_prior.interpolate(images_texts, weights) From 307cef2e5604f489fd1e5edec0a07aa0693c5f1b Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 24 May 2023 20:17:41 +0000 Subject: [PATCH 174/182] style --- docs/source/en/api/pipelines/kandinsky.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/api/pipelines/kandinsky.mdx b/docs/source/en/api/pipelines/kandinsky.mdx index 1cbf9d00efe9..6fc515262469 100644 --- a/docs/source/en/api/pipelines/kandinsky.mdx +++ b/docs/source/en/api/pipelines/kandinsky.mdx @@ -261,7 +261,7 @@ img2 = load_image( "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "/kandinsky/starry_night.jpeg" ) -# add all the conditions we want to interpolate, can be either text or image +# add all the conditions we want to interpolate, can be either text or image images_texts = ["a cat", img1, img2] # specify the weights for each condition in images_texts weights = [0.3, 0.3, 0.4] From 1634ebd0b626442265d62222b6b3ed96be928904 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 24 May 2023 20:41:26 +0000 Subject: [PATCH 175/182] change default num_inference_steps for prior to 25 --- src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py | 4 ++-- tests/pipelines/kandinsky/test_kandinsky.py | 2 +- tests/pipelines/kandinsky/test_kandinsky_img2img.py | 3 ++- tests/pipelines/kandinsky/test_kandinsky_inpaint.py | 1 + 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py index 8c1b2c992f39..d9474b43da54 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py @@ -174,7 +174,7 @@ def interpolate( images_and_prompts: List[Union[str, PIL.Image.Image, torch.FloatTensor]], weights: List[float], num_images_per_prompt: int = 1, - num_inference_steps: int = 5, + num_inference_steps: int = 25, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, latents: Optional[torch.FloatTensor] = None, negative_prior_prompt: Optional[str] = None, @@ -436,7 +436,7 @@ def __call__( self, prompt: Union[str, List[str]], num_images_per_prompt: int = 1, - num_inference_steps: int = 5, + num_inference_steps: int = 25, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, latents: Optional[torch.FloatTensor] = None, negative_prompt: Optional[Union[str, List[str]]] = None, diff --git a/tests/pipelines/kandinsky/test_kandinsky.py b/tests/pipelines/kandinsky/test_kandinsky.py index 3854519786dd..8f7d5ae2019c 100644 ---
a/tests/pipelines/kandinsky/test_kandinsky.py +++ b/tests/pipelines/kandinsky/test_kandinsky.py @@ -263,7 +263,7 @@ def test_kandinsky_text2img(self): generator=generator, num_inference_steps=5, ).images - zero_image_emb = pipe_prior("").images + zero_image_emb = pipe_prior("", num_inference_steps=5).images generator = torch.Generator(device="cuda").manual_seed(0) output = pipeline( diff --git a/tests/pipelines/kandinsky/test_kandinsky_img2img.py b/tests/pipelines/kandinsky/test_kandinsky_img2img.py index 11b3eb06f814..313c47954d0c 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_img2img.py +++ b/tests/pipelines/kandinsky/test_kandinsky_img2img.py @@ -279,8 +279,9 @@ def test_kandinsky_img2img(self): image_emb = pipe_prior( prompt, generator=generator, + num_inference_steps=5, ).images - zero_image_emb = pipe_prior("").images + zero_image_emb = pipe_prior("", num_inference_steps=5).images output = pipeline( prompt, diff --git a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py index 975b0dc964ce..5b0290ed9a65 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py +++ b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py @@ -286,6 +286,7 @@ def test_kandinsky_inpaint(self): image_emb = pipe_prior( prompt, generator=generator, + num_inference_steps=5, ).images zero_image_emb = pipe_prior("").images From 1f5c724d643f09196d9cc81b2dd9fc4dc73b1927 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 24 May 2023 20:59:37 +0000 Subject: [PATCH 176/182] lose test for inpaint --- .../pipelines/kandinsky/test_kandinsky_inpaint.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py index 5b0290ed9a65..6919af848354 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py +++ b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py @@ -25,7 +25,7 @@ from diffusers import DDIMScheduler, KandinskyInpaintPipeline, KandinskyPriorPipeline, UNet2DConditionModel, VQModel from diffusers.pipelines.kandinsky.text_encoder import MCLIPConfig, MultilingualCLIP from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device -from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, skip_mps from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference @@ -247,6 +247,18 @@ def test_kandinsky_inpaint(self): np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}" + # Overriding PipelineTesterMixin::test_inference_batch_single_identical + # because UnCLIP undeterminism requires a looser check. 
+ @skip_mps + def test_inference_batch_single_identical(self): + test_max_difference = torch_device == "cpu" + relax_max_difference = True + + self._test_inference_batch_single_identical( + test_max_difference=test_max_difference, + relax_max_difference=relax_max_difference, + ) + @slow @require_torch_gpu From 4aa4e359cd5e3eb495da731846c3d094d7b9a075 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 24 May 2023 21:44:25 +0000 Subject: [PATCH 177/182] remove a comment --- tests/pipelines/kandinsky/test_kandinsky_img2img.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pipelines/kandinsky/test_kandinsky_img2img.py b/tests/pipelines/kandinsky/test_kandinsky_img2img.py index 313c47954d0c..6958403ae11c 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_img2img.py +++ b/tests/pipelines/kandinsky/test_kandinsky_img2img.py @@ -164,7 +164,7 @@ def get_dummy_components(self): "beta_start": 0.00085, "beta_end": 0.012, "clip_sample": False, - "set_alpha_to_one": False, # not sure what this does, so set to default value for now + "set_alpha_to_one": False, "steps_offset": 0, "prediction_type": "epsilon", "thresholding": False, From 1d18955f38546c972586309eb0c4db92dd670219 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 24 May 2023 22:58:18 +0000 Subject: [PATCH 178/182] udpate --- tests/pipelines/kandinsky/test_kandinsky_inpaint.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py index 6919af848354..db59a54e2c0d 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py +++ b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py @@ -247,17 +247,9 @@ def test_kandinsky_inpaint(self): np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}" - # Overriding PipelineTesterMixin::test_inference_batch_single_identical - # because UnCLIP undeterminism requires a looser check. 
- @skip_mps def test_inference_batch_single_identical(self): - test_max_difference = torch_device == "cpu" - relax_max_difference = True + super().test_inference_batch_single_identical(expected_max_diff=3e-3) - self._test_inference_batch_single_identical( - test_max_difference=test_max_difference, - relax_max_difference=relax_max_difference, - ) @slow From 8d370abeabb57b413ba36edad4724771bae90a90 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 24 May 2023 23:01:12 +0000 Subject: [PATCH 179/182] style --- tests/pipelines/kandinsky/test_kandinsky_inpaint.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py index db59a54e2c0d..1bca753bec18 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py +++ b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py @@ -25,7 +25,7 @@ from diffusers import DDIMScheduler, KandinskyInpaintPipeline, KandinskyPriorPipeline, UNet2DConditionModel, VQModel from diffusers.pipelines.kandinsky.text_encoder import MCLIPConfig, MultilingualCLIP from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device -from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, skip_mps +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference @@ -251,7 +251,6 @@ def test_inference_batch_single_identical(self): super().test_inference_batch_single_identical(expected_max_diff=3e-3) - @slow @require_torch_gpu class KandinskyInpaintPipelineIntegrationTests(unittest.TestCase): From 6d360b1bad3be5c8b3f57cd233f4eaf0bee56e65 Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Wed, 24 May 2023 17:06:20 -1000 Subject: [PATCH 180/182] Update docs/source/en/api/pipelines/kandinsky.mdx Co-authored-by: Sayak Paul --- docs/source/en/api/pipelines/kandinsky.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/api/pipelines/kandinsky.mdx b/docs/source/en/api/pipelines/kandinsky.mdx index 6fc515262469..cb2bc93c0aad 100644 --- a/docs/source/en/api/pipelines/kandinsky.mdx +++ b/docs/source/en/api/pipelines/kandinsky.mdx @@ -15,7 +15,7 @@ Kandinsky 2.1 inherits best practices from [DALL-E 2](https://arxiv.org/abs/2204 It uses [CLIP](https://huggingface.co/docs/transformers/model_doc/clip) for encoding images and text, and a diffusion image prior (mapping) between latent spaces of CLIP modalities. This approach enhances the visual performance of the model and unveils new horizons in blending images and text-guided image manipulation. -The Kandinsky model in diffusers comes from ai-forever and the original codebase can be found [here](https://github.com/ai-forever/Kandinsky-2) +The Kandinsky model in diffusers comes from `ai-forever` and the original codebase can be found [here](https://github.com/ai-forever/Kandinsky-2). 
## Available Pipelines: From 6dbbc9572c410b228b2e2be42c92027b4c25633f Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 25 May 2023 04:27:56 +0000 Subject: [PATCH 181/182] add author names --- docs/source/en/api/pipelines/kandinsky.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/api/pipelines/kandinsky.mdx b/docs/source/en/api/pipelines/kandinsky.mdx index 6fc515262469..b5b4f0f06400 100644 --- a/docs/source/en/api/pipelines/kandinsky.mdx +++ b/docs/source/en/api/pipelines/kandinsky.mdx @@ -15,7 +15,7 @@ Kandinsky 2.1 inherits best practices from [DALL-E 2](https://arxiv.org/abs/2204 It uses [CLIP](https://huggingface.co/docs/transformers/model_doc/clip) for encoding images and text, and a diffusion image prior (mapping) between latent spaces of CLIP modalities. This approach enhances the visual performance of the model and unveils new horizons in blending images and text-guided image manipulation. -The Kandinsky model in diffusers comes from ai-forever and the original codebase can be found [here](https://github.com/ai-forever/Kandinsky-2) +The Kandinsky model is created by [Arseniy Shakhmatov](https://github.com/cene555), [Anton Razzhigaev](https://github.com/razzant), [Aleksandr Nikolich](https://github.com/AlexWortega), [Igor Pavlov](https://github.com/boomb0om), [Andrey Kuznetsov](https://github.com/kuznetsoffandrey) and [Denis Dimitrov](https://github.com/denndimitrov) and the original codebase can be found [here](https://github.com/ai-forever/Kandinsky-2) ## Available Pipelines: From 357e15a7455aff1794cb9ffb09f2cf8e6802d676 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 25 May 2023 05:44:26 +0000 Subject: [PATCH 182/182] remove yiyi files --- scripts/kandinsky/prior_statedict_compare.txt | 634 ------------------ scripts/kandinsky/test_diffusers_unet.py | 37 - scripts/kandinsky/test_text_proj.py | 37 - scripts/kandinsky/unclip_log_notes.txt | 625 ----------------- scripts/kandinsky/yiyi_kandinsky_repo.py | 46 -- scripts/yiyi_run_sd.py | 16 - scripts/yiyi_test_kandinsky_d.py | 40 -- scripts/yiyi_test_sd_unclip.py | 23 - scripts/yiyi_test_unclip.py | 13 - 9 files changed, 1471 deletions(-) delete mode 100644 scripts/kandinsky/prior_statedict_compare.txt delete mode 100644 scripts/kandinsky/test_diffusers_unet.py delete mode 100644 scripts/kandinsky/test_text_proj.py delete mode 100644 scripts/kandinsky/unclip_log_notes.txt delete mode 100644 scripts/kandinsky/yiyi_kandinsky_repo.py delete mode 100644 scripts/yiyi_run_sd.py delete mode 100644 scripts/yiyi_test_kandinsky_d.py delete mode 100644 scripts/yiyi_test_sd_unclip.py delete mode 100644 scripts/yiyi_test_unclip.py diff --git a/scripts/kandinsky/prior_statedict_compare.txt b/scripts/kandinsky/prior_statedict_compare.txt deleted file mode 100644 index 1f4854d248b7..000000000000 --- a/scripts/kandinsky/prior_statedict_compare.txt +++ /dev/null @@ -1,634 +0,0 @@ -# orig -model.time_embed.0.weight:torch.Size([2048, 2048]) -model.time_embed.0.bias:torch.Size([2048]) - -model.clip_img_proj.weight:torch.Size([2048, 768]) -model.clip_img_proj.bias:torch.Size([2048]) - -model.text_emb_proj.weight:torch.Size([2048, 768]) -model.text_emb_proj.bias:torch.Size([2048]) - -model.text_enc_proj.weight:torch.Size([2048, 768]) -model.text_enc_proj.bias:torch.Size([2048]) - -model.positional_embedding:torch.Size([1, 81, 2048]) - -model.prd_emb:torch.Size([1, 1, 2048]) - -model.time_embed.2.weight:torch.Size([2048, 2048]) -model.time_embed.2.bias:torch.Size([2048]) - - - - - - - - - -### matched - 
-model.out_proj.weight:torch.Size([768, 2048]) -model.out_proj.bias:torch.Size([768]) -model.transformer.resblocks.0.attn.c_qkv.weight:torch.Size([6144, 2048]) -model.transformer.resblocks.0.attn.c_qkv.bias:torch.Size([6144]) -model.transformer.resblocks.0.attn.c_proj.weight:torch.Size([2048, 2048]) -model.transformer.resblocks.0.attn.c_proj.bias:torch.Size([2048]) -model.transformer.resblocks.0.ln_1.weight:torch.Size([2048]) -model.transformer.resblocks.0.ln_1.bias:torch.Size([2048]) -model.transformer.resblocks.0.mlp.c_fc.weight:torch.Size([8192, 2048]) -model.transformer.resblocks.0.mlp.c_fc.bias:torch.Size([8192]) -model.transformer.resblocks.0.mlp.c_proj.weight:torch.Size([2048, 8192]) -model.transformer.resblocks.0.mlp.c_proj.bias:torch.Size([2048]) -model.transformer.resblocks.0.ln_2.weight:torch.Size([2048]) -model.transformer.resblocks.0.ln_2.bias:torch.Size([2048]) -model.transformer.resblocks.1.attn.c_qkv.weight:torch.Size([6144, 2048]) -model.transformer.resblocks.1.attn.c_qkv.bias:torch.Size([6144]) -model.transformer.resblocks.1.attn.c_proj.weight:torch.Size([2048, 2048]) -model.transformer.resblocks.1.attn.c_proj.bias:torch.Size([2048]) -model.transformer.resblocks.1.ln_1.weight:torch.Size([2048]) -model.transformer.resblocks.1.ln_1.bias:torch.Size([2048]) -model.transformer.resblocks.1.mlp.c_fc.weight:torch.Size([8192, 2048]) -model.transformer.resblocks.1.mlp.c_fc.bias:torch.Size([8192]) -model.transformer.resblocks.1.mlp.c_proj.weight:torch.Size([2048, 8192]) -model.transformer.resblocks.1.mlp.c_proj.bias:torch.Size([2048]) -model.transformer.resblocks.1.ln_2.weight:torch.Size([2048]) -model.transformer.resblocks.1.ln_2.bias:torch.Size([2048]) -model.transformer.resblocks.2.attn.c_qkv.weight:torch.Size([6144, 2048]) -model.transformer.resblocks.2.attn.c_qkv.bias:torch.Size([6144]) -model.transformer.resblocks.2.attn.c_proj.weight:torch.Size([2048, 2048]) -model.transformer.resblocks.2.attn.c_proj.bias:torch.Size([2048]) -model.transformer.resblocks.2.ln_1.weight:torch.Size([2048]) -model.transformer.resblocks.2.ln_1.bias:torch.Size([2048]) -model.transformer.resblocks.2.mlp.c_fc.weight:torch.Size([8192, 2048]) -model.transformer.resblocks.2.mlp.c_fc.bias:torch.Size([8192]) -model.transformer.resblocks.2.mlp.c_proj.weight:torch.Size([2048, 8192]) -model.transformer.resblocks.2.mlp.c_proj.bias:torch.Size([2048]) -model.transformer.resblocks.2.ln_2.weight:torch.Size([2048]) -model.transformer.resblocks.2.ln_2.bias:torch.Size([2048]) -model.transformer.resblocks.3.attn.c_qkv.weight:torch.Size([6144, 2048]) -model.transformer.resblocks.3.attn.c_qkv.bias:torch.Size([6144]) -model.transformer.resblocks.3.attn.c_proj.weight:torch.Size([2048, 2048]) -model.transformer.resblocks.3.attn.c_proj.bias:torch.Size([2048]) -model.transformer.resblocks.3.ln_1.weight:torch.Size([2048]) -model.transformer.resblocks.3.ln_1.bias:torch.Size([2048]) -model.transformer.resblocks.3.mlp.c_fc.weight:torch.Size([8192, 2048]) -model.transformer.resblocks.3.mlp.c_fc.bias:torch.Size([8192]) -model.transformer.resblocks.3.mlp.c_proj.weight:torch.Size([2048, 8192]) -model.transformer.resblocks.3.mlp.c_proj.bias:torch.Size([2048]) -model.transformer.resblocks.3.ln_2.weight:torch.Size([2048]) -model.transformer.resblocks.3.ln_2.bias:torch.Size([2048]) -model.transformer.resblocks.4.attn.c_qkv.weight:torch.Size([6144, 2048]) -model.transformer.resblocks.4.attn.c_qkv.bias:torch.Size([6144]) -model.transformer.resblocks.4.attn.c_proj.weight:torch.Size([2048, 2048]) 
-model.transformer.resblocks.4.attn.c_proj.bias:torch.Size([2048]) -model.transformer.resblocks.4.ln_1.weight:torch.Size([2048]) -model.transformer.resblocks.4.ln_1.bias:torch.Size([2048]) -model.transformer.resblocks.4.mlp.c_fc.weight:torch.Size([8192, 2048]) -model.transformer.resblocks.4.mlp.c_fc.bias:torch.Size([8192]) -model.transformer.resblocks.4.mlp.c_proj.weight:torch.Size([2048, 8192]) -model.transformer.resblocks.4.mlp.c_proj.bias:torch.Size([2048]) -model.transformer.resblocks.4.ln_2.weight:torch.Size([2048]) -model.transformer.resblocks.4.ln_2.bias:torch.Size([2048]) -model.transformer.resblocks.5.attn.c_qkv.weight:torch.Size([6144, 2048]) -model.transformer.resblocks.5.attn.c_qkv.bias:torch.Size([6144]) -model.transformer.resblocks.5.attn.c_proj.weight:torch.Size([2048, 2048]) -model.transformer.resblocks.5.attn.c_proj.bias:torch.Size([2048]) -model.transformer.resblocks.5.ln_1.weight:torch.Size([2048]) -model.transformer.resblocks.5.ln_1.bias:torch.Size([2048]) -model.transformer.resblocks.5.mlp.c_fc.weight:torch.Size([8192, 2048]) -model.transformer.resblocks.5.mlp.c_fc.bias:torch.Size([8192]) -model.transformer.resblocks.5.mlp.c_proj.weight:torch.Size([2048, 8192]) -model.transformer.resblocks.5.mlp.c_proj.bias:torch.Size([2048]) -model.transformer.resblocks.5.ln_2.weight:torch.Size([2048]) -model.transformer.resblocks.5.ln_2.bias:torch.Size([2048]) -model.transformer.resblocks.6.attn.c_qkv.weight:torch.Size([6144, 2048]) -model.transformer.resblocks.6.attn.c_qkv.bias:torch.Size([6144]) -model.transformer.resblocks.6.attn.c_proj.weight:torch.Size([2048, 2048]) -model.transformer.resblocks.6.attn.c_proj.bias:torch.Size([2048]) -model.transformer.resblocks.6.ln_1.weight:torch.Size([2048]) -model.transformer.resblocks.6.ln_1.bias:torch.Size([2048]) -model.transformer.resblocks.6.mlp.c_fc.weight:torch.Size([8192, 2048]) -model.transformer.resblocks.6.mlp.c_fc.bias:torch.Size([8192]) -model.transformer.resblocks.6.mlp.c_proj.weight:torch.Size([2048, 8192]) -model.transformer.resblocks.6.mlp.c_proj.bias:torch.Size([2048]) -model.transformer.resblocks.6.ln_2.weight:torch.Size([2048]) -model.transformer.resblocks.6.ln_2.bias:torch.Size([2048]) -model.transformer.resblocks.7.attn.c_qkv.weight:torch.Size([6144, 2048]) -model.transformer.resblocks.7.attn.c_qkv.bias:torch.Size([6144]) -model.transformer.resblocks.7.attn.c_proj.weight:torch.Size([2048, 2048]) -model.transformer.resblocks.7.attn.c_proj.bias:torch.Size([2048]) -model.transformer.resblocks.7.ln_1.weight:torch.Size([2048]) -model.transformer.resblocks.7.ln_1.bias:torch.Size([2048]) -model.transformer.resblocks.7.mlp.c_fc.weight:torch.Size([8192, 2048]) -model.transformer.resblocks.7.mlp.c_fc.bias:torch.Size([8192]) -model.transformer.resblocks.7.mlp.c_proj.weight:torch.Size([2048, 8192]) -model.transformer.resblocks.7.mlp.c_proj.bias:torch.Size([2048]) -model.transformer.resblocks.7.ln_2.weight:torch.Size([2048]) -model.transformer.resblocks.7.ln_2.bias:torch.Size([2048]) -model.transformer.resblocks.8.attn.c_qkv.weight:torch.Size([6144, 2048]) -model.transformer.resblocks.8.attn.c_qkv.bias:torch.Size([6144]) -model.transformer.resblocks.8.attn.c_proj.weight:torch.Size([2048, 2048]) -model.transformer.resblocks.8.attn.c_proj.bias:torch.Size([2048]) -model.transformer.resblocks.8.ln_1.weight:torch.Size([2048]) -model.transformer.resblocks.8.ln_1.bias:torch.Size([2048]) -model.transformer.resblocks.8.mlp.c_fc.weight:torch.Size([8192, 2048]) -model.transformer.resblocks.8.mlp.c_fc.bias:torch.Size([8192]) 
-model.transformer.resblocks.8.mlp.c_proj.weight:torch.Size([2048, 8192]) -model.transformer.resblocks.8.mlp.c_proj.bias:torch.Size([2048]) -model.transformer.resblocks.8.ln_2.weight:torch.Size([2048]) -model.transformer.resblocks.8.ln_2.bias:torch.Size([2048]) -model.transformer.resblocks.9.attn.c_qkv.weight:torch.Size([6144, 2048]) -model.transformer.resblocks.9.attn.c_qkv.bias:torch.Size([6144]) -model.transformer.resblocks.9.attn.c_proj.weight:torch.Size([2048, 2048]) -model.transformer.resblocks.9.attn.c_proj.bias:torch.Size([2048]) -model.transformer.resblocks.9.ln_1.weight:torch.Size([2048]) -model.transformer.resblocks.9.ln_1.bias:torch.Size([2048]) -model.transformer.resblocks.9.mlp.c_fc.weight:torch.Size([8192, 2048]) -model.transformer.resblocks.9.mlp.c_fc.bias:torch.Size([8192]) -model.transformer.resblocks.9.mlp.c_proj.weight:torch.Size([2048, 8192]) -model.transformer.resblocks.9.mlp.c_proj.bias:torch.Size([2048]) -model.transformer.resblocks.9.ln_2.weight:torch.Size([2048]) -model.transformer.resblocks.9.ln_2.bias:torch.Size([2048]) -model.transformer.resblocks.10.attn.c_qkv.weight:torch.Size([6144, 2048]) -model.transformer.resblocks.10.attn.c_qkv.bias:torch.Size([6144]) -model.transformer.resblocks.10.attn.c_proj.weight:torch.Size([2048, 2048]) -model.transformer.resblocks.10.attn.c_proj.bias:torch.Size([2048]) -model.transformer.resblocks.10.ln_1.weight:torch.Size([2048]) -model.transformer.resblocks.10.ln_1.bias:torch.Size([2048]) -model.transformer.resblocks.10.mlp.c_fc.weight:torch.Size([8192, 2048]) -model.transformer.resblocks.10.mlp.c_fc.bias:torch.Size([8192]) -model.transformer.resblocks.10.mlp.c_proj.weight:torch.Size([2048, 8192]) -model.transformer.resblocks.10.mlp.c_proj.bias:torch.Size([2048]) -model.transformer.resblocks.10.ln_2.weight:torch.Size([2048]) -model.transformer.resblocks.10.ln_2.bias:torch.Size([2048]) -model.transformer.resblocks.11.attn.c_qkv.weight:torch.Size([6144, 2048]) -model.transformer.resblocks.11.attn.c_qkv.bias:torch.Size([6144]) -model.transformer.resblocks.11.attn.c_proj.weight:torch.Size([2048, 2048]) -model.transformer.resblocks.11.attn.c_proj.bias:torch.Size([2048]) -model.transformer.resblocks.11.ln_1.weight:torch.Size([2048]) -model.transformer.resblocks.11.ln_1.bias:torch.Size([2048]) -model.transformer.resblocks.11.mlp.c_fc.weight:torch.Size([8192, 2048]) -model.transformer.resblocks.11.mlp.c_fc.bias:torch.Size([8192]) -model.transformer.resblocks.11.mlp.c_proj.weight:torch.Size([2048, 8192]) -model.transformer.resblocks.11.mlp.c_proj.bias:torch.Size([2048]) -model.transformer.resblocks.11.ln_2.weight:torch.Size([2048]) -model.transformer.resblocks.11.ln_2.bias:torch.Size([2048]) -model.transformer.resblocks.12.attn.c_qkv.weight:torch.Size([6144, 2048]) -model.transformer.resblocks.12.attn.c_qkv.bias:torch.Size([6144]) -model.transformer.resblocks.12.attn.c_proj.weight:torch.Size([2048, 2048]) -model.transformer.resblocks.12.attn.c_proj.bias:torch.Size([2048]) -model.transformer.resblocks.12.ln_1.weight:torch.Size([2048]) -model.transformer.resblocks.12.ln_1.bias:torch.Size([2048]) -model.transformer.resblocks.12.mlp.c_fc.weight:torch.Size([8192, 2048]) -model.transformer.resblocks.12.mlp.c_fc.bias:torch.Size([8192]) -model.transformer.resblocks.12.mlp.c_proj.weight:torch.Size([2048, 8192]) -model.transformer.resblocks.12.mlp.c_proj.bias:torch.Size([2048]) -model.transformer.resblocks.12.ln_2.weight:torch.Size([2048]) -model.transformer.resblocks.12.ln_2.bias:torch.Size([2048]) 
-model.transformer.resblocks.13.attn.c_qkv.weight:torch.Size([6144, 2048]) -model.transformer.resblocks.13.attn.c_qkv.bias:torch.Size([6144]) -model.transformer.resblocks.13.attn.c_proj.weight:torch.Size([2048, 2048]) -model.transformer.resblocks.13.attn.c_proj.bias:torch.Size([2048]) -model.transformer.resblocks.13.ln_1.weight:torch.Size([2048]) -model.transformer.resblocks.13.ln_1.bias:torch.Size([2048]) -model.transformer.resblocks.13.mlp.c_fc.weight:torch.Size([8192, 2048]) -model.transformer.resblocks.13.mlp.c_fc.bias:torch.Size([8192]) -model.transformer.resblocks.13.mlp.c_proj.weight:torch.Size([2048, 8192]) -model.transformer.resblocks.13.mlp.c_proj.bias:torch.Size([2048]) -model.transformer.resblocks.13.ln_2.weight:torch.Size([2048]) -model.transformer.resblocks.13.ln_2.bias:torch.Size([2048]) -model.transformer.resblocks.14.attn.c_qkv.weight:torch.Size([6144, 2048]) -model.transformer.resblocks.14.attn.c_qkv.bias:torch.Size([6144]) -model.transformer.resblocks.14.attn.c_proj.weight:torch.Size([2048, 2048]) -model.transformer.resblocks.14.attn.c_proj.bias:torch.Size([2048]) -model.transformer.resblocks.14.ln_1.weight:torch.Size([2048]) -model.transformer.resblocks.14.ln_1.bias:torch.Size([2048]) -model.transformer.resblocks.14.mlp.c_fc.weight:torch.Size([8192, 2048]) -model.transformer.resblocks.14.mlp.c_fc.bias:torch.Size([8192]) -model.transformer.resblocks.14.mlp.c_proj.weight:torch.Size([2048, 8192]) -model.transformer.resblocks.14.mlp.c_proj.bias:torch.Size([2048]) -model.transformer.resblocks.14.ln_2.weight:torch.Size([2048]) -model.transformer.resblocks.14.ln_2.bias:torch.Size([2048]) -model.transformer.resblocks.15.attn.c_qkv.weight:torch.Size([6144, 2048]) -model.transformer.resblocks.15.attn.c_qkv.bias:torch.Size([6144]) -model.transformer.resblocks.15.attn.c_proj.weight:torch.Size([2048, 2048]) -model.transformer.resblocks.15.attn.c_proj.bias:torch.Size([2048]) -model.transformer.resblocks.15.ln_1.weight:torch.Size([2048]) -model.transformer.resblocks.15.ln_1.bias:torch.Size([2048]) -model.transformer.resblocks.15.mlp.c_fc.weight:torch.Size([8192, 2048]) -model.transformer.resblocks.15.mlp.c_fc.bias:torch.Size([8192]) -model.transformer.resblocks.15.mlp.c_proj.weight:torch.Size([2048, 8192]) -model.transformer.resblocks.15.mlp.c_proj.bias:torch.Size([2048]) -model.transformer.resblocks.15.ln_2.weight:torch.Size([2048]) -model.transformer.resblocks.15.ln_2.bias:torch.Size([2048]) -model.transformer.resblocks.16.attn.c_qkv.weight:torch.Size([6144, 2048]) -model.transformer.resblocks.16.attn.c_qkv.bias:torch.Size([6144]) -model.transformer.resblocks.16.attn.c_proj.weight:torch.Size([2048, 2048]) -model.transformer.resblocks.16.attn.c_proj.bias:torch.Size([2048]) -model.transformer.resblocks.16.ln_1.weight:torch.Size([2048]) -model.transformer.resblocks.16.ln_1.bias:torch.Size([2048]) -model.transformer.resblocks.16.mlp.c_fc.weight:torch.Size([8192, 2048]) -model.transformer.resblocks.16.mlp.c_fc.bias:torch.Size([8192]) -model.transformer.resblocks.16.mlp.c_proj.weight:torch.Size([2048, 8192]) -model.transformer.resblocks.16.mlp.c_proj.bias:torch.Size([2048]) -model.transformer.resblocks.16.ln_2.weight:torch.Size([2048]) -model.transformer.resblocks.16.ln_2.bias:torch.Size([2048]) -model.transformer.resblocks.17.attn.c_qkv.weight:torch.Size([6144, 2048]) -model.transformer.resblocks.17.attn.c_qkv.bias:torch.Size([6144]) -model.transformer.resblocks.17.attn.c_proj.weight:torch.Size([2048, 2048]) -model.transformer.resblocks.17.attn.c_proj.bias:torch.Size([2048]) 
-model.transformer.resblocks.17.ln_1.weight:torch.Size([2048]) -model.transformer.resblocks.17.ln_1.bias:torch.Size([2048]) -model.transformer.resblocks.17.mlp.c_fc.weight:torch.Size([8192, 2048]) -model.transformer.resblocks.17.mlp.c_fc.bias:torch.Size([8192]) -model.transformer.resblocks.17.mlp.c_proj.weight:torch.Size([2048, 8192]) -model.transformer.resblocks.17.mlp.c_proj.bias:torch.Size([2048]) -model.transformer.resblocks.17.ln_2.weight:torch.Size([2048]) -model.transformer.resblocks.17.ln_2.bias:torch.Size([2048]) -model.transformer.resblocks.18.attn.c_qkv.weight:torch.Size([6144, 2048]) -model.transformer.resblocks.18.attn.c_qkv.bias:torch.Size([6144]) -model.transformer.resblocks.18.attn.c_proj.weight:torch.Size([2048, 2048]) -model.transformer.resblocks.18.attn.c_proj.bias:torch.Size([2048]) -model.transformer.resblocks.18.ln_1.weight:torch.Size([2048]) -model.transformer.resblocks.18.ln_1.bias:torch.Size([2048]) -model.transformer.resblocks.18.mlp.c_fc.weight:torch.Size([8192, 2048]) -model.transformer.resblocks.18.mlp.c_fc.bias:torch.Size([8192]) -model.transformer.resblocks.18.mlp.c_proj.weight:torch.Size([2048, 8192]) -model.transformer.resblocks.18.mlp.c_proj.bias:torch.Size([2048]) -model.transformer.resblocks.18.ln_2.weight:torch.Size([2048]) -model.transformer.resblocks.18.ln_2.bias:torch.Size([2048]) -model.transformer.resblocks.19.attn.c_qkv.weight:torch.Size([6144, 2048]) -model.transformer.resblocks.19.attn.c_qkv.bias:torch.Size([6144]) -model.transformer.resblocks.19.attn.c_proj.weight:torch.Size([2048, 2048]) -model.transformer.resblocks.19.attn.c_proj.bias:torch.Size([2048]) -model.transformer.resblocks.19.ln_1.weight:torch.Size([2048]) -model.transformer.resblocks.19.ln_1.bias:torch.Size([2048]) -model.transformer.resblocks.19.mlp.c_fc.weight:torch.Size([8192, 2048]) -model.transformer.resblocks.19.mlp.c_fc.bias:torch.Size([8192]) -model.transformer.resblocks.19.mlp.c_proj.weight:torch.Size([2048, 8192]) -model.transformer.resblocks.19.mlp.c_proj.bias:torch.Size([2048]) -model.transformer.resblocks.19.ln_2.weight:torch.Size([2048]) -model.transformer.resblocks.19.ln_2.bias:torch.Size([2048]) -model.final_ln.weight:torch.Size([2048]) -model.final_ln.bias:torch.Size([2048]) - - -# diffuser - -time_embedding.linear_1.weight:torch.Size([2048, 2048]) -time_embedding.linear_1.bias:torch.Size([2048]) - -proj_in.weight:torch.Size([2048, 768]) -proj_in.bias:torch.Size([2048]) - -embedding_proj.weight:torch.Size([2048, 768]) -embedding_proj.bias:torch.Size([2048]) - -encoder_hidden_states_proj.weight:torch.Size([2048, 768]) -encoder_hidden_states_proj.bias:torch.Size([2048]) - -positional_embedding:torch.Size([1, 81, 2048]) - -prd_embedding:torch.Size([1, 1, 2048]) - -time_embedding.linear_2.weight:torch.Size([2048, 2048]) -time_embedding.linear_2.bias:torch.Size([2048]) - - - - - - - - -### matched - -clip_mean:torch.Size([1, 768]) -clip_std:torch.Size([1, 768]) - - -transformer_blocks.0.norm1.weight:torch.Size([2048]) -transformer_blocks.0.norm1.bias:torch.Size([2048]) -transformer_blocks.0.attn1.to_q.weight:torch.Size([2048, 2048]) -transformer_blocks.0.attn1.to_q.bias:torch.Size([2048]) -transformer_blocks.0.attn1.to_k.weight:torch.Size([2048, 2048]) -transformer_blocks.0.attn1.to_k.bias:torch.Size([2048]) -transformer_blocks.0.attn1.to_v.weight:torch.Size([2048, 2048]) -transformer_blocks.0.attn1.to_v.bias:torch.Size([2048]) -transformer_blocks.0.attn1.to_out.0.weight:torch.Size([2048, 2048]) -transformer_blocks.0.attn1.to_out.0.bias:torch.Size([2048]) 
-transformer_blocks.0.norm3.weight:torch.Size([2048]) -transformer_blocks.0.norm3.bias:torch.Size([2048]) -transformer_blocks.0.ff.net.0.proj.weight:torch.Size([8192, 2048]) -transformer_blocks.0.ff.net.0.proj.bias:torch.Size([8192]) -transformer_blocks.0.ff.net.2.weight:torch.Size([2048, 8192]) -transformer_blocks.0.ff.net.2.bias:torch.Size([2048]) -transformer_blocks.1.norm1.weight:torch.Size([2048]) -transformer_blocks.1.norm1.bias:torch.Size([2048]) -transformer_blocks.1.attn1.to_q.weight:torch.Size([2048, 2048]) -transformer_blocks.1.attn1.to_q.bias:torch.Size([2048]) -transformer_blocks.1.attn1.to_k.weight:torch.Size([2048, 2048]) -transformer_blocks.1.attn1.to_k.bias:torch.Size([2048]) -transformer_blocks.1.attn1.to_v.weight:torch.Size([2048, 2048]) -transformer_blocks.1.attn1.to_v.bias:torch.Size([2048]) -transformer_blocks.1.attn1.to_out.0.weight:torch.Size([2048, 2048]) -transformer_blocks.1.attn1.to_out.0.bias:torch.Size([2048]) -transformer_blocks.1.norm3.weight:torch.Size([2048]) -transformer_blocks.1.norm3.bias:torch.Size([2048]) -transformer_blocks.1.ff.net.0.proj.weight:torch.Size([8192, 2048]) -transformer_blocks.1.ff.net.0.proj.bias:torch.Size([8192]) -transformer_blocks.1.ff.net.2.weight:torch.Size([2048, 8192]) -transformer_blocks.1.ff.net.2.bias:torch.Size([2048]) -transformer_blocks.2.norm1.weight:torch.Size([2048]) -transformer_blocks.2.norm1.bias:torch.Size([2048]) -transformer_blocks.2.attn1.to_q.weight:torch.Size([2048, 2048]) -transformer_blocks.2.attn1.to_q.bias:torch.Size([2048]) -transformer_blocks.2.attn1.to_k.weight:torch.Size([2048, 2048]) -transformer_blocks.2.attn1.to_k.bias:torch.Size([2048]) -transformer_blocks.2.attn1.to_v.weight:torch.Size([2048, 2048]) -transformer_blocks.2.attn1.to_v.bias:torch.Size([2048]) -transformer_blocks.2.attn1.to_out.0.weight:torch.Size([2048, 2048]) -transformer_blocks.2.attn1.to_out.0.bias:torch.Size([2048]) -transformer_blocks.2.norm3.weight:torch.Size([2048]) -transformer_blocks.2.norm3.bias:torch.Size([2048]) -transformer_blocks.2.ff.net.0.proj.weight:torch.Size([8192, 2048]) -transformer_blocks.2.ff.net.0.proj.bias:torch.Size([8192]) -transformer_blocks.2.ff.net.2.weight:torch.Size([2048, 8192]) -transformer_blocks.2.ff.net.2.bias:torch.Size([2048]) -transformer_blocks.3.norm1.weight:torch.Size([2048]) -transformer_blocks.3.norm1.bias:torch.Size([2048]) -transformer_blocks.3.attn1.to_q.weight:torch.Size([2048, 2048]) -transformer_blocks.3.attn1.to_q.bias:torch.Size([2048]) -transformer_blocks.3.attn1.to_k.weight:torch.Size([2048, 2048]) -transformer_blocks.3.attn1.to_k.bias:torch.Size([2048]) -transformer_blocks.3.attn1.to_v.weight:torch.Size([2048, 2048]) -transformer_blocks.3.attn1.to_v.bias:torch.Size([2048]) -transformer_blocks.3.attn1.to_out.0.weight:torch.Size([2048, 2048]) -transformer_blocks.3.attn1.to_out.0.bias:torch.Size([2048]) -transformer_blocks.3.norm3.weight:torch.Size([2048]) -transformer_blocks.3.norm3.bias:torch.Size([2048]) -transformer_blocks.3.ff.net.0.proj.weight:torch.Size([8192, 2048]) -transformer_blocks.3.ff.net.0.proj.bias:torch.Size([8192]) -transformer_blocks.3.ff.net.2.weight:torch.Size([2048, 8192]) -transformer_blocks.3.ff.net.2.bias:torch.Size([2048]) -transformer_blocks.4.norm1.weight:torch.Size([2048]) -transformer_blocks.4.norm1.bias:torch.Size([2048]) -transformer_blocks.4.attn1.to_q.weight:torch.Size([2048, 2048]) -transformer_blocks.4.attn1.to_q.bias:torch.Size([2048]) -transformer_blocks.4.attn1.to_k.weight:torch.Size([2048, 2048]) 
-transformer_blocks.4.attn1.to_k.bias:torch.Size([2048]) -transformer_blocks.4.attn1.to_v.weight:torch.Size([2048, 2048]) -transformer_blocks.4.attn1.to_v.bias:torch.Size([2048]) -transformer_blocks.4.attn1.to_out.0.weight:torch.Size([2048, 2048]) -transformer_blocks.4.attn1.to_out.0.bias:torch.Size([2048]) -transformer_blocks.4.norm3.weight:torch.Size([2048]) -transformer_blocks.4.norm3.bias:torch.Size([2048]) -transformer_blocks.4.ff.net.0.proj.weight:torch.Size([8192, 2048]) -transformer_blocks.4.ff.net.0.proj.bias:torch.Size([8192]) -transformer_blocks.4.ff.net.2.weight:torch.Size([2048, 8192]) -transformer_blocks.4.ff.net.2.bias:torch.Size([2048]) -transformer_blocks.5.norm1.weight:torch.Size([2048]) -transformer_blocks.5.norm1.bias:torch.Size([2048]) -transformer_blocks.5.attn1.to_q.weight:torch.Size([2048, 2048]) -transformer_blocks.5.attn1.to_q.bias:torch.Size([2048]) -transformer_blocks.5.attn1.to_k.weight:torch.Size([2048, 2048]) -transformer_blocks.5.attn1.to_k.bias:torch.Size([2048]) -transformer_blocks.5.attn1.to_v.weight:torch.Size([2048, 2048]) -transformer_blocks.5.attn1.to_v.bias:torch.Size([2048]) -transformer_blocks.5.attn1.to_out.0.weight:torch.Size([2048, 2048]) -transformer_blocks.5.attn1.to_out.0.bias:torch.Size([2048]) -transformer_blocks.5.norm3.weight:torch.Size([2048]) -transformer_blocks.5.norm3.bias:torch.Size([2048]) -transformer_blocks.5.ff.net.0.proj.weight:torch.Size([8192, 2048]) -transformer_blocks.5.ff.net.0.proj.bias:torch.Size([8192]) -transformer_blocks.5.ff.net.2.weight:torch.Size([2048, 8192]) -transformer_blocks.5.ff.net.2.bias:torch.Size([2048]) -transformer_blocks.6.norm1.weight:torch.Size([2048]) -transformer_blocks.6.norm1.bias:torch.Size([2048]) -transformer_blocks.6.attn1.to_q.weight:torch.Size([2048, 2048]) -transformer_blocks.6.attn1.to_q.bias:torch.Size([2048]) -transformer_blocks.6.attn1.to_k.weight:torch.Size([2048, 2048]) -transformer_blocks.6.attn1.to_k.bias:torch.Size([2048]) -transformer_blocks.6.attn1.to_v.weight:torch.Size([2048, 2048]) -transformer_blocks.6.attn1.to_v.bias:torch.Size([2048]) -transformer_blocks.6.attn1.to_out.0.weight:torch.Size([2048, 2048]) -transformer_blocks.6.attn1.to_out.0.bias:torch.Size([2048]) -transformer_blocks.6.norm3.weight:torch.Size([2048]) -transformer_blocks.6.norm3.bias:torch.Size([2048]) -transformer_blocks.6.ff.net.0.proj.weight:torch.Size([8192, 2048]) -transformer_blocks.6.ff.net.0.proj.bias:torch.Size([8192]) -transformer_blocks.6.ff.net.2.weight:torch.Size([2048, 8192]) -transformer_blocks.6.ff.net.2.bias:torch.Size([2048]) -transformer_blocks.7.norm1.weight:torch.Size([2048]) -transformer_blocks.7.norm1.bias:torch.Size([2048]) -transformer_blocks.7.attn1.to_q.weight:torch.Size([2048, 2048]) -transformer_blocks.7.attn1.to_q.bias:torch.Size([2048]) -transformer_blocks.7.attn1.to_k.weight:torch.Size([2048, 2048]) -transformer_blocks.7.attn1.to_k.bias:torch.Size([2048]) -transformer_blocks.7.attn1.to_v.weight:torch.Size([2048, 2048]) -transformer_blocks.7.attn1.to_v.bias:torch.Size([2048]) -transformer_blocks.7.attn1.to_out.0.weight:torch.Size([2048, 2048]) -transformer_blocks.7.attn1.to_out.0.bias:torch.Size([2048]) -transformer_blocks.7.norm3.weight:torch.Size([2048]) -transformer_blocks.7.norm3.bias:torch.Size([2048]) -transformer_blocks.7.ff.net.0.proj.weight:torch.Size([8192, 2048]) -transformer_blocks.7.ff.net.0.proj.bias:torch.Size([8192]) -transformer_blocks.7.ff.net.2.weight:torch.Size([2048, 8192]) -transformer_blocks.7.ff.net.2.bias:torch.Size([2048]) 
-transformer_blocks.8.norm1.weight:torch.Size([2048]) -transformer_blocks.8.norm1.bias:torch.Size([2048]) -transformer_blocks.8.attn1.to_q.weight:torch.Size([2048, 2048]) -transformer_blocks.8.attn1.to_q.bias:torch.Size([2048]) -transformer_blocks.8.attn1.to_k.weight:torch.Size([2048, 2048]) -transformer_blocks.8.attn1.to_k.bias:torch.Size([2048]) -transformer_blocks.8.attn1.to_v.weight:torch.Size([2048, 2048]) -transformer_blocks.8.attn1.to_v.bias:torch.Size([2048]) -transformer_blocks.8.attn1.to_out.0.weight:torch.Size([2048, 2048]) -transformer_blocks.8.attn1.to_out.0.bias:torch.Size([2048]) -transformer_blocks.8.norm3.weight:torch.Size([2048]) -transformer_blocks.8.norm3.bias:torch.Size([2048]) -transformer_blocks.8.ff.net.0.proj.weight:torch.Size([8192, 2048]) -transformer_blocks.8.ff.net.0.proj.bias:torch.Size([8192]) -transformer_blocks.8.ff.net.2.weight:torch.Size([2048, 8192]) -transformer_blocks.8.ff.net.2.bias:torch.Size([2048]) -transformer_blocks.9.norm1.weight:torch.Size([2048]) -transformer_blocks.9.norm1.bias:torch.Size([2048]) -transformer_blocks.9.attn1.to_q.weight:torch.Size([2048, 2048]) -transformer_blocks.9.attn1.to_q.bias:torch.Size([2048]) -transformer_blocks.9.attn1.to_k.weight:torch.Size([2048, 2048]) -transformer_blocks.9.attn1.to_k.bias:torch.Size([2048]) -transformer_blocks.9.attn1.to_v.weight:torch.Size([2048, 2048]) -transformer_blocks.9.attn1.to_v.bias:torch.Size([2048]) -transformer_blocks.9.attn1.to_out.0.weight:torch.Size([2048, 2048]) -transformer_blocks.9.attn1.to_out.0.bias:torch.Size([2048]) -transformer_blocks.9.norm3.weight:torch.Size([2048]) -transformer_blocks.9.norm3.bias:torch.Size([2048]) -transformer_blocks.9.ff.net.0.proj.weight:torch.Size([8192, 2048]) -transformer_blocks.9.ff.net.0.proj.bias:torch.Size([8192]) -transformer_blocks.9.ff.net.2.weight:torch.Size([2048, 8192]) -transformer_blocks.9.ff.net.2.bias:torch.Size([2048]) -transformer_blocks.10.norm1.weight:torch.Size([2048]) -transformer_blocks.10.norm1.bias:torch.Size([2048]) -transformer_blocks.10.attn1.to_q.weight:torch.Size([2048, 2048]) -transformer_blocks.10.attn1.to_q.bias:torch.Size([2048]) -transformer_blocks.10.attn1.to_k.weight:torch.Size([2048, 2048]) -transformer_blocks.10.attn1.to_k.bias:torch.Size([2048]) -transformer_blocks.10.attn1.to_v.weight:torch.Size([2048, 2048]) -transformer_blocks.10.attn1.to_v.bias:torch.Size([2048]) -transformer_blocks.10.attn1.to_out.0.weight:torch.Size([2048, 2048]) -transformer_blocks.10.attn1.to_out.0.bias:torch.Size([2048]) -transformer_blocks.10.norm3.weight:torch.Size([2048]) -transformer_blocks.10.norm3.bias:torch.Size([2048]) -transformer_blocks.10.ff.net.0.proj.weight:torch.Size([8192, 2048]) -transformer_blocks.10.ff.net.0.proj.bias:torch.Size([8192]) -transformer_blocks.10.ff.net.2.weight:torch.Size([2048, 8192]) -transformer_blocks.10.ff.net.2.bias:torch.Size([2048]) -transformer_blocks.11.norm1.weight:torch.Size([2048]) -transformer_blocks.11.norm1.bias:torch.Size([2048]) -transformer_blocks.11.attn1.to_q.weight:torch.Size([2048, 2048]) -transformer_blocks.11.attn1.to_q.bias:torch.Size([2048]) -transformer_blocks.11.attn1.to_k.weight:torch.Size([2048, 2048]) -transformer_blocks.11.attn1.to_k.bias:torch.Size([2048]) -transformer_blocks.11.attn1.to_v.weight:torch.Size([2048, 2048]) -transformer_blocks.11.attn1.to_v.bias:torch.Size([2048]) -transformer_blocks.11.attn1.to_out.0.weight:torch.Size([2048, 2048]) -transformer_blocks.11.attn1.to_out.0.bias:torch.Size([2048]) -transformer_blocks.11.norm3.weight:torch.Size([2048]) 
-transformer_blocks.11.norm3.bias:torch.Size([2048])
-transformer_blocks.11.ff.net.0.proj.weight:torch.Size([8192, 2048])
-transformer_blocks.11.ff.net.0.proj.bias:torch.Size([8192])
-transformer_blocks.11.ff.net.2.weight:torch.Size([2048, 8192])
-transformer_blocks.11.ff.net.2.bias:torch.Size([2048])
[transformer_blocks.12 through transformer_blocks.19 repeat the same per-block shapes: norm1/norm3 weight and bias torch.Size([2048]); attn1 to_q/to_k/to_v/to_out.0 weight torch.Size([2048, 2048]) and bias torch.Size([2048]); ff.net.0.proj torch.Size([8192, 2048]) / torch.Size([8192]); ff.net.2 torch.Size([2048, 8192]) / torch.Size([2048])]
-norm_out.weight:torch.Size([2048])
-norm_out.bias:torch.Size([2048])
-proj_to_clip_embeddings.weight:torch.Size([768, 2048])
-proj_to_clip_embeddings.bias:torch.Size([768])
\ No newline at end of file
diff --git a/scripts/kandinsky/test_diffusers_unet.py b/scripts/kandinsky/test_diffusers_unet.py
deleted file mode 100644
index b9c62b95284a..000000000000
--- a/scripts/kandinsky/test_diffusers_unet.py
+++ /dev/null
@@ -1,37 +0,0 @@
-import torch
-
-from diffusers.pipelines.kandinsky.text_proj import KandinskyTextProjModel
-
-
-model_dtype = torch.float32
-# repo = "/home/yiyi_huggingface_co/test-kandinsky"
-# unet = UNet2DConditionModel.from_pretrained(repo, subfolder='unet')
-
-text_proj = KandinskyTextProjModel(
-    clip_extra_context_tokens=10,  # num_image_embs = 10
-    clip_text_encoder_hidden_states_dim=1024,  # text_encoder_in_dim1
-    clip_embeddings_dim=768,  # text_encoder_in_dim2
-    time_embed_dim=1536,  # model_channels * 4
-    cross_attention_dim=768,  # model_dim
-).to("cuda")
-
-print("text proj checkpoint:")
-for k, w in text_proj.state_dict().items():
-    print(f"{k}:{w.shape}")
-
-x = torch.randn(2, 4, 96, 96, device="cuda")
-timesteps = torch.tensor([979.0, 979], device="cuda")
-full_emb = torch.randn(2, 77, 1024, device="cuda").to(model_dtype)
-pooled_emb = torch.randn(2, 768, device="cuda").to(model_dtype)
-image_emb = torch.randn(2, 768, device="cuda").to(model_dtype)
-
-text_encoder_hidden_states, additive_clip_time_embeddings = text_proj(
-    image_embeddings=image_emb,
-    prompt_embeds=image_emb,
-    text_encoder_hidden_states=full_emb,
-)
-
-print(f"text_encoder_hidden_states:{text_encoder_hidden_states.shape},{text_encoder_hidden_states.mean()}")
-# 2, 87, 768
-print(f"additive_clip_time_embeddings:{additive_clip_time_embeddings.shape},{additive_clip_time_embeddings.mean()}")
-# 2, 1536
diff --git a/scripts/kandinsky/test_text_proj.py b/scripts/kandinsky/test_text_proj.py
deleted file mode 100644
index b9c62b95284a..000000000000
--- a/scripts/kandinsky/test_text_proj.py
+++ /dev/null
@@ -1,37 +0,0 @@
[deleted content is identical to scripts/kandinsky/test_diffusers_unet.py above (same blob index b9c62b95284a); not repeated here]
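For reference (not part of this patch), a shape listing like the checkpoint dump above can be reproduced against diffusers' public PriorTransformer class. A minimal sketch, assuming the default unCLIP prior configuration (32 heads x 64 head dim, 20 blocks, 768-dim CLIP embeddings) matches the 2048/8192/768 shapes in the dump:

    from diffusers.models import PriorTransformer

    # Default config: inner dim 32 * 64 = 2048, 20 transformer blocks,
    # CLIP embedding dim 768, which is what the dump above lists.
    prior = PriorTransformer()

    for name, weight in prior.state_dict().items():
        print(f"{name}:{weight.shape}")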
diff --git a/scripts/kandinsky/unclip_log_notes.txt b/scripts/kandinsky/unclip_log_notes.txt
deleted file mode 100644
index 6287f17d259e..000000000000
--- a/scripts/kandinsky/unclip_log_notes.txt
+++ /dev/null
@@ -1,625 +0,0 @@
-
-decoder diffusion
-1. create initial latents (x_t) [B, C, H, W]
-    decoder_latents [1, 3, 64, 64]
[note items 2-3 were lost to truncation; from the per-step trace they cover duplicating the latents for classifier-free guidance (latent_model_input [2, 3, 64, 64]) and predicting noise_pred [2, 6, 64, 64]]
-    -> uncond [1, 6, 64, 64], cond [1, 6, 64, 64]
-    uncond: uncond_noise [1, 3, 64, 64], uncond_variance (not used)
-    cond: cond_noise [1, 3, 64, 64], cond_variance [1, 3, 64, 64]
-    cfg(uncond_noise, cond_noise) -> noise_pred [1, 3, 64, 64] + cond_variance [1, 3, 64, 64] -> noise_pred [1, 6, 64, 64]
-
-4. calculate previous noisy sample: x_t -> x_t-1:
-    inputs for scheduler.step:
-    -noise_pred (guided model_output) [1, 6, 64, 64]
-    -t/timestep: 999
-    -latent/sample/previous denoised (x_t): [1, 3, 64, 64]
-
-    noise_pred (guided model_output) [1, 6, 64, 64]
-    -> model_output (predicted noise): [1, 3, 64, 64]
-    -> predicted_variance: [1, 3, 64, 64]
-    pred_noise -> pred_x0: [1, 3, 64, 64]
-    pred_prev_sample (x_t-1): [1, 3, 64, 64]
-
-    generate random variance_noise [1, 3, 64, 64]
-    predicted_variance [1, 3, 64, 64] -> variance [1, 3, 64, 64]
-    final x_t-1 [1, 3, 64, 64]
-
[per-step trace for the 25 decoder steps omitted: every iteration logs exactly the shapes above, with only the timestep changing (999, 957, 916, 874, ..., 125, 83, 42, 0; prev_timestep 916, 874, ..., 42, 0, None), interleaved with tqdm progress output]
[super-resolution trace: 7 steps over [1, 3, 256, 256] latents at timesteps 832, 666, 500, 333, 166, 0 (the first step's timestep is lost to truncation); each step logs:]
-inside step:
-inputs
- -noise_pred/model_output: torch.Size([1, 3, 256, 256])
- -t/timestep: (current timestep)
- -latent/sample/previous denoised: torch.Size([1, 3, 256, 256])
- pred_noise -> pred_original_sample: torch.Size([1, 3, 256, 256])
-pred_prev_sample: torch.Size([1, 3, 256, 256])
-add noise
-generate variance_noise: torch.Size([1, 3, 256, 256])
- -> _get_variance -> variance torch.Size([])
-step return pred_prev_sample: torch.Size([1, 3, 256, 256])
[on the final step (timestep 0) no variance noise is generated; the scheduler returns pred_prev_sample directly]
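The deleted notes above describe how the unCLIP-style decoder combines classifier-free guidance with the learned-variance channels before calling scheduler.step. For reference (not part of the patch), a minimal self-contained sketch of that recombination; the shapes come from the log, while the helper name and the guidance scale of 4.0 are illustrative:

    import torch

    def guided_model_output(noise_pred: torch.Tensor, guidance_scale: float) -> torch.Tensor:
        # noise_pred: [2, 6, H, W]; the batch dim stacks (unconditional, conditional)
        # predictions, the channel dim stacks 3 noise and 3 learned-variance channels.
        noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2, dim=0)
        uncond_noise, _ = noise_pred_uncond.split(3, dim=1)          # unconditional variance is unused
        cond_noise, cond_variance = noise_pred_cond.split(3, dim=1)
        # Classifier-free guidance is applied to the noise channels only.
        guided_noise = uncond_noise + guidance_scale * (cond_noise - uncond_noise)
        # Re-attach the conditional variance so the scheduler step can split it off again.
        return torch.cat([guided_noise, cond_variance], dim=1)       # [1, 6, H, W]

    out = guided_model_output(torch.randn(2, 6, 64, 64), guidance_scale=4.0)
    print(out.shape)  # torch.Size([1, 6, 64, 64])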
diff --git a/scripts/kandinsky/yiyi_kandinsky_repo.py b/scripts/kandinsky/yiyi_kandinsky_repo.py
deleted file mode 100644
index 5bbc738b14b2..000000000000
--- a/scripts/kandinsky/yiyi_kandinsky_repo.py
+++ /dev/null
@@ -1,46 +0,0 @@
-# from transformers import PretrainedConfig
-
-# from diffusers.pipelines.kandinsky import MultilingualCLIP
-
-# out = "/home/yiyi_huggingface_co/model_repo/Kandinsky"
-# out_prior = "/home/yiyi_huggingface_co/model_repo/Kandinsky-prior"
-# # # prior_tokenizer/tokenizer2
-# # tokenizer2 = CLIPTokenizer.from_pretrained("kakaobrain/karlo-v1-alpha", subfolder="tokenizer")
-# # tokenizer2.save_pretrained(f"{out}/prior_tokenizer")
-# # print(f"tokenizer saved at {out}/prior_tokenizer")
-# # # prior_text_encoder/text_encoder
-# # prior_text_encoder = CLIPTextModelWithProjection.from_pretrained("openai/clip-vit-large-patch14")
-# # prior_text_encoder.save_pretrained(f"{out}/prior_text_encoder")
-# # print(f"text_encoder saved at {out}/text_encoder")
-# # image_encoder
-# clip_image_encoder = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-large-patch14")
-# clip_image_encoder.save_pretrained(f"{out_prior}/image_encoder")
-# # text_encoder & tokenizer
-# model_name = "M-CLIP/XLM-Roberta-Large-Vit-L-14"
-# mclip_text_encoder = MultilingualCLIP.from_pretrained(model_name)
-# mclip_tokenizer = AutoTokenizer.from_pretrained(model_name)
-# mclip_text_encoder.save_pretrained(f"{out}/text_encoder")
-# mclip_tokenizer.save_pretrained(f"{out}/tokenizer")
-
-# tiny-random-mclip-base
-# from transformers import XLMRobertaConfig, XLMRobertaForMaskedLM
-# config = XLMRobertaConfig(
-#     bos_token_id=0,
-#     eos_token_id=2,
-#     hidden_size=32,
-#     intermediate_size=37,
-#     layer_norm_eps=1e-05,
-#     num_attention_heads=4,
-#     num_hidden_layers=5,
-#     pad_token_id=1,
-#     vocab_size=1000)
-# base_model = XLMRobertaForMaskedLM(config)
-# base_model.save_pretrained("/home/yiyi_huggingface_co/model_repo/tiny-random-mclip-base")
-
-# tiny-random-mclip
-# from diffusers.pipelines.kandinsky.text_encoder import MultilingualCLIP
-
-
-# config = PretrainedConfig(modelBase="YiYiXu/tiny-random-mclip-base", numDims=100, transformerDimensions=32)
-
-# mclip_testing = MultilingualCLIP(config)
-# print(mclip_testing)
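The deleted script above assembles local model repos by saving CLIP and Multilingual-CLIP components into subfolders. For reference (not part of the patch), the CLIP pieces could be loaded back with standard transformers calls; the local paths are hypothetical and mirror the subfolder names used in the script:

    from transformers import CLIPTokenizer, CLIPTextModelWithProjection, CLIPVisionModelWithProjection

    repo = "/path/to/model_repo/Kandinsky"              # hypothetical local path
    repo_prior = "/path/to/model_repo/Kandinsky-prior"  # hypothetical local path

    prior_tokenizer = CLIPTokenizer.from_pretrained(repo, subfolder="prior_tokenizer")
    prior_text_encoder = CLIPTextModelWithProjection.from_pretrained(repo, subfolder="prior_text_encoder")
    image_encoder = CLIPVisionModelWithProjection.from_pretrained(repo_prior, subfolder="image_encoder")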
diff --git a/scripts/yiyi_run_sd.py b/scripts/yiyi_run_sd.py
deleted file mode 100644
index 2dd2df394d6b..000000000000
--- a/scripts/yiyi_run_sd.py
+++ /dev/null
@@ -1,16 +0,0 @@
-import torch
-
-from diffusers import StableDiffusionPipeline
-
-
-model_id = "CompVis/stable-diffusion-v1-4"
-device = "cuda"
-
-
-pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16)
-pipe = pipe.to(device)
-
-prompt = "a photo of an astronaut riding a horse on mars"
-image = pipe(prompt).images[0]
-
-image.save("astronaut_rides_horse.png")
diff --git a/scripts/yiyi_test_kandinsky_d.py b/scripts/yiyi_test_kandinsky_d.py
deleted file mode 100644
index f4eb25680d04..000000000000
--- a/scripts/yiyi_test_kandinsky_d.py
+++ /dev/null
@@ -1,40 +0,0 @@
-import torch
-
-from diffusers import KandinskyPipeline
-
-
-prompt = "red cat, 4k photo"
-device = "cuda"
-# batch_size=1
-# guidance_scale=4
-# prior_cf_scale=4,
-# prior_steps="5"
-
-pipe_prior = KandinskyPipeline.from_pretrained("/home/yiyi_huggingface_co/test-kandinsky")
-pipe_prior.to(device)
-
-# step1. testing prior
-# set_seed(0)
-# hidden_states = torch.randn(2, 768, device=device)
-# print(f"hidden_states:{hidden_states.sum()}")
-# timestep = torch.tensor([4, 4], device=device)
-# out = pipe_prior.prior(
-#     hidden_states,
-#     timestep,
-#     proj_embedding,
-#     encoder_hidden_states,
-#     attention_mask
-# )
-# print(out)
-# print(f"predicted_image_embedding: {out['predicted_image_embedding'].shape}, {out['predicted_image_embedding'].sum()}")
-
-generator = torch.Generator(device="cuda").manual_seed(0)
-out = pipe_prior(
-    prompt,
-    generator=generator,
-)
-
-print(f"image_embeddings:{out.shape},{out.sum()}")
-
-print(out)
diff --git a/scripts/yiyi_test_sd_unclip.py b/scripts/yiyi_test_sd_unclip.py
deleted file mode 100644
index 2c2985481920..000000000000
--- a/scripts/yiyi_test_sd_unclip.py
+++ /dev/null
@@ -1,23 +0,0 @@
-from io import BytesIO
-
-import requests
-import torch
-from PIL import Image
-
-from diffusers import StableUnCLIPImg2ImgPipeline
-
-
-pipe = StableUnCLIPImg2ImgPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-2-1-unclip",
-    torch_dtype=torch.float16,
-    variant="fp16",
-)
-pipe = pipe.to("cuda")
-
-url = "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/stable_unclip/tarsila_do_amaral.png"
-
-response = requests.get(url)
-init_image = Image.open(BytesIO(response.content)).convert("RGB")
-
-images = pipe(init_image).images
-images[0].save("fantasy_landscape.png")
diff --git a/scripts/yiyi_test_unclip.py b/scripts/yiyi_test_unclip.py
deleted file mode 100644
index faea5be0517a..000000000000
--- a/scripts/yiyi_test_unclip.py
+++ /dev/null
@@ -1,13 +0,0 @@
-import torch
-
-from diffusers import UnCLIPPipeline
-
-
-pipe = UnCLIPPipeline.from_pretrained("kakaobrain/karlo-v1-alpha", torch_dtype=torch.float16)
-pipe = pipe.to("cuda")
-
-prompt = "a high-resolution photograph of a big red frog on a green leaf."
-
-image = pipe([prompt]).images[0]
-
-image.save("./frog.png")
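The deleted smoke-test scripts above run the prior and reference pipelines with fixed seeds. For reference (not part of the patch), a seeded determinism check in the same spirit, assuming, as in yiyi_test_kandinsky_d.py, that the prior pipeline at this stage returns the image-embedding tensor directly; the checkpoint path is hypothetical:

    import torch

    from diffusers import KandinskyPipeline

    pipe_prior = KandinskyPipeline.from_pretrained("/path/to/test-kandinsky")  # hypothetical local checkpoint
    pipe_prior.to("cuda")

    prompt = "red cat, 4k photo"
    emb_a = pipe_prior(prompt, generator=torch.Generator(device="cuda").manual_seed(0))
    emb_b = pipe_prior(prompt, generator=torch.Generator(device="cuda").manual_seed(0))

    # With the same seed, the prior's image embeddings should be reproducible.
    print(torch.allclose(emb_a, emb_b))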