From 770feb7c7313f8eda8ecfd69d136eb68d588184f Mon Sep 17 00:00:00 2001 From: bghira Date: Sun, 9 Jul 2023 15:20:28 -0700 Subject: [PATCH 01/16] diffusers#4003 - initial implementation of max_inference_steps --- .../stable_diffusion_xl/pipeline_stable_diffusion_xl.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py index 265e325a45e0..4f0854c3c142 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py @@ -545,6 +545,7 @@ def __call__( height: Optional[int] = None, width: Optional[int] = None, num_inference_steps: int = 50, + max_inference_steps: Optional[int] = None, guidance_scale: float = 5.0, negative_prompt: Optional[Union[str, List[str]]] = None, num_images_per_prompt: Optional[int] = 1, @@ -579,6 +580,9 @@ def __call__( num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. + max_inference_steps (`int`, *optional*): + Instead of completing the backwards pass entirely, stop and return the output after this many steps. + Can be useful with `output_type="latent"` and an img2img pipeline, possibly with better fine detail. guidance_scale (`float`, *optional*, defaults to 7.5): Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). `guidance_scale` is defined as `w` of equation 2. 
of [Imagen @@ -782,6 +786,9 @@ def __call__( progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) + if max_inference_steps is not None and i >= max_inference_steps: + logger.debug(f'Breaking inference loop at step {i} as we have reached max_inference_steps={max_inference_steps}') + break # make sure the VAE is in float32 mode, as it overflows in float16 self.vae.to(dtype=torch.float32) From 2f142f669f3bb312190f382ae3fef0362fcbf588 Mon Sep 17 00:00:00 2001 From: bghira Date: Sun, 9 Jul 2023 17:29:36 -0700 Subject: [PATCH 02/16] diffusers#4003 - initial implementation of max_inference_steps and first_inference_step for img2img --- .../pipeline_stable_diffusion_xl_img2img.py | 37 +++++++++++++++++-- 1 file changed, 34 insertions(+), 3 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py index 1618d9d04a74..52c5fa832139 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py @@ -455,11 +455,27 @@ def prepare_extra_step_kwargs(self, generator, eta): return extra_step_kwargs def check_inputs( - self, prompt, strength, callback_steps, negative_prompt=None, prompt_embeds=None, negative_prompt_embeds=None + self, prompt, strength, num_inference_steps, first_inference_step, callback_steps, negative_prompt=None, prompt_embeds=None, negative_prompt_embeds=None ): if strength < 0 or strength > 1: raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") - + if num_inference_steps is None: + raise ValueError("`num_inference_steps` cannot be None.") + elif not isinstance(num_inference_steps, int) or num_inference_steps <= 0: + raise ValueError( + f"`num_inference_steps` has to be a positive integer but is {num_inference_steps} of type" + f" 
{type(num_inference_steps)}." + ) + if first_inference_step is not None and (not isinstance(first_inference_step, int) or first_inference_step <= 0): + raise ValueError( + f"`first_inference_step` has to be a positive integer but is {first_inference_step} of type" + f" {type(first_inference_step)}." + ) + if first_inference_step is not None and first_inference_step > num_inference_steps: + raise ValueError( + f"`first_inference_step` has to be smaller than `num_inference_steps` but is {first_inference_step} and" + f" `num_inference_steps` is {num_inference_steps}." + ) if (callback_steps is None) or ( callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) ): @@ -619,6 +635,8 @@ def __call__( ] = None, strength: float = 0.3, num_inference_steps: int = 50, + max_inference_steps: Optional[int] = None, + first_inference_step: Optional[int] = None, guidance_scale: float = 5.0, negative_prompt: Optional[Union[str, List[str]]] = None, num_images_per_prompt: Optional[int] = 1, @@ -659,6 +677,12 @@ def __call__( num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. + max_inference_steps (`int`, *optional*): + Instead of completing the backwards pass entirely, stop and return the output after this many steps. + Can be useful with `output_type="latent"` and an img2img pipeline, possibly with better fine detail. + first_inference_step (`int`, *optional*): + Ignore the first steps of the denoising process, and start from here. + Useful if the input is a latent tensor that still has residual noise, eg. using `max_inference_steps`. guidance_scale (`float`, *optional*, defaults to 7.5): Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). `guidance_scale` is defined as `w` of equation 2. 
of [Imagen @@ -737,7 +761,7 @@ def __call__( "not-safe-for-work" (nsfw) content, according to the `safety_checker`. """ # 1. Check inputs. Raise error if not correct - self.check_inputs(prompt, strength, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds) + self.check_inputs(prompt, strength, num_inference_steps, first_inference_step, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds) # 2. Define call parameters if prompt is not None and isinstance(prompt, str): @@ -822,6 +846,10 @@ def __call__( num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): + # skip a number of timesteps, if first_inference_step is set + if first_inference_step is not None and i < first_inference_step: + print(f'Skipping timestep {i} of {num_inference_steps} because of first_inference_step={first_inference_step}') + continue # expand the latents if we are doing classifier free guidance latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents @@ -855,6 +883,9 @@ def __call__( progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) + if max_inference_steps is not None and i >= max_inference_steps: + logger.debug(f'Breaking inference loop at step {i} as we have reached max_inference_steps={max_inference_steps}') + break # make sure the VAE is in float32 mode, as it overflows in float16 self.vae.to(dtype=torch.float32) From a4c6217d20a697dd60e6e928161dfa1d1fd7c81e Mon Sep 17 00:00:00 2001 From: bghira Date: Sun, 9 Jul 2023 17:46:06 -0700 Subject: [PATCH 03/16] diffusers#4003 - use first_inference_step as an input arg for get_timestamps in img2img --- .../pipeline_stable_diffusion_xl_img2img.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py 
b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py index 52c5fa832139..d829e53bf94f 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py @@ -510,9 +510,12 @@ def check_inputs( f" {negative_prompt_embeds.shape}." ) - def get_timesteps(self, num_inference_steps, strength, device): + def get_timesteps(self, num_inference_steps, first_inference_step, strength, device): # get the original timestep using init_timestep - init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + if first_inference_step is None: + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + else: + init_timestep = first_inference_step t_start = max(num_inference_steps - init_timestep, 0) timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] @@ -805,7 +808,7 @@ def __call__( # 5. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps, device=device) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, first_inference_step, strength, device) latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) # 6. 
Prepare latent variables @@ -846,10 +849,6 @@ def __call__( num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): - # skip a number of timesteps, if first_inference_step is set - if first_inference_step is not None and i < first_inference_step: - print(f'Skipping timestep {i} of {num_inference_steps} because of first_inference_step={first_inference_step}') - continue # expand the latents if we are doing classifier free guidance latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents From c63a53afd93d869b72c040d3bf7f3e9ae4776940 Mon Sep 17 00:00:00 2001 From: bghira Date: Sun, 9 Jul 2023 19:43:09 -0700 Subject: [PATCH 04/16] diffusers#4003 Do not add noise during img2img when we have a defined first timestep --- .../pipeline_stable_diffusion_xl_img2img.py | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py index d829e53bf94f..dec3b1ad37e1 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py @@ -515,14 +515,14 @@ def get_timesteps(self, num_inference_steps, first_inference_step, strength, dev if first_inference_step is None: init_timestep = min(int(num_inference_steps * strength), num_inference_steps) else: - init_timestep = first_inference_step + init_timestep = first_inference_step - 1 t_start = max(num_inference_steps - init_timestep, 0) timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] return timesteps, num_inference_steps - t_start - def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None): + def 
prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None, add_noise=True): if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)): raise ValueError( f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}" @@ -574,12 +574,12 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt ) else: init_latents = torch.cat([init_latents], dim=0) + if add_noise: + shape = init_latents.shape + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) - shape = init_latents.shape - noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) - - # get latents - init_latents = self.scheduler.add_noise(init_latents, noise, timestep) + # get latents + init_latents = self.scheduler.add_noise(init_latents, noise, timestep) latents = init_latents return latents @@ -671,7 +671,7 @@ def __call__( instead. image (`torch.FloatTensor` or `PIL.Image.Image` or `np.ndarray` or `List[torch.FloatTensor]` or `List[PIL.Image.Image]` or `List[np.ndarray]`): The image(s) to modify with the pipeline. - strength (`float`, *optional*, defaults to 0.8): + strength (`float`, *optional*, defaults to 0.3): Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` will be used as a starting point, adding more noise to it the larger the `strength`. The number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will @@ -810,10 +810,12 @@ def __call__( self.scheduler.set_timesteps(num_inference_steps, device=device) timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, first_inference_step, strength, device) latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) - + add_noise = True + if first_inference_step is not None: + add_noise = False # 6. 
Prepare latent variables latents = self.prepare_latents( - image, latent_timestep, batch_size, num_images_per_prompt, prompt_embeds.dtype, device, generator + image, latent_timestep, batch_size, num_images_per_prompt, prompt_embeds.dtype, device, generator, add_noise ) # 7. Prepare extra step kwargs. extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) From 062afbb4564bd21fb4727f9f80c9fa9658957355 Mon Sep 17 00:00:00 2001 From: bghira Date: Mon, 10 Jul 2023 18:28:15 -0700 Subject: [PATCH 05/16] diffusers#4003 Mild updates after revert --- .../pipeline_stable_diffusion_xl.py | 19 +++++++ .../pipeline_stable_diffusion_xl_img2img.py | 57 ++++++++++++------- 2 files changed, 57 insertions(+), 19 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py index 4f0854c3c142..a18c030f40fa 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py @@ -444,6 +444,25 @@ def prepare_extra_step_kwargs(self, generator, eta): extra_step_kwargs["generator"] = generator return extra_step_kwargs + def timesteps_from_strength( + self, strength: float, num_inference_steps: int + ): + """Retrieve values for `final_inference_step` and `begin_inference_step` from `strength`, `num_inference_steps` + + Args: + strength (float): A traditional img2img strength between 0.0 and 1.0, with higher values resulting in greater + influence from the img2img model and lower values, more influence from the base model. + num_inference_steps (int): The total number of inference steps to be taken. + Returns: + final_inference_step (int): The final inference step to be taken. + begin_inference_step (int): The inference step to begin img2img inference. + """ + # We need to invert the percentage. A strength of 0.0 should result in 100% of the inference steps. 
+ inverse_strength = 1.0 - strength + final_inference_step = int(num_inference_steps * inverse_strength) + begin_inference_step = final_inference_step + return final_inference_step, begin_inference_step + def check_inputs( self, prompt, diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py index dec3b1ad37e1..9ec709b8ef41 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py @@ -454,8 +454,27 @@ def prepare_extra_step_kwargs(self, generator, eta): extra_step_kwargs["generator"] = generator return extra_step_kwargs + def timesteps_from_strength( + self, strength: float, num_inference_steps: int + ): + """Retrieve values for `final_inference_step` and `begin_inference_step` from `strength`, `num_inference_steps` + + Args: + strength (float): A traditional img2img strength between 0.0 and 1.0, with higher values resulting in greater + influence from the img2img model and lower values, more influence from the base model. + num_inference_steps (int): The total number of inference steps to be taken. + Returns: + final_inference_step (int): The final inference step to be taken. + begin_inference_step (int): The inference step to begin img2img inference. + """ + # We need to invert the percentage. A strength of 0.0 should result in 100% of the inference steps. 
+ inverse_strength = 1.0 - strength + final_inference_step = int(num_inference_steps * inverse_strength) + begin_inference_step = final_inference_step + return final_inference_step, begin_inference_step + def check_inputs( - self, prompt, strength, num_inference_steps, first_inference_step, callback_steps, negative_prompt=None, prompt_embeds=None, negative_prompt_embeds=None + self, prompt, strength, num_inference_steps, begin_inference_step, callback_steps, negative_prompt=None, prompt_embeds=None, negative_prompt_embeds=None ): if strength < 0 or strength > 1: raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") @@ -466,14 +485,14 @@ def check_inputs( f"`num_inference_steps` has to be a positive integer but is {num_inference_steps} of type" f" {type(num_inference_steps)}." ) - if first_inference_step is not None and (not isinstance(first_inference_step, int) or first_inference_step <= 0): + if begin_inference_step is not None and (not isinstance(begin_inference_step, int) or begin_inference_step <= 0): raise ValueError( - f"`first_inference_step` has to be a positive integer but is {first_inference_step} of type" - f" {type(first_inference_step)}." + f"`begin_inference_step` has to be a positive integer but is {begin_inference_step} of type" + f" {type(begin_inference_step)}." ) - if first_inference_step is not None and first_inference_step > num_inference_steps: + if begin_inference_step is not None and begin_inference_step > num_inference_steps: raise ValueError( - f"`first_inference_step` has to be smaller than `num_inference_steps` but is {first_inference_step} and" + f"`begin_inference_step` has to be smaller than `num_inference_steps` but is {begin_inference_step} and" f" `num_inference_steps` is {num_inference_steps}." ) if (callback_steps is None) or ( @@ -510,12 +529,12 @@ def check_inputs( f" {negative_prompt_embeds.shape}." 
) - def get_timesteps(self, num_inference_steps, first_inference_step, strength, device): + def get_timesteps(self, num_inference_steps, begin_inference_step, strength, device): # get the original timestep using init_timestep - if first_inference_step is None: + if begin_inference_step is None: init_timestep = min(int(num_inference_steps * strength), num_inference_steps) else: - init_timestep = first_inference_step - 1 + init_timestep = begin_inference_step - 1 t_start = max(num_inference_steps - init_timestep, 0) timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] @@ -638,8 +657,8 @@ def __call__( ] = None, strength: float = 0.3, num_inference_steps: int = 50, - max_inference_steps: Optional[int] = None, - first_inference_step: Optional[int] = None, + final_inference_step: Optional[int] = None, + begin_inference_step: Optional[int] = None, guidance_scale: float = 5.0, negative_prompt: Optional[Union[str, List[str]]] = None, num_images_per_prompt: Optional[int] = 1, @@ -680,12 +699,12 @@ def __call__( num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. - max_inference_steps (`int`, *optional*): + final_inference_step (`int`, *optional*): Instead of completing the backwards pass entirely, stop and return the output after this many steps. Can be useful with `output_type="latent"` and an img2img pipeline, possibly with better fine detail. - first_inference_step (`int`, *optional*): + begin_inference_step (`int`, *optional*): Ignore the first steps of the denoising process, and start from here. - Useful if the input is a latent tensor that still has residual noise, eg. using `max_inference_steps`. + Useful if the input is a latent tensor that still has residual noise, eg. using `final_inference_step`. 
guidance_scale (`float`, *optional*, defaults to 7.5): Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). `guidance_scale` is defined as `w` of equation 2. of [Imagen @@ -764,7 +783,7 @@ def __call__( "not-safe-for-work" (nsfw) content, according to the `safety_checker`. """ # 1. Check inputs. Raise error if not correct - self.check_inputs(prompt, strength, num_inference_steps, first_inference_step, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds) + self.check_inputs(prompt, strength, num_inference_steps, begin_inference_step, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds) # 2. Define call parameters if prompt is not None and isinstance(prompt, str): @@ -808,10 +827,10 @@ def __call__( # 5. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps, device=device) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, first_inference_step, strength, device) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, begin_inference_step, strength, device) latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) add_noise = True - if first_inference_step is not None: + if begin_inference_step is not None: add_noise = False # 6. 
Prepare latent variables latents = self.prepare_latents( @@ -884,8 +903,8 @@ def __call__( progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) - if max_inference_steps is not None and i >= max_inference_steps: - logger.debug(f'Breaking inference loop at step {i} as we have reached max_inference_steps={max_inference_steps}') + if final_inference_step is not None and i >= final_inference_step: + logger.debug(f'Breaking inference loop at step {i} as we have reached final_inference_step={final_inference_step}') break # make sure the VAE is in float32 mode, as it overflows in float16 From 37d3dbc2059f54adcb5ab477e46c004695bf7946 Mon Sep 17 00:00:00 2001 From: bghira Date: Mon, 10 Jul 2023 19:06:24 -0700 Subject: [PATCH 06/16] diffusers#4003 Missing change --- .../stable_diffusion_xl/pipeline_stable_diffusion_xl.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py index a18c030f40fa..60e37ac3c818 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py @@ -564,7 +564,7 @@ def __call__( height: Optional[int] = None, width: Optional[int] = None, num_inference_steps: int = 50, - max_inference_steps: Optional[int] = None, + final_inference_step: Optional[int] = None, guidance_scale: float = 5.0, negative_prompt: Optional[Union[str, List[str]]] = None, num_images_per_prompt: Optional[int] = 1, @@ -599,7 +599,7 @@ def __call__( num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. 
- max_inference_steps (`int`, *optional*): + final_inference_step (`int`, *optional*): Instead of completing the backwards pass entirely, stop and return the output after this many steps. Can be useful with `output_type="latent"` and an img2img pipeline, possibly with better fine detail. guidance_scale (`float`, *optional*, defaults to 7.5): @@ -805,8 +805,8 @@ def __call__( progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) - if max_inference_steps is not None and i >= max_inference_steps: - logger.debug(f'Breaking inference loop at step {i} as we have reached max_inference_steps={max_inference_steps}') + if final_inference_step is not None and i >= final_inference_step: + logger.debug(f'Breaking inference loop at step {i} as we have reached final_inference_step={final_inference_step}') break # make sure the VAE is in float32 mode, as it overflows in float16 From 577b7915c73f808b4e8ac2e7ec81aa68cad4d580 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 11 Jul 2023 08:55:49 +0000 Subject: [PATCH 07/16] Show implementation with denoising_start and end --- .../pipeline_stable_diffusion_xl.py | 10 ++-- .../pipeline_stable_diffusion_xl_img2img.py | 52 ++++++++++--------- 2 files changed, 34 insertions(+), 28 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py index 60e37ac3c818..4820a78a6584 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py @@ -565,6 +565,7 @@ def __call__( width: Optional[int] = None, num_inference_steps: int = 50, final_inference_step: Optional[int] = None, + denoising_end: Optional[float] = None, guidance_scale: float = 5.0, negative_prompt: Optional[Union[str, List[str]]] = None, num_images_per_prompt: Optional[int] = 1, @@ -770,6 +771,12 @@ def 
__call__( # 8. Denoising loop num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + + # 7.1 Apply denoising_end + if denoising_end is not None: + num_inference_steps = int(round(denoising_end * num_inference_steps)) + timesteps = timesteps[:num_warmup_steps + self.scheduler.order * num_inference_steps] + with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance @@ -805,9 +812,6 @@ def __call__( progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) - if final_inference_step is not None and i >= final_inference_step: - logger.debug(f'Breaking inference loop at step {i} as we have reached final_inference_step={final_inference_step}') - break # make sure the VAE is in float32 mode, as it overflows in float16 self.vae.to(dtype=torch.float32) diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py index 9ec709b8ef41..43b35459f4fb 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py @@ -474,7 +474,7 @@ def timesteps_from_strength( return final_inference_step, begin_inference_step def check_inputs( - self, prompt, strength, num_inference_steps, begin_inference_step, callback_steps, negative_prompt=None, prompt_embeds=None, negative_prompt_embeds=None + self, prompt, strength, num_inference_steps, callback_steps, negative_prompt=None, prompt_embeds=None, negative_prompt_embeds=None ): if strength < 0 or strength > 1: raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") @@ -485,16 +485,6 @@ def check_inputs( f"`num_inference_steps` has to be a positive integer but is {num_inference_steps} of type" f" 
{type(num_inference_steps)}." ) - if begin_inference_step is not None and (not isinstance(begin_inference_step, int) or begin_inference_step <= 0): - raise ValueError( - f"`begin_inference_step` has to be a positive integer but is {begin_inference_step} of type" - f" {type(begin_inference_step)}." - ) - if begin_inference_step is not None and begin_inference_step > num_inference_steps: - raise ValueError( - f"`begin_inference_step` has to be smaller than `num_inference_steps` but is {begin_inference_step} and" - f" `num_inference_steps` is {num_inference_steps}." - ) if (callback_steps is None) or ( callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) ): @@ -529,14 +519,14 @@ def check_inputs( f" {negative_prompt_embeds.shape}." ) - def get_timesteps(self, num_inference_steps, begin_inference_step, strength, device): + def get_timesteps(self, num_inference_steps, strength, device, denoising_start=None): # get the original timestep using init_timestep - if begin_inference_step is None: + if denoising_start is None: init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + t_start = max(num_inference_steps - init_timestep, 0) else: - init_timestep = begin_inference_step - 1 + t_start = int(round(denoising_start * num_inference_steps)) - t_start = max(num_inference_steps - init_timestep, 0) timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] return timesteps, num_inference_steps - t_start @@ -657,8 +647,8 @@ def __call__( ] = None, strength: float = 0.3, num_inference_steps: int = 50, - final_inference_step: Optional[int] = None, - begin_inference_step: Optional[int] = None, + denoising_start: Optional[float] = None, + denoising_end: Optional[float] = None, guidance_scale: float = 5.0, negative_prompt: Optional[Union[str, List[str]]] = None, num_images_per_prompt: Optional[int] = 1, @@ -783,7 +773,7 @@ def __call__( "not-safe-for-work" (nsfw) content, according to the `safety_checker`. 
""" # 1. Check inputs. Raise error if not correct - self.check_inputs(prompt, strength, num_inference_steps, begin_inference_step, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds) + self.check_inputs(prompt, strength, num_inference_steps, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds) # 2. Define call parameters if prompt is not None and isinstance(prompt, str): @@ -827,11 +817,11 @@ def __call__( # 5. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps, device=device) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, begin_inference_step, strength, device) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device, denoising_start=denoising_start) latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) - add_noise = True - if begin_inference_step is not None: - add_noise = False + + add_noise = True if denoising_start is None else False + # 6. Prepare latent variables latents = self.prepare_latents( image, latent_timestep, batch_size, num_images_per_prompt, prompt_embeds.dtype, device, generator, add_noise @@ -868,6 +858,21 @@ def __call__( # 9. 
Denoising loop num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + + # 9.1 Apply denoising_end + if denoising_end is not None and denoising_start is not None: + if denoising_end <= denoising_start: + raise ValueError(f"`denoising_end`: {denoising_end} cannot be larger than `denoising_start`: {denoising_start}.") + + orig_num_inference_steps = int(round(num_inference_steps / (1 - denoising_start))) + skipped_final_steps = int(round((1 - denoising_end) * orig_num_inference_steps)) + + num_inference_steps = num_inference_steps - skipped_final_steps + timesteps = timesteps[:num_warmup_steps + self.scheduler.order * num_inference_steps] + elif denoising_end is not None: + num_inference_steps = int(round(denoising_end * num_inference_steps)) + timesteps = timesteps[:num_warmup_steps + self.scheduler.order * num_inference_steps] + with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance @@ -903,9 +908,6 @@ def __call__( progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) - if final_inference_step is not None and i >= final_inference_step: - logger.debug(f'Breaking inference loop at step {i} as we have reached final_inference_step={final_inference_step}') - break # make sure the VAE is in float32 mode, as it overflows in float16 self.vae.to(dtype=torch.float32) From 2e27720c84feac2d8500e38a00aa0d4f9979a25c Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 11 Jul 2023 10:58:34 +0200 Subject: [PATCH 08/16] Apply suggestions from code review --- .../pipeline_stable_diffusion_xl.py | 20 ------------------- .../pipeline_stable_diffusion_xl_img2img.py | 19 ------------------ 2 files changed, 39 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py index 
4820a78a6584..7cc0c8adb1f6 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py @@ -444,25 +444,6 @@ def prepare_extra_step_kwargs(self, generator, eta): extra_step_kwargs["generator"] = generator return extra_step_kwargs - def timesteps_from_strength( - self, strength: float, num_inference_steps: int - ): - """Retrieve values for `final_inference_step` and `begin_inference_step` from `strength`, `num_inference_steps` - - Args: - strength (float): A traditional img2img strength between 0.0 and 1.0, with higher values resulting in greater - influence from the img2img model and lower values, more influence from the base model. - num_inference_steps (int): The total number of inference steps to be taken. - Returns: - final_inference_step (int): The final inference step to be taken. - begin_inference_step (int): The inference step to begin img2img inference. - """ - # We need to invert the percentage. A strength of 0.0 should result in 100% of the inference steps. 
- inverse_strength = 1.0 - strength - final_inference_step = int(num_inference_steps * inverse_strength) - begin_inference_step = final_inference_step - return final_inference_step, begin_inference_step - def check_inputs( self, prompt, @@ -564,7 +545,6 @@ def __call__( height: Optional[int] = None, width: Optional[int] = None, num_inference_steps: int = 50, - final_inference_step: Optional[int] = None, denoising_end: Optional[float] = None, guidance_scale: float = 5.0, negative_prompt: Optional[Union[str, List[str]]] = None, diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py index 43b35459f4fb..cf3ae79b18ad 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py @@ -454,25 +454,6 @@ def prepare_extra_step_kwargs(self, generator, eta): extra_step_kwargs["generator"] = generator return extra_step_kwargs - def timesteps_from_strength( - self, strength: float, num_inference_steps: int - ): - """Retrieve values for `final_inference_step` and `begin_inference_step` from `strength`, `num_inference_steps` - - Args: - strength (float): A traditional img2img strength between 0.0 and 1.0, with higher values resulting in greater - influence from the img2img model and lower values, more influence from the base model. - num_inference_steps (int): The total number of inference steps to be taken. - Returns: - final_inference_step (int): The final inference step to be taken. - begin_inference_step (int): The inference step to begin img2img inference. - """ - # We need to invert the percentage. A strength of 0.0 should result in 100% of the inference steps. 
- inverse_strength = 1.0 - strength - final_inference_step = int(num_inference_steps * inverse_strength) - begin_inference_step = final_inference_step - return final_inference_step, begin_inference_step - def check_inputs( self, prompt, strength, num_inference_steps, callback_steps, negative_prompt=None, prompt_embeds=None, negative_prompt_embeds=None ): From f7e8f05f4e3dc41d601977087e14c5046a89ef92 Mon Sep 17 00:00:00 2001 From: Bagheera <59658056+bghira@users.noreply.github.com> Date: Tue, 11 Jul 2023 07:36:47 -0700 Subject: [PATCH 09/16] Update src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py Co-authored-by: Pedro Cuenca --- .../stable_diffusion_xl/pipeline_stable_diffusion_xl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py index 7cc0c8adb1f6..643465bbf6f1 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py @@ -581,7 +581,7 @@ def __call__( The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. final_inference_step (`int`, *optional*): - Instead of completing the backwards pass entirely, stop and return the output after this many steps. + Instead of completing the denoising pass entirely, stop and return the output after this many steps. Can be useful with `output_type="latent"` and an img2img pipeline, possibly with better fine detail. guidance_scale (`float`, *optional*, defaults to 7.5): Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). 
From 664cdb7e3b3ec27bb342142bdc80f44fbb024d86 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 11 Jul 2023 20:47:30 +0000 Subject: [PATCH 10/16] move to 0.19.0dev --- .../pipeline_stable_diffusion_xl.py | 12 ++-- .../pipeline_stable_diffusion_xl_img2img.py | 67 ++++++++++++++----- 2 files changed, 60 insertions(+), 19 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py index 7cc0c8adb1f6..cf0a520926bf 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py @@ -580,9 +580,13 @@ def __call__( num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. - final_inference_step (`int`, *optional*): - Instead of completing the backwards pass entirely, stop and return the output after this many steps. - Can be useful with `output_type="latent"` and an img2img pipeline, possibly with better fine detail. + denoising_end (`float`, *optional*): + When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be + completed before it is intentionally prematurely terminated. For instance, if denoising_end is set to + 0.7 and `num_inference_steps` is fixed at 50, the process will execute only 35 (i.e., 0.7 * 50) + denoising steps. As a result, the returned sample will still retain a substantial amount of noise. The + denoising_end parameter should ideally be utilized when this pipeline forms a part of a "Mixture of + Denoisers" multi-pipeline setup, as elaborated in []. guidance_scale (`float`, *optional*, defaults to 7.5): Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). `guidance_scale` is defined as `w` of equation 2. 
of [Imagen @@ -755,7 +759,7 @@ def __call__( # 7.1 Apply denoising_end if denoising_end is not None: num_inference_steps = int(round(denoising_end * num_inference_steps)) - timesteps = timesteps[:num_warmup_steps + self.scheduler.order * num_inference_steps] + timesteps = timesteps[: num_warmup_steps + self.scheduler.order * num_inference_steps] with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py index cf3ae79b18ad..50f1f3b2fca2 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py @@ -455,7 +455,14 @@ def prepare_extra_step_kwargs(self, generator, eta): return extra_step_kwargs def check_inputs( - self, prompt, strength, num_inference_steps, callback_steps, negative_prompt=None, prompt_embeds=None, negative_prompt_embeds=None + self, + prompt, + strength, + num_inference_steps, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, ): if strength < 0 or strength > 1: raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") @@ -512,7 +519,9 @@ def get_timesteps(self, num_inference_steps, strength, device, denoising_start=N return timesteps, num_inference_steps - t_start - def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None, add_noise=True): + def prepare_latents( + self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None, add_noise=True + ): if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)): raise ValueError( f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}" @@ -670,12 +679,22 @@ def __call__( 
num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. - final_inference_step (`int`, *optional*): - Instead of completing the backwards pass entirely, stop and return the output after this many steps. - Can be useful with `output_type="latent"` and an img2img pipeline, possibly with better fine detail. - begin_inference_step (`int`, *optional*): - Ignore the first steps of the denoising process, and start from here. - Useful if the input is a latent tensor that still has residual noise, eg. using `final_inference_step`. + denoising_start (`float`, *optional*): + When specified, indicates the fraction (between 0.0 and 1.0) of the total denoising process to be + bypassed before it is initiated. For example, if `denoising_start` is set to 0.7 and + num_inference_steps is fixed at 50, the process will begin only from the 35th (i.e., 0.7 * 50) + denoising step. Consequently, the initial part of the denoising process is skipped and it is assumed + that the passed `image` is a partly denoised image. The `denoising_start` parameter is particularly + beneficial when this pipeline is integrated into a "Mixture of Denoisers" multi-pipeline setup, as + detailed in []. + denoising_end (`float`, *optional*): + When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be + completed before it is intentionally prematurely terminated. For instance, if denoising_end is set to + 0.7 and `num_inference_steps` is fixed at 50, the process will execute only 35 (i.e., 0.7 * 50) + denoising steps. As a result, the returned sample will still retain a substantial amount of noise (ca. + 30%) and should be denoised by a successor pipeline that has `denoising_start` set to 0.7 so that it + only denoised the final 30%. 
The denoising_end parameter should ideally be utilized when this pipeline + forms a part of a "Mixture of Denoisers" multi-pipeline setup, as elaborated in []. guidance_scale (`float`, *optional*, defaults to 7.5): Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). `guidance_scale` is defined as `w` of equation 2. of [Imagen @@ -754,7 +773,15 @@ def __call__( "not-safe-for-work" (nsfw) content, according to the `safety_checker`. """ # 1. Check inputs. Raise error if not correct - self.check_inputs(prompt, strength, num_inference_steps, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds) + self.check_inputs( + prompt, + strength, + num_inference_steps, + callback_steps, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + ) # 2. Define call parameters if prompt is not None and isinstance(prompt, str): @@ -798,14 +825,22 @@ def __call__( # 5. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps, device=device) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device, denoising_start=denoising_start) + timesteps, num_inference_steps = self.get_timesteps( + num_inference_steps, strength, device, denoising_start=denoising_start + ) latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) add_noise = True if denoising_start is None else False - # 6. Prepare latent variables latents = self.prepare_latents( - image, latent_timestep, batch_size, num_images_per_prompt, prompt_embeds.dtype, device, generator, add_noise + image, + latent_timestep, + batch_size, + num_images_per_prompt, + prompt_embeds.dtype, + device, + generator, + add_noise, ) # 7. Prepare extra step kwargs. 
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) @@ -843,16 +878,18 @@ def __call__( # 9.1 Apply denoising_end if denoising_end is not None and denoising_start is not None: if denoising_end <= denoising_start: - raise ValueError(f"`denoising_end`: {denoising_end} cannot be larger than `denoising_start`: {denoising_start}.") + raise ValueError( + f"`denoising_end`: {denoising_end} cannot be larger than `denoising_start`: {denoising_start}." + ) orig_num_inference_steps = int(round(num_inference_steps / (1 - denoising_start))) skipped_final_steps = int(round((1 - denoising_end) * orig_num_inference_steps)) num_inference_steps = num_inference_steps - skipped_final_steps - timesteps = timesteps[:num_warmup_steps + self.scheduler.order * num_inference_steps] + timesteps = timesteps[: num_warmup_steps + self.scheduler.order * num_inference_steps] elif denoising_end is not None: num_inference_steps = int(round(denoising_end * num_inference_steps)) - timesteps = timesteps[:num_warmup_steps + self.scheduler.order * num_inference_steps] + timesteps = timesteps[: num_warmup_steps + self.scheduler.order * num_inference_steps] with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): From f6fce2dd7951b2d50c2482a2989fb07a4a0a003d Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 11 Jul 2023 22:50:27 +0200 Subject: [PATCH 11/16] Apply suggestions from code review --- .../stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py index e3ac4472b2e7..ed72f60f97f7 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py @@ -878,7 +878,7 @@ def 
__call__( # 9.1 Apply denoising_end if denoising_end is not None and denoising_start is not None: - if denoising_end <= denoising_start: + if denoising_start >= denoising_end: raise ValueError( f"`denoising_end`: {denoising_end} cannot be larger than `denoising_start`: {denoising_start}." ) From 50addfa85508db515b40bfdddbe6bcf60b1fca6b Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 11 Jul 2023 22:28:45 +0000 Subject: [PATCH 12/16] add exhaustive tests --- .../pipeline_stable_diffusion_xl.py | 2 +- .../pipeline_stable_diffusion_xl_img2img.py | 11 +- .../test_stable_diffusion_xl.py | 133 ++++++++++++++++++ 3 files changed, 140 insertions(+), 6 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py index 6c760db62bcc..a8045cbec2c8 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py @@ -754,7 +754,7 @@ def __call__( add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1) # 8. 
Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) # 7.1 Apply denoising_end if denoising_end is not None: diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py index e3ac4472b2e7..031d6d8af892 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py @@ -574,12 +574,13 @@ def prepare_latents( ) else: init_latents = torch.cat([init_latents], dim=0) + if add_noise: shape = init_latents.shape noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) - # get latents init_latents = self.scheduler.add_noise(init_latents, noise, timestep) + latents = init_latents return latents @@ -825,6 +826,8 @@ def __call__( image = self.image_processor.preprocess(image) # 5. Prepare timesteps + original_num_steps = num_inference_steps # save for denoising_start/end later + self.scheduler.set_timesteps(num_inference_steps, device=device) timesteps, num_inference_steps = self.get_timesteps( num_inference_steps, strength, device, denoising_start=denoising_start @@ -874,7 +877,7 @@ def __call__( add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1) # 9. Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) # 9.1 Apply denoising_end if denoising_end is not None and denoising_start is not None: @@ -883,9 +886,7 @@ def __call__( f"`denoising_end`: {denoising_end} cannot be larger than `denoising_start`: {denoising_start}." 
) - orig_num_inference_steps = int(round(num_inference_steps / (1 - denoising_start))) - skipped_final_steps = int(round((1 - denoising_end) * orig_num_inference_steps)) - + skipped_final_steps = int(round((1 - denoising_end) * original_num_steps)) num_inference_steps = num_inference_steps - skipped_final_steps timesteps = timesteps[: num_warmup_steps + self.scheduler.order * num_inference_steps] elif denoising_end is not None: diff --git a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py index e481e85916d2..3f4dd19c9bd9 100644 --- a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py +++ b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import copy import unittest import numpy as np @@ -21,9 +22,14 @@ from diffusers import ( AutoencoderKL, + DDIMScheduler, + DPMSolverMultistepScheduler, EulerDiscreteScheduler, + HeunDiscreteScheduler, + StableDiffusionXLImg2ImgPipeline, StableDiffusionXLPipeline, UNet2DConditionModel, + UniPCMultistepScheduler, ) from diffusers.utils import torch_device from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu @@ -216,3 +222,130 @@ def test_stable_diffusion_xl_offloads(self): assert np.abs(image_slices[0] - image_slices[1]).max() < 1e-3 assert np.abs(image_slices[0] - image_slices[2]).max() < 1e-3 + + def test_stable_diffusion_two_xl_mixture_of_denoiser(self): + components = self.get_dummy_components() + pipe_1 = StableDiffusionXLPipeline(**components).to(torch_device) + pipe_1.unet.set_default_attn_processor() + pipe_2 = StableDiffusionXLImg2ImgPipeline(**components).to(torch_device) + pipe_2.unet.set_default_attn_processor() + + def assert_run_mixture(num_steps, split, scheduler_cls): + inputs = self.get_dummy_inputs(torch_device) + inputs["num_inference_steps"] = num_steps + 
+ pipe_1.scheduler = scheduler_cls.from_config(pipe_1.scheduler.config) + pipe_2.scheduler = scheduler_cls.from_config(pipe_2.scheduler.config) + + # Let's retrieve the number of timesteps we want to use + pipe_1.scheduler.set_timesteps(num_steps) + expected_steps = pipe_1.scheduler.timesteps.tolist() + + split_id = int(round(split * num_steps)) * pipe_1.scheduler.order + expected_steps_1 = expected_steps[:split_id] + expected_steps_2 = expected_steps[split_id:] + + # now we monkey patch step `done_steps` + # list into the step function for testing + done_steps = [] + old_step = copy.copy(scheduler_cls.step) + + def new_step(self, *args, **kwargs): + done_steps.append(args[1].cpu().item()) # args[1] is always the passed `t` + return old_step(self, *args, **kwargs) + + scheduler_cls.step = new_step + + inputs_1 = {**inputs, **{"denoising_end": split, "output_type": "latent"}} + latents = pipe_1(**inputs_1).images[0] + + assert expected_steps_1 == done_steps, f"Failure with {scheduler_cls.__name__} and {num_steps} and {split}" + + inputs_2 = {**inputs, **{"denoising_start": split, "image": latents}} + pipe_2(**inputs_2).images[0] + + assert expected_steps_2 == done_steps[len(expected_steps_1) :] + assert expected_steps == done_steps, f"Failure with {scheduler_cls.__name__} and {num_steps} and {split}" + + for steps in [5, 8]: + for split in [0.33, 0.49, 0.71]: + for scheduler_cls in [ + DDIMScheduler, + EulerDiscreteScheduler, + DPMSolverMultistepScheduler, + UniPCMultistepScheduler, + HeunDiscreteScheduler, + ]: + assert_run_mixture(steps, split, scheduler_cls) + + def test_stable_diffusion_three_xl_mixture_of_denoiser(self): + components = self.get_dummy_components() + pipe_1 = StableDiffusionXLPipeline(**components).to(torch_device) + pipe_1.unet.set_default_attn_processor() + pipe_2 = StableDiffusionXLImg2ImgPipeline(**components).to(torch_device) + pipe_2.unet.set_default_attn_processor() + pipe_3 = StableDiffusionXLImg2ImgPipeline(**components).to(torch_device) 
+ pipe_3.unet.set_default_attn_processor() + + def assert_run_mixture(num_steps, split_1, split_2, scheduler_cls): + inputs = self.get_dummy_inputs(torch_device) + inputs["num_inference_steps"] = num_steps + + pipe_1.scheduler = scheduler_cls.from_config(pipe_1.scheduler.config) + pipe_2.scheduler = scheduler_cls.from_config(pipe_2.scheduler.config) + pipe_3.scheduler = scheduler_cls.from_config(pipe_3.scheduler.config) + + # Let's retrieve the number of timesteps we want to use + pipe_1.scheduler.set_timesteps(num_steps) + expected_steps = pipe_1.scheduler.timesteps.tolist() + + split_id_1 = int(round(split_1 * num_steps)) * pipe_1.scheduler.order + split_id_2 = int(round(split_2 * num_steps)) * pipe_1.scheduler.order + expected_steps_1 = expected_steps[:split_id_1] + expected_steps_2 = expected_steps[split_id_1:split_id_2] + expected_steps_3 = expected_steps[split_id_2:] + + # now we monkey patch step `done_steps` + # list into the step function for testing + done_steps = [] + old_step = copy.copy(scheduler_cls.step) + + def new_step(self, *args, **kwargs): + done_steps.append(args[1].cpu().item()) # args[1] is always the passed `t` + return old_step(self, *args, **kwargs) + + scheduler_cls.step = new_step + + inputs_1 = {**inputs, **{"denoising_end": split_1, "output_type": "latent"}} + latents = pipe_1(**inputs_1).images[0] + + assert ( + expected_steps_1 == done_steps + ), f"Failure with {scheduler_cls.__name__} and {num_steps} and {split_1} and {split_2}" + + inputs_2 = { + **inputs, + **{"denoising_start": split_1, "denoising_end": split_2, "image": latents, "output_type": "latent"}, + } + pipe_2(**inputs_2).images[0] + + assert expected_steps_2 == done_steps[len(expected_steps_1) :] + + inputs_3 = {**inputs, **{"denoising_start": split_2, "image": latents}} + pipe_3(**inputs_3).images[0] + + assert expected_steps_3 == done_steps[len(expected_steps_1) + len(expected_steps_2) :] + assert ( + expected_steps == done_steps + ), f"Failure with 
{scheduler_cls.__name__} and {num_steps} and {split_1} and {split_2}" + + for steps in [7, 11]: + for split_1, split_2 in zip([0.19, 0.32], [0.81, 0.68]): + for scheduler_cls in [ + DDIMScheduler, + EulerDiscreteScheduler, + DPMSolverMultistepScheduler, + UniPCMultistepScheduler, + HeunDiscreteScheduler, + ]: + assert_run_mixture(steps, split_1, split_2, scheduler_cls) From 373aa94b3cf064a29ea99ea99e210478e2221a60 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 11 Jul 2023 23:02:59 +0000 Subject: [PATCH 13/16] add docs --- .../en/api/pipelines/stable_diffusion/stable_diffusion_xl.mdx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_xl.mdx b/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_xl.mdx index 6df873edab00..9d0f6fd9abe1 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_xl.mdx +++ b/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_xl.mdx @@ -59,6 +59,8 @@ image = pipe(prompt=prompt).images[0] ### Refining the image output +#### Refining the image output from + The image can be refined by making use of [stabilityai/stable-diffusion-xl-refiner-0.9](https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-0.9). In this case, you only have to output the `latents` from the base model. 
From f5c05f115e52eb2a9af282809be90937154131c3 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 11 Jul 2023 23:34:36 +0000 Subject: [PATCH 14/16] finish --- .../stable_diffusion/stable_diffusion_xl.mdx | 106 ++++++++++++++++-- 1 file changed, 98 insertions(+), 8 deletions(-) diff --git a/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_xl.mdx b/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_xl.mdx index 9d0f6fd9abe1..f3381187974d 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_xl.mdx +++ b/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_xl.mdx @@ -59,23 +59,113 @@ image = pipe(prompt=prompt).images[0] ### Refining the image output -#### Refining the image output from +In addition to the [base model checkpoint](https://huggingface.co/stabilityai/stable-diffusion-xl-base-0.9), +StableDiffusion-XL also includes a [refiner checkpoint](https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-0.9) +that is specialized in denoising low-noise stage images to generate images of improved high-frequency quality. +This refiner checkpoint can be used as a "second-step" pipeline after having run the base checkpoint to improve +image quality. -The image can be refined by making use of [stabilityai/stable-diffusion-xl-refiner-0.9](https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-0.9). -In this case, you only have to output the `latents` from the base model. +When using the refiner, one can easier +- 1.) employ the base model and refiner as an *Ensemble of Expert Denoisers* as first proposed in [eDiff-I](https://research.nvidia.com/labs/dir/eDiff-I/) or +- 2.) simply run the refiner in [SDEdit](https://arxiv.org/abs/2108.01073) fashion after the base model.
+ +**Note**: The idea of using SD-XL base & refiner as an ensemble of experts was first brought forward by +a couple of community contributors who also helped shape the following `diffusers` implementation, namely: +- [SytanSD](https://github.com/SytanSD) +- [bghira](https://github.com/bghira) +- [Birch-san](https://github.com/Birch-san) + +#### 1.) Ensemble of Expert Denoisers + +When using the base and refiner model as an ensemble of expert of denoisers, the base model should serve as the +expert for the high-noise diffusion stage and the refiner serves as the expert for the low-noise diffusion stage. + +The advantage of 1.) over 2.) is that it requires less overall denoising steps and therefore should be significantly +faster. The drawback is that one cannot really inspect the output of the base model is it will still be heavily denoised. + +To use the base model and refiner as an ensemble of expert denoisers, make sure to define the fraction +of timesteps which should be run through the high-noise denoising stage (*i.e.* the base model) and the low-noise +denoising stage (*i.e.* the refiner model) respectively. This fraction should be set as the [`denoising_end`]() of the base model +and as the [`denoising_start`]() of the refiner model. + +Let's look at an example. +First, we import the two pipelines. Since the text encoder and variational autoencoder are the same +you don't have to load those again for the refiner.
```py -from diffusers import StableDiffusionXLPipeline, StableDiffusionXLImg2ImgPipeline +from diffusers import DiffusionPipeline import torch -pipe = StableDiffusionXLPipeline.from_pretrained( +base = DiffusionPipeline.from_pretrained( "stabilityai/stable-diffusion-xl-base-0.9", torch_dtype=torch.float16, variant="fp16", use_safetensors=True ) pipe.to("cuda") -use_refiner = True -refiner = StableDiffusionXLImg2ImgPipeline.from_pretrained( - "stabilityai/stable-diffusion-xl-refiner-0.9", torch_dtype=torch.float16, use_safetensors=True, variant="fp16" +refiner = DiffusionPipeline.from_pretrained( + "stabilityai/stable-diffusion-xl-refiner-0.9", + text_encoder_2=base.text_encoder_2, + vae=base.vae, + torch_dtype=torch.float16, + use_safetensors=True, + variant="fp16", +) +refiner.to("cuda") +``` + +Now we define the number of inference steps and the fraction at which the model shall be run through the +high-noise denoising stage (*i.e.* the base model). + +```py +n_steps = 40 +high_noise_frac = 0.7 +``` + +A fraction of 0.7 means that 70% of the 40 inference steps (28 steps) are run through the base model +and the remaining 12 steps are run through the refiner. Let's run the two pipelines now. +Make sure to set `denoising_end` and `denoising_start` to the same values and keep `num_inference_steps` +constant. Also remember that the output of the base model should be in latent space: + +```py +prompt = "A majestic lion jumping from a big stone at night" + +image = base(prompt=prompt, num_inference_steps=n_steps, denoising_end=high_noise_frac, output_type="latent").images +image = refiner(prompt=prompt, num_inference_steps=n_steps, denoising_start=high_noise_frac, image=image).images[0] +``` + +Let's have a look at the image + +![lion_ref](https://huggingface.co/datasets/huggingface/documentation-images/blob/main/diffusers/lion_refined.png) + +If we would have just run the base model on the same 40 steps, the image would have been arguably less detailed (e.g. 
the lion eyes and nose): + +![lion_base](https://huggingface.co/datasets/huggingface/documentation-images/blob/main/diffusers/lion_base.png) + +**Note**: The ensemble-of-experts method works well on all available schedulers! + +#### Refining the image output from fully denoised base image + +In standard [`StableDiffusionImg2ImgPipeline`]-fashion, the fully-denoised image generated by the base model +can be further improved using the [refiner checkpoint](https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-0.9). + +For this, you simply run the refiner as a normal image-to-image pipeline after the "base" text-to-image +pipeline. You can leave the outputs of the base model in latent space. ```py +from diffusers import DiffusionPipeline +import torch + +pipe = DiffusionPipeline.from_pretrained( + "stabilityai/stable-diffusion-xl-base-0.9", torch_dtype=torch.float16, variant="fp16", use_safetensors=True +) +pipe.to("cuda") + +refiner = DiffusionPipeline.from_pretrained( + "stabilityai/stable-diffusion-xl-refiner-0.9", + text_encoder_2=pipe_high_noise.text_encoder_2, + vae=pipe_high_noise.vae, + torch_dtype=torch.float16, + use_safetensors=True, + variant="fp16", ) refiner.to("cuda") From 50ea36b42fbe1e8e062219d6df8c0de450854d79 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 12 Jul 2023 13:17:55 +0200 Subject: [PATCH 15/16] Apply suggestions from code review Co-authored-by: Sayak Paul --- .../stable_diffusion/stable_diffusion_xl.mdx | 20 +++++++++++-------- .../pipeline_stable_diffusion_xl.py | 2 +- .../pipeline_stable_diffusion_xl_img2img.py | 4 ++-- 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_xl.mdx b/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_xl.mdx index f3381187974d..1c5ea390a49f 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_xl.mdx +++ b/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_xl.mdx @@ -65,7
+65,7 @@ that is specialized in denoising low-noise stage images to generate images of im This refiner checkpoint can be used as a "second-step" pipeline after having run the base checkpoint to improve image quality. -When using the refiner, one can easier +When using the refiner, one can easily - 1.) employ the base model and refiner as an *Ensemble of Expert Denoisers* as first proposed in [eDiff-I](https://research.nvidia.com/labs/dir/eDiff-I/) or - 2.) simply run the refiner in [SDEdit](https://arxiv.org/abs/2108.01073) fashion after the base model. @@ -81,15 +81,15 @@ When using the base and refiner model as an ensemble of expert of denoisers, the expert for the high-noise diffusion stage and the refiner serves as the expert for the low-noise diffusion stage. The advantage of 1.) over 2.) is that it requires less overall denoising steps and therefore should be significantly -faster. The drawback is that one cannot really inspect the output of the base model is it will still be heavily denoised. +faster. The drawback is that one cannot really inspect the output of the base model; it will still be heavily denoised. To use the base model and refiner as an ensemble of expert denoisers, make sure to define the fraction of timesteps which should be run through the high-noise denoising stage (*i.e.* the base model) and the low-noise -denoising stage (*i.e.* the refiner model) respectively. This fraction should be set as the [`denoising_end`]() of the base model -and as the [`denoising_start`]() of the refiner model. +denoising stage (*i.e.* the refiner model) respectively. This fraction should be set as the [`~StableDiffusionXLPipeline.__call__.denoising_end`] of the base model +and as the [`~StableDiffusionXLImg2ImgPipeline.__call__.denoising_start`] of the refiner model. Let's look at an example. -First, we import the two pipelines. Since the text encoder and variational autoencoder are the same +First, we import the two pipelines. 
Since the text encoders and variational autoencoder are the same you don't have to load those again for the refiner. ```py @@ -140,7 +140,11 @@ If we would have just run the base model on the same 40 steps, the image would h ![lion_base](https://huggingface.co/datasets/huggingface/documentation-images/blob/main/diffusers/lion_base.png) -**Note**: The ensemble-of-experts method works well on all available schedulers! +<Tip> + +The ensemble-of-experts method works well on all available schedulers! + +</Tip> #### Refining the image output from fully denoised base image @@ -161,8 +165,8 @@ pipe.to("cuda") refiner = DiffusionPipeline.from_pretrained( "stabilityai/stable-diffusion-xl-refiner-0.9", - text_encoder_2=pipe_high_noise.text_encoder_2, - vae=pipe_high_noise.vae, + text_encoder_2=pipe.text_encoder_2, + vae=pipe.vae, torch_dtype=torch.float16, use_safetensors=True, variant="fp16", diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py index a8045cbec2c8..3980283cae21 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py @@ -586,7 +586,7 @@ def __call__( 0.7 and `num_inference_steps` is fixed at 50, the process will execute only 35 (i.e., 0.7 * 50) denoising steps. As a result, the returned sample will still retain a substantial amount of noise. The denoising_end parameter should ideally be utilized when this pipeline forms a part of a "Mixture of - Denoisers" multi-pipeline setup, as elaborated in []. 
+ Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output) guidance_scale (`float`, *optional*, defaults to 7.5): Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). `guidance_scale` is defined as `w` of equation 2. of [Imagen diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py index ea040f4225fc..0725104799b4 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py @@ -688,7 +688,7 @@ def __call__( denoising step. Consequently, the initial part of the denoising process is skipped and it is assumed that the passed `image` is a partly denoised image. The `denoising_start` parameter is particularly beneficial when this pipeline is integrated into a "Mixture of Denoisers" multi-pipeline setup, as - detailed in []. + detailed in [**Refining the Image Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output). denoising_end (`float`, *optional*): When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be completed before it is intentionally prematurely terminated. For instance, if denoising_end is set to @@ -696,7 +696,7 @@ def __call__( denoising steps. As a result, the returned sample will still retain a substantial amount of noise (ca. 30%) and should be denoised by a successor pipeline that has `denoising_start` set to 0.7 so that it only denoised the final 30%. The denoising_end parameter should ideally be utilized when this pipeline - forms a part of a "Mixture of Denoisers" multi-pipeline setup, as elaborated in []. 
+ forms a part of a "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output). guidance_scale (`float`, *optional*, defaults to 7.5): Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). `guidance_scale` is defined as `w` of equation 2. of [Imagen From 3a75d211fe27f4313349bb3fd550fcbcc93b9114 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 12 Jul 2023 11:20:22 +0000 Subject: [PATCH 16/16] make style --- .../stable_diffusion_xl/pipeline_stable_diffusion_xl.py | 3 ++- .../pipeline_stable_diffusion_xl_img2img.py | 6 ++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py index 3980283cae21..b3dcf1b67cda 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py @@ -586,7 +586,8 @@ def __call__( 0.7 and `num_inference_steps` is fixed at 50, the process will execute only 35 (i.e., 0.7 * 50) denoising steps. As a result, the returned sample will still retain a substantial amount of noise. 
The denoising_end parameter should ideally be utilized when this pipeline forms a part of a "Mixture of - Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output) + Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image + Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output). guidance_scale (`float`, *optional*, defaults to 7.5): Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). `guidance_scale` is defined as `w` of equation 2. of [Imagen diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py index 0725104799b4..7b0cdfad8c0a 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py @@ -688,7 +688,8 @@ def __call__( denoising step. Consequently, the initial part of the denoising process is skipped and it is assumed that the passed `image` is a partly denoised image. The `denoising_start` parameter is particularly beneficial when this pipeline is integrated into a "Mixture of Denoisers" multi-pipeline setup, as - detailed in [**Refining the Image Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output). + detailed in [**Refining the Image + Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output). denoising_end (`float`, *optional*): When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be completed before it is intentionally prematurely terminated. 
For instance, if denoising_end is set to @@ -696,7 +697,8 @@ def __call__( denoising steps. As a result, the returned sample will still retain a substantial amount of noise (ca. 30%) and should be denoised by a successor pipeline that has `denoising_start` set to 0.7 so that it only denoised the final 30%. The denoising_end parameter should ideally be utilized when this pipeline - forms a part of a "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output). + forms a part of a "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image + Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output). guidance_scale (`float`, *optional*, defaults to 7.5): Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). `guidance_scale` is defined as `w` of equation 2. of [Imagen