@@ -515,14 +515,14 @@ def get_timesteps(self, num_inference_steps, first_inference_step, strength, dev
515
515
if first_inference_step is None :
516
516
init_timestep = min (int (num_inference_steps * strength ), num_inference_steps )
517
517
else :
518
- init_timestep = first_inference_step
518
+ init_timestep = first_inference_step - 1
519
519
520
520
t_start = max (num_inference_steps - init_timestep , 0 )
521
521
timesteps = self .scheduler .timesteps [t_start * self .scheduler .order :]
522
522
523
523
return timesteps , num_inference_steps - t_start
524
524
525
- def prepare_latents (self , image , timestep , batch_size , num_images_per_prompt , dtype , device , generator = None ):
525
+ def prepare_latents (self , image , timestep , batch_size , num_images_per_prompt , dtype , device , generator = None , add_noise = True ):
526
526
if not isinstance (image , (torch .Tensor , PIL .Image .Image , list )):
527
527
raise ValueError (
528
528
f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is { type (image )} "
@@ -574,12 +574,12 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt
574
574
)
575
575
else :
576
576
init_latents = torch .cat ([init_latents ], dim = 0 )
577
+ if add_noise :
578
+ shape = init_latents .shape
579
+ noise = randn_tensor (shape , generator = generator , device = device , dtype = dtype )
577
580
578
- shape = init_latents .shape
579
- noise = randn_tensor (shape , generator = generator , device = device , dtype = dtype )
580
-
581
- # get latents
582
- init_latents = self .scheduler .add_noise (init_latents , noise , timestep )
581
+ # get latents
582
+ init_latents = self .scheduler .add_noise (init_latents , noise , timestep )
583
583
latents = init_latents
584
584
585
585
return latents
@@ -671,7 +671,7 @@ def __call__(
671
671
instead.
672
672
image (`torch.FloatTensor` or `PIL.Image.Image` or `np.ndarray` or `List[torch.FloatTensor]` or `List[PIL.Image.Image]` or `List[np.ndarray]`):
673
673
The image(s) to modify with the pipeline.
674
- strength (`float`, *optional*, defaults to 0.8 ):
674
+ strength (`float`, *optional*, defaults to 0.3 ):
675
675
Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
676
676
will be used as a starting point, adding more noise to it the larger the `strength`. The number of
677
677
denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
@@ -810,10 +810,12 @@ def __call__(
810
810
self .scheduler .set_timesteps (num_inference_steps , device = device )
811
811
timesteps , num_inference_steps = self .get_timesteps (num_inference_steps , first_inference_step , strength , device )
812
812
latent_timestep = timesteps [:1 ].repeat (batch_size * num_images_per_prompt )
813
-
813
+ add_noise = True
814
+ if first_inference_step is not None :
815
+ add_noise = False
814
816
# 6. Prepare latent variables
815
817
latents = self .prepare_latents (
816
- image , latent_timestep , batch_size , num_images_per_prompt , prompt_embeds .dtype , device , generator
818
+ image , latent_timestep , batch_size , num_images_per_prompt , prompt_embeds .dtype , device , generator , add_noise
817
819
)
818
820
# 7. Prepare extra step kwargs.
819
821
extra_step_kwargs = self .prepare_extra_step_kwargs (generator , eta )
0 commit comments