From ba0f30ab02b7fe6f53ea69399cd9f11a58559ebc Mon Sep 17 00:00:00 2001
From: ryotaro
Date: Thu, 27 Feb 2025 18:29:48 +0800
Subject: [PATCH 1/2] Bug fix in ltx

---
 src/diffusers/pipelines/ltx/pipeline_ltx_image2video.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/diffusers/pipelines/ltx/pipeline_ltx_image2video.py b/src/diffusers/pipelines/ltx/pipeline_ltx_image2video.py
index 0577a56ec13d..7fe89d213e27 100644
--- a/src/diffusers/pipelines/ltx/pipeline_ltx_image2video.py
+++ b/src/diffusers/pipelines/ltx/pipeline_ltx_image2video.py
@@ -495,10 +495,13 @@ def prepare_latents(
         mask_shape = (batch_size, 1, num_frames, height, width)
 
         if latents is not None:
-            conditioning_mask = latents.new_zeros(shape)
+            conditioning_mask = latents.new_zeros(mask_shape)
             conditioning_mask[:, :, 0] = 1.0
             conditioning_mask = self._pack_latents(
                 conditioning_mask, self.transformer_spatial_patch_size, self.transformer_temporal_patch_size
+            ).squeeze(-1)
+            latents = self._pack_latents(
+                latents, self.transformer_spatial_patch_size, self.transformer_temporal_patch_size
             )
             return latents.to(device=device, dtype=dtype), conditioning_mask
 

From 36e0dcba2a4f93355de42f45ddb5721cd1623441 Mon Sep 17 00:00:00 2001
From: ryotaro
Date: Tue, 4 Mar 2025 20:44:53 +0800
Subject: [PATCH 2/2] Assume packed latents.

---
 .../pipelines/ltx/pipeline_ltx_image2video.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/src/diffusers/pipelines/ltx/pipeline_ltx_image2video.py b/src/diffusers/pipelines/ltx/pipeline_ltx_image2video.py
index 7fe89d213e27..7ffd68ee2509 100644
--- a/src/diffusers/pipelines/ltx/pipeline_ltx_image2video.py
+++ b/src/diffusers/pipelines/ltx/pipeline_ltx_image2video.py
@@ -487,9 +487,7 @@ def prepare_latents(
     ) -> torch.Tensor:
         height = height // self.vae_spatial_compression_ratio
         width = width // self.vae_spatial_compression_ratio
-        num_frames = (
-            (num_frames - 1) // self.vae_temporal_compression_ratio + 1 if latents is None else latents.size(2)
-        )
+        num_frames = (num_frames - 1) // self.vae_temporal_compression_ratio + 1
 
         shape = (batch_size, num_channels_latents, num_frames, height, width)
         mask_shape = (batch_size, 1, num_frames, height, width)
@@ -500,9 +498,10 @@ def prepare_latents(
             conditioning_mask = self._pack_latents(
                 conditioning_mask, self.transformer_spatial_patch_size, self.transformer_temporal_patch_size
             ).squeeze(-1)
-            latents = self._pack_latents(
-                latents, self.transformer_spatial_patch_size, self.transformer_temporal_patch_size
-            )
+            if latents.ndim != 3 or latents.shape[:2] != conditioning_mask.shape:
+                raise ValueError(
+                    f"Provided `latents` tensor has shape {latents.shape}, but the expected shape is {conditioning_mask.shape + (num_channels_latents,)}."
+                )
             return latents.to(device=device, dtype=dtype), conditioning_mask
 
         if isinstance(generator, list):
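
After PATCH 2/2, a user-supplied `latents` tensor is expected to already be in the packed (batch, num_tokens, channels) token layout; anything else hits the new ValueError above. A minimal sketch (not from the patch itself) of building such a tensor, assuming default LTX-Video settings: 128 latent channels, 32x spatial and 8x temporal VAE compression, and spatial/temporal transformer patch sizes of 1. The 512x768x161 dimensions and the `pipe` name are only illustrative.

    import torch

    # Illustrative values; the real ones come from the loaded pipeline's VAE
    # and transformer configs (assumed LTX-Video defaults).
    batch_size = 1
    num_channels_latents = 128                     # assumed latent channel count
    height, width, num_frames = 512, 768, 161      # illustrative video size

    latent_height = height // 32                   # assumed vae_spatial_compression_ratio
    latent_width = width // 32
    latent_num_frames = (num_frames - 1) // 8 + 1  # assumed vae_temporal_compression_ratio

    # With patch sizes of 1, packing flattens (B, C, F, H, W) into a
    # (B, F * H * W, C) token sequence, which is the 3-D layout the
    # new validation expects.
    num_tokens = latent_num_frames * latent_height * latent_width
    packed_latents = torch.randn(batch_size, num_tokens, num_channels_latents)

    # pipe(..., latents=packed_latents) would then pass the ndim == 3 check.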