From ba0f30ab02b7fe6f53ea69399cd9f11a58559ebc Mon Sep 17 00:00:00 2001
From: ryotaro
Date: Thu, 27 Feb 2025 18:29:48 +0800
Subject: [PATCH 1/2] Bug fix in ltx

---
 src/diffusers/pipelines/ltx/pipeline_ltx_image2video.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/diffusers/pipelines/ltx/pipeline_ltx_image2video.py b/src/diffusers/pipelines/ltx/pipeline_ltx_image2video.py
index 0577a56ec13d..7fe89d213e27 100644
--- a/src/diffusers/pipelines/ltx/pipeline_ltx_image2video.py
+++ b/src/diffusers/pipelines/ltx/pipeline_ltx_image2video.py
@@ -495,10 +495,13 @@ def prepare_latents(
         mask_shape = (batch_size, 1, num_frames, height, width)
 
         if latents is not None:
-            conditioning_mask = latents.new_zeros(shape)
+            conditioning_mask = latents.new_zeros(mask_shape)
             conditioning_mask[:, :, 0] = 1.0
             conditioning_mask = self._pack_latents(
                 conditioning_mask, self.transformer_spatial_patch_size, self.transformer_temporal_patch_size
+            ).squeeze(-1)
+            latents = self._pack_latents(
+                latents, self.transformer_spatial_patch_size, self.transformer_temporal_patch_size
             )
             return latents.to(device=device, dtype=dtype), conditioning_mask
 

From 36e0dcba2a4f93355de42f45ddb5721cd1623441 Mon Sep 17 00:00:00 2001
From: ryotaro
Date: Tue, 4 Mar 2025 20:44:53 +0800
Subject: [PATCH 2/2] Assume packed latents.

---
 .../pipelines/ltx/pipeline_ltx_image2video.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/src/diffusers/pipelines/ltx/pipeline_ltx_image2video.py b/src/diffusers/pipelines/ltx/pipeline_ltx_image2video.py
index 7fe89d213e27..7ffd68ee2509 100644
--- a/src/diffusers/pipelines/ltx/pipeline_ltx_image2video.py
+++ b/src/diffusers/pipelines/ltx/pipeline_ltx_image2video.py
@@ -487,9 +487,7 @@ def prepare_latents(
     ) -> torch.Tensor:
         height = height // self.vae_spatial_compression_ratio
         width = width // self.vae_spatial_compression_ratio
-        num_frames = (
-            (num_frames - 1) // self.vae_temporal_compression_ratio + 1 if latents is None else latents.size(2)
-        )
+        num_frames = (num_frames - 1) // self.vae_temporal_compression_ratio + 1
 
         shape = (batch_size, num_channels_latents, num_frames, height, width)
         mask_shape = (batch_size, 1, num_frames, height, width)
@@ -500,9 +498,10 @@ def prepare_latents(
             conditioning_mask = self._pack_latents(
                 conditioning_mask, self.transformer_spatial_patch_size, self.transformer_temporal_patch_size
             ).squeeze(-1)
-            latents = self._pack_latents(
-                latents, self.transformer_spatial_patch_size, self.transformer_temporal_patch_size
-            )
+            if latents.ndim != 3 or latents.shape[:2] != conditioning_mask.shape:
+                raise ValueError(
+                    f"Provided `latents` tensor has shape {latents.shape}, but the expected shape is {conditioning_mask.shape + (num_channels_latents,)}."
+                )
             return latents.to(device=device, dtype=dtype), conditioning_mask
 
         if isinstance(generator, list):
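
After PATCH 2/2, a user-supplied `latents` tensor is expected to already be in the packed (batch, num_tokens, channels) token layout; anything else hits the new ValueError above. A minimal sketch (not from the patch itself) of building such a tensor, assuming default LTX-Video settings: 128 latent channels, 32x spatial and 8x temporal VAE compression, and spatial/temporal transformer patch sizes of 1. The 512x768x161 dimensions and the `pipe` name are only illustrative.

    import torch

    # Illustrative values; the real ones come from the loaded pipeline's VAE
    # and transformer configs (assumed LTX-Video defaults).
    batch_size = 1
    num_channels_latents = 128                     # assumed latent channel count
    height, width, num_frames = 512, 768, 161      # illustrative video size

    latent_height = height // 32                   # assumed vae_spatial_compression_ratio
    latent_width = width // 32
    latent_num_frames = (num_frames - 1) // 8 + 1  # assumed vae_temporal_compression_ratio

    # With patch sizes of 1, packing flattens (B, C, F, H, W) into a
    # (B, F * H * W, C) token sequence, which is the 3-D layout the
    # new validation expects.
    num_tokens = latent_num_frames * latent_height * latent_width
    packed_latents = torch.randn(batch_size, num_tokens, num_channels_latents)

    # pipe(..., latents=packed_latents) would then pass the ndim == 3 check.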