From 8c2b2cdc52909c59d9b5f9417b9dd527396594d8 Mon Sep 17 00:00:00 2001 From: DN6 Date: Tue, 1 Apr 2025 22:20:28 +0530 Subject: [PATCH 1/3] Add CacheMixin to WanTransformer3DModel --- src/diffusers/models/transformers/transformer_wan.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/diffusers/models/transformers/transformer_wan.py b/src/diffusers/models/transformers/transformer_wan.py index 4eb4add37601..aa03e97093aa 100644 --- a/src/diffusers/models/transformers/transformer_wan.py +++ b/src/diffusers/models/transformers/transformer_wan.py @@ -24,6 +24,7 @@ from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers from ..attention import FeedForward from ..attention_processor import Attention +from ..cache_utils import CacheMixin from ..embeddings import PixArtAlphaTextProjection, TimestepEmbedding, Timesteps, get_1d_rotary_pos_embed from ..modeling_outputs import Transformer2DModelOutput from ..modeling_utils import ModelMixin @@ -288,7 +289,7 @@ def forward( return hidden_states -class WanTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin): +class WanTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin, CacheMixin): r""" A Transformer model for video-like data used in the Wan model. 
From 5ac65c4513801954f633fd1361bfe56b98a333d6 Mon Sep 17 00:00:00 2001 From: DN6 Date: Tue, 1 Apr 2025 23:00:21 +0530 Subject: [PATCH 2/3] Add CacheMixin to LTXVideoTransformer3DModel --- src/diffusers/models/transformers/transformer_ltx.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/diffusers/models/transformers/transformer_ltx.py b/src/diffusers/models/transformers/transformer_ltx.py index c1f2df587927..2ae2418098f6 100644 --- a/src/diffusers/models/transformers/transformer_ltx.py +++ b/src/diffusers/models/transformers/transformer_ltx.py @@ -26,6 +26,7 @@ from ...utils.torch_utils import maybe_allow_in_graph from ..attention import FeedForward from ..attention_processor import Attention +from ..cache_utils import CacheMixin from ..embeddings import PixArtAlphaTextProjection from ..modeling_outputs import Transformer2DModelOutput from ..modeling_utils import ModelMixin @@ -298,7 +299,7 @@ def forward( @maybe_allow_in_graph -class LTXVideoTransformer3DModel(ModelMixin, ConfigMixin, FromOriginalModelMixin, PeftAdapterMixin): +class LTXVideoTransformer3DModel(ModelMixin, ConfigMixin, FromOriginalModelMixin, PeftAdapterMixin, CacheMixin): r""" A Transformer model for video-like data used in [LTX](https://huggingface.co/Lightricks/LTX-Video). 
From ca5cfbd37e0f37987e9685926f7a2b0c2f93ac48 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Wed, 2 Apr 2025 20:33:17 +0200 Subject: [PATCH 3/3] Expose current_timestep in the LTX pipelines --- src/diffusers/pipelines/ltx/pipeline_ltx.py | 7 +++++++ src/diffusers/pipelines/ltx/pipeline_ltx_condition.py | 7 +++++++ src/diffusers/pipelines/ltx/pipeline_ltx_image2video.py | 7 +++++++ 3 files changed, 21 insertions(+) diff --git a/src/diffusers/pipelines/ltx/pipeline_ltx.py b/src/diffusers/pipelines/ltx/pipeline_ltx.py index f7b0811d1a22..6f3faed8ff72 100644 --- a/src/diffusers/pipelines/ltx/pipeline_ltx.py +++ b/src/diffusers/pipelines/ltx/pipeline_ltx.py @@ -489,6 +489,10 @@ def do_classifier_free_guidance(self): def num_timesteps(self): return self._num_timesteps + @property + def current_timestep(self): + return self._current_timestep + @property def attention_kwargs(self): return self._attention_kwargs @@ -622,6 +626,7 @@ def __call__( self._guidance_scale = guidance_scale self._attention_kwargs = attention_kwargs self._interrupt = False + self._current_timestep = None # 2. 
Define call parameters if prompt is not None and isinstance(prompt, str): @@ -706,6 +711,8 @@ def __call__( if self.interrupt: continue + self._current_timestep = t + latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents latent_model_input = latent_model_input.to(prompt_embeds.dtype) diff --git a/src/diffusers/pipelines/ltx/pipeline_ltx_condition.py b/src/diffusers/pipelines/ltx/pipeline_ltx_condition.py index e7f3666cb2c7..ef1fd568397f 100644 --- a/src/diffusers/pipelines/ltx/pipeline_ltx_condition.py +++ b/src/diffusers/pipelines/ltx/pipeline_ltx_condition.py @@ -774,6 +774,10 @@ def do_classifier_free_guidance(self): def num_timesteps(self): return self._num_timesteps + @property + def current_timestep(self): + return self._current_timestep + @property def attention_kwargs(self): return self._attention_kwargs @@ -933,6 +937,7 @@ def __call__( self._guidance_scale = guidance_scale self._attention_kwargs = attention_kwargs self._interrupt = False + self._current_timestep = None # 2. 
Define call parameters if prompt is not None and isinstance(prompt, str): @@ -1066,6 +1071,8 @@ def __call__( if self.interrupt: continue + self._current_timestep = t + if image_cond_noise_scale > 0: # Add timestep-dependent noise to the hard-conditioning latents # This helps with motion continuity, especially when conditioned on a single frame diff --git a/src/diffusers/pipelines/ltx/pipeline_ltx_image2video.py b/src/diffusers/pipelines/ltx/pipeline_ltx_image2video.py index 0f640dc33546..1ae67967c6f5 100644 --- a/src/diffusers/pipelines/ltx/pipeline_ltx_image2video.py +++ b/src/diffusers/pipelines/ltx/pipeline_ltx_image2video.py @@ -550,6 +550,10 @@ def do_classifier_free_guidance(self): def num_timesteps(self): return self._num_timesteps + @property + def current_timestep(self): + return self._current_timestep + @property def attention_kwargs(self): return self._attention_kwargs @@ -686,6 +690,7 @@ def __call__( self._guidance_scale = guidance_scale self._attention_kwargs = attention_kwargs self._interrupt = False + self._current_timestep = None # 2. Define call parameters if prompt is not None and isinstance(prompt, str): @@ -778,6 +783,8 @@ def __call__( if self.interrupt: continue + self._current_timestep = t + latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents latent_model_input = latent_model_input.to(prompt_embeds.dtype)