Commit ddab9b4 ("ltx")
1 parent 0d3f911 commit ddab9b4

1 file changed: docs/source/en/api/pipelines/ltx_video.md (+203 −2)
@@ -17,6 +17,7 @@
   <a href="https://huggingface.co/docs/diffusers/main/en/tutorials/using_peft_for_inference" target="_blank" rel="noopener">
     <img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
   </a>
+  <img alt="MPS" src="https://img.shields.io/badge/MPS-000000?style=flat&logo=apple&logoColor=white"/>
  </div>
 </div>

@@ -64,7 +65,10 @@ apply_group_offloading(pipeline.text_encoder, onload_device=onload_device, offlo
 apply_group_offloading(pipeline.vae, onload_device=onload_device, offload_type="leaf_level")

 prompt = """
-A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage
+A woman with long brown hair and light skin smiles at another woman with long blonde hair.
+The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek.
+The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and
+natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage
 """
 negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"

@@ -102,7 +106,10 @@ pipeline.transformer = torch.compile(
 )

 prompt = """
-A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage
+A woman with long brown hair and light skin smiles at another woman with long blonde hair.
+The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek.
+The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and
+natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage
 """
 negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"

@@ -124,6 +131,200 @@ export_to_video(video, "output.mp4", fps=24)

## Notes

- Refer to the following recommended settings for generation from the [LTX-Video](https://github.com/Lightricks/LTX-Video) repository.

  - The recommended dtype for the transformer, VAE, and text encoder is `torch.bfloat16`. The VAE and text encoder can also be `torch.float32` or `torch.float16`.
  - For guidance-distilled variants of LTX-Video, set `guidance_scale` to `1.0`. For any other model, set `guidance_scale` higher, such as `5.0`, for good generation quality.
  - For timestep-aware VAE variants (LTX-Video 0.9.1 and above), set `decode_timestep` to `0.05` and `image_cond_noise_scale` to `0.025`.
  - For variants that support interpolation between multiple conditioning images and videos (LTX-Video 0.9.5 and above), use similar images and videos for the best results. Divergence from the conditioning inputs may lead to abrupt transitions in the generated video.
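The recommended values above can be kept in one place and reused across pipeline calls. A minimal sketch; the function name and dict layout are illustrative, not a diffusers API:

```python
# Recommended LTX-Video settings from the notes above, keyed by whether the
# checkpoint is guidance-distilled. Names here are illustrative only.
def recommended_settings(guidance_distilled: bool = False) -> dict:
    return {
        "torch_dtype": "bfloat16",        # transformer, VAE, and text encoder
        "guidance_scale": 1.0 if guidance_distilled else 5.0,
        "decode_timestep": 0.05,          # timestep-aware VAE, 0.9.1 and above
        "image_cond_noise_scale": 0.025,  # timestep-aware VAE, 0.9.1 and above
    }

settings = recommended_settings(guidance_distilled=False)
print(settings["guidance_scale"])  # 5.0
```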
- LTX-Video 0.9.7 includes a spatial latent upscaler and a 13B parameter transformer. During inference, a low-resolution video is quickly generated first and then upscaled and refined.

  <details>
  <summary>Show example code</summary>

  ```py
  import torch
  from diffusers import LTXConditionPipeline, LTXLatentUpsamplePipeline
  from diffusers.pipelines.ltx.pipeline_ltx_condition import LTXVideoCondition
  from diffusers.utils import export_to_video, load_video

  pipeline = LTXConditionPipeline.from_pretrained("Lightricks/LTX-Video-0.9.7-dev", torch_dtype=torch.bfloat16)
  pipe_upsample = LTXLatentUpsamplePipeline.from_pretrained("Lightricks/ltxv-spatial-upscaler-0.9.7", vae=pipeline.vae, torch_dtype=torch.bfloat16)
  pipeline.to("cuda")
  pipe_upsample.to("cuda")
  pipeline.vae.enable_tiling()

  def round_to_nearest_resolution_acceptable_by_vae(height, width):
      height = height - (height % pipeline.vae_spatial_compression_ratio)
      width = width - (width % pipeline.vae_spatial_compression_ratio)
      return height, width

  video = load_video(
      "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cosmos/cosmos-video2world-input-vid.mp4"
  )[:21]  # only use the first 21 frames as conditioning
  condition1 = LTXVideoCondition(video=video, frame_index=0)

  prompt = """
  The video depicts a winding mountain road covered in snow, with a single vehicle
  traveling along it. The road is flanked by steep, rocky cliffs and sparse vegetation.
  The landscape is characterized by rugged terrain and a river visible in the distance.
  The scene captures the solitude and beauty of a winter drive through a mountainous region.
  """
  negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"
  expected_height, expected_width = 768, 1152
  downscale_factor = 2 / 3
  num_frames = 161

  # 1. Generate video at smaller resolution
  # Text-only conditioning is also supported without the need to pass `conditions`
  downscaled_height, downscaled_width = int(expected_height * downscale_factor), int(expected_width * downscale_factor)
  downscaled_height, downscaled_width = round_to_nearest_resolution_acceptable_by_vae(downscaled_height, downscaled_width)
  latents = pipeline(
      conditions=[condition1],
      prompt=prompt,
      negative_prompt=negative_prompt,
      width=downscaled_width,
      height=downscaled_height,
      num_frames=num_frames,
      num_inference_steps=30,
      decode_timestep=0.05,
      decode_noise_scale=0.025,
      image_cond_noise_scale=0.0,
      guidance_scale=5.0,
      guidance_rescale=0.7,
      generator=torch.Generator().manual_seed(0),
      output_type="latent",
  ).frames

  # 2. Upscale generated video using latent upsampler with fewer inference steps
  # The available latent upsampler upscales the height/width by 2x
  upscaled_height, upscaled_width = downscaled_height * 2, downscaled_width * 2
  upscaled_latents = pipe_upsample(
      latents=latents,
      output_type="latent"
  ).frames

  # 3. Denoise the upscaled video with few steps to improve texture (optional, but recommended)
  video = pipeline(
      conditions=[condition1],
      prompt=prompt,
      negative_prompt=negative_prompt,
      width=upscaled_width,
      height=upscaled_height,
      num_frames=num_frames,
      denoise_strength=0.4,  # Effectively, 4 inference steps out of 10
      num_inference_steps=10,
      latents=upscaled_latents,
      decode_timestep=0.05,
      decode_noise_scale=0.025,
      image_cond_noise_scale=0.0,
      guidance_scale=5.0,
      guidance_rescale=0.7,
      generator=torch.Generator().manual_seed(0),
      output_type="pil",
  ).frames[0]

  # 4. Downscale the video to the expected resolution
  video = [frame.resize((expected_width, expected_height)) for frame in video]

  export_to_video(video, "output.mp4", fps=24)
  ```

  </details>
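The resolution arithmetic in the example above can be checked on its own. A sketch that hardcodes a spatial compression ratio of 32 (an assumption; the example reads the real value from the pipeline):

```python
# Mirror of round_to_nearest_resolution_acceptable_by_vae from the example,
# with the VAE spatial compression ratio hardcoded to 32 (assumed value).
RATIO = 32

def round_resolution(height, width, ratio=RATIO):
    # Round each dimension down to the nearest multiple of the ratio.
    return height - (height % ratio), width - (width % ratio)

# The example targets 768x1152 and first generates at 2/3 scale.
down_h, down_w = round_resolution(int(768 * 2 / 3), int(1152 * 2 / 3))
print(down_h, down_w)  # 512 768, both already multiples of 32
```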
- The LTX-Video 0.9.7 distilled model is guidance- and timestep-distilled to speed up generation. It requires `guidance_scale` to be set to `1.0`, and `num_inference_steps` should be between `4` and `10` for good generation quality. You should also use the following custom timesteps for the best results.

  - Base model inference to prepare for upscaling: `[1000, 993, 987, 981, 975, 909, 725, 0.03]`.
  - Upscaling: `[1000, 909, 725, 421, 0]`.

  <details>
  <summary>Show example code</summary>

  ```py
  import torch
  from diffusers import LTXConditionPipeline, LTXLatentUpsamplePipeline
  from diffusers.pipelines.ltx.pipeline_ltx_condition import LTXVideoCondition
  from diffusers.utils import export_to_video, load_video

  pipeline = LTXConditionPipeline.from_pretrained("Lightricks/LTX-Video-0.9.7-distilled", torch_dtype=torch.bfloat16)
  pipe_upsample = LTXLatentUpsamplePipeline.from_pretrained("Lightricks/ltxv-spatial-upscaler-0.9.7", vae=pipeline.vae, torch_dtype=torch.bfloat16)
  pipeline.to("cuda")
  pipe_upsample.to("cuda")
  pipeline.vae.enable_tiling()

  def round_to_nearest_resolution_acceptable_by_vae(height, width):
      height = height - (height % pipeline.vae_spatial_compression_ratio)
      width = width - (width % pipeline.vae_spatial_compression_ratio)
      return height, width

  prompt = """
  artistic anatomical 3d render, ultra quality, human half full male body with transparent
  skin revealing structure instead of organs, muscular, intricate creative patterns,
  monochromatic with backlighting, lightning mesh, scientific concept art, blending biology
  with botany, surreal and ethereal quality, unreal engine 5, ray tracing, ultra realistic,
  16K UHD, rich details. camera zooms out in a rotating fashion
  """
  negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"
  expected_height, expected_width = 768, 1152
  downscale_factor = 2 / 3
  num_frames = 161

  # 1. Generate video at smaller resolution
  downscaled_height, downscaled_width = int(expected_height * downscale_factor), int(expected_width * downscale_factor)
  downscaled_height, downscaled_width = round_to_nearest_resolution_acceptable_by_vae(downscaled_height, downscaled_width)
  latents = pipeline(
      prompt=prompt,
      negative_prompt=negative_prompt,
      width=downscaled_width,
      height=downscaled_height,
      num_frames=num_frames,
      timesteps=[1000, 993, 987, 981, 975, 909, 725, 0.03],
      decode_timestep=0.05,
      decode_noise_scale=0.025,
      image_cond_noise_scale=0.0,
      guidance_scale=1.0,
      guidance_rescale=0.7,
      generator=torch.Generator().manual_seed(0),
      output_type="latent",
  ).frames

  # 2. Upscale generated video using latent upsampler with fewer inference steps
  # The available latent upsampler upscales the height/width by 2x
  upscaled_height, upscaled_width = downscaled_height * 2, downscaled_width * 2
  upscaled_latents = pipe_upsample(
      latents=latents,
      adain_factor=1.0,
      output_type="latent"
  ).frames

  # 3. Denoise the upscaled video with few steps to improve texture (optional, but recommended)
  video = pipeline(
      prompt=prompt,
      negative_prompt=negative_prompt,
      width=upscaled_width,
      height=upscaled_height,
      num_frames=num_frames,
      denoise_strength=0.999,  # Effectively, 4 inference steps out of 5
      timesteps=[1000, 909, 725, 421, 0],
      latents=upscaled_latents,
      decode_timestep=0.05,
      decode_noise_scale=0.025,
      image_cond_noise_scale=0.0,
      guidance_scale=1.0,
      guidance_rescale=0.7,
      generator=torch.Generator().manual_seed(0),
      output_type="pil",
  ).frames[0]

  # 4. Downscale the video to the expected resolution
  video = [frame.resize((expected_width, expected_height)) for frame in video]

  export_to_video(video, "output.mp4", fps=24)
  ```

  </details>
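The `denoise_strength` comments in the two examples above ("4 inference steps out of 10" for the full model, "4 inference steps out of 5" for the distilled one) follow the same arithmetic. A sketch of that relationship; the floor is an assumption about the rounding, not diffusers code:

```python
import math

# Approximate number of denoising steps actually run when refining existing
# latents with a partial-strength pass (assumed rounding, for intuition only).
def effective_steps(denoise_strength, num_inference_steps):
    return math.floor(denoise_strength * num_inference_steps)

print(effective_steps(0.4, 10))   # 4 of 10, as in the full model example
print(effective_steps(0.999, 5))  # 4 of 5, as in the distilled example
```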
- LTX-Video supports LoRAs with [`~loaders.LTXVideoLoraLoaderMixin.load_lora_weights`].

  <details>
