make autoencoders, controlnet_flux and wan_transformer3d_single_file pass on xpu #11461

Merged: 5 commits, Apr 30, 2025.

The changes make the stream handling in the group offloading hooks accelerator-agnostic (CUDA or Intel XPU) and update three test suites to use device-aware expected values and the generalized `require_big_accelerator` gate.
33 changes: 23 additions & 10 deletions src/diffusers/hooks/group_offloading.py
@@ -13,7 +13,7 @@
 # limitations under the License.

 from contextlib import contextmanager, nullcontext
-from typing import Dict, List, Optional, Set, Tuple
+from typing import Dict, List, Optional, Set, Tuple, Union

 import torch

@@ -55,7 +55,7 @@ def __init__(
         parameters: Optional[List[torch.nn.Parameter]] = None,
         buffers: Optional[List[torch.Tensor]] = None,
         non_blocking: bool = False,
-        stream: Optional[torch.cuda.Stream] = None,
+        stream: Union[torch.cuda.Stream, torch.Stream, None] = None,
         record_stream: Optional[bool] = False,
         low_cpu_mem_usage: bool = False,
         onload_self: bool = True,
@@ -115,8 +115,13 @@ def _pinned_memory_tensors(self):

     def onload_(self):
         r"""Onloads the group of modules to the onload_device."""
-        context = nullcontext() if self.stream is None else torch.cuda.stream(self.stream)
-        current_stream = torch.cuda.current_stream() if self.record_stream else None
+        torch_accelerator_module = (
+            getattr(torch, torch.accelerator.current_accelerator().type)
+            if hasattr(torch, "accelerator")
+            else torch.cuda
+        )
+        context = nullcontext() if self.stream is None else torch_accelerator_module.stream(self.stream)
+        current_stream = torch_accelerator_module.current_stream() if self.record_stream else None

         if self.stream is not None:
             # Wait for previous Host->Device transfer to complete
@@ -162,9 +167,15 @@ def onload_(self):

     def offload_(self):
         r"""Offloads the group of modules to the offload_device."""
+
+        torch_accelerator_module = (
+            getattr(torch, torch.accelerator.current_accelerator().type)
+            if hasattr(torch, "accelerator")
+            else torch.cuda
+        )
         if self.stream is not None:
             if not self.record_stream:
-                torch.cuda.current_stream().synchronize()
+                torch_accelerator_module.current_stream().synchronize()
             for group_module in self.modules:
                 for param in group_module.parameters():
                     param.data = self.cpu_param_dict[param]
@@ -429,8 +440,10 @@ def apply_group_offloading(
     if use_stream:
         if torch.cuda.is_available():
             stream = torch.cuda.Stream()
+        elif hasattr(torch, "xpu") and torch.xpu.is_available():
+            stream = torch.Stream()
         else:
-            raise ValueError("Using streams for data transfer requires a CUDA device.")
+            raise ValueError("Using streams for data transfer requires a CUDA device, or an Intel XPU device.")

     _raise_error_if_accelerate_model_or_sequential_hook_present(module)

@@ -468,7 +481,7 @@ def _apply_group_offloading_block_level(
     offload_device: torch.device,
     onload_device: torch.device,
     non_blocking: bool,
-    stream: Optional[torch.cuda.Stream] = None,
+    stream: Union[torch.cuda.Stream, torch.Stream, None] = None,
     record_stream: Optional[bool] = False,
     low_cpu_mem_usage: bool = False,
 ) -> None:
@@ -486,7 +499,7 @@ def _apply_group_offloading_block_level(
         non_blocking (`bool`):
             If True, offloading and onloading is done asynchronously. This can be useful for overlapping computation
             and data transfer.
-        stream (`torch.cuda.Stream`, *optional*):
+        stream (`torch.cuda.Stream` or `torch.Stream`, *optional*):
             If provided, offloading and onloading is done asynchronously using the provided stream. This can be useful
             for overlapping computation and data transfer.
         record_stream (`bool`, defaults to `False`): When enabled with `use_stream`, it marks the current tensor
@@ -572,7 +585,7 @@ def _apply_group_offloading_leaf_level(
     offload_device: torch.device,
     onload_device: torch.device,
     non_blocking: bool,
-    stream: Optional[torch.cuda.Stream] = None,
+    stream: Union[torch.cuda.Stream, torch.Stream, None] = None,
     record_stream: Optional[bool] = False,
     low_cpu_mem_usage: bool = False,
 ) -> None:
@@ -592,7 +605,7 @@ def _apply_group_offloading_leaf_level(
         non_blocking (`bool`):
             If True, offloading and onloading is done asynchronously. This can be useful for overlapping computation
             and data transfer.
-        stream (`torch.cuda.Stream`, *optional*):
+        stream (`torch.cuda.Stream` or `torch.Stream`, *optional*):
             If provided, offloading and onloading is done asynchronously using the provided stream. This can be useful
             for overlapping computation and data transfer.
         record_stream (`bool`, defaults to `False`): When enabled with `use_stream`, it marks the current tensor
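Note: the pattern above resolves the active accelerator's stream API once and then uses it wherever `torch.cuda` used to be hardcoded. A minimal standalone sketch of the same idea, assuming a PyTorch build new enough to ship `torch.accelerator` (older builds fall back to `torch.cuda`):

```python
import torch


def get_torch_accelerator_module():
    # torch.accelerator.current_accelerator() returns a torch.device such as
    # device(type="cuda") or device(type="xpu"); its .type names the matching
    # torch submodule (torch.cuda, torch.xpu, ...).
    if hasattr(torch, "accelerator"):
        return getattr(torch, torch.accelerator.current_accelerator().type)
    return torch.cuda  # older PyTorch: only CUDA streams were supported


# Mirrors the stream creation in apply_group_offloading above.
if torch.cuda.is_available():
    transfer_stream = torch.cuda.Stream()
elif hasattr(torch, "xpu") and torch.xpu.is_available():
    transfer_stream = torch.Stream()  # device-agnostic stream object
else:
    raise ValueError("Using streams for data transfer requires a CUDA or XPU device.")

accel = get_torch_accelerator_module()
with accel.stream(transfer_stream):
    pass  # issue async host->device copies here, e.g. tensor.to(device, non_blocking=True)
accel.current_stream().synchronize()  # wait before using the data on the default stream
```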
29 changes: 22 additions & 7 deletions tests/models/autoencoders/test_models_asymmetric_autoencoder_kl.py
@@ -22,6 +22,7 @@
 from diffusers import AsymmetricAutoencoderKL
 from diffusers.utils.import_utils import is_xformers_available
 from diffusers.utils.testing_utils import (
+    Expectations,
     backend_empty_cache,
     enable_full_determinism,
     floats_tensor,
@@ -134,18 +135,32 @@ def get_generator(self, seed=0):
         # fmt: off
         [
             33,
-            [-0.0336, 0.3011, 0.1764, 0.0087, -0.3401, 0.3645, -0.1247, 0.1205],
-            [-0.1603, 0.9878, -0.0495, -0.0790, -0.2709, 0.8375, -0.2060, -0.0824],
+            Expectations(
+                {
+                    ("xpu", 3): torch.tensor([-0.0343, 0.2873, 0.1680, -0.0140, -0.3459, 0.3522, -0.1336, 0.1075]),
+                    ("cuda", 7): torch.tensor([-0.0336, 0.3011, 0.1764, 0.0087, -0.3401, 0.3645, -0.1247, 0.1205]),
+                    ("mps", None): torch.tensor(
+                        [-0.1603, 0.9878, -0.0495, -0.0790, -0.2709, 0.8375, -0.2060, -0.0824]
+                    ),
+                }
+            ),
         ],
         [
             47,
-            [0.4400, 0.0543, 0.2873, 0.2946, 0.0553, 0.0839, -0.1585, 0.2529],
-            [-0.2376, 0.1168, 0.1332, -0.4840, -0.2508, -0.0791, -0.0493, -0.4089],
+            Expectations(
+                {
+                    ("xpu", 3): torch.tensor([0.4400, 0.0543, 0.2873, 0.2946, 0.0553, 0.0839, -0.1585, 0.2529]),
+                    ("cuda", 7): torch.tensor([0.4400, 0.0543, 0.2873, 0.2946, 0.0553, 0.0839, -0.1585, 0.2529]),
+                    ("mps", None): torch.tensor(
+                        [-0.2376, 0.1168, 0.1332, -0.4840, -0.2508, -0.0791, -0.0493, -0.4089]
+                    ),
+                }
+            ),
         ],
         # fmt: on
     ]
 )
-def test_stable_diffusion(self, seed, expected_slice, expected_slice_mps):
+def test_stable_diffusion(self, seed, expected_slices):
     model = self.get_sd_vae_model()
     image = self.get_sd_image(seed)
     generator = self.get_generator(seed)
@@ -156,9 +171,9 @@ def test_stable_diffusion(self, seed, expected_slice, expected_slice_mps):

         assert sample.shape == image.shape

         output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu()
-        expected_output_slice = torch.tensor(expected_slice_mps if torch_device == "mps" else expected_slice)
-
-        assert torch_all_close(output_slice, expected_output_slice, atol=5e-3)
+        expected_slice = expected_slices.get_expectation()
+        assert torch_all_close(output_slice, expected_slice, atol=5e-3)

     @parameterized.expand(
         [
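The `Expectations` helper keys expected tensors by (device type, major version), and `get_expectation()` picks the entry matching the current runtime, replacing the old two-argument mps/cuda scheme. A simplified, hypothetical re-implementation to illustrate the lookup (the real class in `diffusers.utils.testing_utils` detects the device itself and may resolve versions differently):

```python
import torch


class Expectations:
    """Toy stand-in: maps (device_type, major_version) -> expected tensor."""

    def __init__(self, mapping):
        self.mapping = mapping

    def get_expectation(self, device_type, major=None):
        # Exact (device, version) match first, then a version-agnostic entry.
        if (device_type, major) in self.mapping:
            return self.mapping[(device_type, major)]
        return self.mapping[(device_type, None)]


expected_slices = Expectations(
    {
        ("xpu", 3): torch.tensor([0.44, 0.05]),
        ("cuda", 7): torch.tensor([0.44, 0.05]),
        ("mps", None): torch.tensor([-0.24, 0.12]),
    }
)
print(expected_slices.get_expectation("cuda", 7))  # tensor([0.4400, 0.0500])
print(expected_slices.get_expectation("mps"))      # tensor([-0.2400, 0.1200])
```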
6 changes: 3 additions & 3 deletions tests/pipelines/controlnet_flux/test_controlnet_flux.py
@@ -35,7 +35,7 @@
     enable_full_determinism,
     nightly,
     numpy_cosine_similarity_distance,
-    require_big_gpu_with_torch_cuda,
+    require_big_accelerator,
     torch_device,
 )
 from diffusers.utils.torch_utils import randn_tensor
@@ -210,8 +210,8 @@ def test_flux_image_output_shape(self):


 @nightly
-@require_big_gpu_with_torch_cuda
-@pytest.mark.big_gpu_with_torch_cuda
+@require_big_accelerator
+@pytest.mark.big_accelerator
 class FluxControlNetPipelineSlowTests(unittest.TestCase):
     pipeline_class = FluxControlNetPipeline
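`require_big_accelerator` generalizes the old CUDA-only gate so the nightly suites can also run on large Intel XPUs. The real decorator lives in `diffusers.utils.testing_utils`; a hypothetical minimal version, assuming a 24 GB memory threshold (both the threshold and the detection details are assumptions, not the library's actual code):

```python
import unittest

import torch


def require_big_accelerator(test_case):
    # Hypothetical sketch: skip unless a CUDA or XPU device with enough
    # memory is present.
    if torch.cuda.is_available():
        total = torch.cuda.get_device_properties(0).total_memory
    elif hasattr(torch, "xpu") and torch.xpu.is_available():
        total = torch.xpu.get_device_properties(0).total_memory
    else:
        total = 0
    return unittest.skipUnless(
        total >= 24 * 1024**3, "test requires a big CUDA or XPU accelerator"
    )(test_case)
```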
4 changes: 2 additions & 2 deletions tests/single_file/test_model_wan_transformer3d_single_file.py
@@ -24,7 +24,7 @@
 from diffusers.utils.testing_utils import (
     backend_empty_cache,
     enable_full_determinism,
-    require_big_gpu_with_torch_cuda,
+    require_big_accelerator,
     require_torch_accelerator,
     torch_device,
 )
@@ -62,7 +62,7 @@ def test_single_file_components(self):
     )


-@require_big_gpu_with_torch_cuda
+@require_big_accelerator
 @require_torch_accelerator
 class WanTransformer3DModelImage2VideoSingleFileTest(unittest.TestCase):
     model_class = WanTransformer3DModel