diff --git a/docs/source/en/optimization/onnx.mdx b/docs/source/en/optimization/onnx.mdx index e79efbde0742..667f28a5dea1 100644 --- a/docs/source/en/optimization/onnx.mdx +++ b/docs/source/en/optimization/onnx.mdx @@ -21,13 +21,13 @@ specific language governing permissions and limitations under the License. ## Stable Diffusion Inference -The snippet below demonstrates how to use the ONNX runtime. You need to use `StableDiffusionOnnxPipeline` instead of `StableDiffusionPipeline`. You also need to download the weights from the `onnx` branch of the repository, and indicate the runtime provider you want to use. +The snippet below demonstrates how to use the ONNX runtime. You need to use `OnnxStableDiffusionPipeline` instead of `StableDiffusionPipeline`. You also need to download the weights from the `onnx` branch of the repository, and indicate the runtime provider you want to use. ```python # make sure you're logged in with `huggingface-cli login` -from diffusers import StableDiffusionOnnxPipeline +from diffusers import OnnxStableDiffusionPipeline -pipe = StableDiffusionOnnxPipeline.from_pretrained( +pipe = OnnxStableDiffusionPipeline.from_pretrained( "runwayml/stable-diffusion-v1-5", revision="onnx", provider="CUDAExecutionProvider", @@ -37,6 +37,37 @@ prompt = "a photo of an astronaut riding a horse on mars" image = pipe(prompt).images[0] ``` +The snippet below demonstrates how to use the ONNX runtime with the Stable Diffusion upscaling pipeline. + +```python +from diffusers import OnnxStableDiffusionPipeline, OnnxStableDiffusionUpscalePipeline + +prompt = "a photo of an astronaut riding a horse on mars" +steps = 50 + +txt2img = OnnxStableDiffusionPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", + revision="onnx", + provider="CUDAExecutionProvider", +) +small_image = txt2img( + prompt, + num_inference_steps=steps, +).images[0] + +generator = torch.manual_seed(0) +upscale = OnnxStableDiffusionUpscalePipeline.from_pretrained( + "ssube/stable-diffusion-x4-upscaler-onnx", + provider="CUDAExecutionProvider", +) +large_image = upscale( + prompt, + small_image, + generator=generator, + num_inference_steps=steps, +).images[0] +``` + ## Known Issues - Generating multiple prompts in a batch seems to take too much memory. While we look into it, you may need to iterate instead of batching. diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 3767a0b410f6..d0598ebaf731 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -156,6 +156,7 @@ OnnxStableDiffusionInpaintPipeline, OnnxStableDiffusionInpaintPipelineLegacy, OnnxStableDiffusionPipeline, + OnnxStableDiffusionUpscalePipeline, StableDiffusionOnnxPipeline, ) diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index 2888eb6ced0d..ed7ab73673bc 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -92,6 +92,7 @@ OnnxStableDiffusionInpaintPipeline, OnnxStableDiffusionInpaintPipelineLegacy, OnnxStableDiffusionPipeline, + OnnxStableDiffusionUpscalePipeline, StableDiffusionOnnxPipeline, ) diff --git a/src/diffusers/pipelines/stable_diffusion/__init__.py b/src/diffusers/pipelines/stable_diffusion/__init__.py index 0059ba91954b..b95020588292 100644 --- a/src/diffusers/pipelines/stable_diffusion/__init__.py +++ b/src/diffusers/pipelines/stable_diffusion/__init__.py @@ -103,6 +103,7 @@ class StableDiffusionPipelineOutput(BaseOutput): from .pipeline_onnx_stable_diffusion_img2img import OnnxStableDiffusionImg2ImgPipeline from .pipeline_onnx_stable_diffusion_inpaint import OnnxStableDiffusionInpaintPipeline from .pipeline_onnx_stable_diffusion_inpaint_legacy import OnnxStableDiffusionInpaintPipelineLegacy + from .pipeline_onnx_stable_diffusion_upscale import OnnxStableDiffusionUpscalePipeline if is_transformers_available() and is_flax_available(): import flax diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py new file mode 100644 index 000000000000..45b5a50467b0 --- /dev/null +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py @@ -0,0 +1,290 @@ +from logging import getLogger +from typing import Any, Callable, List, Optional, Union + +import numpy as np +import PIL +import torch + +from ...schedulers import DDPMScheduler +from ..onnx_utils import ORT_TO_NP_TYPE, OnnxRuntimeModel +from ..pipeline_utils import ImagePipelineOutput +from . import StableDiffusionUpscalePipeline + + +logger = getLogger(__name__) + + +NUM_LATENT_CHANNELS = 4 +NUM_UNET_INPUT_CHANNELS = 7 + +ORT_TO_PT_TYPE = { + "float16": torch.float16, + "float32": torch.float32, +} + + +def preprocess(image): + if isinstance(image, torch.Tensor): + return image + elif isinstance(image, PIL.Image.Image): + image = [image] + + if isinstance(image[0], PIL.Image.Image): + w, h = image[0].size + w, h = map(lambda x: x - x % 64, (w, h)) # resize to integer multiple of 32 + + image = [np.array(i.resize((w, h)))[None, :] for i in image] + image = np.concatenate(image, axis=0) + image = np.array(image).astype(np.float32) / 255.0 + image = image.transpose(0, 3, 1, 2) + image = 2.0 * image - 1.0 + image = torch.from_numpy(image) + elif isinstance(image[0], torch.Tensor): + image = torch.cat(image, dim=0) + + return image + + +class OnnxStableDiffusionUpscalePipeline(StableDiffusionUpscalePipeline): + def __init__( + self, + vae: OnnxRuntimeModel, + text_encoder: OnnxRuntimeModel, + tokenizer: Any, + unet: OnnxRuntimeModel, + low_res_scheduler: DDPMScheduler, + scheduler: Any, + max_noise_level: int = 350, + ): + super().__init__(vae, text_encoder, tokenizer, unet, low_res_scheduler, scheduler, max_noise_level) + + def __call__( + self, + prompt: Union[str, List[str]], + image: Union[torch.FloatTensor, PIL.Image.Image, List[PIL.Image.Image]], + num_inference_steps: int = 75, + guidance_scale: float = 9.0, + noise_level: int = 20, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: Optional[int] = 1, + ): + # 1. Check inputs + self.check_inputs(prompt, image, noise_level, callback_steps) + + # 2. Define call parameters + batch_size = 1 if isinstance(prompt, str) else len(prompt) + device = self._execution_device + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Encode input prompt + text_embeddings = self._encode_prompt( + prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt + ) + + latents_dtype = ORT_TO_PT_TYPE[str(text_embeddings.dtype)] + + # 4. Preprocess image + image = preprocess(image) + image = image.cpu() + + # 5. set timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + # 5. Add noise to image + noise_level = torch.tensor([noise_level], dtype=torch.long, device=device) + noise = torch.randn(image.shape, generator=generator, device=device, dtype=latents_dtype) + image = self.low_res_scheduler.add_noise(image, noise, noise_level) + + batch_multiplier = 2 if do_classifier_free_guidance else 1 + image = np.concatenate([image] * batch_multiplier * num_images_per_prompt) + noise_level = np.concatenate([noise_level] * image.shape[0]) + + # 6. Prepare latent variables + height, width = image.shape[2:] + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + NUM_LATENT_CHANNELS, + height, + width, + latents_dtype, + device, + generator, + latents, + ) + + # 7. Check that sizes of image and latents match + num_channels_image = image.shape[1] + if NUM_LATENT_CHANNELS + num_channels_image != NUM_UNET_INPUT_CHANNELS: + raise ValueError( + "Incorrect configuration settings! The config of `pipeline.unet` expects" + f" {NUM_UNET_INPUT_CHANNELS} but received `num_channels_latents`: {NUM_LATENT_CHANNELS} +" + f" `num_channels_image`: {num_channels_image} " + f" = {NUM_LATENT_CHANNELS+num_channels_image}. Please verify the config of" + " `pipeline.unet` or your `image` input." + ) + + # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + timestep_dtype = next( + (input.type for input in self.unet.model.get_inputs() if input.name == "timestep"), "tensor(float)" + ) + timestep_dtype = ORT_TO_NP_TYPE[timestep_dtype] + + # 9. Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents + + # concat latents, mask, masked_image_latents in the channel dimension + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + latent_model_input = np.concatenate([latent_model_input, image], axis=1) + + # timestep to tensor + timestep = np.array([t], dtype=timestep_dtype) + + # predict the noise residual + noise_pred = self.unet( + sample=latent_model_input, + timestep=timestep, + encoder_hidden_states=text_embeddings, + class_labels=noise_level.astype(np.int64), + )[0] + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step( + torch.from_numpy(noise_pred), t, latents, **extra_step_kwargs + ).prev_sample + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + callback(i, t, latents) + + # 10. Post-processing + image = self.decode_latents(latents.float()) + + # 11. Convert to PIL + if output_type == "pil": + image = self.numpy_to_pil(image) + + if not return_dict: + return (image,) + + return ImagePipelineOutput(images=image) + + def decode_latents(self, latents): + latents = 1 / 0.08333 * latents + image = self.vae(latent_sample=latents)[0] + image = np.clip(image / 2 + 0.5, 0, 1) + image = image.transpose((0, 2, 3, 1)) + return image + + def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt): + batch_size = len(prompt) if isinstance(prompt, list) else 1 + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + # if hasattr(text_inputs, "attention_mask"): + # attention_mask = text_inputs.attention_mask.to(device) + # else: + # attention_mask = None + + # no positional arguments to text_encoder + text_embeddings = self.text_encoder( + input_ids=text_input_ids.int().to(device), + # attention_mask=attention_mask, + ) + text_embeddings = text_embeddings[0] + + bs_embed, seq_len, _ = text_embeddings.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + text_embeddings = text_embeddings.repeat(1, num_images_per_prompt) + text_embeddings = text_embeddings.reshape(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + max_length = text_input_ids.shape[-1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + # if hasattr(uncond_input, "attention_mask"): + # attention_mask = uncond_input.attention_mask.to(device) + # else: + # attention_mask = None + + uncond_embeddings = self.text_encoder( + input_ids=uncond_input.input_ids.int().to(device), + # attention_mask=attention_mask, + ) + uncond_embeddings = uncond_embeddings[0] + + seq_len = uncond_embeddings.shape[1] + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt) + uncond_embeddings = uncond_embeddings.reshape(batch_size * num_images_per_prompt, seq_len, -1) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + text_embeddings = np.concatenate([uncond_embeddings, text_embeddings]) + + return text_embeddings diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py index b6c0b591ece3..1bfb186666c2 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py @@ -88,21 +88,22 @@ def __init__( ): super().__init__() - # check if vae has a config attribute `scaling_factor` and if it is set to 0.08333, else set it to 0.08333 and deprecate - is_vae_scaling_factor_set_to_0_08333 = ( - hasattr(vae.config, "scaling_factor") and vae.config.scaling_factor == 0.08333 - ) - if not is_vae_scaling_factor_set_to_0_08333: - deprecation_message = ( - "The configuration file of the vae does not contain `scaling_factor` or it is set to" - f" {vae.config.scaling_factor}, which seems highly unlikely. If your checkpoint is a fine-tuned" - " version of `stabilityai/stable-diffusion-x4-upscaler` you should change 'scaling_factor' to 0.08333" - " Please make sure to update the config accordingly, as not doing so might lead to incorrect results" - " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be" - " very nice if you could open a Pull Request for the `vae/config.json` file" + if hasattr(vae, "config"): + # check if vae has a config attribute `scaling_factor` and if it is set to 0.08333, else set it to 0.08333 and deprecate + is_vae_scaling_factor_set_to_0_08333 = ( + hasattr(vae.config, "scaling_factor") and vae.config.scaling_factor == 0.08333 ) - deprecate("wrong scaling_factor", "1.0.0", deprecation_message, standard_warn=False) - vae.register_to_config(scaling_factor=0.08333) + if not is_vae_scaling_factor_set_to_0_08333: + deprecation_message = ( + "The configuration file of the vae does not contain `scaling_factor` or it is set to" + f" {vae.config.scaling_factor}, which seems highly unlikely. If your checkpoint is a fine-tuned" + " version of `stabilityai/stable-diffusion-x4-upscaler` you should change 'scaling_factor' to" + " 0.08333 Please make sure to update the config accordingly, as not doing so might lead to" + " incorrect results in future versions. If you have downloaded this checkpoint from the Hugging" + " Face Hub, it would be very nice if you could open a Pull Request for the `vae/config.json` file" + ) + deprecate("wrong scaling_factor", "1.0.0", deprecation_message, standard_warn=False) + vae.register_to_config(scaling_factor=0.08333) self.register_modules( vae=vae, diff --git a/src/diffusers/utils/dummy_torch_and_transformers_and_onnx_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_and_onnx_objects.py index 204500a1f195..b7afad8226b8 100644 --- a/src/diffusers/utils/dummy_torch_and_transformers_and_onnx_objects.py +++ b/src/diffusers/utils/dummy_torch_and_transformers_and_onnx_objects.py @@ -62,6 +62,21 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "transformers", "onnx"]) +class OnnxStableDiffusionUpscalePipeline(metaclass=DummyObject): + _backends = ["torch", "transformers", "onnx"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers", "onnx"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers", "onnx"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers", "onnx"]) + + class StableDiffusionOnnxPipeline(metaclass=DummyObject): _backends = ["torch", "transformers", "onnx"] diff --git a/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_upscale.py b/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_upscale.py new file mode 100644 index 000000000000..d1527a42a1e5 --- /dev/null +++ b/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_upscale.py @@ -0,0 +1,232 @@ +# coding=utf-8 +# Copyright 2022 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import random +import unittest + +import numpy as np +import torch + +from diffusers import ( + DPMSolverMultistepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + LMSDiscreteScheduler, + OnnxStableDiffusionUpscalePipeline, + PNDMScheduler, +) +from diffusers.utils import floats_tensor +from diffusers.utils.testing_utils import ( + is_onnx_available, + load_image, + nightly, + require_onnxruntime, + require_torch_gpu, +) + +from ...test_pipelines_onnx_common import OnnxPipelineTesterMixin + + +if is_onnx_available(): + import onnxruntime as ort + + +class OnnxStableDiffusionUpscalePipelineFastTests(OnnxPipelineTesterMixin, unittest.TestCase): + # TODO: is there an appropriate internal test set? + hub_checkpoint = "ssube/stable-diffusion-x4-upscaler-onnx" + + def get_dummy_inputs(self, seed=0): + image = floats_tensor((1, 3, 128, 128), rng=random.Random(seed)) + generator = torch.manual_seed(seed) + inputs = { + "prompt": "A painting of a squirrel eating a burger", + "image": image, + "generator": generator, + "num_inference_steps": 3, + "guidance_scale": 7.5, + "output_type": "numpy", + } + return inputs + + def test_pipeline_default_ddpm(self): + pipe = OnnxStableDiffusionUpscalePipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider") + pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs() + image = pipe(**inputs).images + image_slice = image[0, -3:, -3:, -1].flatten() + + # started as 128, should now be 512 + assert image.shape == (1, 512, 512, 3) + expected_slice = np.array( + [0.6974782, 0.68902093, 0.70135885, 0.7583618, 0.7804545, 0.7854912, 0.78667426, 0.78743863, 0.78070223] + ) + assert np.abs(image_slice - expected_slice).max() < 1e-1 + + def test_pipeline_pndm(self): + pipe = OnnxStableDiffusionUpscalePipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider") + pipe.scheduler = PNDMScheduler.from_config(pipe.scheduler.config, skip_prk_steps=True) + pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs() + image = pipe(**inputs).images + image_slice = image[0, -3:, -3:, -1] + + assert image.shape == (1, 512, 512, 3) + expected_slice = np.array( + [0.6898892, 0.59240556, 0.52499527, 0.58866215, 0.52258235, 0.52572715, 0.62414473, 0.6174387, 0.6214964] + ) + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-1 + + def test_pipeline_dpm_multistep(self): + pipe = OnnxStableDiffusionUpscalePipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider") + pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) + pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs() + image = pipe(**inputs).images + image_slice = image[0, -3:, -3:, -1] + + assert image.shape == (1, 512, 512, 3) + expected_slice = np.array( + [0.7659278, 0.76437664, 0.75579107, 0.7691116, 0.77666986, 0.7727672, 0.7758664, 0.7812226, 0.76942515] + ) + + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-1 + + def test_pipeline_euler(self): + pipe = OnnxStableDiffusionUpscalePipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider") + pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config) + pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs() + image = pipe(**inputs).images + image_slice = image[0, -3:, -3:, -1] + + assert image.shape == (1, 512, 512, 3) + expected_slice = np.array( + [0.6974782, 0.68902093, 0.70135885, 0.7583618, 0.7804545, 0.7854912, 0.78667426, 0.78743863, 0.78070223] + ) + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-1 + + def test_pipeline_euler_ancestral(self): + pipe = OnnxStableDiffusionUpscalePipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider") + pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config) + pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs() + image = pipe(**inputs).images + image_slice = image[0, -3:, -3:, -1] + + assert image.shape == (1, 512, 512, 3) + expected_slice = np.array( + [0.77424496, 0.773601, 0.7645288, 0.7769598, 0.7772739, 0.7738688, 0.78187233, 0.77879584, 0.767043] + ) + + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-1 + + +@nightly +@require_onnxruntime +@require_torch_gpu +class OnnxStableDiffusionUpscalePipelineIntegrationTests(unittest.TestCase): + @property + def gpu_provider(self): + return ( + "CUDAExecutionProvider", + { + "gpu_mem_limit": "15000000000", # 15GB + "arena_extend_strategy": "kSameAsRequested", + }, + ) + + @property + def gpu_options(self): + options = ort.SessionOptions() + options.enable_mem_pattern = False + return options + + def test_inference_default_ddpm(self): + init_image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + "/img2img/sketch-mountains-input.jpg" + ) + init_image = init_image.resize((128, 128)) + # using the PNDM scheduler by default + pipe = OnnxStableDiffusionUpscalePipeline.from_pretrained( + "ssube/stable-diffusion-x4-upscaler-onnx", + provider=self.gpu_provider, + sess_options=self.gpu_options, + ) + pipe.set_progress_bar_config(disable=None) + + prompt = "A fantasy landscape, trending on artstation" + + generator = torch.manual_seed(0) + output = pipe( + prompt=prompt, + image=init_image, + guidance_scale=7.5, + num_inference_steps=10, + generator=generator, + output_type="np", + ) + images = output.images + image_slice = images[0, 255:258, 383:386, -1] + + assert images.shape == (1, 512, 512, 3) + expected_slice = np.array([0.4883, 0.4947, 0.4980, 0.4975, 0.4982, 0.4980, 0.5000, 0.5006, 0.4972]) + # TODO: lower the tolerance after finding the cause of onnxruntime reproducibility issues + + assert np.abs(image_slice.flatten() - expected_slice).max() < 2e-2 + + def test_inference_k_lms(self): + init_image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + "/img2img/sketch-mountains-input.jpg" + ) + init_image = init_image.resize((128, 128)) + lms_scheduler = LMSDiscreteScheduler.from_pretrained( + "ssube/stable-diffusion-x4-upscaler-onnx", subfolder="scheduler" + ) + pipe = OnnxStableDiffusionUpscalePipeline.from_pretrained( + "ssube/stable-diffusion-x4-upscaler-onnx", + scheduler=lms_scheduler, + provider=self.gpu_provider, + sess_options=self.gpu_options, + ) + pipe.set_progress_bar_config(disable=None) + + prompt = "A fantasy landscape, trending on artstation" + + generator = torch.manual_seed(0) + output = pipe( + prompt=prompt, + image=init_image, + guidance_scale=7.5, + num_inference_steps=20, + generator=generator, + output_type="np", + ) + images = output.images + image_slice = images[0, 255:258, 383:386, -1] + + assert images.shape == (1, 512, 512, 3) + expected_slice = np.array( + [0.50173753, 0.50223356, 0.502039, 0.50233036, 0.5023725, 0.5022601, 0.5018758, 0.50234085, 0.50241566] + ) + # TODO: lower the tolerance after finding the cause of onnxruntime reproducibility issues + + assert np.abs(image_slice.flatten() - expected_slice).max() < 2e-2