Fix scheduler compatibility and class_labels dtype in the ONNX Stable Diffusion upscale pipeline

ssube · ssube · commit 295a96d6cfb7 · 2023-02-15T17:07:36.000-06:00
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py
@@ -2,9 +2,8 @@
 from typing import Any, Callable, List, Optional, Union
 
 import numpy as np
-import torch
-
 import PIL
+import torch
 
 from ...schedulers import DDPMScheduler
 from ..onnx_utils import OnnxRuntimeModel
@@ -14,11 +13,15 @@
 
 logger = getLogger(__name__)
 
-# TODO: make this dynamic, from self.vae.config.latent_channels
-num_channels_latents = 4
 
-# TODO: make this dynamic, from self.unet.config.in_channels
-unet_in_channels = 7
+NUM_LATENT_CHANNELS = 4
+NUM_UNET_INPUT_CHANNELS = 7
+
+# TODO: look this up? assumed int64 to match the conversion script; np.long was removed in NumPy 1.24
+class_labels_dtype = np.int64
+
+# TODO: should this be a lookup or converted? can it vary on ONNX?
+text_embeddings_dtype = torch.float32
 
 ###
 # This is based on a combination of the ONNX img2img pipeline and the PyTorch upscale pipeline:
@@ -94,7 +97,6 @@ def __call__(
         text_embeddings = self._encode_prompt(
             prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
         )
-        text_embeddings_dtype = torch.float32  # TODO: convert text_embeddings.dtype to torch dtype
 
         # 4. Preprocess image
         image = preprocess(image)
@@ -117,7 +119,7 @@ def __call__(
         height, width = image.shape[2:]
         latents = self.prepare_latents(
             batch_size * num_images_per_prompt,
-            num_channels_latents,
+            NUM_LATENT_CHANNELS,
             height,
             width,
             text_embeddings_dtype,
@@ -128,12 +130,12 @@ def __call__(
 
         # 7. Check that sizes of image and latents match
         num_channels_image = image.shape[1]
-        if num_channels_latents + num_channels_image != unet_in_channels:
+        if NUM_LATENT_CHANNELS + num_channels_image != NUM_UNET_INPUT_CHANNELS:
             raise ValueError(
                 "Incorrect configuration settings! The config of `pipeline.unet` expects"
-                f" {unet_in_channels} but received `num_channels_latents`: {num_channels_latents} +"
+                f" {NUM_UNET_INPUT_CHANNELS} but received `num_channels_latents`: {NUM_LATENT_CHANNELS} +"
                 f" `num_channels_image`: {num_channels_image} "
-                f" = {num_channels_latents+num_channels_image}. Please verify the config of"
+                f" = {NUM_LATENT_CHANNELS+num_channels_image}. Please verify the config of"
                 " `pipeline.unet` or your `image` input."
             )
 
@@ -159,7 +161,7 @@ def __call__(
                     sample=latent_model_input,
                     timestep=timestep,
                     encoder_hidden_states=text_embeddings,
-                    class_labels=noise_level,
+                    class_labels=noise_level.astype(class_labels_dtype),
                 )[0]
 
                 # perform guidance
@@ -168,7 +170,9 @@ def __call__(
                 noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
 
                 # compute the previous noisy sample x_t -> x_t-1
-                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+                latents = self.scheduler.step(
+                    torch.from_numpy(noise_pred), t, latents, **extra_step_kwargs
+                ).prev_sample
 
                 # call the callback, if provided
                 if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
diff --git a/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_upscale.py b/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_upscale.py
@@ -19,7 +19,14 @@
 import numpy as np
 import torch
 
-from diffusers import OnnxStableDiffusionUpscalePipeline
+from diffusers import (
+    DPMSolverMultistepScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
+    LMSDiscreteScheduler,
+    OnnxStableDiffusionUpscalePipeline,
+    PNDMScheduler,
+)
 from diffusers.utils import floats_tensor
 from diffusers.utils.testing_utils import (
     is_onnx_available,
@@ -68,6 +75,86 @@ def test_pipeline_default_ddpm(self):
         )
         assert np.abs(image_slice - expected_slice).max() < 1e-1
 
+    def test_pipeline_pndm(self):
+        pipe = OnnxStableDiffusionUpscalePipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider")
+        pipe.scheduler = PNDMScheduler.from_config(pipe.scheduler.config, skip_prk_steps=True)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs()
+        image = pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array(
+            [0.6898892, 0.59240556, 0.52499527, 0.58866215, 0.52258235, 0.52572715, 0.62414473, 0.6174387, 0.6214964]
+        )
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-1
+
+    def test_pipeline_dpm_multistep(self):
+        pipe = OnnxStableDiffusionUpscalePipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider")
+        pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs()
+        image = pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array(
+            [0.7659278, 0.76437664, 0.75579107, 0.7691116, 0.77666986, 0.7727672, 0.7758664, 0.7812226, 0.76942515]
+        )
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-1
+
+    def test_pipeline_lms(self):
+        pipe = OnnxStableDiffusionUpscalePipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider")
+        pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config)
+        pipe.set_progress_bar_config(disable=None)
+
+        # warmup pass to apply optimizations
+        _ = pipe(**self.get_dummy_inputs())
+
+        inputs = self.get_dummy_inputs()
+        image = pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array(
+            [0.6974782, 0.68902093, 0.70135885, 0.7583618, 0.7804545, 0.7854912, 0.78667426, 0.78743863, 0.78070223]
+        )
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-1
+
+    def test_pipeline_euler(self):
+        pipe = OnnxStableDiffusionUpscalePipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider")
+        pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs()
+        image = pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array(
+            [0.6974782, 0.68902093, 0.70135885, 0.7583618, 0.7804545, 0.7854912, 0.78667426, 0.78743863, 0.78070223]
+        )
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-1
+
+    def test_pipeline_euler_ancestral(self):
+        pipe = OnnxStableDiffusionUpscalePipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider")
+        pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs()
+        image = pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array(
+            [0.77424496, 0.773601, 0.7645288, 0.7769598, 0.7772739, 0.7738688, 0.78187233, 0.77879584, 0.767043]
+        )
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-1
+
 
 @nightly
 @require_onnxruntime
@@ -98,8 +185,6 @@ def test_inference_default_ddpm(self):
         # using the PNDM scheduler by default
         pipe = OnnxStableDiffusionUpscalePipeline.from_pretrained(
             "ssube/stable-diffusion-x4-upscaler-onnx",
-            safety_checker=None,
-            feature_extractor=None,
             provider=self.gpu_provider,
             sess_options=self.gpu_options,
         )
@@ -124,3 +209,42 @@ def test_inference_default_ddpm(self):
         # TODO: lower the tolerance after finding the cause of onnxruntime reproducibility issues
 
         assert np.abs(image_slice.flatten() - expected_slice).max() < 2e-2
+
+    def test_inference_k_lms(self):
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/img2img/sketch-mountains-input.jpg"
+        )
+        init_image = init_image.resize((128, 128))
+        lms_scheduler = LMSDiscreteScheduler.from_pretrained(
+            "ssube/stable-diffusion-x4-upscaler-onnx", subfolder="scheduler"
+        )
+        pipe = OnnxStableDiffusionUpscalePipeline.from_pretrained(
+            "ssube/stable-diffusion-x4-upscaler-onnx",
+            scheduler=lms_scheduler,
+            provider=self.gpu_provider,
+            sess_options=self.gpu_options,
+        )
+        pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A fantasy landscape, trending on artstation"
+
+        generator = torch.manual_seed(0)
+        output = pipe(
+            prompt=prompt,
+            image=init_image,
+            guidance_scale=7.5,
+            num_inference_steps=20,
+            generator=generator,
+            output_type="np",
+        )
+        images = output.images
+        image_slice = images[0, 255:258, 383:386, -1]
+
+        assert images.shape == (1, 512, 512, 3)
+        expected_slice = np.array(
+            [0.50173753, 0.50223356, 0.502039, 0.50233036, 0.5023725, 0.5022601, 0.5018758, 0.50234085, 0.50241566]
+        )
+        # TODO: lower the tolerance after finding the cause of onnxruntime reproducibility issues
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 2e-2