|
| 1 | +from logging import getLogger |
| 2 | +from typing import Any, Callable, List, Optional, Union |
| 3 | + |
| 4 | +import numpy as np |
| 5 | +import PIL |
| 6 | +import torch |
| 7 | + |
| 8 | +from ...schedulers import DDPMScheduler |
| 9 | +from ..onnx_utils import ORT_TO_NP_TYPE, OnnxRuntimeModel |
| 10 | +from ..pipeline_utils import ImagePipelineOutput |
| 11 | +from . import StableDiffusionUpscalePipeline |
| 12 | + |
| 13 | + |
# Module-level logger named after this module.
logger = getLogger(__name__)


# Channel count requested from `prepare_latents` for the latents that get denoised.
NUM_LATENT_CHANNELS = 4
# Channel count the pipeline requires for the UNet input: the latent channels plus
# the channels of the noised low-resolution image must add up to this value,
# otherwise `__call__` raises a ValueError.
NUM_UNET_INPUT_CHANNELS = 7

# Maps the string form of the text-embedding dtype (`str(array.dtype)`) to the
# torch dtype used when sampling noise and preparing latents.
ORT_TO_PT_TYPE = {
    "float16": torch.float16,
    "float32": torch.float32,
}
| 24 | + |
| 25 | + |
def preprocess(image):
    """Turn the low-resolution input into a single batched ``torch.Tensor``.

    Args:
        image: One of
            * a ``torch.Tensor`` - returned unchanged;
            * a single ``PIL.Image.Image`` - treated as a batch of one;
            * a non-empty list of PIL images or of tensors.

    Returns:
        * PIL input: a float32 tensor built from the resized images, with the
          last array axis moved to position 1 (channels-last to channels-first)
          and values rescaled from [0, 255] to [-1, 1].
        * list of tensors: their concatenation along dim 0.
        * any other list content: the input, unchanged.

    Raises:
        IndexError: if ``image`` is an empty list (its first element is inspected
            to decide how to convert the batch).
    """
    if isinstance(image, torch.Tensor):
        return image
    if isinstance(image, PIL.Image.Image):
        image = [image]

    first = image[0]
    if isinstance(first, PIL.Image.Image):
        # Round each side DOWN to an integer multiple of 64. The target size is
        # derived from the first image only and applied to every image in the batch.
        width, height = (side - side % 64 for side in first.size)

        frames = [np.array(img.resize((width, height)))[None, :] for img in image]
        # `np.concatenate` already yields a fresh ndarray, so no extra copy is needed
        # before converting to float32 and scaling to [0, 1].
        batch = np.concatenate(frames, axis=0).astype(np.float32) / 255.0
        batch = batch.transpose(0, 3, 1, 2)
        batch = 2.0 * batch - 1.0  # [0, 1] -> [-1, 1]
        return torch.from_numpy(batch)

    if isinstance(first, torch.Tensor):
        return torch.cat(image, dim=0)

    return image
| 46 | + |
| 47 | + |
class OnnxStableDiffusionUpscalePipeline(StableDiffusionUpscalePipeline):
    """Text-guided image upscaling pipeline driven by ``OnnxRuntimeModel`` components.

    Input validation, latent preparation, scheduler keyword handling, the progress
    bar and PIL conversion are taken from ``StableDiffusionUpscalePipeline``; this
    class overrides ``__call__``, ``decode_latents`` and ``_encode_prompt`` so that
    the text encoder, UNet and VAE outputs are handled as NumPy arrays.
    """

    def __init__(
        self,
        vae: OnnxRuntimeModel,
        text_encoder: OnnxRuntimeModel,
        tokenizer: Any,
        unet: OnnxRuntimeModel,
        low_res_scheduler: DDPMScheduler,
        scheduler: Any,
        max_noise_level: int = 350,
    ):
        """Forward all components, unchanged, to the parent pipeline constructor."""
        super().__init__(vae, text_encoder, tokenizer, unet, low_res_scheduler, scheduler, max_noise_level)

    def __call__(
        self,
        prompt: Union[str, List[str]],
        image: Union[torch.FloatTensor, PIL.Image.Image, List[PIL.Image.Image]],
        num_inference_steps: int = 75,
        guidance_scale: float = 9.0,
        noise_level: int = 20,
        negative_prompt: Optional[Union[str, List[str]]] = None,
        num_images_per_prompt: Optional[int] = 1,
        eta: float = 0.0,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
        callback_steps: Optional[int] = 1,
    ):
        """Run the upscaling pipeline.

        Args:
            prompt: Prompt or list of prompts guiding the generation.
            image: Low-resolution input; converted with ``preprocess``.
            num_inference_steps: Number of steps passed to ``scheduler.set_timesteps``.
            guidance_scale: Classifier-free guidance weight; guidance is only
                applied when this is greater than 1.0.
            noise_level: Timestep handed to ``low_res_scheduler.add_noise`` for the
                input image and passed to the UNet as ``class_labels``.
            negative_prompt: Prompt(s) used for the unconditional branch; the empty
                string is used when omitted.
            num_images_per_prompt: Number of images generated per prompt.
            eta: Forwarded to ``prepare_extra_step_kwargs``.
            generator: Random generator(s) used for the image noise and forwarded
                to ``prepare_latents`` / ``prepare_extra_step_kwargs``.
            latents: Optional pre-made latents forwarded to ``prepare_latents``.
            output_type: ``"pil"`` converts the result with ``numpy_to_pil``; any
                other value returns the array produced by ``decode_latents``.
            return_dict: When false, return a 1-tuple instead of an
                ``ImagePipelineOutput``.
            callback: Called as ``callback(i, t, latents)`` on progress-bar updates
                where ``i % callback_steps == 0``.
            callback_steps: Callback frequency, see ``callback``.

        Returns:
            ``ImagePipelineOutput(images=...)`` or ``(images,)`` when
            ``return_dict`` is false.

        Raises:
            ValueError: If the latent channels plus the image channels do not add
                up to ``NUM_UNET_INPUT_CHANNELS``.
        """
        # 1. Check inputs
        self.check_inputs(prompt, image, noise_level, callback_steps)

        # 2. Define call parameters
        batch_size = 1 if isinstance(prompt, str) else len(prompt)
        device = self._execution_device
        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
        # corresponds to doing no classifier free guidance.
        do_classifier_free_guidance = guidance_scale > 1.0

        # 3. Encode input prompt
        text_embeddings = self._encode_prompt(
            prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
        )

        # Noise and latents are created in the torch dtype matching the embeddings.
        latents_dtype = ORT_TO_PT_TYPE[str(text_embeddings.dtype)]

        # 4. Preprocess image
        image = preprocess(image)
        image = image.cpu()

        # 5. Set timesteps
        self.scheduler.set_timesteps(num_inference_steps, device=device)
        timesteps = self.scheduler.timesteps

        # 6. Add noise to image
        noise_level = torch.tensor([noise_level], dtype=torch.long, device=device)
        noise = torch.randn(image.shape, generator=generator, device=device, dtype=latents_dtype)
        image = self.low_res_scheduler.add_noise(image, noise, noise_level)

        # From here on `image` and `noise_level` are NumPy arrays: the image is tiled
        # for every generated sample (and twice over when guidance is enabled) and
        # the noise level is repeated once per row of that batch.
        batch_multiplier = 2 if do_classifier_free_guidance else 1
        image = np.concatenate([image] * batch_multiplier * num_images_per_prompt)
        noise_level = np.concatenate([noise_level] * image.shape[0])

        # 7. Prepare latent variables
        height, width = image.shape[2:]
        latents = self.prepare_latents(
            batch_size * num_images_per_prompt,
            NUM_LATENT_CHANNELS,
            height,
            width,
            latents_dtype,
            device,
            generator,
            latents,
        )

        # 8. Check that sizes of image and latents match
        num_channels_image = image.shape[1]
        if NUM_LATENT_CHANNELS + num_channels_image != NUM_UNET_INPUT_CHANNELS:
            raise ValueError(
                "Incorrect configuration settings! The config of `pipeline.unet` expects"
                f" {NUM_UNET_INPUT_CHANNELS} but received `num_channels_latents`: {NUM_LATENT_CHANNELS} +"
                f" `num_channels_image`: {num_channels_image} "
                f" = {NUM_LATENT_CHANNELS+num_channels_image}. Please verify the config of"
                " `pipeline.unet` or your `image` input."
            )

        # 9. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

        # Use the dtype the UNet declares for its "timestep" input, falling back to
        # float when the model exposes no input with that name.
        timestep_dtype = next(
            (
                model_input.type
                for model_input in self.unet.model.get_inputs()
                if model_input.name == "timestep"
            ),
            "tensor(float)",
        )
        timestep_dtype = ORT_TO_NP_TYPE[timestep_dtype]

        # 10. Denoising loop
        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
        with self.progress_bar(total=num_inference_steps) as progress_bar:
            for i, t in enumerate(timesteps):
                # expand the latents if we are doing classifier free guidance
                latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents

                # scale the latents, then append the noised low-res image in the channel dimension
                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
                latent_model_input = np.concatenate([latent_model_input, image], axis=1)

                # timestep as a one-element array of the dtype the UNet expects
                timestep = np.array([t], dtype=timestep_dtype)

                # predict the noise residual
                noise_pred = self.unet(
                    sample=latent_model_input,
                    timestep=timestep,
                    encoder_hidden_states=text_embeddings,
                    class_labels=noise_level.astype(np.int64),
                )[0]

                # perform guidance
                if do_classifier_free_guidance:
                    noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2)
                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

                # compute the previous noisy sample x_t -> x_t-1
                latents = self.scheduler.step(
                    torch.from_numpy(noise_pred), t, latents, **extra_step_kwargs
                ).prev_sample

                # update the progress bar and call the callback, if provided
                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                    progress_bar.update()
                    if callback is not None and i % callback_steps == 0:
                        callback(i, t, latents)

        # 11. Post-processing
        image = self.decode_latents(latents.float())

        # 12. Convert to PIL
        if output_type == "pil":
            image = self.numpy_to_pil(image)

        if not return_dict:
            return (image,)

        return ImagePipelineOutput(images=image)

    def decode_latents(self, latents):
        """Decode latents with the VAE and return images as a channels-last array.

        The latents are divided by 0.08333 before decoding; the decoder output is
        mapped from [-1, 1] to [0, 1], clipped, and axis 1 is moved to the end.
        """
        latents = 1 / 0.08333 * latents
        image = self.vae(latent_sample=latents)[0]
        image = np.clip(image / 2 + 0.5, 0, 1)
        image = image.transpose((0, 2, 3, 1))
        return image

    def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt):
        """Encode the prompt (and the negative prompt) into text-encoder embeddings.

        Args:
            prompt: Prompt string or list of prompt strings.
            device: Device the token ids are moved to before calling the encoder.
            num_images_per_prompt: How many times each prompt's embedding is repeated.
            do_classifier_free_guidance: Whether to also encode the unconditional
                prompt and prepend its embeddings.
            negative_prompt: Optional prompt(s) for the unconditional branch; must
                have the same type and batch size as ``prompt``.

        Returns:
            Embeddings with ``batch * num_images_per_prompt`` rows, or twice that
            (unconditional rows first) when guidance is enabled.

        Raises:
            TypeError: If ``negative_prompt`` and ``prompt`` have different types.
            ValueError: If their batch sizes differ.
        """
        batch_size = len(prompt) if isinstance(prompt, list) else 1

        text_inputs = self.tokenizer(
            prompt,
            padding="max_length",
            max_length=self.tokenizer.model_max_length,
            truncation=True,
            return_tensors="pt",
        )
        text_input_ids = text_inputs.input_ids
        untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids

        # Warn when truncation to the tokenizer's maximum length dropped tokens.
        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1])
            logger.warning(
                "The following part of your input was truncated because CLIP can only handle sequences up to"
                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
            )

        # The text encoder is called with keyword arguments only, and only with
        # `input_ids` (no attention mask is passed).
        text_embeddings = self.text_encoder(
            input_ids=text_input_ids.int().to(device),
        )
        text_embeddings = text_embeddings[0]

        bs_embed, seq_len, _ = text_embeddings.shape
        # Duplicate the embeddings for each generation per prompt. `np.repeat` along
        # axis 0 keeps the copies of one prompt adjacent. The torch-style call
        # `.repeat(1, n)` must not be used here: on an ndarray it means
        # `repeats=1, axis=n`, which does not duplicate anything.
        text_embeddings = np.repeat(text_embeddings, num_images_per_prompt, axis=0)
        text_embeddings = text_embeddings.reshape(bs_embed * num_images_per_prompt, seq_len, -1)

        # get unconditional embeddings for classifier free guidance
        if do_classifier_free_guidance:
            uncond_tokens: List[str]
            if negative_prompt is None:
                uncond_tokens = [""] * batch_size
            elif type(prompt) is not type(negative_prompt):
                raise TypeError(
                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
                    f" {type(prompt)}."
                )
            elif isinstance(negative_prompt, str):
                uncond_tokens = [negative_prompt]
            elif batch_size != len(negative_prompt):
                raise ValueError(
                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
                    " the batch size of `prompt`."
                )
            else:
                uncond_tokens = negative_prompt

            # Pad the unconditional tokens to the same length as the prompt tokens.
            max_length = text_input_ids.shape[-1]
            uncond_input = self.tokenizer(
                uncond_tokens,
                padding="max_length",
                max_length=max_length,
                truncation=True,
                return_tensors="pt",
            )

            uncond_embeddings = self.text_encoder(
                input_ids=uncond_input.input_ids.int().to(device),
            )
            uncond_embeddings = uncond_embeddings[0]

            seq_len = uncond_embeddings.shape[1]
            # duplicate unconditional embeddings for each generation per prompt
            uncond_embeddings = np.repeat(uncond_embeddings, num_images_per_prompt, axis=0)
            uncond_embeddings = uncond_embeddings.reshape(batch_size * num_images_per_prompt, seq_len, -1)

            # For classifier free guidance, we need to do two forward passes.
            # Here we concatenate the unconditional and text embeddings into a single batch
            # to avoid doing two forward passes
            text_embeddings = np.concatenate([uncond_embeddings, text_embeddings])

        return text_embeddings
0 commit comments