|
15 | 15 | from ...image_processor import VaeImageProcessor
|
16 | 16 | from ...models import AutoencoderKL, HiDreamImageTransformer2DModel
|
17 | 17 | from ...schedulers import FlowMatchEulerDiscreteScheduler, UniPCMultistepScheduler
|
18 |
| -from ...utils import is_torch_xla_available, logging |
| 18 | +from ...utils import is_torch_xla_available, logging, replace_example_docstring |
19 | 19 | from ...utils.torch_utils import randn_tensor
|
20 | 20 | from ..pipeline_utils import DiffusionPipeline
|
21 | 21 | from .pipeline_output import HiDreamImagePipelineOutput
|
@@ -523,6 +523,7 @@ def interrupt(self):
|
523 | 523 | return self._interrupt
|
524 | 524 |
|
525 | 525 | @torch.no_grad()
|
| 526 | + @replace_example_docstring(EXAMPLE_DOC_STRING) |
526 | 527 | def __call__(
|
527 | 528 | self,
|
528 | 529 | prompt: Union[str, List[str]] = None,
|
@@ -552,6 +553,102 @@ def __call__(
|
552 | 553 | callback_on_step_end_tensor_inputs: List[str] = ["latents"],
|
553 | 554 | max_sequence_length: int = 128,
|
554 | 555 | ):
|
| 556 | + r""" |
| 557 | + Function invoked when calling the pipeline for generation. |
| 558 | +
|
| 559 | + Args: |
| 560 | + prompt (`str` or `List[str]`, *optional*): |
| 561 | + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds` |
| 562 | + instead. |
| 563 | + prompt_2 (`str` or `List[str]`, *optional*): |
| 564 | + The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` |
| 565 | + will be used instead. |
| 566 | + prompt_3 (`str` or `List[str]`, *optional*): |
| 567 | + The prompt or prompts to be sent to `tokenizer_3` and `text_encoder_3`. If not defined, `prompt` |
| 568 | + will be used instead. |
| 569 | + prompt_4 (`str` or `List[str]`, *optional*): |
| 570 | + The prompt or prompts to be sent to `tokenizer_4` and `text_encoder_4`. If not defined, `prompt` |
| 571 | + will be used instead. |
| 572 | + height (`int`, *optional*, defaults to self.default_sample_size * self.vae_scale_factor): |
| 573 | + The height in pixels of the generated image. This is set to 1024 by default for the best results. |
| 574 | + width (`int`, *optional*, defaults to self.default_sample_size * self.vae_scale_factor): |
| 575 | + The width in pixels of the generated image. This is set to 1024 by default for the best results. |
| 576 | + num_inference_steps (`int`, *optional*, defaults to 50): |
| 577 | + The number of denoising steps. More denoising steps usually lead to a higher quality image at the |
| 578 | + expense of slower inference. |
| 579 | + sigmas (`List[float]`, *optional*): |
| 580 | + Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in |
| 581 | + their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed |
| 582 | + will be used. |
| 583 | + guidance_scale (`float`, *optional*, defaults to 3.5): |
| 584 | + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). |
| 585 | + `guidance_scale` is defined as `w` of equation 2. of [Imagen |
| 586 | + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > |
| 587 | + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, |
| 588 | + usually at the expense of lower image quality. |
| 589 | + negative_prompt (`str` or `List[str]`, *optional*): |
| 590 | + The prompt or prompts not to guide the image generation. If not defined, one has to pass |
| 591 | + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is |
| 592 | + not greater than `1`). |
| 593 | + negative_prompt_2 (`str` or `List[str]`, *optional*): |
| 594 | + The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and |
| 595 | + `text_encoder_2`. If not defined, `negative_prompt` is used in all the text-encoders. |
| 596 | + negative_prompt_3 (`str` or `List[str]`, *optional*): |
| 597 | + The prompt or prompts not to guide the image generation to be sent to `tokenizer_3` and |
| 598 | + `text_encoder_3`. If not defined, `negative_prompt` is used in all the text-encoders. |
| 599 | + negative_prompt_4 (`str` or `List[str]`, *optional*): |
| 600 | + The prompt or prompts not to guide the image generation to be sent to `tokenizer_4` and |
| 601 | + `text_encoder_4`. If not defined, `negative_prompt` is used in all the text-encoders. |
| 602 | + num_images_per_prompt (`int`, *optional*, defaults to 1): |
| 603 | + The number of images to generate per prompt. |
| 604 | + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): |
| 605 | + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) |
| 606 | + to make generation deterministic. |
| 607 | + latents (`torch.FloatTensor`, *optional*): |
| 608 | + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image |
| 609 | + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents |
| 610 | + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents |
| 610a | + tensor will be generated by sampling using the supplied random `generator`. |
| 611 | + prompt_embeds (`torch.FloatTensor`, *optional*): |
| 612 | + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not |
| 613 | + provided, text embeddings will be generated from `prompt` input argument. |
| 614 | + negative_prompt_embeds (`torch.FloatTensor`, *optional*): |
| 615 | + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt |
| 616 | + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input |
| 617 | + argument. |
| 618 | + pooled_prompt_embeds (`torch.FloatTensor`, *optional*): |
| 619 | + Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. |
| 620 | + If not provided, pooled text embeddings will be generated from `prompt` input argument. |
| 621 | + negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): |
| 622 | + Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt |
| 623 | + weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` |
| 624 | + input argument. |
| 625 | + output_type (`str`, *optional*, defaults to `"pil"`): |
| 626 | + The output format of the generate image. Choose between |
| 627 | + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. |
| 628 | + return_dict (`bool`, *optional*, defaults to `True`): |
| 629 | + Whether or not to return a [`~pipelines.hidream_image.HiDreamImagePipelineOutput`] instead of a plain tuple. |
| 630 | + attention_kwargs (`dict`, *optional*): |
| 631 | + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under |
| 632 | + `self.processor` in |
| 633 | + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). |
| 634 | + callback_on_step_end (`Callable`, *optional*): |
| 635 | + A function that calls at the end of each denoising steps during the inference. The function is called |
| 636 | + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, |
| 637 | + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by |
| 638 | + `callback_on_step_end_tensor_inputs`. |
| 639 | + callback_on_step_end_tensor_inputs (`List`, *optional*): |
| 640 | + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list |
| 641 | + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the |
| 642 | + `._callback_tensor_inputs` attribute of your pipeline class. |
| 643 | + max_sequence_length (`int` defaults to 128): Maximum sequence length to use with the `prompt`. |
| 644 | +
|
| 645 | + Examples: |
| 646 | +
|
| 647 | + Returns: |
| 648 | + [`~pipelines.hidream_image.HiDreamImagePipelineOutput`] or `tuple`: |
| 649 | + [`~pipelines.hidream_image.HiDreamImagePipelineOutput`] if `return_dict` is True, otherwise a `tuple`. When |
| 650 | + returning a tuple, the first element is a list with the generated images. |
| 651 | + """ |
555 | 652 | height = height or self.default_sample_size * self.vae_scale_factor
|
556 | 653 | width = width or self.default_sample_size * self.vae_scale_factor
|
557 | 654 |
|
|
0 commit comments