
Commit ebfec45

Merge branch 'main' into allow-device-placement-bnb
2 parents 7d47364 + 30f2e9b commit ebfec45

21 files changed: +129 -93 lines

docs/source/en/conceptual/evaluation.md

Lines changed: 12 additions & 6 deletions
@@ -181,7 +181,7 @@ Then we load the [v1-5 checkpoint](https://huggingface.co/stable-diffusion-v1-5/

 ```python
 model_ckpt_1_5 = "stable-diffusion-v1-5/stable-diffusion-v1-5"
-sd_pipeline_1_5 = StableDiffusionPipeline.from_pretrained(model_ckpt_1_5, torch_dtype=weight_dtype).to(device)
+sd_pipeline_1_5 = StableDiffusionPipeline.from_pretrained(model_ckpt_1_5, torch_dtype=torch.float16).to("cuda")

 images_1_5 = sd_pipeline_1_5(prompts, num_images_per_prompt=1, generator=generator, output_type="np").images
 ```
@@ -280,7 +280,7 @@ from diffusers import StableDiffusionInstructPix2PixPipeline

 instruct_pix2pix_pipeline = StableDiffusionInstructPix2PixPipeline.from_pretrained(
     "timbrooks/instruct-pix2pix", torch_dtype=torch.float16
-).to(device)
+).to("cuda")
 ```

 Now, we perform the edits:
@@ -326,9 +326,9 @@ from transformers import (

 clip_id = "openai/clip-vit-large-patch14"
 tokenizer = CLIPTokenizer.from_pretrained(clip_id)
-text_encoder = CLIPTextModelWithProjection.from_pretrained(clip_id).to(device)
+text_encoder = CLIPTextModelWithProjection.from_pretrained(clip_id).to("cuda")
 image_processor = CLIPImageProcessor.from_pretrained(clip_id)
-image_encoder = CLIPVisionModelWithProjection.from_pretrained(clip_id).to(device)
+image_encoder = CLIPVisionModelWithProjection.from_pretrained(clip_id).to("cuda")
 ```

 Notice that we are using a particular CLIP checkpoint, i.e., `openai/clip-vit-large-patch14`. This is because the Stable Diffusion pre-training was performed with this CLIP variant. For more details, refer to the [documentation](https://huggingface.co/docs/transformers/model_doc/clip).
@@ -350,7 +350,7 @@ class DirectionalSimilarity(nn.Module):

     def preprocess_image(self, image):
         image = self.image_processor(image, return_tensors="pt")["pixel_values"]
-        return {"pixel_values": image.to(device)}
+        return {"pixel_values": image.to("cuda")}

     def tokenize_text(self, text):
         inputs = self.tokenizer(
@@ -360,7 +360,7 @@ class DirectionalSimilarity(nn.Module):
             truncation=True,
             return_tensors="pt",
         )
-        return {"input_ids": inputs.input_ids.to(device)}
+        return {"input_ids": inputs.input_ids.to("cuda")}

     def encode_image(self, image):
         preprocessed_image = self.preprocess_image(image)
@@ -459,6 +459,7 @@ with ZipFile(local_filepath, "r") as zipper:
 ```python
 from PIL import Image
 import os
+import numpy as np

 dataset_path = "sample-imagenet-images"
 image_paths = sorted([os.path.join(dataset_path, x) for x in os.listdir(dataset_path)])
@@ -477,6 +478,7 @@ Now that the images are loaded, let's apply some lightweight pre-processing on t

 ```python
 from torchvision.transforms import functional as F
+import torch


 def preprocess_image(image):
@@ -498,6 +500,10 @@ dit_pipeline = DiTPipeline.from_pretrained("facebook/DiT-XL-2-256", torch_dtype=
 dit_pipeline.scheduler = DPMSolverMultistepScheduler.from_config(dit_pipeline.scheduler.config)
 dit_pipeline = dit_pipeline.to("cuda")

+seed = 0
+generator = torch.manual_seed(seed)
+
+
 words = [
     "cassette player",
     "chainsaw",

docs/source/en/training/create_dataset.md

Lines changed: 2 additions & 2 deletions
@@ -1,6 +1,6 @@
 # Create a dataset for training

-There are many datasets on the [Hub](https://huggingface.co/datasets?task_categories=task_categories:text-to-image&sort=downloads) to train a model on, but if you can't find one you're interested in or want to use your own, you can create a dataset with the 🤗 [Datasets](hf.co/docs/datasets) library. The dataset structure depends on the task you want to train your model on. The most basic dataset structure is a directory of images for tasks like unconditional image generation. Another dataset structure may be a directory of images and a text file containing their corresponding text captions for tasks like text-to-image generation.
+There are many datasets on the [Hub](https://huggingface.co/datasets?task_categories=task_categories:text-to-image&sort=downloads) to train a model on, but if you can't find one you're interested in or want to use your own, you can create a dataset with the 🤗 [Datasets](https://huggingface.co/docs/datasets) library. The dataset structure depends on the task you want to train your model on. The most basic dataset structure is a directory of images for tasks like unconditional image generation. Another dataset structure may be a directory of images and a text file containing their corresponding text captions for tasks like text-to-image generation.

 This guide will show you two ways to create a dataset to finetune on:

@@ -87,4 +87,4 @@ accelerate launch --mixed_precision="fp16" train_text_to_image.py \

 Now that you've created a dataset, you can plug it into the `train_data_dir` (if your dataset is local) or `dataset_name` (if your dataset is on the Hub) arguments of a training script.

-For your next steps, feel free to try and use your dataset to train a model for [unconditional generation](unconditional_training) or [text-to-image generation](text2image)!
+For your next steps, feel free to try and use your dataset to train a model for [unconditional generation](unconditional_training) or [text-to-image generation](text2image)!

docs/source/ko/api/pipelines/stable_diffusion/stable_diffusion_xl.md

Lines changed: 2 additions & 2 deletions
@@ -121,7 +121,7 @@ image = pipe(prompt=prompt, image=init_image, mask_image=mask_image, num_inferen

 ### 이미지 결과물을 정제하기

-[base 모델 체크포인트](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)에서, StableDiffusion-XL 또한 고주파 품질을 향상시키는 이미지를 생성하기 위해 낮은 노이즈 단계 이미지를 제거하는데 특화된 [refiner 체크포인트](huggingface.co/stabilityai/stable-diffusion-xl-refiner-1.0)를 포함하고 있습니다. 이 refiner 체크포인트는 이미지 품질을 향상시키기 위해 base 체크포인트를 실행한 후 "두 번째 단계" 파이프라인에 사용될 수 있습니다.
+[base 모델 체크포인트](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)에서, StableDiffusion-XL 또한 고주파 품질을 향상시키는 이미지를 생성하기 위해 낮은 노이즈 단계 이미지를 제거하는데 특화된 [refiner 체크포인트](https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-1.0)를 포함하고 있습니다. 이 refiner 체크포인트는 이미지 품질을 향상시키기 위해 base 체크포인트를 실행한 후 "두 번째 단계" 파이프라인에 사용될 수 있습니다.

 refiner를 사용할 때, 쉽게 사용할 수 있습니다
 - 1.) base 모델과 refiner을 사용하는데, 이는 *Denoisers의 앙상블*을 위한 첫 번째 제안된 [eDiff-I](https://research.nvidia.com/labs/dir/eDiff-I/)를 사용하거나
@@ -215,7 +215,7 @@ image = refiner(

 #### 2.) 노이즈가 완전히 제거된 기본 이미지에서 이미지 출력을 정제하기

-일반적인 [`StableDiffusionImg2ImgPipeline`] 방식에서, 기본 모델에서 생성된 완전히 노이즈가 제거된 이미지는 [refiner checkpoint](huggingface.co/stabilityai/stable-diffusion-xl-refiner-1.0)를 사용해 더 향상시킬 수 있습니다.
+일반적인 [`StableDiffusionImg2ImgPipeline`] 방식에서, 기본 모델에서 생성된 완전히 노이즈가 제거된 이미지는 [refiner checkpoint](https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-1.0)를 사용해 더 향상시킬 수 있습니다.

 이를 위해, 보통의 "base" text-to-image 파이프라인을 수행 후에 image-to-image 파이프라인으로써 refiner를 실행시킬 수 있습니다. base 모델의 출력을 잠재 공간에 남겨둘 수 있습니다.
docs/source/ko/training/create_dataset.md

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 # 학습을 위한 데이터셋 만들기

 [Hub](https://huggingface.co/datasets?task_categories=task_categories:text-to-image&sort=downloads) 에는 모델 교육을 위한 많은 데이터셋이 있지만,
-관심이 있거나 사용하고 싶은 데이터셋을 찾을 수 없는 경우 🤗 [Datasets](hf.co/docs/datasets) 라이브러리를 사용하여 데이터셋을 만들 수 있습니다.
+관심이 있거나 사용하고 싶은 데이터셋을 찾을 수 없는 경우 🤗 [Datasets](https://huggingface.co/docs/datasets) 라이브러리를 사용하여 데이터셋을 만들 수 있습니다.
 데이터셋 구조는 모델을 학습하려는 작업에 따라 달라집니다.
 가장 기본적인 데이터셋 구조는 unconditional 이미지 생성과 같은 작업을 위한 이미지 디렉토리입니다.
 또 다른 데이터셋 구조는 이미지 디렉토리와 text-to-image 생성과 같은 작업에 해당하는 텍스트 캡션이 포함된 텍스트 파일일 수 있습니다.

docs/source/ko/training/lora.md

Lines changed: 1 addition & 1 deletion
@@ -36,7 +36,7 @@ specific language governing permissions and limitations under the License.

 [cloneofsimo](https://github.com/cloneofsimo)는 인기 있는 [lora](https://github.com/cloneofsimo/lora) GitHub 리포지토리에서 Stable Diffusion을 위한 LoRA 학습을 최초로 시도했습니다. 🧨 Diffusers는 [text-to-image 생성](https://github.com/huggingface/diffusers/tree/main/examples/text_to_image#training-with-lora)과 [DreamBooth](https://github.com/huggingface/diffusers/tree/main/examples/dreambooth#training-with-low-rank-adaptation-of-large-language-models-lora)을 지원합니다. 이 가이드는 두 가지를 모두 수행하는 방법을 보여줍니다.

-모델을 저장하거나 커뮤니티와 공유하려면 Hugging Face 계정에 로그인하세요(아직 계정이 없는 경우 [생성](hf.co/join)하세요):
+모델을 저장하거나 커뮤니티와 공유하려면 Hugging Face 계정에 로그인하세요(아직 계정이 없는 경우 [생성](https://huggingface.co/join)하세요):

 ```bash
 huggingface-cli login

src/diffusers/loaders/single_file_model.py

Lines changed: 1 addition & 0 deletions
@@ -269,6 +269,7 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: Optional[str] =
                 pretrained_model_name_or_path=default_pretrained_model_config_name,
                 subfolder=subfolder,
                 local_files_only=local_files_only,
+                token=token,
             )

         expected_kwargs, optional_kwargs = cls._get_signature_keys(cls)
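The added `token=token` forwards the user's Hub token to the config download as well, not just the weights, so single-file checkpoints whose default config lives in a gated repo should now load. A minimal sketch of the call path this touches; the checkpoint URL and token below are placeholders, not values taken from this diff:

```python
import torch
from diffusers import SD3Transformer2DModel

# Hypothetical gated single-file checkpoint (placeholder URL).
ckpt_url = "https://huggingface.co/stabilityai/stable-diffusion-3.5-medium/blob/main/sd3.5_medium.safetensors"

# `token` is now also passed through when the matching diffusers config is fetched.
transformer = SD3Transformer2DModel.from_single_file(
    ckpt_url,
    torch_dtype=torch.float16,
    token="hf_...",  # replace with a real access token
)
```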

src/diffusers/loaders/single_file_utils.py

Lines changed: 7 additions & 1 deletion
@@ -127,6 +127,9 @@
     "sd35_large": {
         "pretrained_model_name_or_path": "stabilityai/stable-diffusion-3.5-large",
     },
+    "sd35_medium": {
+        "pretrained_model_name_or_path": "stabilityai/stable-diffusion-3.5-medium",
+    },
     "animatediff_v1": {"pretrained_model_name_or_path": "guoyww/animatediff-motion-adapter-v1-5"},
     "animatediff_v2": {"pretrained_model_name_or_path": "guoyww/animatediff-motion-adapter-v1-5-2"},
     "animatediff_v3": {"pretrained_model_name_or_path": "guoyww/animatediff-motion-adapter-v1-5-3"},
@@ -527,7 +530,10 @@ def infer_diffusers_model_type(checkpoint):
         model_type = "stable_cascade_stage_b"

     elif CHECKPOINT_KEY_NAMES["sd3"] in checkpoint and checkpoint[CHECKPOINT_KEY_NAMES["sd3"]].shape[-1] == 9216:
-        model_type = "sd3"
+        if checkpoint["model.diffusion_model.pos_embed"].shape[1] == 36864:
+            model_type = "sd3"
+        elif checkpoint["model.diffusion_model.pos_embed"].shape[1] == 147456:
+            model_type = "sd35_medium"

     elif CHECKPOINT_KEY_NAMES["sd35_large"] in checkpoint:
         model_type = "sd35_large"

src/diffusers/models/controlnets/controlnet_sd3.py

Lines changed: 14 additions & 0 deletions
@@ -266,6 +266,20 @@ def _set_gradient_checkpointing(self, module, value=False):
         if hasattr(module, "gradient_checkpointing"):
             module.gradient_checkpointing = value

+    # Notes: This is for SD3.5 8b controlnet, which shares the pos_embed with the transformer
+    # we should have handled this in conversion script
+    def _get_pos_embed_from_transformer(self, transformer):
+        pos_embed = PatchEmbed(
+            height=transformer.config.sample_size,
+            width=transformer.config.sample_size,
+            patch_size=transformer.config.patch_size,
+            in_channels=transformer.config.in_channels,
+            embed_dim=transformer.inner_dim,
+            pos_embed_max_size=transformer.config.pos_embed_max_size,
+        )
+        pos_embed.load_state_dict(transformer.pos_embed.state_dict(), strict=True)
+        return pos_embed
+
     @classmethod
     def from_transformer(
         cls, transformer, num_layers=12, num_extra_conditioning_channels=1, load_weights_from_transformer=True
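The helper rebuilds a `PatchEmbed` with the transformer's geometry and clones its weights, so a controlnet exported without its own positional embedding can reuse the transformer's. A small standalone sketch of the copy it performs; the tiny geometry numbers are made up for illustration and are not the SD3.5 values:

```python
import torch
from diffusers.models.embeddings import PatchEmbed

# Two PatchEmbed modules with identical (made-up, small) geometry; clone the
# weights of one into the other, as the controlnet does with transformer.pos_embed.
src = PatchEmbed(height=32, width=32, patch_size=2, in_channels=16, embed_dim=64, pos_embed_max_size=96)
dst = PatchEmbed(height=32, width=32, patch_size=2, in_channels=16, embed_dim=64, pos_embed_max_size=96)
dst.load_state_dict(src.state_dict(), strict=True)

x = torch.randn(1, 16, 32, 32)
assert torch.allclose(src(x), dst(x))  # identical weights -> identical patch embeddings
```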

src/diffusers/models/upsampling.py

Lines changed: 8 additions & 0 deletions
@@ -165,6 +165,14 @@ def forward(self, hidden_states: torch.Tensor, output_size: Optional[int] = None
         # if `output_size` is passed we force the interpolation output
         # size and do not make use of `scale_factor=2`
         if self.interpolate:
+            # upsample_nearest_nhwc also fails when the number of output elements is large
+            # https://github.com/pytorch/pytorch/issues/141831
+            scale_factor = (
+                2 if output_size is None else max([f / s for f, s in zip(output_size, hidden_states.shape[-2:])])
+            )
+            if hidden_states.numel() * scale_factor > pow(2, 31):
+                hidden_states = hidden_states.contiguous()
+
             if output_size is None:
                 hidden_states = F.interpolate(hidden_states, scale_factor=2.0, mode="nearest")
             else:
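For intuition on the new guard: as the added comment notes, the channels_last nearest-neighbor upsample path can fail for very large inputs (pytorch#141831), so sufficiently large tensors are made contiguous before interpolation. A quick back-of-the-envelope check of when the threshold triggers; the activation shape is illustrative only:

```python
# Hypothetical decoder activation: batch=1, C=512, H=2048, W=2048
numel = 1 * 512 * 2048 * 2048  # 2_147_483_648 elements
scale_factor = 2               # default 2x nearest-neighbor upsample

# Mirrors the condition added above; True means .contiguous() is called first,
# which drops the channels_last layout and avoids the failing NHWC kernel.
print(numel * scale_factor > 2**31)  # True
```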

src/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py

Lines changed: 14 additions & 7 deletions
@@ -194,6 +194,19 @@ def __init__(
         super().__init__()
         if isinstance(controlnet, (list, tuple)):
             controlnet = SD3MultiControlNetModel(controlnet)
+        if isinstance(controlnet, SD3MultiControlNetModel):
+            for controlnet_model in controlnet.nets:
+                # for SD3.5 8b controlnet, it shares the pos_embed with the transformer
+                if (
+                    hasattr(controlnet_model.config, "use_pos_embed")
+                    and controlnet_model.config.use_pos_embed is False
+                ):
+                    pos_embed = controlnet_model._get_pos_embed_from_transformer(transformer)
+                    controlnet_model.pos_embed = pos_embed.to(controlnet_model.dtype).to(controlnet_model.device)
+        elif isinstance(controlnet, SD3ControlNetModel):
+            if hasattr(controlnet.config, "use_pos_embed") and controlnet.config.use_pos_embed is False:
+                pos_embed = controlnet._get_pos_embed_from_transformer(transformer)
+                controlnet.pos_embed = pos_embed.to(controlnet.dtype).to(controlnet.device)

         self.register_modules(
             vae=vae,
@@ -1042,15 +1055,9 @@ def __call__(
                        controlnet_cond_scale = controlnet_cond_scale[0]
                    cond_scale = controlnet_cond_scale * controlnet_keep[i]

-                if controlnet_config.use_pos_embed is False:
-                    # sd35 (offical) 8b controlnet
-                    controlnet_model_input = self.transformer.pos_embed(latent_model_input)
-                else:
-                    controlnet_model_input = latent_model_input
-
                 # controlnet(s) inference
                 control_block_samples = self.controlnet(
-                    hidden_states=controlnet_model_input,
+                    hidden_states=latent_model_input,
                     timestep=timestep,
                     encoder_hidden_states=controlnet_encoder_hidden_states,
                     pooled_projections=controlnet_pooled_projections,
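With the pos_embed sharing moved into `__init__`, the per-step special case in the denoising loop goes away and an SD3.5 controlnet is used like any other. A hedged usage sketch; the repo ids and the conditioning-image URL are assumptions, not values taken from this diff:

```python
import torch
from diffusers import SD3ControlNetModel, StableDiffusion3ControlNetPipeline
from diffusers.utils import load_image

# Assumed repo ids for an SD3.5-large base model and a matching controlnet.
controlnet = SD3ControlNetModel.from_pretrained(
    "stabilityai/stable-diffusion-3.5-large-controlnet-canny", torch_dtype=torch.float16
)
pipe = StableDiffusion3ControlNetPipeline.from_pretrained(
    "stabilityai/stable-diffusion-3.5-large", controlnet=controlnet, torch_dtype=torch.float16
).to("cuda")

# If the controlnet was exported with use_pos_embed=False, __init__ now copies the
# transformer's pos_embed into it before inference, with no extra user code needed.
control_image = load_image("https://example.com/canny.png")  # placeholder conditioning image
image = pipe("a scenic mountain lake", control_image=control_image).images[0]
```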

src/diffusers/pipelines/flux/pipeline_flux.py

Lines changed: 7 additions & 8 deletions
@@ -554,7 +554,7 @@ def __call__(
         height: Optional[int] = None,
         width: Optional[int] = None,
         num_inference_steps: int = 28,
-        timesteps: List[int] = None,
+        sigmas: Optional[List[float]] = None,
         guidance_scale: float = 3.5,
         num_images_per_prompt: Optional[int] = 1,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
@@ -585,10 +585,10 @@
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                 expense of slower inference.
-            timesteps (`List[int]`, *optional*):
-                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
-                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
-                passed will be used. Must be in descending order.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
             guidance_scale (`float`, *optional*, defaults to 7.0):
                 Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                 `guidance_scale` is defined as `w` of equation 2. of [Imagen
@@ -699,7 +699,7 @@ def __call__(
         )

         # 5. Prepare timesteps
-        sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
+        sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas
         image_seq_len = latents.shape[1]
         mu = calculate_shift(
             image_seq_len,
@@ -712,8 +712,7 @@ def __call__(
             self.scheduler,
             num_inference_steps,
             device,
-            timesteps,
-            sigmas,
+            sigmas=sigmas,
             mu=mu,
         )
         num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
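After this change, callers pass a custom `sigmas` schedule instead of the removed `timesteps` argument; leaving it unset keeps the previous default linspace schedule. A minimal sketch of the new call path, where the model id, prompt, and schedule are just examples:

```python
import numpy as np
import torch
from diffusers import FluxPipeline

pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16
).to("cuda")

# Custom sigma schedule forwarded to retrieve_timesteps / the scheduler;
# omit `sigmas` to fall back to np.linspace(1.0, 1 / steps, steps).
sigmas = np.linspace(1.0, 1 / 20, 20).tolist()
image = pipe("a photo of an astronaut riding a horse", num_inference_steps=20, sigmas=sigmas).images[0]
image.save("flux_custom_sigmas.png")
```

The identical signature change in `pipeline_flux_control.py` below works the same way.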

src/diffusers/pipelines/flux/pipeline_flux_control.py

Lines changed: 7 additions & 8 deletions
@@ -621,7 +621,7 @@ def __call__(
         height: Optional[int] = None,
         width: Optional[int] = None,
         num_inference_steps: int = 28,
-        timesteps: List[int] = None,
+        sigmas: Optional[List[float]] = None,
         guidance_scale: float = 3.5,
         num_images_per_prompt: Optional[int] = 1,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
@@ -660,10 +660,10 @@
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                 expense of slower inference.
-            timesteps (`List[int]`, *optional*):
-                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
-                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
-                passed will be used. Must be in descending order.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
             guidance_scale (`float`, *optional*, defaults to 7.0):
                 Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                 `guidance_scale` is defined as `w` of equation 2. of [Imagen
@@ -799,7 +799,7 @@ def __call__(
         )

         # 5. Prepare timesteps
-        sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
+        sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas
         image_seq_len = latents.shape[1]
         mu = calculate_shift(
             image_seq_len,
@@ -812,8 +812,7 @@ def __call__(
             self.scheduler,
             num_inference_steps,
             device,
-            timesteps,
-            sigmas,
+            sigmas=sigmas,
             mu=mu,
         )
         num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
