
Commit a5f51bb

Merge branch 'main' into pyramid-attention-broadcast
2 parents 37d2366 + d2e5cb3 commit a5f51bb

38 files changed: +402 -452 lines

.github/workflows/nightly_tests.yml

Lines changed: 56 additions & 0 deletions
@@ -180,6 +180,62 @@ jobs:
           pip install slack_sdk tabulate
           python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
 
+  run_big_gpu_torch_tests:
+    name: Torch tests on big GPU
+    strategy:
+      fail-fast: false
+      max-parallel: 2
+    runs-on:
+      group: aws-g6e-xlarge-plus
+    container:
+      image: diffusers/diffusers-pytorch-cuda
+      options: --shm-size "16gb" --ipc host --gpus 0
+    steps:
+      - name: Checkout diffusers
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 2
+      - name: NVIDIA-SMI
+        run: nvidia-smi
+      - name: Install dependencies
+        run: |
+          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+          python -m uv pip install -e [quality,test]
+          python -m uv pip install peft@git+https://github.com/huggingface/peft.git
+          pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
+          python -m uv pip install pytest-reportlog
+      - name: Environment
+        run: |
+          python utils/print_env.py
+      - name: Selected Torch CUDA Test on big GPU
+        env:
+          HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
+          # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
+          CUBLAS_WORKSPACE_CONFIG: :16:8
+          BIG_GPU_MEMORY: 40
+        run: |
+          python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
+            -m "big_gpu_with_torch_cuda" \
+            --make-reports=tests_big_gpu_torch_cuda \
+            --report-log=tests_big_gpu_torch_cuda.log \
+            tests/
+      - name: Failure short reports
+        if: ${{ failure() }}
+        run: |
+          cat reports/tests_big_gpu_torch_cuda_stats.txt
+          cat reports/tests_big_gpu_torch_cuda_failures_short.txt
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: torch_cuda_big_gpu_test_reports
+          path: reports
+      - name: Generate Report and Notify Channel
+        if: always()
+        run: |
+          pip install slack_sdk tabulate
+          python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
+
   run_flax_tpu_tests:
     name: Nightly Flax TPU Tests
     runs-on: docker-tpu
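Note: the `-m "big_gpu_with_torch_cuda"` selection above means this job only collects tests carrying that pytest marker. A minimal sketch of such a test (hypothetical test name and body; it assumes the marker is registered in the repository's pytest configuration so strict-marker runs accept it):

    # hypothetical test, shown only to illustrate the marker the workflow selects with -m
    import pytest
    import torch

    @pytest.mark.big_gpu_with_torch_cuda
    def test_runs_only_on_the_big_gpu_runner():
        # the job above provisions a single large-memory GPU (BIG_GPU_MEMORY: 40)
        assert torch.cuda.is_available()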

.github/workflows/ssh-runner.yml

Lines changed: 2 additions & 1 deletion
@@ -4,12 +4,13 @@ on:
   workflow_dispatch:
     inputs:
       runner_type:
-        description: 'Type of runner to test (aws-g6-4xlarge-plus: a10 or aws-g4dn-2xlarge: t4)'
+        description: 'Type of runner to test (aws-g6-4xlarge-plus: a10, aws-g4dn-2xlarge: t4, aws-g6e-xlarge-plus: L40)'
         type: choice
         required: true
         options:
           - aws-g6-4xlarge-plus
           - aws-g4dn-2xlarge
+          - aws-g6e-xlarge-plus
       docker_image:
         description: 'Name of the Docker image'
         required: true

examples/advanced_diffusion_training/train_dreambooth_lora_flux_advanced.py

Lines changed: 3 additions & 11 deletions
@@ -1778,15 +1778,10 @@ def load_model_hook(models, input_dir):
     if not args.enable_t5_ti:
         # pure textual inversion - only clip
         if pure_textual_inversion:
-            params_to_optimize = [
-                text_parameters_one_with_lr,
-            ]
+            params_to_optimize = [text_parameters_one_with_lr]
             te_idx = 0
         else:  # regular te training or regular pivotal for clip
-            params_to_optimize = [
-                transformer_parameters_with_lr,
-                text_parameters_one_with_lr,
-            ]
+            params_to_optimize = [transformer_parameters_with_lr, text_parameters_one_with_lr]
             te_idx = 1
     elif args.enable_t5_ti:
         # pivotal tuning of clip & t5
@@ -1809,9 +1804,7 @@ def load_model_hook(models, input_dir):
             ]
             te_idx = 1
         else:
-            params_to_optimize = [
-                transformer_parameters_with_lr,
-            ]
+            params_to_optimize = [transformer_parameters_with_lr]
 
     # Optimizer creation
     if not (args.optimizer.lower() == "prodigy" or args.optimizer.lower() == "adamw"):
@@ -1871,7 +1864,6 @@ def load_model_hook(models, input_dir):
         params_to_optimize[-1]["lr"] = args.learning_rate
         optimizer = optimizer_class(
             params_to_optimize,
-            lr=args.learning_rate,
             betas=(args.adam_beta1, args.adam_beta2),
             beta3=args.prodigy_beta3,
             weight_decay=args.adam_weight_decay,
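Note: dropping `lr=args.learning_rate` from the Prodigy constructor (here and in the scripts below) is consistent with the per-parameter-group `"lr"` entries these scripts attach to `params_to_optimize`, note the `params_to_optimize[-1]["lr"] = args.learning_rate` line kept just above the constructor. A standalone sketch of that pattern, assuming `prodigyopt` is installed; the modules and values are placeholders:

    # sketch only: learning rates live in the param groups, not in the constructor call
    import torch
    import prodigyopt

    transformer = torch.nn.Linear(8, 8)    # stand-in for the real transformer
    text_encoder = torch.nn.Linear(8, 8)   # stand-in for the text encoder

    params_to_optimize = [
        {"params": transformer.parameters(), "lr": 1.0},
        {"params": text_encoder.parameters(), "lr": 1.0},  # Prodigy is typically run with lr around 1.0
    ]

    optimizer = prodigyopt.Prodigy(
        params_to_optimize,
        betas=(0.9, 0.99),
        weight_decay=1e-2,
    )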

examples/advanced_diffusion_training/train_dreambooth_lora_sd15_advanced.py

Lines changed: 1 addition & 5 deletions
@@ -1358,10 +1358,7 @@ def load_model_hook(models, input_dir):
             else args.adam_weight_decay,
             "lr": args.text_encoder_lr if args.text_encoder_lr else args.learning_rate,
         }
-        params_to_optimize = [
-            unet_lora_parameters_with_lr,
-            text_lora_parameters_one_with_lr,
-        ]
+        params_to_optimize = [unet_lora_parameters_with_lr, text_lora_parameters_one_with_lr]
     else:
         params_to_optimize = [unet_lora_parameters_with_lr]
 
@@ -1423,7 +1420,6 @@ def load_model_hook(models, input_dir):
 
         optimizer = optimizer_class(
             params_to_optimize,
-            lr=args.learning_rate,
             betas=(args.adam_beta1, args.adam_beta2),
             beta3=args.prodigy_beta3,
             weight_decay=args.adam_weight_decay,

examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py

Lines changed: 0 additions & 1 deletion
@@ -1794,7 +1794,6 @@ def load_model_hook(models, input_dir):
 
         optimizer = optimizer_class(
             params_to_optimize,
-            lr=args.learning_rate,
             betas=(args.adam_beta1, args.adam_beta2),
             beta3=args.prodigy_beta3,
             weight_decay=args.adam_weight_decay,

examples/cogvideo/train_cogvideox_image_to_video_lora.py

Lines changed: 0 additions & 1 deletion
@@ -947,7 +947,6 @@ def get_optimizer(args, params_to_optimize, use_deepspeed: bool = False):
 
         optimizer = optimizer_class(
             params_to_optimize,
-            lr=args.learning_rate,
             betas=(args.adam_beta1, args.adam_beta2),
             beta3=args.prodigy_beta3,
             weight_decay=args.adam_weight_decay,

examples/cogvideo/train_cogvideox_lora.py

Lines changed: 0 additions & 1 deletion
@@ -969,7 +969,6 @@ def get_optimizer(args, params_to_optimize, use_deepspeed: bool = False):
 
         optimizer = optimizer_class(
             params_to_optimize,
-            lr=args.learning_rate,
             betas=(args.adam_beta1, args.adam_beta2),
             beta3=args.prodigy_beta3,
             weight_decay=args.adam_weight_decay,

examples/dreambooth/train_dreambooth_flux.py

Lines changed: 1 addition & 5 deletions
@@ -1226,10 +1226,7 @@ def load_model_hook(models, input_dir):
             "weight_decay": args.adam_weight_decay_text_encoder,
             "lr": args.text_encoder_lr if args.text_encoder_lr else args.learning_rate,
         }
-        params_to_optimize = [
-            transformer_parameters_with_lr,
-            text_parameters_one_with_lr,
-        ]
+        params_to_optimize = [transformer_parameters_with_lr, text_parameters_one_with_lr]
     else:
         params_to_optimize = [transformer_parameters_with_lr]
 
@@ -1291,7 +1288,6 @@ def load_model_hook(models, input_dir):
 
         optimizer = optimizer_class(
             params_to_optimize,
-            lr=args.learning_rate,
             betas=(args.adam_beta1, args.adam_beta2),
             beta3=args.prodigy_beta3,
             weight_decay=args.adam_weight_decay,

examples/dreambooth/train_dreambooth_lora_flux.py

Lines changed: 1 addition & 5 deletions
@@ -1335,10 +1335,7 @@ def load_model_hook(models, input_dir):
             "weight_decay": args.adam_weight_decay_text_encoder,
             "lr": args.text_encoder_lr if args.text_encoder_lr else args.learning_rate,
         }
-        params_to_optimize = [
-            transformer_parameters_with_lr,
-            text_parameters_one_with_lr,
-        ]
+        params_to_optimize = [transformer_parameters_with_lr, text_parameters_one_with_lr]
     else:
         params_to_optimize = [transformer_parameters_with_lr]
 
@@ -1400,7 +1397,6 @@ def load_model_hook(models, input_dir):
 
         optimizer = optimizer_class(
             params_to_optimize,
-            lr=args.learning_rate,
             betas=(args.adam_beta1, args.adam_beta2),
             beta3=args.prodigy_beta3,
             weight_decay=args.adam_weight_decay,

examples/dreambooth/train_dreambooth_lora_sd3.py

Lines changed: 0 additions & 1 deletion
@@ -1468,7 +1468,6 @@ def load_model_hook(models, input_dir):
 
         optimizer = optimizer_class(
             params_to_optimize,
-            lr=args.learning_rate,
             betas=(args.adam_beta1, args.adam_beta2),
             beta3=args.prodigy_beta3,
             weight_decay=args.adam_weight_decay,

examples/dreambooth/train_dreambooth_lora_sdxl.py

Lines changed: 0 additions & 1 deletion
@@ -1402,7 +1402,6 @@ def load_model_hook(models, input_dir):
 
         optimizer = optimizer_class(
             params_to_optimize,
-            lr=args.learning_rate,
             betas=(args.adam_beta1, args.adam_beta2),
             beta3=args.prodigy_beta3,
             weight_decay=args.adam_weight_decay,

examples/dreambooth/train_dreambooth_sd3.py

Lines changed: 0 additions & 1 deletion
@@ -1328,7 +1328,6 @@ def load_model_hook(models, input_dir):
 
         optimizer = optimizer_class(
             params_to_optimize,
-            lr=args.learning_rate,
             betas=(args.adam_beta1, args.adam_beta2),
             beta3=args.prodigy_beta3,
             weight_decay=args.adam_weight_decay,

examples/research_projects/flux_lora_quantization/train_dreambooth_lora_flux_miniature.py

Lines changed: 31 additions & 14 deletions
@@ -349,14 +349,19 @@ def parse_args(input_args=None):
         "--optimizer",
         type=str,
         default="AdamW",
-        help=('The optimizer type to use. Choose between ["AdamW", "prodigy"]'),
+        choices=["AdamW", "Prodigy", "AdEMAMix"],
     )
 
     parser.add_argument(
         "--use_8bit_adam",
         action="store_true",
         help="Whether or not to use 8-bit Adam from bitsandbytes. Ignored if optimizer is not set to AdamW",
     )
+    parser.add_argument(
+        "--use_8bit_ademamix",
+        action="store_true",
+        help="Whether or not to use 8-bit AdEMAMix from bitsandbytes.",
+    )
 
     parser.add_argument(
         "--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam and Prodigy optimizers."
@@ -820,16 +825,15 @@ def load_model_hook(models, input_dir):
         params_to_optimize = [transformer_parameters_with_lr]
 
     # Optimizer creation
-    if not (args.optimizer.lower() == "prodigy" or args.optimizer.lower() == "adamw"):
+    if args.use_8bit_adam and not args.optimizer.lower() == "adamw":
         logger.warning(
-            f"Unsupported choice of optimizer: {args.optimizer}.Supported optimizers include [adamW, prodigy]."
-            "Defaulting to adamW"
+            f"use_8bit_adam is ignored when optimizer is not set to 'AdamW'. Optimizer was "
+            f"set to {args.optimizer.lower()}"
         )
-        args.optimizer = "adamw"
 
-    if args.use_8bit_adam and not args.optimizer.lower() == "adamw":
+    if args.use_8bit_ademamix and not args.optimizer.lower() == "ademamix":
         logger.warning(
-            f"use_8bit_adam is ignored when optimizer is not set to 'AdamW'. Optimizer was "
+            f"use_8bit_ademamix is ignored when optimizer is not set to 'AdEMAMix'. Optimizer was "
            f"set to {args.optimizer.lower()}"
         )
 
@@ -853,6 +857,20 @@ def load_model_hook(models, input_dir):
             eps=args.adam_epsilon,
         )
 
+    elif args.optimizer.lower() == "ademamix":
+        try:
+            import bitsandbytes as bnb
+        except ImportError:
+            raise ImportError(
+                "To use AdEMAMix (or its 8bit variant), please install the bitsandbytes library: `pip install -U bitsandbytes`."
+            )
+        if args.use_8bit_ademamix:
+            optimizer_class = bnb.optim.AdEMAMix8bit
+        else:
+            optimizer_class = bnb.optim.AdEMAMix
+
+        optimizer = optimizer_class(params_to_optimize)
+
     if args.optimizer.lower() == "prodigy":
         try:
             import prodigyopt
@@ -868,7 +886,6 @@ def load_model_hook(models, input_dir):
 
         optimizer = optimizer_class(
             params_to_optimize,
-            lr=args.learning_rate,
             betas=(args.adam_beta1, args.adam_beta2),
             beta3=args.prodigy_beta3,
             weight_decay=args.adam_weight_decay,
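Note: the new `ademamix` branch above constructs the bitsandbytes optimizer with library defaults only (no lr, betas, or weight decay forwarded). A standalone sketch of the same selection, assuming a bitsandbytes release that ships `bnb.optim.AdEMAMix` and `AdEMAMix8bit`; the model and flag below are placeholders:

    # sketch of the optimizer selection added above, outside the training script
    import torch
    import bitsandbytes as bnb

    model = torch.nn.Linear(16, 16)   # stand-in for the trainable LoRA parameters
    use_8bit_ademamix = True          # mirrors the new --use_8bit_ademamix flag

    optimizer_class = bnb.optim.AdEMAMix8bit if use_8bit_ademamix else bnb.optim.AdEMAMix
    optimizer = optimizer_class(model.parameters())  # defaults only, as in the diff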
@@ -1020,12 +1037,12 @@ def get_sigmas(timesteps, n_dim=4, dtype=torch.float32):
                 model_input = (model_input - vae_config_shift_factor) * vae_config_scaling_factor
                 model_input = model_input.to(dtype=weight_dtype)
 
-                vae_scale_factor = 2 ** (len(vae_config_block_out_channels))
+                vae_scale_factor = 2 ** (len(vae_config_block_out_channels) - 1)
 
                 latent_image_ids = FluxPipeline._prepare_latent_image_ids(
                     model_input.shape[0],
-                    model_input.shape[2],
-                    model_input.shape[3],
+                    model_input.shape[2] // 2,
+                    model_input.shape[3] // 2,
                     accelerator.device,
                     weight_dtype,
                 )
@@ -1059,7 +1076,7 @@ def get_sigmas(timesteps, n_dim=4, dtype=torch.float32):
                 )
 
                 # handle guidance
-                if transformer.config.guidance_embeds:
+                if unwrap_model(transformer).config.guidance_embeds:
                     guidance = torch.tensor([args.guidance_scale], device=accelerator.device)
                     guidance = guidance.expand(model_input.shape[0])
                 else:
@@ -1082,8 +1099,8 @@ def get_sigmas(timesteps, n_dim=4, dtype=torch.float32):
                 )[0]
                 model_pred = FluxPipeline._unpack_latents(
                     model_pred,
-                    height=int(model_input.shape[2] * vae_scale_factor / 2),
-                    width=int(model_input.shape[3] * vae_scale_factor / 2),
+                    height=model_input.shape[2] * vae_scale_factor,
+                    width=model_input.shape[3] * vae_scale_factor,
                     vae_scale_factor=vae_scale_factor,
                 )
 
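Note: the two fixes above are mutually consistent. With `vae_scale_factor = 2 ** (len(block_out_channels) - 1)` (8 for a typical four-stage VAE config), the pixel size is the latent size times the scale factor, and the latent image ids are built on the 2x2-packed latent grid, hence the `// 2`. A quick arithmetic check with assumed values (512x512 training resolution, standard four-stage VAE config):

    # sanity check of the bookkeeping above; the config and resolution are assumptions
    block_out_channels = (128, 256, 512, 512)               # typical Flux/SD VAE config
    vae_scale_factor = 2 ** (len(block_out_channels) - 1)   # 8: three downsampling stages

    pixel_height = 512
    latent_height = pixel_height // vae_scale_factor        # 64 latent rows
    packed_height = latent_height // 2                      # 32 rows of 2x2-packed patches

    # _unpack_latents now receives the true pixel size again
    assert latent_height * vae_scale_factor == pixel_height
    print(vae_scale_factor, latent_height, packed_height)   # -> 8 64 32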
examples/research_projects/scheduled_huber_loss_training/dreambooth/train_dreambooth_lora_sdxl.py

Lines changed: 0 additions & 1 deletion
@@ -1475,7 +1475,6 @@ def load_model_hook(models, input_dir):
 
         optimizer = optimizer_class(
             params_to_optimize,
-            lr=args.learning_rate,
             betas=(args.adam_beta1, args.adam_beta2),
             beta3=args.prodigy_beta3,
             weight_decay=args.adam_weight_decay,

src/diffusers/models/autoencoders/autoencoder_kl_allegro.py

Lines changed: 2 additions & 8 deletions
@@ -1091,8 +1091,6 @@ def forward(
         sample_posterior: bool = False,
         return_dict: bool = True,
         generator: Optional[torch.Generator] = None,
-        encoder_local_batch_size: int = 2,
-        decoder_local_batch_size: int = 2,
     ) -> Union[DecoderOutput, torch.Tensor]:
         r"""
         Args:
@@ -1103,18 +1101,14 @@ def forward(
                 Whether or not to return a [`DecoderOutput`] instead of a plain tuple.
             generator (`torch.Generator`, *optional*):
                 PyTorch random number generator.
-            encoder_local_batch_size (`int`, *optional*, defaults to 2):
-                Local batch size for the encoder's batch inference.
-            decoder_local_batch_size (`int`, *optional*, defaults to 2):
-                Local batch size for the decoder's batch inference.
         """
         x = sample
-        posterior = self.encode(x, local_batch_size=encoder_local_batch_size).latent_dist
+        posterior = self.encode(x).latent_dist
         if sample_posterior:
             z = posterior.sample(generator=generator)
         else:
             z = posterior.mode()
-        dec = self.decode(z, local_batch_size=decoder_local_batch_size).sample
+        dec = self.decode(z).sample
 
         if not return_dict:
             return (dec,)
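Note: with the two `local_batch_size` arguments gone, `forward` simply delegates to `encode`/`decode` with their defaults. A hedged usage sketch of the new signature; the checkpoint id and tensor shape are assumptions for illustration, not taken from this diff:

    # illustrative only: forward() now takes just sample, sample_posterior, return_dict, generator
    import torch
    from diffusers import AutoencoderKLAllegro

    vae = AutoencoderKLAllegro.from_pretrained("rhymes-ai/Allegro", subfolder="vae")  # assumed repo id
    video = torch.randn(1, 3, 24, 256, 256)  # (batch, channels, frames, height, width), illustrative

    with torch.no_grad():
        reconstruction = vae(video, sample_posterior=True, return_dict=False)[0]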
