From 4a9f4561d1109a78c267bca607357e1d249aff71 Mon Sep 17 00:00:00 2001 From: jmarin Date: Mon, 26 Feb 2024 14:33:12 +0100 Subject: [PATCH] Correct when to set_device in ddp --- beginner_source/ddp_series_multigpu.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/beginner_source/ddp_series_multigpu.rst b/beginner_source/ddp_series_multigpu.rst index 2d294c97930..4a735af56ed 100644 --- a/beginner_source/ddp_series_multigpu.rst +++ b/beginner_source/ddp_series_multigpu.rst @@ -78,6 +78,8 @@ Imports Constructing the process group ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +- First, before initializing the process group, call `set_device `__, + which sets the default GPU for each process. This is important to prevent hangs or excessive memory utilization on `GPU:0`. - The process group can be initialized by TCP (default) or from a shared file-system. Read more on `process group initialization `__ @@ -85,8 +87,6 @@ Constructing the process group initializes the distributed process group. - Read more about `choosing a DDP backend `__ -- `set_device `__ - sets the default GPU for each process. This is important to prevent hangs or excessive memory utilization on `GPU:0` .. code-block:: diff @@ -98,8 +98,9 @@ Constructing the process group + """ + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = "12355" - + init_process_group(backend="nccl", rank=rank, world_size=world_size) + torch.cuda.set_device(rank) + + init_process_group(backend="nccl", rank=rank, world_size=world_size) + Constructing the DDP model