From 51f640eff56851cdd04016e3a2dc314c72827ef3 Mon Sep 17 00:00:00 2001
From: David Berard
Date: Fri, 11 Nov 2022 19:25:33 -0800
Subject: [PATCH 1/4] Update FSDP tutorial

* rename default_auto_wrap_policy -> size_based_auto_wrap_policy
* import functools
* indentation
---
 intermediate_source/FSDP_tutorial.rst | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/intermediate_source/FSDP_tutorial.rst b/intermediate_source/FSDP_tutorial.rst
index ecfc0dc4c7c..b2d74b2c202 100644
--- a/intermediate_source/FSDP_tutorial.rst
+++ b/intermediate_source/FSDP_tutorial.rst
@@ -67,6 +67,7 @@ We add the following code snippets to a python script “FSDP_mnist.py”.
     # Based on: https://github.com/pytorch/examples/blob/master/mnist/main.py
     import os
     import argparse
+    import functools
     import torch
     import torch.nn as nn
     import torch.nn.functional as F
@@ -82,14 +83,13 @@ We add the following code snippets to a python script “FSDP_mnist.py”.
     from torch.utils.data.distributed import DistributedSampler
     from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
     from torch.distributed.fsdp.fully_sharded_data_parallel import (
-        FullyShardedDataParallel as FSDP,
-        CPUOffload,
-        BackwardPrefetch,
+        CPUOffload,
+        BackwardPrefetch,
     )
     from torch.distributed.fsdp.wrap import (
-        default_auto_wrap_policy,
-        enable_wrap,
-        wrap,
+        size_based_auto_wrap_policy,
+        enable_wrap,
+        wrap,
     )
 
 1.3 Distributed training setup. As we mentioned FSDP is a type of data parallelism which requires a distributed training environment, so here we use two helper functions to initialize the processes for distributed training and clean up.
@@ -196,7 +196,7 @@ We add the following code snippets to a python script “FSDP_mnist.py”.
     transform=transforms.Compose([
         transforms.ToTensor(),
         transforms.Normalize((0.1307,), (0.3081,))
-        ])
+    ])
 
     dataset1 = datasets.MNIST('../data', train=True, download=True,
                         transform=transform)
@@ -217,7 +217,7 @@ We add the following code snippets to a python script “FSDP_mnist.py”.
     train_loader = torch.utils.data.DataLoader(dataset1,**train_kwargs)
     test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)
     my_auto_wrap_policy = functools.partial(
-            default_auto_wrap_policy, min_num_params=100
+            size_based_auto_wrap_policy, min_num_params=100
         )
     torch.cuda.set_device(rank)
@@ -248,9 +248,9 @@ We add the following code snippets to a python script “FSDP_mnist.py”.
     # use a barrier to make sure training is done on all ranks
     dist_barrier()
     # state_dict for FSDP model is only available on Nightlies for now
-        States = model.state_dict()
-        if rank == 0:
-            torch.save(states, "mnist_cnn.pt")
+    states = model.state_dict()
+    if rank == 0:
+        torch.save(states, "mnist_cnn.pt")
 
     cleanup()
@@ -343,7 +343,7 @@ Finding an optimal auto wrap policy is challenging, PyTorch will add auto tuning
 .. code-block:: python
 
     my_auto_wrap_policy = functools.partial(
-        default_auto_wrap_policy, min_num_params=20000
+        size_based_auto_wrap_policy, min_num_params=20000
     )
     torch.cuda.set_device(rank)
     model = Net().to(rank)

From 5b68901d7661ebd1adfeb9c20eebc54d25444091 Mon Sep 17 00:00:00 2001
From: David Berard
Date: Fri, 11 Nov 2022 19:41:13 -0800
Subject: [PATCH 2/4] another indentation fix

---
 intermediate_source/FSDP_tutorial.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/intermediate_source/FSDP_tutorial.rst b/intermediate_source/FSDP_tutorial.rst
index b2d74b2c202..7934c2b43e9 100644
--- a/intermediate_source/FSDP_tutorial.rst
+++ b/intermediate_source/FSDP_tutorial.rst
@@ -217,8 +217,8 @@ We add the following code snippets to a python script “FSDP_mnist.py”.
     train_loader = torch.utils.data.DataLoader(dataset1,**train_kwargs)
     test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)
     my_auto_wrap_policy = functools.partial(
-            size_based_auto_wrap_policy, min_num_params=100
-        )
+        size_based_auto_wrap_policy, min_num_params=100
+    )
     torch.cuda.set_device(rank)

From 49253dd6a5dd8cbbb2cf80d1a0e692e29555db9a Mon Sep 17 00:00:00 2001
From: David Berard
Date: Mon, 14 Nov 2022 09:36:54 -0800
Subject: [PATCH 3/4] Add note about default_auto_wrap_policy -> size_based_auto_wrap_policy

---
 intermediate_source/FSDP_tutorial.rst | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/intermediate_source/FSDP_tutorial.rst b/intermediate_source/FSDP_tutorial.rst
index 7934c2b43e9..76c5374699f 100644
--- a/intermediate_source/FSDP_tutorial.rst
+++ b/intermediate_source/FSDP_tutorial.rst
@@ -62,6 +62,9 @@ We add the following code snippets to a python script “FSDP_mnist.py”.
 
 1.2 Import necessary packages
 
+.. note::
+   This tutorial is intended for pytorch versions 1.12 and later. If you are using 1.11 or earlier, replace all instances of `size_based_auto_wrap_policy` with `default_auto_wrap_policy`.
+
 .. code-block:: python
 
     # Based on: https://github.com/pytorch/examples/blob/master/mnist/main.py

From 14afc05d84af2d02097791b265f682b21193a8dc Mon Sep 17 00:00:00 2001
From: David Berard
Date: Mon, 14 Nov 2022 09:55:26 -0800
Subject: [PATCH 4/4] Use Svetlana's suggestion on wording for the note on size_based_auto_wrap_policy

Co-authored-by: Svetlana Karslioglu
---
 intermediate_source/FSDP_tutorial.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/intermediate_source/FSDP_tutorial.rst b/intermediate_source/FSDP_tutorial.rst
index 76c5374699f..d711637ae34 100644
--- a/intermediate_source/FSDP_tutorial.rst
+++ b/intermediate_source/FSDP_tutorial.rst
@@ -63,7 +63,7 @@ We add the following code snippets to a python script “FSDP_mnist.py”.
 1.2 Import necessary packages
 
 .. note::
-   This tutorial is intended for pytorch versions 1.12 and later. If you are using 1.11 or earlier, replace all instances of `size_based_auto_wrap_policy` with `default_auto_wrap_policy`.
+   This tutorial is intended for PyTorch versions 1.12 and later. If you are using an earlier version, replace all instances of `size_based_auto_wrap_policy` with `default_auto_wrap_policy`.
 
 .. code-block:: python
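
For readers skimming this series, the short sketch below (not part of the patches themselves) shows how the renamed `size_based_auto_wrap_policy` is typically bound with `functools.partial` and handed to the FSDP constructor, which is the pattern the updated tutorial follows. It is a minimal illustration under stated assumptions: the toy model, the single-process process-group setup, and the `auto_wrap_policy` keyword (the PyTorch 1.12+ spelling) are illustrative choices, and on PyTorch 1.11 or earlier the policy would be `default_auto_wrap_policy`, as the note added in patches 3 and 4 points out.

.. code-block:: python

    # Minimal sketch (illustration only, not part of the patch series):
    # wrap a toy model with FSDP using size_based_auto_wrap_policy.
    # Assumes PyTorch >= 1.12 and a single CUDA device (rank 0, world size 1).
    import os
    import functools

    import torch
    import torch.distributed as dist
    import torch.nn as nn
    from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
    from torch.distributed.fsdp.wrap import size_based_auto_wrap_policy


    def main():
        # Single-process stand-in for the tutorial's setup()/cleanup() helpers.
        os.environ.setdefault("MASTER_ADDR", "localhost")
        os.environ.setdefault("MASTER_PORT", "12355")
        dist.init_process_group("nccl", rank=0, world_size=1)
        torch.cuda.set_device(0)

        # Hypothetical toy model; any nn.Module works here.
        model = nn.Sequential(
            nn.Linear(1024, 1024),
            nn.ReLU(),
            nn.Linear(1024, 10),
        ).to(0)

        # Submodules with at least min_num_params parameters are placed in
        # their own FSDP unit; everything else stays in the root unit.
        my_auto_wrap_policy = functools.partial(
            size_based_auto_wrap_policy, min_num_params=100
        )
        model = FSDP(model, auto_wrap_policy=my_auto_wrap_policy)

        # Printing the module tree shows which submodules were auto-wrapped.
        print(model)

        dist.destroy_process_group()


    if __name__ == "__main__":
        main()

Running the sketch on one CUDA-capable GPU prints the wrapped module tree, which makes it easy to check which layers crossed the `min_num_params` threshold.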