
Commit 51f640e

Update FSDP tutorial
* rename default_auto_wrap_policy -> size_based_auto_wrap_policy
* import functools
* fix indentation
1 parent 83d6fec commit 51f640e
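
For readers updating their own scripts against this rename, only the imported identifier changes; a minimal before/after sketch (the exact PyTorch version boundary is not stated in this commit and is left out here):

    # before this commit the tutorial used the old name:
    # from torch.distributed.fsdp.wrap import default_auto_wrap_policy

    # after, only the identifier changes; the policy behaves the same:
    from torch.distributed.fsdp.wrap import size_based_auto_wrap_policy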

File tree

1 file changed: +12 -12 lines


intermediate_source/FSDP_tutorial.rst

Lines changed: 12 additions & 12 deletions
@@ -67,6 +67,7 @@ We add the following code snippets to a python script “FSDP_mnist.py”.
 # Based on: https://github.com/pytorch/examples/blob/master/mnist/main.py
 import os
 import argparse
+import functools
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
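
The new functools import exists so that min_num_params can be pre-bound on the wrap policy with functools.partial before FSDP calls it. A tiny standalone illustration of that mechanic (the predicate below is a stand-in, not the real policy's signature):

    import functools

    def wrap_if_big_enough(numel, min_num_params):
        # stand-in predicate: wrap once the parameter count passes the threshold
        return numel >= min_num_params

    # partial() pre-binds min_num_params, leaving a callable that FSDP-style
    # code can invoke with the remaining argument(s)
    policy = functools.partial(wrap_if_big_enough, min_num_params=100)
    assert policy(512) is True
    assert policy(3) is False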
@@ -82,14 +83,13 @@ We add the following code snippets to a python script “FSDP_mnist.py”.
 from torch.utils.data.distributed import DistributedSampler
 from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
 from torch.distributed.fsdp.fully_sharded_data_parallel import (
-   FullyShardedDataParallel as FSDP,
-   CPUOffload,
-   BackwardPrefetch,
+    CPUOffload,
+    BackwardPrefetch,
 )
 from torch.distributed.fsdp.wrap import (
-   default_auto_wrap_policy,
-   enable_wrap,
-   wrap,
+    size_based_auto_wrap_policy,
+    enable_wrap,
+    wrap,
 )

 1.3 Distributed training setup. As we mentioned, FSDP is a type of data parallelism and requires a distributed training environment, so here we use two helper functions to initialize the processes for distributed training and to clean up afterwards.
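
The "two helper functions" referenced in that context line sit outside this hunk. A minimal sketch of what such setup/cleanup helpers conventionally look like; the rendezvous address/port and the nccl backend are assumptions, not taken from the diff:

    import os
    import torch.distributed as dist

    def setup(rank, world_size):
        # assumed rendezvous settings; adjust for your cluster
        os.environ['MASTER_ADDR'] = 'localhost'
        os.environ['MASTER_PORT'] = '12355'
        # initialize the default process group backing FSDP's collectives
        dist.init_process_group("nccl", rank=rank, world_size=world_size)

    def cleanup():
        dist.destroy_process_group()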
@@ -196,7 +196,7 @@ We add the following code snippets to a python script “FSDP_mnist.py”.
     transform=transforms.Compose([
         transforms.ToTensor(),
         transforms.Normalize((0.1307,), (0.3081,))
-        ])
+    ])

     dataset1 = datasets.MNIST('../data', train=True, download=True,
                               transform=transform)
@@ -217,7 +217,7 @@ We add the following code snippets to a python script “FSDP_mnist.py”.
     train_loader = torch.utils.data.DataLoader(dataset1,**train_kwargs)
     test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)
     my_auto_wrap_policy = functools.partial(
-        default_auto_wrap_policy, min_num_params=100
+        size_based_auto_wrap_policy, min_num_params=100
     )
     torch.cuda.set_device(rank)
     model = Net().to(rank)
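
The my_auto_wrap_policy bound in this hunk is handed to the FSDP constructor just past the visible context. A sketch of that call, assuming the post-rename keyword auto_wrap_policy; the cpu_offload argument is optional and appears here only because CPUOffload is imported above:

    # wrap the model; FSDP consults my_auto_wrap_policy to decide which
    # submodules become their own sharded units
    model = FSDP(model,
                 auto_wrap_policy=my_auto_wrap_policy,
                 cpu_offload=CPUOffload(offload_params=True))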
@@ -248,9 +248,9 @@ We add the following code snippets to a python script “FSDP_mnist.py”.
     # use a barrier to make sure training is done on all ranks
     dist.barrier()
     # state_dict for FSDP model is only available on Nightlies for now
-    States = model.state_dict()
-    if rank == 0:
-    torch.save(states, "mnist_cnn.pt")
+    states = model.state_dict()
+    if rank == 0:
+        torch.save(states, "mnist_cnn.pt")

     cleanup()
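
Because rank 0 saves the full (unsharded) state dict after the barrier, the checkpoint can later be consumed by an ordinary single-process load. A hypothetical usage sketch, assuming the tutorial's Net class:

    import torch

    # plain single-process load of the checkpoint written by rank 0
    model = Net()  # the tutorial's model class
    model.load_state_dict(torch.load("mnist_cnn.pt"))
    model.eval()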
@@ -343,7 +343,7 @@ Finding an optimal auto wrap policy is challenging, PyTorch will add auto tuning
 .. code-block:: python

    my_auto_wrap_policy = functools.partial(
-       default_auto_wrap_policy, min_num_params=20000
+       size_based_auto_wrap_policy, min_num_params=20000
    )
    torch.cuda.set_device(rank)
    model = Net().to(rank)
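
The min_num_params threshold decides which submodules are wrapped as their own FSDP units. A self-contained way to see which layers would clear a 20,000-parameter threshold (the two-layer model below is a stand-in, not the tutorial's Net):

    import torch.nn as nn

    # stand-in model; layer sizes are illustrative only
    net = nn.Sequential(
        nn.Conv2d(1, 32, 3),   # 320 parameters       -> stays in the parent unit
        nn.Linear(9216, 128),  # 1,179,776 parameters -> its own FSDP unit
    )
    for name, module in net.named_children():
        numel = sum(p.numel() for p in module.parameters())
        status = "own FSDP unit" if numel >= 20000 else "parent unit"
        print(f"{name}: {numel} parameters -> {status}")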
