"""

###############################################################################
# Let us consider a simple ``nn.Module`` that contains a list of Linear layers:

import torch
from torch import nn
import time
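
###############################################################################
# A minimal definition of ``SomeModule`` consistent with the snippets below;
# the exact number of layers is arbitrary here:

class SomeModule(torch.nn.Module):
    def __init__(self, size):
        super().__init__()
        # keys in the saved state_dict will look like ``linears.0.weight``
        self.linears = nn.ModuleList([nn.Linear(size, size) for _ in range(10)])

    def forward(self, x):
        for linear in self.linears:
            x = linear(x)
        return x
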
m = SomeModule(1000)
torch.save(m.state_dict(), 'checkpoint.pth')

##############################################################################
# The following snippet demonstrates the use of the ``mmap`` keyword argument
# to ``torch.load``, the ``torch.device()`` context manager, and the ``assign``
# keyword argument to ``nn.Module.load_state_dict()``.

state_dict = torch.load('checkpoint.pth', mmap=True)
with torch.device('meta'):
    meta_m = SomeModule(1000)

meta_m.load_state_dict(state_dict, assign=True)

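##############################################################################
# A quick check (using the ``linears`` naming from the definition above) shows
# that after ``load_state_dict(assign=True)`` the module's parameters are the
# tensors from the checkpoint rather than ``meta`` tensors:

print(meta_m.linears[0].weight.device)  # cpu
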
#############################################################################
# Compare the snippet below to the one above:

state_dict = torch.load('checkpoint.pth')
m = SomeModule(1000)
m.load_state_dict(state_dict)

#################################################################################
# The second example does not use any of the features listed above and will be
# less compute- and memory-efficient for loading a checkpoint. In the following
# sections, we will discuss each of the features in further detail.

#####################################################################################
# Using ``torch.load(mmap=True)``
# -------------------------------
# First, let us consider what happens when we load the checkpoint with ``torch.load``.
# When we save a checkpoint with ``torch.save``, tensor storages are tagged with the
# device they are saved on. With ``torch.load``, tensor storages will be loaded to the device
# they were tagged with (unless this behavior is overridden using the
# ``map_location`` flag). For ease of explanation, let us assume that the tensors
# were saved on CPU. This means that on the first line all tensor storages will be
# loaded into CPU RAM, which can be undesirable when:
#
# * CPU RAM is smaller than the size of the checkpoint.
# * We want to avoid waiting for the entire checkpoint to be loaded into RAM
#   before performing, for example, some per-tensor processing.

start_time = time.time()
state_dict = torch.load('checkpoint.pth')
end_time = time.time()
print(f"loading time without mmap={end_time - start_time}")
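
#################################################################################
# The ``mmap`` keyword argument to ``torch.load`` memory-maps the checkpoint
# file instead, so tensor storages are materialized lazily rather than all
# being read into CPU RAM upfront. A sketch that times the same load with
# ``mmap=True`` for comparison:

start_time = time.time()
state_dict = torch.load('checkpoint.pth', mmap=True)
end_time = time.time()
print(f"loading time with mmap={end_time - start_time}")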
######################################################################################
# As mentioned above, one can use this argument to do per-tensor processing on a
# checkpoint without loading all tensor storages into CPU memory upfront. For example:

def my_special_routine(t, device):
    # this could be a much fancier operation
    return t.to(dtype=torch.bfloat16, device=device)

def my_processing_function(key, device):
    t = state_dict[key]
    processed_t = my_special_routine(t, device)
    del t
    state_dict[key] = processed_t

for key in state_dict.keys():
    device = torch.device('cuda')
    my_processing_function(key, device)
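
##################################################
# A quick sanity check (assuming the ``linears.<i>.weight`` key layout from
# ``SomeModule`` above) confirms that the entries were converted in place:

print(state_dict['linears.0.weight'].dtype)   # torch.bfloat16
print(state_dict['linears.0.weight'].device)  # cuda:0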

##################################################
# Using ``torch.device('meta')``
# ------------------------------
# Next, let's consider the creation of the module.

m = SomeModule(1000)

#######################################################################################################
# This allocates memory for all parameters/buffers and initializes them per
# the default initialization schemes defined in ``SomeModule.__init__()``, which
# is wasteful when we want to load a checkpoint for the following reasons:
#
# * The result of the initialization kernels will be overwritten by
#   ``load_state_dict()`` without ever being used, so initialization is wasteful.
# * We are allocating memory for these parameters/buffers in RAM while
#   ``torch.load`` of the saved state dictionary also allocates memory in RAM
#   for the parameters/buffers in the checkpoint.
#
# In order to solve these two problems, we can use the ``torch.device()``
# context manager with ``device='meta'`` when we instantiate the ``nn.Module()``.
#
# The `torch.device() <https://pytorch.org/docs/main/tensor_attributes.html#torch-device>`_
# context manager makes sure that factory calls will be performed as if they
# were passed the specified ``device`` as an argument. Tensors on ``torch.device('meta')`` do not
# carry data. However, they possess all other metadata a tensor carries, such as
# ``.size()``, ``.stride()``, ``.requires_grad``, and others.

with torch.device('meta'):
    new_m = SomeModule(1000)
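
#######################################################################################################
# A short check of what a ``meta`` tensor looks like (again using the
# ``linears`` naming from above): the parameter has a size, stride, and
# ``requires_grad`` flag, but it lives on the ``meta`` device and holds no data.

p = new_m.linears[0].weight
print(p.device)         # meta
print(p.size())         # torch.Size([1000, 1000])
print(p.requires_grad)  # True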

#############################################################################
# Using ``load_state_dict(assign=True)``
# --------------------------------------
# Next, we consider the loading of the state dictionary.

m.load_state_dict(state_dict)

######################################################################################
# ``nn.Module.load_state_dict()`` is usually implemented via an in-place
# ``param_in_model.copy_(param_in_state_dict)``. This means that the parameter/buffer
# with the corresponding key in the state dictionary is copied into the
# parameter/buffer in the ``nn.Module``.
#
# However, an in-place copy into a tensor on the ``meta`` device is a no-op.
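# For example:

t_meta = torch.empty(3, device='meta')
t_meta.copy_(torch.ones(3))  # a no-op: ``t_meta`` still carries no data
print(t_meta)

###############################################################################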
# In order to avoid this, we can pass the ``assign=True`` keyword argument to
# ``nn.Module.load_state_dict()``. One caveat is that, because ``assign=True``
# assigns the tensors from the state dictionary to the module rather than
# copying them into the module's preallocated tensors, the optimizer must be
# created after the module has been loaded with ``assign=True``:

new_m.load_state_dict(state_dict, assign=True)

opt = torch.optim.SGD(new_m.parameters(), lr=1e-3)

###############################################################################
# Conclusion
# ----------
#
# To recap, in this tutorial we learned about ``torch.load(mmap=True)``, the
# ``torch.device()`` context manager with ``device=meta``, and
# ``nn.Module.load_state_dict(assign=True)`` as well as how these tools could
156
165
# be used to aid when loading a model from a checkpoint.