 # In this tutorial, we train a ``nn.TransformerEncoder`` model on a
-# language modeling task. Please note that this tutorial does not cover
+# causal language modeling task. Please note that this tutorial does not cover
 # the training of `nn.TransformerDecoder <https://pytorch.org/docs/stable/generated/torch.nn.TransformerDecoder.html#torch.nn.TransformerDecoder>`__, as depicted in
 # the right half of the diagram above. The language modeling task is to assign a
 # probability for the likelihood of a given word (or a sequence of words)
@@ -41,8 +41,10 @@
 # Along with the input sequence, a square attention mask is required because the
 # self-attention layers in ``nn.TransformerDecoder`` are only allowed to attend
 # the earlier positions in the sequence. For the language modeling task, any
-# tokens on the future positions should be masked. To produce a probability
-# distribution over output words, the output of the ``nn.TransformerEncoder``
+# tokens on the future positions should be masked. This masking, combined with the fact that
+# the output embeddings are offset by one position, ensures that the
+# predictions for position i can depend only on the known outputs at positions less than i.
+# To produce a probability distribution over output words, the output of the ``nn.TransformerEncoder``
 # model is passed through a linear layer to output unnormalized logits.
 # The log-softmax function isn't applied here due to the later use of
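A minimal, self-contained sketch of what this hunk describes is given below. It is not part of the patch; the tiny dimensions and the ``lm_head`` name are illustrative only. It builds the square causal mask (equivalent to ``torch.nn.Transformer.generate_square_subsequent_mask``), runs a small ``nn.TransformerEncoder`` with it, and leaves the logits unnormalized so that ``nn.CrossEntropyLoss`` can apply the log-softmax internally.

```python
import torch
import torch.nn as nn

def generate_square_subsequent_mask(sz: int) -> torch.Tensor:
    # -inf above the diagonal, 0 elsewhere: position i may attend only to positions <= i.
    return torch.triu(torch.full((sz, sz), float("-inf")), diagonal=1)

seq_len, batch_size, d_model, ntokens = 8, 2, 32, 100

encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=4)
encoder = nn.TransformerEncoder(encoder_layer, num_layers=2)
lm_head = nn.Linear(d_model, ntokens)             # linear layer producing unnormalized logits

src = torch.randn(seq_len, batch_size, d_model)   # stand-in for already-embedded input tokens
mask = generate_square_subsequent_mask(seq_len)
hidden = encoder(src, mask=mask)                  # (seq_len, batch, d_model)
logits = lm_head(hidden)                          # (seq_len, batch, ntokens)

# No explicit log-softmax: nn.CrossEntropyLoss applies it internally.
targets = torch.randint(0, ntokens, (seq_len, batch_size))
loss = nn.CrossEntropyLoss()(logits.view(-1, ntokens), targets.view(-1))
print(loss.item())
```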
intermediate_source/FSDP_tutorial.rst (9 additions, 0 deletions)
@@ -46,6 +46,15 @@ At a high level FSDP works as follows:
 * Run reduce_scatter to sync gradients
 * Discard parameters.

+One way to view FSDP's sharding is to decompose the DDP gradient all-reduce into reduce-scatter and all-gather. Specifically, during the backward pass, FSDP reduces and scatters gradients, ensuring that each rank possesses a shard of the gradients. Then it updates the corresponding shard of the parameters in the optimizer step. Finally, in the subsequent forward pass, it performs an all-gather operation to collect and combine the updated parameter shards.
+
 Here we use a toy model to run training on the MNIST dataset for demonstration purposes. The APIs and logic can be applied to training larger models as well.
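The decomposition described in the added paragraph can be sketched with raw collectives. The snippet below is conceptual rather than FSDP's actual implementation; the ``sharded_sgd_step`` name and the flat tensors are hypothetical, it assumes an already-initialized default process group (e.g. via ``dist.init_process_group``), a flattened parameter and gradient whose length is divisible by the world size, and plain SGD standing in for the real optimizer.

```python
import torch
import torch.distributed as dist

def sharded_sgd_step(flat_param: torch.Tensor, flat_grad: torch.Tensor, lr: float = 0.01) -> None:
    world_size = dist.get_world_size()
    rank = dist.get_rank()
    # Assumes flat_param.numel() is divisible by world_size so all chunks are equal-sized.

    # 1) Backward: reduce-scatter the gradient; each rank keeps one reduced shard.
    grad_chunks = list(flat_grad.chunk(world_size))
    grad_shard = torch.empty_like(grad_chunks[rank])
    dist.reduce_scatter(grad_shard, grad_chunks, op=dist.ReduceOp.SUM)
    grad_shard /= world_size  # average, matching DDP's all-reduce semantics

    # 2) Optimizer step: update only the locally owned parameter shard.
    param_shard = flat_param.chunk(world_size)[rank].clone()
    param_shard -= lr * grad_shard

    # 3) Before the next forward: all-gather updated shards back into the full parameter.
    gathered = [torch.empty_like(param_shard) for _ in range(world_size)]
    dist.all_gather(gathered, param_shard)
    flat_param.copy_(torch.cat(gathered))
```

FSDP performs these steps internally for the modules it wraps; the sketch only mirrors the reduce-scatter / all-gather view given above.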