
Commit d45477c

Merge branch 'main' into docs/autoload
2 parents 9a1b2f7 + d862a95 commit d45477c

16 files changed (+1860 / -269 lines)

.ci/docker/requirements.txt

Lines changed: 3 additions & 1 deletion
@@ -68,5 +68,7 @@ iopath
 pygame==2.6.0
 pycocotools
 semilearn==0.3.2
-torchao==0.0.3
+torchao==0.5.0
 segment_anything==1.0
+torchrec==0.8.0
+fbgemm-gpu==0.8.0

.jenkins/metadata.json

Lines changed: 3 additions & 0 deletions
@@ -28,6 +28,9 @@
     "intermediate_source/model_parallel_tutorial.py": {
         "needs": "linux.16xlarge.nvidia.gpu"
     },
+    "intermediate_source/torchrec_intro_tutorial.py": {
+        "needs": "linux.g5.4xlarge.nvidia.gpu"
+    },
     "recipes_source/torch_export_aoti_python.py": {
         "needs": "linux.g5.4xlarge.nvidia.gpu"
     },

beginner_source/dist_overview.rst

Lines changed: 1 addition & 1 deletion
@@ -35,7 +35,7 @@ Sharding primitives

 ``DTensor`` and ``DeviceMesh`` are primitives used to build parallelism in terms of sharded or replicated tensors on N-dimensional process groups.

-- `DTensor <https://github.com/pytorch/pytorch/blob/main/torch/distributed/_tensor/README.md>`__ represents a tensor that is sharded and/or replicated, and communicates automatically to reshard tensors as needed by operations.
+- `DTensor <https://github.com/pytorch/pytorch/blob/main/torch/distributed/tensor/README.md>`__ represents a tensor that is sharded and/or replicated, and communicates automatically to reshard tensors as needed by operations.
 - `DeviceMesh <https://pytorch.org/docs/stable/distributed.html#devicemesh>`__ abstracts the accelerator device communicators into a multi-dimensional array, which manages the underlying ``ProcessGroup`` instances for collective communications in multi-dimensional parallelisms. Try out our `Device Mesh Recipe <https://pytorch.org/tutorials/recipes/distributed_device_mesh.html>`__ to learn more.

 Communications APIs

en-wordlist.txt

Lines changed: 29 additions & 0 deletions
@@ -619,3 +619,32 @@ warmup
 webp
 wsi
 wsis
+Meta's
+RecSys
+TorchRec
+sharding
+TBE
+dtype
+EBC
+sharder
+hyperoptimized
+DMP
+unsharded
+lookups
+KJTs
+amongst
+async
+everytime
+prototyped
+GBs
+HBM
+gloo
+nccl
+Localhost
+gpu
+torchmetrics
+url
+colab
+sharders
+Criteo
+torchrec

index.rst

Lines changed: 10 additions & 2 deletions
@@ -439,6 +439,13 @@ Welcome to PyTorch Tutorials
    :link: advanced/python_custom_ops.html
    :tags: Extending-PyTorch,Frontend-APIs,C++,CUDA

+.. customcarditem::
+   :header: Compiled Autograd: Capturing a larger backward graph for ``torch.compile``
+   :card_description: Learn how to use compiled autograd to capture a larger backward graph.
+   :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png
+   :link: intermediate/compiled_autograd_tutorial
+   :tags: Model-Optimization,CUDA
+
 .. customcarditem::
    :header: Custom C++ and CUDA Operators
    :card_description: How to extend PyTorch with custom C++ and CUDA operators.

@@ -846,7 +853,7 @@ Welcome to PyTorch Tutorials
    :header: Introduction to TorchRec
    :card_description: TorchRec is a PyTorch domain library built to provide common sparsity & parallelism primitives needed for large-scale recommender systems.
    :image: _static/img/thumbnails/torchrec.png
-   :link: intermediate/torchrec_tutorial.html
+   :link: intermediate/torchrec_intro_tutorial.html
    :tags: TorchRec,Recommender

 .. customcarditem::

@@ -1132,6 +1139,7 @@ Additional Resources
    intermediate/nvfuser_intro_tutorial
    intermediate/ax_multiobjective_nas_tutorial
    intermediate/torch_compile_tutorial
+   intermediate/compiled_autograd_tutorial
    intermediate/inductor_debug_cpu
    intermediate/scaled_dot_product_attention_tutorial
    beginner/knowledge_distillation_tutorial

@@ -1180,7 +1188,7 @@ Additional Resources
    :hidden:
    :caption: Recommendation Systems

-   intermediate/torchrec_tutorial
+   intermediate/torchrec_intro_tutorial
    advanced/sharding

 .. toctree::
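The TorchRec card above points at the renamed intro tutorial. As a rough orientation only, a minimal sketch of the kind of API that tutorial introduces (assuming torchrec==0.8.0 as pinned in requirements.txt; the table size, feature name, and batch below are made up for illustration):

import torch
from torchrec import EmbeddingBagCollection, EmbeddingBagConfig, KeyedJaggedTensor

# One embedding table, looked up by the sparse feature "f1".
ebc = EmbeddingBagCollection(
    tables=[
        EmbeddingBagConfig(
            name="t1",
            embedding_dim=16,
            num_embeddings=100,
            feature_names=["f1"],
        ),
    ],
    device=torch.device("cpu"),
)

# A KeyedJaggedTensor batch: two examples, each with two IDs for "f1".
kjt = KeyedJaggedTensor(
    keys=["f1"],
    values=torch.tensor([0, 1, 2, 3]),
    lengths=torch.tensor([2, 2]),
)

out = ebc(kjt)          # KeyedTensor of pooled embeddings
print(out["f1"].shape)  # torch.Size([2, 16])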

intermediate_source/compiled_autograd_tutorial.rst

Lines changed: 302 additions & 0 deletions
Large diffs are not rendered by default.
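Since the new tutorial's 302-line diff is not rendered here, a hedged sketch of the workflow it documents (assuming the ``torch._dynamo.config.compiled_autograd`` flag available in recent PyTorch; the model and input are placeholders, not taken from the tutorial file):

import torch

class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(10, 10)

    def forward(self, x):
        return self.linear(x)

model = Model()
x = torch.randn(10)

# Ask dynamo to capture the backward pass as well as the forward.
torch._dynamo.config.compiled_autograd = True

@torch.compile
def train(model, x):
    loss = model(x).sum()
    loss.backward()  # backward graph is captured by compiled autograd

train(model, x)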

intermediate_source/scaled_dot_product_attention_tutorial.py

Lines changed: 5 additions & 5 deletions
@@ -244,7 +244,7 @@ def generate_rand_batch(

 ######################################################################
 # Using SDPA with ``torch.compile``
-# =================================
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 #
 # With the release of PyTorch 2.0, a new feature called
 # ``torch.compile()`` has been introduced, which can provide

@@ -324,9 +324,9 @@ def generate_rand_batch(
 #

 ######################################################################
-# Using SDPA with attn_bias subclasses`
-# ==========================================
-#
+# Using SDPA with attn_bias subclasses
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
 # As of PyTorch 2.3, we have added a new submodule that contains tensor subclasses.
 # Designed to be used with ``torch.nn.functional.scaled_dot_product_attention``.
 # The module is named ``torch.nn.attention.bias`` and contains the following two

@@ -394,7 +394,7 @@ def generate_rand_batch(

 ######################################################################
 # Conclusion
-# ==========
+# ~~~~~~~~~~~
 #
 # In this tutorial, we have demonstrated the basic usage of
 # ``torch.nn.functional.scaled_dot_product_attention``. We have shown how
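For context on the retitled ``attn_bias`` section, a minimal sketch of the ``torch.nn.attention.bias`` subclasses it describes (assuming PyTorch >= 2.3; the shapes are illustrative and not from the tutorial):

import torch
import torch.nn.functional as F
from torch.nn.attention.bias import causal_lower_right

batch, heads, q_len, kv_len, head_dim = 2, 8, 6, 10, 64
query = torch.randn(batch, heads, q_len, head_dim)
key = torch.randn(batch, heads, kv_len, head_dim)
value = torch.randn(batch, heads, kv_len, head_dim)

# CausalBias tensor subclass: the causal mask is aligned to the lower-right
# corner, which matters when q_len != kv_len (e.g. decoding with a KV cache).
attn_bias = causal_lower_right(q_len, kv_len)

out = F.scaled_dot_product_attention(query, key, value, attn_mask=attn_bias)
print(out.shape)  # torch.Size([2, 8, 6, 64])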
