
Commit cf0aca5

Merge branch 'main' into angelayi/aoti_fix
2 parents (805a1b2, d4c1e74), commit cf0aca5

30 files changed: +867 additions, -407 deletions

.ci/docker/requirements.txt

Lines changed: 2 additions & 2 deletions
@@ -28,8 +28,8 @@ tensorboard
 jinja2==3.1.3
 pytorch-lightning
 torchx
-torchrl==0.5.0
-tensordict==0.5.0
+torchrl==0.6.0
+tensordict==0.6.0
 ax-platform>=0.4.0
 nbformat>=5.9.2
 datasets

.jenkins/build.sh

Lines changed: 4 additions & 2 deletions
@@ -22,8 +22,10 @@ sudo apt-get install -y pandoc
 #Install PyTorch Nightly for test.
 # Nightly - pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu102/torch_nightly.html
 # Install 2.5 to merge all 2.4 PRs - uncomment to install nightly binaries (update the version as needed).
-# pip uninstall -y torch torchvision torchaudio torchtext torchdata
-# pip3 install torch==2.5.0 torchvision torchaudio --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu124
+sudo pip uninstall -y torch torchvision torchaudio torchtext torchdata
+sudo pip3 install torch==2.6.0 torchvision --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu124
+sudo pip uninstall -y fbgemm-gpu torchrec
+sudo pip3 install fbgemm-gpu==1.1.0 torchrec==1.0.0 --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu124

 # Install two language tokenizers for Translation with TorchText tutorial
 python -m spacy download en_core_web_sm

.jenkins/validate_tutorials_built.py

Lines changed: 4 additions & 1 deletion
@@ -51,7 +51,10 @@
     "intermediate_source/flask_rest_api_tutorial",
     "intermediate_source/text_to_speech_with_torchaudio",
     "intermediate_source/tensorboard_profiler_tutorial", # reenable after 2.0 release.
-    "intermediate_source/torch_export_tutorial" # reenable after 2940 is fixed.
+    "intermediate_source/torch_export_tutorial", # reenable after 2940 is fixed.
+    "advanced_source/pendulum",
+    "beginner_source/onnx/export_simple_model_to_onnx_tutorial",
+    "beginner_source/onnx/onnx_registry_tutorial"
 ]

 def tutorial_source_dirs() -> List[Path]:

.lycheeignore

Lines changed: 6 additions & 0 deletions
@@ -12,3 +12,9 @@ https://pytorch.org/tutorials/beginner/colab/n

 # Ignore local host link from intermediate_source/tensorboard_tutorial.rst
 http://localhost:6006
+
+# Ignore local host link from recipes_source/deployment_with_flask.rst
+http://localhost:5000/predict
+
+# Ignore local host link from advanced_source/cpp_frontend.rst
+https://www.uber.com/blog/deep-neuroevolution/

CONTRIBUTING.md

Lines changed: 2 additions & 3 deletions
@@ -218,9 +218,8 @@ described in the preceding sections:
 - [NLP From Scratch: Generating Names with a Character-Level RNN
   Tutorial](https://pytorch.org/tutorials/intermediate/char_rnn_generation_tutorial.html)

-If you are creating a recipe, we recommend that you use [this
-template](https://github.com/pytorch/tutorials/blob/tutorials_refresh/recipes_source/recipes/example_recipe.py)
-as a guide.
+If you are creating a recipe, [this is a good
+example.](https://github.com/pytorch/tutorials/blob/main/recipes_source/recipes/what_is_state_dict.py)


 # Submission Process #

advanced_source/cpp_autograd.rst

Lines changed: 4 additions & 4 deletions
@@ -255,9 +255,9 @@ Out:
 [ CPUFloatType{3,4} ]

 Please see the documentation for ``torch::autograd::backward``
-(`link <https://pytorch.org/cppdocs/api/function_namespacetorch_1_1autograd_1afa9b5d4329085df4b6b3d4b4be48914b.html>`_)
+(`link <https://pytorch.org/cppdocs/api/function_namespacetorch_1_1autograd_1a1403bf65b1f4f8c8506a9e6e5312d030.html>`_)
 and ``torch::autograd::grad``
-(`link <https://pytorch.org/cppdocs/api/function_namespacetorch_1_1autograd_1a1e03c42b14b40c306f9eb947ef842d9c.html>`_)
+(`link <https://pytorch.org/cppdocs/api/function_namespacetorch_1_1autograd_1ab9fa15dc09a8891c26525fb61d33401a.html>`_)
 for more information on how to use them.

 Using custom autograd function in C++
@@ -394,9 +394,9 @@ C++ using the following table:
 +--------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
 | ``torch.autograd.backward``    | ``torch::autograd::backward``                                                                                                                                          |
 +================================+========================================================================================================================================================================+
-| ``torch.autograd.backward``    | ``torch::autograd::backward`` (`link <https://pytorch.org/cppdocs/api/function_namespacetorch_1_1autograd_1afa9b5d4329085df4b6b3d4b4be48914b.html>`_)                 |
+| ``torch.autograd.backward``    | ``torch::autograd::backward`` (`link <https://pytorch.org/cppdocs/api/function_namespacetorch_1_1autograd_1a1403bf65b1f4f8c8506a9e6e5312d030.html>`_)                 |
 +--------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| ``torch.autograd.grad``        | ``torch::autograd::grad`` (`link <https://pytorch.org/cppdocs/api/function_namespacetorch_1_1autograd_1a1e03c42b14b40c306f9eb947ef842d9c.html>`_)                     |
+| ``torch.autograd.grad``        | ``torch::autograd::grad`` (`link <https://pytorch.org/cppdocs/api/function_namespacetorch_1_1autograd_1ab9fa15dc09a8891c26525fb61d33401a.html>`_)                     |
 +--------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
 | ``torch.Tensor.detach``        | ``torch::Tensor::detach`` (`link <https://pytorch.org/cppdocs/api/classat_1_1_tensor.html#_CPPv4NK2at6Tensor6detachEv>`_)                                             |
 +--------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+

advanced_source/cpp_custom_ops.rst

Lines changed: 4 additions & 0 deletions
@@ -19,6 +19,10 @@ Custom C++ and CUDA Operators
 * PyTorch 2.4 or later
 * Basic understanding of C++ and CUDA programming

+.. note::
+
+   This tutorial will also work on AMD ROCm with no additional modifications.
+
 PyTorch offers a large library of operators that work on Tensors (e.g. torch.add, torch.sum, etc).
 However, you may wish to bring a new custom operator to PyTorch. This tutorial demonstrates the
 blessed path to authoring a custom operator written in C++/CUDA.

advanced_source/cpp_frontend.rst

Lines changed: 2 additions & 2 deletions
@@ -57,7 +57,7 @@ the right tool for the job. Examples for such environments include:
   Multiprocessing is an alternative, but not as scalable and has significant
   shortcomings. C++ has no such constraints and threads are easy to use and
   create. Models requiring heavy parallelization, like those used in `Deep
-  Neuroevolution <https://eng.uber.com/deep-neuroevolution/>`_, can benefit from
+  Neuroevolution <https://www.uber.com/blog/deep-neuroevolution/>`_, can benefit from
   this.
 - **Existing C++ Codebases**: You may be the owner of an existing C++
   application doing anything from serving web pages in a backend server to
@@ -662,7 +662,7 @@ Defining the DCGAN Modules
 We now have the necessary background and introduction to define the modules for
 the machine learning task we want to solve in this post. To recap: our task is
 to generate images of digits from the `MNIST dataset
-<http://yann.lecun.com/exdb/mnist/>`_. We want to use a `generative adversarial
+<https://huggingface.co/datasets/ylecun/mnist>`_. We want to use a `generative adversarial
 network (GAN)
 <https://papers.nips.cc/paper/5423-generative-adversarial-nets.pdf>`_ to solve
 this task. In particular, we'll use a `DCGAN architecture

advanced_source/custom_ops_landing_page.rst

Lines changed: 1 addition & 0 deletions
@@ -23,6 +23,7 @@ You may wish to author a custom operator from Python (as opposed to C++) if:
   respect to ``torch.compile`` and ``torch.export``.
 - you have some Python bindings to C++/CUDA kernels and want those to compose with PyTorch
   subsystems (like ``torch.compile`` or ``torch.autograd``)
+- you are using Python (and not a C++-only environment like AOTInductor).

 Integrating custom C++ and/or CUDA code with PyTorch
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

advanced_source/pendulum.py

Lines changed: 4 additions & 4 deletions
@@ -33,9 +33,9 @@

 In the process, we will touch three crucial components of TorchRL:

-* `environments <https://pytorch.org/rl/reference/envs.html>`__
-* `transforms <https://pytorch.org/rl/reference/envs.html#transforms>`__
-* `models (policy and value function) <https://pytorch.org/rl/reference/modules.html>`__
+* `environments <https://pytorch.org/rl/stable/reference/envs.html>`__
+* `transforms <https://pytorch.org/rl/stable/reference/envs.html#transforms>`__
+* `models (policy and value function) <https://pytorch.org/rl/stable/reference/modules.html>`__

 """
@@ -384,7 +384,7 @@ def _reset(self, tensordict):
 # convenient shortcuts to the content of the output and input spec containers.
 #
 # TorchRL offers multiple :class:`~torchrl.data.TensorSpec`
-# `subclasses <https://pytorch.org/rl/reference/data.html#tensorspec>`_ to
+# `subclasses <https://pytorch.org/rl/stable/reference/data.html#tensorspec>`_ to
 # encode the environment's input and output characteristics.
 #
 # Specs shape

advanced_source/python_custom_ops.py

Lines changed: 6 additions & 0 deletions
@@ -30,6 +30,12 @@
   into the function).
 - Adding training support to an arbitrary Python function

+Use :func:`torch.library.custom_op` to create Python custom operators.
+Use the C++ ``TORCH_LIBRARY`` APIs to create C++ custom operators (these
+work in Python-less environments).
+See the `Custom Operators Landing Page <https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html>`_
+for more details.
+
 Please note that if your operation can be expressed as a composition of
 existing PyTorch operators, then there is usually no need to use the custom operator
 API -- everything (for example ``torch.compile``, training support) should
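
For reference on the paragraph added above, a minimal sketch of what a ``torch.library.custom_op`` operator looks like; the ``mylib::numpy_sin`` name and its NumPy body are assumptions for illustration, not part of this commit:

    import numpy as np
    import torch

    # Wrap a NumPy computation as a PyTorch custom operator (PyTorch 2.4+).
    @torch.library.custom_op("mylib::numpy_sin", mutates_args=())
    def numpy_sin(x: torch.Tensor) -> torch.Tensor:
        # The body runs outside the dispatcher, so torch.compile treats it as opaque.
        return torch.from_numpy(np.sin(x.cpu().numpy())).to(x.device)

    # A fake (meta) implementation lets torch.compile / torch.export trace shapes.
    @numpy_sin.register_fake
    def _(x):
        return torch.empty_like(x)

    print(numpy_sin(torch.randn(3)))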

beginner_source/blitz/autograd_tutorial.py

Lines changed: 2 additions & 2 deletions
@@ -191,15 +191,15 @@
 # .. math::
 #
 #
-#   J^{T}\cdot \vec{v} = m \cdot \left(\begin{array}{ccc}
+#   J^{T}\cdot \vec{v} = \left(\begin{array}{ccc}
 #    \frac{\partial y_{1}}{\partial x_{1}} & \cdots & \frac{\partial y_{m}}{\partial x_{1}}\\
 #    \vdots & \ddots & \vdots\\
 #    \frac{\partial y_{1}}{\partial x_{n}} & \cdots & \frac{\partial y_{m}}{\partial x_{n}}
 #    \end{array}\right)\left(\begin{array}{c}
 #    \frac{\partial l}{\partial y_{1}}\\
 #    \vdots\\
 #    \frac{\partial l}{\partial y_{m}}
-#    \end{array}\right) = m \cdot \left(\begin{array}{c}
+#    \end{array}\right) = \left(\begin{array}{c}
 #    \frac{\partial l}{\partial x_{1}}\\
 #    \vdots\\
 #    \frac{\partial l}{\partial x_{n}}
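
The corrected formula is the vector-Jacobian product that ``backward(gradient=v)`` computes; a small sketch of the same idea in code, assuming the toy function ``y = 2x`` so the Jacobian is ``2I``:

    import torch

    x = torch.randn(3, requires_grad=True)
    y = x * 2                              # y(x) with Jacobian J = 2*I
    v = torch.tensor([0.1, 1.0, 0.0001])   # plays the role of dl/dy
    y.backward(gradient=v)                 # accumulates J^T . v into x.grad
    print(x.grad)                          # equals 2 * v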

beginner_source/onnx/README.txt

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@ ONNX

 1. intro_onnx.py
     Introduction to ONNX
-    https://pytorch.org/tutorials/onnx/intro_onnx.html
+    https://pytorch.org/tutorials/beginner/onnx/intro_onnx.html

 2. export_simple_model_to_onnx_tutorial.py
     Exporting a PyTorch model to ONNX

beginner_source/pytorch_with_examples.rst

Lines changed: 1 addition & 1 deletion
@@ -149,7 +149,7 @@ which will be optimized during learning.

 In TensorFlow, packages like
 `Keras <https://github.com/fchollet/keras>`__,
-`TensorFlow-Slim <https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/slim>`__,
+`TensorFlow-Slim <https://github.com/google-research/tf-slim>`__,
 and `TFLearn <http://tflearn.org/>`__ provide higher-level abstractions
 over raw computational graphs that are useful for building neural
 networks.

en-wordlist.txt

Lines changed: 7 additions & 1 deletion
@@ -81,6 +81,8 @@ FX
 FX's
 FairSeq
 Fastpath
+FakeTensor
+FakeTensors
 FFN
 FloydHub
 FloydHub's
@@ -368,6 +370,8 @@ downsample
 downsamples
 dropdown
 dtensor
+dtype
+dtypes
 duration
 elementwise
 embeddings
@@ -392,6 +396,8 @@ FlexAttention
 fp
 frontend
 functionalized
+functionalizes
+functionalization
 functorch
 fuser
 geomean
@@ -613,6 +619,7 @@ triton
 uint
 UX
 umap
+unbacked
 uncomment
 uncommented
 underflowing
@@ -649,7 +656,6 @@ RecSys
 TorchRec
 sharding
 TBE
-dtype
 EBC
 sharder
 hyperoptimized

intermediate_source/FSDP_tutorial.rst

Lines changed: 2 additions & 1 deletion
@@ -11,7 +11,7 @@ It also comes with considerable engineering complexity to handle the training of
 `PyTorch FSDP <https://pytorch.org/blog/introducing-pytorch-fully-sharded-data-parallel-api/>`__, released in PyTorch 1.11 makes this easier.

 In this tutorial, we show how to use `FSDP APIs <https://pytorch.org/docs/stable/fsdp.html>`__, for simple MNIST models that can be extended to other larger models such as `HuggingFace BERT models <https://huggingface.co/blog/zero-deepspeed-fairscale>`__,
-`GPT 3 models up to 1T parameters <https://pytorch.medium.com/training-a-1-trillion-parameter-model-with-pytorch-fully-sharded-data-parallel-on-aws-3ac13aa96cff>`__ . The sample DDP MNIST code has been borrowed from `here <https://github.com/yqhu/mnist_examples>`__.
+`GPT 3 models up to 1T parameters <https://pytorch.medium.com/training-a-1-trillion-parameter-model-with-pytorch-fully-sharded-data-parallel-on-aws-3ac13aa96cff>`__ . The sample DDP MNIST code courtesy of `Patrick Hu <https://github.com/yqhu/>`_.


 How FSDP works
@@ -251,6 +251,7 @@ We add the following code snippets to a python script “FSDP_mnist.py”.
     init_end_event.record()

     if rank == 0:
+        init_end_event.synchronize()
         print(f"CUDA event elapsed time: {init_start_event.elapsed_time(init_end_event) / 1000}sec")
         print(f"{model}")
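
The added ``init_end_event.synchronize()`` matters because ``elapsed_time`` is only valid once both events have completed on the GPU. A standalone sketch of the same timing pattern, assumed for illustration rather than taken from the tutorial:

    import torch

    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)

    start.record()
    a = torch.randn(1024, 1024, device="cuda")
    b = torch.randn(1024, 1024, device="cuda")
    c = a @ b                      # some GPU work to be timed
    end.record()

    end.synchronize()              # wait until the end event is actually recorded
    print(f"CUDA event elapsed time: {start.elapsed_time(end) / 1000} sec")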

intermediate_source/ddp_series_minGPT.rst

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@ training <ddp_series_multinode.html>`__ \|\| **minGPT Training**
 Training “real-world” models with DDP
 =====================================

-Authors: `Suraj Subramanian <https://github.com/suraj813>`__
+Authors: `Suraj Subramanian <https://github.com/subramen>`__

 .. grid:: 2

intermediate_source/ddp_series_multinode.rst

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@ training** \|\| `minGPT Training <ddp_series_minGPT.html>`__
 Multinode Training
 ==================

-Authors: `Suraj Subramanian <https://github.com/suraj813>`__
+Authors: `Suraj Subramanian <https://github.com/subramen>`__

 .. grid:: 2

intermediate_source/dist_tuto.rst

Lines changed: 9 additions & 3 deletions
@@ -47,6 +47,7 @@ the following template.
 """run.py:"""
 #!/usr/bin/env python
 import os
+import sys
 import torch
 import torch.distributed as dist
 import torch.multiprocessing as mp
@@ -66,8 +67,12 @@ the following template.
 if __name__ == "__main__":
     world_size = 2
     processes = []
-    mp.set_start_method("spawn")
-    for rank in range(world_size):
+    if "google.colab" in sys.modules:
+        print("Running in Google Colab")
+        mp.get_context("spawn")
+    else:
+        mp.set_start_method("spawn")
+    for rank in range(size):
         p = mp.Process(target=init_process, args=(rank, world_size, run))
         p.start()
         processes.append(p)
@@ -156,7 +161,8 @@ we should not modify the sent tensor nor access the received tensor before ``req
 In other words,

 - writing to ``tensor`` after ``dist.isend()`` will result in undefined behaviour.
-- reading from ``tensor`` after ``dist.irecv()`` will result in undefined behaviour.
+- reading from ``tensor`` after ``dist.irecv()`` will result in undefined
+  behaviour, until ``req.wait()`` has been executed.

 However, after ``req.wait()``
 has been executed we are guaranteed that the communication took place,
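
The reworded bullet describes the usual non-blocking point-to-point pattern; a minimal sketch, assuming the same ``run(rank, size)`` scaffolding the tutorial template uses:

    import torch
    import torch.distributed as dist

    def run(rank, size):
        # Non-blocking send/receive between ranks 0 and 1.
        tensor = torch.zeros(1)
        if rank == 0:
            tensor += 1
            req = dist.isend(tensor=tensor, dst=1)   # do not write to `tensor` yet
        else:
            req = dist.irecv(tensor=tensor, src=0)   # do not read from `tensor` yet
        req.wait()   # after this, the transfer is guaranteed to have completed
        print(f"Rank {rank} has data {tensor[0]}")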

intermediate_source/dynamic_quantization_bert_tutorial.rst

Lines changed: 3 additions & 3 deletions
@@ -138,7 +138,7 @@ the following helper functions: one for converting the text examples
 into the feature vectors; The other one for measuring the F1 score of
 the predicted result.

-The `glue_convert_examples_to_features <https://github.com/huggingface/transformers/blob/master/transformers/data/processors/glue.py>`_ function converts the texts into input features:
+The `glue_convert_examples_to_features <https://github.com/huggingface/transformers/blob/main/src/transformers/data/datasets/glue.py>`_ function converts the texts into input features:

 - Tokenize the input sequences;
 - Insert [CLS] in the beginning;
@@ -147,7 +147,7 @@ The `glue_convert_examples_to_features <https://github.com/huggingface/transform
 - Generate token type ids to indicate whether a token belongs to the
   first sequence or the second sequence.

-The `glue_compute_metrics <https://github.com/huggingface/transformers/blob/master/transformers/data/processors/glue.py>`_ function has the compute metrics with
+The `glue_compute_metrics <https://github.com/huggingface/transformers/blob/main/src/transformers/data/metrics/__init__.py#L60>`_ function has the compute metrics with
 the `F1 score <https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html>`_, which
 can be interpreted as a weighted average of the precision and recall,
 where an F1 score reaches its best value at 1 and worst score at 0. The
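
The F1 score linked above can be computed directly with scikit-learn; a small illustrative example with made-up labels:

    from sklearn.metrics import f1_score

    y_true = [0, 1, 1, 0, 1]
    y_pred = [0, 1, 0, 0, 1]
    # Harmonic mean of precision and recall; best value 1.0, worst 0.0.
    print(f1_score(y_true, y_pred))  # 0.8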
@@ -273,7 +273,7 @@ We load the tokenizer and fine-tuned BERT sequence classifier model
 2.3 Define the tokenize and evaluation function
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

-We reuse the tokenize and evaluation function from `HuggingFace <https://github.com/huggingface/transformers/blob/master/examples/run_glue.py>`_.
+We reuse the tokenize and evaluation function from `HuggingFace <https://github.com/huggingface/transformers/blob/main/examples/legacy/pytorch-lightning/run_glue.py>`_.

 .. code:: python

intermediate_source/inductor_debug_cpu.py

Lines changed: 3 additions & 3 deletions
@@ -19,8 +19,8 @@
 #
 # Meanwhile, you may also find related tutorials about ``torch.compile``
 # around `basic usage <https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html>`_,
-# comprehensive `troubleshooting <https://pytorch.org/docs/stable/dynamo/troubleshooting.html>`_
-# and GPU-specific knowledge like `GPU performance profiling <https://github.com/pytorch/pytorch/blob/main/docs/source/compile/profiling_torch_compile.rst>`_.
+# comprehensive `troubleshooting <https://pytorch.org/docs/stable/torch.compiler_troubleshooting.html>`_
+# and GPU-specific knowledge like `GPU performance profiling <https://pytorch.org/docs/stable/torch.compiler_inductor_profiling.html>`_.
 #
 # We will start debugging with a motivating example that triggers compilation issues and accuracy problems
 # by demonstrating the process of debugging to pinpoint the problems.
@@ -343,7 +343,7 @@ def forward2(self, arg0_1):
     return (neg,)

 ######################################################################
-# For more usage details about Minifier, please refer to `Troubleshooting <https://pytorch.org/docs/stable/dynamo/troubleshooting.html>`_.
+# For more usage details about Minifier, please refer to `Troubleshooting <https://pytorch.org/docs/stable/torch.compiler_troubleshooting.html>`_.


 ######################################################################

intermediate_source/reinforcement_ppo.py

Lines changed: 1 addition & 1 deletion
@@ -639,7 +639,7 @@
 # number of steps (1000, which is our ``env`` horizon).
 # The ``rollout`` method of the ``env`` can take a policy as argument:
 # it will then execute this policy at each step.
-with set_exploration_type(ExplorationType.MEAN), torch.no_grad():
+with set_exploration_type(ExplorationType.DETERMINISTIC), torch.no_grad():
     # execute a rollout with the trained policy
     eval_rollout = env.rollout(1000, policy_module)
     logs["eval reward"].append(eval_rollout["next", "reward"].mean().item())
