diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh
index 31f42fdbd85..c646b8f9a86 100755
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@@ -11,8 +11,9 @@ IMAGE_NAME="$1"
shift
export UBUNTU_VERSION="20.04"
+export CUDA_VERSION="12.4.1"
-export BASE_IMAGE="ubuntu:${UBUNTU_VERSION}"
+export BASE_IMAGE="nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}"
echo "Building ${IMAGE_NAME} Docker image"
docker build \
diff --git a/.ci/docker/common/common_utils.sh b/.ci/docker/common/common_utils.sh
index b20286a4099..c7eabda555d 100644
--- a/.ci/docker/common/common_utils.sh
+++ b/.ci/docker/common/common_utils.sh
@@ -22,5 +22,5 @@ conda_run() {
}
pip_install() {
- as_ci_user conda run -n py_$ANACONDA_PYTHON_VERSION pip install --progress-bar off $*
+ as_ci_user conda run -n py_$ANACONDA_PYTHON_VERSION pip3 install --progress-bar off $*
}
diff --git a/.ci/docker/requirements.txt b/.ci/docker/requirements.txt
index 00cf2f21033..bd3711bfb0e 100644
--- a/.ci/docker/requirements.txt
+++ b/.ci/docker/requirements.txt
@@ -30,8 +30,8 @@ pytorch-lightning
torchx
torchrl==0.5.0
tensordict==0.5.0
-ax-platform>==0.4.0
-nbformat>==5.9.2
+ax-platform>=0.4.0
+nbformat>=5.9.2
datasets
transformers
torchmultimodal-nightly # needs to be updated to stable as soon as it's avaialable
@@ -64,8 +64,8 @@ pyopengl
gymnasium[mujoco]==0.27.0
timm
iopath
-pygame==2.1.2
+pygame==2.6.0
pycocotools
semilearn==0.3.2
torchao==0.0.3
-segment_anything==1.0
\ No newline at end of file
+segment_anything==1.0
diff --git a/.jenkins/metadata.json b/.jenkins/metadata.json
index 4814f9a7d2b..2f1a9933aab 100644
--- a/.jenkins/metadata.json
+++ b/.jenkins/metadata.json
@@ -28,6 +28,9 @@
"intermediate_source/model_parallel_tutorial.py": {
"needs": "linux.16xlarge.nvidia.gpu"
},
+ "recipes_source/torch_export_aoti_python.py": {
+ "needs": "linux.g5.4xlarge.nvidia.gpu"
+ },
"advanced_source/pendulum.py": {
"needs": "linux.g5.4xlarge.nvidia.gpu",
"_comment": "need to be here for the compiling_optimizer_lr_scheduler.py to run."
diff --git a/README.md b/README.md
index 0c961afd262..af84d9ebe79 100644
--- a/README.md
+++ b/README.md
@@ -22,6 +22,8 @@ We use sphinx-gallery's [notebook styled examples](https://sphinx-gallery.github
Here is how you can create a new tutorial (for a detailed description, see [CONTRIBUTING.md](./CONTRIBUTING.md)):
+NOTE: Before submitting a new tutorial, read [PyTorch Tutorial Submission Policy](./tutorial_submission_policy.md).
+
1. Create a Python file. If you want it executed while inserted into documentation, save the file with the suffix `tutorial` so that the file name is `your_tutorial.py`.
2. Put it in one of the `beginner_source`, `intermediate_source`, `advanced_source` directory based on the level of difficulty. If it is a recipe, add it to `recipes_source`. For tutorials demonstrating unstable prototype features, add to the `prototype_source`.
3. For Tutorials (except if it is a prototype feature), include it in the `toctree` directive and create a `customcarditem` in [index.rst](./index.rst).
@@ -31,7 +33,7 @@ If you are starting off with a Jupyter notebook, you can use [this script](https
## Building locally
-The tutorial build is very large and requires a GPU. If your machine does not have a GPU device, you can preview your HTML build without actually downloading the data and running the tutorial code:
+The tutorial build is very large and requires a GPU. If your machine does not have a GPU device, you can preview your HTML build without actually downloading the data and running the tutorial code:
1. Install required dependencies by running: `pip install -r requirements.txt`.
@@ -40,8 +42,6 @@ The tutorial build is very large and requires a GPU. If your machine does not ha
- If you have a GPU-powered laptop, you can build using `make docs`. This will download the data, execute the tutorials and build the documentation to `docs/` directory. This might take about 60-120 min for systems with GPUs. If you do not have a GPU installed on your system, then see next step.
- You can skip the computationally intensive graph generation by running `make html-noplot` to build basic html documentation to `_build/html`. This way, you can quickly preview your tutorial.
-> If you get **ModuleNotFoundError: No module named 'pytorch_sphinx_theme' make: *** [html-noplot] Error 2** from /tutorials/src/pytorch-sphinx-theme or /venv/src/pytorch-sphinx-theme (while using virtualenv), run `python setup.py install`.
-
## Building a single tutorial
You can build a single tutorial by using the `GALLERY_PATTERN` environment variable. For example to run only `neural_style_transfer_tutorial.py`, run:
@@ -59,8 +59,8 @@ The `GALLERY_PATTERN` variable respects regular expressions.
## About contributing to PyTorch Documentation and Tutorials
-* You can find information about contributing to PyTorch documentation in the
-PyTorch Repo [README.md](https://github.com/pytorch/pytorch/blob/master/README.md) file.
+* You can find information about contributing to PyTorch documentation in the
+PyTorch Repo [README.md](https://github.com/pytorch/pytorch/blob/master/README.md) file.
* Additional information can be found in [PyTorch CONTRIBUTING.md](https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md).
diff --git a/_static/css/custom.css b/_static/css/custom.css
index a467a088159..cc195d99061 100755
--- a/_static/css/custom.css
+++ b/_static/css/custom.css
@@ -91,3 +91,24 @@
transition: none;
transform-origin: none;
}
+
+.pytorch-left-menu-search input[type=text] {
+ background-image: none;
+}
+
+.gsc-control-cse {
+ padding-left: 0px !important;
+ padding-bottom: 0px !important;
+}
+
+.gsc-search-button .gsc-search-button-v2:focus {
+ border: transparent !important;
+ outline: none;
+ box-shadow: none;
+}
+.gsc-search-button-v2:active {
+ border: none !important;
+}
+.gsc-search-button-v2 {
+ border: none !important;
+}
diff --git a/_templates/layout.html b/_templates/layout.html
index 22129040e49..1c632de63f8 100644
--- a/_templates/layout.html
+++ b/_templates/layout.html
@@ -11,6 +11,23 @@
{%- endblock %}
+{% block sidebartitle %}
+ {% if theme_display_version %}
+ {%- set nav_version = version %}
+ {% if READTHEDOCS and current_version %}
+ {%- set nav_version = current_version %}
+ {% endif %}
+ {% if nav_version %}
+
+ {{ nav_version }}
+
+ {% endif %}
+ {% endif %}
+
+{% endblock %}
{% block footer %}
{{ super() }}
diff --git a/advanced_source/cpp_custom_ops.rst b/advanced_source/cpp_custom_ops.rst
index 435ff088bc0..ffabd6eff77 100644
--- a/advanced_source/cpp_custom_ops.rst
+++ b/advanced_source/cpp_custom_ops.rst
@@ -174,6 +174,8 @@ To add ``torch.compile`` support for an operator, we must add a FakeTensor kerne
known as a "meta kernel" or "abstract impl"). FakeTensors are Tensors that have
metadata (such as shape, dtype, device) but no data: the FakeTensor kernel for an
operator specifies how to compute the metadata of output tensors given the metadata of input tensors.
+The FakeTensor kernel should return dummy Tensors of your choice with
+the correct Tensor metadata (shape/strides/``dtype``/device).
We recommend that this be done from Python via the `torch.library.register_fake` API,
though it is possible to do this from C++ as well (see
diff --git a/advanced_source/dynamic_quantization_tutorial.py b/advanced_source/dynamic_quantization_tutorial.py
index 9cc07a1d956..c8d94789d5d 100644
--- a/advanced_source/dynamic_quantization_tutorial.py
+++ b/advanced_source/dynamic_quantization_tutorial.py
@@ -151,7 +151,8 @@ def tokenize(self, path):
model.load_state_dict(
torch.load(
model_data_filepath + 'word_language_model_quantize.pth',
- map_location=torch.device('cpu')
+ map_location=torch.device('cpu'),
+ weights_only=True
)
)
diff --git a/advanced_source/python_custom_ops.py b/advanced_source/python_custom_ops.py
index 1e429b76b35..0b3bf6e4748 100644
--- a/advanced_source/python_custom_ops.py
+++ b/advanced_source/python_custom_ops.py
@@ -66,7 +66,7 @@ def display(img):
######################################################################
# ``crop`` is not handled effectively out-of-the-box by
# ``torch.compile``: ``torch.compile`` induces a
-# `"graph break" `_
+# `"graph break" `_
# on functions it is unable to handle and graph breaks are bad for performance.
# The following code demonstrates this by raising an error
# (``torch.compile`` with ``fullgraph=True`` raises an error if a
@@ -85,9 +85,9 @@ def f(img):
#
# 1. wrap the function into a PyTorch custom operator.
# 2. add a "``FakeTensor`` kernel" (aka "meta kernel") to the operator.
-# Given the metadata (e.g. shapes)
-# of the input Tensors, this function says how to compute the metadata
-# of the output Tensor(s).
+# Given some ``FakeTensors`` inputs (dummy Tensors that don't have storage),
+# this function should return dummy Tensors of your choice with the correct
+# Tensor metadata (shape/strides/``dtype``/device).
from typing import Sequence
@@ -130,6 +130,11 @@ def f(img):
# ``autograd.Function`` with PyTorch operator registration APIs can lead to (and
# has led to) silent incorrectness when composed with ``torch.compile``.
#
+# If you don't need training support, there is no need to use
+# ``torch.library.register_autograd``.
+# If you end up training with a ``custom_op`` that doesn't have an autograd
+# registration, an error will be raised.
+#
# The gradient formula for ``crop`` is essentially ``PIL.paste`` (we'll leave the
# derivation as an exercise to the reader). Let's first wrap ``paste`` into a
# custom operator:
@@ -203,7 +208,7 @@ def setup_context(ctx, inputs, output):
######################################################################
# Mutable Python Custom operators
# -------------------------------
-# You can also wrap a Python function that mutates its inputs into a custom
+# You can also wrap a Python function that mutates its inputs into a custom
# operator.
# Functions that mutate inputs are common because that is how many low-level
# kernels are written; for example, a kernel that computes ``sin`` may take in
diff --git a/advanced_source/static_quantization_tutorial.rst b/advanced_source/static_quantization_tutorial.rst
index 3b818aa03aa..efb171c0dfe 100644
--- a/advanced_source/static_quantization_tutorial.rst
+++ b/advanced_source/static_quantization_tutorial.rst
@@ -286,7 +286,7 @@ We next define several helper functions to help with model evaluation. These mos
def load_model(model_file):
model = MobileNetV2()
- state_dict = torch.load(model_file)
+ state_dict = torch.load(model_file, weights_only=True)
model.load_state_dict(state_dict)
model.to('cpu')
return model
diff --git a/beginner_source/basics/quickstart_tutorial.py b/beginner_source/basics/quickstart_tutorial.py
index 07a1be517d1..df7628081ba 100644
--- a/beginner_source/basics/quickstart_tutorial.py
+++ b/beginner_source/basics/quickstart_tutorial.py
@@ -216,7 +216,7 @@ def test(dataloader, model, loss_fn):
# the state dictionary into it.
model = NeuralNetwork().to(device)
-model.load_state_dict(torch.load("model.pth"))
+model.load_state_dict(torch.load("model.pth", weights_only=True))
#############################################################
# This model can now be used to make predictions.
diff --git a/beginner_source/basics/saveloadrun_tutorial.py b/beginner_source/basics/saveloadrun_tutorial.py
index 16a9f037417..5b3aef124b0 100644
--- a/beginner_source/basics/saveloadrun_tutorial.py
+++ b/beginner_source/basics/saveloadrun_tutorial.py
@@ -32,9 +32,14 @@
##########################
# To load model weights, you need to create an instance of the same model first, and then load the parameters
# using ``load_state_dict()`` method.
+#
+# In the code below, we set ``weights_only=True`` to limit the
+# functions executed during unpickling to only those necessary for
+# loading weights. Using ``weights_only=True`` is considered
+# a best practice when loading weights.
model = models.vgg16() # we do not specify ``weights``, i.e. create untrained model
-model.load_state_dict(torch.load('model_weights.pth'))
+model.load_state_dict(torch.load('model_weights.pth', weights_only=True))
model.eval()
###########################
@@ -50,9 +55,14 @@
torch.save(model, 'model.pth')
########################
-# We can then load the model like this:
+# We can then load the model as demonstrated below.
+#
+# As described in `Saving and loading torch.nn.Modules `__,
+# saving ``state_dict``s is considered the best practice. However,
+# below we use ``weights_only=False`` because this involves loading the
+# model, which is a legacy use case for ``torch.save``.
-model = torch.load('model.pth')
+model = torch.load('model.pth', weights_only=False)
########################
# .. note:: This approach uses Python `pickle `_ module when serializing the model, thus it relies on the actual class definition to be available when loading the model.
diff --git a/beginner_source/blitz/cifar10_tutorial.py b/beginner_source/blitz/cifar10_tutorial.py
index 8e3f3252921..f38abdd5666 100644
--- a/beginner_source/blitz/cifar10_tutorial.py
+++ b/beginner_source/blitz/cifar10_tutorial.py
@@ -221,7 +221,7 @@ def forward(self, x):
# wasn't necessary here, we only did it to illustrate how to do so):
net = Net()
-net.load_state_dict(torch.load(PATH))
+net.load_state_dict(torch.load(PATH, weights_only=True))
########################################################################
# Okay, now let us see what the neural network thinks these examples above are:
diff --git a/beginner_source/chatbot_tutorial.py b/beginner_source/chatbot_tutorial.py
index 44310cc3620..f902f8cd717 100644
--- a/beginner_source/chatbot_tutorial.py
+++ b/beginner_source/chatbot_tutorial.py
@@ -84,8 +84,7 @@
# Preparations
# ------------
#
-# To start, Download the data ZIP file
-# `here `__
+# To get started, `download `__ the Movie-Dialogs Corpus zip file
# and put in a ``data/`` directory under the current directory.
#
diff --git a/beginner_source/deeplabv3_on_android.rst b/beginner_source/deeplabv3_on_android.rst
index f2fe0e48f15..5ca7f01ad06 100644
--- a/beginner_source/deeplabv3_on_android.rst
+++ b/beginner_source/deeplabv3_on_android.rst
@@ -5,6 +5,10 @@ Image Segmentation DeepLabV3 on Android
**Reviewed by**: `Jeremiah Chung `_
+.. warning::
+   PyTorch Mobile is no longer actively supported. Please check out `ExecuTorch `_, PyTorch’s all-new on-device inference library. You can also review our `end-to-end workflows `_ and the `source code for DeepLabV3 `_.
+
+
Introduction
------------
diff --git a/beginner_source/fgsm_tutorial.py b/beginner_source/fgsm_tutorial.py
index 007ad3fd956..9bdf52d84b4 100644
--- a/beginner_source/fgsm_tutorial.py
+++ b/beginner_source/fgsm_tutorial.py
@@ -192,7 +192,7 @@ def forward(self, x):
model = Net().to(device)
# Load the pretrained model
-model.load_state_dict(torch.load(pretrained_model, map_location=device))
+model.load_state_dict(torch.load(pretrained_model, map_location=device, weights_only=True))
# Set the model in evaluation mode. In this case this is for the Dropout layers
model.eval()
diff --git a/beginner_source/onnx/intro_onnx.py b/beginner_source/onnx/intro_onnx.py
index b5cbafc1c64..ec625ec78ff 100644
--- a/beginner_source/onnx/intro_onnx.py
+++ b/beginner_source/onnx/intro_onnx.py
@@ -39,13 +39,14 @@
- `ONNX `_ standard library
- `ONNX Script `_ library that enables developers to author ONNX operators,
- functions and models using a subset of Python in an expressive, and yet simple fashion.
+ functions and models using a subset of Python in an expressive, and yet simple fashion
+ - `ONNX Runtime `_ accelerated machine learning library.
They can be installed through `pip `_:
.. code-block:: bash
- pip install --upgrade onnx onnxscript
+ pip install --upgrade onnx onnxscript onnxruntime
To validate the installation, run the following commands:
diff --git a/beginner_source/saving_loading_models.py b/beginner_source/saving_loading_models.py
index fcd33be2537..6c9b6b1fd77 100644
--- a/beginner_source/saving_loading_models.py
+++ b/beginner_source/saving_loading_models.py
@@ -153,7 +153,7 @@
# .. code:: python
#
# model = TheModelClass(*args, **kwargs)
-# model.load_state_dict(torch.load(PATH))
+# model.load_state_dict(torch.load(PATH, weights_only=True))
# model.eval()
#
# .. note::
@@ -206,7 +206,7 @@
# .. code:: python
#
# # Model class must be defined somewhere
-# model = torch.load(PATH)
+# model = torch.load(PATH, weights_only=False)
# model.eval()
#
# This save/load process uses the most intuitive syntax and involves the
@@ -290,7 +290,7 @@
# model = TheModelClass(*args, **kwargs)
# optimizer = TheOptimizerClass(*args, **kwargs)
#
-# checkpoint = torch.load(PATH)
+# checkpoint = torch.load(PATH, weights_only=True)
# model.load_state_dict(checkpoint['model_state_dict'])
# optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
# epoch = checkpoint['epoch']
@@ -354,7 +354,7 @@
# optimizerA = TheOptimizerAClass(*args, **kwargs)
# optimizerB = TheOptimizerBClass(*args, **kwargs)
#
-# checkpoint = torch.load(PATH)
+# checkpoint = torch.load(PATH, weights_only=True)
# modelA.load_state_dict(checkpoint['modelA_state_dict'])
# modelB.load_state_dict(checkpoint['modelB_state_dict'])
# optimizerA.load_state_dict(checkpoint['optimizerA_state_dict'])
@@ -407,7 +407,7 @@
# .. code:: python
#
# modelB = TheModelBClass(*args, **kwargs)
-# modelB.load_state_dict(torch.load(PATH), strict=False)
+# modelB.load_state_dict(torch.load(PATH, weights_only=True), strict=False)
#
# Partially loading a model or loading a partial model are common
# scenarios when transfer learning or training a new complex model.
@@ -446,7 +446,7 @@
#
# device = torch.device('cpu')
# model = TheModelClass(*args, **kwargs)
-# model.load_state_dict(torch.load(PATH, map_location=device))
+# model.load_state_dict(torch.load(PATH, map_location=device, weights_only=True))
#
# When loading a model on a CPU that was trained with a GPU, pass
# ``torch.device('cpu')`` to the ``map_location`` argument in the
@@ -469,7 +469,7 @@
#
# device = torch.device("cuda")
# model = TheModelClass(*args, **kwargs)
-# model.load_state_dict(torch.load(PATH))
+# model.load_state_dict(torch.load(PATH, weights_only=True))
# model.to(device)
# # Make sure to call input = input.to(device) on any input tensors that you feed to the model
#
@@ -497,7 +497,7 @@
#
# device = torch.device("cuda")
# model = TheModelClass(*args, **kwargs)
-# model.load_state_dict(torch.load(PATH, map_location="cuda:0")) # Choose whatever GPU device number you want
+# model.load_state_dict(torch.load(PATH, weights_only=True, map_location="cuda:0")) # Choose whatever GPU device number you want
# model.to(device)
# # Make sure to call input = input.to(device) on any input tensors that you feed to the model
#
diff --git a/beginner_source/transfer_learning_tutorial.py b/beginner_source/transfer_learning_tutorial.py
index 7a2b053763a..de7a178bd7d 100644
--- a/beginner_source/transfer_learning_tutorial.py
+++ b/beginner_source/transfer_learning_tutorial.py
@@ -209,7 +209,7 @@ def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
print(f'Best val Acc: {best_acc:4f}')
# load best model weights
- model.load_state_dict(torch.load(best_model_params_path))
+ model.load_state_dict(torch.load(best_model_params_path, weights_only=True))
return model
diff --git a/conf.py b/conf.py
index f0f4905844c..e4bca1ac7fa 100644
--- a/conf.py
+++ b/conf.py
@@ -67,6 +67,12 @@
#
# needs_sphinx = '1.0'
+html_meta = {
+ 'description': 'Master PyTorch with our step-by-step tutorials for all skill levels. Start your journey to becoming a PyTorch expert today!',
+ 'keywords': 'PyTorch, tutorials, Getting Started, deep learning, AI',
+ 'author': 'PyTorch Contributors'
+}
+
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
diff --git a/en-wordlist.txt b/en-wordlist.txt
index 62762ab69cc..e69cbaa1a5f 100644
--- a/en-wordlist.txt
+++ b/en-wordlist.txt
@@ -2,6 +2,7 @@
ACL
ADI
AOT
+AOTInductor
APIs
ATen
AVX
@@ -617,4 +618,4 @@ warmstarting
warmup
webp
wsi
-wsis
\ No newline at end of file
+wsis
diff --git a/index.rst b/index.rst
index 91517834fd8..95c4a8f3efb 100644
--- a/index.rst
+++ b/index.rst
@@ -3,6 +3,7 @@ Welcome to PyTorch Tutorials
**What's new in PyTorch tutorials?**
+* `torch.export AOTInductor Tutorial for Python runtime (Beta) `__
* `A guide on good usage of non_blocking and pin_memory() in PyTorch `__
* `Introduction to Distributed Pipeline Parallelism `__
* `Introduction to Libuv TCPStore Backend `__
diff --git a/intermediate_source/autograd_saved_tensors_hooks_tutorial.py b/intermediate_source/autograd_saved_tensors_hooks_tutorial.py
index f16b170ee6a..ed581426c2e 100644
--- a/intermediate_source/autograd_saved_tensors_hooks_tutorial.py
+++ b/intermediate_source/autograd_saved_tensors_hooks_tutorial.py
@@ -397,7 +397,7 @@ def pack_hook(tensor):
return name
def unpack_hook(name):
- return torch.load(name)
+ return torch.load(name, weights_only=True)
######################################################################
@@ -420,7 +420,7 @@ def pack_hook(tensor):
return name
def unpack_hook(name):
- tensor = torch.load(name)
+ tensor = torch.load(name, weights_only=True)
os.remove(name)
return tensor
@@ -462,7 +462,7 @@ def pack_hook(tensor):
return temp_file
def unpack_hook(temp_file):
- return torch.load(temp_file.name)
+ return torch.load(temp_file.name, weights_only=True)
######################################################################
diff --git a/intermediate_source/ddp_tutorial.rst b/intermediate_source/ddp_tutorial.rst
index 13297fb2a12..cff5105fa54 100644
--- a/intermediate_source/ddp_tutorial.rst
+++ b/intermediate_source/ddp_tutorial.rst
@@ -214,7 +214,7 @@ and elasticity support, please refer to `TorchElastic `_.
+We provide the fine-tuned BERT model for the MRPC task `here `_.
To save time, you can download the model file (~400 MB) directly into your local folder ``$OUT_DIR``.
2.1 Set global configurations
@@ -273,7 +273,7 @@ We load the tokenizer and fine-tuned BERT sequence classifier model
2.3 Define the tokenize and evaluation function
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-We reuse the tokenize and evaluation function from `Huggingface `_.
+We reuse the tokenize and evaluation function from `HuggingFace `_.
.. code:: python
diff --git a/intermediate_source/tiatoolbox_tutorial.rst b/intermediate_source/tiatoolbox_tutorial.rst
index dbaf3cdc464..de9b3031330 100644
--- a/intermediate_source/tiatoolbox_tutorial.rst
+++ b/intermediate_source/tiatoolbox_tutorial.rst
@@ -368,7 +368,7 @@ The PatchPredictor class runs a CNN-based classifier written in PyTorch.
# Users can load any PyTorch model architecture instead using the following script
model = vanilla.CNNModel(backbone="resnet18", num_classes=9) # Importing model from torchvision.models.resnet18
- model.load_state_dict(torch.load(weights_path, map_location="cpu"), strict=True)
+ model.load_state_dict(torch.load(weights_path, map_location="cpu", weights_only=True), strict=True)
def preproc_func(img):
img = PIL.Image.fromarray(img)
img = transforms.ToTensor()(img)
diff --git a/intermediate_source/torch_compile_tutorial.py b/intermediate_source/torch_compile_tutorial.py
index 5e7112f5b93..67b055d9ff2 100644
--- a/intermediate_source/torch_compile_tutorial.py
+++ b/intermediate_source/torch_compile_tutorial.py
@@ -73,17 +73,21 @@ def foo(x, y):
######################################################################
# Alternatively, we can decorate the function.
+t1 = torch.randn(10, 10)
+t2 = torch.randn(10, 10)
@torch.compile
def opt_foo2(x, y):
a = torch.sin(x)
b = torch.cos(y)
return a + b
-print(opt_foo2(torch.randn(10, 10), torch.randn(10, 10)))
+print(opt_foo2(t1, t2))
######################################################################
# We can also optimize ``torch.nn.Module`` instances.
+t = torch.randn(10, 100)
+
class MyModule(torch.nn.Module):
def __init__(self):
super().__init__()
@@ -94,7 +98,101 @@ def forward(self, x):
mod = MyModule()
opt_mod = torch.compile(mod)
-print(opt_mod(torch.randn(10, 100)))
+print(opt_mod(t))
+
+######################################################################
+# torch.compile and Nested Calls
+# ------------------------------
+# Nested function calls within the decorated function will also be compiled.
+
+def nested_function(x):
+ return torch.sin(x)
+
+@torch.compile
+def outer_function(x, y):
+ a = nested_function(x)
+ b = torch.cos(y)
+ return a + b
+
+print(outer_function(t1, t2))
+
+######################################################################
+# In the same fashion, when compiling a module, all sub-modules and methods
+# within it that are not in a skip list are also compiled.
+
+class OuterModule(torch.nn.Module):
+ def __init__(self):
+ super().__init__()
+ self.inner_module = MyModule()
+ self.outer_lin = torch.nn.Linear(10, 2)
+
+ def forward(self, x):
+ x = self.inner_module(x)
+ return torch.nn.functional.relu(self.outer_lin(x))
+
+outer_mod = OuterModule()
+opt_outer_mod = torch.compile(outer_mod)
+print(opt_outer_mod(t))
+
+######################################################################
+# We can also disable some functions from being compiled by using
+# ``torch.compiler.disable``. Suppose you want to disable the tracing on just
+# the ``complex_function`` function, but want to continue the tracing back in
+# ``complex_conjugate``. In this case, you can use the
+# ``torch.compiler.disable(recursive=False)`` option. Otherwise, the default is
+# ``recursive=True``.
+
+def complex_conjugate(z):
+ return torch.conj(z)
+
+@torch.compiler.disable(recursive=False)
+def complex_function(real, imag):
+    # Assuming this function causes problems in the compilation
+ z = torch.complex(real, imag)
+ return complex_conjugate(z)
+
+def outer_function():
+ real = torch.tensor([2, 3], dtype=torch.float32)
+ imag = torch.tensor([4, 5], dtype=torch.float32)
+ z = complex_function(real, imag)
+ return torch.abs(z)
+
+# Try to compile the outer_function
+try:
+ opt_outer_function = torch.compile(outer_function)
+ print(opt_outer_function())
+except Exception as e:
+ print("Compilation of outer_function failed:", e)
+
+######################################################################
+# Best Practices and Recommendations
+# ----------------------------------
+#
+# Behavior of ``torch.compile`` with Nested Modules and Function Calls
+#
+# When you use ``torch.compile``, the compiler will try to recursively compile
+# every function call inside the target function or module that is not in a
+# skip list (such as built-ins and some functions in the ``torch.*`` namespace).
+#
+# **Best Practices:**
+#
+# 1. **Top-Level Compilation:** One approach is to compile at the highest level
+# possible (i.e., when the top-level module is initialized/called) and
+# selectively disable compilation when encountering excessive graph breaks or
+# errors. If there are still many compile issues, compile individual
+# subcomponents instead.
+#
+# 2. **Modular Testing:** Test individual functions and modules with ``torch.compile``
+# before integrating them into larger models to isolate potential issues.
+#
+# 3. **Disable Compilation Selectively:** If certain functions or sub-modules
+#    cannot be handled by ``torch.compile``, use ``torch.compiler.disable`` to
+#    recursively exclude them from compilation.
+#
+# 4. **Compile Leaf Functions First:** In complex models with multiple nested
+# functions and modules, start by compiling the leaf functions or modules first.
+# For more information see `TorchDynamo APIs for fine-grained tracing `__.
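+
+######################################################################
+# As a minimal, purely illustrative sketch of the leaf-first workflow, we can
+# reuse ``MyModule``, ``OuterModule``, and ``t`` defined earlier in this
+# tutorial: compile and check the leaf module in isolation first, then compile
+# the full model at the top level.
+
+# Compile and sanity-check the leaf module on its own.
+opt_inner = torch.compile(MyModule())
+print(opt_inner(t))
+
+# Once the leaf compiles cleanly, compile the whole model at the top level.
+opt_outer = torch.compile(OuterModule())
+print(opt_outer(t))
+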
######################################################################
# Demonstrating Speedups
diff --git a/prototype_source/README.txt b/prototype_source/README.txt
index 4ab9ce8f6a9..2dcb5e0cb2d 100644
--- a/prototype_source/README.txt
+++ b/prototype_source/README.txt
@@ -7,7 +7,7 @@ Prototype Tutorials
2. graph_mode_static_quantization_tutorial.py
Graph Mode Post Training Static Quantization in PyTorch
https://pytorch.org/tutorials/prototype/graph_mode_static_quantization_tutorial.html
-
+
3. graph_mode_dynamic_bert_tutorial.rst
Graph Mode Dynamic Quantization on BERT
https://github.com/pytorch/tutorials/blob/main/prototype_source/graph_mode_dynamic_bert_tutorial.rst
@@ -30,9 +30,12 @@ Prototype Tutorials
8. fx_graph_mode_ptq_dynamic.py
FX Graph Mode Post Training Dynamic Quantization
- https://pytorch.org/tutorials/prototype/fx_graph_mode_ptq_dynamic.html
+ https://pytorch.org/tutorials/prototype/fx_graph_mode_ptq_dynamic.html
9. fx_graph_mode_quant_guide.py
FX Graph Mode Quantization User Guide
- https://pytorch.org/tutorials/prototype/fx_graph_mode_quant_guide.html
+ https://pytorch.org/tutorials/prototype/fx_graph_mode_quant_guide.html
+10. flight_recorder_tutorial.rst
+ Flight Recorder User Guide
+ https://pytorch.org/tutorials/prototype/flight_recorder_tutorial.html
diff --git a/prototype_source/flight_recorder_tutorial.rst b/prototype_source/flight_recorder_tutorial.rst
new file mode 100644
index 00000000000..8130e1537bb
--- /dev/null
+++ b/prototype_source/flight_recorder_tutorial.rst
@@ -0,0 +1,186 @@
+(prototype) Flight Recorder for Debugging
+=========================================
+**Author**: `Chirag Pandya `_, `Junjie Wang `_
+
+What you will learn
+-------------------
+* Learn about a new tool for debugging stuck jobs during distributed training.
+* Learn how you can enable the tool and use the collected data for analyzing stuck jobs.
+
+Prerequisites
+-------------
+- PyTorch version 2.5 or later.
+
+
+Overview
+--------
+An AI distributed training job refers to the process of training a machine learning model using multiple devices, such
+as GPUs or CPUs, connected in a network. This approach allows for faster and more efficient training of large models
+that require significant computational resources.
+An engineer’s goal is to complete an AI training job as quickly as possible and make continuous improvements so that
+subsequent training can be done faster. A trained, usable model is the final desired outcome.
+One of the biggest impediments to completing training is a *stuck job*.
+
+A distributed AI training job is considered *stuck* when it stops making meaningful progress for an extended period of
+time.
+
+A job can get stuck for various reasons:
+
+- **Data Starvation:** This occurs when the training job is not receiving data at the expected rate, possibly due to issues with the data pipeline or the data source.
+
+- **Resource Constraints:** If the system running the job does not have enough computational resources (such as CPU, GPU, or memory), the job might not be able to proceed.
+
+- **Network Issues:** In a distributed training setup, different parts of the model or data may be processed on different devices. If there are network issues, communication between these devices may be disrupted, causing the job to get stuck.
+
+- **Software Bugs or Errors:** Errors in the training code or the underlying libraries and frameworks can also cause a job to get stuck.
+
+- **Synchronization Issues:** In distributed training, different parts of the computation are often run in parallel and need to be synchronized at certain points. If this synchronization fails, the job can get stuck. For example, a deadlock can occur if one or more ranks fail to join a collective while the remaining ranks have joined. This results in an indefinite wait for the job to progress.
+
+Flight Recorder, as the name suggests, captures diagnostic information as collectives run. The captured diagnostic
+information is used to help root cause issues when jobs get stuck.
+There are two core parts to Flight Recorder.
+
+- The collection portion: when enabled, information about collectives is recorded in an in-memory circular buffer. Upon job timeout, or on demand, the in-memory buffer can be retrieved or dumped to file.
+
+- An analyzer script is available in the `tools/flight_recorder `__ directory (details below).
+ The analyzer script runs known heuristics using the collected data and attempts to automatically identify the underlying issue that caused the job to stall.
+
+Enabling Flight Recorder
+------------------------
+There are two required environment variables to get the initial version of Flight Recorder working.
+
+- ``TORCH_NCCL_TRACE_BUFFER_SIZE`` (``0``, ``N`` where ``N`` is a positive number): Setting ``N`` enables collection.
+ ``N`` represents the number of entries that will be kept internally in a circular buffer.
+  We recommend setting this value to 2000.
+- ``TORCH_NCCL_DUMP_ON_TIMEOUT`` (``true``, ``false``): Setting this to ``true`` will write out diagnostic files to disk on job timeout.
+  If enabled, there will be one output file per rank in the job's running directory.
+
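+For example, a minimal sketch of enabling Flight Recorder from the shell before
+launching a job (the launcher command, world size, and script name are
+illustrative):
+
+.. code-block:: bash
+
+    # Keep the last 2000 collective entries in the in-memory circular buffer.
+    export TORCH_NCCL_TRACE_BUFFER_SIZE=2000
+    # Dump one diagnostic file per rank to disk if the job times out.
+    export TORCH_NCCL_DUMP_ON_TIMEOUT=true
+    torchrun --nproc-per-node=8 train.py
+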
+**Optional settings:**
+
+- ``TORCH_NCCL_TRACE_CPP_STACK`` (``true``, ``false``): Setting this to ``true`` enables C++ stack trace capture in Flight Recorder.
+ C++ stack traces can be useful in providing the exact code path from a PyTorch Python call down to the primitive
+ C++ implementations. Also see ``TORCH_SYMBOLIZE_MODE`` in additional settings.
+- ``TORCH_NCCL_ENABLE_TIMING`` (``true``, ``false``): Setting this to ``true`` enables additional CUDA events at the start of each collective and
+  records the ``duration`` of each collective. This may incur some CPU overhead. In the collected data, the
+ ``duration`` field indicates how long each collective took to execute.
+
+Additional Settings
+-------------------
+
+- ``TORCH_SYMBOLIZE_MODE`` (``dladdr``, ``addr2line``, ``fast``): This setting determines the program used to retrieve C++ traces from a running program.
+ The default setting is ``addr2line``.
+
+ ``fast`` is a new experimental mode that is shown to be much faster than the traditional ``addr2line``.
+ Use this setting in conjunction with ``TORCH_NCCL_TRACE_CPP_STACK`` to collect C++ traces in the Flight Recorder data.
+
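+For example, a minimal sketch that additionally captures C++ stack traces using
+the faster experimental symbolizer (values illustrative):
+
+.. code-block:: bash
+
+    export TORCH_NCCL_TRACE_CPP_STACK=true
+    export TORCH_SYMBOLIZE_MODE=fast
+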
+Retrieving Flight Recorder Data via an API
+------------------------------------------
+
+You can also retrieve Flight Recorder data with an API call.
+The API with the default arguments is shown below:
+
+.. code:: python
+
+ torch._C._distributed_c10d._dump_nccl_trace(includeCollectives=True, includeStackTraces=True, onlyActive=False)
+
+
+To view the data, you can ``unpickle`` it as shown below:
+
+.. code:: python
+
+    import pickle
+    import torch
+    t = pickle.loads(torch._C._distributed_c10d._dump_nccl_trace())
+ print(t)
+
+Flight Recorder File Formats
+----------------------------
+
+Flight Recorder files are dumped in ``pickle`` format. Files are written to local disks or mounted shared NFS
+folders.
+
+The contents of an unpickled Flight Recorder file are shown below:
+
+.. code-block:: json
+
+ {
+ "version": "2.5",
+ "pg_config": {
+ "0": {
+ "name": "0",
+ "desc": "default_pg",
+ "ranks": "[0, 1]"
+ }
+ },
+ "pg_status": {
+ "0": {
+ "last_enqueued_collective": 2,
+ "last_started_collective": -1,
+ "last_completed_collective": 2
+ }
+ },
+ "entries": [
+ {
+ "frames": [
+ {
+ "name": "test_short_pickle",
+ "filename": "pytorch/test/distributed/test_c10d_nccl.py",
+ "line": 3647
+ },
+ {
+ "name": "spawn_main",
+ "filename": ".conda/envs/pytorch-3.10/lib/python3.10/multiprocessing/spawn.py",
+ "line": 116
+ },
+ {
+ "name": "",
+ "filename": "",
+ "line": 1
+ }
+ ],
+ "record_id": 0,
+ "pg_id": 0,
+ "process_group": ("0", "default_pg"),
+ "collective_seq_id": 1,
+ "p2p_seq_id": 0,
+ "op_id": 1,
+ "profiling_name": "nccl:all_reduce",
+ "time_created_ns": 1724779239936775119,
+ "input_sizes": [[3, 4]],
+ "input_dtypes": ["Float"],
+ "output_sizes": [[3, 4]],
+ "output_dtypes": ["Float"],
+ "state": "completed",
+ "time_discovered_started_ns": null,
+ "time_discovered_completed_ns": 1724779239975811724,
+ "retired": true,
+ "timeout_ms": 600000,
+ "is_p2p": false
+ },
+ ...
+ ]
+ }
+
+
+Analyzing Flight Recorder Dumps
+-------------------------------
+
+We have convenience scripts available in the ``pytorch/tools/flight_recorder`` directory for analyzing captured
+data.
+
+To run the convenience script, follow these steps:
+
+1. Copy all files from a rank into a single directory.
+
+2. Run the following command:
+
+.. code:: python
+
+ python fr_trace.py -d [-o