diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh index 31f42fdbd85..c646b8f9a86 100755 --- a/.ci/docker/build.sh +++ b/.ci/docker/build.sh @@ -11,8 +11,9 @@ IMAGE_NAME="$1" shift export UBUNTU_VERSION="20.04" +export CUDA_VERSION="12.4.1" -export BASE_IMAGE="ubuntu:${UBUNTU_VERSION}" +export BASE_IMAGE="nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}" echo "Building ${IMAGE_NAME} Docker image" docker build \ diff --git a/.ci/docker/common/common_utils.sh b/.ci/docker/common/common_utils.sh index b20286a4099..c7eabda555d 100644 --- a/.ci/docker/common/common_utils.sh +++ b/.ci/docker/common/common_utils.sh @@ -22,5 +22,5 @@ conda_run() { } pip_install() { - as_ci_user conda run -n py_$ANACONDA_PYTHON_VERSION pip install --progress-bar off $* + as_ci_user conda run -n py_$ANACONDA_PYTHON_VERSION pip3 install --progress-bar off $* } diff --git a/.ci/docker/requirements.txt b/.ci/docker/requirements.txt index 00cf2f21033..bd3711bfb0e 100644 --- a/.ci/docker/requirements.txt +++ b/.ci/docker/requirements.txt @@ -30,8 +30,8 @@ pytorch-lightning torchx torchrl==0.5.0 tensordict==0.5.0 -ax-platform>==0.4.0 -nbformat>==5.9.2 +ax-platform>=0.4.0 +nbformat>=5.9.2 datasets transformers torchmultimodal-nightly # needs to be updated to stable as soon as it's avaialable @@ -64,8 +64,8 @@ pyopengl gymnasium[mujoco]==0.27.0 timm iopath -pygame==2.1.2 +pygame==2.6.0 pycocotools semilearn==0.3.2 torchao==0.0.3 -segment_anything==1.0 \ No newline at end of file +segment_anything==1.0 diff --git a/.jenkins/metadata.json b/.jenkins/metadata.json index 4814f9a7d2b..2f1a9933aab 100644 --- a/.jenkins/metadata.json +++ b/.jenkins/metadata.json @@ -28,6 +28,9 @@ "intermediate_source/model_parallel_tutorial.py": { "needs": "linux.16xlarge.nvidia.gpu" }, + "recipes_source/torch_export_aoti_python.py": { + "needs": "linux.g5.4xlarge.nvidia.gpu" + }, "advanced_source/pendulum.py": { "needs": "linux.g5.4xlarge.nvidia.gpu", "_comment": "need to be here for the compiling_optimizer_lr_scheduler.py to run." diff --git a/README.md b/README.md index 0c961afd262..af84d9ebe79 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,8 @@ We use sphinx-gallery's [notebook styled examples](https://sphinx-gallery.github Here is how you can create a new tutorial (for a detailed description, see [CONTRIBUTING.md](./CONTRIBUTING.md)): +NOTE: Before submitting a new tutorial, read [PyTorch Tutorial Submission Policy](./tutorial_submission_policy.md). + 1. Create a Python file. If you want it executed while inserted into documentation, save the file with the suffix `tutorial` so that the file name is `your_tutorial.py`. 2. Put it in one of the `beginner_source`, `intermediate_source`, `advanced_source` directory based on the level of difficulty. If it is a recipe, add it to `recipes_source`. For tutorials demonstrating unstable prototype features, add to the `prototype_source`. 3. For Tutorials (except if it is a prototype feature), include it in the `toctree` directive and create a `customcarditem` in [index.rst](./index.rst). @@ -31,7 +33,7 @@ If you are starting off with a Jupyter notebook, you can use [this script](https ## Building locally -The tutorial build is very large and requires a GPU. If your machine does not have a GPU device, you can preview your HTML build without actually downloading the data and running the tutorial code: +The tutorial build is very large and requires a GPU. 
If your machine does not have a GPU device, you can preview your HTML build without actually downloading the data and running the tutorial code: 1. Install required dependencies by running: `pip install -r requirements.txt`. @@ -40,8 +42,6 @@ The tutorial build is very large and requires a GPU. If your machine does not ha - If you have a GPU-powered laptop, you can build using `make docs`. This will download the data, execute the tutorials and build the documentation to `docs/` directory. This might take about 60-120 min for systems with GPUs. If you do not have a GPU installed on your system, then see next step. - You can skip the computationally intensive graph generation by running `make html-noplot` to build basic html documentation to `_build/html`. This way, you can quickly preview your tutorial. -> If you get **ModuleNotFoundError: No module named 'pytorch_sphinx_theme' make: *** [html-noplot] Error 2** from /tutorials/src/pytorch-sphinx-theme or /venv/src/pytorch-sphinx-theme (while using virtualenv), run `python setup.py install`. - ## Building a single tutorial You can build a single tutorial by using the `GALLERY_PATTERN` environment variable. For example to run only `neural_style_transfer_tutorial.py`, run: @@ -59,8 +59,8 @@ The `GALLERY_PATTERN` variable respects regular expressions. ## About contributing to PyTorch Documentation and Tutorials -* You can find information about contributing to PyTorch documentation in the -PyTorch Repo [README.md](https://github.com/pytorch/pytorch/blob/master/README.md) file. +* You can find information about contributing to PyTorch documentation in the +PyTorch Repo [README.md](https://github.com/pytorch/pytorch/blob/master/README.md) file. * Additional information can be found in [PyTorch CONTRIBUTING.md](https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md). diff --git a/_static/css/custom.css b/_static/css/custom.css index a467a088159..cc195d99061 100755 --- a/_static/css/custom.css +++ b/_static/css/custom.css @@ -91,3 +91,24 @@ transition: none; transform-origin: none; } + +.pytorch-left-menu-search input[type=text] { + background-image: none; +} + +.gsc-control-cse { + padding-left: 0px !important; + padding-bottom: 0px !important; +} + +.gsc-search-button .gsc-search-button-v2:focus { + border: transparent !important; + outline: none; + box-shadow: none; +} +.gsc-search-button-v2:active { + border: none !important; +} +.gsc-search-button-v2 { + border: none !important; +} diff --git a/_templates/layout.html b/_templates/layout.html index 22129040e49..1c632de63f8 100644 --- a/_templates/layout.html +++ b/_templates/layout.html @@ -11,6 +11,23 @@ {%- endblock %} +{% block sidebartitle %} + {% if theme_display_version %} + {%- set nav_version = version %} + {% if READTHEDOCS and current_version %} + {%- set nav_version = current_version %} + {% endif %} + {% if nav_version %} +
+ {{ nav_version }} +
+ {% endif %} + {% endif %} + +{% endblock %} {% block footer %} {{ super() }} diff --git a/advanced_source/cpp_custom_ops.rst b/advanced_source/cpp_custom_ops.rst index 435ff088bc0..ffabd6eff77 100644 --- a/advanced_source/cpp_custom_ops.rst +++ b/advanced_source/cpp_custom_ops.rst @@ -174,6 +174,8 @@ To add ``torch.compile`` support for an operator, we must add a FakeTensor kerne known as a "meta kernel" or "abstract impl"). FakeTensors are Tensors that have metadata (such as shape, dtype, device) but no data: the FakeTensor kernel for an operator specifies how to compute the metadata of output tensors given the metadata of input tensors. +The FakeTensor kernel should return dummy Tensors of your choice with +the correct Tensor metadata (shape/strides/``dtype``/device). We recommend that this be done from Python via the `torch.library.register_fake` API, though it is possible to do this from C++ as well (see diff --git a/advanced_source/dynamic_quantization_tutorial.py b/advanced_source/dynamic_quantization_tutorial.py index 9cc07a1d956..c8d94789d5d 100644 --- a/advanced_source/dynamic_quantization_tutorial.py +++ b/advanced_source/dynamic_quantization_tutorial.py @@ -151,7 +151,8 @@ def tokenize(self, path): model.load_state_dict( torch.load( model_data_filepath + 'word_language_model_quantize.pth', - map_location=torch.device('cpu') + map_location=torch.device('cpu'), + weights_only=True ) ) diff --git a/advanced_source/python_custom_ops.py b/advanced_source/python_custom_ops.py index 1e429b76b35..0b3bf6e4748 100644 --- a/advanced_source/python_custom_ops.py +++ b/advanced_source/python_custom_ops.py @@ -66,7 +66,7 @@ def display(img): ###################################################################### # ``crop`` is not handled effectively out-of-the-box by # ``torch.compile``: ``torch.compile`` induces a -# `"graph break" `_ +# `"graph break" `_ # on functions it is unable to handle and graph breaks are bad for performance. # The following code demonstrates this by raising an error # (``torch.compile`` with ``fullgraph=True`` raises an error if a @@ -85,9 +85,9 @@ def f(img): # # 1. wrap the function into a PyTorch custom operator. # 2. add a "``FakeTensor`` kernel" (aka "meta kernel") to the operator. -# Given the metadata (e.g. shapes) -# of the input Tensors, this function says how to compute the metadata -# of the output Tensor(s). +# Given some ``FakeTensors`` inputs (dummy Tensors that don't have storage), +# this function should return dummy Tensors of your choice with the correct +# Tensor metadata (shape/strides/``dtype``/device). from typing import Sequence @@ -130,6 +130,11 @@ def f(img): # ``autograd.Function`` with PyTorch operator registration APIs can lead to (and # has led to) silent incorrectness when composed with ``torch.compile``. # +# If you don't need training support, there is no need to use +# ``torch.library.register_autograd``. +# If you end up training with a ``custom_op`` that doesn't have an autograd +# registration, we'll raise an error message. +# # The gradient formula for ``crop`` is essentially ``PIL.paste`` (we'll leave the # derivation as an exercise to the reader). 
Let's first wrap ``paste`` into a
 # custom operator:
@@ -203,7 +208,7 @@ def setup_context(ctx, inputs, output):
 ######################################################################
 # Mutable Python Custom operators
 # -------------------------------
-# You can also wrap a Python function that mutates its inputs into a custom 
+# You can also wrap a Python function that mutates its inputs into a custom
 # operator.
 # Functions that mutate inputs are common because that is how many low-level
 # kernels are written; for example, a kernel that computes ``sin`` may take in
diff --git a/advanced_source/static_quantization_tutorial.rst b/advanced_source/static_quantization_tutorial.rst
index 3b818aa03aa..efb171c0dfe 100644
--- a/advanced_source/static_quantization_tutorial.rst
+++ b/advanced_source/static_quantization_tutorial.rst
@@ -286,7 +286,7 @@ We next define several helper functions to help with model evaluation. These mos
     def load_model(model_file):
         model = MobileNetV2()
-        state_dict = torch.load(model_file)
+        state_dict = torch.load(model_file, weights_only=True)
         model.load_state_dict(state_dict)
         model.to('cpu')
         return model
diff --git a/beginner_source/basics/quickstart_tutorial.py b/beginner_source/basics/quickstart_tutorial.py
index 07a1be517d1..df7628081ba 100644
--- a/beginner_source/basics/quickstart_tutorial.py
+++ b/beginner_source/basics/quickstart_tutorial.py
@@ -216,7 +216,7 @@ def test(dataloader, model, loss_fn):
 # the state dictionary into it.
 
 model = NeuralNetwork().to(device)
-model.load_state_dict(torch.load("model.pth"))
+model.load_state_dict(torch.load("model.pth", weights_only=True))
 
 #############################################################
 # This model can now be used to make predictions.
diff --git a/beginner_source/basics/saveloadrun_tutorial.py b/beginner_source/basics/saveloadrun_tutorial.py
index 16a9f037417..5b3aef124b0 100644
--- a/beginner_source/basics/saveloadrun_tutorial.py
+++ b/beginner_source/basics/saveloadrun_tutorial.py
@@ -32,9 +32,14 @@
 ##########################
 # To load model weights, you need to create an instance of the same model first, and then load the parameters
 # using ``load_state_dict()`` method.
+#
+# In the code below, we set ``weights_only=True`` to limit the
+# functions executed during unpickling to only those necessary for
+# loading weights. Using ``weights_only=True`` is considered
+# a best practice when loading weights.
 
 model = models.vgg16() # we do not specify ``weights``, i.e. create untrained model
-model.load_state_dict(torch.load('model_weights.pth'))
+model.load_state_dict(torch.load('model_weights.pth', weights_only=True))
 model.eval()
 
 ###########################
@@ -50,9 +55,14 @@
 torch.save(model, 'model.pth')
 
 ########################
-# We can then load the model like this:
+# We can then load the model as demonstrated below.
+#
+# As described in `Saving and loading torch.nn.Modules `__,
+# saving ``state_dict``s is considered the best practice. However,
+# below we use ``weights_only=False`` because this involves loading the
+# model, which is a legacy use case for ``torch.save``.
 
-model = torch.load('model.pth')
+model = torch.load('model.pth', weights_only=False)
 
 ########################
 # .. note:: This approach uses Python `pickle `_ module when serializing the model, thus it relies on the actual class definition to be available when loading the model.
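For quick reference, the ``state_dict`` pattern that these changes promote looks like the following minimal sketch (the ``vgg16`` model and file name mirror the tutorial above; the weights identifier is only illustrative and assumes a recent torchvision):

.. code:: python

    import torch
    import torchvision.models as models

    # Recommended: persist only the learned parameters (the state_dict)
    model = models.vgg16(weights='IMAGENET1K_V1')
    torch.save(model.state_dict(), 'model_weights.pth')

    # To restore: rebuild the same architecture, then load the weights with weights_only=True
    model = models.vgg16()  # untrained instance of the same class
    model.load_state_dict(torch.load('model_weights.pth', weights_only=True))
    model.eval()  # switch dropout/batch norm layers to evaluation mode before inference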
diff --git a/beginner_source/blitz/cifar10_tutorial.py b/beginner_source/blitz/cifar10_tutorial.py
index 8e3f3252921..f38abdd5666 100644
--- a/beginner_source/blitz/cifar10_tutorial.py
+++ b/beginner_source/blitz/cifar10_tutorial.py
@@ -221,7 +221,7 @@ def forward(self, x):
 # wasn't necessary here, we only did it to illustrate how to do so):
 
 net = Net()
-net.load_state_dict(torch.load(PATH))
+net.load_state_dict(torch.load(PATH, weights_only=True))
 
 ########################################################################
 # Okay, now let us see what the neural network thinks these examples above are:
diff --git a/beginner_source/chatbot_tutorial.py b/beginner_source/chatbot_tutorial.py
index 44310cc3620..f902f8cd717 100644
--- a/beginner_source/chatbot_tutorial.py
+++ b/beginner_source/chatbot_tutorial.py
@@ -84,8 +84,7 @@
 # Preparations
 # ------------
 #
-# To start, Download the data ZIP file
-# `here `__
+# To get started, `download `__ the Movie-Dialogs Corpus zip file
 # and put in a ``data/`` directory under the current directory.
 #
diff --git a/beginner_source/deeplabv3_on_android.rst b/beginner_source/deeplabv3_on_android.rst
index f2fe0e48f15..5ca7f01ad06 100644
--- a/beginner_source/deeplabv3_on_android.rst
+++ b/beginner_source/deeplabv3_on_android.rst
@@ -5,6 +5,10 @@ Image Segmentation DeepLabV3 on Android
 
 **Reviewed by**: `Jeremiah Chung `_
 
+.. warning::
+   PyTorch Mobile is no longer actively supported. Please check out `ExecuTorch `_, PyTorch’s all-new on-device inference library. You can also review our `end-to-end workflows `_ and review the `source code for DeepLabV3 `_.
+
+
 Introduction
 ------------
 
diff --git a/beginner_source/fgsm_tutorial.py b/beginner_source/fgsm_tutorial.py
index 007ad3fd956..9bdf52d84b4 100644
--- a/beginner_source/fgsm_tutorial.py
+++ b/beginner_source/fgsm_tutorial.py
@@ -192,7 +192,7 @@ def forward(self, x):
 model = Net().to(device)
 
 # Load the pretrained model
-model.load_state_dict(torch.load(pretrained_model, map_location=device))
+model.load_state_dict(torch.load(pretrained_model, map_location=device, weights_only=True))
 
 # Set the model in evaluation mode. In this case this is for the Dropout layers
 model.eval()
diff --git a/beginner_source/onnx/intro_onnx.py b/beginner_source/onnx/intro_onnx.py
index b5cbafc1c64..ec625ec78ff 100644
--- a/beginner_source/onnx/intro_onnx.py
+++ b/beginner_source/onnx/intro_onnx.py
@@ -39,13 +39,14 @@
     - `ONNX `_ standard library
     - `ONNX Script `_ library that enables developers to author ONNX operators,
-      functions and models using a subset of Python in an expressive, and yet simple fashion.
+      functions and models using a subset of Python in an expressive, and yet simple fashion
+    - `ONNX Runtime `_ accelerated machine learning library.
 
 They can be installed through `pip `_:
 
 .. code-block:: bash
 
-    pip install --upgrade onnx onnxscript
+    pip install --upgrade onnx onnxscript onnxruntime
 
 To validate the installation, run the following commands:
 
diff --git a/beginner_source/saving_loading_models.py b/beginner_source/saving_loading_models.py
index fcd33be2537..6c9b6b1fd77 100644
--- a/beginner_source/saving_loading_models.py
+++ b/beginner_source/saving_loading_models.py
@@ -153,7 +153,7 @@
 # .. code:: python
 #
 #    model = TheModelClass(*args, **kwargs)
-#    model.load_state_dict(torch.load(PATH))
+#    model.load_state_dict(torch.load(PATH, weights_only=True))
 #    model.eval()
 #
 # .. note::
@@ -206,7 +206,7 @@
 # .. code:: python
 #
 #    # Model class must be defined somewhere
-#    model = torch.load(PATH)
+#    model = torch.load(PATH, weights_only=False)
 #    model.eval()
 #
 # This save/load process uses the most intuitive syntax and involves the
@@ -290,7 +290,7 @@
 #    model = TheModelClass(*args, **kwargs)
 #    optimizer = TheOptimizerClass(*args, **kwargs)
 #
-#    checkpoint = torch.load(PATH)
+#    checkpoint = torch.load(PATH, weights_only=True)
 #    model.load_state_dict(checkpoint['model_state_dict'])
 #    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
 #    epoch = checkpoint['epoch']
@@ -354,7 +354,7 @@
 #    optimizerA = TheOptimizerAClass(*args, **kwargs)
 #    optimizerB = TheOptimizerBClass(*args, **kwargs)
 #
-#    checkpoint = torch.load(PATH)
+#    checkpoint = torch.load(PATH, weights_only=True)
 #    modelA.load_state_dict(checkpoint['modelA_state_dict'])
 #    modelB.load_state_dict(checkpoint['modelB_state_dict'])
 #    optimizerA.load_state_dict(checkpoint['optimizerA_state_dict'])
@@ -407,7 +407,7 @@
 # .. code:: python
 #
 #    modelB = TheModelBClass(*args, **kwargs)
-#    modelB.load_state_dict(torch.load(PATH), strict=False)
+#    modelB.load_state_dict(torch.load(PATH, weights_only=True), strict=False)
 #
 # Partially loading a model or loading a partial model are common
 # scenarios when transfer learning or training a new complex model.
@@ -446,7 +446,7 @@
 #
 #    device = torch.device('cpu')
 #    model = TheModelClass(*args, **kwargs)
-#    model.load_state_dict(torch.load(PATH, map_location=device))
+#    model.load_state_dict(torch.load(PATH, map_location=device, weights_only=True))
 #
 # When loading a model on a CPU that was trained with a GPU, pass
 # ``torch.device('cpu')`` to the ``map_location`` argument in the
@@ -469,7 +469,7 @@
 #
 #    device = torch.device("cuda")
 #    model = TheModelClass(*args, **kwargs)
-#    model.load_state_dict(torch.load(PATH))
+#    model.load_state_dict(torch.load(PATH, weights_only=True))
 #    model.to(device)
 #    # Make sure to call input = input.to(device) on any input tensors that you feed to the model
 #
@@ -497,7 +497,7 @@
 #
 #    device = torch.device("cuda")
 #    model = TheModelClass(*args, **kwargs)
-#    model.load_state_dict(torch.load(PATH, map_location="cuda:0"))  # Choose whatever GPU device number you want
+#    model.load_state_dict(torch.load(PATH, weights_only=True, map_location="cuda:0"))  # Choose whatever GPU device number you want
 #    model.to(device)
 #    # Make sure to call input = input.to(device) on any input tensors that you feed to the model
 #
diff --git a/beginner_source/transfer_learning_tutorial.py b/beginner_source/transfer_learning_tutorial.py
index 7a2b053763a..de7a178bd7d 100644
--- a/beginner_source/transfer_learning_tutorial.py
+++ b/beginner_source/transfer_learning_tutorial.py
@@ -209,7 +209,7 @@ def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
     print(f'Best val Acc: {best_acc:4f}')
 
     # load best model weights
-    model.load_state_dict(torch.load(best_model_params_path))
+    model.load_state_dict(torch.load(best_model_params_path, weights_only=True))
     return model
 
 
diff --git a/conf.py b/conf.py
index f0f4905844c..e4bca1ac7fa 100644
--- a/conf.py
+++ b/conf.py
@@ -67,6 +67,12 @@
 #
 # needs_sphinx = '1.0'
 
+html_meta = {
+    'description': 'Master PyTorch with our step-by-step tutorials for all skill levels. Start your journey to becoming a PyTorch expert today!',
+    'keywords': 'PyTorch, tutorials, Getting Started, deep learning, AI',
+    'author': 'PyTorch Contributors'
+}
+
 # Add any Sphinx extension module names here, as strings.
They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. diff --git a/en-wordlist.txt b/en-wordlist.txt index 62762ab69cc..e69cbaa1a5f 100644 --- a/en-wordlist.txt +++ b/en-wordlist.txt @@ -2,6 +2,7 @@ ACL ADI AOT +AOTInductor APIs ATen AVX @@ -617,4 +618,4 @@ warmstarting warmup webp wsi -wsis \ No newline at end of file +wsis diff --git a/index.rst b/index.rst index 91517834fd8..95c4a8f3efb 100644 --- a/index.rst +++ b/index.rst @@ -3,6 +3,7 @@ Welcome to PyTorch Tutorials **What's new in PyTorch tutorials?** +* `torch.export AOTInductor Tutorial for Python runtime (Beta) `__ * `A guide on good usage of non_blocking and pin_memory() in PyTorch `__ * `Introduction to Distributed Pipeline Parallelism `__ * `Introduction to Libuv TCPStore Backend `__ diff --git a/intermediate_source/autograd_saved_tensors_hooks_tutorial.py b/intermediate_source/autograd_saved_tensors_hooks_tutorial.py index f16b170ee6a..ed581426c2e 100644 --- a/intermediate_source/autograd_saved_tensors_hooks_tutorial.py +++ b/intermediate_source/autograd_saved_tensors_hooks_tutorial.py @@ -397,7 +397,7 @@ def pack_hook(tensor): return name def unpack_hook(name): - return torch.load(name) + return torch.load(name, weights_only=True) ###################################################################### @@ -420,7 +420,7 @@ def pack_hook(tensor): return name def unpack_hook(name): - tensor = torch.load(name) + tensor = torch.load(name, weights_only=True) os.remove(name) return tensor @@ -462,7 +462,7 @@ def pack_hook(tensor): return temp_file def unpack_hook(temp_file): - return torch.load(temp_file.name) + return torch.load(temp_file.name, weights_only=True) ###################################################################### diff --git a/intermediate_source/ddp_tutorial.rst b/intermediate_source/ddp_tutorial.rst index 13297fb2a12..cff5105fa54 100644 --- a/intermediate_source/ddp_tutorial.rst +++ b/intermediate_source/ddp_tutorial.rst @@ -214,7 +214,7 @@ and elasticity support, please refer to `TorchElastic `_. +We provide the fine-tuned BERT model for MRPC task `here `_. To save time, you can download the model file (~400 MB) directly into your local folder ``$OUT_DIR``. 2.1 Set global configurations @@ -273,7 +273,7 @@ We load the tokenizer and fine-tuned BERT sequence classifier model 2.3 Define the tokenize and evaluation function ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -We reuse the tokenize and evaluation function from `Huggingface `_. +We reuse the tokenize and evaluation function from `HuggingFace `_. .. code:: python diff --git a/intermediate_source/tiatoolbox_tutorial.rst b/intermediate_source/tiatoolbox_tutorial.rst index dbaf3cdc464..de9b3031330 100644 --- a/intermediate_source/tiatoolbox_tutorial.rst +++ b/intermediate_source/tiatoolbox_tutorial.rst @@ -368,7 +368,7 @@ The PatchPredictor class runs a CNN-based classifier written in PyTorch. 
# Users can load any PyTorch model architecture instead using the following script model = vanilla.CNNModel(backbone="resnet18", num_classes=9) # Importing model from torchvision.models.resnet18 - model.load_state_dict(torch.load(weights_path, map_location="cpu"), strict=True) + model.load_state_dict(torch.load(weights_path, map_location="cpu", weights_only=True), strict=True) def preproc_func(img): img = PIL.Image.fromarray(img) img = transforms.ToTensor()(img) diff --git a/intermediate_source/torch_compile_tutorial.py b/intermediate_source/torch_compile_tutorial.py index 5e7112f5b93..67b055d9ff2 100644 --- a/intermediate_source/torch_compile_tutorial.py +++ b/intermediate_source/torch_compile_tutorial.py @@ -73,17 +73,21 @@ def foo(x, y): ###################################################################### # Alternatively, we can decorate the function. +t1 = torch.randn(10, 10) +t2 = torch.randn(10, 10) @torch.compile def opt_foo2(x, y): a = torch.sin(x) b = torch.cos(y) return a + b -print(opt_foo2(torch.randn(10, 10), torch.randn(10, 10))) +print(opt_foo2(t1, t2)) ###################################################################### # We can also optimize ``torch.nn.Module`` instances. +t = torch.randn(10, 100) + class MyModule(torch.nn.Module): def __init__(self): super().__init__() @@ -94,7 +98,101 @@ def forward(self, x): mod = MyModule() opt_mod = torch.compile(mod) -print(opt_mod(torch.randn(10, 100))) +print(opt_mod(t)) + +###################################################################### +# torch.compile and Nested Calls +# ------------------------------ +# Nested function calls within the decorated function will also be compiled. + +def nested_function(x): + return torch.sin(x) + +@torch.compile +def outer_function(x, y): + a = nested_function(x) + b = torch.cos(y) + return a + b + +print(outer_function(t1, t2)) + +###################################################################### +# In the same fashion, when compiling a module all sub-modules and methods +# within it, that are not in a skip list, are also compiled. + +class OuterModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.inner_module = MyModule() + self.outer_lin = torch.nn.Linear(10, 2) + + def forward(self, x): + x = self.inner_module(x) + return torch.nn.functional.relu(self.outer_lin(x)) + +outer_mod = OuterModule() +opt_outer_mod = torch.compile(outer_mod) +print(opt_outer_mod(t)) + +###################################################################### +# We can also disable some functions from being compiled by using +# ``torch.compiler.disable``. Suppose you want to disable the tracing on just +# the ``complex_function`` function, but want to continue the tracing back in +# ``complex_conjugate``. In this case, you can use +# ``torch.compiler.disable(recursive=False)`` option. Otherwise, the default is +# ``recursive=True``. 
+ +def complex_conjugate(z): + return torch.conj(z) + +@torch.compiler.disable(recursive=False) +def complex_function(real, imag): + # Assuming this function cause problems in the compilation + z = torch.complex(real, imag) + return complex_conjugate(z) + +def outer_function(): + real = torch.tensor([2, 3], dtype=torch.float32) + imag = torch.tensor([4, 5], dtype=torch.float32) + z = complex_function(real, imag) + return torch.abs(z) + +# Try to compile the outer_function +try: + opt_outer_function = torch.compile(outer_function) + print(opt_outer_function()) +except Exception as e: + print("Compilation of outer_function failed:", e) + +###################################################################### +# Best Practices and Recommendations +# ---------------------------------- +# +# Behavior of ``torch.compile`` with Nested Modules and Function Calls +# +# When you use ``torch.compile``, the compiler will try to recursively compile +# every function call inside the target function or module inside the target +# function or module that is not in a skip list (such as built-ins, some functions in +# the torch.* namespace). +# +# **Best Practices:** +# +# 1. **Top-Level Compilation:** One approach is to compile at the highest level +# possible (i.e., when the top-level module is initialized/called) and +# selectively disable compilation when encountering excessive graph breaks or +# errors. If there are still many compile issues, compile individual +# subcomponents instead. +# +# 2. **Modular Testing:** Test individual functions and modules with ``torch.compile`` +# before integrating them into larger models to isolate potential issues. +# +# 3. **Disable Compilation Selectively:** If certain functions or sub-modules +# cannot be handled by `torch.compile`, use the `torch.compiler.disable` context +# managers to recursively exclude them from compilation. +# +# 4. **Compile Leaf Functions First:** In complex models with multiple nested +# functions and modules, start by compiling the leaf functions or modules first. +# For more information see `TorchDynamo APIs for fine-grained tracing `__. ###################################################################### # Demonstrating Speedups diff --git a/prototype_source/README.txt b/prototype_source/README.txt index 4ab9ce8f6a9..2dcb5e0cb2d 100644 --- a/prototype_source/README.txt +++ b/prototype_source/README.txt @@ -7,7 +7,7 @@ Prototype Tutorials 2. graph_mode_static_quantization_tutorial.py Graph Mode Post Training Static Quantization in PyTorch https://pytorch.org/tutorials/prototype/graph_mode_static_quantization_tutorial.html - + 3. graph_mode_dynamic_bert_tutorial.rst Graph Mode Dynamic Quantization on BERT https://github.com/pytorch/tutorials/blob/main/prototype_source/graph_mode_dynamic_bert_tutorial.rst @@ -30,9 +30,12 @@ Prototype Tutorials 8. fx_graph_mode_ptq_dynamic.py FX Graph Mode Post Training Dynamic Quantization - https://pytorch.org/tutorials/prototype/fx_graph_mode_ptq_dynamic.html + https://pytorch.org/tutorials/prototype/fx_graph_mode_ptq_dynamic.html 9. 
fx_graph_mode_quant_guide.py FX Graph Mode Quantization User Guide - https://pytorch.org/tutorials/prototype/fx_graph_mode_quant_guide.html + https://pytorch.org/tutorials/prototype/fx_graph_mode_quant_guide.html +10 flight_recorder_tutorial.rst + Flight Recorder User Guide + https://pytorch.org/tutorials/prototype/flight_recorder_tutorial.html diff --git a/prototype_source/flight_recorder_tutorial.rst b/prototype_source/flight_recorder_tutorial.rst new file mode 100644 index 00000000000..8130e1537bb --- /dev/null +++ b/prototype_source/flight_recorder_tutorial.rst @@ -0,0 +1,186 @@ +(prototype) Flight Recorder for Debugging +========================================= +**Author**: `Chirag Pandya `_, `Junjie Wang `_ + +What you will learn +------------------- +* Learn about a new tool for debugging stuck jobs during distributed training. +* Learn how you can enable the tool and use the collected data for analyzing stuck jobs. + +Prerequisites +------------- +- PyTorch version 2.5 or later. + + +Overview +-------- +An AI distributed training job refers to the process of training a machine learning model using multiple devices, such +as GPUs or CPUs, connected in a network. This approach allows for faster and more efficient training of large models +that require significant computational resources. +An engineer’s goal is to complete an AI training job as quickly as possible and make continuous improvements so that +subsequent training can be done faster. A trained, usable model is the final desired outcome. +One of the biggest impediment to completing training is the concept of a *stuck job*. + +A distributed AI training job is considered `stuck` when it stops making meaningful progress for an extended period of +time. + +A job can get stuck for various reasons: + +- **Data Starvation:** This occurs when the training job is not receiving data at the expected rate, possibly due to issues with the data pipeline or the data source. + +- **Resource Constraints:** If the system running the job does not have enough computational resources (such as CPU, GPU, or memory), the job might not be able to proceed. + +- **Network Issues:** In a distributed training setup, different parts of the model or data may be processed on different devices. If there are network issues, communication between these devices may be disrupted, causing the job to get stuck. + +- **Software Bugs or Errors:** Errors in the training code or the underlying libraries and frameworks can also cause a job to get stuck. + +- **Synchronization Issues:** In distributed training, different parts of the computation are often run in parallel and need to be synchronized at certain points. If this synchronization fails, the job can get stuck. For example, a deadlock can occur if one or more ranks fail to join a collective while the remaining ranks have joined. This results in an indefinite wait for the job to progress. + +Flight Recorder, as the name suggests, captures diagnostics information as collectives run. The captured diagnostic +information is used to help root cause issues when jobs get stuck. +There are two core parts to Flight Recorder. + +- The collection portion: when enabled, information about collectives is recorded in an in-memory circular buffer. Upon job timeout, or on demand, the in-memory buffer can be retrieved or dumped to file. + +- An analyzer script is available in the `tools/flight_recorder `__ directory (details below). 
+ The analyzer script runs known heuristics using the collected data and attempts to automatically identify the underlying issue that caused the job to stall. + +Enabling Flight Recorder +------------------------ +There are two required environment variables to get the initial version of Flight Recorder working. + +- ``TORCH_NCCL_TRACE_BUFFER_SIZE`` (``0``, ``N`` where ``N`` is a positive number): Setting ``N`` enables collection. + ``N`` represents the number of entries that will be kept internally in a circular buffer. + We recommended to set this value at 2000. +- ``TORCH_NCCL_DUMP_ON_TIMEOUT = (true, false)``: Setting this to ``true`` will write out diagnostic files to disk on job timeout. + If enabled, there will be one file per rank output in the job's running directory. + +**Optional settings:** + +- ``TORCH_NCCL_TRACE_CPP_STACK (true, false)``: Setting this to true enables C++ stack stack trace captures in Flight Recorder. + C++ stack traces can be useful in providing the exact code path from a PyTorch Python call down to the primitive + C++ implementations. Also see ``TORCH_SYMBOLIZE_MODE`` in additional settings. +- ``TORCH_NCCL_ENABLE_TIMING (true, false)``: true = enable additional cuda events at the start of each collective and + records the `duration` of each collective. This may incur some CPU overhead. In the collected data, the + ``duration`` field indicates how long each collective took to execute. + +Additional Settings +------------------- + +- ``TORCH_SYMBOLIZE_MODE {dladdr, addr2line, fast}``: This setting determines the program used to retrieve C++ traces from a running program. + The default setting is ``addr2line``. + + ``fast`` is a new experimental mode that is shown to be much faster than the traditional ``addr2line``. + Use this setting in conjunction with ``TORCH_NCCL_TRACE_CPP_STACK`` to collect C++ traces in the Flight Recorder data. + +Retrieving Flight Recorder Data via an API +------------------------------------------ + +You can also retrieve Flight Recorder data with an API call. +The API with the default arguments is shown below: + +.. code:: python + + torch._C._distributed_c10d._dump_nccl_trace(includeCollectives=True, includeStackTraces=True, onlyActive=False) + + +To view the data, you can ``unpickle`` it as shown below: + +.. code:: python + + t = pickle.loads(torch._C._distributed_c10d._dump_nccl_trace()) + print(t) + +Flight Recorder File Formats +---------------------------- + +Flight Recorder files are dumped in ``pickle`` format. Files are written to local disks or mounted shared NFS +folders. + +The contents of a Flight Recorder ``unpickled`` file are shown below: + +.. 
code-block:: json + + { + "version": "2.5", + "pg_config": { + "0": { + "name": "0", + "desc": "default_pg", + "ranks": "[0, 1]" + } + }, + "pg_status": { + "0": { + "last_enqueued_collective": 2, + "last_started_collective": -1, + "last_completed_collective": 2 + } + }, + "entries": [ + { + "frames": [ + { + "name": "test_short_pickle", + "filename": "pytorch/test/distributed/test_c10d_nccl.py", + "line": 3647 + }, + { + "name": "spawn_main", + "filename": ".conda/envs/pytorch-3.10/lib/python3.10/multiprocessing/spawn.py", + "line": 116 + }, + { + "name": "", + "filename": "", + "line": 1 + } + ], + "record_id": 0, + "pg_id": 0, + "process_group": ("0", "default_pg"), + "collective_seq_id": 1, + "p2p_seq_id": 0, + "op_id": 1, + "profiling_name": "nccl:all_reduce", + "time_created_ns": 1724779239936775119, + "input_sizes": [[3, 4]], + "input_dtypes": ["Float"], + "output_sizes": [[3, 4]], + "output_dtypes": ["Float"], + "state": "completed", + "time_discovered_started_ns": null, + "time_discovered_completed_ns": 1724779239975811724, + "retired": true, + "timeout_ms": 600000, + "is_p2p": false + }, + ... + ] + } + + +Analyzing Flight Recorder Dumps +------------------------------- + +We have convenient scripts available in `pytorch/tools/flight_recorder` directory for analyzing captured +data. + +To run the convenience script, follow these steps: + +1. Copy all files from a rank into a single directory. + +2. To run the script, use this command: + +.. code:: python + + python fr_trace.py -d [-o ] + + +Conclusion +---------- +In this tutorial, we have learned about a new PyTorch diagnostic tool called Flight Recorder. +We have discussed how to enable Flight Recorder to collect diagnostic data from a machine. +Additionally, we explored how to analyze the data captured from the Flight Recorder using a +convenience script located in the `tools/flight_recorder `__ +directory of the PyTorch repository. diff --git a/prototype_source/fx_graph_mode_ptq_dynamic.py b/prototype_source/fx_graph_mode_ptq_dynamic.py index 84d6ccb1832..fc29e5fa97b 100644 --- a/prototype_source/fx_graph_mode_ptq_dynamic.py +++ b/prototype_source/fx_graph_mode_ptq_dynamic.py @@ -171,7 +171,8 @@ def tokenize(self, path): model.load_state_dict( torch.load( model_data_filepath + 'word_language_model_quantize.pth', - map_location=torch.device('cpu') + map_location=torch.device('cpu'), + weights_only=True ) ) diff --git a/prototype_source/fx_graph_mode_ptq_static.rst b/prototype_source/fx_graph_mode_ptq_static.rst index a7165f713f8..0c4f8065e37 100644 --- a/prototype_source/fx_graph_mode_ptq_static.rst +++ b/prototype_source/fx_graph_mode_ptq_static.rst @@ -157,7 +157,7 @@ Download the `torchvision resnet18 model `_ and download the ``vit_h`` checkpoint. Alternatively, you can just use ``wget``: `wget https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth --directory-prefix= +# 1. Go to the `segment-anything repo checkpoint `_ and download the ``vit_h`` checkpoint. Alternatively, you can use ``wget`` (for example, ``wget https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth --directory-prefix=``). # 2. Pass in that directory by editing the code below to say: # -# .. code-block:: -# -# {sam_checkpoint_base_path}= +# .. 
code-block:: bash # -# This was run on an A100-PG509-200 power limited to 330.00 W +# {sam_checkpoint_base_path}= # import torch @@ -297,7 +295,7 @@ def get_sam_model(only_one_block=False, batchsize=1): # ----------------- # In this tutorial, we have learned about the quantization and optimization techniques # on the example of the segment anything model. - +# # In the end, we achieved a full-model apples to apples quantization speedup # of about 7.7% on batch size 16 (677.28ms to 729.65ms). We can push this a # bit further by increasing the batch size and optimizing other parts of diff --git a/prototype_source/lite_interpreter.rst b/prototype_source/lite_interpreter.rst new file mode 100644 index 00000000000..73e950d72e2 --- /dev/null +++ b/prototype_source/lite_interpreter.rst @@ -0,0 +1,9 @@ +(Prototype) Introduce lite interpreter workflow in Android and iOS +======================= + +This tutorial has been moved to https://pytorch.org/tutorials/recipes/mobile_interpreter.html + + +.. raw:: html + + diff --git a/prototype_source/prototype_index.rst b/prototype_source/prototype_index.rst index 8d965194f88..af0da6ea56b 100644 --- a/prototype_source/prototype_index.rst +++ b/prototype_source/prototype_index.rst @@ -80,8 +80,8 @@ Prototype features are not available as part of binary distributions like PyPI o :card_description: Learn how to use Post Training Quantization in PyTorch 2 Export. :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png :link: ../prototype/pt2e_quant_ptq.html - :tags: Quantization - + :tags: Quantization + .. customcarditem:: :header: PyTorch 2 Export Quantization-Aware Training :card_description: Learn how to use Quantization-Aware-Training in PyTorch 2 Export. @@ -203,11 +203,11 @@ Prototype features are not available as part of binary distributions like PyPI o .. customcarditem:: :header: MaskedTensor: Simplifying Adagrad Sparse Semantics - :card_description: See a showcase on how masked tensors can enable sparse semantics and provide for a cleaner dev experience + :card_description: See a showcase on how masked tensors can enable sparse semantics and provide for a cleaner dev experience :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png :link: ../prototype/maskedtensor_adagrad.html :tags: MaskedTensor - + .. Model-Optimization .. customcarditem:: @@ -217,6 +217,15 @@ Prototype features are not available as part of binary distributions like PyPI o :link: ../prototype/inductor_cpp_wrapper_tutorial.html :tags: Model-Optimization +.. Distributed + +.. customcarditem:: + :header: Flight Recorder Tutorial + :card_description: Debug your stuck jobs with Flight Recorder + :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: ../prototype/flight_recorder_tutorial.html + :tags: Distributed, Debugging, FlightRecorder + .. End of tutorial card section .. 
raw:: html @@ -238,6 +247,7 @@ Prototype features are not available as part of binary distributions like PyPI o prototype/fx_graph_mode_quant_guide.html prototype/fx_graph_mode_ptq_dynamic.html prototype/fx_graph_mode_ptq_static.html + prototype/flight_recorder_tutorial.html prototype/graph_mode_dynamic_bert_tutorial.html prototype/inductor_cpp_wrapper_tutorial.html prototype/pt2e_quantizer.html diff --git a/prototype_source/pt2e_quant_ptq.rst b/prototype_source/pt2e_quant_ptq.rst index 7f46c86e42e..0fe713f8abe 100644 --- a/prototype_source/pt2e_quant_ptq.rst +++ b/prototype_source/pt2e_quant_ptq.rst @@ -274,7 +274,7 @@ and rename it to ``data/resnet18_pretrained_float.pth``. def load_model(model_file): model = resnet18(pretrained=False) - state_dict = torch.load(model_file) + state_dict = torch.load(model_file, weights_only=True) model.load_state_dict(state_dict) model.to("cpu") return model diff --git a/prototype_source/pt2e_quant_qat.rst b/prototype_source/pt2e_quant_qat.rst index 6d995d368e0..d716af5fec8 100644 --- a/prototype_source/pt2e_quant_qat.rst +++ b/prototype_source/pt2e_quant_qat.rst @@ -172,7 +172,7 @@ prepare the data. These steps are very similar to the ones defined in the def load_model(model_file): model = resnet18(pretrained=False) - state_dict = torch.load(model_file) + state_dict = torch.load(model_file, weights_only=True) model.load_state_dict(state_dict) return model diff --git a/recipes_source/distributed_checkpoint_recipe.rst b/recipes_source/distributed_checkpoint_recipe.rst index 2467db878eb..8f93c2222d6 100644 --- a/recipes_source/distributed_checkpoint_recipe.rst +++ b/recipes_source/distributed_checkpoint_recipe.rst @@ -289,7 +289,7 @@ the intent is to save or load in "non-distributed" style, meaning entirely in th import os import torch - import torch.distributed.checkpoint as DCP + import torch.distributed.checkpoint as dcp import torch.nn as nn diff --git a/recipes_source/distributed_comm_debug_mode.rst b/recipes_source/distributed_comm_debug_mode.rst new file mode 100644 index 00000000000..db79cdc8992 --- /dev/null +++ b/recipes_source/distributed_comm_debug_mode.rst @@ -0,0 +1,210 @@ +Getting Started with ``CommDebugMode`` +===================================================== + +**Author**: `Anshul Sinha `__ + + +In this tutorial, we will explore how to use ``CommDebugMode`` with PyTorch's +DistributedTensor (DTensor) for debugging by tracking collective operations in distributed training environments. + +Prerequisites +--------------------- + +* Python 3.8 - 3.11 +* PyTorch 2.2 or later + + +What is ``CommDebugMode`` and why is it useful +---------------------------------------------------- +As the size of models continues to increase, users are seeking to leverage various combinations +of parallel strategies to scale up distributed training. However, the lack of interoperability +between existing solutions poses a significant challenge, primarily due to the absence of a +unified abstraction that can bridge these different parallelism strategies. To address this +issue, PyTorch has proposed `DistributedTensor(DTensor) +`_ +which abstracts away the complexities of tensor communication in distributed training, +providing a seamless user experience. 
However, when dealing with existing parallelism solutions and +developing parallelism solutions using the unified abstraction like DTensor, the lack of transparency +about what and when the collective communications happens under the hood could make it challenging +for advanced users to identify and resolve issues. To address this challenge, ``CommDebugMode``, a +Python context manager will serve as one of the primary debugging tools for DTensors, enabling +users to view when and why collective operations are happening when using DTensors, effectively +addressing this issue. + + +Using ``CommDebugMode`` +------------------------ + +Here is how you can use ``CommDebugMode``: + +.. code-block:: python + + # The model used in this example is a MLPModule applying Tensor Parallel + comm_mode = CommDebugMode() + with comm_mode: + output = model(inp) + + # print the operation level collective tracing information + print(comm_mode.generate_comm_debug_tracing_table(noise_level=0)) + + # log the operation level collective tracing information to a file + comm_mode.log_comm_debug_tracing_table_to_file( + noise_level=1, file_name="transformer_operation_log.txt" + ) + + # dump the operation level collective tracing information to json file, + # used in the visual browser below + comm_mode.generate_json_dump(noise_level=2) + +This is what the output looks like for a MLPModule at noise level 0: + +.. code-block:: python + + Expected Output: + Global + FORWARD PASS + *c10d_functional.all_reduce: 1 + MLPModule + FORWARD PASS + *c10d_functional.all_reduce: 1 + MLPModule.net1 + MLPModule.relu + MLPModule.net2 + FORWARD PASS + *c10d_functional.all_reduce: 1 + +To use ``CommDebugMode``, you must wrap the code running the model in ``CommDebugMode`` and call the API that +you want to use to display the data. You can also use a ``noise_level`` argument to control the verbosity +level of displayed information. Here is what each noise level displays: + +| 0. Prints module-level collective counts +| 1. Prints DTensor operations (not including trivial operations), module sharding information +| 2. Prints tensor operations (not including trivial operations) +| 3. Prints all operations + +In the example above, you can see that the collective operation, all_reduce, occurs once in the forward pass +of the ``MLPModule``. Furthermore, you can use ``CommDebugMode`` to pinpoint that the all-reduce operation happens +in the second linear layer of the ``MLPModule``. + + +Below is the interactive module tree visualization that you can use to upload your own JSON dump: + +.. raw:: html + + + + + + + CommDebugMode Module Tree + + + +
+
+ Drag file here +
+ +
+
+ + + + +Conclusion +------------------------------------------ + +In this recipe, we have learned how to use ``CommDebugMode`` to debug Distributed Tensors and +parallelism solutions that uses communication collectives with PyTorch. You can use your own +JSON outputs in the embedded visual browser. + +For more detailed information about ``CommDebugMode``, see +`comm_mode_features_example.py +`_ diff --git a/recipes_source/intel_neural_compressor_for_pytorch.rst b/recipes_source/intel_neural_compressor_for_pytorch.rst index 67f1a7f333e..02ce3d7b378 100755 --- a/recipes_source/intel_neural_compressor_for_pytorch.rst +++ b/recipes_source/intel_neural_compressor_for_pytorch.rst @@ -115,7 +115,7 @@ In this tutorial, the LeNet model is used to demonstrate how to deal with *Intel return F.log_softmax(x, dim=1) model = Net() - model.load_state_dict(torch.load('./lenet_mnist_model.pth')) + model.load_state_dict(torch.load('./lenet_mnist_model.pth', weights_only=True)) The pretrained model weight `lenet_mnist_model.pth` comes from `here `_. diff --git a/recipes_source/mobile_interpreter.rst b/recipes_source/mobile_interpreter.rst index dda1dd92435..44036e74ffd 100644 --- a/recipes_source/mobile_interpreter.rst +++ b/recipes_source/mobile_interpreter.rst @@ -3,6 +3,9 @@ **Author**: `Chen Lai `_, `Martin Yuan `_ +.. warning:: + PyTorch Mobile is no longer actively supported. Please check out `ExecuTorch `_, PyTorch’s all-new on-device inference library. You can also review our new documentation to learn more about how to build `iOS `_ and `Android `_ apps with ExecuTorch. + Introduction ------------ diff --git a/recipes_source/mobile_perf.rst b/recipes_source/mobile_perf.rst index aae1447cbf8..14f183ab69e 100644 --- a/recipes_source/mobile_perf.rst +++ b/recipes_source/mobile_perf.rst @@ -1,6 +1,9 @@ Pytorch Mobile Performance Recipes ================================== +.. warning:: + PyTorch Mobile is no longer actively supported. Please check out `ExecuTorch `_, PyTorch’s all-new on-device inference library. You can also learn more about `quantization `_, `Hardware acceleration (op fusion using hw) `_, and `benchmarking `_ on ExecuTorch’s documentation pages. + Introduction ---------------- Performance (aka latency) is crucial to most, if not all, @@ -245,7 +248,7 @@ For example, using ResNet-50 and running the following script: -you would get the following result: +you would get the following result: :: diff --git a/recipes_source/ptmobile_recipes_summary.rst b/recipes_source/ptmobile_recipes_summary.rst index cddee940f2a..6cc8f6f7514 100644 --- a/recipes_source/ptmobile_recipes_summary.rst +++ b/recipes_source/ptmobile_recipes_summary.rst @@ -1,6 +1,9 @@ Summary of PyTorch Mobile Recipes ===================================== +.. warning:: + Note: PyTorch Mobile is no longer actively supported. Please check out `ExecuTorch `_, PyTorch’s all-new on-device inference library. You can also review these `ExecuTorch examples `_. + This summary provides a top level overview of recipes for PyTorch Mobile to help developers choose which recipes to follow for their PyTorch-powered mobile app development. 
Introduction diff --git a/recipes_source/recipes/module_load_state_dict_tips.py b/recipes_source/recipes/module_load_state_dict_tips.py index 17c812b016f..70e9830cb3c 100644 --- a/recipes_source/recipes/module_load_state_dict_tips.py +++ b/recipes_source/recipes/module_load_state_dict_tips.py @@ -39,7 +39,7 @@ def forward(self, x): # to ``torch.load``, the ``torch.device()`` context manager and the ``assign`` # keyword argument to ``nn.Module.load_state_dict()``. -state_dict = torch.load('checkpoint.pth', mmap=True) +state_dict = torch.load('checkpoint.pth', mmap=True, weights_only=True) with torch.device('meta'): meta_m = SomeModule(1000) meta_m.load_state_dict(state_dict, assign=True) @@ -47,7 +47,7 @@ def forward(self, x): ############################################################################# # Compare the snippet below to the one above: -state_dict = torch.load('checkpoint.pth') +state_dict = torch.load('checkpoint.pth', weights_only=True) m = SomeModule(1000) m.load_state_dict(state_dict) @@ -71,7 +71,7 @@ def forward(self, x): # * Waiting for the entire checkpoint to be loaded into RAM before performing, for example, some per-tensor processing. start_time = time.time() -state_dict = torch.load('checkpoint.pth') +state_dict = torch.load('checkpoint.pth', weights_only=True) end_time = time.time() print(f"loading time without mmap={end_time - start_time}") @@ -84,7 +84,7 @@ def forward(self, x): # storages will be memory-mapped. start_time = time.time() -state_dict = torch.load('checkpoint.pth', mmap=True) +state_dict = torch.load('checkpoint.pth', mmap=True, weights_only=True) end_time = time.time() print(f"loading time with mmap={end_time - start_time}") diff --git a/recipes_source/recipes/profiler_recipe.py b/recipes_source/recipes/profiler_recipe.py index 47d9f86d8a8..f35172159b8 100644 --- a/recipes_source/recipes/profiler_recipe.py +++ b/recipes_source/recipes/profiler_recipe.py @@ -73,7 +73,6 @@ # - ``record_shapes`` - whether to record shapes of the operator inputs; # - ``profile_memory`` - whether to report amount of memory consumed by # model's Tensors; -# - ``use_cuda`` - whether to measure execution time of CUDA kernels. # # Note: when using CUDA, profiler also shows the runtime CUDA events # occurring on the host. 
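The ``use_cuda`` flag removed above has been deprecated in favor of the profiler's ``activities`` argument. A minimal sketch of timing CUDA kernels that way (the small model and input are placeholders, and a CUDA device is assumed) could look like:

.. code:: python

    import torch
    from torch.profiler import profile, ProfilerActivity

    model = torch.nn.Linear(128, 128).cuda()
    inputs = torch.randn(32, 128, device="cuda")

    # ProfilerActivity.CUDA records kernel execution times, taking the place of use_cuda=True
    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
                 record_shapes=True, profile_memory=True) as prof:
        model(inputs)

    print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))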
diff --git a/recipes_source/recipes/save_load_across_devices.py b/recipes_source/recipes/save_load_across_devices.py index be950e15b13..c59af8821e9 100644 --- a/recipes_source/recipes/save_load_across_devices.py +++ b/recipes_source/recipes/save_load_across_devices.py @@ -97,7 +97,7 @@ def forward(self, x): # Load device = torch.device('cpu') model = Net() -model.load_state_dict(torch.load(PATH, map_location=device)) +model.load_state_dict(torch.load(PATH, map_location=device, weights_only=True)) ###################################################################### diff --git a/recipes_source/recipes/saving_and_loading_a_general_checkpoint.py b/recipes_source/recipes/saving_and_loading_a_general_checkpoint.py index 31b14f3a28a..8c773a14909 100644 --- a/recipes_source/recipes/saving_and_loading_a_general_checkpoint.py +++ b/recipes_source/recipes/saving_and_loading_a_general_checkpoint.py @@ -131,7 +131,7 @@ def forward(self, x): model = Net() optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9) -checkpoint = torch.load(PATH) +checkpoint = torch.load(PATH, weights_only=True) model.load_state_dict(checkpoint['model_state_dict']) optimizer.load_state_dict(checkpoint['optimizer_state_dict']) epoch = checkpoint['epoch'] diff --git a/recipes_source/recipes/saving_and_loading_models_for_inference.py b/recipes_source/recipes/saving_and_loading_models_for_inference.py index cd24b77c1de..7adce2a90b5 100644 --- a/recipes_source/recipes/saving_and_loading_models_for_inference.py +++ b/recipes_source/recipes/saving_and_loading_models_for_inference.py @@ -117,7 +117,7 @@ def forward(self, x): # Load model = Net() -model.load_state_dict(torch.load(PATH)) +model.load_state_dict(torch.load(PATH, weights_only=True)) model.eval() diff --git a/recipes_source/recipes/saving_multiple_models_in_one_file.py b/recipes_source/recipes/saving_multiple_models_in_one_file.py index f468d7ac6a1..e938be03b45 100644 --- a/recipes_source/recipes/saving_multiple_models_in_one_file.py +++ b/recipes_source/recipes/saving_multiple_models_in_one_file.py @@ -128,7 +128,7 @@ def forward(self, x): optimModelA = optim.SGD(modelA.parameters(), lr=0.001, momentum=0.9) optimModelB = optim.SGD(modelB.parameters(), lr=0.001, momentum=0.9) -checkpoint = torch.load(PATH) +checkpoint = torch.load(PATH, weights_only=True) modelA.load_state_dict(checkpoint['modelA_state_dict']) modelB.load_state_dict(checkpoint['modelB_state_dict']) optimizerA.load_state_dict(checkpoint['optimizerA_state_dict']) diff --git a/recipes_source/recipes/warmstarting_model_using_parameters_from_a_different_model.py b/recipes_source/recipes/warmstarting_model_using_parameters_from_a_different_model.py index 40aeeea9db8..a0752bfc67d 100644 --- a/recipes_source/recipes/warmstarting_model_using_parameters_from_a_different_model.py +++ b/recipes_source/recipes/warmstarting_model_using_parameters_from_a_different_model.py @@ -124,7 +124,7 @@ def forward(self, x): # are loading into. # -netB.load_state_dict(torch.load(PATH), strict=False) +netB.load_state_dict(torch.load(PATH, weights_only=True), strict=False) ###################################################################### diff --git a/recipes_source/recipes_index.rst b/recipes_source/recipes_index.rst index 8959ea98a38..caccdcc28f7 100644 --- a/recipes_source/recipes_index.rst +++ b/recipes_source/recipes_index.rst @@ -150,6 +150,12 @@ Recipes are bite-sized, actionable examples of how to use specific PyTorch featu :link: ../recipes/recipes/swap_tensors.html :tags: Basics +.. 
customcarditem:: + :header: torch.export AOTInductor Tutorial for Python runtime + :card_description: Learn an end-to-end example of how to use AOTInductor for python runtime. + :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: ../recipes/torch_export_aoti_python.html + :tags: Basics .. Interpretability @@ -395,6 +401,13 @@ Recipes are bite-sized, actionable examples of how to use specific PyTorch featu :link: ../recipes/distributed_async_checkpoint_recipe.html :tags: Distributed-Training +.. customcarditem:: + :header: Getting Started with CommDebugMode + :card_description: Learn how to use CommDebugMode for DTensors + :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: ../recipes/distributed_comm_debug_mode.html + :tags: Distributed-Training + .. TorchServe .. customcarditem:: @@ -449,3 +462,4 @@ Recipes are bite-sized, actionable examples of how to use specific PyTorch featu /recipes/cuda_rpc /recipes/distributed_optim_torchscript /recipes/mobile_interpreter + /recipes/distributed_comm_debug_mode diff --git a/recipes_source/torch_export_aoti_python.py b/recipes_source/torch_export_aoti_python.py new file mode 100644 index 00000000000..312491b660f --- /dev/null +++ b/recipes_source/torch_export_aoti_python.py @@ -0,0 +1,224 @@ +# -*- coding: utf-8 -*- + +""" +.. meta:: + :description: An end-to-end example of how to use AOTInductor for Python runtime. + :keywords: torch.export, AOTInductor, torch._inductor.aot_compile, torch._export.aot_load + +``torch.export`` AOTInductor Tutorial for Python runtime (Beta) +=============================================================== +**Author:** Ankith Gunapal, Bin Bao, Angela Yi +""" + +###################################################################### +# +# .. warning:: +# +# ``torch._inductor.aot_compile`` and ``torch._export.aot_load`` are in Beta status and are subject to backwards compatibility +# breaking changes. This tutorial provides an example of how to use these APIs for model deployment using Python runtime. +# +# It has been shown `previously `__ how AOTInductor can be used +# to do Ahead-of-Time compilation of PyTorch exported models by creating +# a shared library that can be run in a non-Python environment. +# +# +# In this tutorial, you will learn an end-to-end example of how to use AOTInductor for Python runtime. +# We will look at how to use :func:`torch._inductor.aot_compile` along with :func:`torch.export.export` to generate a +# shared library. Additionally, we will examine how to execute the shared library in Python runtime using :func:`torch._export.aot_load`. +# You will learn about the speed up seen in the first inference time using AOTInductor, especially when using +# ``max-autotune`` mode which can take some time to execute. +# +# **Contents** +# +# .. contents:: +# :local: + +###################################################################### +# Prerequisites +# ------------- +# * PyTorch 2.4 or later +# * Basic understanding of ``torch.export`` and AOTInductor +# * Complete the `AOTInductor: Ahead-Of-Time Compilation for Torch.Export-ed Models `_ tutorial + +###################################################################### +# What you will learn +# ---------------------- +# * How to use AOTInductor for python runtime. +# * How to use :func:`torch._inductor.aot_compile` along with :func:`torch.export.export` to generate a shared library +# * How to run a shared library in Python runtime using :func:`torch._export.aot_load`. 
+
+######################################################################
+# Model Compilation
+# -----------------
+#
+# We will use the TorchVision pretrained ``ResNet18`` model and run TorchInductor on the
+# exported PyTorch program using :func:`torch._inductor.aot_compile`.
+#
+# .. note::
+#
+#     This API also supports :func:`torch.compile` options, such as ``mode``.
+#     This means that if used on a CUDA-enabled device, you can, for example, set ``"max_autotune": True``,
+#     which leverages Triton-based matrix multiplications and convolutions, and enables CUDA graphs by default.
+#
+# We also specify ``dynamic_shapes`` for the batch dimension. In this example, ``min=2`` is not a bug and is
+# explained in `The 0/1 Specialization Problem `__.
+
+
+import os
+import torch
+from torchvision.models import ResNet18_Weights, resnet18
+
+model = resnet18(weights=ResNet18_Weights.DEFAULT)
+model.eval()
+
+with torch.inference_mode():
+
+    # Specify the generated shared library path
+    aot_compile_options = {
+        "aot_inductor.output_path": os.path.join(os.getcwd(), "resnet18_pt2.so"),
+    }
+    if torch.cuda.is_available():
+        device = "cuda"
+        aot_compile_options.update({"max_autotune": True})
+    else:
+        device = "cpu"
+
+    model = model.to(device=device)
+    example_inputs = (torch.randn(2, 3, 224, 224, device=device),)
+
+    # min=2 is not a bug and is explained in the 0/1 Specialization Problem
+    batch_dim = torch.export.Dim("batch", min=2, max=32)
+    exported_program = torch.export.export(
+        model,
+        example_inputs,
+        # Specify the first dimension of the input x as dynamic
+        dynamic_shapes={"x": {0: batch_dim}},
+    )
+    so_path = torch._inductor.aot_compile(
+        exported_program.module(),
+        example_inputs,
+        # Specify the generated shared library path
+        options=aot_compile_options
+    )
+
+
+######################################################################
+# Model Inference in Python
+# -------------------------
+#
+# Typically, the shared object generated above is used in a non-Python environment. In PyTorch 2.3,
+# we added a new API called :func:`torch._export.aot_load` to load the shared library in the Python runtime.
+# The API follows a structure similar to the :func:`torch.jit.load` API. You need to specify the path
+# of the shared library and the device where it should be loaded.
+#
+# .. note::
+#     In the example below, we specify ``batch_size=1`` for inference and it still functions correctly even though we specified ``min=2`` in
+#     :func:`torch.export.export`.
+
+
+import os
+import torch
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model_so_path = os.path.join(os.getcwd(), "resnet18_pt2.so")
+
+model = torch._export.aot_load(model_so_path, device)
+example_inputs = (torch.randn(1, 3, 224, 224, device=device),)
+
+with torch.inference_mode():
+    output = model(example_inputs)
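Because the export above marked the batch dimension as dynamic (``min=2``, ``max=32``), the same shared library should also accept other batch sizes in that range without recompilation. A quick check that mirrors the loading code above and reuses its ``model`` and ``device``; the batch size of 8 is an arbitrary illustrative choice:

    with torch.inference_mode():
        # Any batch size inside the exported range (2..32) is expected to be
        # served by the same compiled artifact; 8 is chosen only for illustration.
        larger_batch = (torch.randn(8, 3, 224, 224, device=device),)
        output = model(larger_batch)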
+
+######################################################################
+# When to use AOTInductor for Python Runtime
+# ------------------------------------------
+#
+# One of the requirements for using AOTInductor is that the model shouldn't have any graph breaks.
+# Once this requirement is met, the primary use case for using AOTInductor Python Runtime is for
+# model deployment using Python.
+# There are mainly two reasons why you would use AOTInductor Python Runtime:
+#
+# - ``torch._inductor.aot_compile`` generates a shared library. This is useful for model
+#   versioning for deployments and tracking model performance over time.
+# - With :func:`torch.compile` being a JIT compiler, there is a warmup
+#   cost associated with the first compilation. Your deployment needs to account for the
+#   compilation time taken for the first inference. With AOTInductor, the compilation is
+#   done offline using ``torch.export.export`` and ``torch._inductor.aot_compile``. The deployment
+#   would only load the shared library using ``torch._export.aot_load`` and run inference.
+#
+#
+# The section below shows the speedup achieved with AOTInductor for the first inference.
+#
+# We define a utility function ``timed`` to measure the time taken for inference.
+#
+
+import time
+def timed(fn):
+    # Returns the result of running `fn()` and the time it took for `fn()` to run,
+    # in milliseconds. We use CUDA events and synchronization for accurate
+    # measurement on CUDA-enabled devices.
+    if torch.cuda.is_available():
+        start = torch.cuda.Event(enable_timing=True)
+        end = torch.cuda.Event(enable_timing=True)
+        start.record()
+    else:
+        start = time.time()
+
+    result = fn()
+    if torch.cuda.is_available():
+        end.record()
+        torch.cuda.synchronize()
+    else:
+        end = time.time()
+
+    # Measure time taken to execute the function in milliseconds
+    if torch.cuda.is_available():
+        duration = start.elapsed_time(end)
+    else:
+        duration = (end - start) * 1000
+
+    return result, duration
+
+
+######################################################################
+# Let's measure the time taken for the first inference using AOTInductor.
+
+torch._dynamo.reset()
+
+model = torch._export.aot_load(model_so_path, device)
+example_inputs = (torch.randn(1, 3, 224, 224, device=device),)
+
+with torch.inference_mode():
+    _, time_taken = timed(lambda: model(example_inputs))
+    print(f"Time taken for first inference for AOTInductor is {time_taken:.2f} ms")
+
+
+######################################################################
+# Let's measure the time taken for the first inference using ``torch.compile``.
+
+torch._dynamo.reset()
+
+model = resnet18(weights=ResNet18_Weights.DEFAULT).to(device)
+model.eval()
+
+model = torch.compile(model)
+example_inputs = torch.randn(1, 3, 224, 224, device=device)
+
+with torch.inference_mode():
+    _, time_taken = timed(lambda: model(example_inputs))
+    print(f"Time taken for first inference for torch.compile is {time_taken:.2f} ms")
+
+######################################################################
+# We see that there is a drastic speedup in first inference time using AOTInductor compared
+# to ``torch.compile``.
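The measurements above compare only the cold, first-inference cost. To get a rough sense of steady-state latency, the same ``timed`` helper can be reused for a second call on the already warmed-up model defined just above; the expectation that the gap between the two approaches narrows on warm runs is an assumption here, not a result reported by the recipe:

    with torch.inference_mode():
        # A second call runs on the already compiled/loaded model, so there is no
        # compilation cost and the number approximates steady-state latency.
        _, warm_time = timed(lambda: model(example_inputs))
        print(f"Time taken for a warm inference is {warm_time:.2f} ms")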
+
+######################################################################
+# Conclusion
+# ----------
+#
+# In this recipe, we have learned how to effectively use AOTInductor for Python runtime by
+# compiling and loading a pretrained ``ResNet18`` model using the ``torch._inductor.aot_compile``
+# and ``torch._export.aot_load`` APIs. This process demonstrates the practical application of
+# generating a shared library and running it within a Python environment, even with dynamic shape
+# considerations and device-specific optimizations. We also looked at the advantage of using
+# AOTInductor in model deployments, with regard to the speedup in first inference time.
diff --git a/tutorial_submission_policy.md b/tutorial_submission_policy.md
new file mode 100644
index 00000000000..c5c3a800876
--- /dev/null
+++ b/tutorial_submission_policy.md
@@ -0,0 +1,107 @@
+# PyTorch Tutorial Submission Policy
+
+This policy outlines the criteria and process for submitting new
+tutorials to the PyTorch community.
+Our goal is to ensure that all tutorials are of high quality,
+relevant, and up-to-date, supporting both the growth of PyTorch
+users and the evolution of the PyTorch framework itself. By following
+these guidelines, contributors can help us maintain a robust and
+informative educational environment.
+
+## Acceptance Criteria For New Tutorials
+
+We accept new tutorials that adhere to one of the following use cases:
+
+* **Demonstrate New PyTorch Features:** Tutorials that support new features
+  for upcoming PyTorch releases are typically authored by the engineers who
+  are developing these features. These tutorials are crucial for showcasing
+  the latest advancements in PyTorch. We typically do not require more than
+  one tutorial per feature.
+
+* **Tutorials showcasing PyTorch usage with other tools and libraries:** We
+  accept community-contributed tutorials that illustrate innovative uses of
+  PyTorch alongside other open-source projects, models, and tools. Please
+  ensure that your tutorial remains neutral and does not promote or endorse
+  proprietary technologies over others.
+
+The first use case does not require going through the submission
+process outlined below. If your tutorial falls under the second category,
+please read and follow the instructions in the
+**Submission Process For Community-Contributed Tutorials** section.
+
+## Submission Process For Community-Contributed Tutorials
+
+To maintain the quality and relevance of tutorials, we request that
+community-contributed tutorials undergo a review process. If you are
+interested in contributing a tutorial, please follow these steps:
+
+1. **Create an issue:**
+   * Open an issue in the pytorch/tutorials repository proposing the
+     new tutorial. Clearly explain the importance of the tutorial and
+     confirm that there is no existing tutorial covering the same or
+     similar topic. A tutorial should not disproportionately endorse
+     one technology over another. Please consult with Core Maintainers
+     to ensure your content adheres to these guidelines.
+     Use the provided [ISSUE_TEMPLATE](https://github.com/pytorch/tutorials/blob/main/.github/ISSUE_TEMPLATE/feature-request.yml) for the new tutorial request and select **Feature request** when submitting an issue.
+
+   * If there is an existing tutorial on the topic that you would
+     like to significantly refactor, you can submit a PR. In the
+     description of the PR, explain why the changes are needed and
+     how they improve the tutorial.
+
+   * These issues will be triaged by PyTorch maintainers on a case-by-case basis.
+   * Link any supporting materials, including discussions in other repositories.
+
+1. **Await Approval:**
+   * Wait for a response from the PyTorch Tutorials maintainers. A PyTorch
+     tutorial maintainer will review your proposal and
+     determine whether a tutorial on the proposed topic is desirable.
+     A comment and an **approved** label will be added to your issue
+     by a maintainer. The review process for new tutorial PRs submitted
+     without the corresponding issue may take longer.
+
+1. **Adhere to writing and styling guidelines:**
+   * Once approved, follow the guidelines outlined in [CONTRIBUTING.md](https://github.com/pytorch/tutorials/blob/main/CONTRIBUTING.md)
+     and use the provided [template](https://github.com/pytorch/tutorials/blob/main/beginner_source/template_tutorial.py) for creating your tutorial.
+   * Link the issue in which you received approval for your tutorial
+     in the PR.
+   * We accept tutorials in both ``.rst`` (ReStructuredText) and ``.py``
+     (Python) formats. However, unless your tutorial uses multiple GPUs,
+     involves parallel/distributed training, or requires extended
+     execution time (25 minutes or more), we prefer submissions
+     in Python file format.
+
+## Maintaining Tutorials
+
+When you submit a new tutorial, we encourage you to keep it in sync
+with the latest PyTorch updates and features. Additionally, we may
+contact you to review any PRs, issues, and other related matters to
+ensure the tutorial remains a valuable resource.
+
+Please note the following:
+
+* If a tutorial breaks against the main branch, it will
+  be excluded from the build and an issue will be filed against it,
+  with the author/maintainer notified. If the issue is not resolved
+  within 90 days, the tutorial might be deleted from the repository.
+
+* We recommend that each tutorial is reviewed at least once a year to
+  ensure its relevance.
+
+## Deleting Stale Tutorials
+
+A tutorial might be considered stale when it no longer aligns with
+the latest PyTorch updates, features, or best practices:
+
+* The tutorial is no longer functional due to changes in PyTorch or
+  its dependencies.
+* The tutorial has been superseded by a newer, more comprehensive, or
+  more accurate tutorial.
+* The tutorial does not run successfully in CI, indicating
+  potential compatibility or dependency issues.
+
+If a tutorial is deemed stale, we will attempt to contact the code owner,
+or someone from the tutorial maintainers might attempt to update it.
+However, if despite those attempts we fail to fix it, the tutorial
+might be removed from the repository.