From 4c449dc746b0b4c67b73c56ee38cb11caa1adf48 Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Fri, 12 Jan 2024 15:54:59 -0800 Subject: [PATCH 1/2] Fix the Underline too short warnings --- advanced_source/ddp_pipeline.py | 2 +- advanced_source/dispatcher.rst | 2 +- advanced_source/usb_semisup_learn.py | 2 +- beginner_source/basics/autogradqs_tutorial.py | 4 ++-- beginner_source/basics/buildmodel_tutorial.py | 4 ++-- beginner_source/basics/data_tutorial.py | 14 +++++------ beginner_source/basics/intro.py | 4 ++-- beginner_source/basics/tensorqs_tutorial.py | 4 ++-- beginner_source/blitz/autograd_tutorial.py | 4 ++-- beginner_source/blitz/cifar10_tutorial.py | 2 +- beginner_source/blitz/tensor_tutorial.py | 2 +- .../ddp_series_fault_tolerance.rst | 4 ---- beginner_source/ddp_series_multigpu.rst | 1 - beginner_source/dist_overview.rst | 2 +- .../knowledge_distillation_tutorial.py | 2 +- beginner_source/nn_tutorial.py | 2 +- beginner_source/profiler.py | 3 ++- beginner_source/pytorch_with_examples.rst | 11 +++++---- beginner_source/t5_tutorial.py | 8 ++++--- beginner_source/vt_tutorial.py | 2 +- intermediate_source/FSDP_tutorial.rst | 6 ++--- intermediate_source/ddp_tutorial.rst | 4 ++-- .../dynamic_quantization_bert_tutorial.rst | 2 +- intermediate_source/ensembling.py | 2 +- intermediate_source/fx_conv_bn_fuser.py | 2 +- intermediate_source/fx_profiling_tutorial.py | 2 +- intermediate_source/mario_rl_tutorial.py | 2 +- .../model_parallel_tutorial.py | 2 +- .../optimizer_step_in_backward_tutorial.py | 4 ++-- intermediate_source/pipeline_tutorial.py | 2 +- intermediate_source/realtime_rpi.rst | 2 +- .../tensorboard_profiler_tutorial.py | 2 +- intermediate_source/torch_compile_tutorial.py | 4 ++-- intermediate_source/torchserve_with_ipex.rst | 2 +- .../torchserve_with_ipex_2.rst | 2 +- prototype_source/fx_graph_mode_ptq_dynamic.py | 2 +- prototype_source/fx_graph_mode_ptq_static.rst | 2 +- .../graph_mode_dynamic_bert_tutorial.rst | 2 +- 
.../inductor_cpp_wrapper_tutorial.rst | 2 +- prototype_source/ios_gpu_workflow.rst | 4 ++-- prototype_source/nestedtensor.py | 12 ++++------ prototype_source/nnapi_mobilenetv2.rst | 2 +- prototype_source/numeric_suite_tutorial.py | 2 +- prototype_source/pt2e_quant_qat.rst | 2 +- prototype_source/semi_structured_sparse.rst | 4 ++-- prototype_source/vulkan_workflow.rst | 2 +- recipes_source/bundled_inputs.rst | 12 +++++++--- recipes_source/mobile_interpreter.rst | 2 +- recipes_source/recipes/amp_recipe.py | 2 +- .../recipes/dynamic_quantization.py | 24 ++++++------------- .../recipes/tensorboard_with_pytorch.py | 6 ++--- recipes_source/recipes/tuning_guide.py | 2 +- 52 files changed, 97 insertions(+), 106 deletions(-) diff --git a/advanced_source/ddp_pipeline.py b/advanced_source/ddp_pipeline.py index 1eb956a7836..4c143feabaa 100644 --- a/advanced_source/ddp_pipeline.py +++ b/advanced_source/ddp_pipeline.py @@ -439,7 +439,7 @@ def evaluate(eval_model, data_source): ###################################################################### # Evaluate the model with the test dataset -# ------------------------------------- +# ---------------------------------------- # # Apply the best model to check the result with the test dataset. diff --git a/advanced_source/dispatcher.rst b/advanced_source/dispatcher.rst index 1a8034a62e5..0b5fd3c8aff 100644 --- a/advanced_source/dispatcher.rst +++ b/advanced_source/dispatcher.rst @@ -129,7 +129,7 @@ for debugging in larger models where previously it can be hard to pin-point exactly where the ``requires_grad``-ness is lost during the forward pass. 
In-place or view ops -^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^ To ensure correctness and best possible performance, if your op mutates an input in-place or returns a tensor that aliases with one of the inputs, two additional diff --git a/advanced_source/usb_semisup_learn.py b/advanced_source/usb_semisup_learn.py index b970ceba4bb..4f93fc61cab 100644 --- a/advanced_source/usb_semisup_learn.py +++ b/advanced_source/usb_semisup_learn.py @@ -157,7 +157,7 @@ ###################################################################### # Use USB to Train ``SoftMatch`` with specific imbalanced algorithm on imbalanced CIFAR-10 -# ------------------------------------------------------------------------------------ +# ---------------------------------------------------------------------------------------- # # Now let's say we have imbalanced labeled set and unlabeled set of CIFAR-10, # and we want to train a ``SoftMatch`` model on it. diff --git a/beginner_source/basics/autogradqs_tutorial.py b/beginner_source/basics/autogradqs_tutorial.py index d8b53d6175b..8eff127ddee 100644 --- a/beginner_source/basics/autogradqs_tutorial.py +++ b/beginner_source/basics/autogradqs_tutorial.py @@ -10,7 +10,7 @@ `Save & Load Model `_ Automatic Differentiation with ``torch.autograd`` -======================================= +================================================= When training neural networks, the most frequently used algorithm is **back propagation**. In this algorithm, parameters (model weights) are @@ -170,7 +170,7 @@ ###################################################################### # Optional Reading: Tensor Gradients and Jacobian Products -# -------------------------------------- +# -------------------------------------------------------- # # In many cases, we have a scalar loss function, and we need to compute # the gradient with respect to some parameters. 
However, there are cases diff --git a/beginner_source/basics/buildmodel_tutorial.py b/beginner_source/basics/buildmodel_tutorial.py index cae5c99134a..987bc7c44a2 100644 --- a/beginner_source/basics/buildmodel_tutorial.py +++ b/beginner_source/basics/buildmodel_tutorial.py @@ -10,7 +10,7 @@ `Save & Load Model `_ Build the Neural Network -=================== +======================== Neural networks comprise of layers/modules that perform operations on data. The `torch.nn `_ namespace provides all the building blocks you need to @@ -197,5 +197,5 @@ def forward(self, x): ################################################################# # Further Reading -# -------------- +# ----------------- # - `torch.nn API `_ diff --git a/beginner_source/basics/data_tutorial.py b/beginner_source/basics/data_tutorial.py index 0ef1fb6b777..561e9723fde 100644 --- a/beginner_source/basics/data_tutorial.py +++ b/beginner_source/basics/data_tutorial.py @@ -10,7 +10,7 @@ `Save & Load Model `_ Datasets & DataLoaders -=================== +====================== """ @@ -69,7 +69,7 @@ ################################################################# # Iterating and Visualizing the Dataset -# ----------------- +# ------------------------------------- # # We can index ``Datasets`` manually like a list: ``training_data[index]``. # We use ``matplotlib`` to visualize some samples in our training data. @@ -144,7 +144,7 @@ def __getitem__(self, idx): ################################################################# -# __init__ +# ``__init__`` # ^^^^^^^^^^^^^^^^^^^^ # # The __init__ function is run once when instantiating the Dataset object. We initialize @@ -167,7 +167,7 @@ def __init__(self, annotations_file, img_dir, transform=None, target_transform=N ################################################################# -# __len__ +# ``__len__`` # ^^^^^^^^^^^^^^^^^^^^ # # The __len__ function returns the number of samples in our dataset. 
@@ -180,7 +180,7 @@ def __len__(self): ################################################################# -# __getitem__ +# ``__getitem__`` # ^^^^^^^^^^^^^^^^^^^^ # # The __getitem__ function loads and returns a sample from the dataset at the given index ``idx``. @@ -220,7 +220,7 @@ def __getitem__(self, idx): ########################### # Iterate through the DataLoader -# -------------------------- +# ------------------------------- # # We have loaded that dataset into the ``DataLoader`` and can iterate through the dataset as needed. # Each iteration below returns a batch of ``train_features`` and ``train_labels`` (containing ``batch_size=64`` features and labels respectively). @@ -243,5 +243,5 @@ def __getitem__(self, idx): ################################################################# # Further Reading -# -------------- +# ---------------- # - `torch.utils.data API `_ diff --git a/beginner_source/basics/intro.py b/beginner_source/basics/intro.py index b7369938643..bc0d3d72a2e 100644 --- a/beginner_source/basics/intro.py +++ b/beginner_source/basics/intro.py @@ -31,7 +31,7 @@ Running the Tutorial Code ------------------- +------------------------- You can run this tutorial in a couple of ways: - **In the cloud**: This is the easiest way to get started! Each section has a "Run in Microsoft Learn" and "Run in Google Colab" link at the top, which opens an integrated notebook in Microsoft Learn or Google Colab, respectively, with the code in a fully-hosted environment. @@ -39,7 +39,7 @@ How to Use this Guide ------------------ +--------------------- If you're familiar with other deep learning frameworks, check out the `0. Quickstart `_ first to quickly familiarize yourself with PyTorch's API. 
diff --git a/beginner_source/basics/tensorqs_tutorial.py b/beginner_source/basics/tensorqs_tutorial.py index 1a086fc5ad8..70a966d9f89 100644 --- a/beginner_source/basics/tensorqs_tutorial.py +++ b/beginner_source/basics/tensorqs_tutorial.py @@ -80,7 +80,7 @@ ###################################################################### # Attributes of a Tensor -# ~~~~~~~~~~~~~~~~~ +# ~~~~~~~~~~~~~~~~~~~~~~ # # Tensor attributes describe their shape, datatype, and the device on which they are stored. @@ -97,7 +97,7 @@ ###################################################################### # Operations on Tensors -# ~~~~~~~~~~~~~~~~~ +# ~~~~~~~~~~~~~~~~~~~~~~~ # # Over 100 tensor operations, including arithmetic, linear algebra, matrix manipulation (transposing, # indexing, slicing), sampling and more are diff --git a/beginner_source/blitz/autograd_tutorial.py b/beginner_source/blitz/autograd_tutorial.py index b6d4007303f..18d666ce8eb 100644 --- a/beginner_source/blitz/autograd_tutorial.py +++ b/beginner_source/blitz/autograd_tutorial.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- """ A Gentle Introduction to ``torch.autograd`` ---------------------------------- +=========================================== ``torch.autograd`` is PyTorch’s automatic differentiation engine that powers neural network training. 
In this section, you will get a conceptual @@ -149,7 +149,7 @@ ###################################################################### # Optional Reading - Vector Calculus using ``autograd`` -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ # # Mathematically, if you have a vector valued function # :math:`\vec{y}=f(\vec{x})`, then the gradient of :math:`\vec{y}` with diff --git a/beginner_source/blitz/cifar10_tutorial.py b/beginner_source/blitz/cifar10_tutorial.py index 5a9cde3f105..8e3f3252921 100644 --- a/beginner_source/blitz/cifar10_tutorial.py +++ b/beginner_source/blitz/cifar10_tutorial.py @@ -115,7 +115,7 @@ def imshow(img): ######################################################################## # 2. Define a Convolutional Neural Network -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ # Copy the neural network from the Neural Networks section before and modify it to # take 3-channel images (instead of 1-channel images as it was defined). diff --git a/beginner_source/blitz/tensor_tutorial.py b/beginner_source/blitz/tensor_tutorial.py index 5219ad4ee43..ac54945bc3a 100644 --- a/beginner_source/blitz/tensor_tutorial.py +++ b/beginner_source/blitz/tensor_tutorial.py @@ -1,6 +1,6 @@ """ Tensors --------------------------------------------- +======== Tensors are a specialized data structure that are very similar to arrays and matrices. In PyTorch, we use tensors to encode the inputs and diff --git a/beginner_source/ddp_series_fault_tolerance.rst b/beginner_source/ddp_series_fault_tolerance.rst index 95da10525a8..7a4e3cc8c80 100644 --- a/beginner_source/ddp_series_fault_tolerance.rst +++ b/beginner_source/ddp_series_fault_tolerance.rst @@ -93,11 +93,7 @@ In elastic training, whenever there are any membership changes (adding or removi on available devices. Having this structure ensures your training job can continue without manual intervention. 
- - - Diff for `multigpu.py `__ v/s `multigpu_torchrun.py `__ ------------------------------------------------------------ Process group initialization ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/beginner_source/ddp_series_multigpu.rst b/beginner_source/ddp_series_multigpu.rst index a5eb60708d2..2d294c97930 100644 --- a/beginner_source/ddp_series_multigpu.rst +++ b/beginner_source/ddp_series_multigpu.rst @@ -52,7 +52,6 @@ Along the way, we will talk through important concepts in distributed training w Diff for `single_gpu.py `__ v/s `multigpu.py `__ ----------------------------------------------------- These are the changes you typically make to a single-GPU training script to enable DDP. diff --git a/beginner_source/dist_overview.rst b/beginner_source/dist_overview.rst index 12e9bfa0e55..7768dc4876c 100644 --- a/beginner_source/dist_overview.rst +++ b/beginner_source/dist_overview.rst @@ -150,7 +150,7 @@ throws an exception, it is likely to lead to desynchronization (mismatched adds fault tolerance and the ability to make use of a dynamic pool of machines (elasticity). 
RPC-Based Distributed Training ----------------------------- +------------------------------ Many training paradigms do not fit into data parallelism, e.g., parameter server paradigm, distributed pipeline parallelism, reinforcement diff --git a/beginner_source/knowledge_distillation_tutorial.py b/beginner_source/knowledge_distillation_tutorial.py index 304ac661d4e..cc03cbb1d6c 100644 --- a/beginner_source/knowledge_distillation_tutorial.py +++ b/beginner_source/knowledge_distillation_tutorial.py @@ -25,7 +25,7 @@ # - How to improve the performance of lightweight models by using more complex models as teachers # # Prerequisites -# ~~~~~~~~~~~ +# ~~~~~~~~~~~~~ # # * 1 GPU, 4GB of memory # * PyTorch v2.0 or later diff --git a/beginner_source/nn_tutorial.py b/beginner_source/nn_tutorial.py index 183aca1748b..ccb22555db0 100644 --- a/beginner_source/nn_tutorial.py +++ b/beginner_source/nn_tutorial.py @@ -98,7 +98,7 @@ ############################################################################### # Neural net from scratch (without ``torch.nn``) -# --------------------------------------------- +# ----------------------------------------------- # # Let's first create a model using nothing but PyTorch tensor operations. We're assuming # you're already familiar with the basics of neural networks. 
(If you're not, you can diff --git a/beginner_source/profiler.py b/beginner_source/profiler.py index 95d077f7ba3..ed0f173b154 100644 --- a/beginner_source/profiler.py +++ b/beginner_source/profiler.py @@ -1,6 +1,7 @@ """ Profiling your PyTorch Module ------------- +----------------------------- + **Author:** `Suraj Subramanian `_ PyTorch includes a profiler API that is useful to identify the time and diff --git a/beginner_source/pytorch_with_examples.rst b/beginner_source/pytorch_with_examples.rst index c43b90a4c44..202832f9c3f 100644 --- a/beginner_source/pytorch_with_examples.rst +++ b/beginner_source/pytorch_with_examples.rst @@ -1,5 +1,6 @@ Learning PyTorch with Examples -****************************** +============================== + **Author**: `Justin Johnson `_ .. note:: @@ -29,7 +30,7 @@ between the network output and the true output. :local: Tensors -======= +~~~~~~~ Warm-up: numpy -------------- @@ -74,7 +75,7 @@ and backward passes through the network: Autograd -======== +~~~~~~~~ PyTorch: Tensors and autograd ------------------------------- @@ -133,7 +134,7 @@ our model: .. includenodoc:: /beginner/examples_autograd/polynomial_custom_function.py ``nn`` module -=========== +~~~~~~~~~~~~~ PyTorch: ``nn`` --------------- @@ -219,7 +220,7 @@ We can easily implement this model as a Module subclass: .. _examples-download: Examples -======== +~~~~~~~~ You can browse the above examples here. diff --git a/beginner_source/t5_tutorial.py b/beginner_source/t5_tutorial.py index 8f77cd278ea..1387975ad3d 100644 --- a/beginner_source/t5_tutorial.py +++ b/beginner_source/t5_tutorial.py @@ -223,8 +223,10 @@ def process_labels(labels, x): ####################################################################### -# Summarization Output (Might vary since we shuffle the dataloader) +# Summarization Output # -------------------- +# +# Summarization output might vary since we shuffle the dataloader. # # .. 
code-block:: # @@ -315,7 +317,7 @@ def process_labels(labels, x): # Sentiment Output # ---------------- # -# :: +# .. code-block:: bash # # Example 1: # @@ -408,7 +410,7 @@ def process_labels(labels, x): # Translation Output # ------------------ # -# :: +# .. code-block:: bash # # Example 1: # diff --git a/beginner_source/vt_tutorial.py b/beginner_source/vt_tutorial.py index 1b0a93b8b4b..e612fe32a73 100644 --- a/beginner_source/vt_tutorial.py +++ b/beginner_source/vt_tutorial.py @@ -1,6 +1,6 @@ """ Optimizing Vision Transformer Model for Deployment -=========================== +================================================== `Jeff Tang `_, `Geeta Chauhan `_ diff --git a/intermediate_source/FSDP_tutorial.rst b/intermediate_source/FSDP_tutorial.rst index 58fa0ca0c25..7f1a73c146c 100644 --- a/intermediate_source/FSDP_tutorial.rst +++ b/intermediate_source/FSDP_tutorial.rst @@ -1,5 +1,5 @@ Getting Started with Fully Sharded Data Parallel(FSDP) -===================================================== +====================================================== **Author**: `Hamid Shojanazeri `__, `Yanli Zhao `__, `Shen Li `__ @@ -56,7 +56,7 @@ One way to view FSDP's sharding is to decompose the DDP gradient all-reduce into FSDP Allreduce How to use FSDP --------------- +--------------- Here we use a toy model to run training on the MNIST dataset for demonstration purposes. The APIs and logic can be applied to training larger models as well. *Setup* @@ -267,7 +267,7 @@ We add the following code snippets to a python script “FSDP_mnist.py”. -2.5 Finally parse the arguments and set the main function +2.5 Finally, parse the arguments and set the main function .. 
code-block:: python diff --git a/intermediate_source/ddp_tutorial.rst b/intermediate_source/ddp_tutorial.rst index 1553cf1ac29..13297fb2a12 100644 --- a/intermediate_source/ddp_tutorial.rst +++ b/intermediate_source/ddp_tutorial.rst @@ -236,7 +236,7 @@ and elasticity support, please refer to `TorchElastic torch.nn.Module: ###################################################################### # Benchmarking our Fusion on ResNet18 -# ---------- +# ----------------------------------- # We can test our fusion pass on a larger model like ResNet18 and see how much # this pass improves inference performance. import torchvision.models as models diff --git a/intermediate_source/fx_profiling_tutorial.py b/intermediate_source/fx_profiling_tutorial.py index 18d8bc67cf4..8caaf7be39b 100644 --- a/intermediate_source/fx_profiling_tutorial.py +++ b/intermediate_source/fx_profiling_tutorial.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- """ (beta) Building a Simple CPU Performance Profiler with FX -******************************************************* +********************************************************* **Author**: `James Reed `_ In this tutorial, we are going to use FX to do the following: diff --git a/intermediate_source/mario_rl_tutorial.py b/intermediate_source/mario_rl_tutorial.py index 67d50b121dc..8fe5b327d02 100755 --- a/intermediate_source/mario_rl_tutorial.py +++ b/intermediate_source/mario_rl_tutorial.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- """ Train a Mario-playing RL Agent -================ +=============================== **Authors:** `Yuansong Feng `__, `Suraj Subramanian `__, `Howard Wang `__, `Steven Guo `__. 
diff --git a/intermediate_source/model_parallel_tutorial.py b/intermediate_source/model_parallel_tutorial.py index d7a4da73371..562064614b9 100644 --- a/intermediate_source/model_parallel_tutorial.py +++ b/intermediate_source/model_parallel_tutorial.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- """ Single-Machine Model Parallel Best Practices -================================ +============================================ **Author**: `Shen Li `_ Model parallel is widely-used in distributed training diff --git a/intermediate_source/optimizer_step_in_backward_tutorial.py b/intermediate_source/optimizer_step_in_backward_tutorial.py index fd5fcb74fc2..fd72f733c50 100644 --- a/intermediate_source/optimizer_step_in_backward_tutorial.py +++ b/intermediate_source/optimizer_step_in_backward_tutorial.py @@ -147,7 +147,7 @@ def train(model, optimizer): # API on Tensor. # # ``Tensor.register_post_accumulate_grad_hook(hook)`` API and our technique -# """""""""""""""""""""""""""""""""""""""""""""""""""""""""""" +# """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" # Our technique relies on not having to save the gradients during ``backward()``. Instead, # once a gradient has been accumulated, we will immediately apply the optimizer to # the corresponding parameter and drop that gradient entirely! This removes the need @@ -265,4 +265,4 @@ def train(model): # fusing the optimizer into the backward step through the new # ``Tensor.register_post_accumulate_grad_hook()`` API and *when* to apply this # technique (when gradients memory is significant). Along the way, we also learned -# about memory snapshots, which are generally useful in memory optimization. \ No newline at end of file +# about memory snapshots, which are generally useful in memory optimization. 
diff --git a/intermediate_source/pipeline_tutorial.py b/intermediate_source/pipeline_tutorial.py index 33561f60592..4cebeba63a9 100644 --- a/intermediate_source/pipeline_tutorial.py +++ b/intermediate_source/pipeline_tutorial.py @@ -406,7 +406,7 @@ def evaluate(eval_model, data_source): ###################################################################### # Evaluate the model with the test dataset -# ------------------------------------- +# ---------------------------------------- # diff --git a/intermediate_source/realtime_rpi.rst b/intermediate_source/realtime_rpi.rst index 9b11f899a3b..bb1a576a2c2 100644 --- a/intermediate_source/realtime_rpi.rst +++ b/intermediate_source/realtime_rpi.rst @@ -312,7 +312,7 @@ Detecting a mug: Troubleshooting: Performance -~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PyTorch by default will use all of the cores available. If you have anything running in the background on the Raspberry Pi it may cause contention with the diff --git a/intermediate_source/tensorboard_profiler_tutorial.py b/intermediate_source/tensorboard_profiler_tutorial.py index 4ac30945fd1..00bdcfbf079 100644 --- a/intermediate_source/tensorboard_profiler_tutorial.py +++ b/intermediate_source/tensorboard_profiler_tutorial.py @@ -157,7 +157,7 @@ def train(data): ###################################################################### # 4. Use TensorBoard to view results and analyze model performance -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # .. 
note:: # TensorBoard Plugin support has been deprecated, so some of these functions may not diff --git a/intermediate_source/torch_compile_tutorial.py b/intermediate_source/torch_compile_tutorial.py index 5de08bc4dda..7e5849d2c72 100644 --- a/intermediate_source/torch_compile_tutorial.py +++ b/intermediate_source/torch_compile_tutorial.py @@ -1,8 +1,8 @@ # -*- coding: utf-8 -*- """ -torch.compile Tutorial -================ +Introduction to ``torch.compile`` +================================= **Author:** William Wen """ diff --git a/intermediate_source/torchserve_with_ipex.rst b/intermediate_source/torchserve_with_ipex.rst index fbf705a7c47..1a11b4180f4 100644 --- a/intermediate_source/torchserve_with_ipex.rst +++ b/intermediate_source/torchserve_with_ipex.rst @@ -265,7 +265,7 @@ Additionally, notice that thread (TID:97097) was executing on a large number of Compare local vs. remote memory access over time. We observe that about half, 51.09%, of the memory accesses were remote accesses, indicating sub-optimal NUMA configuration. 2. torch.set_num_threads = ``number of physical cores / number of workers`` (no core pinning) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ For an apple-to-apple comparison with launcher's core pinning, we'll set the number of threads to the number of cores divided by the number of workers (launcher does this internally). Add the following code snippet in the `base_handler `_: diff --git a/intermediate_source/torchserve_with_ipex_2.rst b/intermediate_source/torchserve_with_ipex_2.rst index 6ace1e6a3e2..64f3db6b27c 100644 --- a/intermediate_source/torchserve_with_ipex_2.rst +++ b/intermediate_source/torchserve_with_ipex_2.rst @@ -366,7 +366,7 @@ Above is oneDNN verbose from channels first. We can verify that there are reorde Above is oneDNN verbose from channels last. 
We can verify that channels last memory format avoids unnecessary reorders. Performance Boost with Intel® Extension for PyTorch* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Below summarizes performance boost of TorchServe with Intel® Extension for PyTorch* for ResNet50 and BERT-base-uncased. .. figure:: /_static/img/torchserve-ipex-images-2/19.png diff --git a/prototype_source/fx_graph_mode_ptq_dynamic.py b/prototype_source/fx_graph_mode_ptq_dynamic.py index 98ece5f3d31..84d6ccb1832 100644 --- a/prototype_source/fx_graph_mode_ptq_dynamic.py +++ b/prototype_source/fx_graph_mode_ptq_dynamic.py @@ -1,6 +1,6 @@ """ (prototype) FX Graph Mode Post Training Dynamic Quantization -=========================================================== +============================================================ **Author**: `Jerry Zhang `_ diff --git a/prototype_source/fx_graph_mode_ptq_static.rst b/prototype_source/fx_graph_mode_ptq_static.rst index c0b692275a0..a7165f713f8 100644 --- a/prototype_source/fx_graph_mode_ptq_static.rst +++ b/prototype_source/fx_graph_mode_ptq_static.rst @@ -228,7 +228,7 @@ For post training quantization, we'll need to set model to eval mode. 4. Specify how to quantize the model with ``QConfigMapping`` ----------------------------------------------------------- +------------------------------------------------------------ .. 
code:: python diff --git a/prototype_source/graph_mode_dynamic_bert_tutorial.rst b/prototype_source/graph_mode_dynamic_bert_tutorial.rst index 5d76ddef79a..949002a55dc 100644 --- a/prototype_source/graph_mode_dynamic_bert_tutorial.rst +++ b/prototype_source/graph_mode_dynamic_bert_tutorial.rst @@ -1,5 +1,5 @@ (prototype) Graph Mode Dynamic Quantization on BERT -============================================== +=================================================== **Author**: `Supriya Rao `_ diff --git a/prototype_source/inductor_cpp_wrapper_tutorial.rst b/prototype_source/inductor_cpp_wrapper_tutorial.rst index 199a66b2b28..4bcc9009075 100644 --- a/prototype_source/inductor_cpp_wrapper_tutorial.rst +++ b/prototype_source/inductor_cpp_wrapper_tutorial.rst @@ -21,7 +21,7 @@ thereby reducing the Python overhead within the graph. Enabling the API ------------- +---------------- This feature is still in prototype stage. To activate this feature, add the following to your code: .. code:: python diff --git a/prototype_source/ios_gpu_workflow.rst b/prototype_source/ios_gpu_workflow.rst index 0e87ad815f9..cb7a0034b23 100644 --- a/prototype_source/ios_gpu_workflow.rst +++ b/prototype_source/ios_gpu_workflow.rst @@ -71,7 +71,7 @@ Those are all the ops we need to run the mobilenetv2 model on iOS GPU. Cool! Now Use PyTorch iOS library with Metal ---------------------- +---------------------------------- The PyTorch iOS library with Metal support ``LibTorch-Lite-Nightly`` is available in Cocoapods. You can read the `Using the Nightly PyTorch iOS Libraries in CocoaPods `_ section from the iOS tutorial for more detail about its usage. We also have the `HelloWorld-Metal example `_ that shows how to conect all pieces together. @@ -88,7 +88,7 @@ This is because by default Metal uses fp16 rather than fp32 to compute. 
The prec Use LibTorch-Lite Built from Source ---------------------- +----------------------------------- You can also build a custom LibTorch-Lite from Source and use it to run GPU models on iOS Metal. In this section, we'll be using the `HelloWorld example `_ to demonstrate this process. diff --git a/prototype_source/nestedtensor.py b/prototype_source/nestedtensor.py index 0d2898cc4ac..4462055b0c5 100644 --- a/prototype_source/nestedtensor.py +++ b/prototype_source/nestedtensor.py @@ -28,10 +28,8 @@ ###################################################################### # NestedTensor Initialization -# ---------------- +# ---------------------------- # - -###################################################################### # From the Python frontend, a nestedtensor can be created from a list of tensors. # We denote nt[i] as the ith tensor component of a nestedtensor. nt = torch.nested.nested_tensor([torch.arange(12).reshape( @@ -66,10 +64,8 @@ ###################################################################### # Nested Tensor Operations -# ---------------- +# ------------------------ # - -###################################################################### # As each operation must be explicitly implemented for nestedtensors, # operation coverage for nestedtensors is currently narrower than that of regular tensors. # For now, only basic operations such as index, dropout, softmax, transpose, reshape, linear, bmm are covered. 
@@ -123,7 +119,7 @@ ###################################################################### # Why Nested Tensor -# ---------------- +# ----------------- # ###################################################################### @@ -492,4 +488,4 @@ def zipf_sentence_lengths(alpha: float, batch_size: int) -> np.ndarray: print("Nested and padded calculations differ by", (torch.nested.to_padded_tensor(out_lib, 0.0) - padded_out).abs().max().item()) print("Nested library multi-head attention takes", nested_time, "seconds") print("Padded tensor multi-head attention takes", padded_time, "seconds") -print(f"Nested Speedup: {padded_time / nested_time:.3f}") \ No newline at end of file +print(f"Nested Speedup: {padded_time / nested_time:.3f}") diff --git a/prototype_source/nnapi_mobilenetv2.rst b/prototype_source/nnapi_mobilenetv2.rst index 3036fdefa46..ed9548a387d 100644 --- a/prototype_source/nnapi_mobilenetv2.rst +++ b/prototype_source/nnapi_mobilenetv2.rst @@ -166,7 +166,7 @@ by passing ``--use_caching_allocator=true``. Running model on host --------------------- +--------------------- We can now run models on your linux machine using the reference implementation of NNAPI. You need to build the NNAPI library from Android source code: diff --git a/prototype_source/numeric_suite_tutorial.py b/prototype_source/numeric_suite_tutorial.py index b5ec2a7e133..a630d27e6a6 100644 --- a/prototype_source/numeric_suite_tutorial.py +++ b/prototype_source/numeric_suite_tutorial.py @@ -271,7 +271,7 @@ def forward(self, x, y): ############################################################################### # Numeric Suite for Dynamic Quantization -# ------------------------------------- +# -------------------------------------- # # Numeric Suite APIs are designed in such as way that they work for both dynamic quantized model and static quantized model. We will use a model with both LSTM and Linear modules to demonstrate the usage of Numeric Suite on dynamic quantized model. 
This model is the same one used in the tutorial of dynamic quantization on LSTM word language model [1]. # diff --git a/prototype_source/pt2e_quant_qat.rst b/prototype_source/pt2e_quant_qat.rst index d40b640128c..6d995d368e0 100644 --- a/prototype_source/pt2e_quant_qat.rst +++ b/prototype_source/pt2e_quant_qat.rst @@ -64,7 +64,7 @@ respectively. Define Helper Functions and Prepare the Dataset -------------------------------------------- +----------------------------------------------- To run the code in this tutorial using the entire ImageNet dataset, first download ImageNet by following the instructions in diff --git a/prototype_source/semi_structured_sparse.rst b/prototype_source/semi_structured_sparse.rst index 4044888b583..c7b82fd43cd 100644 --- a/prototype_source/semi_structured_sparse.rst +++ b/prototype_source/semi_structured_sparse.rst @@ -1,5 +1,5 @@ (prototype) Accelerating BERT with semi-structured (2:4) sparsity -================================================================ +================================================================= **Author**: `Jesse Cai `_ Like other forms of sparsity, **semi-structured sparsity** is a model optimization technique that seeks to reduce the memory overhead and latency of a neural network at the expense of some model accuracy. @@ -477,7 +477,7 @@ Once we've reached a satisfied state, we can call ``squash_mask`` to fuse the ma # [ 0.0000, 0.0225, -0.0395, -0.0000, ..., -0.0000, 0.0684, -0.0344, -0.0000]], device='cuda:0', requires_grad=True) Accelerating 2:4 sparse models for inference -------------------------------------------- +--------------------------------------------- Now that we have a model in this format, we can accelerate it for inference just like in the QuickStart Guide. .. 
code:: python diff --git a/prototype_source/vulkan_workflow.rst b/prototype_source/vulkan_workflow.rst index 7cd3a5c9864..2f78ac97d74 100644 --- a/prototype_source/vulkan_workflow.rst +++ b/prototype_source/vulkan_workflow.rst @@ -182,7 +182,7 @@ Python API ``.vulkan()`` at the moment of writing of this tutorial is not exposed to Python API, but it is planned to be there. Android Java API ---------------- +---------------- For Android API to run model on Vulkan backend we have to specify this during model loading: diff --git a/recipes_source/bundled_inputs.rst b/recipes_source/bundled_inputs.rst index bae2a67832f..1bdf5c7b7d2 100644 --- a/recipes_source/bundled_inputs.rst +++ b/recipes_source/bundled_inputs.rst @@ -10,8 +10,11 @@ This tutorial introduces the steps to use PyTorch's utility to bundle example or The interface of the model remains unchanged (other than adding a few methods), so it can still be safely deployed to production. The advantage of this standardized interface is that tools that run models can use it instead of having some sort of external file (or worse, document) that tells you how to run the model properly. -Common case, bundling an input to a model that only uses 'forward' for inference +Common case ------------------- + +One of the common cases—bundling an input to a model that only uses 'forward' for inference. + 1. **Prepare model**: Convert your model to TorchScript through either tracing or scripting .. code:: python @@ -52,8 +55,11 @@ Common case, bundling an input to a model that only uses 'forward' for inference print(bundled_model(*sample_inputs[0])) -Uncommon case, bundling and retrieving inputs for functions beyond 'forward' -------------------- +Uncommon case +-------------- + +An uncommon case would be bundling and retrieving inputs for functions beyond 'forward'. + 1. **Prepare model**: Convert your model to TorchScript through either tracing or scripting .. 
code:: python diff --git a/recipes_source/mobile_interpreter.rst b/recipes_source/mobile_interpreter.rst index 135eed1d51e..dda1dd92435 100644 --- a/recipes_source/mobile_interpreter.rst +++ b/recipes_source/mobile_interpreter.rst @@ -148,7 +148,7 @@ Get ImageSegmentation demo app in iOS: https://github.com/pytorch/ios-demo-app/t 4. Build and test the app in Xcode. How to use mobile interpreter + custom build ------------------------------------------- +--------------------------------------------- A custom PyTorch interpreter library can be created to reduce binary size, by only containing the operators needed by the model. In order to do that follow these steps: 1. To dump the operators in your model, say `deeplabv3_scripted`, run the following lines of Python code: diff --git a/recipes_source/recipes/amp_recipe.py b/recipes_source/recipes/amp_recipe.py index 141bc41a034..b8a4d942333 100644 --- a/recipes_source/recipes/amp_recipe.py +++ b/recipes_source/recipes/amp_recipe.py @@ -320,7 +320,7 @@ def make_model(in_size, out_size, num_layers): # shows forcing a subregion to run in ``float32`` (by locally disabling ``autocast`` and casting the subregion's inputs). # # Type mismatch error (may manifest as ``CUDNN_STATUS_BAD_PARAM``) -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # ``Autocast`` tries to cover all ops that benefit from or require casting. # `Ops that receive explicit coverage `_ # are chosen based on numerical properties, but also on experience. diff --git a/recipes_source/recipes/dynamic_quantization.py b/recipes_source/recipes/dynamic_quantization.py index cdb3d22da72..25fc9cd3982 100644 --- a/recipes_source/recipes/dynamic_quantization.py +++ b/recipes_source/recipes/dynamic_quantization.py @@ -24,7 +24,7 @@ reduction without losing a lot of accuracy. What is dynamic quantization? 
-------------- +----------------------------- Quantizing a network means converting it to use a reduced precision integer representation for the weights and/or activations. This saves on @@ -281,7 +281,7 @@ def print_size_of_model(model, label=""): ###################################################################### -# Learn More +# Learn More # ------------ # We've explained what dynamic quantization is, what benefits it brings, # and you have used the ``torch.quantization.quantize_dynamic()`` function @@ -292,20 +292,10 @@ def print_size_of_model(model, label=""): # # # Additional Resources -# ========= -# Documentation -# ~~~~~~~~~~~~~~ -# -# `Quantization API Documentaion `_ -# -# Tutorials -# ~~~~~~~~~~~~~~ -# -# `(beta) Dynamic Quantization on BERT `_ -# -# `(beta) Dynamic Quantization on an LSTM Word Language Model `_ +# -------------------- # -# Blogs -# ~~~~~~~~~~~~~~ -# `Introduction to Quantization on PyTorch `_ +# * `Quantization API Documentation `_ +# * `(beta) Dynamic Quantization on BERT `_ +# * `(beta) Dynamic Quantization on an LSTM Word Language Model `_ +# * `Introduction to Quantization on PyTorch `_ # diff --git a/recipes_source/recipes/tensorboard_with_pytorch.py b/recipes_source/recipes/tensorboard_with_pytorch.py index 3b9455b7f44..00ee7292a1d 100644 --- a/recipes_source/recipes/tensorboard_with_pytorch.py +++ b/recipes_source/recipes/tensorboard_with_pytorch.py @@ -28,7 +28,7 @@ ###################################################################### # Using TensorBoard in PyTorch -# ----- +# ----------------------------- # # Let’s now try using TensorBoard with PyTorch! Before logging anything, # we need to create a ``SummaryWriter`` instance. @@ -45,7 +45,7 @@ ###################################################################### # Log scalars -# ----- +# ----------- # # In machine learning, it’s important to understand key metrics such as # loss and how they change during training. 
Scalar helps to save @@ -91,7 +91,7 @@ def train_model(iter): ###################################################################### # Run TensorBoard -# ----- +# ---------------- # # Install TensorBoard through the command line to visualize data you logged # diff --git a/recipes_source/recipes/tuning_guide.py b/recipes_source/recipes/tuning_guide.py index d23f3fe666e..43f67deffa0 100644 --- a/recipes_source/recipes/tuning_guide.py +++ b/recipes_source/recipes/tuning_guide.py @@ -465,7 +465,7 @@ def fused_gelu(x): ############################################################################### # Match the order of layers in constructors and during the execution if using ``DistributedDataParallel``(find_unused_parameters=True) -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # `torch.nn.parallel.DistributedDataParallel `_ # with ``find_unused_parameters=True`` uses the order of layers and parameters # from model constructors to build buckets for ``DistributedDataParallel`` From d3d9d256be697bba5cefa09326ede29521e7d389 Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Tue, 16 Jan 2024 08:16:47 -0800 Subject: [PATCH 2/2] Update --- beginner_source/pytorch_with_examples.rst | 2 +- intermediate_source/inductor_debug_cpu.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/beginner_source/pytorch_with_examples.rst b/beginner_source/pytorch_with_examples.rst index 202832f9c3f..6705b5b21a4 100644 --- a/beginner_source/pytorch_with_examples.rst +++ b/beginner_source/pytorch_with_examples.rst @@ -262,7 +262,7 @@ Autograd
``nn`` module ------------ +-------------- .. toctree:: :maxdepth: 2 diff --git a/intermediate_source/inductor_debug_cpu.py b/intermediate_source/inductor_debug_cpu.py index b534c432d88..061ef063856 100644 --- a/intermediate_source/inductor_debug_cpu.py +++ b/intermediate_source/inductor_debug_cpu.py @@ -64,7 +64,7 @@ def neg1(x): # # # Get more logging information -# ^^^^^^^^^^^^^^^^^ +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ # # No debugging information would be provided if you run this simple example by default. In order to get more useful debugging and logging information, we usually add a ``TORCH_COMPILE_DEBUG`` environment variable like below: #