diff --git a/_static/css/custom.css b/_static/css/custom.css
old mode 100644
new mode 100755
index 75dd0261983..7b7055fff78
--- a/_static/css/custom.css
+++ b/_static/css/custom.css
@@ -71,4 +71,3 @@
 .sd-card:hover:after {
   transform: scaleX(1);
 }
-
diff --git a/conf.py b/conf.py
index f9c8a512d8b..c01d7f332ac 100644
--- a/conf.py
+++ b/conf.py
@@ -275,16 +275,16 @@ def setup(app):
     # and can be moved outside of this function (and the setup(app) function
     # can be deleted).
     #html_css_files = [
-    #    'https://cdn.jsdelivr.net/npm/katex@0.10.0-beta/dist/katex.min.css'
+    #    'https://cdn.jsdelivr.net/npm/katex@0.10.0-beta/dist/katex.min.css',
+    #    'css/custom.css'
     #]
     # In Sphinx 1.8 it was renamed to `add_css_file`, 1.7 and prior it is
     # `add_stylesheet` (deprecated in 1.8).
     #add_css = getattr(app, 'add_css_file', app.add_stylesheet)
 
     #for css_file in html_css_files:
     #    add_css(css_file)
-    # Custom CSS
-    # app.add_stylesheet('css/pytorch_theme.css')
+    #app.add_stylesheet('css/pytorch_theme.css')
     # app.add_stylesheet('https://fonts.googleapis.com/css?family=Lato')
     # Custom directives
     app.add_directive('includenodoc', IncludeDirective)
diff --git a/distributed/home.rst b/distributed/home.rst
new file mode 100644
index 00000000000..a501a95e0bd
--- /dev/null
+++ b/distributed/home.rst
@@ -0,0 +1,151 @@
+Distributed and Parallel Training Tutorials
+===========================================
+
+Distributed training is a model training paradigm that involves
+spreading the training workload across multiple worker nodes, which can
+significantly improve training speed and model accuracy. While
+distributed training can be used for any type of ML model training, it
+is most beneficial for large models and compute-demanding tasks such as
+deep learning.
+
+There are a few ways you can perform distributed training in
+PyTorch, each with its own advantages in certain use cases:
+
+* `DistributedDataParallel (DDP) <#learn-ddp>`__
+* `Fully Sharded Data Parallel (FSDP) <#learn-fsdp>`__
+* `Remote Procedure Call (RPC) distributed training <#learn-rpc>`__
+* `Custom Extensions <#custom-extensions>`__
+
+Read more about these options in `Distributed Overview <../beginner/dist_overview.html>`__.
+
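+To give a flavor of what these APIs look like, here is a minimal,
+illustrative DDP sketch; the toy model, `gloo` backend, and rendezvous
+settings below are placeholders, not recommendations. It wraps a model in
+`DistributedDataParallel` and runs a single backward pass on two CPU
+processes:
+
+.. code-block:: python
+
+   import os
+
+   import torch
+   import torch.distributed as dist
+   import torch.multiprocessing as mp
+   import torch.nn as nn
+   from torch.nn.parallel import DistributedDataParallel as DDP
+
+   def run(rank, world_size):
+       # Illustrative single-machine rendezvous settings.
+       os.environ.setdefault("MASTER_ADDR", "localhost")
+       os.environ.setdefault("MASTER_PORT", "29500")
+       dist.init_process_group("gloo", rank=rank, world_size=world_size)
+
+       model = nn.Linear(10, 1)   # stands in for your real network
+       ddp_model = DDP(model)     # DDP synchronizes gradients across ranks
+
+       loss = ddp_model(torch.randn(20, 10)).sum()
+       loss.backward()            # gradients are all-reduced across ranks here
+
+       dist.destroy_process_group()
+
+   if __name__ == "__main__":
+       world_size = 2
+       mp.spawn(run, args=(world_size,), nprocs=world_size)
+
+The tutorials below cover these APIs in much more detail, including
+multi-GPU and multi-node training.
+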
+.. _learn-ddp:
+
+Learn DDP
+---------
+
+.. grid:: 3
+
+   .. grid-item-card:: :octicon:`file-code;1em`
+      DDP Intro Video Tutorials
+      :link: https://pytorch.org/tutorials/beginner/ddp_series_intro.html?utm_source=distr_landing&utm_medium=ddp_series_intro
+      :link-type: url
+
+      A step-by-step video series on how to get started with
+      `DistributedDataParallel` and advance to more complex topics.
+      +++
+      :octicon:`code;1em` Code :octicon:`square-fill;1em` :octicon:`video;1em` Video
+
+   .. grid-item-card:: :octicon:`file-code;1em`
+      Getting Started with Distributed Data Parallel
+      :link: https://pytorch.org/tutorials/intermediate/ddp_tutorial.html?utm_source=distr_landing&utm_medium=intermediate_ddp_tutorial
+      :link-type: url
+
+      This tutorial provides a short and gentle introduction to PyTorch
+      `DistributedDataParallel`.
+      +++
+      :octicon:`code;1em` Code
+
+   .. grid-item-card:: :octicon:`file-code;1em`
+      Distributed Training with Uneven Inputs Using
+      the Join Context Manager
+      :link: https://pytorch.org/tutorials/advanced/generic_join.html?utm_source=distr_landing&utm_medium=generic_join
+      :link-type: url
+
+      This tutorial walks through distributed training with uneven
+      inputs using the Join context manager.
+      +++
+      :octicon:`code;1em` Code
+
+.. _learn-fsdp:
+
+Learn FSDP
+----------
+
+.. grid:: 3
+
+   .. grid-item-card:: :octicon:`file-code;1em`
+      Getting Started with FSDP
+      :link: https://pytorch.org/tutorials/intermediate/FSDP_tutorial.html?utm_source=distr_landing&utm_medium=FSDP_getting_started
+      :link-type: url
+
+      This tutorial demonstrates how you can perform distributed training
+      with FSDP on the MNIST dataset.
+      +++
+      :octicon:`code;1em` Code
+
+   .. grid-item-card:: :octicon:`file-code;1em`
+      FSDP Advanced
+      :link: https://pytorch.org/tutorials/intermediate/FSDP_adavnced_tutorial.html?utm_source=distr_landing&utm_medium=FSDP_advanced
+      :link-type: url
+
+      In this tutorial, you will learn how to fine-tune a HuggingFace (HF) T5
+      model with FSDP for text summarization.
+      +++
+      :octicon:`code;1em` Code
+
+.. _learn-rpc:
+
+Learn RPC
+---------
+
+.. grid:: 3
+
+   .. grid-item-card:: :octicon:`file-code;1em`
+      Getting Started with Distributed RPC Framework
+      :link: https://pytorch.org/tutorials/intermediate/rpc_tutorial.html?utm_source=distr_landing&utm_medium=rpc_getting_started
+      :link-type: url
+
+      This tutorial demonstrates how to get started with RPC-based distributed
+      training.
+      +++
+      :octicon:`code;1em` Code
+
+   .. grid-item-card:: :octicon:`file-code;1em`
+      Implementing a Parameter Server Using Distributed RPC Framework
+      :link: https://pytorch.org/tutorials/intermediate/rpc_param_server_tutorial.html?utm_source=distr_landing&utm_medium=rpc_param_server_tutorial
+      :link-type: url
+
+      This tutorial walks you through a simple example of implementing a
+      parameter server using PyTorch’s Distributed RPC framework.
+      +++
+      :octicon:`code;1em` Code
+
+   .. grid-item-card:: :octicon:`file-code;1em`
+      Implementing Batch RPC Processing Using Asynchronous Executions
+      :link: https://pytorch.org/tutorials/intermediate/rpc_async_execution.html?utm_source=distr_landing&utm_medium=rpc_async_execution
+      :link-type: url
+
+      In this tutorial, you will build batch-processing RPC applications
+      with the @rpc.functions.async_execution decorator.
+      +++
+      :octicon:`code;1em` Code
+
+.. grid:: 3
+
+   .. grid-item-card:: :octicon:`file-code;1em`
+      Combining Distributed DataParallel with Distributed RPC Framework
+      :link: https://pytorch.org/tutorials/advanced/rpc_ddp_tutorial.html?utm_source=distr_landing&utm_medium=rpc_plus_ddp
+      :link-type: url
+
+      In this tutorial, you will learn how to combine distributed data
+      parallelism with distributed model parallelism.
+      +++
+      :octicon:`code;1em` Code
+
+.. _custom-extensions:
+
+Custom Extensions
+-----------------
+
+.. grid:: 3
+
+   .. grid-item-card:: :octicon:`file-code;1em`
+      Customize Process Group Backends Using Cpp Extensions
+      :link: https://pytorch.org/tutorials/intermediate/process_group_cpp_extension_tutorial.html?utm_source=distr_landing&utm_medium=custom_extensions_cpp
+      :link-type: url
+
+      In this tutorial, you will learn how to implement a custom `ProcessGroup`
+      backend and plug it into the PyTorch distributed package using
+      C++ extensions.
+      +++
+      :octicon:`code;1em` Code
diff --git a/index.rst b/index.rst
index 259ff23a436..c80d0bb42a3 100644
--- a/index.rst
+++ b/index.rst
@@ -891,6 +891,7 @@ Additional Resources
    :hidden:
    :caption: Parallel and Distributed Training
 
+   distributed/home
    beginner/dist_overview
    beginner/ddp_series_intro
    intermediate/model_parallel_tutorial