Commit 525614c

Merge branch 'main' into add-torchtext-tutorial
2 parents: f48bc25 + be6e863

27 files changed: +396 -338 lines

.pyspelling.yml

Lines changed: 5 additions & 1 deletion
@@ -5,6 +5,7 @@ matrix:
 - beginner_source/*.py
 - intermediate_source/*.py
 - advanced_source/*.py
+- recipes_source/*/*.py
 dictionary:
   wordlists:
   - en-wordlist.txt
@@ -21,10 +22,13 @@ matrix:
 - open: ':(?:(class|py:mod|mod|func)):`'
   content: '[^`]*'
   close: '`'
+# Exclude reStructuredText hyperlinks
+- open: '\s'
+  content: '\w*'
+  close: '_'
 # Exclude raw directive
 - open: '\.\. (raw)::.*$\n*'
   close: '\n'
-# Exclude
 # Exclude Python coding directives
 - open: '-\*- coding:'
   close: '\n'
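
For illustration only, the new open/content/close triple composes to roughly the pattern below (a sketch of the matching behavior; pyspelling's context-filter machinery is more involved):

    import re

    # Rough composition of the new triple: whitespace, a run of word
    # characters, then the trailing underscore of an RST hyperlink reference.
    hyperlink = re.compile(r'\s\w*_')

    print(hyperlink.search('read the docs_ page'))  # matches ' docs_' -> excluded from spellcheck
    print(hyperlink.search('read the docs page'))   # None -> spell-checked normally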

beginner_source/basics/optimization_tutorial.py

Lines changed: 1 addition & 1 deletion
@@ -155,9 +155,9 @@ def train_loop(dataloader, model, loss_fn, optimizer):
         loss = loss_fn(pred, y)

         # Backpropagation
-        optimizer.zero_grad()
         loss.backward()
         optimizer.step()
+        optimizer.zero_grad()

         if batch % 100 == 0:
             loss, current = loss.item(), (batch + 1) * len(X)
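
Putting the reordered lines in context, the resulting loop body reads (a condensed sketch of the tutorial's ``train_loop``):

    def train_loop(dataloader, model, loss_fn, optimizer):
        for batch, (X, y) in enumerate(dataloader):
            pred = model(X)
            loss = loss_fn(pred, y)

            # Backpropagation: compute gradients, take an optimizer step,
            # then reset the gradients for the next iteration.
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            if batch % 100 == 0:
                print(f"loss: {loss.item():>7f}  [{(batch + 1) * len(X):>5d}]")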

beginner_source/basics/quickstart_tutorial.py

Lines changed: 1 addition & 1 deletion
@@ -152,9 +152,9 @@ def train(dataloader, model, loss_fn, optimizer):
         loss = loss_fn(pred, y)

         # Backpropagation
-        optimizer.zero_grad()
         loss.backward()
         optimizer.step()
+        optimizer.zero_grad()

         if batch % 100 == 0:
             loss, current = loss.item(), (batch + 1) * len(X)
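
Note: this is the same reordering as in optimization_tutorial.py above. Clearing gradients at the end of an iteration is equivalent to clearing them at the start, as long as it happens exactly once before the next ``backward()`` call.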

beginner_source/basics/saveloadrun_tutorial.py

Lines changed: 1 addition & 1 deletion
@@ -33,7 +33,7 @@
 # To load model weights, you need to create an instance of the same model first, and then load the parameters
 # using ``load_state_dict()`` method.

-model = models.vgg16() # we do not specify weights, i.e. create untrained model
+model = models.vgg16() # we do not specify ``weights``, i.e. create untrained model
 model.load_state_dict(torch.load('model_weights.pth'))
 model.eval()
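
End to end, the save/load round trip this snippet belongs to looks like the following sketch (the ``weights`` enum value is an assumption for a recent torchvision):

    import torch
    import torchvision.models as models

    # Save only the learned parameters.
    model = models.vgg16(weights='IMAGENET1K_V1')  # assumed weights identifier
    torch.save(model.state_dict(), 'model_weights.pth')

    # Recreate the architecture, then load the parameters into it.
    model = models.vgg16()  # no ``weights``: an untrained model
    model.load_state_dict(torch.load('model_weights.pth'))
    model.eval()            # put dropout/batchnorm layers in inference mode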

beginner_source/basics/tensorqs_tutorial.py

Lines changed: 1 addition & 1 deletion
@@ -133,7 +133,7 @@
 ######################################################################
 # **Joining tensors** You can use ``torch.cat`` to concatenate a sequence of tensors along a given dimension.
 # See also `torch.stack <https://pytorch.org/docs/stable/generated/torch.stack.html>`__,
-# another tensor joining option that is subtly different from ``torch.cat``.
+# another tensor joining operator that is subtly different from ``torch.cat``.
 t1 = torch.cat([tensor, tensor, tensor], dim=1)
 print(t1)
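
The subtle difference, in two lines: ``torch.cat`` joins along an existing dimension, while ``torch.stack`` inserts a new one.

    import torch

    t = torch.ones(2, 3)
    print(torch.cat([t, t, t], dim=1).shape)    # torch.Size([2, 9])    -- existing dim grows
    print(torch.stack([t, t, t], dim=0).shape)  # torch.Size([3, 2, 3]) -- new dim inserted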

beginner_source/dcgan_faces_tutorial.py

Lines changed: 1 addition & 1 deletion
@@ -514,7 +514,7 @@ def forward(self, input):
 # practices shown in `ganhacks <https://github.com/soumith/ganhacks>`__.
 # Namely, we will “construct different mini-batches for real and fake”
 # images, and also adjust G’s objective function to maximize
-# :math:`logD(G(z))`. Training is split up into two main parts. Part 1
+# :math:`log(D(G(z)))`. Training is split up into two main parts. Part 1
 # updates the Discriminator and Part 2 updates the Generator.
 #
 # **Part 1 - Train the Discriminator**
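
Concretely, maximizing :math:`log(D(G(z)))` is implemented by labeling the fake batch as real in the generator's BCE loss; a sketch, assuming ``netG``, ``netD``, ``criterion``, ``noise``, and ``optimizerG`` as defined earlier in the tutorial:

    # Generator update: minimize -log(D(G(z))), i.e. maximize log(D(G(z))).
    fake = netG(noise)
    label = torch.full((noise.size(0),), 1.0, device=noise.device)  # "real" label for the fake batch
    output = netD(fake).view(-1)
    errG = criterion(output, label)  # BCE with real labels => -log(D(G(z)))
    errG.backward()
    optimizerG.step()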

beginner_source/ddp_series_theory.rst

Lines changed: 3 additions & 0 deletions
@@ -37,6 +37,8 @@ ensures each device gets a non-overlapping input batch. The model is replicated
 each replica calculates gradients and simultaneously synchronizes with the others using the `ring all-reduce
 algorithm <https://tech.preferred.jp/en/blog/technologies-behind-distributed-deep-learning-allreduce/>`__.

+This `illustrative tutorial <https://pytorch.org/tutorials/intermediate/dist_tuto.html#>`__ provides a more in-depth Python view of the mechanics of DDP.
+
 Why you should prefer DDP over DataParallel (DP)
 -------------------------------------------------

@@ -66,3 +68,4 @@ Further Reading
   API <https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html>`__
 - `DDP Internal
   Design <https://pytorch.org/docs/master/notes/ddp.html#internal-design>`__
+- `DDP Mechanics Tutorial <https://pytorch.org/tutorials/intermediate/dist_tuto.html#>`__
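
For a concrete picture of the API these links describe, a minimal DDP setup sketch (assuming one process per GPU, launched via ``torchrun`` so rank and world size come from the environment):

    import torch
    import torch.distributed as dist
    from torch.nn.parallel import DistributedDataParallel as DDP

    dist.init_process_group(backend="nccl")           # reads rank/world size from the launcher env
    local_rank = dist.get_rank() % torch.cuda.device_count()
    model = torch.nn.Linear(10, 10).to(local_rank)
    ddp_model = DDP(model, device_ids=[local_rank])   # gradients sync via ring all-reduce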

beginner_source/dist_overview.rst

Lines changed: 1 addition & 1 deletion
@@ -131,7 +131,7 @@ DDP materials are listed below:
 4. The `Shard Optimizer States With ZeroRedundancyOptimizer <../recipes/zero_redundancy_optimizer.html>`__
    recipe demonstrates how `ZeroRedundancyOptimizer <https://pytorch.org/docs/stable/distributed.optim.html>`__
    helps to reduce optimizer memory footprint.
-5. The `Distributed Training with Uneven Inputs Using the Join Context Manager <../advanced/generic_oin.html>`__
+5. The `Distributed Training with Uneven Inputs Using the Join Context Manager <../advanced/generic_join.html>`__
    tutorial walks through using the generic join context for distributed training with uneven inputs.

 torch.distributed.elastic
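
For orientation, the Join context manager the corrected link points at wraps the training loop so that ranks which exhaust their uneven inputs early keep answering collective communication; a sketch, assuming an initialized process group, a DDP-wrapped ``model``, and the usual ``loader``/``loss_fn``/``optimizer``:

    from torch.distributed.algorithms.join import Join

    with Join([model]):  # DDP instances are Joinable
        for X, y in loader:
            loss = loss_fn(model(X), y)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()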

beginner_source/transformer_tutorial.py

Lines changed: 3 additions & 3 deletions
@@ -135,7 +135,7 @@ def forward(self, x: Tensor) -> Tensor:

 ######################################################################
 # This tutorial uses ``torchtext`` to generate Wikitext-2 dataset.
-# To access torchtext datasets, please install torchdata following instructions at https://github.com/pytorch/data.
+# To access torchtext datasets, please install torchdata following instructions at https://github.com/pytorch/data.
 # %%
 # .. code-block:: bash
 #
@@ -175,7 +175,7 @@ def forward(self, x: Tensor) -> Tensor:
 train_iter = WikiText2(split='train')
 tokenizer = get_tokenizer('basic_english')
 vocab = build_vocab_from_iterator(map(tokenizer, train_iter), specials=['<unk>'])
-vocab.set_default_index(vocab['<unk>'])
+vocab.set_default_index(vocab['<unk>'])

(In the two hunks above, the removed and added lines appear to differ only in trailing whitespace.)

 def data_process(raw_text_iter: dataset.IterableDataset) -> Tensor:
     """Converts raw text into a flat Tensor."""
@@ -196,7 +196,7 @@ def batchify(data: Tensor, bsz: int) -> Tensor:
     that wouldn't cleanly fit.

     Arguments:
-        data: Tensor, shape [N]
+        data: Tensor, shape ``[N]``
         bsz: int, batch size

     Returns:
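
The body this docstring describes trims ``data`` to a multiple of ``bsz`` and reshapes it into ``bsz`` parallel columns; roughly (a sketch consistent with the docstring, device transfer omitted):

    import torch
    from torch import Tensor

    def batchify(data: Tensor, bsz: int) -> Tensor:
        seq_len = data.size(0) // bsz
        data = data[:seq_len * bsz]                      # drop the remainder that wouldn't cleanly fit
        return data.view(bsz, seq_len).t().contiguous()  # shape ``[seq_len, bsz]``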

en-wordlist.txt

Lines changed: 36 additions & 0 deletions
@@ -1,5 +1,6 @@
 APIs
 ATen
+AVX
 Args
 Autograd
 BCE
@@ -15,8 +16,11 @@ CLS
 CNNDM
 CNNs
 CPUs
+CPython
 CUDA
 Caffe
+Captum
+Captum's
 CartPole
 Cayley
 Chatbots
@@ -28,6 +32,7 @@ DCGAN
 DCGANs
 DDP
 DDQN
+DLRM
 DNN
 DQN
 DataPipe
@@ -66,8 +71,10 @@ HVP
 Hugging Face
 IMDB
 IOT
+ISA
 ImageNet
 Initializations
+Interpretability
 Iteratively
 JSON
 JVP
@@ -95,6 +102,7 @@ NCHW
 NES
 NLP
 NTK
+NUMA
 NaN
 NanoGPT
 NeurIPS
@@ -103,6 +111,7 @@ Numericalization
 Numpy's
 ONNX
 OpenAI
+OpenMP
 PIL
 PPO
 Plotly
@@ -117,11 +126,13 @@ RPC
 RTX
 Radford
 ReLU
+ReLUs
 ResNet
 Runtime's
 SDPA
 SGD
 SPD
+SSD
 SST2
 STN
 SciPy
@@ -131,6 +142,7 @@ SoTA
 Spacy
 TPU
 TensorBoard
+TensorBoards
 TextVQA
 Tokenization
 TorchDynamo
@@ -160,19 +172,23 @@ approximators
 autodiff
 autoencoder
 autograd
+autotuner
 backend
 backends
 backprop
 backpropagate
 backpropagated
 backpropagates
 backpropagation
+backtrace
 batchnorm
 batchnorm's
 benchmarking
+bitwise
 boolean
 broadcasted
 bytecode
+cancelation
 cardinality
 chatbot
 chatbot's
@@ -204,10 +220,13 @@ deserialized
 deterministically
 dimensionality
 dir
+discontiguous
+distractor
 downsample
 downsamples
 dropdown
 duration
+elementwise
 embeddings
 encodings
 ensembling
@@ -245,6 +264,7 @@ iteratively
 jacobian
 jacobians
 jit
+jitter
 jpg
 judgements
 kwargs
@@ -253,6 +273,7 @@ learnable
 learnings
 loadFilename
 manualSeed
+matmul
 matplotlib
 minibatch
 minibatches
@@ -275,6 +296,7 @@ numericalize
 numpy
 nvFuser
 nvFuser's
+oneDNN
 optimizable
 optimizer's
 optimizers
@@ -286,8 +308,13 @@ parametrizations
 parametrized
 parametrizing
 perceptibility
+pickleable
 pipelining
 pointwise
+postprocessing
+preallocate
+preallocates
+preallocation
 precompute
 precomputing
 prepend
@@ -339,7 +366,11 @@ subdirectories
 submodule
 submodules
 subnetworks
+subprocess
+subprocesses
 subreddit
+subregion
+subregion's
 summarization
 tanh
 th
@@ -365,13 +396,17 @@ tradeoff
 tradeoffs
 uncomment
 uncommented
+underflowing
 unfused
 unimodal
 unnormalized
 unoptimized
 unparametrized
 unpickling
 unpruned
+unscale
+unscaled
+unscales
 upscaled
 utils
 vectorization
@@ -381,4 +416,5 @@ vhp
 voc
 walkthrough
 warmstart
+warmstarted
 warmstarting

recipes_source/recipes/Captum_Recipe.py

Lines changed: 1 addition & 1 deletion
@@ -59,7 +59,7 @@

 normalize = transforms.Compose([
     transforms.ToTensor(),   # converts the image to a tensor with values between 0 and 1
-    transforms.Normalize(    # normalize to follow 0-centered imagenet pixel rgb distribution
+    transforms.Normalize(    # normalize to follow 0-centered imagenet pixel RGB distribution
         mean=[0.485, 0.456, 0.406],
         std=[0.229, 0.224, 0.225]
     )
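
These mean/std values are the standard ImageNet per-channel statistics expected by torchvision's pretrained models; applying the pipeline to a PIL image ``img`` (a hypothetical variable) is then just:

    input_img = normalize(img).unsqueeze(0)  # add a batch dimension: shape [1, 3, H, W]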
