diff --git a/.gitignore b/.gitignore index 2d9a9e5a634..ef7a026d9e8 100644 --- a/.gitignore +++ b/.gitignore @@ -124,3 +124,6 @@ cleanup.sh # VSCode *.vscode + +# pyspelling +dictionary.dic diff --git a/.pyspelling.yml b/.pyspelling.yml index 04dcda37b75..015ac975b7f 100644 --- a/.pyspelling.yml +++ b/.pyspelling.yml @@ -1,11 +1,11 @@ spellchecker: aspell matrix: -- name: beginner +- name: python sources: - - beginner_source/data_loading_tutorial.py + - beginner_source/*.py dictionary: wordlists: - - tutorials-wordlist.txt + - en-wordlist.txt pipeline: - pyspelling.filters.python: group_comments: true @@ -13,13 +13,30 @@ matrix: context_visible_first: true delimiters: # Exclude figure rST tags - - open: '\.\.\s+(figure|literalinclude|)::' + - open: '\.\.\s+(figure|literalinclude|math|image|grid)::' + close: '\n' + # Exclude raw directive + - open: '\.\. (raw)::.*$\n*' close: '\n' # Exclude Python coding directives - open: '-\*- coding:' close: '\n' + # Exclude Authors: + - open: 'Author(|s):' + close: '\n' + # Exclude .rst directives: + - open: ':math:`.*`' + close: ' ' + # Ignore multiline content in codeblock + - open: '(?s)^::\n\n ' + close: '^\n' + # Ignore reStructuredText block directives + - open: '\.\. (code-block)::.*$\n*' + content: '(?P(^(?P[ ]+).*$\n))(?P(^([ \t]+.*|[ \t]*)$\n)*)' + close: '(^(?![ \t]+.*$))' - pyspelling.filters.markdown: - pyspelling.filters.html: ignores: - code - pre + - pyspelling.filters.url: diff --git a/beginner_source/Intro_to_TorchScript_tutorial.py b/beginner_source/Intro_to_TorchScript_tutorial.py index 02757752135..063abd442d1 100644 --- a/beginner_source/Intro_to_TorchScript_tutorial.py +++ b/beginner_source/Intro_to_TorchScript_tutorial.py @@ -2,7 +2,7 @@ Introduction to TorchScript =========================== -*James Reed (jamesreed@fb.com), Michael Suo (suo@fb.com)*, rev2 +**Authors:** James Reed (jamesreed@fb.com), Michael Suo (suo@fb.com), rev2 This tutorial is an introduction to TorchScript, an intermediate representation of a PyTorch model (subclass of ``nn.Module``) that @@ -147,7 +147,7 @@ def forward(self, x, h): ###################################################################### -# We’ve once again redefined our MyCell class, but here we’ve defined +# We’ve once again redefined our ``MyCell`` class, but here we’ve defined # ``MyDecisionGate``. This module utilizes **control flow**. Control flow # consists of things like loops and ``if``-statements. # @@ -202,7 +202,7 @@ def forward(self, x, h): # inputs* the network might see. # # What exactly has this done? It has invoked the ``Module``, recorded the -# operations that occured when the ``Module`` was run, and created an +# operations that occurred when the ``Module`` was run, and created an # instance of ``torch.jit.ScriptModule`` (of which ``TracedModule`` is an # instance) # @@ -283,7 +283,7 @@ def forward(self, x, h): # Looking at the ``.code`` output, we can see that the ``if-else`` branch # is nowhere to be found! Why? Tracing does exactly what we said it would: # run the code, record the operations *that happen* and construct a -# ScriptModule that does exactly that. Unfortunately, things like control +# ``ScriptModule`` that does exactly that. Unfortunately, things like control # flow are erased. # # How can we faithfully represent this module in TorchScript? 
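Not part of the patch above, but as a minimal sketch of the point being made here (reusing the ``MyDecisionGate`` module the tutorial defines), ``torch.jit.script`` compiles the Python source directly, so the data-dependent branch survives where tracing erases it:

.. code-block:: python

    import torch

    class MyDecisionGate(torch.nn.Module):
        # Same gate as in the tutorial: the branch taken depends on the input values.
        def forward(self, x):
            if x.sum() > 0:
                return x
            else:
                return -x

    scripted_gate = torch.jit.script(MyDecisionGate())
    print(scripted_gate.code)  # the if/else branch is still present in the emitted TorchScript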
We provide a diff --git a/beginner_source/chatbot_tutorial.py b/beginner_source/chatbot_tutorial.py index 3e4905126c4..02185a6ba3e 100644 --- a/beginner_source/chatbot_tutorial.py +++ b/beginner_source/chatbot_tutorial.py @@ -31,28 +31,28 @@ # :align: center # :alt: bot # -# .. code:: python -# -# > hello? -# Bot: hello . -# > where am I? -# Bot: you re in a hospital . -# > who are you? -# Bot: i m a lawyer . -# > how are you doing? -# Bot: i m fine . -# > are you my friend? -# Bot: no . -# > you're under arrest -# Bot: i m trying to help you ! -# > i'm just kidding -# Bot: i m sorry . -# > where are you from? -# Bot: san francisco . -# > it's time for me to leave -# Bot: i know . -# > goodbye -# Bot: goodbye . +# .. code-block:: python +# +# > hello? +# Bot: hello . +# > where am I? +# Bot: you re in a hospital . +# > who are you? +# Bot: i m a lawyer . +# > how are you doing? +# Bot: i m fine . +# > are you my friend? +# Bot: no . +# > you're under arrest +# Bot: i m trying to help you ! +# > i'm just kidding +# Bot: i m sorry . +# > where are you from? +# Bot: san francisco . +# > it's time for me to leave +# Bot: i know . +# > goodbye +# Bot: goodbye . # # **Tutorial Highlights** # @@ -65,7 +65,7 @@ # - Implement greedy-search decoding module # - Interact with trained chatbot # -# **Acknowledgements** +# **Acknowledgments** # # This tutorial borrows code from the following sources: # @@ -75,7 +75,7 @@ # 2) Sean Robertson’s practical-pytorch seq2seq-translation example: # https://github.com/spro/practical-pytorch/tree/master/seq2seq-translation # -# 3) FloydHub’s Cornell Movie Corpus preprocessing code: +# 3) FloydHub Cornell Movie Corpus preprocessing code: # https://github.com/floydhub/textutil-preprocess-cornell-movie-corpus # @@ -162,11 +162,11 @@ def printLines(file, n=10): # contains a tab-separated *query sentence* and a *response sentence* pair. # # The following functions facilitate the parsing of the raw -# *utterances.jsonl* data file. +# ``utterances.jsonl`` data file. # # - ``loadLinesAndConversations`` splits each line of the file into a dictionary of -# lines with fields: lineID, characterID, and text and then groups them -# into conversations with fields: conversationID, movieID, and lines. +# lines with fields: ``lineID``, ``characterID``, and text and then groups them +# into conversations with fields: ``conversationID``, ``movieID``, and lines. # - ``extractSentencePairs`` extracts pairs of sentences from # conversations # @@ -215,7 +215,7 @@ def extractSentencePairs(conversations): ###################################################################### # Now we’ll call these functions and create the file. We’ll call it -# *formatted_movie_lines.txt*. +# ``formatted_movie_lines.txt``. 
# # Define path to new file @@ -359,12 +359,12 @@ def readVocs(datafile, corpus_name): voc = Voc(corpus_name) return voc, pairs -# Returns True iff both sentences in a pair 'p' are under the MAX_LENGTH threshold +# Returns True if both sentences in a pair 'p' are under the MAX_LENGTH threshold def filterPair(p): # Input sequences need to preserve the last word for EOS token return len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH -# Filter pairs using filterPair condition +# Filter pairs using the ``filterPair`` condition def filterPairs(pairs): return [pair for pair in pairs if filterPair(pair)] @@ -659,7 +659,7 @@ def __init__(self, hidden_size, embedding, n_layers=1, dropout=0): self.hidden_size = hidden_size self.embedding = embedding - # Initialize GRU; the input_size and hidden_size params are both set to 'hidden_size' + # Initialize GRU; the input_size and hidden_size parameters are both set to 'hidden_size' # because our input size is a word embedding with number of features == hidden_size self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=(0 if n_layers == 1 else dropout), bidirectional=True) @@ -958,7 +958,7 @@ def train(input_variable, lengths, target_variable, mask, max_target_len, encode input_variable = input_variable.to(device) target_variable = target_variable.to(device) mask = mask.to(device) - # Lengths for rnn packing should always be on the cpu + # Lengths for RNN packing should always be on the CPU lengths = lengths.to("cpu") # Initialize variables @@ -1007,7 +1007,7 @@ def train(input_variable, lengths, target_variable, mask, max_target_len, encode print_losses.append(mask_loss.item() * nTotal) n_totals += nTotal - # Perform backpropatation + # Perform backpropagation loss.backward() # Clip gradients: gradients are modified in place @@ -1032,8 +1032,8 @@ def train(input_variable, lengths, target_variable, mask, max_target_len, encode # lifting with the ``train`` function. # # One thing to note is that when we save our model, we save a tarball -# containing the encoder and decoder state_dicts (parameters), the -# optimizers’ state_dicts, the loss, the iteration, etc. Saving the model +# containing the encoder and decoder ``state_dicts`` (parameters), the +# optimizers’ ``state_dicts``, the loss, the iteration, etc. Saving the model # in this way will give us the ultimate flexibility with the checkpoint. # After loading a checkpoint, we will be able to use the model parameters # to run inference, or we can continue training right where we left off. @@ -1240,8 +1240,8 @@ def evaluateInput(encoder, decoder, searcher, voc): # Configure models model_name = 'cb_model' attn_model = 'dot' -#attn_model = 'general' -#attn_model = 'concat' +#``attn_model = 'general'`` +#``attn_model = 'concat'`` hidden_size = 500 encoder_n_layers = 2 decoder_n_layers = 2 @@ -1251,12 +1251,17 @@ def evaluateInput(encoder, decoder, searcher, voc): # Set checkpoint to load from; set to None if starting from scratch loadFilename = None checkpoint_iter = 4000 -#loadFilename = os.path.join(save_dir, model_name, corpus_name, -# '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size), -# '{}_checkpoint.tar'.format(checkpoint_iter)) +############################################################# +# Sample code to load from a checkpoint: +# +# .. 
code-block:: python +# +# loadFilename = os.path.join(save_dir, model_name, corpus_name, +# '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size), +# '{}_checkpoint.tar'.format(checkpoint_iter)) -# Load model if a loadFilename is provided +# Load model if a ``loadFilename`` is provided if loadFilename: # If loading on same machine the model was trained on checkpoint = torch.load(loadFilename) @@ -1319,7 +1324,7 @@ def evaluateInput(encoder, decoder, searcher, voc): encoder_optimizer.load_state_dict(encoder_optimizer_sd) decoder_optimizer.load_state_dict(decoder_optimizer_sd) -# If you have cuda, configure cuda to call +# If you have CUDA, configure CUDA to call for state in encoder_optimizer.state.values(): for k, v in state.items(): if isinstance(v, torch.Tensor): @@ -1344,7 +1349,7 @@ def evaluateInput(encoder, decoder, searcher, voc): # To chat with your model, run the following block. # -# Set dropout layers to eval mode +# Set dropout layers to ``eval`` mode encoder.eval() decoder.eval() diff --git a/beginner_source/dcgan_faces_tutorial.py b/beginner_source/dcgan_faces_tutorial.py index a909f713393..2ee43d72d2f 100644 --- a/beginner_source/dcgan_faces_tutorial.py +++ b/beginner_source/dcgan_faces_tutorial.py @@ -15,7 +15,7 @@ # This tutorial will give an introduction to DCGANs through an example. We # will train a generative adversarial network (GAN) to generate new # celebrities after showing it pictures of many real celebrities. Most of -# the code here is from the dcgan implementation in +# the code here is from the DCGAN implementation in # `pytorch/examples `__, and this # document will give a thorough explanation of the implementation and shed # light on how and why this model works. But don’t worry, no prior @@ -30,8 +30,8 @@ # What is a GAN? # ~~~~~~~~~~~~~~ # -# GANs are a framework for teaching a DL model to capture the training -# data’s distribution so we can generate new data from that same +# GANs are a framework for teaching a deep learning model to capture the training +# data distribution so we can generate new data from that same # distribution. GANs were invented by Ian Goodfellow in 2014 and first # described in the paper `Generative Adversarial # Nets `__. @@ -145,35 +145,35 @@ # # Let’s define some inputs for the run: # -# - **dataroot** - the path to the root of the dataset folder. We will -# talk more about the dataset in the next section -# - **workers** - the number of worker threads for loading the data with -# the DataLoader -# - **batch_size** - the batch size used in training. The DCGAN paper -# uses a batch size of 128 -# - **image_size** - the spatial size of the images used for training. +# - ``dataroot`` - the path to the root of the dataset folder. We will +# talk more about the dataset in the next section. +# - ``workers`` - the number of worker threads for loading the data with +# the ``DataLoader``. +# - ``batch_size`` - the batch size used in training. The DCGAN paper +# uses a batch size of 128. +# - ``image_size`` - the spatial size of the images used for training. # This implementation defaults to 64x64. If another size is desired, # the structures of D and G must be changed. See # `here `__ for more -# details -# - **nc** - number of color channels in the input images. 
For color -# images this is 3 -# - **nz** - length of latent vector -# - **ngf** - relates to the depth of feature maps carried through the -# generator -# - **ndf** - sets the depth of feature maps propagated through the -# discriminator -# - **num_epochs** - number of training epochs to run. Training for +# details. +# - ``nc`` - number of color channels in the input images. For color +# images this is 3. +# - ``nz`` - length of latent vector. +# - ``ngf`` - relates to the depth of feature maps carried through the +# generator. +# - ``ndf`` - sets the depth of feature maps propagated through the +# discriminator. +# - ``num_epochs`` - number of training epochs to run. Training for # longer will probably lead to better results but will also take much -# longer -# - **lr** - learning rate for training. As described in the DCGAN paper, -# this number should be 0.0002 -# - **beta1** - beta1 hyperparameter for Adam optimizers. As described in -# paper, this number should be 0.5 -# - **ngpu** - number of GPUs available. If this is 0, code will run in +# longer. +# - ``lr`` - learning rate for training. As described in the DCGAN paper, +# this number should be 0.0002. +# - ``beta1`` - beta1 hyperparameter for Adam optimizers. As described in +# paper, this number should be 0.5. +# - ``ngpu`` - number of GPUs available. If this is 0, code will run in # CPU mode. If this number is greater than 0 it will run on that number -# of GPUs -# +# of GPUs. +# # Root directory for dataset dataroot = "data/celeba" @@ -206,7 +206,7 @@ # Learning rate for optimizers lr = 0.0002 -# Beta1 hyperparam for Adam optimizers +# Beta1 hyperparameter for Adam optimizers beta1 = 0.5 # Number of GPUs available. Use 0 for CPU mode. @@ -221,10 +221,10 @@ # dataset `__ which can # be downloaded at the linked site, or in `Google # Drive `__. -# The dataset will download as a file named *img_align_celeba.zip*. Once -# downloaded, create a directory named *celeba* and extract the zip file -# into that directory. Then, set the *dataroot* input for this notebook to -# the *celeba* directory you just created. The resulting directory +# The dataset will download as a file named ``img_align_celeba.zip``. Once +# downloaded, create a directory named ``celeba`` and extract the zip file +# into that directory. Then, set the ``dataroot`` input for this notebook to +# the ``celeba`` directory you just created. The resulting directory # structure should be: # # :: @@ -237,9 +237,9 @@ # -> 537394.jpg # ... # -# This is an important step because we will be using the ImageFolder +# This is an important step because we will be using the ``ImageFolder`` # dataset class, which requires there to be subdirectories in the -# dataset’s root folder. Now, we can create the dataset, create the +# dataset root folder. Now, we can create the dataset, create the # dataloader, set the device to run on, and finally visualize some of the # training data. # @@ -282,14 +282,14 @@ # ~~~~~~~~~~~~~~~~~~~~~ # # From the DCGAN paper, the authors specify that all model weights shall -# be randomly initialized from a Normal distribution with mean=0, -# stdev=0.02. The ``weights_init`` function takes an initialized model as +# be randomly initialized from a Normal distribution with ``mean=0``, +# ``stdev=0.02``. The ``weights_init`` function takes an initialized model as # input and reinitializes all convolutional, convolutional-transpose, and # batch normalization layers to meet this criteria. 
This function is # applied to the models immediately after initialization. # -# custom weights initialization called on netG and netD +# custom weights initialization called on ``netG`` and ``netD`` def weights_init(m): classname = m.__class__.__name__ if classname.find('Conv') != -1: @@ -319,10 +319,10 @@ def weights_init(m): # .. figure:: /_static/img/dcgan_generator.png # :alt: dcgan_generator # -# Notice, how the inputs we set in the input section (*nz*, *ngf*, and -# *nc*) influence the generator architecture in code. *nz* is the length -# of the z input vector, *ngf* relates to the size of the feature maps -# that are propagated through the generator, and *nc* is the number of +# Notice, how the inputs we set in the input section (``nz``, ``ngf``, and +# ``nc``) influence the generator architecture in code. ``nz`` is the length +# of the z input vector, ``ngf`` relates to the size of the feature maps +# that are propagated through the generator, and ``nc`` is the number of # channels in the output image (set to 3 for RGB images). Below is the # code for the generator. # @@ -338,22 +338,22 @@ def __init__(self, ngpu): nn.ConvTranspose2d( nz, ngf * 8, 4, 1, 0, bias=False), nn.BatchNorm2d(ngf * 8), nn.ReLU(True), - # state size. (ngf*8) x 4 x 4 + # state size. ``(ngf*8) x 4 x 4`` nn.ConvTranspose2d(ngf * 8, ngf * 4, 4, 2, 1, bias=False), nn.BatchNorm2d(ngf * 4), nn.ReLU(True), - # state size. (ngf*4) x 8 x 8 + # state size. ``(ngf*4) x 8 x 8`` nn.ConvTranspose2d( ngf * 4, ngf * 2, 4, 2, 1, bias=False), nn.BatchNorm2d(ngf * 2), nn.ReLU(True), - # state size. (ngf*2) x 16 x 16 + # state size. ``(ngf*2) x 16 x 16`` nn.ConvTranspose2d( ngf * 2, ngf, 4, 2, 1, bias=False), nn.BatchNorm2d(ngf), nn.ReLU(True), - # state size. (ngf) x 32 x 32 + # state size. ``(ngf) x 32 x 32`` nn.ConvTranspose2d( ngf, nc, 4, 2, 1, bias=False), nn.Tanh() - # state size. (nc) x 64 x 64 + # state size. ``(nc) x 64 x 64`` ) def forward(self, input): @@ -369,12 +369,12 @@ def forward(self, input): # Create the generator netG = Generator(ngpu).to(device) -# Handle multi-gpu if desired +# Handle multi-GPU if desired if (device.type == 'cuda') and (ngpu > 1): netG = nn.DataParallel(netG, list(range(ngpu))) -# Apply the weights_init function to randomly initialize all weights -# to mean=0, stdev=0.02. +# Apply the ``weights_init`` function to randomly initialize all weights +# to ``mean=0``, ``stdev=0.02``. netG.apply(weights_init) # Print the model @@ -408,22 +408,22 @@ def __init__(self, ngpu): super(Discriminator, self).__init__() self.ngpu = ngpu self.main = nn.Sequential( - # input is (nc) x 64 x 64 + # input is ``(nc) x 64 x 64`` nn.Conv2d(nc, ndf, 4, 2, 1, bias=False), nn.LeakyReLU(0.2, inplace=True), - # state size. (ndf) x 32 x 32 + # state size. ``(ndf) x 32 x 32`` nn.Conv2d(ndf, ndf * 2, 4, 2, 1, bias=False), nn.BatchNorm2d(ndf * 2), nn.LeakyReLU(0.2, inplace=True), - # state size. (ndf*2) x 16 x 16 + # state size. ``(ndf*2) x 16 x 16`` nn.Conv2d(ndf * 2, ndf * 4, 4, 2, 1, bias=False), nn.BatchNorm2d(ndf * 4), nn.LeakyReLU(0.2, inplace=True), - # state size. (ndf*4) x 8 x 8 + # state size. ``(ndf*4) x 8 x 8`` nn.Conv2d(ndf * 4, ndf * 8, 4, 2, 1, bias=False), nn.BatchNorm2d(ndf * 8), nn.LeakyReLU(0.2, inplace=True), - # state size. (ndf*8) x 4 x 4 + # state size. 
``(ndf*8) x 4 x 4`` nn.Conv2d(ndf * 8, 1, 4, 1, 0, bias=False), nn.Sigmoid() ) @@ -440,12 +440,12 @@ def forward(self, input): # Create the Discriminator netD = Discriminator(ngpu).to(device) -# Handle multi-gpu if desired +# Handle multi-GPU if desired if (device.type == 'cuda') and (ngpu > 1): netD = nn.DataParallel(netD, list(range(ngpu))) -# Apply the weights_init function to randomly initialize all weights -# to mean=0, stdev=0.2. +# Apply the ``weights_init`` function to randomly initialize all weights +# like this: ``to mean=0, stdev=0.2``. netD.apply(weights_init) # Print the model @@ -485,7 +485,7 @@ def forward(self, input): # images form out of the noise. # -# Initialize BCELoss function +# Initialize the ``BCELoss`` function criterion = nn.BCELoss() # Create batch of latent vectors that we will use to visualize @@ -509,7 +509,8 @@ def forward(self, input): # we can train it. Be mindful that training GANs is somewhat of an art # form, as incorrect hyperparameter settings lead to mode collapse with # little explanation of what went wrong. Here, we will closely follow -# Algorithm 1 from Goodfellow’s paper, while abiding by some of the best +# Algorithm 1 from the `Goodfellow’s paper `__, +# while abiding by some of the best # practices shown in `ganhacks `__. # Namely, we will “construct different mini-batches for real and fake” # images, and also adjust G’s objective function to maximize @@ -523,7 +524,8 @@ def forward(self, input): # terms of Goodfellow, we wish to “update the discriminator by ascending # its stochastic gradient”. Practically, we want to maximize # :math:`log(D(x)) + log(1-D(G(z)))`. Due to the separate mini-batch -# suggestion from ganhacks, we will calculate this in two steps. First, we +# suggestion from `ganhacks `__, +# we will calculate this in two steps. First, we # will construct a batch of real samples from the training set, forward # pass through :math:`D`, calculate the loss (:math:`log(D(x))`), then # calculate the gradients in a backward pass. Secondly, we will construct @@ -545,7 +547,7 @@ def forward(self, input): # G’s gradients in a backward pass, and finally updating G’s parameters # with an optimizer step. It may seem counter-intuitive to use the real # labels as GT labels for the loss function, but this allows us to use the -# :math:`log(x)` part of the BCELoss (rather than the :math:`log(1-x)` +# :math:`log(x)` part of the ``BCELoss`` (rather than the :math:`log(1-x)` # part) which is exactly what we want. # # Finally, we will do some statistic reporting and at the end of each diff --git a/beginner_source/deploy_seq2seq_hybrid_frontend_tutorial.py b/beginner_source/deploy_seq2seq_hybrid_frontend_tutorial.py index 3d1b4c13b32..5e985b58598 100644 --- a/beginner_source/deploy_seq2seq_hybrid_frontend_tutorial.py +++ b/beginner_source/deploy_seq2seq_hybrid_frontend_tutorial.py @@ -39,7 +39,7 @@ # the Python runtime. # # The API for converting eager-mode PyTorch programs into TorchScript is -# found in the torch.jit module. This module has two core modalities for +# found in the ``torch.jit`` module. This module has two core modalities for # converting an eager-mode model to a TorchScript graph representation: # **tracing** and **scripting**. The ``torch.jit.trace`` function takes a # module or function and a set of example inputs. 
It then runs the example @@ -74,18 +74,18 @@ ###################################################################### -# Acknowledgements +# Acknowledgments # ---------------- # # This tutorial was inspired by the following sources: # -# 1) Yuan-Kuei Wu’s pytorch-chatbot implementation: +# 1) Yuan-Kuei Wu's pytorch-chatbot implementation: # https://github.com/ywk991112/pytorch-chatbot # -# 2) Sean Robertson’s practical-pytorch seq2seq-translation example: +# 2) Sean Robertson's practical-pytorch seq2seq-translation example: # https://github.com/spro/practical-pytorch/tree/master/seq2seq-translation # -# 3) FloydHub’s Cornell Movie Corpus preprocessing code: +# 3) FloydHub's Cornell Movie Corpus preprocessing code: # https://github.com/floydhub/textutil-preprocess-cornell-movie-corpus # @@ -290,7 +290,7 @@ def __init__(self, hidden_size, embedding, n_layers=1, dropout=0): self.hidden_size = hidden_size self.embedding = embedding - # Initialize GRU; the input_size and hidden_size params are both set to 'hidden_size' + # Initialize GRU; the ``input_size`` and ``hidden_size`` parameters are both set to 'hidden_size' # because our input size is a word embedding with number of features == hidden_size self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=(0 if n_layers == 1 else dropout), bidirectional=True) @@ -525,7 +525,7 @@ def forward(self, input_step, last_hidden, encoder_outputs): # we can use function type annotations as introduced in `PEP # 3107 `__. In addition, # it is possible to declare arguments of different types using -# MyPy-style type annotations (see +# Mypy-style type annotations (see # `doc `__). # # @@ -618,7 +618,7 @@ def evaluate(searcher, voc, sentence, max_length=MAX_LENGTH): return decoded_words -# Evaluate inputs from user input (stdin) +# Evaluate inputs from user input (``stdin``) def evaluateInput(searcher, voc): input_sentence = '' while(1): @@ -638,7 +638,7 @@ def evaluateInput(searcher, voc): except KeyError: print("Error: Encountered unknown word.") -# Normalize input sentence and call evaluate() +# Normalize input sentence and call ``evaluate()`` def evaluateExample(sentence, searcher, voc): print("> " + sentence) # Normalize sentence @@ -653,7 +653,7 @@ def evaluateExample(sentence, searcher, voc): # Load Pretrained Parameters # -------------------------- # -# Ok, its time to load our model! +# Now, let's load our model! # # Use hosted model # ~~~~~~~~~~~~~~~~ @@ -671,7 +671,7 @@ def evaluateExample(sentence, searcher, voc): # Use your own model # ~~~~~~~~~~~~~~~~~~ # -# To load your own pre-trained model: +# To load your own pretrained model: # # 1) Set the ``loadFilename`` variable to the path to the checkpoint file # that you wish to load. Note that if you followed the convention for @@ -691,9 +691,9 @@ def evaluateExample(sentence, searcher, voc): # ~~~~~~~~~~~~~~~~~~~~~~ # # Notice that we initialize and load parameters into our encoder and -# decoder models as usual. If you are using tracing mode(`torch.jit.trace`) -# for some part of your models, you must call .to(device) to set the device -# options of the models and .eval() to set the dropout layers to test mode +# decoder models as usual. If you are using tracing mode (``torch.jit.trace``) +# for some part of your models, you must call ``.to(device)`` to set the device +# options of the models and ``.eval()`` to set the dropout layers to test mode # **before** tracing the models. `TracedModule` objects do not inherit the # ``to`` or ``eval`` methods.
Since in this tutorial we are only using # scripting instead of tracing, we only need to do this before we do @@ -706,7 +706,7 @@ def evaluateExample(sentence, searcher, voc): # Configure models model_name = 'cb_model' attn_model = 'dot' -#attn_model = 'general' +#attn_model = 'general'`` #attn_model = 'concat' hidden_size = 500 encoder_n_layers = 2 @@ -717,7 +717,13 @@ def evaluateExample(sentence, searcher, voc): # If you're loading your own model # Set checkpoint to load from checkpoint_iter = 4000 -# loadFilename = os.path.join(save_dir, model_name, corpus_name, + +############################################################# +# Sample code to load from a checkpoint: +# +# .. code-block:: python +# +# loadFilename = os.path.join(save_dir, model_name, corpus_name, # '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size), # '{}_checkpoint.tar'.format(checkpoint_iter)) @@ -743,13 +749,13 @@ def evaluateExample(sentence, searcher, voc): # Initialize encoder & decoder models encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout) decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size, voc.num_words, decoder_n_layers, dropout) -# Load trained model params +# Load trained model parameters encoder.load_state_dict(encoder_sd) decoder.load_state_dict(decoder_sd) # Use appropriate device encoder = encoder.to(device) decoder = decoder.to(device) -# Set dropout layers to eval mode +# Set dropout layers to ``eval`` mode encoder.eval() decoder.eval() print('Models built and ready to go!') @@ -794,7 +800,7 @@ def evaluateExample(sentence, searcher, voc): # data-dependent control flow. In the case of scripting, we do necessary # language changes to make sure the implementation complies with # TorchScript. We initialize the scripted searcher the same way that we -# would initialize an un-scripted variant. +# would initialize an unscripted variant. # ### Compile the whole greedy search model to TorchScript model @@ -847,7 +853,7 @@ def evaluateExample(sentence, searcher, voc): # Use appropriate device scripted_searcher.to(device) -# Set dropout layers to eval mode +# Set dropout layers to ``eval`` mode scripted_searcher.eval() # Evaluate examples @@ -855,8 +861,8 @@ def evaluateExample(sentence, searcher, voc): for s in sentences: evaluateExample(s, scripted_searcher, voc) -# Evaluate your input -#evaluateInput(traced_encoder, traced_decoder, scripted_searcher, voc) +# Evaluate your input by running +# ``evaluateInput(traced_encoder, traced_decoder, scripted_searcher, voc)`` ###################################################################### diff --git a/beginner_source/fgsm_tutorial.py b/beginner_source/fgsm_tutorial.py index 69b7f2e9964..fa23680496c 100644 --- a/beginner_source/fgsm_tutorial.py +++ b/beginner_source/fgsm_tutorial.py @@ -120,7 +120,7 @@ # There are only three inputs for this tutorial, and are defined as # follows: # -# - **epsilons** - List of epsilon values to use for the run. It is +# - ``epsilons`` - List of epsilon values to use for the run. It is # important to keep 0 in the list because it represents the model # performance on the original test set. Also, intuitively we would # expect the larger the epsilon, the more noticeable the perturbations @@ -128,12 +128,12 @@ # accuracy. Since the data range here is :math:`[0,1]`, no epsilon # value should exceed 1. 
# -# - **pretrained_model** - path to the pretrained MNIST model which was +# - ``pretrained_model`` - path to the pretrained MNIST model which was # trained with # `pytorch/examples/mnist `__. # For simplicity, download the pretrained model `here `__. # -# - **use_cuda** - boolean flag to use CUDA if desired and available. +# - ``use_cuda`` - boolean flag to use CUDA if desired and available. # Note, a GPU with CUDA is not critical for this tutorial as a CPU will # not take much time. # @@ -263,7 +263,7 @@ def test( model, device, test_loader, epsilon ): output = model(data) init_pred = output.max(1, keepdim=True)[1] # get the index of the max log-probability - # If the initial prediction is wrong, dont bother attacking, just move on + # If the initial prediction is wrong, don't bother attacking, just move on if init_pred.item() != target.item(): continue @@ -276,7 +276,7 @@ def test( model, device, test_loader, epsilon ): # Calculate gradients of model in backward pass loss.backward() - # Collect datagrad + # Collect ``datagrad`` data_grad = data.grad.data # Call FGSM Attack @@ -366,7 +366,7 @@ def test( model, device, test_loader, epsilon ): # Remember the idea of no free lunch? In this case, as epsilon increases # the test accuracy decreases **BUT** the perturbations become more easily # perceptible. In reality, there is a tradeoff between accuracy -# degredation and perceptibility that an attacker must consider. Here, we +# degradation and perceptibility that an attacker must consider. Here, we # show some examples of successful adversarial examples at each epsilon # value. Each row of the plot shows a different epsilon value. The first # row is the :math:`\epsilon=0` examples which represent the original diff --git a/beginner_source/flava_finetuning_tutorial.py b/beginner_source/flava_finetuning_tutorial.py index ef54c5ea06d..12e20f475f8 100644 --- a/beginner_source/flava_finetuning_tutorial.py +++ b/beginner_source/flava_finetuning_tutorial.py @@ -24,7 +24,7 @@ ###################################################################### # Installation # ----------------- -# We will use TextVQA dataset and bert tokenizer from HuggingFace for this +# We will use TextVQA dataset and ``bert tokenizer`` from Hugging Face for this # tutorial. So you need to install datasets and transformers in addition to TorchMultimodal. # # .. note:: @@ -40,21 +40,21 @@ # ###################################################################### -# Steps +# Steps # ----- -# -# 1. Download the HuggingFace dataset to a directory on your computer by running the following command: -# +# +# 1. Download the Hugging Face dataset to a directory on your computer by running the following command: +# # .. code-block:: -# +# # wget http://dl.fbaipublicfiles.com/pythia/data/vocab.tar.gz # tar xf vocab.tar.gz -# +# # .. note:: # If you are running this tutorial in Google Colab, run these commands # in a new cell and prepend these commands with an exclamation mark (!) # -# +# # 2. For this tutorial, we treat VQA as a classification task where # the inputs are images and question (text) and the output is an answer class. # So we need to download the vocab file with answer classes and create the answer to @@ -62,7 +62,7 @@ # # We also load the `textvqa # dataset `__ containing 34602 training samples -# (images,questions and answers) from HuggingFace +# (images,questions and answers) from Hugging Face # # We see there are 3997 answer classes including a class representing # unknown answers. @@ -98,8 +98,8 @@ # 3. 
Next, we write the transform function to convert the image and text into # Tensors consumable by our model - For images, we use the transforms from # torchvision to convert to Tensor and resize to uniform sizes - For text, -# we tokenize (and pad) them using the BertTokenizer from HuggingFace - -# For answers (i.e. labels), we take the most frequently occuring answer +# we tokenize (and pad) them using the ``BertTokenizer`` from Hugging Face - +# For answers (i.e. labels), we take the most frequently occurring answer # as the label to train with: # @@ -133,8 +133,8 @@ def transform(tokenizer, input): ###################################################################### -# 4. Finally, we import the flava_model_for_classification from -# torchmultimodal. It loads the pretrained flava checkpoint by default and +# 4. Finally, we import the ``flava_model_for_classification`` from +# ``torchmultimodal``. It loads the pretrained FLAVA checkpoint by default and # includes a classification head. # # The model forward function passes the image through the visual encoder diff --git a/beginner_source/hyperparameter_tuning_tutorial.py b/beginner_source/hyperparameter_tuning_tutorial.py index 7f93ad233ba..35ab5c9538e 100644 --- a/beginner_source/hyperparameter_tuning_tutorial.py +++ b/beginner_source/hyperparameter_tuning_tutorial.py @@ -389,7 +389,7 @@ def main(num_samples=10, max_num_epochs=10, gpus_per_trial=2): grace_period=1, reduction_factor=2) reporter = CLIReporter( - # parameter_columns=["l1", "l2", "lr", "batch_size"], + # ``parameter_columns=["l1", "l2", "lr", "batch_size"]``, metric_columns=["loss", "accuracy", "training_iteration"]) result = tune.run( partial(train_cifar, data_dir=data_dir), @@ -425,7 +425,7 @@ def main(num_samples=10, max_num_epochs=10, gpus_per_trial=2): if __name__ == "__main__": # sphinx_gallery_start_ignore - # Fixes AttributeError: '_LoggingTee' object has no attribute 'fileno'. + # Fixes ``AttributeError: '_LoggingTee' object has no attribute 'fileno'``. # This is only needed to run with sphinx-build. import sys sys.stdout.fileno = lambda: False diff --git a/beginner_source/nn_tutorial.py b/beginner_source/nn_tutorial.py index cdc20040654..bc32131b93a 100644 --- a/beginner_source/nn_tutorial.py +++ b/beginner_source/nn_tutorial.py @@ -2,10 +2,12 @@ """ What is `torch.nn` *really*? ============================ -by Jeremy Howard, `fast.ai `_. Thanks to Rachel Thomas and Francisco Ingham. + +**Authors:** Jeremy Howard, `fast.ai `_. Thanks to Rachel Thomas and Francisco Ingham. """ + ############################################################################### -# We recommend running this tutorial as a notebook, not a script. To download the notebook (.ipynb) file, +# We recommend running this tutorial as a notebook, not a script. To download the notebook (``.ipynb``) file, # click the link at the top of the page. # # PyTorch provides the elegantly designed modules and classes `torch.nn `_ , @@ -90,7 +92,7 @@ print(y_train.min(), y_train.max()) ############################################################################### -# Neural net from scratch (no torch.nn) +# Neural net from scratch (without ``torch.nn``) # --------------------------------------------- # # Let's first create a model using nothing but PyTorch tensor operations. We're assuming @@ -109,7 +111,7 @@ # # .. note:: We are initializing the weights here with # `Xavier initialisation `_ -# (by multiplying with 1/sqrt(n)). +# (by multiplying with ``1/sqrt(n)``). 
import math @@ -123,7 +125,7 @@ # let's just write a plain matrix multiplication and broadcasted addition # to create a simple linear model. We also need an activation function, so # we'll write `log_softmax` and use it. Remember: although PyTorch -# provides lots of pre-written loss functions, activation functions, and +# provides lots of prewritten loss functions, activation functions, and # so forth, you can easily write your own using plain python. PyTorch will # even create fast GPU or vectorized CPU code for your function # automatically. @@ -242,7 +244,7 @@ def accuracy(out, yb): print(loss_func(model(xb), yb), accuracy(model(xb), yb)) ############################################################################### -# Using torch.nn.functional +# Using ``torch.nn.functional`` # ------------------------------ # # We will now refactor our code, so that it does the same thing as before, only @@ -278,7 +280,7 @@ def model(xb): print(loss_func(model(xb), yb), accuracy(model(xb), yb)) ############################################################################### -# Refactor using nn.Module +# Refactor using ``nn.Module`` # ----------------------------- # Next up, we'll use ``nn.Module`` and ``nn.Parameter``, for a clearer and more # concise training loop. We subclass ``nn.Module`` (which itself is a class and @@ -320,22 +322,26 @@ def forward(self, xb): ############################################################################### # Previously for our training loop we had to update the values for each parameter # by name, and manually zero out the grads for each parameter separately, like this: +# # :: -# with torch.no_grad(): -# weights -= weights.grad * lr -# bias -= bias.grad * lr -# weights.grad.zero_() -# bias.grad.zero_() +# +# with torch.no_grad(): +# weights -= weights.grad * lr +# bias -= bias.grad * lr +# weights.grad.zero_() +# bias.grad.zero_() # # # Now we can take advantage of model.parameters() and model.zero_grad() (which # are both defined by PyTorch for ``nn.Module``) to make those steps more concise # and less prone to the error of forgetting some of our parameters, particularly # if we had a more complicated model: +# # :: -# with torch.no_grad(): -# for p in model.parameters(): p -= p.grad * lr -# model.zero_grad() +# +# with torch.no_grad(): +# for p in model.parameters(): p -= p.grad * lr +# model.zero_grad() # # # We'll wrap our little training loop in a ``fit`` function so we can run it @@ -365,8 +371,8 @@ def fit(): print(loss_func(model(xb), yb)) ############################################################################### -# Refactor using nn.Linear -# ------------------------- +# Refactor using ``nn.Linear`` +# ---------------------------- # # We continue to refactor our code. Instead of manually defining and # initializing ``self.weights`` and ``self.bias``, and calculating ``xb @ @@ -398,7 +404,7 @@ def forward(self, xb): print(loss_func(model(xb), yb)) ############################################################################### -# Refactor using optim +# Refactor using ``torch.optim`` # ------------------------------ # # Pytorch also has a package with various optimization algorithms, ``torch.optim``. @@ -406,15 +412,19 @@ def forward(self, xb): # of manually updating each parameter. 
# # This will let us replace our previous manually coded optimization step: +# # :: -# with torch.no_grad(): -# for p in model.parameters(): p -= p.grad * lr -# model.zero_grad() +# +# with torch.no_grad(): +# for p in model.parameters(): p -= p.grad * lr +# model.zero_grad() # # and instead use just: +# # :: -# opt.step() -# opt.zero_grad() +# +# opt.step() +# opt.zero_grad() # # (``optim.zero_grad()`` resets the gradient to 0 and we need to call it before # computing the gradient for the next minibatch.) @@ -473,15 +483,19 @@ def get_model(): train_ds = TensorDataset(x_train, y_train) ############################################################################### -# Previously, we had to iterate through minibatches of x and y values separately: +# Previously, we had to iterate through minibatches of ``x`` and ``y`` values separately: +# # :: -# xb = x_train[start_i:end_i] -# yb = y_train[start_i:end_i] +# +# xb = x_train[start_i:end_i] +# yb = y_train[start_i:end_i] # # # Now, we can do these two steps together: +# # :: -# xb,yb = train_ds[i*bs : i*bs+bs] +# +# xb,yb = train_ds[i*bs : i*bs+bs] # model, opt = get_model() @@ -499,13 +513,13 @@ def get_model(): print(loss_func(model(xb), yb)) ############################################################################### -# Refactor using DataLoader +# Refactor using ``DataLoader`` # ------------------------------ # -# Pytorch's ``DataLoader`` is responsible for managing batches. You can +# PyTorch's ``DataLoader`` is responsible for managing batches. You can # create a ``DataLoader`` from any ``Dataset``. ``DataLoader`` makes it easier # to iterate over batches. Rather than having to use ``train_ds[i*bs : i*bs+bs]``, -# the DataLoader gives us each minibatch automatically. +# the ``DataLoader`` gives us each minibatch automatically. from torch.utils.data import DataLoader @@ -513,16 +527,20 @@ def get_model(): train_dl = DataLoader(train_ds, batch_size=bs) ############################################################################### -# Previously, our loop iterated over batches (xb, yb) like this: +# Previously, our loop iterated over batches ``(xb, yb)`` like this: +# # :: -# for i in range((n-1)//bs + 1): -# xb,yb = train_ds[i*bs : i*bs+bs] -# pred = model(xb) # -# Now, our loop is much cleaner, as (xb, yb) are loaded automatically from the data loader: +# for i in range((n-1)//bs + 1): +# xb,yb = train_ds[i*bs : i*bs+bs] +# pred = model(xb) +# +# Now, our loop is much cleaner, as ``(xb, yb)`` are loaded automatically from the data loader: +# # :: -# for xb,yb in train_dl: -# pred = model(xb) +# +# for xb,yb in train_dl: +# pred = model(xb) model, opt = get_model() @@ -538,7 +556,7 @@ def get_model(): print(loss_func(model(xb), yb)) ############################################################################### -# Thanks to Pytorch's ``nn.Module``, ``nn.Parameter``, ``Dataset``, and ``DataLoader``, +# Thanks to PyTorch's ``nn.Module``, ``nn.Parameter``, ``Dataset``, and ``DataLoader``, # our training loop is now dramatically smaller and easier to understand. Let's # now try to add the basic features necessary to create effective models in practice. # @@ -573,7 +591,7 @@ def get_model(): # # (Note that we always call ``model.train()`` before training, and ``model.eval()`` # before inference, because these are used by layers such as ``nn.BatchNorm2d`` -# and ``nn.Dropout`` to ensure appropriate behaviour for these different phases.) +# and ``nn.Dropout`` to ensure appropriate behavior for these different phases.) 
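As a small illustration of that note (a sketch added here, not code from the tutorial), a bare ``nn.Dropout`` layer shows the difference between the two modes directly:

.. code-block:: python

    import torch
    from torch import nn

    drop = nn.Dropout(p=0.5)
    x = torch.ones(4)

    drop.train()     # training mode: random entries are zeroed, survivors are scaled by 1/(1-p)
    print(drop(x))

    drop.eval()      # evaluation mode: dropout becomes a no-op
    print(drop(x))   # tensor([1., 1., 1., 1.])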
model, opt = get_model() @@ -667,11 +685,11 @@ def get_data(train_ds, valid_ds, bs): # Because none of the functions in the previous section assume anything about # the model form, we'll be able to use them to train a CNN without any modification. # -# We will use Pytorch's predefined +# We will use PyTorch's predefined # `Conv2d `_ class # as our convolutional layer. We define a CNN with 3 convolutional layers. # Each convolution is followed by a ReLU. At the end, we perform an -# average pooling. (Note that ``view`` is PyTorch's version of numpy's +# average pooling. (Note that ``view`` is PyTorch's version of Numpy's # ``reshape``) class Mnist_CNN(nn.Module): @@ -702,7 +720,7 @@ def forward(self, xb): fit(epochs, model, loss_func, opt, train_dl, valid_dl) ############################################################################### -# nn.Sequential +# Using ``nn.Sequential`` # ------------------------ # # ``torch.nn`` has another handy class we can use to simplify our code: @@ -729,7 +747,7 @@ def preprocess(x): return x.view(-1, 1, 28, 28) ############################################################################### -# The model created with ``Sequential`` is simply: +# The model created with ``Sequential`` is simple: model = nn.Sequential( Lambda(preprocess), @@ -748,7 +766,7 @@ def preprocess(x): fit(epochs, model, loss_func, opt, train_dl, valid_dl) ############################################################################### -# Wrapping DataLoader +# Wrapping ``DataLoader`` # ----------------------------- # # Our CNN is fairly concise, but it only works with MNIST, because: @@ -862,7 +880,7 @@ def preprocess(x, y): # ``torch.nn``, ``torch.optim``, ``Dataset``, and ``DataLoader``. So let's summarize # what we've seen: # -# - **torch.nn** +# - ``torch.nn``: # # + ``Module``: creates a callable which behaves like a function, but can also # contain state(such as neural net layer weights). It knows what ``Parameter`` (s) it diff --git a/beginner_source/profiler.py b/beginner_source/profiler.py index 450e450b0f0..95d077f7ba3 100644 --- a/beginner_source/profiler.py +++ b/beginner_source/profiler.py @@ -6,7 +6,7 @@ PyTorch includes a profiler API that is useful to identify the time and memory costs of various PyTorch operations in your code. Profiler can be easily integrated in your code, and the results can be printed as a table -or retured in a JSON trace file. +or returned in a JSON trace file. .. note:: Profiler supports multithreaded models. Profiler runs in the diff --git a/beginner_source/saving_loading_models.py b/beginner_source/saving_loading_models.py index f3c74828fa8..d4b328156ce 100644 --- a/beginner_source/saving_loading_models.py +++ b/beginner_source/saving_loading_models.py @@ -60,7 +60,7 @@ # linear layers, etc.) and registered buffers (batchnorm's running_mean) # have entries in the model’s *state_dict*. Optimizer # objects (``torch.optim``) also have a *state_dict*, which contains -# information about the optimizer’s state, as well as the hyperparameters +# information about the optimizer's state, as well as the hyperparameters # used. # # Because *state_dict* objects are Python dictionaries, they can be easily @@ -158,9 +158,9 @@ # # .. note:: # The 1.6 release of PyTorch switched ``torch.save`` to use a new -# zipfile-based file format. ``torch.load`` still retains the ability to +# zip file-based format. ``torch.load`` still retains the ability to # load files in the old format. 
If for any reason you want ``torch.save`` -# to use the old format, pass the kwarg ``_use_new_zipfile_serialization=False``. +# to use the old format, pass the ``kwarg``parameter ``_use_new_zipfile_serialization=False``. # # When saving a model for inference, it is only necessary to save the # trained model’s learned parameters. Saving the model’s *state_dict* with @@ -302,7 +302,7 @@ # # When saving a general checkpoint, to be used for either inference or # resuming training, you must save more than just the model’s -# *state_dict*. It is important to also save the optimizer’s *state_dict*, +# *state_dict*. It is important to also save the optimizer's *state_dict*, # as this contains buffers and parameters that are updated as the model # trains. Other items that you may want to save are the epoch you left off # on, the latest recorded training loss, external ``torch.nn.Embedding`` @@ -503,7 +503,7 @@ # # When loading a model on a GPU that was trained and saved on CPU, set the # ``map_location`` argument in the ``torch.load()`` function to -# *cuda:device_id*. This loads the model to a given GPU device. Next, be +# ``cuda:device_id``. This loads the model to a given GPU device. Next, be # sure to call ``model.to(torch.device('cuda'))`` to convert the model’s # parameter tensors to CUDA tensors. Finally, be sure to use the # ``.to(torch.device('cuda'))`` function on all model inputs to prepare diff --git a/beginner_source/t5_tutorial.py b/beginner_source/t5_tutorial.py index 6b4742170d9..8f77cd278ea 100644 --- a/beginner_source/t5_tutorial.py +++ b/beginner_source/t5_tutorial.py @@ -2,7 +2,7 @@ T5-Base Model for Summarization, Sentiment Classification, and Translation ========================================================================== -**Author**: `Pendo Abbo `__, `Joe Cummings `__ +**Authors**: `Pendo Abbo `__, `Joe Cummings `__ """ @@ -10,12 +10,12 @@ # Overview # -------- # -# This tutorial demonstrates how to use a pre-trained T5 Model for summarization, sentiment classification, and +# This tutorial demonstrates how to use a pretrained T5 Model for summarization, sentiment classification, and # translation tasks. We will demonstrate how to use the torchtext library to: # -# 1. Build a text pre-processing pipeline for a T5 model -# 2. Instantiate a pre-trained T5 model with base configuration -# 3. Read in the CNNDM, IMDB, and Multi30k datasets and pre-process their texts in preparation for the model +# 1. Build a text preprocessing pipeline for a T5 model +# 2. Instantiate a pretrained T5 model with base configuration +# 3. Read in the CNNDM, IMDB, and Multi30k datasets and preprocess their texts in preparation for the model # 4. Perform text summarization, sentiment classification, and translation # # .. note:: @@ -33,8 +33,8 @@ # 3. Truncate the sequences to a specified maximum length # 4. Add end-of-sequence (EOS) and padding token IDs # -# T5 uses a SentencePiece model for text tokenization. Below, we use a pre-trained SentencePiece model to build -# the text pre-processing pipeline using torchtext's T5Transform. Note that the transform supports both +# T5 uses a ``SentencePiece`` model for text tokenization. Below, we use a pretrained ``SentencePiece`` model to build +# the text preprocessing pipeline using torchtext's T5Transform. Note that the transform supports both # batched and non-batched text input (for example, one can either pass a single sentence or a list of sentences), however the T5 model expects the input to be batched. 
# @@ -53,7 +53,7 @@ ) ####################################################################### -# Alternatively, we can also use the transform shipped with the pre-trained models that does all of the above out-of-the-box +# Alternatively, we can also use the transform shipped with the pretrained models that does all of the above out-of-the-box # # .. code-block:: # @@ -66,9 +66,9 @@ # Model Preparation # ----------------- # -# torchtext provides SOTA pre-trained models that can be used directly for NLP tasks or fine-tuned on downstream tasks. Below -# we use the pre-trained T5 model with standard base configuration to perform text summarization, sentiment classification, and -# translation. For additional details on available pre-trained models, see `the torchtext documentation `__ +# torchtext provides SOTA pretrained models that can be used directly for NLP tasks or fine-tuned on downstream tasks. Below +# we use the pretrained T5 model with standard base configuration to perform text summarization, sentiment classification, and +# translation. For additional details on available pretrained models, see `the torchtext documentation `__ # # from torchtext.models import T5_BASE_GENERATION @@ -81,8 +81,8 @@ ####################################################################### -# GenerationUtils -# ------------------ +# Using ``GenerationUtils`` +# ------------------------- # # We can use torchtext's ``GenerationUtils`` to produce an output sequence based on the input sequence provided. This calls on the # model's encoder and decoder, and iteratively expands the decoded sequences until the end-of-sequence token is generated @@ -103,8 +103,8 @@ # datapipes and hence support standard flow-control and mapping/transformation using user defined # functions and transforms. # -# Below we demonstrate how to pre-process the CNNDM dataset to include the prefix necessary for the -# model to indentify the task it is performing. The CNNDM dataset has a train, validation, and test +# Below we demonstrate how to preprocess the CNNDM dataset to include the prefix necessary for the +# model to identify the task it is performing. The CNNDM dataset has a train, validation, and test # split. Below we demo on the test split. # # The T5 model uses the prefix "summarize" for text summarization. For more information on task @@ -160,7 +160,7 @@ def apply_prefix(task, x): # This dataset has a train and test split. Below we demo on the test split. # # The T5 model was trained on the SST2 dataset (also available in torchtext) for sentiment classification using the -# prefix "sst2 sentence". Therefore, we will use this prefix to perform sentiment classification on the IMDB dataset. +# prefix ``sst2 sentence``. Therefore, we will use this prefix to perform sentiment classification on the IMDB dataset. # from torchtext.datasets import IMDB diff --git a/beginner_source/template_tutorial.py b/beginner_source/template_tutorial.py index 7e3e0220f0e..520bd40eb03 100644 --- a/beginner_source/template_tutorial.py +++ b/beginner_source/template_tutorial.py @@ -8,13 +8,13 @@ .. grid:: 2 - .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn + .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn * Item 1 * Item 2 * Item 3 - .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites + .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites * PyTorch v2.0.0 * GPU ??? @@ -35,18 +35,18 @@ respects regular expressions. 
For example to run only ``neural_style_transfer_tutorial.py``, use the following command: - + .. code-block:: sh - + GALLERY_PATTERN="neural_style_transfer_tutorial.py" make html - + or - + .. code-block:: sh - + GALLERY_PATTERN="neural_style_transfer_tutorial.py" sphinx-build . _build - -* Make a copy of this repo and add only your + +* Make a copy of this repository and add only your tutorial to the `beginner_source` directory removing all other tutorials. Then run ``make html``. diff --git a/beginner_source/text_sentiment_ngrams_tutorial.py b/beginner_source/text_sentiment_ngrams_tutorial.py index 1dd7466344c..9036cdd7214 100644 --- a/beginner_source/text_sentiment_ngrams_tutorial.py +++ b/beginner_source/text_sentiment_ngrams_tutorial.py @@ -1,6 +1,6 @@ """ Text classification with the torchtext library -================================== +============================================== In this tutorial, we will show how to use the torchtext library to build the dataset for the text classification analysis. Users will have the flexibility to @@ -133,7 +133,7 @@ def collate_batch(batch): # Define the model # ---------------- # -# The model is composed of the `nn.EmbeddingBag `__ layer plus a linear layer for the classification purpose. ``nn.EmbeddingBag`` with the default mode of "mean" computes the mean value of a “bag” of embeddings. Although the text entries here have different lengths, nn.EmbeddingBag module requires no padding here since the text lengths are saved in offsets. +# The model is composed of the `nn.EmbeddingBag `__ layer plus a linear layer for the classification purpose. ``nn.EmbeddingBag`` with the default mode of "mean" computes the mean value of a “bag” of embeddings. Although the text entries here have different lengths, ``nn.EmbeddingBag`` module requires no padding here since the text lengths are saved in offsets. # # Additionally, since ``nn.EmbeddingBag`` accumulates the average across # the embeddings on the fly, ``nn.EmbeddingBag`` can enhance the diff --git a/beginner_source/transfer_learning_tutorial.py b/beginner_source/transfer_learning_tutorial.py index 15843ec074e..b4460bb4fb2 100644 --- a/beginner_source/transfer_learning_tutorial.py +++ b/beginner_source/transfer_learning_tutorial.py @@ -20,7 +20,7 @@ These two major transfer learning scenarios look as follows: -- **Finetuning the convnet**: Instead of random initialization, we +- **Finetuning the ConvNet**: Instead of random initialization, we initialize the network with a pretrained network, like the one that is trained on imagenet 1000 dataset. Rest of the training looks as usual. @@ -108,7 +108,7 @@ # augmentations. def imshow(inp, title=None): - """Imshow for Tensor.""" + """Display image for Tensor.""" inp = inp.numpy().transpose((1, 2, 0)) mean = np.array([0.485, 0.456, 0.406]) std = np.array([0.229, 0.224, 0.225]) @@ -244,7 +244,7 @@ def visualize_model(model, num_images=6): model.train(mode=was_training) ###################################################################### -# Finetuning the convnet +# Finetuning the ConvNet # ---------------------- # # Load a pretrained model and reset final fully connected layer. @@ -253,7 +253,7 @@ def visualize_model(model, num_images=6): model_ft = models.resnet18(weights='IMAGENET1K_V1') num_ftrs = model_ft.fc.in_features # Here the size of each output sample is set to 2. -# Alternatively, it can be generalized to nn.Linear(num_ftrs, len(class_names)). +# Alternatively, it can be generalized to ``nn.Linear(num_ftrs, len(class_names))``. 
model_ft.fc = nn.Linear(num_ftrs, 2) model_ft = model_ft.to(device) diff --git a/beginner_source/transformer_tutorial.py b/beginner_source/transformer_tutorial.py index 1917b56ee8a..fab8e3a9a59 100644 --- a/beginner_source/transformer_tutorial.py +++ b/beginner_source/transformer_tutorial.py @@ -1,5 +1,5 @@ """ -Language Modeling with nn.Transformer and TorchText +Language Modeling with ``nn.Transformer`` and torchtext =============================================================== This is a tutorial on training a sequence-to-sequence model that uses the @@ -78,12 +78,12 @@ def init_weights(self) -> None: def forward(self, src: Tensor, src_mask: Tensor) -> Tensor: """ - Args: - src: Tensor, shape [seq_len, batch_size] - src_mask: Tensor, shape [seq_len, seq_len] + Arguments: + src: Tensor, shape ``[seq_len, batch_size]`` + src_mask: Tensor, shape ``[seq_len, seq_len]`` Returns: - output Tensor of shape [seq_len, batch_size, ntoken] + output Tensor of shape ``[seq_len, batch_size, ntoken]`` """ src = self.encoder(src) * math.sqrt(self.d_model) src = self.pos_encoder(src) @@ -93,7 +93,7 @@ def forward(self, src: Tensor, src_mask: Tensor) -> Tensor: def generate_square_subsequent_mask(sz: int) -> Tensor: - """Generates an upper-triangular matrix of -inf, with zeros on diag.""" + """Generates an upper-triangular matrix of ``-inf``, with zeros on ``diag``.""" return torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1) @@ -120,8 +120,8 @@ def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000): def forward(self, x: Tensor) -> Tensor: """ - Args: - x: Tensor, shape [seq_len, batch_size, embedding_dim] + Arguments: + x: Tensor, shape ``[seq_len, batch_size, embedding_dim]`` """ x = x + self.pe[:x.size(0)] return self.dropout(x) @@ -182,7 +182,7 @@ def data_process(raw_text_iter: dataset.IterableDataset) -> Tensor: data = [torch.tensor(vocab(tokenizer(item)), dtype=torch.long) for item in raw_text_iter] return torch.cat(tuple(filter(lambda t: t.numel() > 0, data))) -# train_iter was "consumed" by the process of building the vocab, +# ``train_iter`` was "consumed" by the process of building the vocab, # so we have to create it again train_iter, val_iter, test_iter = WikiText2() train_data = data_process(train_iter) @@ -192,15 +192,15 @@ def data_process(raw_text_iter: dataset.IterableDataset) -> Tensor: device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') def batchify(data: Tensor, bsz: int) -> Tensor: - """Divides the data into bsz separate sequences, removing extra elements + """Divides the data into ``bsz`` separate sequences, removing extra elements that wouldn't cleanly fit. 
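    For example (a small illustrative sketch; the final reshape below is one way
    to obtain the documented ``[N // bsz, bsz]`` shape):

    .. code-block:: python

        import torch

        data = torch.arange(7)           # tensor([0, 1, 2, 3, 4, 5, 6])
        bsz = 2
        seq_len = data.size(0) // bsz    # 3
        data = data[:seq_len * bsz]      # drop the element that doesn't fit
        batched = data.view(bsz, seq_len).t().contiguous()
        # batched is tensor([[0, 3],
        #                    [1, 4],
        #                    [2, 5]]), i.e. shape [3, 2] == [N // bsz, bsz]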
- Args: + Arguments: data: Tensor, shape [N] bsz: int, batch size Returns: - Tensor of shape [N // bsz, bsz] + Tensor of shape ``[N // bsz, bsz]`` """ seq_len = data.size(0) // bsz data = data[:seq_len * bsz] @@ -209,7 +209,7 @@ def batchify(data: Tensor, bsz: int) -> Tensor: batch_size = 20 eval_batch_size = 10 -train_data = batchify(train_data, batch_size) # shape [seq_len, batch_size] +train_data = batchify(train_data, batch_size) # shape ``[seq_len, batch_size]`` val_data = batchify(val_data, eval_batch_size) test_data = batchify(test_data, eval_batch_size) @@ -238,12 +238,12 @@ def batchify(data: Tensor, bsz: int) -> Tensor: def get_batch(source: Tensor, i: int) -> Tuple[Tensor, Tensor]: """ Args: - source: Tensor, shape [full_seq_len, batch_size] + source: Tensor, shape ``[full_seq_len, batch_size]`` i: int Returns: - tuple (data, target), where data has shape [seq_len, batch_size] and - target has shape [seq_len * batch_size] + tuple (data, target), where data has shape ``[seq_len, batch_size]`` and + target has shape ``[seq_len * batch_size]`` """ seq_len = min(bptt, len(source) - 1 - i) data = source[i:i+seq_len] @@ -258,15 +258,15 @@ def get_batch(source: Tensor, i: int) -> Tuple[Tensor, Tensor]: ###################################################################### -# The model hyperparameters are defined below. The vocab size is +# The model hyperparameters are defined below. The ``vocab`` size is # equal to the length of the vocab object. # ntokens = len(vocab) # size of vocabulary emsize = 200 # embedding dimension -d_hid = 200 # dimension of the feedforward network model in nn.TransformerEncoder -nlayers = 2 # number of nn.TransformerEncoderLayer in nn.TransformerEncoder -nhead = 2 # number of heads in nn.MultiheadAttention +d_hid = 200 # dimension of the feedforward network model in ``nn.TransformerEncoder`` +nlayers = 2 # number of ``nn.TransformerEncoderLayer`` in ``nn.TransformerEncoder`` +nhead = 2 # number of heads in ``nn.MultiheadAttention`` dropout = 0.2 # dropout probability model = TransformerModel(ntokens, emsize, nhead, d_hid, nlayers, dropout).to(device) diff --git a/beginner_source/translation_transformer.py b/beginner_source/translation_transformer.py index 6e2538d1599..c5553246e38 100644 --- a/beginner_source/translation_transformer.py +++ b/beginner_source/translation_transformer.py @@ -1,6 +1,6 @@ """ -Language Translation with nn.Transformer and torchtext -====================================================== +Language Translation with ``nn.Transformer`` and torchtext +========================================================== This tutorial shows: - How to train a translation model from scratch using Transformer. @@ -40,12 +40,16 @@ token_transform = {} vocab_transform = {} - +################################################################################### # Create source and target language tokenizer. Make sure to install the dependencies. -# pip install -U torchdata -# pip install -U spacy -# python -m spacy download en_core_web_sm -# python -m spacy download de_core_news_sm +# +# .. 
code-block:: python +# +# pip install -U torchdata +# pip install -U spacy +# python -m spacy download en_core_web_sm +# python -m spacy download de_core_news_sm + token_transform[SRC_LANGUAGE] = get_tokenizer('spacy', language='de_core_news_sm') token_transform[TGT_LANGUAGE] = get_tokenizer('spacy', language='en_core_web_sm') @@ -71,8 +75,8 @@ def yield_tokens(data_iter: Iterable, language: str) -> List[str]: specials=special_symbols, special_first=True) -# Set UNK_IDX as the default index. This index is returned when the token is not found. -# If not set, it throws RuntimeError when the queried token is not found in the Vocabulary. +# Set ``UNK_IDX`` as the default index. This index is returned when the token is not found. +# If not set, it throws ``RuntimeError`` when the queried token is not found in the Vocabulary. for ln in [SRC_LANGUAGE, TGT_LANGUAGE]: vocab_transform[ln].set_default_index(UNK_IDX) @@ -89,7 +93,7 @@ def yield_tokens(data_iter: Iterable, language: str) -> List[str]: # encodings to provide position information of input tokens to the model. The second part is the # actual `Transformer `__ model. # Finally, the output of the Transformer model is passed through linear layer -# that gives un-normalized probabilities for each token in the target language. +# that gives unnormalized probabilities for each token in the target language. # @@ -205,7 +209,7 @@ def create_mask(src, tgt): ###################################################################### # Let's now define the parameters of our model and instantiate the same. Below, we also -# define our loss function which is the cross-entropy loss and the optmizer used for training. +# define our loss function which is the cross-entropy loss and the optimizer used for training. # torch.manual_seed(0) @@ -258,7 +262,7 @@ def tensor_transform(token_ids: List[int]): torch.tensor(token_ids), torch.tensor([EOS_IDX]))) -# src and tgt language text transforms to convert raw strings into tensors indices +# ``src`` and ``tgt`` language text transforms to convert raw strings into tensors indices text_transform = {} for ln in [SRC_LANGUAGE, TGT_LANGUAGE]: text_transform[ln] = sequential_transforms(token_transform[ln], #Tokenization diff --git a/beginner_source/vt_tutorial.py b/beginner_source/vt_tutorial.py index f3b649f827e..1b0a93b8b4b 100644 --- a/beginner_source/vt_tutorial.py +++ b/beginner_source/vt_tutorial.py @@ -30,7 +30,7 @@ # Convolutional Neural Networks (CNNs) have been the main models for image # classification since deep learning took off in 2012, but CNNs typically # require hundreds of millions of images for training to achieve the -# SOTAresults. DeiT is a vision transformer model that requires a lot less +# SOTA results. DeiT is a vision transformer model that requires a lot less # data and computing resources for training to compete with the leading # CNNs in performing image classification, which is made possible by two # key components of of DeiT: @@ -50,16 +50,20 @@ # Classifying Images with DeiT # ------------------------------- # -# Follow the README at the DeiT repo for detailed information on how to +# Follow the ``README.md`` at the DeiT repository for detailed information on how to # classify images using DeiT, or for a quick test, first install the -# required packages: - -# pip install torch torchvision timm pandas requests +# required packages: +# +# .. 
code-block:: python +# +# pip install torch torchvision timm pandas requests ####################################################### -# To run in Google Colab, uncomment the following line: - -# !pip install timm pandas requests +# To run in Google Colab, install dependencies by running the following command: +# +# .. code-block:: python +# +# !pip install timm pandas requests ############################# # then run the script below: @@ -94,8 +98,8 @@ ###################################################################### # The output should be 269, which, according to the ImageNet list of class -# index to `labels file `_, maps to ‘timber -# wolf, grey wolf, gray wolf, Canis lupus’. +# index to `labels file `_, maps to ``timber +# wolf, grey wolf, gray wolf, Canis lupus``. # # Now that we have verified that we can use the DeiT model to classify # images, let’s see how to modify the model so it can run on iOS and @@ -120,7 +124,7 @@ ###################################################################### -# The scripted model file fbdeit_scripted.pt of size about 346MB is +# The scripted model file ``fbdeit_scripted.pt`` of size about 346MB is # generated. # @@ -138,8 +142,8 @@ # Now run the code below: # -# Use 'x86' for server inference (the old 'fbgemm' is still available but 'x86' is the recommended default) and 'qnnpack' for mobile inference. -backend = "x86" # replaced with qnnpack causing much worse inference speed for quantized model on this notebook +# Use 'x86' for server inference (the old 'fbgemm' is still available but 'x86' is the recommended default) and ``qnnpack`` for mobile inference. +backend = "x86" # replaced with ``qnnpack`` causing much worse inference speed for quantized model on this notebook model.qconfig = torch.quantization.get_default_qconfig(backend) torch.backends.quantized.engine = backend @@ -150,7 +154,7 @@ ###################################################################### # This generates the scripted and quantized version of the model -# fbdeit_quantized_scripted.pt, with size about 89MB, a 74% reduction of +# ``fbdeit_quantized_scripted.pt``, with size about 89MB, a 74% reduction of # the non-quantized model size of 346MB! # @@ -177,7 +181,7 @@ ###################################################################### -# The generated fbdeit_optimized_scripted_quantized.pt file has about the +# The generated ``fbdeit_optimized_scripted_quantized.pt`` file has about the # same size as the quantized, scripted, but non-optimized model. The # inference result remains the same. 
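#
# For reference, a hedged sketch of the quantize-script-optimize sequence that
# produces the files named above (the exact calls in the full tutorial may differ;
# ``model`` is the DeiT model loaded earlier in this tutorial):
#
# .. code-block:: python
#
#    import torch
#    from torch.utils.mobile_optimizer import optimize_for_mobile
#
#    # dynamically quantize the Linear layers, then convert to TorchScript
#    quantized_model = torch.quantization.quantize_dynamic(
#        model, qconfig_spec={torch.nn.Linear}, dtype=torch.qint8)
#    scripted_quantized_model = torch.jit.script(quantized_model)
#    scripted_quantized_model.save("fbdeit_quantized_scripted.pt")
#
#    # optimize the scripted, quantized model for mobile and save it
#    optimized_scripted_quantized_model = optimize_for_mobile(scripted_quantized_model)
#    optimized_scripted_quantized_model.save("fbdeit_optimized_scripted_quantized.pt")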
# diff --git a/en-wordlist.txt b/en-wordlist.txt new file mode 100644 index 00000000000..c1447668122 --- /dev/null +++ b/en-wordlist.txt @@ -0,0 +1,209 @@ +APIs +Args +Autograd +BCE +BOS +Bahdanau +BatchNorm +CHW +CIFAR +CLS +CNNDM +CNNs +CUDA +Chatbots +Colab +Conv +ConvNet +DCGAN +DCGANs +DataLoaders +DeiT +EOS +FGSM +FLAVA +FloydHub +FloydHub's +GAN +GANs +GPUs +GRU +GRUs +Goodfellow +Goodfellow’s +GreedySearchDecoder +Hugging Face +IMDB +ImageNet +Initializations +Iteratively +JSON +Kuei +LSTM +LeNet +LeakyReLU +LeakyReLUs +Luong +MLP +MNIST +Mypy +NLP +NaN +NeurIPS +NumPy +Numericalization +Numpy's +Profiler +PyTorch's +RGB +RNN +RNNs +Radford +ReLU +SST2 +Sigmoid +SoTA +TensorBoard +TextVQA +Tokenization +TorchMultimodal +TorchScript +Unescape +VQA +Wikitext +accuracies +activations +adversarially +al +backend +backprop +backpropagated +backpropagates +backpropagation +batchnorm's +benchmarking +boolean +broadcasted +chatbot +chatbot's +checkpointing +composable +concat +contrastive +conv +convolutional +cpu +csv +datafile +dataloader +dataloaders +datapipes +dataset +datasets +dataset’s +deserialize +deserialized +dir +downsample +embeddings +encodings +eq +et +evaluateInput +fastai +fbgemm +feedforward +finetune +finetuning +helpdesk +helpdesks +hyperparameter +hyperparameters +imagenet +io +iterable +iteratively +jit +jpg +labelled +learnable +loadFilename +manualSeed +matplotlib +minibatch +minibatches +minimax +misclassification +misclassified +modularity +modularized +multimodal +multimodality +multithreaded +namespace +natively +ndarrays +num +numericalize +numpy +optimizable +optimizer's +optimizers +overfitting +parallelizable +parallelization +perceptibility +prepend +preprocess +preprocessing +pretrained +prewritten +profiler +profilers +pytorch +quantized +quantizing +randint +readably +reinitializes +relu +reproducibility +rescale +rewinded +runtime +runtime +runtimes +softmax +src +stacktrace +stateful +storages +strided +subclasses +subdirectories +submodule +summarization +tanh +th +thresholding +tokenization +tokenize +tokenizer +torchaudio +torchdata +torchtext +torchtext's +torchvision +traceback +tradeoff +uncomment +uncommented +unimodal +unnormalized +unpickling +utils +vectorized +voc +walkthrough +warmstart +warmstarting diff --git a/tutorials-wordlist.txt b/tutorials-wordlist.txt deleted file mode 100644 index 822e2fb2525..00000000000 --- a/tutorials-wordlist.txt +++ /dev/null @@ -1,23 +0,0 @@ -csv -DataLoaders -dataloader -dataset -datasets -dir -imagenet -io -jpg -ndarrays -Numpy's -numpy -preprocess -preprocessing -pytorch -rescale -runtime -th -subclasses -submodule -tanh -torchvision -uncomment