diff --git a/advanced_source/neural_style_tutorial.py b/advanced_source/neural_style_tutorial.py index a293026cb5b..b4ab10ef01d 100644 --- a/advanced_source/neural_style_tutorial.py +++ b/advanced_source/neural_style_tutorial.py @@ -87,7 +87,7 @@ # to 255 tensor images. # # -# .. Note:: +# .. note:: # Here are links to download the images required to run the tutorial: # `picasso.jpg `__ and # `dancing.jpg `__. @@ -183,7 +183,7 @@ def forward(self, input): return input ###################################################################### -# .. Note:: +# .. note:: # **Important detail**: although this module is named ``ContentLoss``, it # is not a true PyTorch Loss function. If you want to define your content # loss as a PyTorch Loss function, you have to create a PyTorch autograd function @@ -372,7 +372,7 @@ def get_style_model_and_losses(cnn, normalization_mean, normalization_std, input_img = content_img.clone() # if you want to use white noise by using the following code: # -# :: +# .. code-block:: python # # input_img = torch.randn(content_img.data.size()) diff --git a/beginner_source/blitz/neural_networks_tutorial.py b/beginner_source/blitz/neural_networks_tutorial.py index 3b3c95fd229..dc5bcd32217 100644 --- a/beginner_source/blitz/neural_networks_tutorial.py +++ b/beginner_source/blitz/neural_networks_tutorial.py @@ -161,7 +161,7 @@ def forward(self, x): # ``.grad_fn`` attribute, you will see a graph of computations that looks # like this: # -# :: +# .. code-block:: sh # # input -> conv2d -> relu -> maxpool2d -> conv2d -> relu -> maxpool2d # -> flatten -> linear -> relu -> linear -> relu -> linear @@ -253,7 +253,7 @@ def forward(self, x): ############################################################### -# .. Note:: +# .. note:: # # Observe how gradient buffers had to be manually set to zero using # ``optimizer.zero_grad()``. This is because gradients are accumulated diff --git a/beginner_source/data_loading_tutorial.py b/beginner_source/data_loading_tutorial.py index 7c1c3487cb6..ab9de0d7d73 100644 --- a/beginner_source/data_loading_tutorial.py +++ b/beginner_source/data_loading_tutorial.py @@ -50,9 +50,9 @@ # estimation `__ # on a few images from imagenet tagged as 'face'. # -# Dataset comes with a csv file with annotations which looks like this: +# Dataset comes with a ``.csv`` file with annotations which looks like this: # -# :: +# .. code-block:: sh # # image_name,part_0_x,part_0_y,part_1_x,part_1_y,part_2_x, ... ,part_67_x,part_67_y # 0805personali01.jpg,27,83,27,98, ... 84,134 @@ -196,7 +196,7 @@ def __getitem__(self, idx): # called. For this, we just need to implement ``__call__`` method and # if required, ``__init__`` method. We can then use a transform like this: # -# :: +# .. code-block:: python # # tsfm = Transform(params) # transformed_sample = tsfm(sample) @@ -421,7 +421,9 @@ def show_landmarks_batch(sample_batched): # and dataloader. ``torchvision`` package provides some common datasets and # transforms. You might not even have to write custom classes. One of the # more generic datasets available in torchvision is ``ImageFolder``. -# It assumes that images are organized in the following way: :: +# It assumes that images are organized in the following way: +# +# .. code-block:: sh # # root/ants/xxx.png # root/ants/xxy.jpeg @@ -435,7 +437,9 @@ def show_landmarks_batch(sample_batched): # # where 'ants', 'bees' etc. are class labels. Similarly generic transforms # which operate on ``PIL.Image`` like ``RandomHorizontalFlip``, ``Scale``, -# are also available. 
You can use these to write a dataloader like this: :: +# are also available. You can use these to write a dataloader like this: +# +# .. code-block:: python # # import torch # from torchvision import transforms, datasets diff --git a/beginner_source/dcgan_faces_tutorial.py b/beginner_source/dcgan_faces_tutorial.py index b1097b658d9..e9ac3fdd504 100644 --- a/beginner_source/dcgan_faces_tutorial.py +++ b/beginner_source/dcgan_faces_tutorial.py @@ -226,7 +226,7 @@ # the ``celeba`` directory you just created. The resulting directory # structure should be: # -# :: +# .. code-block:: sh # # /path/to/celeba # -> img_align_celeba diff --git a/beginner_source/hyperparameter_tuning_tutorial.py b/beginner_source/hyperparameter_tuning_tutorial.py index 228879fa5f2..e9141b45472 100644 --- a/beginner_source/hyperparameter_tuning_tutorial.py +++ b/beginner_source/hyperparameter_tuning_tutorial.py @@ -462,7 +462,7 @@ def main(num_samples=10, max_num_epochs=10, gpus_per_trial=2): ###################################################################### # If you run the code, an example output could look like this: # -# :: +# .. code-block:: sh # # Number of trials: 10/10 (10 TERMINATED) # +-----+--------------+------+------+-------------+--------+---------+------------+ diff --git a/beginner_source/introyt/autogradyt_tutorial.py b/beginner_source/introyt/autogradyt_tutorial.py index a2ed238e52b..f5bc50c3b91 100644 --- a/beginner_source/introyt/autogradyt_tutorial.py +++ b/beginner_source/introyt/autogradyt_tutorial.py @@ -213,7 +213,7 @@ ######################################################################### # Recall the computation steps we took to get here: # -# :: +# .. code-block:: python # # a = torch.linspace(0., 2. * math.pi, steps=25, requires_grad=True) # b = torch.sin(a) @@ -456,10 +456,10 @@ def add_tensors2(x, y): # .. note:: # The following code cell throws a runtime error. This is expected. # -# :: +# .. code-block:: python # -# a = torch.linspace(0., 2. * math.pi, steps=25, requires_grad=True) -# torch.sin_(a) +# a = torch.linspace(0., 2. * math.pi, steps=25, requires_grad=True) +# torch.sin_(a) # ######################################################################### diff --git a/beginner_source/introyt/captumyt.py b/beginner_source/introyt/captumyt.py index cf63b6109b6..abf2391d254 100644 --- a/beginner_source/introyt/captumyt.py +++ b/beginner_source/introyt/captumyt.py @@ -109,11 +109,15 @@ To install Captum in an Anaconda or pip virtual environment, use the appropriate command for your environment below: -With ``conda``:: +With ``conda``: + +.. code-block:: sh conda install pytorch torchvision captum flask-compress matplotlib=3.3.4 -c pytorch -With ``pip``:: +With ``pip``: + +.. code-block:: sh pip install torch torchvision captum matplotlib==3.3.4 Flask-Compress diff --git a/beginner_source/introyt/introyt1_tutorial.py b/beginner_source/introyt/introyt1_tutorial.py index a5d65bcab16..74675070708 100644 --- a/beginner_source/introyt/introyt1_tutorial.py +++ b/beginner_source/introyt/introyt1_tutorial.py @@ -580,7 +580,7 @@ def forward(self, x): # # **When you run the cell above,** you should see something like this: # -# :: +# ..
code-block:: sh # # [1, 2000] loss: 2.235 # [1, 4000] loss: 1.940 diff --git a/beginner_source/introyt/tensorboardyt_tutorial.py b/beginner_source/introyt/tensorboardyt_tutorial.py index 146747410ab..fe4992e4916 100644 --- a/beginner_source/introyt/tensorboardyt_tutorial.py +++ b/beginner_source/introyt/tensorboardyt_tutorial.py @@ -24,12 +24,16 @@ To run this tutorial, you’ll need to install PyTorch, TorchVision, Matplotlib, and TensorBoard. -With ``conda``:: +With ``conda``: + +.. code-block:: sh conda install pytorch torchvision -c pytorch conda install matplotlib tensorboard -With ``pip``:: +With ``pip``: + +.. code-block:: sh pip install torch torchvision matplotlib tensorboard diff --git a/beginner_source/introyt/tensors_deeper_tutorial.py b/beginner_source/introyt/tensors_deeper_tutorial.py index 6ef9d805622..3ff76258b57 100644 --- a/beginner_source/introyt/tensors_deeper_tutorial.py +++ b/beginner_source/introyt/tensors_deeper_tutorial.py @@ -292,14 +292,14 @@ # binary operation on tensors if dissimilar shape? # # .. note:: -# The following cell throws a run-time error. This is intentional. +# The following cell throws a run-time error. This is intentional. # -# :: +# .. code-block:: sh # -# a = torch.rand(2, 3) -# b = torch.rand(3, 2) +# a = torch.rand(2, 3) +# b = torch.rand(3, 2) # -# print(a * b) +# print(a * b) # @@ -390,17 +390,17 @@ # Here are some examples of attempts at broadcasting that will fail: # # .. note:: -# The following cell throws a run-time error. This is intentional. +# The following cell throws a run-time error. This is intentional. # -# :: +# .. code-block:: python # -# a = torch.ones(4, 3, 2) +# a = torch.ones(4, 3, 2) # -# b = a * torch.rand(4, 3) # dimensions must match last-to-first +# b = a * torch.rand(4, 3) # dimensions must match last-to-first # -# c = a * torch.rand( 2, 3) # both 3rd & 2nd dims different +# c = a * torch.rand( 2, 3) # both 3rd & 2nd dims different # -# d = a * torch.rand((0, )) # can't broadcast with an empty tensor +# d = a * torch.rand((0, )) # can't broadcast with an empty tensor # @@ -729,7 +729,7 @@ # following code will throw a runtime error, regardless of whether you # have a GPU device available: # -# :: +# .. code-block:: python # # x = torch.rand(2, 2) # y = torch.rand(2, 2, device='gpu') @@ -820,9 +820,9 @@ # Another place you might use ``unsqueeze()`` is to ease broadcasting. # Recall the example above where we had the following code: # -# :: +# .. code-block:: python # -# a = torch.ones(4, 3, 2) +# a = torch.ones(4, 3, 2) # # c = a * torch.rand( 3, 1) # 3rd dim = 1, 2nd dim identical to a # print(c) diff --git a/beginner_source/nn_tutorial.py b/beginner_source/nn_tutorial.py index ccb22555db0..b45200fd495 100644 --- a/beginner_source/nn_tutorial.py +++ b/beginner_source/nn_tutorial.py @@ -328,7 +328,7 @@ def forward(self, xb): # Previously for our training loop we had to update the values for each parameter # by name, and manually zero out the grads for each parameter separately, like this: # -# :: +# .. code-block:: python # # with torch.no_grad(): # weights -= weights.grad * lr @@ -342,7 +342,7 @@ def forward(self, xb): # and less prone to the error of forgetting some of our parameters, particularly # if we had a more complicated model: # -# :: +# .. code-block:: python # # with torch.no_grad(): # for p in model.parameters(): p -= p.grad * lr @@ -418,7 +418,7 @@ def forward(self, xb): # # This will let us replace our previous manually coded optimization step: # -# :: +# .. 
code-block:: python # # with torch.no_grad(): # for p in model.parameters(): p -= p.grad * lr @@ -426,7 +426,7 @@ def forward(self, xb): # # and instead use just: # -# :: +# .. code-block:: python # # opt.step() # opt.zero_grad() @@ -490,7 +490,7 @@ def get_model(): ############################################################################### # Previously, we had to iterate through minibatches of ``x`` and ``y`` values separately: # -# :: +# .. code-block:: python # # xb = x_train[start_i:end_i] # yb = y_train[start_i:end_i] @@ -498,7 +498,7 @@ def get_model(): # # Now, we can do these two steps together: # -# :: +# .. code-block:: python # # xb,yb = train_ds[i*bs : i*bs+bs] # @@ -534,7 +534,7 @@ def get_model(): ############################################################################### # Previously, our loop iterated over batches ``(xb, yb)`` like this: # -# :: +# .. code-block:: python # # for i in range((n-1)//bs + 1): # xb,yb = train_ds[i*bs : i*bs+bs] @@ -542,7 +542,7 @@ def get_model(): # # Now, our loop is much cleaner, as ``(xb, yb)`` are loaded automatically from the data loader: # -# :: +# .. code-block:: python # # for xb,yb in train_dl: # pred = model(xb) diff --git a/beginner_source/profiler.py b/beginner_source/profiler.py index ed0f173b154..b395edbaca6 100644 --- a/beginner_source/profiler.py +++ b/beginner_source/profiler.py @@ -82,7 +82,7 @@ def forward(self, input, mask): # ``profiler.profile`` context manager. The ``with_stack=True`` parameter appends the # file and line number of the operation in the trace. # -# .. WARNING:: +# .. warning:: # ``with_stack=True`` incurs an additional overhead, and is better suited for investigating code. # Remember to remove it if you are benchmarking performance. # @@ -115,7 +115,7 @@ def forward(self, input, mask): # `docs `__ for # valid sorting keys). # -# .. Note:: +# .. note:: # When running profiler in a notebook, you might see entries like ``(13): forward`` # instead of filenames in the stacktrace. These correspond to ``(line number): calling-function``. diff --git a/beginner_source/saving_loading_models.py b/beginner_source/saving_loading_models.py index d4b328156ce..3e100afd1e2 100644 --- a/beginner_source/saving_loading_models.py +++ b/beginner_source/saving_loading_models.py @@ -115,7 +115,7 @@ # # **Output:** # -# :: +# .. code-block:: sh # # Model's state_dict: # conv1.weight torch.Size([6, 3, 5, 5]) @@ -175,7 +175,7 @@ # normalization layers to evaluation mode before running inference. # Failing to do this will yield inconsistent inference results. # -# .. Note :: +# .. note:: # # Notice that the ``load_state_dict()`` function takes a dictionary # object, NOT a path to a saved object. This means that you must @@ -183,7 +183,7 @@ # ``load_state_dict()`` function. For example, you CANNOT load using # ``model.load_state_dict(PATH)``. # -# .. Note :: +# .. note:: # # If you only plan to keep the best performing model (according to the # acquired validation loss), don't forget that ``best_model_state = model.state_dict()`` diff --git a/beginner_source/text_sentiment_ngrams_tutorial.py b/beginner_source/text_sentiment_ngrams_tutorial.py index 021befdb972..9cc5d6c8671 100644 --- a/beginner_source/text_sentiment_ngrams_tutorial.py +++ b/beginner_source/text_sentiment_ngrams_tutorial.py @@ -37,7 +37,7 @@ train_iter = iter(AG_NEWS(split="train")) ###################################################################### -# :: +# .. 
code-block:: sh # # next(train_iter) # >>> (3, "Fears for T N pension after talks Unions representing workers at Turner @@ -88,7 +88,7 @@ def yield_tokens(data_iter): ###################################################################### # The vocabulary block converts a list of tokens into integers. # -# :: +# .. code-block:: sh # # vocab(['here', 'is', 'an', 'example']) # >>> [475, 21, 30, 5297] @@ -102,7 +102,7 @@ def yield_tokens(data_iter): ###################################################################### # The text pipeline converts a text string into a list of integers based on the lookup table defined in the vocabulary. The label pipeline converts the label into integers. For example, # -# :: +# .. code-block:: sh # # text_pipeline('here is the an example') # >>> [475, 21, 2, 30, 5297] @@ -188,7 +188,7 @@ def forward(self, text, offsets): # # The ``AG_NEWS`` dataset has four labels and therefore the number of classes is four. # -# :: +# .. code-block:: sh # # 1 : World # 2 : Sports diff --git a/beginner_source/vt_tutorial.py b/beginner_source/vt_tutorial.py index e612fe32a73..777098be946 100644 --- a/beginner_source/vt_tutorial.py +++ b/beginner_source/vt_tutorial.py @@ -241,7 +241,7 @@ ###################################################################### # The results running on a Google Colab are: # -# :: +# .. code-block:: sh # # original model: 1236.69ms # scripted model: 1226.72ms diff --git a/en-wordlist.txt b/en-wordlist.txt index 4a6538b04da..1ec9abb68de 100644 --- a/en-wordlist.txt +++ b/en-wordlist.txt @@ -325,6 +325,7 @@ functionalized functorch fuser geomean +globals grayscale hardcode helpdesk @@ -351,6 +352,7 @@ jacobians jit jitter jpg +json judgements keypoint kwargs @@ -423,6 +425,7 @@ quantized quantizing queryable randint +randn readably recomputation regressor diff --git a/intermediate_source/char_rnn_classification_tutorial.py b/intermediate_source/char_rnn_classification_tutorial.py index 0957b109b3a..7b641f9184d 100644 --- a/intermediate_source/char_rnn_classification_tutorial.py +++ b/intermediate_source/char_rnn_classification_tutorial.py @@ -22,7 +22,7 @@ of origin, and predict which language a name is from based on the spelling: -:: +.. code-block:: sh $ python predict.py Hinton (-0.47) Scottish @@ -60,7 +60,7 @@ Preparing the Data ================== -.. Note:: +.. note:: Download the data from `here `_ and extract it to the current directory. @@ -501,7 +501,7 @@ def predict(input_line, n_predictions=3): # # Run ``predict.py`` with a name to view predictions: # -# :: +# .. code-block:: sh # # $ python predict.py Hazaki # (-0.42) Japanese diff --git a/intermediate_source/char_rnn_generation_tutorial.py b/intermediate_source/char_rnn_generation_tutorial.py index 114c3f3f572..28e5eaf3832 100644 --- a/intermediate_source/char_rnn_generation_tutorial.py +++ b/intermediate_source/char_rnn_generation_tutorial.py @@ -9,7 +9,7 @@ we used a RNN to classify names into their language of origin. This time we'll turn around and generate names from languages. -:: +.. code-block:: sh > python sample.py Russian RUS Rovakov @@ -64,7 +64,7 @@ Preparing the Data ================== -.. Note:: +.. note:: Download the data from `here `_ and extract it to the current directory. @@ -370,7 +370,7 @@ def timeSince(since): # # - Return the final name # -# .. Note:: +# .. note:: # Rather than having to give it a starting letter, another # strategy would have been to include a "start of string" token in # training and have the network choose its own starting letter. 
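Every hunk in this patch applies the same two conventions: a bare ``::`` literal block becomes an explicit ``.. code-block:: <language>`` directive (``python`` for Python snippets, ``sh`` for shell commands and printed output), and directive names such as ``.. Note::`` are lowercased to ``.. note::``. A minimal sketch of the target style in a Sphinx-Gallery tutorial source follows; the file and snippet are illustrative only and are not part of this patch:

.. code-block:: python

   ######################################################################
   # Install the dependency first:
   #
   # .. code-block:: sh
   #
   #    pip install torch
   #
   # The cell below is highlighted as Python when rendered:
   #
   # .. code-block:: python
   #
   #    x = torch.ones(2, 3)
   #
   # .. note::
   #    Directive names are lowercase (``note``, not ``Note``).

   import torch

   print(torch.ones(2, 3))
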
diff --git a/intermediate_source/flask_rest_api_tutorial.py b/intermediate_source/flask_rest_api_tutorial.py index 0975ff93125..8b0162a9e84 100644 --- a/intermediate_source/flask_rest_api_tutorial.py +++ b/intermediate_source/flask_rest_api_tutorial.py @@ -32,7 +32,7 @@ # ``file`` parameter which contains the image. The response will be of JSON # response containing the prediction: # -# :: +# .. code-block:: sh # # {"class_id": "n02124075", "class_name": "Egyptian_cat"} # @@ -44,9 +44,9 @@ # # Install the required dependencies by running the following command: # -# :: +# .. code-block:: sh # -# $ pip install Flask==2.0.1 torchvision==0.10.0 +# pip install Flask==2.0.1 torchvision==0.10.0 ###################################################################### @@ -64,30 +64,6 @@ def hello(): return 'Hello World!' -############################################################################### -# Save the above snippet in a file called ``app.py`` and you can now run a -# Flask development server by typing: -# -# :: -# -# $ FLASK_ENV=development FLASK_APP=app.py flask run - -############################################################################### -# When you visit ``http://localhost:5000/`` in your web browser, you will be -# greeted with ``Hello World!`` text - -############################################################################### -# We will make slight changes to the above snippet, so that it suits our API -# definition. First, we will rename the method to ``predict``. We will update -# the endpoint path to ``/predict``. Since the image files will be sent via -# HTTP POST requests, we will update it so that it also accepts only POST -# requests: - - -@app.route('/predict', methods=['POST']) -def predict(): - return 'Hello World!' - ############################################################################### # We will also change the response type, so that it returns a JSON response # containing ImageNet class id and name. The updated ``app.py`` file will @@ -137,7 +113,6 @@ def transform_image(image_bytes): image = Image.open(io.BytesIO(image_bytes)) return my_transforms(image).unsqueeze(0) - ###################################################################### # The above method takes image data in bytes, applies the series of transforms # and returns a tensor. To test the above method, read an image file in @@ -173,7 +148,6 @@ def get_prediction(image_bytes): _, y_hat = outputs.max(1) return y_hat - ###################################################################### # The tensor ``y_hat`` will contain the index of the predicted class id. # However, we need a human readable class name. For that we need a class id @@ -217,18 +191,6 @@ def get_prediction(image_bytes): # The first item in array is ImageNet class id and second item is the human # readable name. # -# .. Note :: -# Did you notice that ``model`` variable is not part of ``get_prediction`` -# method? Or why is model a global variable? Loading a model can be an -# expensive operation in terms of memory and compute. If we loaded the model in the -# ``get_prediction`` method, then it would get unnecessarily loaded every -# time the method is called. Since, we are building a web server, there -# could be thousands of requests per second, we should not waste time -# redundantly loading the model for every inference. So, we keep the model -# loaded in memory just once. 
In -# production systems, it's necessary to be efficient about your use of -# compute to be able to serve requests at scale, so you should generally -# load your model before serving requests. ###################################################################### # Integrating the model in our API Server @@ -251,66 +213,68 @@ def get_prediction(image_bytes): # img_bytes = file.read() # class_id, class_name = get_prediction(image_bytes=img_bytes) # return jsonify({'class_id': class_id, 'class_name': class_name}) - +# +# ###################################################################### # The ``app.py`` file is now complete. Following is the full version; replace # the paths with the paths where you saved your files and it should run: # # .. code-block:: python # -# import io -# import json +# import io +# import json +# +# from torchvision import models +# import torchvision.transforms as transforms +# from PIL import Image +# from flask import Flask, jsonify, request # -# from torchvision import models -# import torchvision.transforms as transforms -# from PIL import Image -# from flask import Flask, jsonify, request # +# app = Flask(__name__) +# imagenet_class_index = json.load(open('/imagenet_class_index.json')) +# model = models.densenet121(weights='IMAGENET1K_V1') +# model.eval() # -# app = Flask(__name__) -# imagenet_class_index = json.load(open('/imagenet_class_index.json')) -# model = models.densenet121(weights='IMAGENET1K_V1') -# model.eval() # +# def transform_image(image_bytes): +# my_transforms = transforms.Compose([transforms.Resize(255), +# transforms.CenterCrop(224), +# transforms.ToTensor(), +# transforms.Normalize( +# [0.485, 0.456, 0.406], +# [0.229, 0.224, 0.225])]) +# image = Image.open(io.BytesIO(image_bytes)) +# return my_transforms(image).unsqueeze(0) # -# def transform_image(image_bytes): -# my_transforms = transforms.Compose([transforms.Resize(255), -# transforms.CenterCrop(224), -# transforms.ToTensor(), -# transforms.Normalize( -# [0.485, 0.456, 0.406], -# [0.229, 0.224, 0.225])]) -# image = Image.open(io.BytesIO(image_bytes)) -# return my_transforms(image).unsqueeze(0) # +# def get_prediction(image_bytes): +# tensor = transform_image(image_bytes=image_bytes) +# outputs = model.forward(tensor) +# _, y_hat = outputs.max(1) +# predicted_idx = str(y_hat.item()) +# return imagenet_class_index[predicted_idx] # -# def get_prediction(image_bytes): -# tensor = transform_image(image_bytes=image_bytes) -# outputs = model.forward(tensor) -# _, y_hat = outputs.max(1) -# predicted_idx = str(y_hat.item()) -# return imagenet_class_index[predicted_idx] +# +# @app.route('/predict', methods=['POST']) +# def predict(): +# if request.method == 'POST': +# file = request.files['file'] +# img_bytes = file.read() +# class_id, class_name = get_prediction(image_bytes=img_bytes) +# return jsonify({'class_id': class_id, 'class_name': class_name}) # # -# @app.route('/predict', methods=['POST']) -# def predict(): -# if request.method == 'POST': -# file = request.files['file'] -# img_bytes = file.read() -# class_id, class_name = get_prediction(image_bytes=img_bytes) -# return jsonify({'class_id': class_id, 'class_name': class_name}) +# if __name__ == '__main__': +# app.run() # # -# if __name__ == '__main__': -# app.run() - ###################################################################### # Let's test our web server! Run: # -# :: +# .. 
code-block:: sh +# +# FLASK_ENV=development FLASK_APP=app.py flask run # -# $ FLASK_ENV=development FLASK_APP=app.py flask run - ####################################################################### # We can use the # `requests `_ @@ -322,15 +286,15 @@ def get_prediction(image_bytes): # # resp = requests.post("http://localhost:5000/predict", # files={"file": open('/cat.jpg','rb')}) +# ####################################################################### # Printing `resp.json()` will now show the following: # -# :: +# .. code-block:: sh # # {"class_id": "n02124075", "class_name": "Egyptian_cat"} # - ###################################################################### # Next steps # -------------- @@ -368,3 +332,4 @@ def get_prediction(image_bytes): # # - Finally, we encourage you to check out our other tutorials on deploying PyTorch models # linked-to at the top of the page. +# diff --git a/intermediate_source/inductor_debug_cpu.py b/intermediate_source/inductor_debug_cpu.py index 061ef063856..94dee3ba158 100644 --- a/intermediate_source/inductor_debug_cpu.py +++ b/intermediate_source/inductor_debug_cpu.py @@ -98,6 +98,7 @@ def neg1(x): # Here are the main parts of code extracted from the files and we correlate the C++ generated line with the FX code line. # # ``fx_graph_runnable``: +# def forward1(self, arg0_1, arg1_1): neg = torch.ops.aten.neg.default(arg0_1); arg0_1 = None @@ -107,6 +108,7 @@ def forward1(self, arg0_1, arg1_1): ###################################################################### # C++ kernel in ``output_code``: +# from torch._inductor.codecache import AsyncCompile async_compile = AsyncCompile() @@ -162,7 +164,7 @@ def forward1(self, arg0_1, arg1_1): # # As we know, the evolved chain of graph-level optimization is like: # -# :: +# .. code-block:: sh # # torch.neg (Python) -> torch.ops.aten.neg.default (within FX graph) -> ops.neg (within IR node) -> tmp2 = -tmp1 (within C++ kernel) # @@ -228,7 +230,7 @@ def neg2(x): ###################################################################### # IR node: # -# :: +# .. code-block:: sh # # buf0: SchedulerNode(ComputedBuffer) # buf0.writes = [MemoryDep('buf0', c0, {c0: 67120})] @@ -254,6 +256,7 @@ def neg2(x): # get_index_2 = self.get_index('index0') # store = ops.store('buf0', get_index_2, maximum, None) # return store +# ###################################################################### # According to the traceback logging, the compilation error is caused by the data type inconsistency of ``max_propagate_nan``'s inputs. @@ -304,7 +307,7 @@ def neg3(x): ###################################################################### # An accuracy problem would be raised as follows: # -# :: +# .. code-block:: sh # # torch._dynamo.utils: [ERROR] Accuracy failed: allclose not within tol=0.0001 # Traceback (most recent call last): @@ -314,13 +317,13 @@ def neg3(x): # # To debug an accuracy problem with Minifier, two environment variables are needed: # -# :: +# .. code-block:: sh # # TORCHDYNAMO_REPRO_AFTER="aot" TORCHDYNAMO_REPRO_LEVEL=4 python xx.py # # Which gives us logging information that demonstrates the steps of minifying: # -# :: +# .. 
code-block:: sh # # Started off with 6 nodes # diff --git a/intermediate_source/scaled_dot_product_attention_tutorial.py b/intermediate_source/scaled_dot_product_attention_tutorial.py index 2bfeb46b56c..1d003293d56 100644 --- a/intermediate_source/scaled_dot_product_attention_tutorial.py +++ b/intermediate_source/scaled_dot_product_attention_tutorial.py @@ -303,7 +303,8 @@ def generate_rand_batch( print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10)) # For even more insights, you can export the trace and use ``chrome://tracing`` to view the results -# :: +# +# .. code-block:: python # # prof.export_chrome_trace("compiled_causal_attention_trace.json"). diff --git a/intermediate_source/seq2seq_translation_tutorial.py b/intermediate_source/seq2seq_translation_tutorial.py index 7e52f1eaea3..c9e360d7518 100755 --- a/intermediate_source/seq2seq_translation_tutorial.py +++ b/intermediate_source/seq2seq_translation_tutorial.py @@ -13,7 +13,7 @@ In this project we will be teaching a neural network to translate from French to English. -:: +.. code-block:: sh [KEY: > input, = target, < output] @@ -112,11 +112,11 @@ # download to ``data/eng-fra.txt`` before continuing. The file is a tab # separated list of translation pairs: # -# :: +# .. code-block:: sh # # I am cold. J'ai froid. # -# .. Note:: +# .. note:: # Download the data from # `here `_ # and extract it to the current directory. @@ -775,7 +775,7 @@ def evaluateRandomly(encoder, decoder, n=10): # single GRU layer. After about 40 minutes on a MacBook CPU we'll get some # reasonable results. # -# .. Note:: +# .. note:: # If you run this notebook you can train, interrupt the kernel, # evaluate, and continue training later. Comment out the lines where the # encoder and decoder are initialized and run ``trainIters`` again. diff --git a/intermediate_source/spatial_transformer_tutorial.py b/intermediate_source/spatial_transformer_tutorial.py index 49b6b0f0a2b..99efe41b39b 100644 --- a/intermediate_source/spatial_transformer_tutorial.py +++ b/intermediate_source/spatial_transformer_tutorial.py @@ -84,7 +84,7 @@ # # .. figure:: /_static/img/stn/stn-arch.png # -# .. Note:: +# .. note:: # We need the latest version of PyTorch that contains # affine_grid and grid_sample modules. # diff --git a/recipes_source/recipes/dynamic_quantization.py b/recipes_source/recipes/dynamic_quantization.py index 25fc9cd3982..eb9605d0c63 100644 --- a/recipes_source/recipes/dynamic_quantization.py +++ b/recipes_source/recipes/dynamic_quantization.py @@ -163,15 +163,8 @@ def forward(self,inputs,hidden): # # Now we get to the fun part. First we create an instance of the model # called ``float\_lstm`` then we are going to quantize it. We're going to use -# the -# -# :: -# -# torch.quantization.quantize_dynamic() -# -# function here (`see -# documentation `__) -# which takes the model, then a list of the submodules which we want to +# the `torch.quantization.quantize_dynamic `__ function, which takes the model, then a list of the submodules +# which we want to # have quantized if they appear, then the datatype we are targeting. This # function returns a quantized version of the original model as a new # module. diff --git a/recipes_source/recipes/profiler_recipe.py b/recipes_source/recipes/profiler_recipe.py index a88ea87feca..c9c8cef0c0c 100644 --- a/recipes_source/recipes/profiler_recipe.py +++ b/recipes_source/recipes/profiler_recipe.py @@ -16,7 +16,7 @@ ----- To install ``torch`` and ``torchvision`` use the following command: -:: +.. 
code-block:: sh pip install torch torchvision @@ -121,7 +121,8 @@ # aten::mean 332.000us 2.631ms 125.286us 21 # aten::select 1.668ms 2.292ms 8.988us 255 # --------------------------------- ------------ ------------ ------------ ------------ -# Self CPU time total: 57.549ms +# Self CPU time total: 57.549m +# ###################################################################### # Here we see that, as expected, most of the time is spent in convolution (and specifically in ``mkldnn_convolution`` @@ -138,7 +139,7 @@ ######################################################################################## # The output might look like this (omitting some columns): # -# :: +# .. code-block:: sh # # --------------------------------- ------------ ------------------------------------------- # Name CPU total Input Shapes @@ -155,6 +156,7 @@ # aten::conv2d 4.751ms [[5,256,14,14], [256,256,3,3], [], ..., []] # --------------------------------- ------------ ------------------------------------------- # Self CPU time total: 57.549ms +# ###################################################################### # Note the occurrence of ``aten::convolution`` twice with different input shapes. @@ -178,7 +180,7 @@ ###################################################################### # The resulting table output (omitting some columns): # -# :: +# .. code-block:: sh # # ------------------------------------------------------- ------------ ------------ # Name Self CUDA CUDA total @@ -196,6 +198,7 @@ # ------------------------------------------------------- ------------ ------------ # Self CPU time total: 23.015ms # Self CUDA time total: 11.666ms +# ###################################################################### # Note the occurrence of on-device kernels in the output (e.g. ``sgemm_32x32x32_NN``). @@ -241,7 +244,7 @@ ############################################################################# # The output might look like this (omitting some columns): # -# :: +# .. code-block:: sh # # --------------------------------- ------------ ------------ ------------ # Name CPU Mem Self CPU Mem # of Calls @@ -258,6 +261,7 @@ # aten::max_pool2d_with_indices 11.48 Mb 11.48 Mb 1 # --------------------------------- ------------ ------------ ------------ # Self CPU time total: 53.064ms +# ###################################################################### # 5. Using tracing functionality @@ -298,7 +302,7 @@ ################################################################################# # The output might look like this (omitting some columns): # -# :: +# .. code-block:: sh # # ------------------------- ----------------------------------------------------------- # Name Source Location @@ -316,6 +320,7 @@ # ------------------------- ----------------------------------------------------------- # Self CPU time total: 34.016ms # Self CUDA time total: 11.659ms +# ###################################################################### # Note the two convolutions and the two call sites in ``torchvision/models/resnet.py`` script. @@ -341,6 +346,7 @@ # git clone https://github.com/brendangregg/FlameGraph # cd FlameGraph # ./flamegraph.pl --title "CUDA time" --countname "us." 
/tmp/profiler_stacks.txt > perf_viz.svg +# ###################################################################### # diff --git a/recipes_source/recipes/save_load_across_devices.py b/recipes_source/recipes/save_load_across_devices.py index cd311a62365..be950e15b13 100644 --- a/recipes_source/recipes/save_load_across_devices.py +++ b/recipes_source/recipes/save_load_across_devices.py @@ -19,13 +19,12 @@ first change the runtime to “GPU” or higher. Once you do, we need to install ``torch`` if it isn’t already available. -:: +.. code-block:: sh pip install torch """ - ###################################################################### # Steps # ----- diff --git a/recipes_source/recipes/saving_multiple_models_in_one_file.py b/recipes_source/recipes/saving_multiple_models_in_one_file.py index aeff7803969..f468d7ac6a1 100644 --- a/recipes_source/recipes/saving_multiple_models_in_one_file.py +++ b/recipes_source/recipes/saving_multiple_models_in_one_file.py @@ -23,10 +23,10 @@ Before we begin, we need to install ``torch`` if it isn’t already available. -:: +.. code-block:: sh pip install torch - + """ diff --git a/recipes_source/recipes/tensorboard_with_pytorch.py b/recipes_source/recipes/tensorboard_with_pytorch.py index 00ee7292a1d..f9c21c4cdc4 100644 --- a/recipes_source/recipes/tensorboard_with_pytorch.py +++ b/recipes_source/recipes/tensorboard_with_pytorch.py @@ -13,14 +13,14 @@ directory. The following command will install PyTorch 1.4+ via Anaconda (recommended): -:: +.. code-block:: sh $ conda install pytorch torchvision -c pytorch or pip -:: +.. code-block:: sh $ pip install torch torchvision diff --git a/recipes_source/recipes/timer_quick_start.py b/recipes_source/recipes/timer_quick_start.py index b93e13dcbd2..d6b79e094c7 100644 --- a/recipes_source/recipes/timer_quick_start.py +++ b/recipes_source/recipes/timer_quick_start.py @@ -46,9 +46,7 @@ """, # Alternatively, ``globals`` can be used to pass variables from the outer scope. - # ------------------------------------------------------------------------- - # :: - # + # # globals={ # "x": torch.ones((128,)), # "y": torch.ones((128,)), @@ -176,14 +174,14 @@ # One generally doesn't care about absolute path. For instance, the full path # and function name for a multiply call is something like: # -# :: +# .. code-block:: sh # # /the/prefix/to/your/pytorch/install/dir/pytorch/build/aten/src/ATen/core/TensorMethods.cpp:at::Tensor::mul(at::Tensor const&) const [/the/path/to/your/conda/install/miniconda3/envs/ab_ref/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so] # # when in reality, all of the information that we're interested in can be # represented in: # -# :: +# .. code-block:: sh # # build/aten/src/ATen/core/TensorMethods.cpp:at::Tensor::mul(at::Tensor const&) const # diff --git a/recipes_source/recipes/warmstarting_model_using_parameters_from_a_different_model.py b/recipes_source/recipes/warmstarting_model_using_parameters_from_a_different_model.py index c04ae7d4be4..40aeeea9db8 100644 --- a/recipes_source/recipes/warmstarting_model_using_parameters_from_a_different_model.py +++ b/recipes_source/recipes/warmstarting_model_using_parameters_from_a_different_model.py @@ -21,7 +21,7 @@ Before we begin, we need to install ``torch`` if it isn’t already available. -:: +.. 
code-block:: sh pip install torch diff --git a/recipes_source/recipes/what_is_state_dict.py b/recipes_source/recipes/what_is_state_dict.py index 838d0c0d4ff..00f1ea105a4 100644 --- a/recipes_source/recipes/what_is_state_dict.py +++ b/recipes_source/recipes/what_is_state_dict.py @@ -26,7 +26,7 @@ Before we begin, we need to install ``torch`` if it isn’t already available. -:: +.. code-block:: sh pip install torch diff --git a/recipes_source/recipes/zeroing_out_gradients.py b/recipes_source/recipes/zeroing_out_gradients.py index b3c25654d93..1d6a9315917 100644 --- a/recipes_source/recipes/zeroing_out_gradients.py +++ b/recipes_source/recipes/zeroing_out_gradients.py @@ -33,7 +33,7 @@ Before we begin, we need to install ``torch`` and ``torchvision`` if they aren’t already available. -:: +.. code-block:: sh pip install torchvision
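
A language name that Pygments does not recognize (for example ``pytorch``) makes Sphinx fall back to unhighlighted text and emit a build warning, so the language names introduced throughout this patch are worth checking mechanically. Below is a rough sketch of such a check, assuming Pygments is installed; it scans every ``.py`` file under the current directory, which covers the Sphinx-Gallery tutorial sources touched here. The script is illustrative and is not part of this patch:

.. code-block:: python

   import re
   import sys
   from pathlib import Path

   from pygments.lexers import get_lexer_by_name
   from pygments.util import ClassNotFound

   # Matches the Sphinx code-block directive and captures the language
   # name, wherever the directive appears on a line (docstrings or
   # "#"-prefixed reST comments alike).
   DIRECTIVE = re.compile(r"code-block:: *(\S+)")


   def unknown_lexers(root="."):
       """Yield (path, line number, language) for unrecognized lexer names."""
       for path in sorted(Path(root).rglob("*.py")):
           text = path.read_text(encoding="utf-8", errors="ignore")
           for lineno, line in enumerate(text.splitlines(), start=1):
               match = DIRECTIVE.search(line)
               if match is None:
                   continue
               try:
                   get_lexer_by_name(match.group(1))
               except ClassNotFound:
                   yield path, lineno, match.group(1)


   if __name__ == "__main__":
       problems = list(unknown_lexers())
       for path, lineno, language in problems:
           print(f"{path}:{lineno}: unknown code-block language {language!r}")
       sys.exit(1 if problems else 0)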