Merge branch 'main' into add_amx_doc

CaoE · web-flow · commit 88d32df006e4 · 2023-06-10T10:20:24.000+08:00
diff --git a/_static/img/seq-seq-images/attention-decoder-network.png b/_static/img/seq-seq-images/attention-decoder-network.png
diff --git a/beginner_source/transfer_learning_tutorial.py b/beginner_source/transfer_learning_tutorial.py
@@ -44,6 +44,7 @@
 import matplotlib.pyplot as plt
 import time
 import os
+from PIL import Image
 from tempfile import TemporaryDirectory
 
 cudnn.benchmark = True
@@ -337,6 +338,47 @@ def visualize_model(model, num_images=6):
 plt.ioff()
 plt.show()
 
+
+######################################################################
+# Inference on custom images
+# --------------------------
+#
+# Use the trained model to make predictions on custom images and visualize
+# the predicted class labels along with the images.
+#
+
+def visualize_model_predictions(model,img_path):  # classify the image at img_path and plot it titled with the predicted class
+    was_training = model.training  # remember the current mode so it can be restored at the end
+    model.eval()
+
+    img = Image.open(img_path)
+    img = data_transforms['val'](img)  # 'val' transform pipeline; data_transforms is defined outside this hunk
+    img = img.unsqueeze(0)  # add a leading dimension of size 1 (presumably the batch axis the model expects)
+    img = img.to(device)
+
+    with torch.no_grad():
+        outputs = model(img)
+        _, preds = torch.max(outputs, 1)  # discard the max values, keep their indices along dim 1
+
+        ax = plt.subplot(2,2,1)
+        ax.axis('off')
+        ax.set_title(f'Predicted: {class_names[preds[0]]}')
+        imshow(img.cpu().data[0])  # imshow is a helper defined outside this hunk
+        
+        model.train(mode=was_training)  # put the model back in the mode recorded on entry
+
+######################################################################
+#
+
+visualize_model_predictions(
+    model_conv,
+    img_path='data/hymenoptera_data/val/bees/72100438_73de9f17af.jpg'
+)
+
+plt.ioff()
+plt.show()
+
+
 ######################################################################
 # Further Learning
 # -----------------
diff --git a/beginner_source/transformer_tutorial.py b/beginner_source/transformer_tutorial.py
@@ -2,7 +2,7 @@
 Language Modeling with ``nn.Transformer`` and torchtext
 ===============================================================
 
-This is a tutorial on training a sequence-to-sequence model that uses the
+This is a tutorial on training a model to predict the next word in a sequence using the
 `nn.Transformer <https://pytorch.org/docs/stable/generated/torch.nn.Transformer.html>`__ module.
 
 The PyTorch 1.2 release includes a standard transformer module based on the
@@ -29,15 +29,24 @@
 
 ######################################################################
 # In this tutorial, we train a ``nn.TransformerEncoder`` model on a
-# language modeling task. The language modeling task is to assign a
+# language modeling task. Please note that this tutorial does not cover
+# the training of `nn.TransformerDecoder <https://pytorch.org/docs/stable/generated/torch.nn.TransformerDecoder.html#torch.nn.TransformerDecoder>`__, as depicted in
+# the right half of the diagram above. The language modeling task is to assign a
 # probability for the likelihood of a given word (or a sequence of words)
 # to follow a sequence of words. A sequence of tokens are passed to the embedding
 # layer first, followed by a positional encoding layer to account for the order
 # of the word (see the next paragraph for more details). The
 # ``nn.TransformerEncoder`` consists of multiple layers of
 # `nn.TransformerEncoderLayer <https://pytorch.org/docs/stable/generated/torch.nn.TransformerEncoderLayer.html>`__.
-# To produce a probability distribution over output words, the output of 
-# the ``nn.TransformerEncoder`` model is passed through a linear layer.
+# Along with the input sequence, a square attention mask is required because the
+# self-attention layers in ``nn.TransformerDecoder`` are only allowed to attend
+# to the earlier positions in the sequence. For the language modeling task, any
+# tokens at future positions should be masked. To produce a probability
+# distribution over output words, the output of the ``nn.TransformerEncoder``
+# model is passed through a linear layer to output unnormalized logits.
+# The log-softmax function isn't applied here due to the later use of
+# `CrossEntropyLoss <https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html>`__,
+# which requires the inputs to be unnormalized logits.
 #
 
 import math
@@ -130,6 +139,7 @@ def forward(self, x: Tensor) -> Tensor:
 #  .. code-block:: bash
 #
 #      %%bash
+#      pip install portalocker
 #      pip install torchdata
 #
 # The vocab object is built based on the train dataset and is used to numericalize
diff --git a/en-wordlist.txt b/en-wordlist.txt
@@ -89,6 +89,7 @@ LeNet
 LeakyReLU
 LeakyReLUs
 Lipschitz
+logits
 Lua
 Luong
 MLP
diff --git a/intermediate_source/FSDP_adavnced_tutorial.rst b/intermediate_source/FSDP_adavnced_tutorial.rst
@@ -75,7 +75,7 @@ highlight different available features in FSDP that are helpful for training
 large scale model above 3B parameters. Also, we cover specific features for
 Transformer based models. The code for this tutorial is available in  `Pytorch
 Examples
-<https://github.com/HamidShojanazeri/examples/tree/FSDP_example/FSDP/>`__.
+<https://github.com/HamidShojanazeri/examples/tree/FSDP_example/distributed/FSDP/>`__.
 
 
 *Setup*
@@ -97,13 +97,13 @@ Please create a `data` folder, download the WikiHow dataset from `wikihowAll.csv
 `wikihowSep.cs <https://ucsb.app.box.com/s/7yq601ijl1lzvlfu4rjdbbxforzd2oag>`__,
 and place them in the `data` folder.  We will use the wikihow dataset from
 `summarization_dataset
-<https://github.com/HamidShojanazeri/examples/blob/FSDP_example/FSDP/summarization_dataset.py>`__.
+<https://github.com/HamidShojanazeri/examples/blob/FSDP_example/distributed/FSDP/summarization_dataset.py>`__.
 
 Next, we add the following code snippets to a Python script “T5_training.py”.
 
 .. note::
    The full source code for this tutorial is available in `PyTorch examples
-   <https://github.com/HamidShojanazeri/examples/tree/FSDP_example/FSDP>`__.
+   <https://github.com/HamidShojanazeri/examples/tree/FSDP_example/distributed/FSDP>`__.
 
 1.3  Import necessary packages:
 
diff --git a/intermediate_source/seq2seq_translation_tutorial.py b/intermediate_source/seq2seq_translation_tutorial.py
@@ -440,25 +440,27 @@ def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGT
         self.max_length = max_length
 
         self.embedding = nn.Embedding(self.output_size, self.hidden_size)
-        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
-        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
+        self.fc_hidden = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
+        self.fc_encoder = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
+        self.alignment_vector = nn.Parameter(torch.Tensor(1, hidden_size))
+        torch.nn.init.xavier_uniform_(self.alignment_vector)
         self.dropout = nn.Dropout(self.dropout_p)
-        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
+        self.gru = nn.GRU(self.hidden_size * 2, self.hidden_size)
         self.out = nn.Linear(self.hidden_size, self.output_size)
 
     def forward(self, input, hidden, encoder_outputs):
-        embedded = self.embedding(input).view(1, 1, -1)
+        embedded = self.embedding(input).view(1, -1)
         embedded = self.dropout(embedded)
 
-        attn_weights = F.softmax(
-            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
-        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
-                                 encoder_outputs.unsqueeze(0))
-
-        output = torch.cat((embedded[0], attn_applied[0]), 1)
-        output = self.attn_combine(output).unsqueeze(0)
+        transformed_hidden = self.fc_hidden(hidden[0])
+        expanded_hidden_state = transformed_hidden.expand(self.max_length, -1)
+        alignment_scores = torch.tanh(expanded_hidden_state +
+                                      self.fc_encoder(encoder_outputs))
+        alignment_scores = self.alignment_vector.mm(alignment_scores.T)
+        attn_weights = F.softmax(alignment_scores, dim=1)
+        context_vector = attn_weights.mm(encoder_outputs)
 
-        output = F.relu(output)
+        output = torch.cat((embedded, context_vector), 1).unsqueeze(0)
         output, hidden = self.gru(output, hidden)
 
         output = F.log_softmax(self.out(output[0]), dim=1)
@@ -761,15 +763,15 @@ def evaluateRandomly(encoder, decoder, n=10):
 #
 
 hidden_size = 256
-encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
-attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)
+encoder = EncoderRNN(input_lang.n_words, hidden_size).to(device)
+attn_decoder = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)
 
-trainIters(encoder1, attn_decoder1, 75000, print_every=5000)
+trainIters(encoder, attn_decoder, 75000, print_every=5000)
 
 ######################################################################
 #
 
-evaluateRandomly(encoder1, attn_decoder1)
+evaluateRandomly(encoder, attn_decoder)
 
 
 ######################################################################
@@ -787,7 +789,7 @@ def evaluateRandomly(encoder, decoder, n=10):
 #
 
 output_words, attentions = evaluate(
-    encoder1, attn_decoder1, "je suis trop froid .")
+    encoder, attn_decoder, "je suis trop froid .")
 plt.matshow(attentions.numpy())
 
 
@@ -817,7 +819,7 @@ def showAttention(input_sentence, output_words, attentions):
 
 def evaluateAndShowAttention(input_sentence):
     output_words, attentions = evaluate(
-        encoder1, attn_decoder1, input_sentence)
+        encoder, attn_decoder, input_sentence)
     print('input =', input_sentence)
     print('output =', ' '.join(output_words))
     showAttention(input_sentence, output_words, attentions)
diff --git a/prototype_source/prototype_index.rst b/prototype_source/prototype_index.rst
@@ -68,6 +68,13 @@ Prototype features are not available as part of binary distributions like PyPI o
    :link: ../prototype/numeric_suite_tutorial.html
    :tags: Debugging,Quantization
 
+.. customcarditem::
+   :header: Quantization in PyTorch 2.0 Export Tutorial
+   :card_description: Learn how to use quantization in the PyTorch 2.0 export flow.
+   :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png
+   :link: ../prototype/quantization_in_pytorch_2_0_export_tutorial.html
+   :tags: Quantization
+
 .. Mobile
 
 .. customcarditem::
@@ -193,6 +200,7 @@ Prototype features are not available as part of binary distributions like PyPI o
    prototype/fx_graph_mode_ptq_dynamic.html
    prototype/fx_graph_mode_ptq_static.html
    prototype/graph_mode_dynamic_bert_tutorial.html
+   prototype/quantization_in_pytorch_2_0_export_tutorial.html
    prototype/ios_gpu_workflow.html
    prototype/nnapi_mobilenetv2.html
    prototype/tracing_based_selective_build.html
diff --git a/prototype_source/quantization_in_pytorch_2_0_export_tutorial.rst b/prototype_source/quantization_in_pytorch_2_0_export_tutorial.rst