Merge branch 'main' into main

carljparker · web-flow · commit ed8c29d47a88 · 2023-06-02T10:18:32.000-07:00
diff --git a/beginner_source/basics/optimization_tutorial.py b/beginner_source/basics/optimization_tutorial.py
@@ -149,6 +149,9 @@ def forward(self, x):
 
 def train_loop(dataloader, model, loss_fn, optimizer):
     size = len(dataloader.dataset)
+    # Set the model to training mode - important for batch normalization and dropout layers
+    # Unnecessary in this situation but added for best practices
+    model.train()
     for batch, (X, y) in enumerate(dataloader):
         # Compute prediction and loss
         pred = model(X)
@@ -165,10 +168,15 @@ def train_loop(dataloader, model, loss_fn, optimizer):
 
 
 def test_loop(dataloader, model, loss_fn):
+    # Set the model to evaluation mode - important for batch normalization and dropout layers
+    # Unnecessary in this situation but added for best practices
+    model.eval()
     size = len(dataloader.dataset)
     num_batches = len(dataloader)
     test_loss, correct = 0, 0
 
+    # Evaluating the model with torch.no_grad() ensures that no gradients are computed during test mode
+    # also serves to reduce unnecessary gradient computations and memory usage for tensors with requires_grad=True
     with torch.no_grad():
         for X, y in dataloader:
             pred = model(X)
diff --git a/beginner_source/data_loading_tutorial.py b/beginner_source/data_loading_tutorial.py
@@ -165,9 +165,7 @@ def __getitem__(self, idx):
 
 fig = plt.figure()
 
-for i in range(len(face_dataset)):
-    sample = face_dataset[i]
-
+for i, sample in enumerate(face_dataset):
     print(i, sample['image'].shape, sample['landmarks'].shape)
 
     ax = plt.subplot(1, 4, i + 1)
@@ -268,8 +266,8 @@ def __call__(self, sample):
         h, w = image.shape[:2]
         new_h, new_w = self.output_size
 
-        top = np.random.randint(0, h - new_h)
-        left = np.random.randint(0, w - new_w)
+        top = np.random.randint(0, h - new_h + 1)
+        left = np.random.randint(0, w - new_w + 1)
 
         image = image[top: top + new_h,
                       left: left + new_w]
@@ -294,7 +292,7 @@ def __call__(self, sample):
 
 ######################################################################
 # .. note::
-#     In the example above, `RandomCrop` uses an external library's random number generator 
+#     In the example above, `RandomCrop` uses an external library's random number generator
 #     (in this case, Numpy's `np.random.int`). This can result in unexpected behavior with `DataLoader`
 #     (see `here <https://pytorch.org/docs/stable/notes/faq.html#my-data-loader-workers-return-identical-random-numbers>`_).
 #     In practice, it is safer to stick to PyTorch's random number generator, e.g. by using `torch.randint` instead.
@@ -356,9 +354,7 @@ def __call__(self, sample):
                                                ToTensor()
                                            ]))
 
-for i in range(len(transformed_dataset)):
-    sample = transformed_dataset[i]
-
+for i, sample in enumerate(transformed_dataset):
     print(i, sample['image'].size(), sample['landmarks'].size())
 
     if i == 3:
diff --git a/beginner_source/former_torchies/parallelism_tutorial.py b/beginner_source/former_torchies/parallelism_tutorial.py
@@ -53,7 +53,10 @@ def forward(self, x):
 
 class MyDataParallel(nn.DataParallel):
     def __getattr__(self, name):
-        return getattr(self.module, name)
+        try:
+            return super().__getattr__(name)
+        except AttributeError:
+            return getattr(self.module, name)
     
 ########################################################################
 # **Primitives on which DataParallel is implemented upon:**
diff --git a/beginner_source/introyt/introyt1_tutorial.py b/beginner_source/introyt/introyt1_tutorial.py
@@ -288,7 +288,7 @@ def num_flat_features(self, x):
 
 transform = transforms.Compose(
     [transforms.ToTensor(),
-     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
+     transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2470, 0.2435, 0.2616))])
 
 
 ##########################################################################
@@ -297,9 +297,28 @@ def num_flat_features(self, x):
 # -  ``transforms.ToTensor()`` converts images loaded by Pillow into 
 #    PyTorch tensors.
 # -  ``transforms.Normalize()`` adjusts the values of the tensor so
-#    that their average is zero and their standard deviation is 0.5. Most
+#    that their average is zero and their standard deviation is 1.0. Most
 #    activation functions have their strongest gradients around x = 0, so
 #    centering our data there can speed learning.
+#    The values passed to the transform are the means (first tuple) and the
+#    standard deviations (second tuple) of the rgb values of the images in
+#    the dataset. You can calculate these values yourself by running these
+#    few lines of code:
+#          ```
+#           from torch.utils.data import ConcatDataset
+#           transform = transforms.Compose([transforms.ToTensor()])
+#           trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
+#                                        download=True, transform=transform)
+#
+#           #stack all train images together into a tensor of shape 
+#           #(50000, 3, 32, 32)
+#           x = torch.stack([sample[0] for sample in ConcatDataset([trainset])])
+#           
+#           #get the mean of each channel            
+#           mean = torch.mean(x, dim=(0,2,3)) #tensor([0.4914, 0.4822, 0.4465])
+#           std = torch.std(x, dim=(0,2,3)) #tensor([0.2470, 0.2435, 0.2616])  
+# 
+#          ```   
 # 
 # There are many more transforms available, including cropping, centering,
 # rotation, and reflection.
diff --git a/beginner_source/nn_tutorial.py b/beginner_source/nn_tutorial.py
@@ -795,8 +795,7 @@ def __len__(self):
         return len(self.dl)
 
     def __iter__(self):
-        batches = iter(self.dl)
-        for b in batches:
+        for b in self.dl:
             yield (self.func(*b))
 
 train_dl, valid_dl = get_data(train_ds, valid_ds, bs)
diff --git a/beginner_source/transfer_learning_tutorial.py b/beginner_source/transfer_learning_tutorial.py
@@ -46,7 +46,7 @@
 import matplotlib.pyplot as plt
 import time
 import os
-import copy
+from tempfile import TemporaryDirectory
 
 cudnn.benchmark = True
 plt.ion()   # interactive mode
@@ -146,67 +146,71 @@ def imshow(inp, title=None):
 def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
     since = time.time()
 
-    best_model_wts = copy.deepcopy(model.state_dict())
-    best_acc = 0.0
-
-    for epoch in range(num_epochs):
-        print(f'Epoch {epoch}/{num_epochs - 1}')
-        print('-' * 10)
-
-        # Each epoch has a training and validation phase
-        for phase in ['train', 'val']:
-            if phase == 'train':
-                model.train()  # Set model to training mode
-            else:
-                model.eval()   # Set model to evaluate mode
-
-            running_loss = 0.0
-            running_corrects = 0
-
-            # Iterate over data.
-            for inputs, labels in dataloaders[phase]:
-                inputs = inputs.to(device)
-                labels = labels.to(device)
-
-                # zero the parameter gradients
-                optimizer.zero_grad()
-
-                # forward
-                # track history if only in train
-                with torch.set_grad_enabled(phase == 'train'):
-                    outputs = model(inputs)
-                    _, preds = torch.max(outputs, 1)
-                    loss = criterion(outputs, labels)
-
-                    # backward + optimize only if in training phase
-                    if phase == 'train':
-                        loss.backward()
-                        optimizer.step()
-
-                # statistics
-                running_loss += loss.item() * inputs.size(0)
-                running_corrects += torch.sum(preds == labels.data)
-            if phase == 'train':
-                scheduler.step()
-
-            epoch_loss = running_loss / dataset_sizes[phase]
-            epoch_acc = running_corrects.double() / dataset_sizes[phase]
-
-            print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')
-
-            # deep copy the model
-            if phase == 'val' and epoch_acc > best_acc:
-                best_acc = epoch_acc
-                best_model_wts = copy.deepcopy(model.state_dict())
-
-        print()
-
-    time_elapsed = time.time() - since
-    print(f'Training complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
-    print(f'Best val Acc: {best_acc:4f}')
-
-    # load best model weights
-    model.load_state_dict(best_model_wts)
+    # Create a temporary directory to save training checkpoints
+    with TemporaryDirectory() as tempdir:
+        best_model_params_path = os.path.join(tempdir, 'best_model_params.pt')
+    
+        torch.save(model.state_dict(), best_model_params_path)
+        best_acc = 0.0
+
+        for epoch in range(num_epochs):
+            print(f'Epoch {epoch}/{num_epochs - 1}')
+            print('-' * 10)
+
+            # Each epoch has a training and validation phase
+            for phase in ['train', 'val']:
+                if phase == 'train':
+                    model.train()  # Set model to training mode
+                else:
+                    model.eval()   # Set model to evaluate mode
+
+                running_loss = 0.0
+                running_corrects = 0
+
+                # Iterate over data.
+                for inputs, labels in dataloaders[phase]:
+                    inputs = inputs.to(device)
+                    labels = labels.to(device)
+
+                    # zero the parameter gradients
+                    optimizer.zero_grad()
+
+                    # forward
+                    # track history if only in train
+                    with torch.set_grad_enabled(phase == 'train'):
+                        outputs = model(inputs)
+                        _, preds = torch.max(outputs, 1)
+                        loss = criterion(outputs, labels)
+
+                        # backward + optimize only if in training phase
+                        if phase == 'train':
+                            loss.backward()
+                            optimizer.step()
+
+                    # statistics
+                    running_loss += loss.item() * inputs.size(0)
+                    running_corrects += torch.sum(preds == labels.data)
+                if phase == 'train':
+                    scheduler.step()
+
+                epoch_loss = running_loss / dataset_sizes[phase]
+                epoch_acc = running_corrects.double() / dataset_sizes[phase]
+
+                print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')
+
+                # deep copy the model
+                if phase == 'val' and epoch_acc > best_acc:
+                    best_acc = epoch_acc
+                    torch.save(model.state_dict(), best_model_params_path)
+
+            print()
+
+        time_elapsed = time.time() - since
+        print(f'Training complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
+        print(f'Best val Acc: {best_acc:4f}')
+
+        # load best model weights
+        model.load_state_dict(torch.load(best_model_params_path))
     return model
 
 
diff --git a/intermediate_source/char_rnn_classification_tutorial.py b/intermediate_source/char_rnn_classification_tutorial.py
@@ -4,11 +4,14 @@
 **************************************************************
 **Author**: `Sean Robertson <https://github.com/spro>`_
 
-We will be building and training a basic character-level RNN to classify
-words. This tutorial, along with the following two, show how to do
-preprocess data for NLP modeling "from scratch", in particular not using
-many of the convenience functions of `torchtext`, so you can see how
-preprocessing for NLP modeling works at a low level.
+We will be building and training a basic character-level Recurrent Neural
+Network (RNN) to classify words. This tutorial, along with two other
+Natural Language Processing (NLP) "from scratch" tutorials
+:doc:`/intermediate/char_rnn_generation_tutorial` and
+:doc:`/intermediate/seq2seq_translation_tutorial`, show how to
+preprocess data to model NLP. In particular these tutorials do not
+use many of the convenience functions of `torchtext`, so you can see how
+preprocessing to model NLP works at a low level.
 
 A character-level RNN reads words as a series of characters -
 outputting a prediction and "hidden state" at each step, feeding its
@@ -32,13 +35,15 @@
     (-2.68) Dutch
 
 
-**Recommended Reading:**
+Recommended Preparation
+=======================
 
-I assume you have at least installed PyTorch, know Python, and
-understand Tensors:
+Before starting this tutorial it is recommended that you have installed PyTorch,
+and have a basic understanding of Python programming language and Tensors:
 
 -  https://pytorch.org/ For installation instructions
 -  :doc:`/beginner/deep_learning_60min_blitz` to get started with PyTorch in general
+   and learn the basics of Tensors
 -  :doc:`/beginner/pytorch_with_examples` for a wide and deep overview
 -  :doc:`/beginner/former_torchies_tutorial` if you are former Lua Torch user
 
@@ -181,10 +186,6 @@ def lineToTensor(line):
 # is just 2 linear layers which operate on an input and hidden state, with
 # a ``LogSoftmax`` layer after the output.
 #
-# .. figure:: https://i.imgur.com/Z2xbySO.png
-#    :alt:
-#
-#
 
 import torch.nn as nn
 
@@ -195,13 +196,13 @@ def __init__(self, input_size, hidden_size, output_size):
         self.hidden_size = hidden_size
 
         self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
-        self.i2o = nn.Linear(input_size + hidden_size, output_size)
+        self.h2o = nn.Linear(hidden_size, output_size)
         self.softmax = nn.LogSoftmax(dim=1)
 
     def forward(self, input, hidden):
         combined = torch.cat((input, hidden), 1)
         hidden = self.i2h(combined)
-        output = self.i2o(combined)
+        output = self.h2o(hidden)
         output = self.softmax(output)
         return output, hidden
 
diff --git a/intermediate_source/char_rnn_generation_tutorial.py b/intermediate_source/char_rnn_generation_tutorial.py
@@ -278,7 +278,7 @@ def train(category_tensor, input_line_tensor, target_line_tensor):
 
     rnn.zero_grad()
 
-    loss = 0
+    loss = torch.Tensor([0]) # you can also just simply use ``loss = 0``
 
     for i in range(input_line_tensor.size(0)):
         output, hidden = rnn(category_tensor, input_line_tensor[i], hidden)
diff --git a/intermediate_source/dynamic_quantization_bert_tutorial.rst b/intermediate_source/dynamic_quantization_bert_tutorial.rst
@@ -255,6 +255,9 @@ model before and after the dynamic quantization.
         torch.manual_seed(seed)
     set_seed(42)
 
+    # Initialize a global random number generator
+    global_rng = random.Random()
+
 
 2.2 Load the fine-tuned BERT model
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -525,6 +528,21 @@ We can serialize and save the quantized model for the future use using
 
 .. code:: python
 
+    def ids_tensor(shape, vocab_size, rng=None, name=None):
+        #  Creates a random int32 tensor of the shape within the vocab size
+        if rng is None:
+            rng = global_rng
+
+        total_dims = 1
+        for dim in shape:
+            total_dims *= dim
+
+        values = []
+        for _ in range(total_dims):
+            values.append(rng.randint(0, vocab_size - 1))
+
+        return torch.tensor(data=values, dtype=torch.long, device='cpu').view(shape).contiguous()
+
     input_ids = ids_tensor([8, 128], 2)
     token_type_ids = ids_tensor([8, 128], 2)
     attention_mask = ids_tensor([8, 128], vocab_size=2)
diff --git a/prototype_source/fx_graph_mode_ptq_static.rst b/prototype_source/fx_graph_mode_ptq_static.rst
@@ -214,9 +214,9 @@ Download the `torchvision resnet18 model <https://download.pytorch.org/models/re
     float_model = load_model(saved_model_dir + float_model_file).to("cpu")
     float_model.eval()
 
-    # deepcopy the model since we need to keep the original model around
-    import copy
-    model_to_quantize = copy.deepcopy(float_model)
+    # create another instance of the model since
+    # we need to keep the original model around
+    model_to_quantize = load_model(saved_model_dir + float_model_file).to("cpu")
 
 3. Set model to eval mode
 -------------------------
@@ -408,4 +408,4 @@ Running the model in AIBench (with single threading) gives the following result:
 
 As we can see for resnet18 both FX graph mode and eager mode quantized model get similar speedup over the floating point model,
 which is around 2-4x faster than the floating point model. But the actual speedup over floating point model may vary
-depending on model, device, build, input batch sizes, threading etc.
+depending on model, device, build, input batch sizes, threading etc.