
Commit 2568d5e

improve GPU performance. add interactive demo at the end.
1 parent c9e9423 commit 2568d5e

File tree

1 file changed: +126 −47 lines changed


intermediate_source/speech_command_recognition_with_torchaudio.py

Lines changed: 126 additions & 47 deletions
@@ -3,18 +3,33 @@
 ==========================================
 
 This tutorial will show you how to correctly format an audio dataset and
-then train/test an audio classifier network on the dataset. First, let’s
-import the common torch packages such as
-`torchaudio <https://github.com/pytorch/audio>`__ and can be
+then train/test an audio classifier network on the dataset.
+
+Colab has a GPU option available. In the menu tabs, select “Runtime” and
+then “Change runtime type”. In the pop-up that follows, you can choose
+GPU. After the change, your runtime should automatically restart (which
+means information from executed cells disappears).
+
+First, let’s import the common torch packages such as
+`torchaudio <https://github.com/pytorch/audio>`__ that can be
 installed by following the instructions on the website.
 
 """
 
 # Uncomment the following lines to run in Google Colab
-# !pip install torch
-# !pip install torchaudio
+
+# GPU:
+# !pip install torch==1.7.0+cu101 torchvision==0.8.1+cu101 torchaudio==0.7.0 -f https://download.pytorch.org/whl/torch_stable.html
+
+# CPU:
+# !pip install torch==1.7.0+cpu torchvision==0.8.1+cpu torchaudio==0.7.0 -f https://download.pytorch.org/whl/torch_stable.html
+
+# For interactive demo at the end:
+# !pip install pydub
 
 import os
+from base64 import b64decode
+from io import BytesIO
 
 import IPython.display as ipd
 import matplotlib.pyplot as plt
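Note: later hunks test `if device == "cuda":`, so the full file defines a `device` earlier, outside this diff. A minimal sketch of the usual selection pattern (an assumption about the surrounding file, not shown in this commit):

    import torch

    # Use CUDA when the Colab GPU runtime is active, otherwise fall back to CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)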
@@ -25,6 +40,8 @@
 import torch.nn.functional as F
 import torch.optim as optim
 import torchaudio
+from google.colab import output as colab_output
+from pydub import AudioSegment
 from torchaudio.datasets import SPEECHCOMMANDS
 
 ######################################################################
@@ -73,11 +90,12 @@ def load_list(filename):
             self._walker = load_list("testing_list.txt")
         elif subset == "training":
             excludes = load_list("validation_list.txt") + load_list("testing_list.txt")
+            excludes = set(excludes)
             self._walker = [w for w in self._walker if w not in excludes]
 
 
+# Create training and testing split of the data. We do not use validation in this tutorial.
 train_set = SubsetSC("training")
-# valid_set = SubsetSC("validation")
 test_set = SubsetSC("testing")
 
 waveform, sample_rate, label, speaker_id, utterance_number = train_set[0]
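Note: wrapping the exclusion list in `set()` is one of this commit's speedups: the `w not in excludes` test inside the list comprehension becomes an O(1) hash lookup instead of a linear scan over thousands of paths. A standalone sketch with illustrative sizes (not the real dataset):

    import timeit

    # Illustrative sizes: ~15k excluded paths, checked for every walker entry.
    excludes_list = [f"speaker_{i}/clip_{i}.wav" for i in range(15000)]
    excludes_set = set(excludes_list)
    walker = [f"speaker_{i}/clip_{i}.wav" for i in range(15000, 16000)]  # none excluded

    t_list = timeit.timeit(lambda: [w for w in walker if w not in excludes_list], number=1)
    t_set = timeit.timeit(lambda: [w for w in walker if w not in excludes_set], number=1)
    print(f"list: {t_list:.3f}s  set: {t_set:.5f}s")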
@@ -100,7 +118,7 @@
 # Let’s find the list of labels available in the dataset.
 #
 
-labels = list(set(datapoint[2] for datapoint in train_set))
+labels = sorted(list(set(datapoint[2] for datapoint in train_set)))
 labels
 
 
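Note: `sorted()` here is about reproducibility rather than speed: iterating a `set` yields an arbitrary order, so without sorting the label-to-index mapping used by `encode` below could change between runs, silently invalidating a saved model. A small sketch of index-based encoding over the sorted list (hypothetical helper names, mirroring what `encode` presumably does):

    import torch

    labels = sorted({"yes", "no", "go", "stop"})  # ['go', 'no', 'stop', 'yes']

    def label_to_index(word):
        # Position of the word in the sorted label list.
        return torch.tensor(labels.index(word))

    def index_to_label(index):
        return labels[index]

    print(label_to_index("go"), index_to_label(3))  # tensor(0) yes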
@@ -170,6 +188,7 @@ def encode(word):
 # encoding.
 #
 
+
 def pad_sequence(batch):
     # Make all tensors in a batch the same length by padding with zeros
     batch = [item.t() for item in batch]
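Note: the rest of `pad_sequence` falls outside this hunk's context. Clips are transposed to (time, channel) because `torch.nn.utils.rnn.pad_sequence` pads along the first dimension; the remainder presumably pads and permutes back, roughly:

    import torch

    a = torch.rand(1, 15900)  # (channel, time), a slightly short clip
    b = torch.rand(1, 16000)

    batch = [item.t() for item in [a, b]]  # -> (time, channel)
    batch = torch.nn.utils.rnn.pad_sequence(batch, batch_first=True, padding_value=0.0)
    batch = batch.permute(0, 2, 1)  # back to (batch, channel, time)
    print(batch.shape)  # torch.Size([2, 1, 16000])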
@@ -184,9 +203,9 @@ def collate_fn(batch):
 
     tensors, targets = [], []
 
-    # Apply transform and encode
+    # Gather in lists, and encode labels
     for waveform, _, label, *_ in batch:
-        tensors += [transform(waveform)]
+        tensors += [waveform]
         targets += [encode(label)]
 
     # Group the list of tensors into a batched tensor
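Note: this hunk is the heart of the GPU improvement. `collate_fn` runs in CPU worker processes, so resampling there happens one clip at a time; returning raw waveforms and calling `transform` on the assembled batch (see the `data = transform(data)` lines added to `train` and `test` below) resamples the whole batch in one call on the GPU. A sketch, assuming `transform` is the 16 kHz → 8 kHz `Resample` defined earlier in the full file:

    import torch
    import torchaudio

    resample = torchaudio.transforms.Resample(orig_freq=16000, new_freq=8000)

    batch = torch.rand(256, 1, 16000)  # a full batch of 1-second clips
    out = resample(batch)              # Resample accepts arbitrary leading batch dims
    print(out.shape)                   # torch.Size([256, 1, 8000])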
@@ -196,20 +215,31 @@
     return tensors, targets
 
 
-batch_size = 128
+batch_size = 256
 
-if device == 'cuda':
+if device == "cuda":
     num_workers = 1
     pin_memory = True
 else:
     num_workers = 0
     pin_memory = False
 
 train_loader = torch.utils.data.DataLoader(
-    train_set, batch_size=batch_size, shuffle=True, collate_fn=collate_fn, num_workers=num_workers, pin_memory=pin_memory,
+    train_set,
+    batch_size=batch_size,
+    shuffle=True,
+    collate_fn=collate_fn,
+    num_workers=num_workers,
+    pin_memory=pin_memory,
 )
 test_loader = torch.utils.data.DataLoader(
-    test_set, batch_size=batch_size, shuffle=False, collate_fn=collate_fn, num_workers=num_workers, pin_memory=pin_memory,
+    test_set,
+    batch_size=batch_size,
+    shuffle=False,
+    drop_last=False,
+    collate_fn=collate_fn,
+    num_workers=num_workers,
+    pin_memory=pin_memory,
 )
 
 
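Note: on CUDA the loaders use `num_workers=1` to collate batches off the main process and `pin_memory=True` to return batches in page-locked host memory, which speeds up host-to-GPU copies and permits asynchronous ones. A sketch of the async copy that pinned memory enables (the tutorial itself uses plain `.to(device)`):

    import torch

    if torch.cuda.is_available():
        x = torch.rand(256, 1, 16000).pin_memory()  # page-locked host tensor
        y = x.to("cuda", non_blocking=True)         # copy can overlap GPU compute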
@@ -232,21 +262,21 @@
 
 
 class M5(nn.Module):
-    def __init__(self, stride=16, n_channel=32, n_output=35):
+    def __init__(self, n_input=1, n_output=35, stride=16, n_channel=32):
         super().__init__()
-        self.conv1 = nn.Conv1d(1, n_channel, 80, stride=stride)
+        self.conv1 = nn.Conv1d(n_input, n_channel, kernel_size=80, stride=stride)
         self.bn1 = nn.BatchNorm1d(n_channel)
         self.pool1 = nn.MaxPool1d(4)
-        self.conv2 = nn.Conv1d(n_channel, n_channel, 3)
+        self.conv2 = nn.Conv1d(n_channel, n_channel, kernel_size=3)
         self.bn2 = nn.BatchNorm1d(n_channel)
         self.pool2 = nn.MaxPool1d(4)
-        self.conv3 = nn.Conv1d(n_channel, 2*n_channel, 3)
-        self.bn3 = nn.BatchNorm1d(2*n_channel)
+        self.conv3 = nn.Conv1d(n_channel, 2 * n_channel, kernel_size=3)
+        self.bn3 = nn.BatchNorm1d(2 * n_channel)
         self.pool3 = nn.MaxPool1d(4)
-        self.conv4 = nn.Conv1d(2*n_channel, 2*n_channel, 3)
-        self.bn4 = nn.BatchNorm1d(2*n_channel)
+        self.conv4 = nn.Conv1d(2 * n_channel, 2 * n_channel, kernel_size=3)
+        self.bn4 = nn.BatchNorm1d(2 * n_channel)
         self.pool4 = nn.MaxPool1d(4)
-        self.fc1 = nn.Linear(2*n_channel, n_output)
+        self.fc1 = nn.Linear(2 * n_channel, n_output)
 
     def forward(self, x):
         x = self.conv1(x)
@@ -267,7 +297,7 @@ def forward(self, x):
         return F.log_softmax(x, dim=2)
 
 
-model = M5(n_output=len(labels))
+model = M5(n_input=transformed.shape[0], n_output=len(labels))
 model.to(device)
 print(model)
 
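Note: `transformed` is the resampled example waveform from earlier in the tutorial, so `transformed.shape[0]` is its channel count (1 for SpeechCommands); wiring it into `n_input` keeps the first convolution in sync with the data. A quick shape check (a sketch; the forward pass ends in `log_softmax(x, dim=2)`, giving the (batch, 1, n_output) tensor that the loss comment below refers to):

    import torch

    model = M5(n_input=1, n_output=35)
    dummy = torch.rand(256, 1, 8000)  # (batch, channel, time) after resampling
    print(model(dummy).shape)         # torch.Size([256, 1, 35])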
@@ -296,12 +326,9 @@ def count_parameters(model):
 # --------------------------------
 #
 # Now let’s define a training function that will feed our training data
-# into the model and perform the backward pass and optimization steps.
-#
-# Finally, we can train and test the network. We will train the network
-# for ten epochs then reduce the learn rate and train for ten more epochs.
-# The network will be tested after each epoch to see how the accuracy
-# varies during the training.
+# into the model and perform the backward pass and optimization steps. The
+# network will be tested after each epoch to see how the accuracy varies
+# during the training.
 #
 
 
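Note: `scheduler.step()` below relies on an optimizer and scheduler whose definitions fall outside this diff. A plausible setup consistent with the `optim` import and the reduce-the-learning-rate narrative (hyperparameters are assumptions, not shown in this commit):

    import torch.optim as optim

    # Assumed setup, defined earlier in the full file:
    optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0001)
    # Divide the learning rate by 10 after 20 epochs.
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.1)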
@@ -312,6 +339,7 @@ def train(model, epoch, log_interval):
         data = data.to(device)
         target = target.to(device)
 
+        data = transform(data)
         output = model(data)
 
         # negative log-likelihood for a tensor of size (batch x 1 x n_output)
@@ -323,10 +351,10 @@
 
         # print training stats
         if batch_idx % log_interval == 0:
-            print(f'Train Epoch: {epoch} [{batch_idx * len(data)}/{len(train_loader.dataset)} ({100. * batch_idx / len(train_loader):.0f}%)]\tLoss: {loss:.6f}')
+            print(f"Train Epoch: {epoch} [{batch_idx * len(data)}/{len(train_loader.dataset)} ({100. * batch_idx / len(train_loader):.0f}%)]\tLoss: {loss:.6f}")
 
-        if 'pbar' in globals():
-            pbar.update()
+        if "pbar" in globals() and "pbar_update" in globals():
+            pbar.update(pbar_update)
 
 
 ######################################################################
@@ -346,24 +374,26 @@ def argmax(tensor):
 
 def number_of_correct(pred, target):
     # compute number of correct predictions
-    return pred.squeeze().eq(target).cpu().sum().item()
+    return pred.squeeze().eq(target).sum().item()
 
 
 def test(model, epoch):
     model.eval()
     correct = 0
     for data, target in test_loader:
+
         data = data.to(device)
         target = target.to(device)
 
+        data = transform(data)
         output = model(data)
         pred = argmax(output)
         correct += number_of_correct(pred, target)
 
-        if 'pbar' in globals():
-            pbar.update()
+        if "pbar" in globals() and "pbar_update" in globals():
+            pbar.update(pbar_update)
 
-    print(f'\nTest Epoch: {epoch}\tAccuracy: {correct}/{len(test_loader.dataset)} ({100. * correct / len(test_loader.dataset):.0f}%)\n')
+    print(f"\nTest Epoch: {epoch}\tAccuracy: {correct}/{len(test_loader.dataset)} ({100. * correct / len(test_loader.dataset):.0f}%)\n")
 
 
 ######################################################################
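Note: dropping `.cpu()` from `number_of_correct` keeps the comparison on the GPU; `pred` and `target` already live on the same device, and the single `.item()` call performs the one device-to-host transfer actually needed. A standalone sketch:

    import torch

    dev = "cuda" if torch.cuda.is_available() else "cpu"
    pred = torch.tensor([3, 1, 2], device=dev)
    target = torch.tensor([3, 0, 2], device=dev)

    # eq() and sum() stay on the device; .item() syncs once at the end.
    print(pred.eq(target).sum().item())  # 2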
@@ -375,21 +405,28 @@ def test(model, epoch):
 
 log_interval = 20
 n_epoch = 2
+pbar_update = 1 / (len(train_loader) + len(test_loader))
+
+# The transform needs to live on the same device as the model and the data.
+transform = transform.to(device)
 
-with tqdm(total=n_epoch * (len(train_loader) + len(test_loader))) as pbar:
-    for epoch in range(1, n_epoch+1):
+with tqdm(total=n_epoch) as pbar:
+    for epoch in range(1, n_epoch + 1):
         train(model, epoch, log_interval)
         test(model, epoch)
         scheduler.step()
 
 
 ######################################################################
-# Let’s look at the last words in the train set, and see how the model did
-# on it.
+# The network should be more than 65% accurate on the test set after 2
+# epochs, and 85% after 21 epochs. Let’s look at the last words in the
+# train set, and see how the model did on them.
 #
 
+
 def predict(waveform):
-    # Take a waveform and use the model to predict
+    # Use the model to predict the label of the waveform
+    waveform = waveform.to(device)
     waveform = transform(waveform)
     output = model(waveform.unsqueeze(0))
     output = argmax(output).squeeze()
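Note: the progress bar now counts epochs: `total=n_epoch`, with each batch advancing the bar by the fraction `pbar_update`, which `tqdm` accepts as a float. A minimal sketch of the pattern:

    from tqdm import tqdm

    n_epoch, n_batches = 2, 10
    pbar_update = 1 / n_batches

    with tqdm(total=n_epoch) as pbar:
        for epoch in range(n_epoch):
            for _ in range(n_batches):
                pbar.update(pbar_update)  # fractional steps; the bar reads in epochs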
@@ -410,9 +447,9 @@ def predict(waveform):
 for i, (waveform, sample_rate, utterance, *_) in enumerate(test_set):
     output = predict(waveform)
     if output != utterance:
-        ipd.Audio(waveform.numpy(), rate=sample_rate)
-        print(f"Data point #{i}. Expected: {utterance}. Predicted: {output}.")
-        break
+        ipd.Audio(waveform.numpy(), rate=sample_rate)
+        print(f"Data point #{i}. Expected: {utterance}. Predicted: {output}.")
+        break
 else:
     print("All examples in this dataset were correctly classified!")
     print("In this case, let's just look at the last data point")
@@ -421,17 +458,59 @@
 
 
 ######################################################################
-# Feel free to try with one of your own recordings!
+# Feel free to try with one of your own recordings of one of the labels!
+# For example, in Colab, say “Go” while executing the cell below. This
+# will record one second of audio and try to classify it.
 #
 
 
+RECORD = """
+const sleep = time => new Promise(resolve => setTimeout(resolve, time))
+const b2text = blob => new Promise(resolve => {
+  const reader = new FileReader()
+  reader.onloadend = e => resolve(e.srcElement.result)
+  reader.readAsDataURL(blob)
+})
+var record = time => new Promise(async resolve => {
+  stream = await navigator.mediaDevices.getUserMedia({ audio: true })
+  recorder = new MediaRecorder(stream)
+  chunks = []
+  recorder.ondataavailable = e => chunks.push(e.data)
+  recorder.start()
+  await sleep(time)
+  recorder.onstop = async ()=>{
+    blob = new Blob(chunks)
+    text = await b2text(blob)
+    resolve(text)
+  }
+  recorder.stop()
+})
+"""
+
+
+def record(seconds=1):
+    display(ipd.Javascript(RECORD))
+    print(f"Recording started for {seconds} seconds.")
+    s = colab_output.eval_js("record(%d)" % (seconds * 1000))
+    print("Recording ended.")
+    b = b64decode(s.split(",")[1])
+
+    fileformat = "wav"
+    filename = f"_audio.{fileformat}"
+    AudioSegment.from_file(BytesIO(b)).export(filename, format=fileformat)
+
+    return torchaudio.load(filename)
+
+
+waveform, sample_rate = record()
+print(f"Predicted: {predict(waveform)}.")
+ipd.Audio(waveform.numpy(), rate=sample_rate)
+
+
 ######################################################################
 # Conclusion
 # ----------
 #
-# The network should be more than 70% accurate on the test set after 2
-# epochs, 80% after 14 epochs, and 85% after 21 epochs.
-#
 # In this tutorial, we used torchaudio to load a dataset and resample the
 # signal. We have then defined a neural network that we trained to
 # recognize a given command. There are also other data preprocessing

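Note: the `record` helper is Colab-only (it depends on `google.colab.output.eval_js` and the browser's `MediaRecorder`). Outside Colab you could feed `predict` a recording loaded from disk instead; a sketch with a hypothetical filename:

    import torchaudio

    # Hypothetical local file: a short mono recording of one of the 35 labels.
    waveform, sample_rate = torchaudio.load("my_go_recording.wav")
    # If the file is not 16 kHz, resample it first so `transform` sees the rate it expects.
    print(f"Predicted: {predict(waveform)}.")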