
Commit 2568d5e

improve GPU performance. add interactive demo at the end.
1 parent c9e9423 commit 2568d5e

File tree

1 file changed: +126 −47 lines changed


intermediate_source/speech_command_recognition_with_torchaudio.py

Lines changed: 126 additions & 47 deletions
@@ -3,18 +3,33 @@
 ==========================================
 
 This tutorial will show you how to correctly format an audio dataset and
-then train/test an audio classifier network on the dataset. First, let’s
-import the common torch packages such as
-`torchaudio <https://github.com/pytorch/audio>`__ and can be
+then train/test an audio classifier network on the dataset.
+
+Colab has a GPU option available. In the menu tabs, select “Runtime” and
+then “Change runtime type”. In the pop-up that follows, you can choose
+GPU. After the change, your runtime should automatically restart (which
+means information from executed cells disappears).
+
+First, let’s import the common torch packages such as
+`torchaudio <https://github.com/pytorch/audio>`__ that can be
 installed by following the instructions on the website.
 
 """
 
 # Uncomment the following lines to run in Google Colab
-# !pip install torch
-# !pip install torchaudio
+
+# GPU:
+# !pip install torch==1.7.0+cu101 torchvision==0.8.1+cu101 torchaudio==0.7.0 -f https://download.pytorch.org/whl/torch_stable.html
+
+# CPU:
+# !pip install torch==1.7.0+cpu torchvision==0.8.1+cpu torchaudio==0.7.0 -f https://download.pytorch.org/whl/torch_stable.html
+
+# For interactive demo at the end:
+# !pip install pydub
 
 import os
+from base64 import b64decode
+from io import BytesIO
 
 import IPython.display as ipd
 import matplotlib.pyplot as plt
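Note: later hunks test `if device == "cuda":`, so the full file defines a `device` earlier, outside this diff. A minimal sketch of the usual selection pattern (an assumption about the surrounding file, not shown in this commit):

    import torch

    # Use CUDA when the Colab GPU runtime is active, otherwise fall back to CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)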
@@ -25,6 +40,8 @@
 import torch.nn.functional as F
 import torch.optim as optim
 import torchaudio
+from google.colab import output as colab_output
+from pydub import AudioSegment
 from torchaudio.datasets import SPEECHCOMMANDS
 
 ######################################################################
@@ -73,11 +90,12 @@ def load_list(filename):
             self._walker = load_list("testing_list.txt")
         elif subset == "training":
             excludes = load_list("validation_list.txt") + load_list("testing_list.txt")
+            excludes = set(excludes)
             self._walker = [w for w in self._walker if w not in excludes]
 
 
+# Create training and testing split of the data. We do not use validation in this tutorial.
 train_set = SubsetSC("training")
-# valid_set = SubsetSC("validation")
 test_set = SubsetSC("testing")
 
 waveform, sample_rate, label, speaker_id, utterance_number = train_set[0]
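Note: wrapping the exclusion list in `set()` is one of this commit's speedups: the `w not in excludes` test inside the list comprehension becomes an O(1) hash lookup instead of a linear scan over thousands of paths. A standalone sketch with illustrative sizes (not the real dataset):

    import timeit

    # Illustrative sizes: ~15k excluded paths, checked for every walker entry.
    excludes_list = [f"speaker_{i}/clip_{i}.wav" for i in range(15000)]
    excludes_set = set(excludes_list)
    walker = [f"speaker_{i}/clip_{i}.wav" for i in range(15000, 16000)]  # none excluded

    t_list = timeit.timeit(lambda: [w for w in walker if w not in excludes_list], number=1)
    t_set = timeit.timeit(lambda: [w for w in walker if w not in excludes_set], number=1)
    print(f"list: {t_list:.3f}s  set: {t_set:.5f}s")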
@@ -100,7 +118,7 @@
 # Let’s find the list of labels available in the dataset.
 #
 
-labels = list(set(datapoint[2] for datapoint in train_set))
+labels = sorted(list(set(datapoint[2] for datapoint in train_set)))
 labels
 
 
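Note: `sorted()` here is about reproducibility rather than speed: iterating a `set` yields an arbitrary order, so without sorting the label-to-index mapping used by `encode` below could change between runs, silently invalidating a saved model. A small sketch of index-based encoding over the sorted list (hypothetical helper names, mirroring what `encode` presumably does):

    import torch

    labels = sorted({"yes", "no", "go", "stop"})  # ['go', 'no', 'stop', 'yes']

    def label_to_index(word):
        # Position of the word in the sorted label list.
        return torch.tensor(labels.index(word))

    def index_to_label(index):
        return labels[index]

    print(label_to_index("go"), index_to_label(3))  # tensor(0) yes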
@@ -170,6 +188,7 @@ def encode(word):
 # encoding.
 #
 
+
 def pad_sequence(batch):
     # Make all tensors in a batch the same length by padding with zeros
     batch = [item.t() for item in batch]
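Note: the rest of `pad_sequence` falls outside this hunk's context. Clips are transposed to (time, channel) because `torch.nn.utils.rnn.pad_sequence` pads along the first dimension; the remainder presumably pads and permutes back, roughly:

    import torch

    a = torch.rand(1, 15900)  # (channel, time), a slightly short clip
    b = torch.rand(1, 16000)

    batch = [item.t() for item in [a, b]]  # -> (time, channel)
    batch = torch.nn.utils.rnn.pad_sequence(batch, batch_first=True, padding_value=0.0)
    batch = batch.permute(0, 2, 1)  # back to (batch, channel, time)
    print(batch.shape)  # torch.Size([2, 1, 16000])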
@@ -184,9 +203,9 @@ def collate_fn(batch):
 
     tensors, targets = [], []
 
-    # Apply transform and encode
+    # Gather in lists, and encode labels
     for waveform, _, label, *_ in batch:
-        tensors += [transform(waveform)]
+        tensors += [waveform]
         targets += [encode(label)]
 
     # Group the list of tensors into a batched tensor
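Note: this hunk is the heart of the GPU improvement. `collate_fn` runs in CPU worker processes, so resampling there happens one clip at a time; returning raw waveforms and calling `transform` on the assembled batch (see the `data = transform(data)` lines added to `train` and `test` below) resamples the whole batch in one call on the GPU. A sketch, assuming `transform` is the 16 kHz → 8 kHz `Resample` defined earlier in the full file:

    import torch
    import torchaudio

    resample = torchaudio.transforms.Resample(orig_freq=16000, new_freq=8000)

    batch = torch.rand(256, 1, 16000)  # a full batch of 1-second clips
    out = resample(batch)              # Resample accepts arbitrary leading batch dims
    print(out.shape)                   # torch.Size([256, 1, 8000])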
@@ -196,20 +215,31 @@
     return tensors, targets
 
 
-batch_size = 128
+batch_size = 256
 
-if device == 'cuda':
+if device == "cuda":
     num_workers = 1
     pin_memory = True
 else:
     num_workers = 0
     pin_memory = False
 
 train_loader = torch.utils.data.DataLoader(
-    train_set, batch_size=batch_size, shuffle=True, collate_fn=collate_fn, num_workers=num_workers, pin_memory=pin_memory,
+    train_set,
+    batch_size=batch_size,
+    shuffle=True,
+    collate_fn=collate_fn,
+    num_workers=num_workers,
+    pin_memory=pin_memory,
 )
 test_loader = torch.utils.data.DataLoader(
-    test_set, batch_size=batch_size, shuffle=False, collate_fn=collate_fn, num_workers=num_workers, pin_memory=pin_memory,
+    test_set,
+    batch_size=batch_size,
+    shuffle=False,
+    drop_last=False,
+    collate_fn=collate_fn,
+    num_workers=num_workers,
+    pin_memory=pin_memory,
 )
 
 
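Note: on CUDA the loaders use `num_workers=1` to collate batches off the main process and `pin_memory=True` to return batches in page-locked host memory, which speeds up host-to-GPU copies and permits asynchronous ones. A sketch of the async copy that pinned memory enables (the tutorial itself uses plain `.to(device)`):

    import torch

    if torch.cuda.is_available():
        x = torch.rand(256, 1, 16000).pin_memory()  # page-locked host tensor
        y = x.to("cuda", non_blocking=True)         # copy can overlap GPU compute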
@@ -232,21 +262,21 @@
 
 
 class M5(nn.Module):
-    def __init__(self, stride=16, n_channel=32, n_output=35):
+    def __init__(self, n_input=1, n_output=35, stride=16, n_channel=32):
         super().__init__()
-        self.conv1 = nn.Conv1d(1, n_channel, 80, stride=stride)
+        self.conv1 = nn.Conv1d(n_input, n_channel, kernel_size=80, stride=stride)
         self.bn1 = nn.BatchNorm1d(n_channel)
         self.pool1 = nn.MaxPool1d(4)
-        self.conv2 = nn.Conv1d(n_channel, n_channel, 3)
+        self.conv2 = nn.Conv1d(n_channel, n_channel, kernel_size=3)
         self.bn2 = nn.BatchNorm1d(n_channel)
         self.pool2 = nn.MaxPool1d(4)
-        self.conv3 = nn.Conv1d(n_channel, 2*n_channel, 3)
-        self.bn3 = nn.BatchNorm1d(2*n_channel)
+        self.conv3 = nn.Conv1d(n_channel, 2 * n_channel, kernel_size=3)
+        self.bn3 = nn.BatchNorm1d(2 * n_channel)
         self.pool3 = nn.MaxPool1d(4)
-        self.conv4 = nn.Conv1d(2*n_channel, 2*n_channel, 3)
-        self.bn4 = nn.BatchNorm1d(2*n_channel)
+        self.conv4 = nn.Conv1d(2 * n_channel, 2 * n_channel, kernel_size=3)
+        self.bn4 = nn.BatchNorm1d(2 * n_channel)
         self.pool4 = nn.MaxPool1d(4)
-        self.fc1 = nn.Linear(2*n_channel, n_output)
+        self.fc1 = nn.Linear(2 * n_channel, n_output)
 
     def forward(self, x):
         x = self.conv1(x)
@@ -267,7 +297,7 @@ def forward(self, x):
         return F.log_softmax(x, dim=2)
 
 
-model = M5(n_output=len(labels))
+model = M5(n_input=transformed.shape[0], n_output=len(labels))
 model.to(device)
 print(model)
 
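Note: `transformed` is the resampled example waveform from earlier in the tutorial, so `transformed.shape[0]` is its channel count (1 for SpeechCommands); wiring it into `n_input` keeps the first convolution in sync with the data. A quick shape check (a sketch; the forward pass ends in `log_softmax(x, dim=2)`, giving the (batch, 1, n_output) tensor that the loss comment below refers to):

    import torch

    model = M5(n_input=1, n_output=35)
    dummy = torch.rand(256, 1, 8000)  # (batch, channel, time) after resampling
    print(model(dummy).shape)         # torch.Size([256, 1, 35])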
@@ -296,12 +326,9 @@ def count_parameters(model):
 # --------------------------------
 #
 # Now let’s define a training function that will feed our training data
-# into the model and perform the backward pass and optimization steps.
-#
-# Finally, we can train and test the network. We will train the network
-# for ten epochs then reduce the learn rate and train for ten more epochs.
-# The network will be tested after each epoch to see how the accuracy
-# varies during the training.
+# into the model and perform the backward pass and optimization steps. The
+# network will be tested after each epoch to see how the accuracy varies
+# during the training.
 #
 
 
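Note: `scheduler.step()` below relies on an optimizer and scheduler whose definitions fall outside this diff. A plausible setup consistent with the `optim` import and the reduce-the-learning-rate narrative (hyperparameters are assumptions, not shown in this commit):

    import torch.optim as optim

    # Assumed setup, defined earlier in the full file:
    optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0001)
    # Divide the learning rate by 10 after 20 epochs.
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.1)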
@@ -312,6 +339,7 @@ def train(model, epoch, log_interval):
         data = data.to(device)
         target = target.to(device)
 
+        data = transform(data)
         output = model(data)
 
         # negative log-likelihood for a tensor of size (batch x 1 x n_output)
@@ -323,10 +351,10 @@
 
         # print training stats
         if batch_idx % log_interval == 0:
-            print(f'Train Epoch: {epoch} [{batch_idx * len(data)}/{len(train_loader.dataset)} ({100. * batch_idx / len(train_loader):.0f}%)]\tLoss: {loss:.6f}')
+            print(f"Train Epoch: {epoch} [{batch_idx * len(data)}/{len(train_loader.dataset)} ({100. * batch_idx / len(train_loader):.0f}%)]\tLoss: {loss:.6f}")
 
-        if 'pbar' in globals():
-            pbar.update()
+        if "pbar" in globals() and "pbar_update" in globals():
+            pbar.update(pbar_update)
 
 
 ######################################################################
@@ -346,24 +374,26 @@ def argmax(tensor):
 
 def number_of_correct(pred, target):
     # compute number of correct predictions
-    return pred.squeeze().eq(target).cpu().sum().item()
+    return pred.squeeze().eq(target).sum().item()
 
 
 def test(model, epoch):
     model.eval()
     correct = 0
     for data, target in test_loader:
+
         data = data.to(device)
         target = target.to(device)
 
+        data = transform(data)
         output = model(data)
         pred = argmax(output)
         correct += number_of_correct(pred, target)
 
-        if 'pbar' in globals():
-            pbar.update()
+        if "pbar" in globals() and "pbar_update" in globals():
+            pbar.update(pbar_update)
 
-    print(f'\nTest Epoch: {epoch}\tAccuracy: {correct}/{len(test_loader.dataset)} ({100. * correct / len(test_loader.dataset):.0f}%)\n')
+    print(f"\nTest Epoch: {epoch}\tAccuracy: {correct}/{len(test_loader.dataset)} ({100. * correct / len(test_loader.dataset):.0f}%)\n")
 
 
 ######################################################################
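Note: dropping `.cpu()` from `number_of_correct` keeps the comparison on the GPU; `pred` and `target` already live on the same device, and the single `.item()` call performs the one device-to-host transfer actually needed. A standalone sketch:

    import torch

    dev = "cuda" if torch.cuda.is_available() else "cpu"
    pred = torch.tensor([3, 1, 2], device=dev)
    target = torch.tensor([3, 0, 2], device=dev)

    # eq() and sum() stay on the device; .item() syncs once at the end.
    print(pred.eq(target).sum().item())  # 2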
@@ -375,21 +405,28 @@ def test(model, epoch):
 
 log_interval = 20
 n_epoch = 2
+pbar_update = 1 / (len(train_loader) + len(test_loader))
+
+# The transform needs to live on the same device as the model and the data.
+transform = transform.to(device)
 
-with tqdm(total=n_epoch * (len(train_loader) + len(test_loader))) as pbar:
-    for epoch in range(1, n_epoch+1):
+with tqdm(total=n_epoch) as pbar:
+    for epoch in range(1, n_epoch + 1):
         train(model, epoch, log_interval)
         test(model, epoch)
         scheduler.step()
 
 
 ######################################################################
-# Let’s look at the last words in the train set, and see how the model did
-# on it.
+# The network should be more than 65% accurate on the test set after 2
+# epochs, and 85% after 21 epochs. Let’s look at the last words in the
+# train set, and see how the model did on them.
 #
 
+
 def predict(waveform):
-    # Take a waveform and use the model to predict
+    # Use the model to predict the label of the waveform
+    waveform = waveform.to(device)
     waveform = transform(waveform)
     output = model(waveform.unsqueeze(0))
     output = argmax(output).squeeze()
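Note: the progress bar now counts epochs: `total=n_epoch`, with each batch advancing the bar by the fraction `pbar_update`, which `tqdm` accepts as a float. A minimal sketch of the pattern:

    from tqdm import tqdm

    n_epoch, n_batches = 2, 10
    pbar_update = 1 / n_batches

    with tqdm(total=n_epoch) as pbar:
        for epoch in range(n_epoch):
            for _ in range(n_batches):
                pbar.update(pbar_update)  # fractional steps; the bar reads in epochs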
@@ -410,9 +447,9 @@ def predict(waveform):
 for i, (waveform, sample_rate, utterance, *_) in enumerate(test_set):
     output = predict(waveform)
     if output != utterance:
-        ipd.Audio(waveform.numpy(), rate=sample_rate)
-        print(f"Data point #{i}. Expected: {utterance}. Predicted: {output}.")
-        break
+        ipd.Audio(waveform.numpy(), rate=sample_rate)
+        print(f"Data point #{i}. Expected: {utterance}. Predicted: {output}.")
+        break
 else:
     print("All examples in this dataset were correctly classified!")
     print("In this case, let's just look at the last data point")
@@ -421,17 +458,59 @@
 
 
 ######################################################################
-# Feel free to try with one of your own recordings!
+# Feel free to try with one of your own recordings of one of the labels!
+# For example, in Colab, say “Go” while executing the cell below. This
+# will record one second of audio and try to classify it.
 #
 
 
+RECORD = """
+const sleep = time => new Promise(resolve => setTimeout(resolve, time))
+const b2text = blob => new Promise(resolve => {
+  const reader = new FileReader()
+  reader.onloadend = e => resolve(e.srcElement.result)
+  reader.readAsDataURL(blob)
+})
+var record = time => new Promise(async resolve => {
+  stream = await navigator.mediaDevices.getUserMedia({ audio: true })
+  recorder = new MediaRecorder(stream)
+  chunks = []
+  recorder.ondataavailable = e => chunks.push(e.data)
+  recorder.start()
+  await sleep(time)
+  recorder.onstop = async ()=>{
+    blob = new Blob(chunks)
+    text = await b2text(blob)
+    resolve(text)
+  }
+  recorder.stop()
+})
+"""
+
+
+def record(seconds=1):
+    display(ipd.Javascript(RECORD))
+    print(f"Recording started for {seconds} seconds.")
+    s = colab_output.eval_js("record(%d)" % (seconds * 1000))
+    print("Recording ended.")
+    b = b64decode(s.split(",")[1])
+
+    fileformat = "wav"
+    filename = f"_audio.{fileformat}"
+    AudioSegment.from_file(BytesIO(b)).export(filename, format=fileformat)
+
+    return torchaudio.load(filename)
+
+
+waveform, sample_rate = record()
+print(f"Predicted: {predict(waveform)}.")
+ipd.Audio(waveform.numpy(), rate=sample_rate)
+
+
 ######################################################################
 # Conclusion
 # ----------
 #
-# The network should be more than 70% accurate on the test set after 2
-# epochs, 80% after 14 epochs, and 85% after 21 epochs.
-#
 # In this tutorial, we used torchaudio to load a dataset and resample the
 # signal. We have then defined a neural network that we trained to
 # recognize a given command. There are also other data preprocessing

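Note: the `record` helper is Colab-only (it depends on `google.colab.output.eval_js` and the browser's `MediaRecorder`). Outside Colab you could feed `predict` a recording loaded from disk instead; a sketch with a hypothetical filename:

    import torchaudio

    # Hypothetical local file: a short mono recording of one of the 35 labels.
    waveform, sample_rate = torchaudio.load("my_go_recording.wav")
    # If the file is not 16 kHz, resample it first so `transform` sees the rate it expects.
    print(f"Predicted: {predict(waveform)}.")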