3
3
==========================================
4
4
5
5
This tutorial will show you how to correctly format an audio dataset and
6
- then train/test an audio classifier network on the dataset. First, let’s
7
- import the common torch packages such as
8
- ``torchaudio <https://github.com/pytorch/audio>``\ \_ and can be
6
+ then train/test an audio classifier network on the dataset.
7
+
8
+ Colab has a GPU option available. In the menu tabs, select “Runtime” then
9
+ “Change runtime type”. In the pop-up that follows, you can choose GPU.
10
+ After the change, your runtime should automatically restart (which means
11
+ information from executed cells disappears).
12
+
13
+ First, let’s import the common torch packages such as
14
+ `torchaudio <https://github.com/pytorch/audio>`__ that can be
9
15
installed by following the instructions on the website.
10
16
11
17
"""
12
18
13
19
# Uncomment the appropriate line below to run in Google Colab
14
- # !pip install torch
15
- # !pip install torchaudio
20
+
21
+ # GPU:
22
+ # !pip install torch==1.7.0+cu101 torchvision==0.8.1+cu101 torchaudio==0.7.0 -f https://download.pytorch.org/whl/torch_stable.html
23
+
24
+ # CPU:
25
+ # !pip install torch==1.7.0+cpu torchvision==0.8.1+cpu torchaudio==0.7.0 -f https://download.pytorch.org/whl/torch_stable.html
26
+
27
+ # For interactive demo at the end:
28
+ # !pip install pydub
16
29
17
30
import os
31
+ from base64 import b64decode
32
+ from io import BytesIO
18
33
19
34
import IPython .display as ipd
20
35
import matplotlib .pyplot as plt
25
40
import torch .nn .functional as F
26
41
import torch .optim as optim
27
42
import torchaudio
43
+ from google .colab import output as colab_output
44
+ from pydub import AudioSegment
28
45
from torchaudio .datasets import SPEECHCOMMANDS
29
46
30
47
######################################################################
@@ -73,11 +90,12 @@ def load_list(filename):
73
90
self ._walker = load_list ("testing_list.txt" )
74
91
elif subset == "training" :
75
92
excludes = load_list ("validation_list.txt" ) + load_list ("testing_list.txt" )
93
+ excludes = set (excludes )
76
94
self ._walker = [w for w in self ._walker if w not in excludes ]
77
95
78
96
97
+ # Create training and testing split of the data. We do not use validation in this tutorial.
79
98
train_set = SubsetSC ("training" )
80
- # valid_set = SubsetSC("validation")
81
99
test_set = SubsetSC ("testing" )
82
100
83
101
waveform , sample_rate , label , speaker_id , utterance_number = train_set [0 ]
@@ -100,7 +118,7 @@ def load_list(filename):
100
118
# Let’s find the list of labels available in the dataset.
101
119
#
102
120
103
- labels = list (set (datapoint [2 ] for datapoint in train_set ))
121
+ labels = sorted ( list (set (datapoint [2 ] for datapoint in train_set ) ))
104
122
labels
105
123
106
124
@@ -170,6 +188,7 @@ def encode(word):
170
188
# encoding.
171
189
#
172
190
191
+
173
192
def pad_sequence (batch ):
174
193
# Make all tensors in a batch the same length by padding with zeros
175
194
batch = [item .t () for item in batch ]
@@ -184,9 +203,9 @@ def collate_fn(batch):
184
203
185
204
tensors , targets = [], []
186
205
187
- # Apply transform and encode
206
+ # Gather in lists, and encode labels
188
207
for waveform , _ , label , * _ in batch :
189
- tensors += [transform ( waveform ) ]
208
+ tensors += [waveform ]
190
209
targets += [encode (label )]
191
210
192
211
# Group the list of tensors into a batched tensor
@@ -196,20 +215,31 @@ def collate_fn(batch):
196
215
return tensors , targets
197
216
198
217
199
- batch_size = 128
218
+ batch_size = 256
200
219
201
- if device == ' cuda' :
220
+ if device == " cuda" :
202
221
num_workers = 1
203
222
pin_memory = True
204
223
else :
205
224
num_workers = 0
206
225
pin_memory = False
207
226
208
227
train_loader = torch .utils .data .DataLoader (
209
- train_set , batch_size = batch_size , shuffle = True , collate_fn = collate_fn , num_workers = num_workers , pin_memory = pin_memory ,
228
+ train_set ,
229
+ batch_size = batch_size ,
230
+ shuffle = True ,
231
+ collate_fn = collate_fn ,
232
+ num_workers = num_workers ,
233
+ pin_memory = pin_memory ,
210
234
)
211
235
test_loader = torch .utils .data .DataLoader (
212
- test_set , batch_size = batch_size , shuffle = False , collate_fn = collate_fn , num_workers = num_workers , pin_memory = pin_memory ,
236
+ test_set ,
237
+ batch_size = batch_size ,
238
+ shuffle = False ,
239
+ drop_last = False ,
240
+ collate_fn = collate_fn ,
241
+ num_workers = num_workers ,
242
+ pin_memory = pin_memory ,
213
243
)
214
244
215
245
@@ -232,21 +262,21 @@ def collate_fn(batch):
232
262
233
263
234
264
class M5 (nn .Module ):
235
- def __init__ (self , stride = 16 , n_channel = 32 , n_output = 35 ):
265
+ def __init__ (self , n_input = 1 , n_output = 35 , stride = 16 , n_channel = 32 ):
236
266
super ().__init__ ()
237
- self .conv1 = nn .Conv1d (1 , n_channel , 80 , stride = stride )
267
+ self .conv1 = nn .Conv1d (n_input , n_channel , kernel_size = 80 , stride = stride )
238
268
self .bn1 = nn .BatchNorm1d (n_channel )
239
269
self .pool1 = nn .MaxPool1d (4 )
240
- self .conv2 = nn .Conv1d (n_channel , n_channel , 3 )
270
+ self .conv2 = nn .Conv1d (n_channel , n_channel , kernel_size = 3 )
241
271
self .bn2 = nn .BatchNorm1d (n_channel )
242
272
self .pool2 = nn .MaxPool1d (4 )
243
- self .conv3 = nn .Conv1d (n_channel , 2 * n_channel , 3 )
244
- self .bn3 = nn .BatchNorm1d (2 * n_channel )
273
+ self .conv3 = nn .Conv1d (n_channel , 2 * n_channel , kernel_size = 3 )
274
+ self .bn3 = nn .BatchNorm1d (2 * n_channel )
245
275
self .pool3 = nn .MaxPool1d (4 )
246
- self .conv4 = nn .Conv1d (2 * n_channel , 2 * n_channel , 3 )
247
- self .bn4 = nn .BatchNorm1d (2 * n_channel )
276
+ self .conv4 = nn .Conv1d (2 * n_channel , 2 * n_channel , kernel_size = 3 )
277
+ self .bn4 = nn .BatchNorm1d (2 * n_channel )
248
278
self .pool4 = nn .MaxPool1d (4 )
249
- self .fc1 = nn .Linear (2 * n_channel , n_output )
279
+ self .fc1 = nn .Linear (2 * n_channel , n_output )
250
280
251
281
def forward (self , x ):
252
282
x = self .conv1 (x )
@@ -267,7 +297,7 @@ def forward(self, x):
267
297
return F .log_softmax (x , dim = 2 )
268
298
269
299
270
- model = M5 (n_output = len (labels ))
300
+ model = M5 (n_input = transformed . shape [ 0 ], n_output = len (labels ))
271
301
model .to (device )
272
302
print (model )
273
303
@@ -296,12 +326,9 @@ def count_parameters(model):
296
326
# --------------------------------
297
327
#
298
328
# Now let’s define a training function that will feed our training data
299
- # into the model and perform the backward pass and optimization steps.
300
- #
301
- # Finally, we can train and test the network. We will train the network
302
- # for ten epochs then reduce the learn rate and train for ten more epochs.
303
- # The network will be tested after each epoch to see how the accuracy
304
- # varies during the training.
329
+ # into the model and perform the backward pass and optimization steps. The
330
+ # network will be tested after each epoch to see how the accuracy varies
331
+ # during the training.
305
332
#
306
333
307
334
@@ -312,6 +339,7 @@ def train(model, epoch, log_interval):
312
339
data = data .to (device )
313
340
target = target .to (device )
314
341
342
+ data = transform (data )
315
343
output = model (data )
316
344
317
345
# negative log-likelihood for a tensor of size (batch x 1 x n_output)
@@ -323,10 +351,10 @@ def train(model, epoch, log_interval):
323
351
324
352
# print training stats
325
353
if batch_idx % log_interval == 0 :
326
- print (f' Train Epoch: { epoch } [{ batch_idx * len (data )} /{ len (train_loader .dataset )} ({ 100. * batch_idx / len (train_loader ):.0f} %)]\t Loss: { loss :.6f} ' )
354
+ print (f" Train Epoch: { epoch } [{ batch_idx * len (data )} /{ len (train_loader .dataset )} ({ 100. * batch_idx / len (train_loader ):.0f} %)]\t Loss: { loss :.6f} " )
327
355
328
- if ' pbar' in globals ():
329
- pbar .update ()
356
+ if " pbar" in globals () and "pbar_update" in globals ():
357
+ pbar .update (pbar_update )
330
358
331
359
332
360
######################################################################
@@ -346,24 +374,26 @@ def argmax(tensor):
346
374
347
375
def number_of_correct (pred , target ):
348
376
# compute number of correct predictions
349
- return pred .squeeze ().eq (target ).cpu (). sum ().item ()
377
+ return pred .squeeze ().eq (target ).sum ().item ()
350
378
351
379
352
380
def test (model , epoch ):
353
381
model .eval ()
354
382
correct = 0
355
383
for data , target in test_loader :
384
+
356
385
data = data .to (device )
357
386
target = target .to (device )
358
387
388
+ data = transform (data )
359
389
output = model (data )
360
390
pred = argmax (output )
361
391
correct += number_of_correct (pred , target )
362
392
363
- if ' pbar' in globals ():
364
- pbar .update ()
393
+ if " pbar" in globals () and "pbar_update" in globals ():
394
+ pbar .update (pbar_update )
365
395
366
- print (f' \n Test Epoch: { epoch } \t Accuracy: { correct } /{ len (test_loader .dataset )} ({ 100. * correct / len (test_loader .dataset ):.0f} %)\n ' )
396
+ print (f" \n Test Epoch: { epoch } \t Accuracy: { correct } /{ len (test_loader .dataset )} ({ 100. * correct / len (test_loader .dataset ):.0f} %)\n " )
367
397
368
398
369
399
######################################################################
@@ -375,21 +405,28 @@ def test(model, epoch):
375
405
376
406
log_interval = 20
377
407
n_epoch = 2
408
+ pbar_update = 1 / (len (train_loader ) + len (test_loader ))
409
+
410
+ # The transform needs to live on the same device as the model and the data.
411
+ transform = transform .to (device )
378
412
379
- with tqdm (total = n_epoch * ( len ( train_loader ) + len ( test_loader )) ) as pbar :
380
- for epoch in range (1 , n_epoch + 1 ):
413
+ with tqdm (total = n_epoch ) as pbar :
414
+ for epoch in range (1 , n_epoch + 1 ):
381
415
train (model , epoch , log_interval )
382
416
test (model , epoch )
383
417
scheduler .step ()
384
418
385
419
386
420
######################################################################
387
- # Let’s look at the last words in the train set, and see how the model did
388
- # on it.
421
+ # The network should be more than 65% accurate on the test set after 2
422
+ # epochs, and 85% after 21 epochs. Let’s look at the last words in the
423
+ # train set, and see how the model did on it.
389
424
#
390
425
426
+
391
427
def predict (waveform ):
392
- # Take a waveform and use the model to predict
428
+ # Use the model to predict the label of the waveform
429
+ waveform = waveform .to (device )
393
430
waveform = transform (waveform )
394
431
output = model (waveform .unsqueeze (0 ))
395
432
output = argmax (output ).squeeze ()
@@ -410,9 +447,9 @@ def predict(waveform):
410
447
for i , (waveform , sample_rate , utterance , * _ ) in enumerate (test_set ):
411
448
output = predict (waveform )
412
449
if output != utterance :
413
- ipd .Audio (waveform .numpy (), rate = sample_rate )
414
- print (f"Data point #{ i } . Expected: { utterance } . Predicted: { output } ." )
415
- break
450
+ ipd .Audio (waveform .numpy (), rate = sample_rate )
451
+ print (f"Data point #{ i } . Expected: { utterance } . Predicted: { output } ." )
452
+ break
416
453
else :
417
454
print ("All examples in this dataset were correctly classified!" )
418
455
print ("In this case, let's just look at the last data point" )
@@ -421,17 +458,59 @@ def predict(waveform):
421
458
422
459
423
460
######################################################################
424
- # Feel free to try with one of your own recordings!
461
+ # Feel free to try with one of your own recordings of one of the labels!
462
+ # For example, in Colab, say “Go” while executing the cell below. This
463
+ # will record one second of audio and try to classify it.
425
464
#
426
465
427
466
467
+ RECORD = """
468
+ const sleep = time => new Promise(resolve => setTimeout(resolve, time))
469
+ const b2text = blob => new Promise(resolve => {
470
+ const reader = new FileReader()
471
+ reader.onloadend = e => resolve(e.srcElement.result)
472
+ reader.readAsDataURL(blob)
473
+ })
474
+ var record = time => new Promise(async resolve => {
475
+ stream = await navigator.mediaDevices.getUserMedia({ audio: true })
476
+ recorder = new MediaRecorder(stream)
477
+ chunks = []
478
+ recorder.ondataavailable = e => chunks.push(e.data)
479
+ recorder.start()
480
+ await sleep(time)
481
+ recorder.onstop = async ()=>{
482
+ blob = new Blob(chunks)
483
+ text = await b2text(blob)
484
+ resolve(text)
485
+ }
486
+ recorder.stop()
487
+ })
488
+ """
489
+
490
+
491
+ def record (seconds = 1 ):
492
+ display (ipd .Javascript (RECORD ))
493
+ print (f"Recording started for { seconds } seconds." )
494
+ s = colab_output .eval_js ("record(%d)" % (seconds * 1000 ))
495
+ print ("Recording ended." )
496
+ b = b64decode (s .split ("," )[1 ])
497
+
498
+ fileformat = "wav"
499
+ filename = f"_audio.{ fileformat } "
500
+ AudioSegment .from_file (BytesIO (b )).export (filename , format = fileformat )
501
+
502
+ return torchaudio .load (filename )
503
+
504
+
505
+ waveform , sample_rate = record ()
506
+ print (f"Predicted: { predict (waveform )} ." )
507
+ ipd .Audio (waveform .numpy (), rate = sample_rate )
508
+
509
+
428
510
######################################################################
429
511
# Conclusion
430
512
# ----------
431
513
#
432
- # The network should be more than 70% accurate on the test set after 2
433
- # epochs, 80% after 14 epochs, and 85% after 21 epochs.
434
- #
435
514
# In this tutorial, we used torchaudio to load a dataset and resample the
436
515
# signal. We have then defined a neural network that we trained to
437
516
# recognize a given command. There are also other data preprocessing
0 commit comments