==========================================

This tutorial will show you how to correctly format an audio dataset and
- then train/test an audio classifier network on the dataset. First, let’s
- import the common torch packages such as
- `torchaudio <https://github.com/pytorch/audio>`_ and can be
+ then train/test an audio classifier network on the dataset.
+
+ Colab has a GPU option available. In the menu tabs, select “Runtime” then
+ “Change runtime type”. In the pop-up that follows, you can choose GPU.
+ After the change, your runtime should automatically restart (which means
+ information from executed cells disappears).
+
+ First, let’s import the common torch packages such as
+ `torchaudio <https://github.com/pytorch/audio>`_ that can be
installed by following the instructions on the website.

"""

# Uncomment the following line to run in Google Colab
- # !pip install torch
- # !pip install torchaudio
+
+ # GPU:
+ # !pip install torch==1.7.0+cu101 torchvision==0.8.1+cu101 torchaudio==0.7.0 -f https://download.pytorch.org/whl/torch_stable.html
+
+ # CPU:
+ # !pip install torch==1.7.0+cpu torchvision==0.8.1+cpu torchaudio==0.7.0 -f https://download.pytorch.org/whl/torch_stable.html
+
+ # For interactive demo at the end:
+ # !pip install pydub
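
A quick post-install sanity check (a minimal sketch; the versions printed depend
on which of the commands above was run):

    import torch
    import torchaudio

    print(torch.__version__, torchaudio.__version__)
    print(torch.cuda.is_available())  # True only on the GPU runtime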

import os
+ from base64 import b64decode
+ from io import BytesIO

import IPython.display as ipd
import matplotlib.pyplot as plt
import torch.nn.functional as F
import torch.optim as optim
import torchaudio
+ from google.colab import output as colab_output
+ from pydub import AudioSegment
from torchaudio.datasets import SPEECHCOMMANDS

######################################################################
@@ -73,11 +90,12 @@ def load_list(filename):
            self._walker = load_list("testing_list.txt")
        elif subset == "training":
            excludes = load_list("validation_list.txt") + load_list("testing_list.txt")
+             excludes = set(excludes)
            self._walker = [w for w in self._walker if w not in excludes]
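
The added ``set`` conversion matters because the filter above tests membership
once per walker entry: lookups in a list scan every element, while a set hashes.
A toy comparison (hypothetical sizes, just to illustrate the effect):

    excludes_list = [f"speaker_{i}/clip.wav" for i in range(10000)]
    excludes_set = set(excludes_list)

    "speaker_9999/clip.wav" in excludes_list  # O(n): walks the whole list
    "speaker_9999/clip.wav" in excludes_set   # O(1): single hash lookup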


+ # Create training and testing split of the data. We do not use validation in this tutorial.
train_set = SubsetSC("training")
- # valid_set = SubsetSC("validation")
test_set = SubsetSC("testing")

waveform, sample_rate, label, speaker_id, utterance_number = train_set[0]
@@ -92,15 +110,14 @@ def load_list(filename):
print("Shape of waveform: {}".format(waveform.size()))
print("Sample rate of waveform: {}".format(sample_rate))

- plt.figure();
plt.plot(waveform.t().numpy());


######################################################################
# Let’s find the list of labels available in the dataset.
#

- labels = list(set(datapoint[2] for datapoint in train_set))
+ labels = sorted(list(set(datapoint[2] for datapoint in train_set)))
labels

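
Sorting makes the label-to-index mapping reproducible across runs. The ``encode``
function referenced in the next hunk can then be as simple as this sketch (the
``decode`` shown is a hypothetical inverse, for illustration only):

    def encode(word):
        # position of the label in the sorted list, as a tensor
        return torch.tensor(labels.index(word))

    def decode(index):
        # back from index to label string
        return labels[index]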
@@ -170,6 +187,7 @@ def encode(word):
# encoding.
#

+
def pad_sequence(batch):
    # Make all tensors in a batch the same length by padding with zeros
    batch = [item.t() for item in batch]
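
The rest of ``pad_sequence`` is elided here, but the effect is easy to see on toy
data with ``torch.nn.utils.rnn.pad_sequence`` (a sketch with made-up lengths,
using the imports above):

    a = torch.ones(1, 3)  # "waveform" with 3 samples
    b = torch.ones(1, 5)  # "waveform" with 5 samples

    padded = torch.nn.utils.rnn.pad_sequence(
        [a.t(), b.t()], batch_first=True, padding_value=0.0
    )
    print(padded.shape)  # torch.Size([2, 5, 1]): both padded to length 5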
@@ -184,9 +202,9 @@ def collate_fn(batch):

    tensors, targets = [], []

-     # Apply transform and encode
+     # Gather in lists, and encode labels
    for waveform, _, label, *_ in batch:
-         tensors += [transform(waveform)]
+         tensors += [waveform]
        targets += [encode(label)]

    # Group the list of tensors into a batched tensor
@@ -196,20 +214,31 @@ def collate_fn(batch):
    return tensors, targets


- batch_size = 128
+ batch_size = 256

- if device == 'cuda':
+ if device == "cuda":
    num_workers = 1
    pin_memory = True
else:
    num_workers = 0
    pin_memory = False

train_loader = torch.utils.data.DataLoader(
-     train_set, batch_size=batch_size, shuffle=True, collate_fn=collate_fn, num_workers=num_workers, pin_memory=pin_memory,
+     train_set,
+     batch_size=batch_size,
+     shuffle=True,
+     collate_fn=collate_fn,
+     num_workers=num_workers,
+     pin_memory=pin_memory,
)
test_loader = torch.utils.data.DataLoader(
-     test_set, batch_size=batch_size, shuffle=False, collate_fn=collate_fn, num_workers=num_workers, pin_memory=pin_memory,
+     test_set,
+     batch_size=batch_size,
+     shuffle=False,
+     drop_last=False,
+     collate_fn=collate_fn,
+     num_workers=num_workers,
+     pin_memory=pin_memory,
)

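A quick way to sanity-check the loaders is to pull out a single batch and look
at the collated shapes (a sketch; the exact time dimension depends on the
longest waveform in the batch):

    data, target = next(iter(train_loader))
    print(data.shape)    # e.g. torch.Size([256, 1, 16000]) for 1-second, 16 kHz clips
    print(target.shape)  # torch.Size([256]): one encoded label per waveform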
@@ -232,21 +261,21 @@ def collate_fn(batch):


class M5(nn.Module):
-     def __init__(self, stride=16, n_channel=32, n_output=35):
+     def __init__(self, n_input=1, n_output=35, stride=16, n_channel=32):
        super().__init__()
-         self.conv1 = nn.Conv1d(1, n_channel, 80, stride=stride)
+         self.conv1 = nn.Conv1d(n_input, n_channel, kernel_size=80, stride=stride)
        self.bn1 = nn.BatchNorm1d(n_channel)
        self.pool1 = nn.MaxPool1d(4)
-         self.conv2 = nn.Conv1d(n_channel, n_channel, 3)
+         self.conv2 = nn.Conv1d(n_channel, n_channel, kernel_size=3)
        self.bn2 = nn.BatchNorm1d(n_channel)
        self.pool2 = nn.MaxPool1d(4)
-         self.conv3 = nn.Conv1d(n_channel, 2*n_channel, 3)
-         self.bn3 = nn.BatchNorm1d(2*n_channel)
+         self.conv3 = nn.Conv1d(n_channel, 2 * n_channel, kernel_size=3)
+         self.bn3 = nn.BatchNorm1d(2 * n_channel)
        self.pool3 = nn.MaxPool1d(4)
-         self.conv4 = nn.Conv1d(2*n_channel, 2*n_channel, 3)
-         self.bn4 = nn.BatchNorm1d(2*n_channel)
+         self.conv4 = nn.Conv1d(2 * n_channel, 2 * n_channel, kernel_size=3)
+         self.bn4 = nn.BatchNorm1d(2 * n_channel)
        self.pool4 = nn.MaxPool1d(4)
-         self.fc1 = nn.Linear(2*n_channel, n_output)
+         self.fc1 = nn.Linear(2 * n_channel, n_output)

    def forward(self, x):
        x = self.conv1(x)
@@ -267,7 +296,7 @@ def forward(self, x):
        return F.log_softmax(x, dim=2)


- model = M5(n_output=len(labels))
+ model = M5(n_input=transformed.shape[0], n_output=len(labels))
model.to(device)
print(model)

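The ``count_parameters`` helper named in the next hunk sits in the elided
context; a standard implementation is a one-liner along these lines (a sketch,
not necessarily the file's exact wording):

    def count_parameters(model):
        # sum the element counts of all trainable tensors
        return sum(p.numel() for p in model.parameters() if p.requires_grad)

    print("Number of parameters: %s" % count_parameters(model))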
@@ -284,7 +313,7 @@ def count_parameters(model):
# We will use the same optimization technique used in the paper, an Adam
# optimizer with weight decay set to 0.0001. At first, we will train with
# a learning rate of 0.01, but we will use a ``scheduler`` to decrease it
- # to 0.001 during training.
+ # to 0.001 during training after 20 epochs.
#

optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0001)
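
The ``scheduler`` stepped once per epoch further down is defined in elided
context; the 20-epoch drop described above matches a ``StepLR`` schedule along
these lines (a sketch):

    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.1)
    # lr stays at 0.01 for epochs 1-20, then drops to 0.001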
@@ -296,11 +325,9 @@ def count_parameters(model):
# --------------------------------
#
# Now let’s define a training function that will feed our training data
- # into the model and perform the backward pass and optimization steps.
- #
- # Finally, we can train and test the network. We will train the network
- # for ten epochs then reduce the learn rate and train for ten more epochs.
- # The network will be tested after each epoch to see how the accuracy
+ # into the model and perform the backward pass and optimization steps. For
+ # training, the loss we will use is the negative log-likelihood. The
+ # network will then be tested after each epoch to see how the accuracy
# varies during the training.
#

@@ -312,6 +339,8 @@ def train(model, epoch, log_interval):
        data = data.to(device)
        target = target.to(device)

+         # apply transform and model on whole batch directly on device
+         data = transform(data)
        output = model(data)

        # negative log-likelihood for a tensor of size (batch x 1 x n_output)
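
The loss computation itself sits in the elided lines that follow; given
log-probabilities of shape (batch x 1 x n_output), it reduces to a sketch like:

    # squeeze away the middle dimension, then compare against the class indices
    loss = F.nll_loss(output.squeeze(), target)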
@@ -323,10 +352,10 @@ def train(model, epoch, log_interval):

        # print training stats
        if batch_idx % log_interval == 0:
-             print(f'Train Epoch: {epoch} [{batch_idx * len(data)}/{len(train_loader.dataset)} ({100. * batch_idx / len(train_loader):.0f}%)]\tLoss: {loss:.6f}')
+             print(f"Train Epoch: {epoch} [{batch_idx * len(data)}/{len(train_loader.dataset)} ({100. * batch_idx / len(train_loader):.0f}%)]\tLoss: {loss:.6f}")

-         if 'pbar' in globals():
-             pbar.update()
+         if "pbar" in globals() and "pbar_update" in globals():
+             pbar.update(pbar_update)


######################################################################
@@ -346,24 +375,28 @@ def argmax(tensor):

def number_of_correct(pred, target):
    # compute number of correct predictions
-     return pred.squeeze().eq(target).cpu().sum().item()
+     return pred.squeeze().eq(target).sum().item()


def test(model, epoch):
    model.eval()
    correct = 0
    for data, target in test_loader:
+
        data = data.to(device)
        target = target.to(device)

+         # apply transform and model on whole batch directly on device
+         data = transform(data)
        output = model(data)
+
        pred = argmax(output)
        correct += number_of_correct(pred, target)

-         if 'pbar' in globals():
-             pbar.update()
+         if "pbar" in globals() and "pbar_update" in globals():
+             pbar.update(pbar_update)

-     print(f'\nTest Epoch: {epoch}\tAccuracy: {correct}/{len(test_loader.dataset)} ({100. * correct / len(test_loader.dataset):.0f}%)\n')
+     print(f"\nTest Epoch: {epoch}\tAccuracy: {correct}/{len(test_loader.dataset)} ({100. * correct / len(test_loader.dataset):.0f}%)\n")

######################################################################
@@ -375,21 +408,28 @@ def test(model, epoch):

log_interval = 20
n_epoch = 2
+ pbar_update = 1 / (len(train_loader) + len(test_loader))
+
+ # The transform needs to live on the same device as the model and the data.
+ transform = transform.to(device)

- with tqdm(total=n_epoch * (len(train_loader) + len(test_loader))) as pbar:
-     for epoch in range(1, n_epoch + 1):
+ with tqdm(total=n_epoch) as pbar:
+     for epoch in range(1, n_epoch + 1):
        train(model, epoch, log_interval)
        test(model, epoch)
        scheduler.step()


######################################################################
- # Let’s look at the last words in the train set, and see how the model did
- # on it.
+ # The network should be more than 65% accurate on the test set after 2
+ # epochs, and 85% after 21 epochs. Let’s look at the last words in the
+ # train set, and see how the model did on it.
#

+
def predict(waveform):
-     # Take a waveform and use the model to predict
+     # Use the model to predict the label of the waveform
+     waveform = waveform.to(device)
    waveform = transform(waveform)
    output = model(waveform.unsqueeze(0))
    output = argmax(output).squeeze()
@@ -410,9 +450,9 @@ def predict(waveform):
for i, (waveform, sample_rate, utterance, *_) in enumerate(test_set):
    output = predict(waveform)
    if output != utterance:
-         ipd.Audio(waveform.numpy(), rate=sample_rate)
-         print(f"Data point #{i}. Expected: {utterance}. Predicted: {output}.")
-         break
+         ipd.Audio(waveform.numpy(), rate=sample_rate)
+         print(f"Data point #{i}. Expected: {utterance}. Predicted: {output}.")
+         break
else:
    print("All examples in this dataset were correctly classified!")
    print("In this case, let's just look at the last data point")
@@ -421,17 +461,59 @@ def predict(waveform):


######################################################################
- # Feel free to try with one of your own recordings!
+ # Feel free to try with one of your own recordings of one of the labels!
+ # For example, using Colab, say “Go” while executing the cell below. This
+ # will record one second of audio and try to classify it.
#


+ RECORD = """
+ const sleep = time => new Promise(resolve => setTimeout(resolve, time))
+ const b2text = blob => new Promise(resolve => {
+   const reader = new FileReader()
+   reader.onloadend = e => resolve(e.srcElement.result)
+   reader.readAsDataURL(blob)
+ })
+ var record = time => new Promise(async resolve => {
+   stream = await navigator.mediaDevices.getUserMedia({ audio: true })
+   recorder = new MediaRecorder(stream)
+   chunks = []
+   recorder.ondataavailable = e => chunks.push(e.data)
+   recorder.start()
+   await sleep(time)
+   recorder.onstop = async () => {
+     blob = new Blob(chunks)
+     text = await b2text(blob)
+     resolve(text)
+   }
+   recorder.stop()
+ })
+ """
+
+
+ def record(seconds=1):
+     display(ipd.Javascript(RECORD))
+     print(f"Recording started for {seconds} seconds.")
+     s = colab_output.eval_js("record(%d)" % (seconds * 1000))
+     print("Recording ended.")
+     b = b64decode(s.split(",")[1])
+
+     fileformat = "wav"
+     filename = f"_audio.{fileformat}"
+     AudioSegment.from_file(BytesIO(b)).export(filename, format=fileformat)
+
+     return torchaudio.load(filename)
+
+
+ waveform, sample_rate = record()
+ print(f"Predicted: {predict(waveform)}.")
+ ipd.Audio(waveform.numpy(), rate=sample_rate)


######################################################################
# Conclusion
# ----------
#
- # The network should be more than 70% accurate on the test set after 2
- # epochs, 80% after 14 epochs, and 85% after 21 epochs.
- #
# In this tutorial, we used torchaudio to load a dataset and resample the
# signal. We have then defined a neural network that we trained to
# recognize a given command. There are also other data preprocessing