"""

+# Uncomment the following line to run in Google Colab
# !pip install torch
# !pip install torchaudio

@@ -61,44 +62,24 @@ class SubsetSC(SPEECHCOMMANDS):
    def __init__(self, subset: str = None):
        super().__init__("./", download=True)

-        if subset in ["training", "validation"]:
-            filepath = os.path.join(self._path, "validation_list.txt")
-            with open(filepath) as f:
-                validation_list = [
-                    os.path.join(self._path, line.strip()) for line in f.readlines()
-                ]
-
-        if subset in ["training", "testing"]:
-            filepath = os.path.join(self._path, "testing_list.txt")
-            with open(filepath) as f:
-                testing_list = [
-                    os.path.join(self._path, line.strip()) for line in f.readlines()
-                ]
+        def load_list(filename):
+            filepath = os.path.join(self._path, filename)
+            with open(filepath) as fileobj:
+                return [os.path.join(self._path, line.strip()) for line in fileobj]

        if subset == "validation":
-            walker = validation_list
+            self._walker = load_list("validation_list.txt")
        elif subset == "testing":
-            walker = testing_list
-        elif subset in ["training", None]:
-            walker = self._walker  # defined by SPEECHCOMMANDS parent class
-        else:
-            raise ValueError(
-                "When `subset` not None, it must take a value from {'training', 'validation', 'testing'}."
-            )
-
-        if subset == "training":
-            walker = filter(
-                lambda w: not (w in validation_list or w in testing_list), walker
-            )
-
-        self._walker = list(walker)
+            self._walker = load_list("testing_list.txt")
+        elif subset == "training":
+            excludes = load_list("validation_list.txt") + load_list("testing_list.txt")
+            self._walker = [w for w in self._walker if w not in excludes]


train_set = SubsetSC("training")
# valid_set = SubsetSC("validation")
test_set = SubsetSC("testing")

-
waveform, sample_rate, label, speaker_id, utterance_number = train_set[0]

@@ -111,8 +92,8 @@ def __init__(self, subset: str = None):
print("Shape of waveform: {}".format(waveform.size()))
print("Sample rate of waveform: {}".format(sample_rate))

-plt.figure()
-plt.plot(waveform.t().numpy())
+plt.figure();
+plt.plot(waveform.t().numpy());

######################################################################
@@ -147,31 +128,26 @@ def __init__(self, subset: str = None):
# Formatting the Data
# -------------------
#
-# The dataset uses a single channel for audio. We do not need to down mix
-# the audio channels (which we could do for instance by either taking the
-# mean along the channel dimension, or simply keeping only one of the
-# channels).
+# This is a good place to apply transformations to the data. For the
+# waveform, we downsample the audio for faster processing without losing
+# too much of the classification power.
#
-
-
-######################################################################
-# We downsample the audio for faster processing without losing too much of
-# the classification power.
+# We don’t need to apply other transformations here. It is common for some
+# datasets though to have to reduce the number of channels (say from
+# stereo to mono) by either taking the mean along the channel dimension,
+# or simply keeping only one of the channels. Since SpeechCommands uses a
+# single channel for audio, this is not needed here.
#

new_sample_rate = 8000
-transform = torchaudio.transforms.Resample(
-    orig_freq=sample_rate, new_freq=new_sample_rate
-)
+transform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=new_sample_rate)
transformed = transform(waveform)

ipd.Audio(transformed.numpy(), rate=new_sample_rate)

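######################################################################
# As a quick sanity check (a sketch, assuming the clips are the dataset's
# native 16 kHz and roughly one second long), resampling to 8 kHz should cut
# the number of samples per clip roughly in half:
#

print("Before resampling:", waveform.shape)
print("After resampling:", transformed.shape)
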
######################################################################
-# To encode each word, we use a simple language model where we represent
-# each fo the 35 words by its corresponding position of the command in the
-# list above.
+# We are encoding each word using its index in the list of labels.
#

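######################################################################
# The ``encode`` helper used later falls outside the hunks shown here. As a
# sketch (assuming ``labels`` holds the 35 command strings collected from the
# dataset), an index-based encoder and decoder could look like this; the
# names below are placeholders, not the tutorial's own:
#

def encode_sketch(word):
    # Position of the word in the list of labels, returned as a tensor
    return torch.tensor(labels.index(word))


def decode_sketch(index):
    # Inverse mapping: recover the word from its index
    return labels[index]
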
@@ -183,48 +159,57 @@ def encode(word):
######################################################################
-# We now define a collate function that assembles a list of audio
-# recordings and a list of utterances into two batched tensors. In this
-# function, we also apply the resampling, and the encoding. The collate
-# function is used in the pytroch data loader that allow us to iterate
-# over a dataset by batches.
+# To turn a list of data points made of audio recordings and utterances
+# into two batched tensors for the model, we implement a collate function
+# which is used by the PyTorch DataLoader that allows us to iterate over a
+# dataset in batches. Please see `the
+# documentation <https://pytorch.org/docs/stable/data.html#working-with-collate-fn>`__
+# for more information about working with a collate function.
+#
+# In the collate function, we also apply the resampling and the text
+# encoding.
#
-

def pad_sequence(batch):
    # Make all tensors in a batch the same length by padding with zeros
    batch = [item.t() for item in batch]
-    batch = torch.nn.utils.rnn.pad_sequence(batch, batch_first=True, padding_value=0.0)
+    batch = torch.nn.utils.rnn.pad_sequence(batch, batch_first=True, padding_value=0.)
    return batch.permute(0, 2, 1)


def collate_fn(batch):

    # A data tuple has the form:
    # waveform, sample_rate, label, speaker_id, utterance_number
-    # and so we are only interested in item 0 and 2

-    # Apply transforms to waveforms
-    tensors = [transform(b[0]) for b in batch]
-    tensors = pad_sequence(tensors)
+    tensors, targets = [], []
+
+    # Apply transform and encode
+    for waveform, _, label, *_ in batch:
+        tensors += [transform(waveform)]
+        targets += [encode(label)]

-    # Apply transform to target utterance
-    targets = [encode(b[2]) for b in batch]
+    # Group the list of tensors into a batched tensor
+    tensors = pad_sequence(tensors)

    targets = torch.stack(targets)

    return tensors, targets


batch_size = 128

-kwargs = (
-    {"num_workers": 1, "pin_memory": True} if device == "cuda" else {}
-)  # needed for using datasets on gpu
+if device == 'cuda':
+    num_workers = 1
+    pin_memory = True
+else:
+    num_workers = 0
+    pin_memory = False
+
train_loader = torch.utils.data.DataLoader(
-    train_set, batch_size=batch_size, shuffle=True, collate_fn=collate_fn, **kwargs
+    train_set, batch_size=batch_size, shuffle=True, collate_fn=collate_fn, num_workers=num_workers, pin_memory=pin_memory,
)
test_loader = torch.utils.data.DataLoader(
-    test_set, batch_size=batch_size, shuffle=False, collate_fn=collate_fn, **kwargs
+    test_set, batch_size=batch_size, shuffle=False, collate_fn=collate_fn, num_workers=num_workers, pin_memory=pin_memory,
)

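######################################################################
# For illustration, one could pull a single batch through the loader here to
# confirm the shapes the collate function produces: padded waveforms of shape
# [batch_size, 1, time] and a 1-D tensor of encoded targets.
#

example_tensors, example_targets = next(iter(train_loader))
print(example_tensors.shape)  # e.g. torch.Size([128, 1, 8000])
print(example_targets.shape)  # e.g. torch.Size([128])
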
@@ -255,13 +240,13 @@ def __init__(self, stride=16, n_channel=32, n_output=35):
        self.conv2 = nn.Conv1d(n_channel, n_channel, 3)
        self.bn2 = nn.BatchNorm1d(n_channel)
        self.pool2 = nn.MaxPool1d(4)
-        self.conv3 = nn.Conv1d(n_channel, 2 * n_channel, 3)
-        self.bn3 = nn.BatchNorm1d(2 * n_channel)
+        self.conv3 = nn.Conv1d(n_channel, 2 * n_channel, 3)
+        self.bn3 = nn.BatchNorm1d(2 * n_channel)
        self.pool3 = nn.MaxPool1d(4)
-        self.conv4 = nn.Conv1d(2 * n_channel, 2 * n_channel, 3)
-        self.bn4 = nn.BatchNorm1d(2 * n_channel)
+        self.conv4 = nn.Conv1d(2 * n_channel, 2 * n_channel, 3)
+        self.bn4 = nn.BatchNorm1d(2 * n_channel)
        self.pool4 = nn.MaxPool1d(4)
-        self.fc1 = nn.Linear(2 * n_channel, n_output)
+        self.fc1 = nn.Linear(2 * n_channel, n_output)

    def forward(self, x):
        x = self.conv1(x)
@@ -303,9 +288,7 @@ def count_parameters(model):
#

optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0001)
-scheduler = optim.lr_scheduler.StepLR(
-    optimizer, step_size=20, gamma=0.1
-)  # reduce the learning after 20 epochs by a factor of 10
+scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.1)  # reduce the learning rate after 20 epochs by a factor of 10

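######################################################################
# For illustration: with ``lr=0.01``, ``step_size=20`` and ``gamma=0.1``, and
# one ``scheduler.step()`` per epoch, epochs 1 through 20 run at a learning
# rate of 0.01 and later epochs at 0.001.
#

print(optimizer.param_groups[0]["lr"])  # 0.01 before the first scheduler step
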
######################################################################
@@ -340,11 +323,9 @@ def train(model, epoch, log_interval):
        # print training stats
        if batch_idx % log_interval == 0:
-            print(
-                f"Train Epoch: {epoch} [{batch_idx * len(data)}/{len(train_loader.dataset)} ({100. * batch_idx / len(train_loader):.0f}%)]\tLoss: {loss:.6f}"
-            )
+            print(f'Train Epoch: {epoch} [{batch_idx * len(data)}/{len(train_loader.dataset)} ({100. * batch_idx / len(train_loader):.0f}%)]\tLoss: {loss:.6f}')

-        if "pbar" in globals():
+        if 'pbar' in globals():
            pbar.update()

@@ -379,12 +360,10 @@ def test(model, epoch):
        pred = argmax(output)
        correct += number_of_correct(pred, target)

-        if "pbar" in globals():
-            pbar.update()
+        if 'pbar' in globals():
+            pbar.update()

-    print(
-        f"\nTest Epoch: {epoch}\tAccuracy: {correct}/{len(test_loader.dataset)} ({100. * correct / len(test_loader.dataset):.0f}%)\n"
-    )
+    print(f'\nTest Epoch: {epoch}\tAccuracy: {correct}/{len(test_loader.dataset)} ({100. * correct / len(test_loader.dataset):.0f}%)\n')

######################################################################
@@ -398,38 +377,60 @@ def test(model, epoch):
n_epoch = 2

with tqdm(total=n_epoch * (len(train_loader) + len(test_loader))) as pbar:
-    for epoch in range(1, n_epoch + 1):
+    for epoch in range(1, n_epoch + 1):
        train(model, epoch, log_interval)
        test(model, epoch)
        scheduler.step()

######################################################################
-# Let’s try looking at one of the last words in the train and test set.
+# Let’s look at the last word in the train set, and see how the model did
+# on it.
#


+def predict(waveform):
+    # Take a waveform and use the model to predict
+    waveform = transform(waveform)
+    output = model(waveform.unsqueeze(0))
+    output = argmax(output).squeeze()
+    output = labels[output]
+    return output
+
+
waveform, sample_rate, utterance, *_ = train_set[-1]
ipd.Audio(waveform.numpy(), rate=sample_rate)

-waveform = transform(waveform)
-output = model(waveform.unsqueeze(0))
-output = argmax(output).squeeze()
-print(f"Expected: {utterance}. Predicted: {labels[output]}.")
+print(f"Expected: {utterance}. Predicted: {predict(waveform)}.")

-waveform, sample_rate, utterance, *_ = test_set[-1]
-ipd.Audio(waveform.numpy(), rate=sample_rate)

-waveform = transform(waveform)
-output = model(waveform.unsqueeze(0))
-output = argmax(output).squeeze()
-print(f"Expected: {utterance}. Predicted: {labels[output]}.")
+######################################################################
+# Let’s find an example that isn’t classified correctly, if there is one.
+#
+
+for i, (waveform, sample_rate, utterance, *_) in enumerate(test_set):
+    output = predict(waveform)
+    if output != utterance:
+        ipd.Audio(waveform.numpy(), rate=sample_rate)
+        print(f"Data point #{i}. Expected: {utterance}. Predicted: {output}.")
+        break
+else:
+    print("All examples in this dataset were correctly classified!")
+    print("In this case, let's just look at the last data point")
+    ipd.Audio(waveform.numpy(), rate=sample_rate)
+    print(f"Data point #{i}. Expected: {utterance}. Predicted: {output}.")
+
+
+######################################################################
+# Feel free to try with one of your own recordings!
+#

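######################################################################
# As a sketch of how that might look: the filename below is only a
# placeholder, and this assumes a one-channel recording at the dataset's
# 16 kHz rate so that the ``transform`` defined above resamples it correctly.
#

# your_waveform, your_sample_rate = torchaudio.load("my_recording.wav")
# print(predict(your_waveform))
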
######################################################################
# Conclusion
# ----------
#
-# After two epochs, the network should be more than 70% accurate.
+# The network should be more than 70% accurate on the test set after 2
+# epochs, 80% after 14 epochs, and 85% after 21 epochs.
#
# In this tutorial, we used torchaudio to load a dataset and resample the
# signal. We then defined a neural network that we trained to