 """

-# Uncomment to run in Google Colab
 # !pip install torch
 # !pip install torchaudio
@@ -66,14 +65,14 @@ def __init__(self, subset: str = None):
             filepath = os.path.join(self._path, "validation_list.txt")
             with open(filepath) as f:
                 validation_list = [
-                    os.path.join(self._path, l.strip()) for l in f.readlines()
+                    os.path.join(self._path, line.strip()) for line in f.readlines()
                 ]

         if subset in ["training", "testing"]:
             filepath = os.path.join(self._path, "testing_list.txt")
             with open(filepath) as f:
                 testing_list = [
-                    os.path.join(self._path, l.strip()) for l in f.readlines()
+                    os.path.join(self._path, line.strip()) for line in f.readlines()
                 ]

         if subset == "validation":
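These two file lists usually drive the subset split: "validation" and "testing" load their list directly, while "training" keeps whatever appears in neither. A minimal, self-contained sketch of that exclusion logic (the paths here are hypothetical, not from the dataset):

    validation_list = ["a/one.wav", "b/two.wav"]
    testing_list = ["c/three.wav"]
    all_files = ["a/one.wav", "b/two.wav", "c/three.wav", "d/four.wav"]

    excludes = set(validation_list + testing_list)
    training_list = [f for f in all_files if f not in excludes]
    print(training_list)  # ['d/four.wav']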
@@ -216,15 +215,16 @@ def collate_fn(batch):
     return tensors, targets


+batch_size = 128
+
 kwargs = (
     {"num_workers": 1, "pin_memory": True} if device == "cuda" else {}
-)  # needed to run on gpu
-
+)  # needed for using datasets on gpu
 train_loader = torch.utils.data.DataLoader(
-    train_set, batch_size=128, shuffle=True, collate_fn=collate_fn, **kwargs
+    train_set, batch_size=batch_size, shuffle=True, collate_fn=collate_fn, **kwargs
 )
 test_loader = torch.utils.data.DataLoader(
-    test_set, batch_size=128, shuffle=False, collate_fn=collate_fn, **kwargs
+    test_set, batch_size=batch_size, shuffle=False, collate_fn=collate_fn, **kwargs
 )

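For reference, the collate_fn named in the hunk header has to equalize waveform lengths before stacking. A minimal sketch, under the assumption that each batch item is a (waveform, sample_rate, label, ...) tuple and that a label list named labels exists earlier in the tutorial:

    import torch
    from torch.nn.utils.rnn import pad_sequence

    def label_to_index(word):
        # position of the word in the label list (assumed defined earlier)
        return torch.tensor(labels.index(word))

    def collate_fn(batch):
        # pad variable-length waveforms to the longest clip in the batch
        tensors = [waveform.t() for waveform, *_ in batch]   # each (time, 1)
        tensors = pad_sequence(tensors, batch_first=True)    # (batch, time, 1)
        tensors = tensors.transpose(1, 2)                    # (batch, 1, time)
        targets = torch.stack([label_to_index(label) for _, _, label, *_ in batch])
        return tensors, targets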
@@ -236,31 +236,32 @@ def collate_fn(batch):
 # the raw audio data. Usually more advanced transforms are applied to the
 # audio data, however CNNs can be used to accurately process the raw data.
 # The specific architecture is modeled after the M5 network architecture
-# described in https://arxiv.org/pdf/1610.00087.pdf. An important aspect
-# of models processing raw audio data is the receptive field of their
-# first layer’s filters. Our model’s first filter is length 80 so when
-# processing audio sampled at 8kHz the receptive field is around 10ms.
-# This size is similar to speech processing applications that often use
-# receptive fields ranging from 20ms to 40ms.
+# described in `this paper <https://arxiv.org/pdf/1610.00087.pdf>`_.
+# An important aspect of models processing raw audio data is the receptive
+# field of their first layer’s filters. Our model’s first filter is length
+# 80 so when processing audio sampled at 8kHz the receptive field is
+# around 10ms (and at 4kHz, around 20ms). This size is similar to speech
+# processing applications that often use receptive fields ranging from
+# 20ms to 40ms.
 #


-class Net(nn.Module):
-    def __init__(self, n_output=10):
-        super(Net, self).__init__()
-        self.conv1 = nn.Conv1d(1, 128, 80, 4)
-        self.bn1 = nn.BatchNorm1d(128)
+class M5(nn.Module):
+    def __init__(self, stride=16, n_channel=32, n_output=35):
+        super().__init__()
+        self.conv1 = nn.Conv1d(1, n_channel, 80, stride=stride)
+        self.bn1 = nn.BatchNorm1d(n_channel)
         self.pool1 = nn.MaxPool1d(4)
-        self.conv2 = nn.Conv1d(128, 128, 3)
-        self.bn2 = nn.BatchNorm1d(128)
+        self.conv2 = nn.Conv1d(n_channel, n_channel, 3)
+        self.bn2 = nn.BatchNorm1d(n_channel)
         self.pool2 = nn.MaxPool1d(4)
-        self.conv3 = nn.Conv1d(128, 256, 3)
-        self.bn3 = nn.BatchNorm1d(256)
+        self.conv3 = nn.Conv1d(n_channel, 2 * n_channel, 3)
+        self.bn3 = nn.BatchNorm1d(2 * n_channel)
         self.pool3 = nn.MaxPool1d(4)
-        self.conv4 = nn.Conv1d(256, 512, 3)
-        self.bn4 = nn.BatchNorm1d(512)
+        self.conv4 = nn.Conv1d(2 * n_channel, 2 * n_channel, 3)
+        self.bn4 = nn.BatchNorm1d(2 * n_channel)
         self.pool4 = nn.MaxPool1d(4)
-        self.fc1 = nn.Linear(512, n_output)
+        self.fc1 = nn.Linear(2 * n_channel, n_output)

     def forward(self, x):
         x = self.conv1(x)
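The millisecond figures in the rewritten comment are just the filter length divided by the sample rate; a quick check:

    filter_length = 80  # first-layer kernel size used by M5 above
    for sample_rate in (8000, 4000):
        print(f"{sample_rate} Hz -> {1000 * filter_length / sample_rate:.0f} ms")
    # 8000 Hz -> 10 ms
    # 4000 Hz -> 20 ms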
@@ -275,15 +276,13 @@ def forward(self, x):
         x = self.conv4(x)
         x = F.relu(self.bn4(x))
         x = self.pool4(x)
-        x = F.avg_pool1d(
-            x, x.shape[-1]
-        )  # input should be 512x14 so this outputs a 512x1
-        x = x.permute(0, 2, 1)  # change the 512x1 to 1x512
+        x = F.avg_pool1d(x, x.shape[-1])
+        x = x.permute(0, 2, 1)
         x = self.fc1(x)
         return F.log_softmax(x, dim=2)


-model = Net(n_output=len(labels))
+model = M5(n_output=len(labels))
 model.to(device)
 print(model)
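A quick shape check confirms the rewritten head still emits one prediction row per clip; this sketch assumes the M5 class above and one second of 8 kHz audio:

    import torch

    m5 = M5()                        # defaults: stride=16, n_channel=32, n_output=35
    dummy = torch.randn(4, 1, 8000)  # batch of four one-second 8 kHz clips
    print(m5(dummy).shape)           # torch.Size([4, 1, 35])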
@@ -304,7 +303,9 @@ def count_parameters(model):
 #

 optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0001)
-scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.1)
+scheduler = optim.lr_scheduler.StepLR(
+    optimizer, step_size=20, gamma=0.1
+)  # reduce the learning rate after 20 epochs by a factor of 10


 ######################################################################
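The new comment can be verified by stepping a throwaway scheduler once per simulated epoch and watching the rate change; a small sketch:

    import torch
    from torch import nn, optim

    opt = optim.Adam([nn.Parameter(torch.zeros(1))], lr=0.01)
    sched = optim.lr_scheduler.StepLR(opt, step_size=20, gamma=0.1)

    for epoch in range(1, 41):
        lr_used = opt.param_groups[0]["lr"]  # rate in effect for this epoch
        opt.step()
        sched.step()
        if epoch in (1, 20, 21, 40):
            print(epoch, lr_used)  # 0.01 through epoch 20, 0.001 afterwards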
@@ -321,11 +322,6 @@ def count_parameters(model):
 #


-def nll_loss(tensor, target):
-    # negative log-likelihood for a tensor of size (batch x 1 x n_output)
-    return F.nll_loss(tensor.squeeze(), target)
-
-
 def train(model, epoch, log_interval):
     model.train()
     for batch_idx, (data, target) in enumerate(train_loader):
@@ -334,7 +330,9 @@ def train(model, epoch, log_interval):
         target = target.to(device)

         output = model(data)
-        loss = nll_loss(output, target)
+
+        # negative log-likelihood for a tensor of size (batch x 1 x n_output)
+        loss = F.nll_loss(output.squeeze(), target)

         optimizer.zero_grad()
         loss.backward()
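The inlined loss relies on squeeze() collapsing the model's singleton middle dimension so that F.nll_loss receives the (batch, n_output) layout it expects; a shape-only sketch:

    import torch
    import torch.nn.functional as F

    batch, n_output = 4, 35
    output = torch.randn(batch, 1, n_output).log_softmax(dim=2)  # mimics the model
    target = torch.randint(0, n_output, (batch,))

    print(output.squeeze().shape)                # torch.Size([4, 35])
    print(F.nll_loss(output.squeeze(), target))  # scalar loss
    # caveat: with batch size 1, squeeze() would also drop the batch dimension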
@@ -385,7 +383,7 @@ def test(model, epoch):
         pbar.update()

     print(
-        f"\nTest set: Accuracy: {correct}/{len(test_loader.dataset)} ({100. * correct / len(test_loader.dataset):.0f}%)\n"
+        f"\nTest Epoch: {epoch}\tAccuracy: {correct}/{len(test_loader.dataset)} ({100. * correct / len(test_loader.dataset):.0f}%)\n"
     )
@@ -412,12 +410,16 @@ def test(model, epoch):

 waveform, sample_rate, utterance, *_ = train_set[-1]
 ipd.Audio(waveform.numpy(), rate=sample_rate)
+
+waveform = transform(waveform)
 output = model(waveform.unsqueeze(0))
 output = argmax(output).squeeze()
 print(f"Expected: {utterance}. Predicted: {labels[output]}.")

 waveform, sample_rate, utterance, *_ = test_set[-1]
 ipd.Audio(waveform.numpy(), rate=sample_rate)
+
+waveform = transform(waveform)
 output = model(waveform.unsqueeze(0))
 output = argmax(output).squeeze()
 print(f"Expected: {utterance}. Predicted: {labels[output]}.")
@@ -427,7 +429,7 @@ def test(model, epoch):
 # Conclusion
 # ----------
 #
-# After one epoch, the network should be more than 65% accurate.
+# After two epochs, the network should be more than 70% accurate.
 #
 # In this tutorial, we used torchaudio to load a dataset and resample the
 # signal. We have then defined a neural network that we trained to