Skip to content

Commit 5e728e9

Browse files
committed
Update with a few parameters tuned. The model now takes less than 10 minutes to run.
1 parent 78af54e commit 5e728e9

File tree

1 file changed

+41
-39
lines changed

1 file changed

+41
-39
lines changed

intermediate_source/speech_command_recognition_with_torchaudio.py

Lines changed: 41 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
1111
"""
1212

13-
# Uncomment to run in Google Colab
1413
# !pip install torch
1514
# !pip install torchaudio
1615

@@ -66,14 +65,14 @@ def __init__(self, subset: str = None):
6665
filepath = os.path.join(self._path, "validation_list.txt")
6766
with open(filepath) as f:
6867
validation_list = [
69-
os.path.join(self._path, l.strip()) for l in f.readlines()
68+
os.path.join(self._path, line.strip()) for line in f.readlines()
7069
]
7170

7271
if subset in ["training", "testing"]:
7372
filepath = os.path.join(self._path, "testing_list.txt")
7473
with open(filepath) as f:
7574
testing_list = [
76-
os.path.join(self._path, l.strip()) for l in f.readlines()
75+
os.path.join(self._path, line.strip()) for line in f.readlines()
7776
]
7877

7978
if subset == "validation":
@@ -216,15 +215,16 @@ def collate_fn(batch):
216215
return tensors, targets
217216

218217

218+
batch_size = 128
219+
219220
kwargs = (
220221
{"num_workers": 1, "pin_memory": True} if device == "cuda" else {}
221-
) # needed to run on gpu
222-
222+
) # needed for using datasets on gpu
223223
train_loader = torch.utils.data.DataLoader(
224-
train_set, batch_size=128, shuffle=True, collate_fn=collate_fn, **kwargs
224+
train_set, batch_size=batch_size, shuffle=True, collate_fn=collate_fn, **kwargs
225225
)
226226
test_loader = torch.utils.data.DataLoader(
227-
test_set, batch_size=128, shuffle=False, collate_fn=collate_fn, **kwargs
227+
test_set, batch_size=batch_size, shuffle=False, collate_fn=collate_fn, **kwargs
228228
)
229229

230230

@@ -236,31 +236,32 @@ def collate_fn(batch):
236236
# the raw audio data. Usually more advanced transforms are applied to the
237237
# audio data, however CNNs can be used to accurately process the raw data.
238238
# The specific architecture is modeled after the M5 network architecture
239-
# described in https://arxiv.org/pdf/1610.00087.pdf. An important aspect
240-
# of models processing raw audio data is the receptive field of their
241-
# first layer’s filters. Our model’s first filter is length 80 so when
242-
# processing audio sampled at 8kHz the receptive field is around 10ms.
243-
# This size is similar to speech processing applications that often use
244-
# receptive fields ranging from 20ms to 40ms.
239+
# described in `this paper <https://arxiv.org/pdf/1610.00087.pdf>`__.
240+
# An important aspect of models processing raw audio data is the receptive
241+
# field of their first layer’s filters. Our model’s first filter is length
242+
# 80 so when processing audio sampled at 8kHz the receptive field is
243+
# around 10ms (and at 4kHz, around 20 ms). This size is similar to speech
244+
# processing applications that often use receptive fields ranging from
245+
# 20ms to 40ms.
245246
#
246247

247248

248-
class Net(nn.Module):
249-
def __init__(self, n_output=10):
250-
super(Net, self).__init__()
251-
self.conv1 = nn.Conv1d(1, 128, 80, 4)
252-
self.bn1 = nn.BatchNorm1d(128)
249+
class M5(nn.Module):
250+
def __init__(self, stride=16, n_channel=32, n_output=35):
251+
super().__init__()
252+
self.conv1 = nn.Conv1d(1, n_channel, 80, stride=stride)
253+
self.bn1 = nn.BatchNorm1d(n_channel)
253254
self.pool1 = nn.MaxPool1d(4)
254-
self.conv2 = nn.Conv1d(128, 128, 3)
255-
self.bn2 = nn.BatchNorm1d(128)
255+
self.conv2 = nn.Conv1d(n_channel, n_channel, 3)
256+
self.bn2 = nn.BatchNorm1d(n_channel)
256257
self.pool2 = nn.MaxPool1d(4)
257-
self.conv3 = nn.Conv1d(128, 256, 3)
258-
self.bn3 = nn.BatchNorm1d(256)
258+
self.conv3 = nn.Conv1d(n_channel, 2 * n_channel, 3)
259+
self.bn3 = nn.BatchNorm1d(2 * n_channel)
259260
self.pool3 = nn.MaxPool1d(4)
260-
self.conv4 = nn.Conv1d(256, 512, 3)
261-
self.bn4 = nn.BatchNorm1d(512)
261+
self.conv4 = nn.Conv1d(2 * n_channel, 2 * n_channel, 3)
262+
self.bn4 = nn.BatchNorm1d(2 * n_channel)
262263
self.pool4 = nn.MaxPool1d(4)
263-
self.fc1 = nn.Linear(512, n_output)
264+
self.fc1 = nn.Linear(2 * n_channel, n_output)
264265

265266
def forward(self, x):
266267
x = self.conv1(x)
@@ -275,15 +276,13 @@ def forward(self, x):
275276
x = self.conv4(x)
276277
x = F.relu(self.bn4(x))
277278
x = self.pool4(x)
278-
x = F.avg_pool1d(
279-
x, x.shape[-1]
280-
) # input should be 512x14 so this outputs a 512x1
281-
x = x.permute(0, 2, 1) # change the 512x1 to 1x512
279+
x = F.avg_pool1d(x, x.shape[-1])
280+
x = x.permute(0, 2, 1)
282281
x = self.fc1(x)
283282
return F.log_softmax(x, dim=2)
284283

285284

286-
model = Net(n_output=len(labels))
285+
model = M5(n_output=len(labels))
287286
model.to(device)
288287
print(model)
289288

@@ -304,7 +303,9 @@ def count_parameters(model):
304303
#
305304

306305
optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0001)
307-
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.1)
306+
scheduler = optim.lr_scheduler.StepLR(
307+
optimizer, step_size=20, gamma=0.1
308+
) # reduce the learning rate after 20 epochs by a factor of 10
308309

309310

310311
######################################################################
@@ -321,11 +322,6 @@ def count_parameters(model):
321322
#
322323

323324

324-
def nll_loss(tensor, target):
325-
# negative log-likelihood for a tensor of size (batch x 1 x n_output)
326-
return F.nll_loss(tensor.squeeze(), target)
327-
328-
329325
def train(model, epoch, log_interval):
330326
model.train()
331327
for batch_idx, (data, target) in enumerate(train_loader):
@@ -334,7 +330,9 @@ def train(model, epoch, log_interval):
334330
target = target.to(device)
335331

336332
output = model(data)
337-
loss = nll_loss(output, target)
333+
334+
# negative log-likelihood for a tensor of size (batch x 1 x n_output)
335+
loss = F.nll_loss(output.squeeze(), target)
338336

339337
optimizer.zero_grad()
340338
loss.backward()
@@ -385,7 +383,7 @@ def test(model, epoch):
385383
pbar.update()
386384

387385
print(
388-
f"\nTest set: Accuracy: {correct}/{len(test_loader.dataset)} ({100. * correct / len(test_loader.dataset):.0f}%)\n"
386+
f"\nTest Epoch: {epoch}\tAccuracy: {correct}/{len(test_loader.dataset)} ({100. * correct / len(test_loader.dataset):.0f}%)\n"
389387
)
390388

391389

@@ -412,12 +410,16 @@ def test(model, epoch):
412410

413411
waveform, sample_rate, utterance, *_ = train_set[-1]
414412
ipd.Audio(waveform.numpy(), rate=sample_rate)
413+
414+
waveform = transform(waveform)
415415
output = model(waveform.unsqueeze(0))
416416
output = argmax(output).squeeze()
417417
print(f"Expected: {utterance}. Predicted: {labels[output]}.")
418418

419419
waveform, sample_rate, utterance, *_ = test_set[-1]
420420
ipd.Audio(waveform.numpy(), rate=sample_rate)
421+
422+
waveform = transform(waveform)
421423
output = model(waveform.unsqueeze(0))
422424
output = argmax(output).squeeze()
423425
print(f"Expected: {utterance}. Predicted: {labels[output]}.")
@@ -427,7 +429,7 @@ def test(model, epoch):
427429
# Conclusion
428430
# ----------
429431
#
430-
# After one epoch, the network should be more than 65% accurate.
432+
# After two epochs, the network should be more than 70% accurate.
431433
#
432434
# In this tutorial, we used torchaudio to load a dataset and resample the
433435
# signal. We have then defined a neural network that we trained to

0 commit comments

Comments
 (0)