import torchaudio
import matplotlib.pyplot as plt

######################################################################
# Opening a file
# -----------------
#
# ``torchaudio`` also supports loading sound files in the wav and mp3 format. We
# call waveform the resulting raw audio signal.
#
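# The file-loading snippet was elided here; a minimal sketch, assuming a
# hypothetical path to any wav file on disk:
filename = "_static/example.wav"  # hypothetical path
waveform, sample_rate = torchaudio.load(filename)

print("Shape of waveform: {}".format(waveform.size()))
print("Sample rate of waveform: {}".format(sample_rate))
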
plt.figure()
plt.plot(waveform.t().numpy())

######################################################################
# When you load a file in ``torchaudio``, you can optionally specify the backend to use,
# either `SoX <https://pypi.org/project/sox/>`_ or `SoundFile <https://pypi.org/project/SoundFile/>`_,
# via ``torchaudio.set_audio_backend``. These backends are loaded lazily when needed.
#
# ``torchaudio`` also makes JIT compilation optional for functions, and uses ``nn.Module`` where possible.
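
# A minimal sketch of selecting a backend explicitly, assuming the SoundFile
# package is installed ("sox" also works where SoX is available):
torchaudio.set_audio_backend("soundfile")
waveform, sample_rate = torchaudio.load(filename)  # reload via the chosen backend
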
######################################################################
# Transformations
# ---------------
#
# ``torchaudio`` supports a growing list of
# `transformations <https://pytorch.org/audio/transforms.html>`_.
#
# - **Resample**: Resample waveform to a different sample rate.
# - **Spectrogram**: Create a spectrogram from a waveform.
# - **GriffinLim**: Compute waveform from a linear scale magnitude spectrogram using
#   the Griffin-Lim transformation.
# - **ComputeDeltas**: Compute delta coefficients of a tensor, usually a spectrogram.
# - **ComplexNorm**: Compute the norm of a complex tensor.
# - **MelScale**: This turns a normal STFT into a Mel-frequency STFT,
#   using a conversion matrix.
# - **AmplitudeToDB**: This turns a spectrogram from the
#   power/amplitude scale to the decibel scale.
# - **MFCC**: Create the Mel-frequency cepstrum coefficients from a
#   waveform.
# - **MelSpectrogram**: Create MEL Spectrograms from a waveform using the
#   STFT function in PyTorch.
# - **MuLawEncoding**: Encode waveform based on mu-law companding.
# - **MuLawDecoding**: Decode mu-law encoded waveform.
# - **TimeStretch**: Stretch a spectrogram in time without modifying pitch for a given rate.
# - **FrequencyMasking**: Apply masking to a spectrogram in the frequency domain.
# - **TimeMasking**: Apply masking to a spectrogram in the time domain.
#
# Each transform supports batching: you can perform a transform on a single raw
# audio signal or spectrogram, or many of the same shape.
#
# Since all transforms are ``nn.Modules`` or ``jit.ScriptModules``, they can be
# used as part of a neural network at any point.
#
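
######################################################################
# The worked transform examples were elided here; a minimal sketch with
# default parameters that also sets up the ``specgram`` and ``err`` values
# used below:

specgram = torchaudio.transforms.Spectrogram()(waveform)
print("Shape of spectrogram: {}".format(specgram.size()))

# Round-trip the waveform through mu-law encoding and decoding, then measure
# the median relative reconstruction error.
mulaw_encoded = torchaudio.transforms.MuLawEncoding()(waveform)
reconstructed = torchaudio.transforms.MuLawDecoding()(mulaw_encoded)
err = ((waveform - reconstructed).abs() / waveform.abs()).median()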
print ("Median relative difference between original and MuLaw reconstucted signals: {:.2%}" .format (err ))
169
181
170
182
183
######################################################################
# Functional
# ---------------
#
# The transformations seen above rely on lower level stateless functions for their computations.
# These functions are available under ``torchaudio.functional``. The complete list is available
# `here <https://pytorch.org/audio/functional.html>`_ and includes:
#
# - **istft**: Inverse short time Fourier Transform.
# - **gain**: Applies amplification or attenuation to the whole waveform.
# - **dither**: Increases the perceived dynamic range of audio stored at a
#   particular bit-depth.
# - **compute_deltas**: Compute delta coefficients of a tensor.
# - **equalizer_biquad**: Design biquad peaking equalizer filter and perform filtering.
# - **lowpass_biquad**: Design biquad lowpass filter and perform filtering.
# - **highpass_biquad**: Design biquad highpass filter and perform filtering.
#
# For example, let's try the ``mu_law_encoding`` functional:

mu_law_encoding_waveform = torchaudio.functional.mu_law_encoding(waveform, quantization_channels=256)

print("Shape of transformed waveform: {}".format(mu_law_encoding_waveform.size()))

plt.figure()
plt.plot(mu_law_encoding_waveform[0, :].numpy())

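######################################################################
# A quick sketch verifying the equivalence noted next, assuming the default
# 256 quantization channels on both sides:

import torch

transform_output = torchaudio.transforms.MuLawEncoding(quantization_channels=256)(waveform)
print("Outputs match: {}".format(torch.equal(transform_output, mu_law_encoding_waveform)))
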
######################################################################
# You can see how the output from ``torchaudio.functional.mu_law_encoding`` is the same as
# the output from ``torchaudio.transforms.MuLawEncoding``.
#
# Now let's experiment with a few of the other functionals and visualize their output. Taking our
# spectrogram, we can compute its deltas:

computed = torchaudio.functional.compute_deltas(specgram, win_length=3)
print("Shape of computed deltas: {}".format(computed.shape))

plt.figure()
plt.imshow(computed.log2()[0, :, :].detach().numpy(), cmap='gray')

######################################################################
# We can take the original waveform and apply different effects to it.
#

gain_waveform = torchaudio.functional.gain(waveform, gain_db=5.0)
print("Min of gain_waveform: {}\nMax of gain_waveform: {}\nMean of gain_waveform: {}".format(gain_waveform.min(), gain_waveform.max(), gain_waveform.mean()))

dither_waveform = torchaudio.functional.dither(waveform)
print("Min of dither_waveform: {}\nMax of dither_waveform: {}\nMean of dither_waveform: {}".format(dither_waveform.min(), dither_waveform.max(), dither_waveform.mean()))

######################################################################
# Another example of the capabilities in ``torchaudio.functional`` is applying filters to our
# waveform. Applying the lowpass biquad filter to our waveform will output a new waveform with
# frequencies above the cutoff attenuated.

lowpass_waveform = torchaudio.functional.lowpass_biquad(waveform, sample_rate, cutoff_freq=3000)

print("Min of lowpass_waveform: {}\nMax of lowpass_waveform: {}\nMean of lowpass_waveform: {}".format(lowpass_waveform.min(), lowpass_waveform.max(), lowpass_waveform.mean()))

plt.figure()
plt.plot(lowpass_waveform.t().numpy())

######################################################################
# We can also visualize a waveform with the highpass biquad filter.
#

highpass_waveform = torchaudio.functional.highpass_biquad(waveform, sample_rate, cutoff_freq=2000)

print("Min of highpass_waveform: {}\nMax of highpass_waveform: {}\nMean of highpass_waveform: {}".format(highpass_waveform.min(), highpass_waveform.max(), highpass_waveform.mean()))

plt.figure()
plt.plot(highpass_waveform.t().numpy())


######################################################################
# Migrating to torchaudio from Kaldi
# ----------------------------------
#
# Users may be familiar with
# `Kaldi <http://github.com/kaldi-asr/kaldi>`_, a toolkit for speech
# recognition. ``torchaudio`` offers compatibility with it in
# ``torchaudio.kaldi_io``. It can indeed read from kaldi scp or ark files
# or streams with:
#
# - read_mat_scp
# - read_mat_ark
#
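
######################################################################
# For instance, a guarded sketch of reading matrices from an ark file;
# ``feats.ark`` is a hypothetical path and the ``kaldi_io`` package must
# be installed:

import os

if os.path.exists("feats.ark"):  # hypothetical file
    for key, matrix in torchaudio.kaldi_io.read_mat_ark("feats.ark"):
        print(key, matrix.size())

######################################################################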
# ``torchaudio`` provides Kaldi-compatible transforms for ``spectrogram``,
# ``fbank``, ``mfcc``, and ``resample_waveform`` with the benefit of GPU support, see
# `here <compliance.kaldi.html>`__ for more information.
#
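
######################################################################
# The parameter setup and ``fbank`` computation were elided here; a sketch
# with illustrative values so the plots below run. ``params`` feeds the
# Kaldi-compliant functions that follow.

params = {
    "channel": 0,
    "dither": 0.0,
    "window_type": "hanning",
    "frame_length": 25.0,
    "frame_shift": 10.0,
    "remove_dc_offset": False,
    "round_to_power_of_two": False,
    "sample_frequency": sample_rate,
}

fbank = torchaudio.compliance.kaldi.fbank(waveform, **params)

print("Shape of fbank: {}".format(fbank.size()))

plt.figure()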
plt.imshow(fbank.t().numpy(), cmap='gray')


######################################################################
# You can create mel frequency cepstral coefficients from a raw audio signal.
# This matches the input/output of Kaldi’s compute-mfcc-feats.
#

mfcc = torchaudio.compliance.kaldi.mfcc(waveform, **params)

print("Shape of mfcc: {}".format(mfcc.size()))

plt.figure()
plt.imshow(mfcc.t().numpy(), cmap='gray')


######################################################################
# Available Datasets
# ------------------
#
# If you do not want to create your own dataset to train your model, ``torchaudio`` offers a
# unified dataset interface. This interface supports lazy-loading of files to memory, download
# and extract functions, and datasets to build models.
#
# The datasets ``torchaudio`` currently supports are:
#
# - **VCTK**: Speech data uttered by 109 native speakers of English with various accents
#   (`Read more here <https://homepages.inf.ed.ac.uk/jyamagis/page3/page58/page58.html>`_).
# - **Yesno**: Sixty recordings of one individual saying yes or no in Hebrew; each
#   recording is eight words long (`Read more here <https://www.openslr.org/1/>`_).
# - **Common Voice**: An open source, multi-language dataset of voices that anyone can use
#   to train speech-enabled applications (`Read more here <https://voice.mozilla.org/en/datasets>`_).
# - **LibriSpeech**: Large-scale (1000 hours) corpus of read English speech (`Read more here <http://www.openslr.org/12>`_).
#

yesno_data = torchaudio.datasets.YESNO('./', download=True)

# A data point in Yesno is a tuple (waveform, sample_rate, labels) where labels
# is a list of integers with 1 for yes and 0 for no.

# Pick data point number 3 to see an example of the yesno_data:
n = 3
waveform, sample_rate, labels = yesno_data[n]

print("Waveform: {}\nSample rate: {}\nLabels: {}".format(waveform, sample_rate, labels))

plt.figure()
plt.plot(waveform.t().numpy())


######################################################################
# A sound file from the dataset is loaded into memory only when you ask for it.
# This means the dataset only loads and keeps in memory the items that you
# want and use, saving on memory.
#

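######################################################################
# A sketch of plugging the dataset into ``torch.utils.data.DataLoader``;
# ``batch_size=1`` sidesteps collating variable-length waveforms:

import torch

data_loader = torch.utils.data.DataLoader(yesno_data, batch_size=1, shuffle=True)
waveform_batch, sample_rate_batch, labels_batch = next(iter(data_loader))
print("Batched waveform shape: {}".format(waveform_batch.size()))
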
######################################################################
# Conclusion
# ----------
#
# We used an example raw audio signal, or waveform, to illustrate how to
# open an audio file using ``torchaudio``, and how to pre-process,
# transform, and apply functions to such a waveform. We also demonstrated how
# to use familiar Kaldi functions, as well as utilize built-in datasets to
# construct our models. Given that ``torchaudio`` is built on PyTorch,
# these techniques can be used as building blocks for more advanced audio
# applications, such as speech recognition, while leveraging GPUs.
#