diff --git a/beginner_source/audio_preprocessing_tutorial.py b/beginner_source/audio_preprocessing_tutorial.py
index dbd5b96310d..fea8d3d5a88 100644
--- a/beginner_source/audio_preprocessing_tutorial.py
+++ b/beginner_source/audio_preprocessing_tutorial.py
@@ -21,15 +21,11 @@
 import torchaudio
 import matplotlib.pyplot as plt
 
-
 ######################################################################
-# Opening a dataset
+# Opening a file
 # -----------------
 #
-
-
-######################################################################
-# torchaudio supports loading sound files in the wav and mp3 format. We
+# ``torchaudio`` also supports loading sound files in the wav and mp3 formats. We
 # call waveform the resulting raw audio signal.
 #
@@ -42,16 +38,26 @@
 plt.figure()
 plt.plot(waveform.t().numpy())
 
+######################################################################
+# When you load a file in ``torchaudio``, you can optionally specify the backend to use,
+# either `SoX `_ or `SoundFile `_,
+# via ``torchaudio.set_audio_backend``. These backends are loaded lazily when needed.
+#
+# ``torchaudio`` also makes JIT compilation optional for functions and uses ``nn.Module`` where possible.
 
 ######################################################################
 # Transformations
 # ---------------
 #
-# torchaudio supports a growing list of
+# ``torchaudio`` supports a growing list of
 # `transformations `_.
 #
 # - **Resample**: Resample waveform to a different sample rate.
 # - **Spectrogram**: Create a spectrogram from a waveform.
+# - **GriffinLim**: Compute waveform from a linear scale magnitude spectrogram using
+#   the Griffin-Lim transformation.
+# - **ComputeDeltas**: Compute delta coefficients of a tensor, usually a spectrogram.
+# - **ComplexNorm**: Compute the norm of a complex tensor.
 # - **MelScale**: This turns a normal STFT into a Mel-frequency STFT,
 #   using a conversion matrix.
 # - **AmplitudeToDB**: This turns a spectrogram from the
@@ -62,8 +68,14 @@
 #   STFT function in PyTorch.
 # - **MuLawEncoding**: Encode waveform based on mu-law companding.
 # - **MuLawDecoding**: Decode mu-law encoded waveform.
+# - **TimeStretch**: Stretch a spectrogram in time without modifying pitch for a given rate.
+# - **FrequencyMasking**: Apply masking to a spectrogram in the frequency domain.
+# - **TimeMasking**: Apply masking to a spectrogram in the time domain. (Both
+#   masking transforms are illustrated below.)
+#
+# Each transform supports batching: you can perform a transform on a single raw
+# audio signal or spectrogram, or on many of the same shape.
 #
-# Since all transforms are nn.Modules or jit.ScriptModules, they can be
+# Since all transforms are ``nn.Modules`` or ``jit.ScriptModules``, they can be
 # used as part of a neural network at any point.
 #
@@ -168,13 +180,86 @@ def normalize(tensor):
 
-print("Median relative difference between original and MuLaw reconstucted signals: {:.2%}".format(err))
+print("Median relative difference between original and MuLaw reconstructed signals: {:.2%}".format(err))
 
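+######################################################################
+# To illustrate the masking transforms listed above, here is a minimal sketch
+# that applies frequency and time masking to the spectrogram computed earlier.
+# The mask sizes below are arbitrary values chosen for illustration.
+
+# Mask up to 30 consecutive frequency bins and up to 50 consecutive time steps,
+# both at random positions:
+masked = torchaudio.transforms.FrequencyMasking(freq_mask_param=30)(specgram)
+masked = torchaudio.transforms.TimeMasking(time_mask_param=50)(masked)
+
+plt.figure()
+plt.imshow(masked.log2()[0,:,:].numpy(), cmap='gray')
+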
+######################################################################
+# Functional
+# ---------------
+#
+# The transformations seen above rely on lower level stateless functions for their computations.
+# These functions are available under ``torchaudio.functional``. The complete list is available
+# `here `_ and includes:
+#
+# - **istft**: Inverse short time Fourier Transform.
+# - **gain**: Applies amplification or attenuation to the whole waveform.
+# - **dither**: Increases the perceived dynamic range of audio stored at a
+#   particular bit-depth.
+# - **compute_deltas**: Compute delta coefficients of a tensor.
+# - **equalizer_biquad**: Design biquad peaking equalizer filter and perform filtering
+#   (illustrated at the end of this section).
+# - **lowpass_biquad**: Design biquad lowpass filter and perform filtering.
+# - **highpass_biquad**: Design biquad highpass filter and perform filtering.
+#
+# For example, let's try the ``mu_law_encoding`` functional:
+
+mu_law_encoding_waveform = torchaudio.functional.mu_law_encoding(waveform, quantization_channels=256)
+
+print("Shape of transformed waveform: {}".format(mu_law_encoding_waveform.size()))
+
+plt.figure()
+plt.plot(mu_law_encoding_waveform[0,:].numpy())
+
+######################################################################
+# You can see how the output from ``torchaudio.functional.mu_law_encoding`` is the same as
+# the output from ``torchaudio.transforms.MuLawEncoding``.
+#
+# Now let's experiment with a few of the other functionals and visualize their output. Taking our
+# spectrogram, we can compute its deltas:
+
+computed = torchaudio.functional.compute_deltas(specgram, win_length=3)
+print("Shape of computed deltas: {}".format(computed.shape))
+
+plt.figure()
+plt.imshow(computed.log2()[0,:,:].detach().numpy(), cmap='gray')
+
+######################################################################
+# We can take the original waveform and apply different effects to it.
+#
+
+gain_waveform = torchaudio.functional.gain(waveform, gain_db=5.0)
+print("Min of gain_waveform: {}\nMax of gain_waveform: {}\nMean of gain_waveform: {}".format(gain_waveform.min(), gain_waveform.max(), gain_waveform.mean()))
+
+dither_waveform = torchaudio.functional.dither(waveform)
+print("Min of dither_waveform: {}\nMax of dither_waveform: {}\nMean of dither_waveform: {}".format(dither_waveform.min(), dither_waveform.max(), dither_waveform.mean()))
+
+######################################################################
+# Another example of the capabilities in ``torchaudio.functional`` is applying filters to our
+# waveform. Applying the lowpass biquad filter to our waveform will output a new waveform with
+# the frequencies above the cutoff attenuated.
+
+lowpass_waveform = torchaudio.functional.lowpass_biquad(waveform, sample_rate, cutoff_freq=3000)
+
+print("Min of lowpass_waveform: {}\nMax of lowpass_waveform: {}\nMean of lowpass_waveform: {}".format(lowpass_waveform.min(), lowpass_waveform.max(), lowpass_waveform.mean()))
+
+plt.figure()
+plt.plot(lowpass_waveform.t().numpy())
+
+######################################################################
+# We can also visualize a waveform with the highpass biquad filter, which
+# attenuates the frequencies below the cutoff.
+#
+
+highpass_waveform = torchaudio.functional.highpass_biquad(waveform, sample_rate, cutoff_freq=2000)
+
+print("Min of highpass_waveform: {}\nMax of highpass_waveform: {}\nMean of highpass_waveform: {}".format(highpass_waveform.min(), highpass_waveform.max(), highpass_waveform.mean()))
+
+plt.figure()
+plt.plot(highpass_waveform.t().numpy())
+
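+######################################################################
+# As promised above, here is a minimal sketch of the ``equalizer_biquad``
+# functional, which boosts or attenuates a band around a center frequency.
+# The center frequency, gain, and Q values below are arbitrary choices for
+# illustration.
+
+# Boost a band around 1 kHz by 3 dB:
+equalized_waveform = torchaudio.functional.equalizer_biquad(waveform, sample_rate, center_freq=1000., gain=3., Q=0.707)
+
+plt.figure()
+plt.plot(equalized_waveform.t().numpy())
+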
 
 ######################################################################
 # Migrating to torchaudio from Kaldi
 # ----------------------------------
 #
 # Users may be familiar with
 # `Kaldi `_, a toolkit for speech
-# recognition. torchaudio offers compatibility with it in
-# ``torchaudio.kaldi_io``. It can indeed read from kaldi scp, or ark file
-# or streams with:
+# recognition. ``torchaudio`` offers compatibility with it in
+# ``torchaudio.kaldi_io``, which can read from Kaldi scp and ark files
+# or streams with:
 #
@@ -184,8 +269,8 @@ def normalize(tensor):
 # - read_mat_scp
 # - read_mat_ark
 #
-# torchaudio provides Kaldi-compatible transforms for ``spectrogram`` and
-# ``fbank`` with the benefit of GPU support, see
+# ``torchaudio`` provides Kaldi-compatible transforms for ``spectrogram``,
+# ``fbank``, ``mfcc``, and ``resample_waveform`` with the benefit of GPU support, see
 # `here `__ for more information.
 #
@@ -225,13 +310,66 @@ def normalize(tensor):
 plt.imshow(fbank.t().numpy(), cmap='gray')
 
+
+######################################################################
+# You can create mel frequency cepstral coefficients from a raw audio signal.
+# This matches the input/output of Kaldi's compute-mfcc-feats.
+#
+
+mfcc = torchaudio.compliance.kaldi.mfcc(waveform, **params)
+
+print("Shape of mfcc: {}".format(mfcc.size()))
+
+plt.figure()
+plt.imshow(mfcc.t().numpy(), cmap='gray')
+
+
+######################################################################
+# Available Datasets
+# ------------------
+#
+# If you do not want to create your own dataset to train your model, ``torchaudio`` offers a
+# unified dataset interface. This interface supports lazy-loading of files into memory, download
+# and extract functions, and datasets to build models.
+#
+# The datasets ``torchaudio`` currently supports are:
+#
+# - **VCTK**: Speech data uttered by 109 native speakers of English with various accents
+#   (`Read more here `_).
+# - **Yesno**: Sixty recordings of one individual saying yes or no in Hebrew; each
+#   recording is eight words long (`Read more here `_).
+# - **Common Voice**: An open source, multi-language dataset of voices that anyone can use
+#   to train speech-enabled applications (`Read more here `_).
+# - **LibriSpeech**: Large-scale (1000 hours) corpus of read English speech (`Read more here `_).
+#
+
+yesno_data = torchaudio.datasets.YESNO('./', download=True)
+
+# A data point in Yesno is a tuple (waveform, sample_rate, labels) where labels is a list of integers with 1 for yes and 0 for no.
+
+# Pick data point number 3 to see an example of the yesno_data:
+n = 3
+waveform, sample_rate, labels = yesno_data[n]
+
+print("Waveform: {}\nSample rate: {}\nLabels: {}".format(waveform, sample_rate, labels))
+
+plt.figure()
+plt.plot(waveform.t().numpy())
+
+
+######################################################################
+# A sound file from the dataset is loaded into memory only when you ask for it:
+# the dataset loads and keeps in memory only the items you actually use, saving
+# memory.
+#
 
 ######################################################################
 # Conclusion
 # ----------
 #
 # We used an example raw audio signal, or waveform, to illustrate how to
-# open an audio file using torchaudio, and how to pre-process and
-# transform such waveform. Given that torchaudio is built on PyTorch,
+# open an audio file using ``torchaudio``, and how to pre-process,
+# transform, and apply functions to such a waveform. We also demonstrated how
+# to use familiar Kaldi functions, as well as how to utilize built-in datasets to
+# construct our models. Given that ``torchaudio`` is built on PyTorch,
 # these techniques can be used as building blocks for more advanced audio
 # applications, such as speech recognition, while leveraging GPUs.
 #
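+######################################################################
+# Finally, here is a minimal sketch of the batching support mentioned in the
+# Transformations section. The batch below is just three copies of our last
+# waveform, standing in for a real batch of equal-length signals:
+
+batch = waveform.repeat(3, 1, 1)  # shape: (3, channel, time)
+batch_specgram = torchaudio.transforms.Spectrogram()(batch)
+
+print("Shape of batched spectrogram: {}".format(batch_specgram.size()))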