diff --git a/beginner_source/audio_preprocessing_tutorial.py b/beginner_source/audio_preprocessing_tutorial.py
index dbd5b96310d..fea8d3d5a88 100644
--- a/beginner_source/audio_preprocessing_tutorial.py
+++ b/beginner_source/audio_preprocessing_tutorial.py
@@ -21,15 +21,11 @@
import torchaudio
import matplotlib.pyplot as plt
-
######################################################################
-# Opening a dataset
+# Opening a file
# -----------------
#
-
-
-######################################################################
-# torchaudio supports loading sound files in the wav and mp3 format. We
-# call waveform the resulting raw audio signal.
+# ``torchaudio`` also supports loading sound files in the wav and mp3 formats. We
+# call the resulting raw audio signal the waveform.
#
@@ -42,16 +38,26 @@
plt.figure()
plt.plot(waveform.t().numpy())
+######################################################################
+# When you load a file in ``torchaudio``, you can optionally specify the backend to use, either
+# `SoX <http://sox.sourceforge.net/>`_ or `SoundFile <https://pysoundfile.readthedocs.io/>`_,
+# via ``torchaudio.set_audio_backend``. These backends are loaded lazily when needed.
+#
+# ``torchaudio`` also makes JIT compilation optional for functions, and uses ``nn.Module`` where possible.
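+
+######################################################################
+# For example, a minimal sketch (the call is commented out here because it
+# requires the SoundFile package to be installed, which the rest of this
+# tutorial does not need):
+
+# torchaudio.set_audio_backend("soundfile")  # switch the I/O backend from SoX to SoundFile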
######################################################################
# Transformations
# ---------------
#
-# torchaudio supports a growing list of
+# ``torchaudio`` supports a growing list of
# `transformations <https://pytorch.org/audio/transforms.html>`_.
#
# - **Resample**: Resample waveform to a different sample rate.
# - **Spectrogram**: Create a spectrogram from a waveform.
+# - **GriffinLim**: Compute waveform from a linear scale magnitude spectrogram using
+# the Griffin-Lim transformation.
+# - **ComputeDeltas**: Compute delta coefficients of a tensor, usually a spectrogram.
+# - **ComplexNorm**: Compute the norm of a complex tensor.
# - **MelScale**: This turns a normal STFT into a Mel-frequency STFT,
# using a conversion matrix.
# - **AmplitudeToDB**: This turns a spectrogram from the
@@ -62,8 +68,14 @@
# STFT function in PyTorch.
# - **MuLawEncoding**: Encode waveform based on mu-law companding.
# - **MuLawDecoding**: Decode mu-law encoded waveform.
+# - **TimeStretch**: Stretch a spectrogram in time without modifying pitch for a given rate.
+# - **FrequencyMasking**: Apply masking to a spectrogram in the frequency domain.
+# - **TimeMasking**: Apply masking to a spectrogram in the time domain.
+#
+# Each transform supports batching: you can perform a transform on a single raw
+# audio signal or spectrogram, or on a batch of many signals or spectrograms of the same shape.
#
-# Since all transforms are nn.Modules or jit.ScriptModules, they can be
+# Since all transforms subclass ``nn.Module`` or ``jit.ScriptModule``, they can be
# used as part of a neural network at any point.
#
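+
+######################################################################
+# For example, here is a minimal sketch (added for illustration, not part of
+# the original tutorial): because transforms subclass ``nn.Module``, they
+# compose with ``torch.nn.Sequential``, letting us chain ``Spectrogram`` with
+# the ``FrequencyMasking`` transform listed above.
+
+import torch
+
+# A small augmentation pipeline: spectrogram followed by a random frequency
+# mask of up to 30 bins (the mask width here is an arbitrary choice).
+augment = torch.nn.Sequential(
+    torchaudio.transforms.Spectrogram(),
+    torchaudio.transforms.FrequencyMasking(freq_mask_param=30),
+)
+
+masked_specgram = augment(waveform)
+print("Shape of masked spectrogram: {}".format(masked_specgram.size()))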
@@ -168,13 +180,86 @@ def normalize(tensor):
print("Median relative difference between original and MuLaw reconstucted signals: {:.2%}".format(err))
+######################################################################
+# Functional
+# ---------------
+#
+# The transformations seen above rely on lower level stateless functions for their computations.
+# These functions are available under ``torchaudio.functional``. The complete list is available
+# `here <https://pytorch.org/audio/functional.html>`_ and includes:
+#
+# - **istft**: Inverse short time Fourier Transform.
+# - **gain**: Applies amplification or attenuation to the whole waveform.
+# - **dither**: Increases the perceived dynamic range of audio stored at a
+# particular bit-depth.
+# - **compute_deltas**: Compute delta coefficients of a tensor.
+# - **equalizer_biquad**: Design biquad peaking equalizer filter and perform filtering.
+# - **lowpass_biquad**: Design biquad lowpass filter and perform filtering.
+# - **highpass_biquad**: Design biquad highpass filter and perform filtering.
+#
+# For example, let's try the ``mu_law_encoding`` functional:
+
+mu_law_encoding_waveform = torchaudio.functional.mu_law_encoding(waveform, quantization_channels=256)
+
+print("Shape of transformed waveform: {}".format(mu_law_encoding_waveform.size()))
+
+plt.figure()
+plt.plot(mu_law_encoding_waveform[0,:].numpy())
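+
+# As a quick sanity check (an added illustration, not in the original
+# tutorial), the functional output matches the equivalent transform exactly:
+import torch
+transform_waveform = torchaudio.transforms.MuLawEncoding(quantization_channels=256)(waveform)
+print(torch.equal(mu_law_encoding_waveform, transform_waveform))  # expected: True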
+
+######################################################################
+# You can see how the output from ``torchaudio.functional.mu_law_encoding`` is the same as
+# the output from ``torchaudio.transforms.MuLawEncoding``.
+#
+# Now let's experiment with a few of the other functionals and visualize their output. Taking our
+# spectrogram, we can compute its deltas:
+
+computed = torchaudio.functional.compute_deltas(specgram, win_length=3)
+print("Shape of computed deltas: {}".format(computed.shape))
+
+plt.figure()
+plt.imshow(computed[0,:,:].detach().numpy(), cmap='gray')  # plot raw delta values; log2 would yield NaNs for negative deltas
+
+######################################################################
+# We can take the original waveform and apply different effects to it.
+#
+
+gain_waveform = torchaudio.functional.gain(waveform, gain_db=5.0)
+print("Min of gain_waveform: {}\nMax of gain_waveform: {}\nMean of gain_waveform: {}".format(gain_waveform.min(), gain_waveform.max(), gain_waveform.mean()))
+
+dither_waveform = torchaudio.functional.dither(waveform)
+print("Min of dither_waveform: {}\nMax of dither_waveform: {}\nMean of dither_waveform: {}".format(dither_waveform.min(), dither_waveform.max(), dither_waveform.mean()))
+
+######################################################################
+# Another example of the capabilities in ``torchaudio.functional`` is applying filters to our
+# waveform. Applying the lowpass biquad filter to our waveform will output a new waveform with
+# the frequencies above the cutoff attenuated.
+
+lowpass_waveform = torchaudio.functional.lowpass_biquad(waveform, sample_rate, cutoff_freq=3000)
+
+print("Min of lowpass_waveform: {}\nMax of lowpass_waveform: {}\nMean of lowpass_waveform: {}".format(lowpass_waveform.min(), lowpass_waveform.max(), lowpass_waveform.mean()))
+
+plt.figure()
+plt.plot(lowpass_waveform.t().numpy())
+
+######################################################################
+# We can also visualize a waveform after applying the highpass biquad filter, which
+# attenuates the frequencies below the cutoff instead.
+#
+
+highpass_waveform = torchaudio.functional.highpass_biquad(waveform, sample_rate, cutoff_freq=2000)
+
+print("Min of highpass_waveform: {}\nMax of highpass_waveform: {}\nMean of highpass_waveform: {}".format(highpass_waveform.min(), highpass_waveform.max(), highpass_waveform.mean()))
+
+plt.figure()
+plt.plot(highpass_waveform.t().numpy())
+
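+######################################################################
+# Along the same lines, a sketch of one more filter from the list above
+# (added for illustration, not part of the original tutorial):
+# ``equalizer_biquad`` boosts or attenuates a band around a center frequency.
+
+# The parameter choices below are arbitrary: boost a band around 1 kHz by 3 dB.
+eq_waveform = torchaudio.functional.equalizer_biquad(waveform, sample_rate, center_freq=1000, gain=3.0)
+
+print("Min of eq_waveform: {}\nMax of eq_waveform: {}".format(eq_waveform.min(), eq_waveform.max()))
+
+plt.figure()
+plt.plot(eq_waveform.t().numpy())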
+
######################################################################
# Migrating to torchaudio from Kaldi
# ----------------------------------
#
# Users may be familiar with
# `Kaldi `_, a toolkit for speech
-# recognition. torchaudio offers compatibility with it in
+# recognition. ``torchaudio`` offers compatibility with it in
# ``torchaudio.kaldi_io``. It can indeed read from kaldi scp, or ark file
# or streams with:
#
@@ -184,8 +269,8 @@ def normalize(tensor):
# - read_mat_scp
# - read_mat_ark
#
-# torchaudio provides Kaldi-compatible transforms for ``spectrogram`` and
-# ``fbank`` with the benefit of GPU support, see
+# ``torchaudio`` provides Kaldi-compatible transforms for ``spectrogram``,
+# ``fbank``, ``mfcc``, and ``resample_waveform`` with the benefit of GPU support, see
# `here <https://pytorch.org/audio/compliance.kaldi.html>`__ for more information.
#
@@ -225,13 +310,66 @@ def normalize(tensor):
plt.imshow(fbank.t().numpy(), cmap='gray')
+######################################################################
+# You can create mel frequency cepstral coefficients from a raw audio signal.
+# This matches the input/output of Kaldi's ``compute-mfcc-feats``.
+#
+
+mfcc = torchaudio.compliance.kaldi.mfcc(waveform, **params)
+
+print("Shape of mfcc: {}".format(mfcc.size()))
+
+plt.figure()
+plt.imshow(mfcc.t().numpy(), cmap='gray')
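+
+######################################################################
+# A further sketch (added for illustration, not part of the original
+# tutorial): ``resample_waveform`` from the same module resamples a waveform
+# in a Kaldi-compatible way. The target rate below is an arbitrary choice.
+
+resampled_waveform = torchaudio.compliance.kaldi.resample_waveform(waveform, orig_freq=sample_rate, new_freq=sample_rate // 2)
+
+print("Shape of resampled waveform: {}".format(resampled_waveform.size()))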
+
+
+######################################################################
+# Available Datasets
+# ------------------
+#
+# If you do not want to create your own dataset to train your model, ``torchaudio`` offers a
+# unified dataset interface. This interface supports lazy loading of files into memory, helper
+# functions for downloading and extracting archives, and ready-made datasets for building models.
+#
+# The datasets ``torchaudio`` currently supports are:
+#
+# - **VCTK**: Speech data uttered by 109 native speakers of English with various accents
+#   (`Read more here <https://datashare.is.ed.ac.uk/handle/10283/2651>`_).
+# - **Yesno**: Sixty recordings of one individual saying yes or no in Hebrew; each
+#   recording is eight words long (`Read more here <http://www.openslr.org/1/>`_).
+# - **Common Voice**: An open source, multi-language dataset of voices that anyone can use
+#   to train speech-enabled applications (`Read more here <https://voice.mozilla.org/>`_).
+# - **LibriSpeech**: Large-scale (1000 hours) corpus of read English speech (`Read more here <http://www.openslr.org/12/>`_).
+#
+
+yesno_data = torchaudio.datasets.YESNO('./', download=True)
+
+# A data point in Yesno is a tuple (waveform, sample_rate, labels) where labels is a list of integers with 1 for yes and 0 for no.
+
+# Pick data point number 3 to see an example from yesno_data:
+n = 3
+waveform, sample_rate, labels = yesno_data[n]
+
+print("Waveform: {}\nSample rate: {}\nLabels: {}".format(waveform, sample_rate, labels))
+
+plt.figure()
+plt.plot(waveform.t().numpy())
+
+
+######################################################################
+# Each file in the dataset is loaded into memory lazily, only when you ask for it.
+# That means the dataset loads and keeps in memory just the items you actually use, saving memory.
+#
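+
+######################################################################
+# A minimal sketch of feeding the dataset to a model pipeline (added for
+# illustration; ``batch_size=1`` is used because the recordings have
+# different lengths, so the default collation cannot stack larger batches):
+
+from torch.utils.data import DataLoader
+
+loader = DataLoader(yesno_data, batch_size=1, shuffle=True)
+batch_waveform, batch_sample_rate, batch_labels = next(iter(loader))
+print("Batched waveform shape: {}".format(batch_waveform.size()))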
+
######################################################################
# Conclusion
# ----------
#
# We used an example raw audio signal, or waveform, to illustrate how to
-# open an audio file using torchaudio, and how to pre-process and
-# transform such waveform. Given that torchaudio is built on PyTorch,
+# open an audio file using ``torchaudio``, and how to pre-process,
+# transform, and apply functions to such a waveform. We also demonstrated how
+# to use familiar Kaldi functions, and how to use built-in datasets to
+# construct our models. Given that ``torchaudio`` is built on PyTorch,
# these techniques can be used as building blocks for more advanced audio
# applications, such as speech recognition, while leveraging GPUs.
#