diff --git a/beginner_source/audio_data_augmentation_tutorial.py b/beginner_source/audio_data_augmentation_tutorial.py deleted file mode 100644 index 933aefc4ef8..00000000000 --- a/beginner_source/audio_data_augmentation_tutorial.py +++ /dev/null @@ -1,443 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Audio Data Augmentation -======================= - -``torchaudio`` provides a variety of ways to augment audio data. - -In this tutorial, we look into a way to apply effects, filters, -RIR (room impulse response) and codecs. - -At the end, we synthesize noisy speech over phone from clean speech. -""" - -import torch -import torchaudio -import torchaudio.functional as F - -print(torch.__version__) -print(torchaudio.__version__) - -###################################################################### -# Preparation -# ----------- -# -# First, we import the modules and download the audio assets we use in this tutorial. -# - -import math - -from IPython.display import Audio -import matplotlib.pyplot as plt - -from torchaudio.utils import download_asset - -SAMPLE_WAV = download_asset("tutorial-assets/steam-train-whistle-daniel_simon.wav") -SAMPLE_RIR = download_asset("tutorial-assets/Lab41-SRI-VOiCES-rm1-impulse-mc01-stu-clo-8000hz.wav") -SAMPLE_SPEECH = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042-8000hz.wav") -SAMPLE_NOISE = download_asset("tutorial-assets/Lab41-SRI-VOiCES-rm1-babb-mc01-stu-clo-8000hz.wav") - - -###################################################################### -# Applying effects and filtering -# ------------------------------ -# -# :py:func:`torchaudio.sox_effects` allows for directly applying filters similar to -# those available in ``sox`` to Tensor objects and file object audio sources. -# -# There are two functions for this: -# -# - :py:func:`torchaudio.sox_effects.apply_effects_tensor` for applying effects -# to Tensor. -# - :py:func:`torchaudio.sox_effects.apply_effects_file` for applying effects to -# other audio sources. -# -# Both functions accept effect definitions in the form -# ``List[List[str]]``. -# This is mostly consistent with how ``sox`` command works, but one caveat is -# that ``sox`` adds some effects automatically, whereas ``torchaudio``’s -# implementation does not. -# -# For the list of available effects, please refer to `the sox -# documentation `__. -# -# **Tip** If you need to load and resample your audio data on the fly, -# then you can use :py:func:`torchaudio.sox_effects.apply_effects_file` -# with effect ``"rate"``. -# -# **Note** :py:func:`torchaudio.sox_effects.apply_effects_file` accepts a -# file-like object or path-like object. -# Similar to :py:func:`torchaudio.load`, when the audio format cannot be -# inferred from either the file extension or header, you can provide -# argument ``format`` to specify the format of the audio source. -# -# **Note** This process is not differentiable. -# - -# Load the data -waveform1, sample_rate1 = torchaudio.load(SAMPLE_WAV) - -# Define effects -effects = [ - ["lowpass", "-1", "300"], # apply single-pole lowpass filter - ["speed", "0.8"], # reduce the speed - # This only changes sample rate, so it is necessary to - # add `rate` effect with original sample rate after this. 
- ["rate", f"{sample_rate1}"], - ["reverb", "-w"], # Reverbration gives some dramatic feeling -] - -# Apply effects -waveform2, sample_rate2 = torchaudio.sox_effects.apply_effects_tensor(waveform1, sample_rate1, effects) - -print(waveform1.shape, sample_rate1) -print(waveform2.shape, sample_rate2) - -###################################################################### -# Note that the number of frames and number of channels are different from -# those of the original after the effects are applied. Let’s listen to the -# audio. -# - -def plot_waveform(waveform, sample_rate, title="Waveform", xlim=None): - waveform = waveform.numpy() - - num_channels, num_frames = waveform.shape - time_axis = torch.arange(0, num_frames) / sample_rate - - figure, axes = plt.subplots(num_channels, 1) - if num_channels == 1: - axes = [axes] - for c in range(num_channels): - axes[c].plot(time_axis, waveform[c], linewidth=1) - axes[c].grid(True) - if num_channels > 1: - axes[c].set_ylabel(f"Channel {c+1}") - if xlim: - axes[c].set_xlim(xlim) - figure.suptitle(title) - plt.show(block=False) - -###################################################################### -# - -def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None): - waveform = waveform.numpy() - - num_channels, _ = waveform.shape - - figure, axes = plt.subplots(num_channels, 1) - if num_channels == 1: - axes = [axes] - for c in range(num_channels): - axes[c].specgram(waveform[c], Fs=sample_rate) - if num_channels > 1: - axes[c].set_ylabel(f"Channel {c+1}") - if xlim: - axes[c].set_xlim(xlim) - figure.suptitle(title) - plt.show(block=False) - -###################################################################### -# Original: -# ~~~~~~~~~ -# - -plot_waveform(waveform1, sample_rate1, title="Original", xlim=(-0.1, 3.2)) -plot_specgram(waveform1, sample_rate1, title="Original", xlim=(0, 3.04)) -Audio(waveform1, rate=sample_rate1) - -###################################################################### -# Effects applied: -# ~~~~~~~~~~~~~~~~ -# - -plot_waveform(waveform2, sample_rate2, title="Effects Applied", xlim=(-0.1, 3.2)) -plot_specgram(waveform2, sample_rate2, title="Effects Applied", xlim=(0, 3.04)) -Audio(waveform2, rate=sample_rate2) - -###################################################################### -# Doesn’t it sound more dramatic? -# - -###################################################################### -# Simulating room reverberation -# ----------------------------- -# -# `Convolution -# reverb `__ is a -# technique that's used to make clean audio sound as though it has been -# produced in a different environment. -# -# Using Room Impulse Response (RIR), for instance, we can make clean speech -# sound as though it has been uttered in a conference room. -# -# For this process, we need RIR data. The following data are from the VOiCES -# dataset, but you can record your own — just turn on your microphone -# and clap your hands. -# - -rir_raw, sample_rate = torchaudio.load(SAMPLE_RIR) -plot_waveform(rir_raw, sample_rate, title="Room Impulse Response (raw)") -plot_specgram(rir_raw, sample_rate, title="Room Impulse Response (raw)") -Audio(rir_raw, rate=sample_rate) - -###################################################################### -# First, we need to clean up the RIR. We extract the main impulse, normalize -# the signal power, then flip along the time axis. 
-# - -rir = rir_raw[:, int(sample_rate * 1.01) : int(sample_rate * 1.3)] -rir = rir / torch.norm(rir, p=2) -RIR = torch.flip(rir, [1]) - -plot_waveform(rir, sample_rate, title="Room Impulse Response") - -###################################################################### -# Then, we convolve the speech signal with the RIR filter. -# - -speech, _ = torchaudio.load(SAMPLE_SPEECH) - -speech_ = torch.nn.functional.pad(speech, (RIR.shape[1] - 1, 0)) -augmented = torch.nn.functional.conv1d(speech_[None, ...], RIR[None, ...])[0] - -###################################################################### -# Original: -# ~~~~~~~~~ -# - -plot_waveform(speech, sample_rate, title="Original") -plot_specgram(speech, sample_rate, title="Original") -Audio(speech, rate=sample_rate) - -###################################################################### -# RIR applied: -# ~~~~~~~~~~~~ -# - -plot_waveform(augmented, sample_rate, title="RIR Applied") -plot_specgram(augmented, sample_rate, title="RIR Applied") -Audio(augmented, rate=sample_rate) - - -###################################################################### -# Adding background noise -# ----------------------- -# -# To add background noise to audio data, you can simply add a noise Tensor to -# the Tensor representing the audio data. A common method to adjust the -# intensity of noise is changing the Signal-to-Noise Ratio (SNR). -# [`wikipedia `__] -# -# $$ \\mathrm{SNR} = \\frac{P_{signal}}{P_{noise}} $$ -# -# $$ \\mathrm{SNR_{dB}} = 10 \\log _{{10}} \\mathrm {SNR} $$ -# - -speech, _ = torchaudio.load(SAMPLE_SPEECH) -noise, _ = torchaudio.load(SAMPLE_NOISE) -noise = noise[:, : speech.shape[1]] - -speech_rms = speech.norm(p=2) -noise_rms = noise.norm(p=2) - -snr_dbs = [20, 10, 3] -noisy_speeches = [] -for snr_db in snr_dbs: - snr = 10 ** (snr_db / 20) - scale = snr * noise_rms / speech_rms - noisy_speeches.append((scale * speech + noise) / 2) - -###################################################################### -# Background noise: -# ~~~~~~~~~~~~~~~~~ -# - -plot_waveform(noise, sample_rate, title="Background noise") -plot_specgram(noise, sample_rate, title="Background noise") -Audio(noise, rate=sample_rate) - -###################################################################### -# SNR 20 dB: -# ~~~~~~~~~~ -# - -snr_db, noisy_speech = snr_dbs[0], noisy_speeches[0] -plot_waveform(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]") -plot_specgram(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]") -Audio(noisy_speech, rate=sample_rate) - -###################################################################### -# SNR 10 dB: -# ~~~~~~~~~~ -# - -snr_db, noisy_speech = snr_dbs[1], noisy_speeches[1] -plot_waveform(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]") -plot_specgram(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]") -Audio(noisy_speech, rate=sample_rate) - -###################################################################### -# SNR 3 dB: -# ~~~~~~~~~ -# - -snr_db, noisy_speech = snr_dbs[2], noisy_speeches[2] -plot_waveform(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]") -plot_specgram(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]") -Audio(noisy_speech, rate=sample_rate) - - -###################################################################### -# Applying codec to Tensor object -# ------------------------------- -# -# :py:func:`torchaudio.functional.apply_codec` can apply codecs to -# a Tensor object. -# -# **Note** This process is not differentiable. 
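######################################################################
# Before applying codecs, a quick sanity check of the noise mixing above
# (an illustrative sketch, not part of the original tutorial): the scaling
# factor was chosen so that the ratio of signal power to noise power matches
# the requested SNR, which can be confirmed in the dB domain.
#

for snr_db in snr_dbs:
    scale = 10 ** (snr_db / 20) * noise_rms / speech_rms
    achieved_db = 20 * torch.log10((scale * speech).norm(p=2) / noise.norm(p=2))
    print(f"Requested SNR: {snr_db} dB, achieved: {achieved_db.item():.1f} dB")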
-# - - -waveform, sample_rate = torchaudio.load(SAMPLE_SPEECH) - -configs = [ - {"format": "wav", "encoding": "ULAW", "bits_per_sample": 8}, - {"format": "gsm"}, - {"format": "vorbis", "compression": -1}, -] -waveforms = [] -for param in configs: - augmented = F.apply_codec(waveform, sample_rate, **param) - waveforms.append(augmented) - -###################################################################### -# Original: -# ~~~~~~~~~ -# - -plot_waveform(waveform, sample_rate, title="Original") -plot_specgram(waveform, sample_rate, title="Original") -Audio(waveform, rate=sample_rate) - -###################################################################### -# 8 bit mu-law: -# ~~~~~~~~~~~~~ -# - -plot_waveform(waveforms[0], sample_rate, title="8 bit mu-law") -plot_specgram(waveforms[0], sample_rate, title="8 bit mu-law") -Audio(waveforms[0], rate=sample_rate) - -###################################################################### -# GSM-FR: -# ~~~~~~~ -# - -plot_waveform(waveforms[1], sample_rate, title="GSM-FR") -plot_specgram(waveforms[1], sample_rate, title="GSM-FR") -Audio(waveforms[1], rate=sample_rate) - -###################################################################### -# Vorbis: -# ~~~~~~~ -# - -plot_waveform(waveforms[2], sample_rate, title="Vorbis") -plot_specgram(waveforms[2], sample_rate, title="Vorbis") -Audio(waveforms[2], rate=sample_rate) - -###################################################################### -# Simulating a phone recoding -# --------------------------- -# -# Combining the previous techniques, we can simulate audio that sounds -# like a person talking over a phone in a echoey room with people talking -# in the background. -# - -sample_rate = 16000 -original_speech, sample_rate = torchaudio.load(SAMPLE_SPEECH) - -plot_specgram(original_speech, sample_rate, title="Original") - -# Apply RIR -speech_ = torch.nn.functional.pad(original_speech, (RIR.shape[1] - 1, 0)) -rir_applied = torch.nn.functional.conv1d(speech_[None, ...], RIR[None, ...])[0] - -plot_specgram(rir_applied, sample_rate, title="RIR Applied") - -# Add background noise -# Because the noise is recorded in the actual environment, we consider that -# the noise contains the acoustic feature of the environment. Therefore, we add -# the noise after RIR application. 
-noise, _ = torchaudio.load(SAMPLE_NOISE) -noise = noise[:, : rir_applied.shape[1]] - -snr_db = 8 -scale = (10 ** (snr_db / 20)) * noise.norm(p=2) / rir_applied.norm(p=2) -bg_added = (scale * rir_applied + noise) / 2 - -plot_specgram(bg_added, sample_rate, title="BG noise added") - -# Apply filtering and change sample rate -filtered, sample_rate2 = torchaudio.sox_effects.apply_effects_tensor( - bg_added, - sample_rate, - effects=[ - ["lowpass", "4000"], - [ - "compand", - "0.02,0.05", - "-60,-60,-30,-10,-20,-8,-5,-8,-2,-8", - "-8", - "-7", - "0.05", - ], - ["rate", "8000"], - ], -) - -plot_specgram(filtered, sample_rate2, title="Filtered") - -# Apply telephony codec -codec_applied = F.apply_codec(filtered, sample_rate2, format="gsm") - -plot_specgram(codec_applied, sample_rate2, title="GSM Codec Applied") - - -###################################################################### -# Original speech: -# ~~~~~~~~~~~~~~~~ -# - -Audio(original_speech, rate=sample_rate) - -###################################################################### -# RIR applied: -# ~~~~~~~~~~~~ -# - -Audio(rir_applied, rate=sample_rate) - -###################################################################### -# Background noise added: -# ~~~~~~~~~~~~~~~~~~~~~~~ -# - -Audio(bg_added, rate=sample_rate) - -###################################################################### -# Filtered: -# ~~~~~~~~~ -# - -Audio(filtered, rate=sample_rate2) - -###################################################################### -# Codec applied: -# ~~~~~~~~~~~~~~ -# - -Audio(codec_applied, rate=sample_rate2) diff --git a/beginner_source/audio_data_augmentation_tutorial.rst b/beginner_source/audio_data_augmentation_tutorial.rst new file mode 100644 index 00000000000..55ba024a590 --- /dev/null +++ b/beginner_source/audio_data_augmentation_tutorial.rst @@ -0,0 +1,10 @@ +Audio Data Augmentation +======================= + +This tutorial has been moved to https://pytorch.org/audio/stable/tutorials/audio_data_augmentation_tutorial.html + +It will redirect in 3 seconds. + +.. raw:: html + + diff --git a/beginner_source/audio_datasets_tutorial.py b/beginner_source/audio_datasets_tutorial.py deleted file mode 100644 index f08ed99e0db..00000000000 --- a/beginner_source/audio_datasets_tutorial.py +++ /dev/null @@ -1,87 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Audio Datasets -============== - -``torchaudio`` provides easy access to common, publicly accessible -datasets. Please refer to the official documentation for the list of -available datasets. -""" - -# When running this tutorial in Google Colab, install the required packages -# with the following. -# !pip install torchaudio - -import torch -import torchaudio - -print(torch.__version__) -print(torchaudio.__version__) - -###################################################################### -# Preparing data and utility functions (skip this section) -# -------------------------------------------------------- -# - -# @title Prepare data and utility functions. {display-mode: "form"} -# @markdown -# @markdown You do not need to look into this cell. -# @markdown Just execute once and you are good to go. - -# ------------------------------------------------------------------------------- -# Preparation of data and helper functions. 
-# ------------------------------------------------------------------------------- -import multiprocessing -import os - -import matplotlib.pyplot as plt -from IPython.display import Audio, display - - -_SAMPLE_DIR = "_assets" -YESNO_DATASET_PATH = os.path.join(_SAMPLE_DIR, "yes_no") -os.makedirs(YESNO_DATASET_PATH, exist_ok=True) - - -def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None): - waveform = waveform.numpy() - - num_channels, num_frames = waveform.shape - - figure, axes = plt.subplots(num_channels, 1) - if num_channels == 1: - axes = [axes] - for c in range(num_channels): - axes[c].specgram(waveform[c], Fs=sample_rate) - if num_channels > 1: - axes[c].set_ylabel(f"Channel {c+1}") - if xlim: - axes[c].set_xlim(xlim) - figure.suptitle(title) - plt.show(block=False) - - -def play_audio(waveform, sample_rate): - waveform = waveform.numpy() - - num_channels, num_frames = waveform.shape - if num_channels == 1: - display(Audio(waveform[0], rate=sample_rate)) - elif num_channels == 2: - display(Audio((waveform[0], waveform[1]), rate=sample_rate)) - else: - raise ValueError("Waveform with more than 2 channels are not supported.") - - -###################################################################### -# Here, we show how to use the -# :py:func:`torchaudio.datasets.YESNO` dataset. -# - - -dataset = torchaudio.datasets.YESNO(YESNO_DATASET_PATH, download=True) - -for i in [1, 3, 5]: - waveform, sample_rate, label = dataset[i] - plot_specgram(waveform, sample_rate, title=f"Sample {i}: {label}") - play_audio(waveform, sample_rate) diff --git a/beginner_source/audio_datasets_tutorial.rst b/beginner_source/audio_datasets_tutorial.rst new file mode 100644 index 00000000000..0bcac44a0a8 --- /dev/null +++ b/beginner_source/audio_datasets_tutorial.rst @@ -0,0 +1,10 @@ +Audio Datasets +============== + +This tutorial has been moved to https://pytorch.org/tutorials/beginner/audio_datasets_tutorial.html + +It will redirect in 3 seconds. + +.. raw:: html + + diff --git a/beginner_source/audio_feature_augmentation_tutorial.py b/beginner_source/audio_feature_augmentation_tutorial.py deleted file mode 100644 index 3961dafbc74..00000000000 --- a/beginner_source/audio_feature_augmentation_tutorial.py +++ /dev/null @@ -1,168 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Audio Feature Augmentation -========================== -""" - -# When running this tutorial in Google Colab, install the required packages -# with the following. -# !pip install torchaudio librosa - -import torch -import torchaudio -import torchaudio.transforms as T - -print(torch.__version__) -print(torchaudio.__version__) - -###################################################################### -# Preparing data and utility functions (skip this section) -# -------------------------------------------------------- -# - -# @title Prepare data and utility functions. {display-mode: "form"} -# @markdown -# @markdown You do not need to look into this cell. -# @markdown Just execute once and you are good to go. -# @markdown -# @markdown In this tutorial, we will use a speech data from [VOiCES dataset](https://iqtlabs.github.io/voices/), -# @markdown which is licensed under Creative Commos BY 4.0. - -# ------------------------------------------------------------------------------- -# Preparation of data and helper functions. 
-# ------------------------------------------------------------------------------- - -import os - -import librosa -import matplotlib.pyplot as plt -import requests - - -_SAMPLE_DIR = "_assets" - -SAMPLE_WAV_SPEECH_URL = "https://pytorch-tutorial-assets.s3.amazonaws.com/VOiCES_devkit/source-16k/train/sp0307/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav" # noqa: E501 -SAMPLE_WAV_SPEECH_PATH = os.path.join(_SAMPLE_DIR, "speech.wav") - -os.makedirs(_SAMPLE_DIR, exist_ok=True) - - -def _fetch_data(): - uri = [ - (SAMPLE_WAV_SPEECH_URL, SAMPLE_WAV_SPEECH_PATH), - ] - for url, path in uri: - with open(path, "wb") as file_: - file_.write(requests.get(url).content) - - -_fetch_data() - - -def _get_sample(path, resample=None): - effects = [["remix", "1"]] - if resample: - effects.extend( - [ - ["lowpass", f"{resample // 2}"], - ["rate", f"{resample}"], - ] - ) - return torchaudio.sox_effects.apply_effects_file(path, effects=effects) - - -def get_speech_sample(*, resample=None): - return _get_sample(SAMPLE_WAV_SPEECH_PATH, resample=resample) - - -def get_spectrogram( - n_fft=400, - win_len=None, - hop_len=None, - power=2.0, -): - waveform, _ = get_speech_sample() - spectrogram = T.Spectrogram( - n_fft=n_fft, - win_length=win_len, - hop_length=hop_len, - center=True, - pad_mode="reflect", - power=power, - ) - return spectrogram(waveform) - - -def plot_spectrogram(spec, title=None, ylabel="freq_bin", aspect="auto", xmax=None): - fig, axs = plt.subplots(1, 1) - axs.set_title(title or "Spectrogram (db)") - axs.set_ylabel(ylabel) - axs.set_xlabel("frame") - im = axs.imshow(librosa.power_to_db(spec), origin="lower", aspect=aspect) - if xmax: - axs.set_xlim((0, xmax)) - fig.colorbar(im, ax=axs) - plt.show(block=False) - - -###################################################################### -# SpecAugment -# ----------- -# -# `SpecAugment `__ -# is a popular spectrogram augmentation technique. -# -# ``torchaudio`` implements :py:func:`torchaudio.transforms.TimeStretch`, -# :py:func:`torchaudio.transforms.TimeMasking` and -# :py:func:`torchaudio.transforms.FrequencyMasking`. 
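######################################################################
# In practice the masking transforms are often chained into a single
# augmentation pipeline. The following sketch (an illustrative addition with
# arbitrary masking parameters) combines only the two masking transforms;
# ``TimeStretch`` is left out because it operates on a complex-valued
# spectrogram and takes a rate argument at call time. The individual
# transforms are demonstrated one by one in the sections below.
#

specaugment = torch.nn.Sequential(
    T.FrequencyMasking(freq_mask_param=30),
    T.TimeMasking(time_mask_param=40),
)
masked_spec = specaugment(get_spectrogram())
plot_spectrogram(masked_spec[0], title="Frequency and time masking combined")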
-# - -###################################################################### -# TimeStretch -# ----------- -# - - -spec = get_spectrogram(power=None) -stretch = T.TimeStretch() - -rate = 1.2 -spec_ = stretch(spec, rate) -plot_spectrogram(torch.abs(spec_[0]), title=f"Stretched x{rate}", aspect="equal", xmax=304) - -plot_spectrogram(torch.abs(spec[0]), title="Original", aspect="equal", xmax=304) - -rate = 0.9 -spec_ = stretch(spec, rate) -plot_spectrogram(torch.abs(spec_[0]), title=f"Stretched x{rate}", aspect="equal", xmax=304) - -###################################################################### -# TimeMasking -# ----------- -# - -torch.random.manual_seed(4) - -spec = get_spectrogram() -plot_spectrogram(spec[0], title="Original") - -masking = T.TimeMasking(time_mask_param=80) -spec = masking(spec) - -plot_spectrogram(spec[0], title="Masked along time axis") - -###################################################################### -# FrequencyMasking -# ---------------- -# - - -torch.random.manual_seed(4) - -spec = get_spectrogram() -plot_spectrogram(spec[0], title="Original") - -masking = T.FrequencyMasking(freq_mask_param=80) -spec = masking(spec) - -plot_spectrogram(spec[0], title="Masked along frequency axis") diff --git a/beginner_source/audio_feature_augmentation_tutorial.rst b/beginner_source/audio_feature_augmentation_tutorial.rst new file mode 100644 index 00000000000..55d3811b3fa --- /dev/null +++ b/beginner_source/audio_feature_augmentation_tutorial.rst @@ -0,0 +1,10 @@ +Audio Feature Augmentation +========================== + +This tutorial has been moved to https://pytorch.org/audio/stable/tutorials/audio_data_augmentation_tutorial.html + +It will redirect in 3 seconds. + +.. raw:: html + + diff --git a/beginner_source/audio_feature_extractions_tutorial.py b/beginner_source/audio_feature_extractions_tutorial.py deleted file mode 100644 index 822c00d97ba..00000000000 --- a/beginner_source/audio_feature_extractions_tutorial.py +++ /dev/null @@ -1,457 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Audio Feature Extractions -========================= - -``torchaudio`` implements feature extractions commonly used in the audio -domain. They are available in ``torchaudio.functional`` and -``torchaudio.transforms``. - -``functional`` implements features as standalone functions. -They are stateless. - -``transforms`` implements features as objects, -using implementations from ``functional`` and ``torch.nn.Module``. -They can be serialized using TorchScript. -""" - -import torch -import torchaudio -import torchaudio.functional as F -import torchaudio.transforms as T - -print(torch.__version__) -print(torchaudio.__version__) - -###################################################################### -# Preparation -# ----------- -# -# .. note:: -# -# When running this tutorial in Google Colab, install the required packages -# -# .. 
code:: -# -# !pip install librosa -# -from IPython.display import Audio -import librosa -import matplotlib.pyplot as plt -from torchaudio.utils import download_asset - -torch.random.manual_seed(0) - -SAMPLE_SPEECH = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav") - - -def plot_waveform(waveform, sr, title="Waveform"): - waveform = waveform.numpy() - - num_channels, num_frames = waveform.shape - time_axis = torch.arange(0, num_frames) / sr - - figure, axes = plt.subplots(num_channels, 1) - axes.plot(time_axis, waveform[0], linewidth=1) - axes.grid(True) - figure.suptitle(title) - plt.show(block=False) - - -def plot_spectrogram(specgram, title=None, ylabel="freq_bin"): - fig, axs = plt.subplots(1, 1) - axs.set_title(title or "Spectrogram (db)") - axs.set_ylabel(ylabel) - axs.set_xlabel("frame") - im = axs.imshow(librosa.power_to_db(specgram), origin="lower", aspect="auto") - fig.colorbar(im, ax=axs) - plt.show(block=False) - - -def plot_fbank(fbank, title=None): - fig, axs = plt.subplots(1, 1) - axs.set_title(title or "Filter bank") - axs.imshow(fbank, aspect="auto") - axs.set_ylabel("frequency bin") - axs.set_xlabel("mel bin") - plt.show(block=False) - - -###################################################################### -# Overview of audio features -# -------------------------- -# -# The following diagram shows the relationship between common audio features -# and torchaudio APIs to generate them. -# -# .. image:: https://download.pytorch.org/torchaudio/tutorial-assets/torchaudio_feature_extractions.png -# -# For the complete list of available features, please refer to the -# documentation. -# - - -###################################################################### -# Spectrogram -# ----------- -# -# To get the frequency make-up of an audio signal as it varies with time, -# you can use :py:func:`torchaudio.transforms.Spectrogram`. -# - -SPEECH_WAVEFORM, SAMPLE_RATE = torchaudio.load(SAMPLE_SPEECH) - -plot_waveform(SPEECH_WAVEFORM, SAMPLE_RATE, title="Original waveform") -Audio(SPEECH_WAVEFORM.numpy(), rate=SAMPLE_RATE) - - -###################################################################### -# - -n_fft = 1024 -win_length = None -hop_length = 512 - -# Define transform -spectrogram = T.Spectrogram( - n_fft=n_fft, - win_length=win_length, - hop_length=hop_length, - center=True, - pad_mode="reflect", - power=2.0, -) - -###################################################################### -# - -# Perform transform -spec = spectrogram(SPEECH_WAVEFORM) - -###################################################################### -# - -plot_spectrogram(spec[0], title="torchaudio") - -###################################################################### -# GriffinLim -# ---------- -# -# To recover a waveform from a spectrogram, you can use ``GriffinLim``. 
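######################################################################
# Griffin-Lim is needed because the power spectrogram discards phase. As a
# side note (a sketch, assuming a torchaudio version that provides
# ``InverseSpectrogram``), a complex-valued spectrogram kept with
# ``power=None`` can be inverted exactly instead:
#

complex_spec = T.Spectrogram(n_fft=1024, hop_length=512, power=None)(SPEECH_WAVEFORM)
exact_waveform = T.InverseSpectrogram(n_fft=1024, hop_length=512)(complex_spec)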
-# - -torch.random.manual_seed(0) - -n_fft = 1024 -win_length = None -hop_length = 512 - -spec = T.Spectrogram( - n_fft=n_fft, - win_length=win_length, - hop_length=hop_length, -)(SPEECH_WAVEFORM) - -###################################################################### -# - -griffin_lim = T.GriffinLim( - n_fft=n_fft, - win_length=win_length, - hop_length=hop_length, -) - -###################################################################### -# - -reconstructed_waveform = griffin_lim(spec) - -###################################################################### -# - -plot_waveform(reconstructed_waveform, SAMPLE_RATE, title="Reconstructed") -Audio(reconstructed_waveform, rate=SAMPLE_RATE) - -###################################################################### -# Mel Filter Bank -# --------------- -# -# :py:func:`torchaudio.functional.melscale_fbanks` generates the filter bank -# for converting frequency bins to mel-scale bins. -# -# Since this function does not require input audio/features, there is no -# equivalent transform in :py:func:`torchaudio.transforms`. -# - -n_fft = 256 -n_mels = 64 -sample_rate = 6000 - -mel_filters = F.melscale_fbanks( - int(n_fft // 2 + 1), - n_mels=n_mels, - f_min=0.0, - f_max=sample_rate / 2.0, - sample_rate=sample_rate, - norm="slaney", -) - -###################################################################### -# - -plot_fbank(mel_filters, "Mel Filter Bank - torchaudio") - -###################################################################### -# Comparison against librosa -# ~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# For reference, here is the equivalent way to get the mel filter bank -# with ``librosa``. -# - -mel_filters_librosa = librosa.filters.mel( - sr=sample_rate, - n_fft=n_fft, - n_mels=n_mels, - fmin=0.0, - fmax=sample_rate / 2.0, - norm="slaney", - htk=True, -).T - -###################################################################### -# - -plot_fbank(mel_filters_librosa, "Mel Filter Bank - librosa") - -mse = torch.square(mel_filters - mel_filters_librosa).mean().item() -print("Mean Square Difference: ", mse) - -###################################################################### -# MelSpectrogram -# -------------- -# -# Generating a mel-scale spectrogram involves generating a spectrogram -# and performing mel-scale conversion. In ``torchaudio``, -# :py:func:`torchaudio.transforms.MelSpectrogram` provides -# this functionality. -# - -n_fft = 1024 -win_length = None -hop_length = 512 -n_mels = 128 - -mel_spectrogram = T.MelSpectrogram( - sample_rate=sample_rate, - n_fft=n_fft, - win_length=win_length, - hop_length=hop_length, - center=True, - pad_mode="reflect", - power=2.0, - norm="slaney", - onesided=True, - n_mels=n_mels, - mel_scale="htk", -) - -melspec = mel_spectrogram(SPEECH_WAVEFORM) - -###################################################################### -# - -plot_spectrogram(melspec[0], title="MelSpectrogram - torchaudio", ylabel="mel freq") - -###################################################################### -# Comparison against librosa -# ~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# For reference, here is the equivalent means of generating mel-scale -# spectrograms with ``librosa``. 
-# - -melspec_librosa = librosa.feature.melspectrogram( - y=SPEECH_WAVEFORM.numpy()[0], - sr=sample_rate, - n_fft=n_fft, - hop_length=hop_length, - win_length=win_length, - center=True, - pad_mode="reflect", - power=2.0, - n_mels=n_mels, - norm="slaney", - htk=True, -) - -###################################################################### -# - -plot_spectrogram(melspec_librosa, title="MelSpectrogram - librosa", ylabel="mel freq") - -mse = torch.square(melspec - melspec_librosa).mean().item() -print("Mean Square Difference: ", mse) - -###################################################################### -# MFCC -# ---- -# - -n_fft = 2048 -win_length = None -hop_length = 512 -n_mels = 256 -n_mfcc = 256 - -mfcc_transform = T.MFCC( - sample_rate=sample_rate, - n_mfcc=n_mfcc, - melkwargs={ - "n_fft": n_fft, - "n_mels": n_mels, - "hop_length": hop_length, - "mel_scale": "htk", - }, -) - -mfcc = mfcc_transform(SPEECH_WAVEFORM) - -###################################################################### -# - -plot_spectrogram(mfcc[0]) - -###################################################################### -# Comparison against librosa -# ~~~~~~~~~~~~~~~~~~~~~~~~~~ -# - -melspec = librosa.feature.melspectrogram( - y=SPEECH_WAVEFORM.numpy()[0], - sr=sample_rate, - n_fft=n_fft, - win_length=win_length, - hop_length=hop_length, - n_mels=n_mels, - htk=True, - norm=None, -) - -mfcc_librosa = librosa.feature.mfcc( - S=librosa.core.spectrum.power_to_db(melspec), - n_mfcc=n_mfcc, - dct_type=2, - norm="ortho", -) - -###################################################################### -# - -plot_spectrogram(mfcc_librosa) - -mse = torch.square(mfcc - mfcc_librosa).mean().item() -print("Mean Square Difference: ", mse) - -###################################################################### -# LFCC -# ---- -# - -n_fft = 2048 -win_length = None -hop_length = 512 -n_lfcc = 256 - -lfcc_transform = T.LFCC( - sample_rate=sample_rate, - n_lfcc=n_lfcc, - speckwargs={ - "n_fft": n_fft, - "win_length": win_length, - "hop_length": hop_length, - }, -) - -lfcc = lfcc_transform(SPEECH_WAVEFORM) -plot_spectrogram(lfcc[0]) - -###################################################################### -# Pitch -# ----- -# - -pitch = F.detect_pitch_frequency(SPEECH_WAVEFORM, SAMPLE_RATE) - -###################################################################### -# - -def plot_pitch(waveform, sr, pitch): - figure, axis = plt.subplots(1, 1) - axis.set_title("Pitch Feature") - axis.grid(True) - - end_time = waveform.shape[1] / sr - time_axis = torch.linspace(0, end_time, waveform.shape[1]) - axis.plot(time_axis, waveform[0], linewidth=1, color="gray", alpha=0.3) - - axis2 = axis.twinx() - time_axis = torch.linspace(0, end_time, pitch.shape[1]) - axis2.plot(time_axis, pitch[0], linewidth=2, label="Pitch", color="green") - - axis2.legend(loc=0) - plt.show(block=False) - - -plot_pitch(SPEECH_WAVEFORM, SAMPLE_RATE, pitch) - -###################################################################### -# Kaldi Pitch (beta) -# ------------------ -# -# Kaldi Pitch feature [1] is a pitch detection mechanism tuned for automatic -# speech recognition (ASR) applications. This is a beta feature in ``torchaudio``, -# and it is available as :py:func:`torchaudio.functional.compute_kaldi_pitch`. -# -# 1. A pitch extraction algorithm tuned for automatic speech recognition -# -# Ghahremani, B. BabaAli, D. Povey, K. Riedhammer, J. Trmal and S. 
-# Khudanpur -# -# 2014 IEEE International Conference on Acoustics, Speech and Signal -# Processing (ICASSP), Florence, 2014, pp. 2494-2498, doi: -# 10.1109/ICASSP.2014.6854049. -# [`abstract `__], -# [`paper `__] -# - -pitch_feature = F.compute_kaldi_pitch(SPEECH_WAVEFORM, SAMPLE_RATE) -pitch, nfcc = pitch_feature[..., 0], pitch_feature[..., 1] - -###################################################################### -# - -def plot_kaldi_pitch(waveform, sr, pitch, nfcc): - _, axis = plt.subplots(1, 1) - axis.set_title("Kaldi Pitch Feature") - axis.grid(True) - - end_time = waveform.shape[1] / sr - time_axis = torch.linspace(0, end_time, waveform.shape[1]) - axis.plot(time_axis, waveform[0], linewidth=1, color="gray", alpha=0.3) - - time_axis = torch.linspace(0, end_time, pitch.shape[1]) - ln1 = axis.plot(time_axis, pitch[0], linewidth=2, label="Pitch", color="green") - axis.set_ylim((-1.3, 1.3)) - - axis2 = axis.twinx() - time_axis = torch.linspace(0, end_time, nfcc.shape[1]) - ln2 = axis2.plot(time_axis, nfcc[0], linewidth=2, label="NFCC", color="blue", linestyle="--") - - lns = ln1 + ln2 - labels = [l.get_label() for l in lns] - axis.legend(lns, labels, loc=0) - plt.show(block=False) - - -plot_kaldi_pitch(SPEECH_WAVEFORM, SAMPLE_RATE, pitch, nfcc) diff --git a/beginner_source/audio_feature_extractions_tutorial.rst b/beginner_source/audio_feature_extractions_tutorial.rst new file mode 100644 index 00000000000..a2a8da4ab75 --- /dev/null +++ b/beginner_source/audio_feature_extractions_tutorial.rst @@ -0,0 +1,10 @@ +Audio Feature Extractions +========================= + +This tutorial has been moved to https://pytorch.org/audio/stable/tutorials/audio_feature_extractions_tutorial.html + +It will redirect in 3 seconds. + +.. raw:: html + + diff --git a/beginner_source/audio_io_tutorial.py b/beginner_source/audio_io_tutorial.py deleted file mode 100644 index 4917f1b1025..00000000000 --- a/beginner_source/audio_io_tutorial.py +++ /dev/null @@ -1,385 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Audio I/O -========= - -This tutorial shows how to use TorchAudio's basic I/O API to load audio files -into PyTorch's Tensor object, and save Tensor objects to audio files. -""" - -import torch -import torchaudio - -print(torch.__version__) -print(torchaudio.__version__) - -###################################################################### -# Preparation -# ----------- -# -# First, we import the modules and download the audio assets we use in this tutorial. -# -# .. note:: -# When running this tutorial in Google Colab, install the required packages -# with the following: -# -# .. code:: -# -# !pip install boto3 - -import io -import os -import tarfile -import tempfile - -import boto3 -import matplotlib.pyplot as plt -import requests -from botocore import UNSIGNED -from botocore.config import Config -from IPython.display import Audio -from torchaudio.utils import download_asset - -SAMPLE_GSM = download_asset("tutorial-assets/steam-train-whistle-daniel_simon.gsm") -SAMPLE_WAV = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav") -SAMPLE_WAV_8000 = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042-8000hz.wav") - - - -###################################################################### -# Querying audio metadata -# ----------------------- -# -# Function :py:func:`torchaudio.info` fetches audio metadata. -# You can provide a path-like object or file-like object. 
-# - -metadata = torchaudio.info(SAMPLE_WAV) -print(metadata) - -###################################################################### -# Where -# -# - ``sample_rate`` is the sampling rate of the audio -# - ``num_channels`` is the number of channels -# - ``num_frames`` is the number of frames per channel -# - ``bits_per_sample`` is bit depth -# - ``encoding`` is the sample coding format -# -# ``encoding`` can take on one of the following values: -# -# - ``"PCM_S"``: Signed integer linear PCM -# - ``"PCM_U"``: Unsigned integer linear PCM -# - ``"PCM_F"``: Floating point linear PCM -# - ``"FLAC"``: Flac, `Free Lossless Audio -# Codec `__ -# - ``"ULAW"``: Mu-law, -# [`wikipedia `__] -# - ``"ALAW"``: A-law -# [`wikipedia `__] -# - ``"MP3"`` : MP3, MPEG-1 Audio Layer III -# - ``"VORBIS"``: OGG Vorbis [`xiph.org `__] -# - ``"AMR_NB"``: Adaptive Multi-Rate -# [`wikipedia `__] -# - ``"AMR_WB"``: Adaptive Multi-Rate Wideband -# [`wikipedia `__] -# - ``"OPUS"``: Opus [`opus-codec.org `__] -# - ``"GSM"``: GSM-FR -# [`wikipedia `__] -# - ``"HTK"``: Single channel 16-bit PCM -# - ``"UNKNOWN"`` None of above -# - -###################################################################### -# **Note** -# -# - ``bits_per_sample`` can be ``0`` for formats with compression and/or -# variable bit rate (such as MP3). -# - ``num_frames`` can be ``0`` for GSM-FR format. -# - -metadata = torchaudio.info(SAMPLE_GSM) -print(metadata) - - -###################################################################### -# Querying file-like object -# ------------------------- -# -# :py:func:`torchaudio.info` works on file-like objects. -# - -url = "https://download.pytorch.org/torchaudio/tutorial-assets/steam-train-whistle-daniel_simon.wav" -with requests.get(url, stream=True) as response: - metadata = torchaudio.info(response.raw) -print(metadata) - -###################################################################### -# .. note:: -# -# When passing a file-like object, ``info`` does not read -# all of the underlying data; rather, it reads only a portion -# of the data from the beginning. -# Therefore, for a given audio format, it may not be able to retrieve the -# correct metadata, including the format itself. In such case, you -# can pass ``format`` argument to specify the format of the audio. - -###################################################################### -# Loading audio data -# ------------------ -# -# To load audio data, you can use :py:func:`torchaudio.load`. -# -# This function accepts a path-like object or file-like object as input. -# -# The returned value is a tuple of waveform (``Tensor``) and sample rate -# (``int``). -# -# By default, the resulting tensor object has ``dtype=torch.float32`` and -# its value range is ``[-1.0, 1.0]``. -# -# For the list of supported format, please refer to `the torchaudio -# documentation `__. 
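######################################################################
# As a side note to the default behavior described above (an illustrative
# sketch): the float32 / [-1.0, 1.0] conversion is controlled by the
# ``normalize`` argument. For WAV files, passing ``normalize=False`` keeps the
# integer samples as stored on disk; for compressed formats the argument has
# no effect and floating-point samples are returned regardless.
#

int_waveform, _ = torchaudio.load(SAMPLE_WAV, normalize=False)
print(int_waveform.dtype)  # typically torch.int16 for 16-bit signed PCM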
-# - -waveform, sample_rate = torchaudio.load(SAMPLE_WAV) - - -###################################################################### -# -def plot_waveform(waveform, sample_rate): - waveform = waveform.numpy() - - num_channels, num_frames = waveform.shape - time_axis = torch.arange(0, num_frames) / sample_rate - - figure, axes = plt.subplots(num_channels, 1) - if num_channels == 1: - axes = [axes] - for c in range(num_channels): - axes[c].plot(time_axis, waveform[c], linewidth=1) - axes[c].grid(True) - if num_channels > 1: - axes[c].set_ylabel(f"Channel {c+1}") - figure.suptitle("waveform") - plt.show(block=False) - - -###################################################################### -# -plot_waveform(waveform, sample_rate) - - -###################################################################### -# -def plot_specgram(waveform, sample_rate, title="Spectrogram"): - waveform = waveform.numpy() - - num_channels, num_frames = waveform.shape - - figure, axes = plt.subplots(num_channels, 1) - if num_channels == 1: - axes = [axes] - for c in range(num_channels): - axes[c].specgram(waveform[c], Fs=sample_rate) - if num_channels > 1: - axes[c].set_ylabel(f"Channel {c+1}") - figure.suptitle(title) - plt.show(block=False) - - -###################################################################### -# -plot_specgram(waveform, sample_rate) - - -###################################################################### -# -Audio(waveform.numpy()[0], rate=sample_rate) - -###################################################################### -# Loading from file-like object -# ----------------------------- -# -# The I/O functions support file-like objects. -# This allows for fetching and decoding audio data from locations -# within and beyond the local file system. -# The following examples illustrate this. -# - -###################################################################### -# - -# Load audio data as HTTP request -url = "https://download.pytorch.org/torchaudio/tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav" -with requests.get(url, stream=True) as response: - waveform, sample_rate = torchaudio.load(response.raw) -plot_specgram(waveform, sample_rate, title="HTTP datasource") - -###################################################################### -# - -# Load audio from tar file -tar_path = download_asset("tutorial-assets/VOiCES_devkit.tar.gz") -tar_item = "VOiCES_devkit/source-16k/train/sp0307/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav" -with tarfile.open(tar_path, mode="r") as tarfile_: - fileobj = tarfile_.extractfile(tar_item) - waveform, sample_rate = torchaudio.load(fileobj) -plot_specgram(waveform, sample_rate, title="TAR file") - -###################################################################### -# - -# Load audio from S3 -bucket = "pytorch-tutorial-assets" -key = "VOiCES_devkit/source-16k/train/sp0307/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav" -client = boto3.client("s3", config=Config(signature_version=UNSIGNED)) -response = client.get_object(Bucket=bucket, Key=key) -waveform, sample_rate = torchaudio.load(response["Body"]) -plot_specgram(waveform, sample_rate, title="From S3") - - -###################################################################### -# Tips on slicing -# --------------- -# -# Providing ``num_frames`` and ``frame_offset`` arguments restricts -# decoding to the corresponding segment of the input. -# -# The same result can be achieved using vanilla Tensor slicing, -# (i.e. ``waveform[:, frame_offset:frame_offset+num_frames]``). 
However, -# providing ``num_frames`` and ``frame_offset`` arguments is more -# efficient. -# -# This is because the function will end data acquisition and decoding -# once it finishes decoding the requested frames. This is advantageous -# when the audio data are transferred via network as the data transfer will -# stop as soon as the necessary amount of data is fetched. -# -# The following example illustrates this. -# - -# Illustration of two different decoding methods. -# The first one will fetch all the data and decode them, while -# the second one will stop fetching data once it completes decoding. -# The resulting waveforms are identical. - -frame_offset, num_frames = 16000, 16000 # Fetch and decode the 1 - 2 seconds - -url = "https://download.pytorch.org/torchaudio/tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav" -print("Fetching all the data...") -with requests.get(url, stream=True) as response: - waveform1, sample_rate1 = torchaudio.load(response.raw) - waveform1 = waveform1[:, frame_offset : frame_offset + num_frames] - print(f" - Fetched {response.raw.tell()} bytes") - -print("Fetching until the requested frames are available...") -with requests.get(url, stream=True) as response: - waveform2, sample_rate2 = torchaudio.load(response.raw, frame_offset=frame_offset, num_frames=num_frames) - print(f" - Fetched {response.raw.tell()} bytes") - -print("Checking the resulting waveform ... ", end="") -assert (waveform1 == waveform2).all() -print("matched!") - -###################################################################### -# Saving audio to file -# -------------------- -# -# To save audio data in formats interpretable by common applications, -# you can use :py:func:`torchaudio.save`. -# -# This function accepts a path-like object or file-like object. -# -# When passing a file-like object, you also need to provide argument ``format`` -# so that the function knows which format it should use. In the -# case of a path-like object, the function will infer the format from -# the extension. If you are saving to a file without an extension, you need -# to provide argument ``format``. -# -# When saving WAV-formatted data, the default encoding for ``float32`` Tensor -# is 32-bit floating-point PCM. You can provide arguments ``encoding`` and -# ``bits_per_sample`` to change this behavior. For example, to save data -# in 16-bit signed integer PCM, you can do the following. -# -# .. note:: -# -# Saving data in encodings with a lower bit depth reduces the -# resulting file size but also precision. -# - -waveform, sample_rate = torchaudio.load(SAMPLE_WAV) - - -###################################################################### -# - -def inspect_file(path): - print("-" * 10) - print("Source:", path) - print("-" * 10) - print(f" - File size: {os.path.getsize(path)} bytes") - print(f" - {torchaudio.info(path)}") - print() - -###################################################################### -# -# Save without any encoding option. 
-# The function will pick up the encoding which -# the provided data fit -with tempfile.TemporaryDirectory() as tempdir: - path = f"{tempdir}/save_example_default.wav" - torchaudio.save(path, waveform, sample_rate) - inspect_file(path) - -###################################################################### -# -# Save as 16-bit signed integer Linear PCM -# The resulting file occupies half the storage but loses precision -with tempfile.TemporaryDirectory() as tempdir: - path = f"{tempdir}/save_example_PCM_S16.wav" - torchaudio.save(path, waveform, sample_rate, encoding="PCM_S", bits_per_sample=16) - inspect_file(path) - - -###################################################################### -# :py:func:`torchaudio.save` can also handle other formats. -# To name a few: -# - -formats = [ - "flac", - "vorbis", - "sph", - "amb", - "amr-nb", - "gsm", -] - -###################################################################### -# -waveform, sample_rate = torchaudio.load(SAMPLE_WAV_8000) -with tempfile.TemporaryDirectory() as tempdir: - for format in formats: - path = f"{tempdir}/save_example.{format}" - torchaudio.save(path, waveform, sample_rate, format=format) - inspect_file(path) - -###################################################################### -# Saving to file-like object -# -------------------------- -# -# Similar to the other I/O functions, you can save audio to file-like -# objects. When saving to a file-like object, argument ``format`` is -# required. -# - - -waveform, sample_rate = torchaudio.load(SAMPLE_WAV) - -# Saving to bytes buffer -buffer_ = io.BytesIO() -torchaudio.save(buffer_, waveform, sample_rate, format="wav") - -buffer_.seek(0) -print(buffer_.read(16)) diff --git a/beginner_source/audio_io_tutorial.rst b/beginner_source/audio_io_tutorial.rst new file mode 100644 index 00000000000..3263ad93a98 --- /dev/null +++ b/beginner_source/audio_io_tutorial.rst @@ -0,0 +1,10 @@ +Audio I/O +========= + +This tutorial has been moved to https://pytorch.org/audio/stable/tutorials/audio_io_tutorial.html + +It will redirect in 3 seconds. + +.. raw:: html + + diff --git a/beginner_source/audio_resampling_tutorial.rst b/beginner_source/audio_resampling_tutorial.rst index 0a611110578..01210830eb7 100644 --- a/beginner_source/audio_resampling_tutorial.rst +++ b/beginner_source/audio_resampling_tutorial.rst @@ -1,3 +1,6 @@ +Audio Resampling +================ + This tutorial has been moved to `a new location `_ You will be redirected in 3 seconds. diff --git a/intermediate_source/forced_alignment_with_torchaudio_tutorial.py b/intermediate_source/forced_alignment_with_torchaudio_tutorial.py deleted file mode 100644 index 1a5e8025d8f..00000000000 --- a/intermediate_source/forced_alignment_with_torchaudio_tutorial.py +++ /dev/null @@ -1,528 +0,0 @@ -""" -Forced Alignment with Wav2Vec2 -============================== - -**Author** `Moto Hira `__ - -This tutorial shows how to align transcript to speech with -``torchaudio``, using CTC segmentation algorithm described in -`CTC-Segmentation of Large Corpora for German End-to-end Speech -Recognition `__. - -""" - -import torch -import torchaudio - -print(torch.__version__) -print(torchaudio.__version__) - - -device = torch.device("cuda" if torch.cuda.is_available() else "cpu") -print(device) - - -###################################################################### -# Overview -# -------- -# -# The process of alignment looks like the following. -# -# 1. Estimate the frame-wise label probability from audio waveform -# 2. 
Generate the trellis matrix which represents the probability of -# labels aligned at time step. -# 3. Find the most likely path from the trellis matrix. -# -# In this example, we use ``torchaudio``\ ’s ``Wav2Vec2`` model for -# acoustic feature extraction. -# - - -###################################################################### -# Preparation -# ----------- -# -# First we import the necessary packages, and fetch data that we work on. -# - -# %matplotlib inline - -from dataclasses import dataclass - -import IPython -import matplotlib -import matplotlib.pyplot as plt - -matplotlib.rcParams["figure.figsize"] = [16.0, 4.8] - -torch.random.manual_seed(0) - -SPEECH_FILE = torchaudio.utils.download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav") - - -###################################################################### -# Generate frame-wise label probability -# ------------------------------------- -# -# The first step is to generate the label class porbability of each aduio -# frame. We can use a Wav2Vec2 model that is trained for ASR. Here we use -# :py:func:`torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H`. -# -# ``torchaudio`` provides easy access to pretrained models with associated -# labels. -# -# .. note:: -# -# In the subsequent sections, we will compute the probability in -# log-domain to avoid numerical instability. For this purpose, we -# normalize the ``emission`` with :py:func:`torch.log_softmax`. -# - -bundle = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H -model = bundle.get_model().to(device) -labels = bundle.get_labels() -with torch.inference_mode(): - waveform, _ = torchaudio.load(SPEECH_FILE) - emissions, _ = model(waveform.to(device)) - emissions = torch.log_softmax(emissions, dim=-1) - -emission = emissions[0].cpu().detach() - -################################################################################ -# Visualization -################################################################################ -print(labels) -plt.imshow(emission.T) -plt.colorbar() -plt.title("Frame-wise class probability") -plt.xlabel("Time") -plt.ylabel("Labels") -plt.show() - - -###################################################################### -# Generate alignment probability (trellis) -# ---------------------------------------- -# -# From the emission matrix, next we generate the trellis which represents -# the probability of transcript labels occur at each time frame. -# -# Trellis is 2D matrix with time axis and label axis. The label axis -# represents the transcript that we are aligning. In the following, we use -# :math:`t` to denote the index in time axis and :math:`j` to denote the -# index in label axis. :math:`c_j` represents the label at label index -# :math:`j`. -# -# To generate, the probability of time step :math:`t+1`, we look at the -# trellis from time step :math:`t` and emission at time step :math:`t+1`. -# There are two path to reach to time step :math:`t+1` with label -# :math:`c_{j+1}`. The first one is the case where the label was -# :math:`c_{j+1}` at :math:`t` and there was no label change from -# :math:`t` to :math:`t+1`. The other case is where the label was -# :math:`c_j` at :math:`t` and it transitioned to the next label -# :math:`c_{j+1}` at :math:`t+1`. -# -# The follwoing diagram illustrates this transition. -# -# .. 
image:: https://download.pytorch.org/torchaudio/tutorial-assets/ctc-forward.png -# -# Since we are looking for the most likely transitions, we take the more -# likely path for the value of :math:`k_{(t+1, j+1)}`, that is -# -# :math:`k_{(t+1, j+1)} = max( k_{(t, j)} p(t+1, c_{j+1}), k_{(t, j+1)} p(t+1, repeat) )` -# -# where :math:`k` represents is trellis matrix, and :math:`p(t, c_j)` -# represents the probability of label :math:`c_j` at time step :math:`t`. -# :math:`repeat` represents the blank token from CTC formulation. (For the -# detail of CTC algorithm, please refer to the *Sequence Modeling with CTC* -# [`distill.pub `__]) -# - -transcript = "I|HAD|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT" -dictionary = {c: i for i, c in enumerate(labels)} - -tokens = [dictionary[c] for c in transcript] -print(list(zip(transcript, tokens))) - - -def get_trellis(emission, tokens, blank_id=0): - num_frame = emission.size(0) - num_tokens = len(tokens) - - # Trellis has extra diemsions for both time axis and tokens. - # The extra dim for tokens represents (start-of-sentence) - # The extra dim for time axis is for simplification of the code. - trellis = torch.empty((num_frame + 1, num_tokens + 1)) - trellis[0, 0] = 0 - trellis[1:, 0] = torch.cumsum(emission[:, 0], 0) - trellis[0, -num_tokens:] = -float("inf") - trellis[-num_tokens:, 0] = float("inf") - - for t in range(num_frame): - trellis[t + 1, 1:] = torch.maximum( - # Score for staying at the same token - trellis[t, 1:] + emission[t, blank_id], - # Score for changing to the next token - trellis[t, :-1] + emission[t, tokens], - ) - return trellis - - -trellis = get_trellis(emission, tokens) - -################################################################################ -# Visualization -################################################################################ -plt.imshow(trellis[1:, 1:].T, origin="lower") -plt.annotate("- Inf", (trellis.size(1) / 5, trellis.size(1) / 1.5)) -plt.colorbar() -plt.show() - -###################################################################### -# In the above visualization, we can see that there is a trace of high -# probability crossing the matrix diagonally. -# - - -###################################################################### -# Find the most likely path (backtracking) -# ---------------------------------------- -# -# Once the trellis is generated, we will traverse it following the -# elements with high probability. -# -# We will start from the last label index with the time step of highest -# probability, then, we traverse back in time, picking stay -# (:math:`c_j \rightarrow c_j`) or transition -# (:math:`c_j \rightarrow c_{j+1}`), based on the post-transition -# probability :math:`k_{t, j} p(t+1, c_{j+1})` or -# :math:`k_{t, j+1} p(t+1, repeat)`. -# -# Transition is done once the label reaches the beginning. -# -# The trellis matrix is used for path-finding, but for the final -# probability of each segment, we take the frame-wise probability from -# emission matrix. -# - - -@dataclass -class Point: - token_index: int - time_index: int - score: float - - -def backtrack(trellis, emission, tokens, blank_id=0): - # Note: - # j and t are indices for trellis, which has extra dimensions - # for time and tokens at the beginning. - # When referring to time frame index `T` in trellis, - # the corresponding index in emission is `T-1`. - # Similarly, when referring to token index `J` in trellis, - # the corresponding index in transcript is `J-1`. 
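    # In short: start from the time frame where the fully-aligned score is
    # highest, then walk backwards one frame at a time, comparing the score of
    # "stayed on the same token" with the score of "just advanced from the
    # previous token", and follow whichever is larger.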
- j = trellis.size(1) - 1 - t_start = torch.argmax(trellis[:, j]).item() - - path = [] - for t in range(t_start, 0, -1): - # 1. Figure out if the current position was stay or change - # Note (again): - # `emission[J-1]` is the emission at time frame `J` of trellis dimension. - # Score for token staying the same from time frame J-1 to T. - stayed = trellis[t - 1, j] + emission[t - 1, blank_id] - # Score for token changing from C-1 at T-1 to J at T. - changed = trellis[t - 1, j - 1] + emission[t - 1, tokens[j - 1]] - - # 2. Store the path with frame-wise probability. - prob = emission[t - 1, tokens[j - 1] if changed > stayed else 0].exp().item() - # Return token index and time index in non-trellis coordinate. - path.append(Point(j - 1, t - 1, prob)) - - # 3. Update the token - if changed > stayed: - j -= 1 - if j == 0: - break - else: - raise ValueError("Failed to align") - return path[::-1] - - -path = backtrack(trellis, emission, tokens) -for p in path: - print(p) - - -################################################################################ -# Visualization -################################################################################ -def plot_trellis_with_path(trellis, path): - # To plot trellis with path, we take advantage of 'nan' value - trellis_with_path = trellis.clone() - for _, p in enumerate(path): - trellis_with_path[p.time_index, p.token_index] = float("nan") - plt.imshow(trellis_with_path[1:, 1:].T, origin="lower") - - -plot_trellis_with_path(trellis, path) -plt.title("The path found by backtracking") -plt.show() - -###################################################################### -# Looking good. Now this path contains repetations for the same labels, so -# let’s merge them to make it close to the original transcript. -# -# When merging the multiple path points, we simply take the average -# probability for the merged segments. 
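######################################################################
# For example, if a label occupies three consecutive frames with frame-wise
# probabilities 0.2, 0.9 and 0.7, the merged segment is assigned the score
# (0.2 + 0.9 + 0.7) / 3 = 0.6.
#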
-#
-
-
-# Merge the labels
-@dataclass
-class Segment:
-    label: str
-    start: int
-    end: int
-    score: float
-
-    def __repr__(self):
-        return f"{self.label}\t({self.score:4.2f}): [{self.start:5d}, {self.end:5d})"
-
-    @property
-    def length(self):
-        return self.end - self.start
-
-
-def merge_repeats(path):
-    i1, i2 = 0, 0
-    segments = []
-    while i1 < len(path):
-        while i2 < len(path) and path[i1].token_index == path[i2].token_index:
-            i2 += 1
-        score = sum(path[k].score for k in range(i1, i2)) / (i2 - i1)
-        segments.append(
-            Segment(
-                transcript[path[i1].token_index],
-                path[i1].time_index,
-                path[i2 - 1].time_index + 1,
-                score,
-            )
-        )
-        i1 = i2
-    return segments
-
-
-segments = merge_repeats(path)
-for seg in segments:
-    print(seg)
-
-
-################################################################################
-# Visualization
-################################################################################
-def plot_trellis_with_segments(trellis, segments, transcript):
-    # To plot the trellis with the path, we take advantage of 'nan' values.
-    trellis_with_path = trellis.clone()
-    for i, seg in enumerate(segments):
-        if seg.label != "|":
-            trellis_with_path[seg.start + 1 : seg.end + 1, i + 1] = float("nan")
-
-    fig, [ax1, ax2] = plt.subplots(2, 1, figsize=(16, 9.5))
-    ax1.set_title("Path, label and probability for each label")
-    ax1.imshow(trellis_with_path.T, origin="lower")
-    ax1.set_xticks([])
-
-    for i, seg in enumerate(segments):
-        if seg.label != "|":
-            ax1.annotate(seg.label, (seg.start + 0.7, i + 0.3), weight="bold")
-            ax1.annotate(f"{seg.score:.2f}", (seg.start - 0.3, i + 4.3))
-
-    ax2.set_title("Label probability with and without repetition")
-    xs, hs, ws = [], [], []
-    for seg in segments:
-        if seg.label != "|":
-            xs.append((seg.end + seg.start) / 2 + 0.4)
-            hs.append(seg.score)
-            ws.append(seg.end - seg.start)
-            ax2.annotate(seg.label, (seg.start + 0.8, -0.07), weight="bold")
-    ax2.bar(xs, hs, width=ws, color="gray", alpha=0.5, edgecolor="black")
-
-    xs, hs = [], []
-    for p in path:
-        label = transcript[p.token_index]
-        if label != "|":
-            xs.append(p.time_index + 1)
-            hs.append(p.score)
-
-    ax2.bar(xs, hs, width=0.5, alpha=0.5)
-    ax2.axhline(0, color="black")
-    ax2.set_xlim(ax1.get_xlim())
-    ax2.set_ylim(-0.1, 1.1)
-
-
-plot_trellis_with_segments(trellis, segments, transcript)
-plt.tight_layout()
-plt.show()
-
-
-######################################################################
-# Looks good. Now let's merge the words. The Wav2Vec2 model uses ``'|'``
-# as the word boundary, so we merge the segments before each occurrence of
-# ``'|'``.
-#
-# Then, finally, we slice the original audio into word segments and
-# listen to them to see if the segmentation is correct.
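-#
-# When the character segments are merged into words below, the word score is
-# the length-weighted average of its character scores; for example, segments
-# of lengths 2 and 3 with scores 0.8 and 0.6 merge into a word with score
-# (2 * 0.8 + 3 * 0.6) / 5 = 0.68.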
-# - -# Merge words -def merge_words(segments, separator="|"): - words = [] - i1, i2 = 0, 0 - while i1 < len(segments): - if i2 >= len(segments) or segments[i2].label == separator: - if i1 != i2: - segs = segments[i1:i2] - word = "".join([seg.label for seg in segs]) - score = sum(seg.score * seg.length for seg in segs) / sum(seg.length for seg in segs) - words.append(Segment(word, segments[i1].start, segments[i2 - 1].end, score)) - i1 = i2 + 1 - i2 = i1 - else: - i2 += 1 - return words - - -word_segments = merge_words(segments) -for word in word_segments: - print(word) - - -################################################################################ -# Visualization -################################################################################ -def plot_alignments(trellis, segments, word_segments, waveform): - trellis_with_path = trellis.clone() - for i, seg in enumerate(segments): - if seg.label != "|": - trellis_with_path[seg.start + 1 : seg.end + 1, i + 1] = float("nan") - - fig, [ax1, ax2] = plt.subplots(2, 1, figsize=(16, 9.5)) - - ax1.imshow(trellis_with_path[1:, 1:].T, origin="lower") - ax1.set_xticks([]) - ax1.set_yticks([]) - - for word in word_segments: - ax1.axvline(word.start - 0.5) - ax1.axvline(word.end - 0.5) - - for i, seg in enumerate(segments): - if seg.label != "|": - ax1.annotate(seg.label, (seg.start, i + 0.3)) - ax1.annotate(f"{seg.score:.2f}", (seg.start, i + 4), fontsize=8) - - # The original waveform - ratio = waveform.size(0) / (trellis.size(0) - 1) - ax2.plot(waveform) - for word in word_segments: - x0 = ratio * word.start - x1 = ratio * word.end - ax2.axvspan(x0, x1, alpha=0.1, color="red") - ax2.annotate(f"{word.score:.2f}", (x0, 0.8)) - - for seg in segments: - if seg.label != "|": - ax2.annotate(seg.label, (seg.start * ratio, 0.9)) - xticks = ax2.get_xticks() - plt.xticks(xticks, xticks / bundle.sample_rate) - ax2.set_xlabel("time [second]") - ax2.set_yticks([]) - ax2.set_ylim(-1.0, 1.0) - ax2.set_xlim(0, waveform.size(-1)) - - -plot_alignments( - trellis, - segments, - word_segments, - waveform[0], -) -plt.show() - -################################################################################ -# - -# A trick to embed the resulting audio to the generated file. -# `IPython.display.Audio` has to be the last call in a cell, -# and there should be only one call par cell. 
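-# The helper below converts word boundaries from trellis frame indices to
-# waveform sample indices (``ratio`` is the number of samples per trellis
-# frame) and returns the corresponding audio clip.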
-def display_segment(i): - ratio = waveform.size(1) / (trellis.size(0) - 1) - word = word_segments[i] - x0 = int(ratio * word.start) - x1 = int(ratio * word.end) - print(f"{word.label} ({word.score:.2f}): {x0 / bundle.sample_rate:.3f} - {x1 / bundle.sample_rate:.3f} sec") - segment = waveform[:, x0:x1] - return IPython.display.Audio(segment.numpy(), rate=bundle.sample_rate) - - -###################################################################### -# - -# Generate the audio for each segment -print(transcript) -IPython.display.Audio(SPEECH_FILE) - - -###################################################################### -# - -display_segment(0) - -###################################################################### -# - -display_segment(1) - -###################################################################### -# - -display_segment(2) - -###################################################################### -# - -display_segment(3) - -###################################################################### -# - -display_segment(4) - -###################################################################### -# - -display_segment(5) - -###################################################################### -# - -display_segment(6) - -###################################################################### -# - -display_segment(7) - -###################################################################### -# - -display_segment(8) - -###################################################################### -# Conclusion -# ---------- -# -# In this tutorial, we looked how to use torchaudio’s Wav2Vec2 model to -# perform CTC segmentation for forced alignment. -# diff --git a/intermediate_source/forced_alignment_with_torchaudio_tutorial.rst b/intermediate_source/forced_alignment_with_torchaudio_tutorial.rst new file mode 100644 index 00000000000..4c9752d016d --- /dev/null +++ b/intermediate_source/forced_alignment_with_torchaudio_tutorial.rst @@ -0,0 +1,11 @@ +Forced Alignment with Wav2Vec2 +============================== + +This tutorial has been moved to https://pytorch.org/audio/stable/tutorials/forced_alignment_tutorial.html + +It will redirect in 3 seconds. + +.. raw:: html + + + diff --git a/intermediate_source/speech_command_classification_with_torchaudio_tutorial.py b/intermediate_source/speech_command_classification_with_torchaudio_tutorial.py deleted file mode 100644 index ba7ff93a875..00000000000 --- a/intermediate_source/speech_command_classification_with_torchaudio_tutorial.py +++ /dev/null @@ -1,545 +0,0 @@ -""" -Speech Command Classification with torchaudio -********************************************* - -This tutorial will show you how to correctly format an audio dataset and -then train/test an audio classifier network on the dataset. - -Colab has GPU option available. In the menu tabs, select “Runtime” then -“Change runtime type”. In the pop-up that follows, you can choose GPU. -After the change, your runtime should automatically restart (which means -information from executed cells disappear). - -First, let’s import the common torch packages such as -`torchaudio `__ that can be installed -by following the instructions on the website. 
- -""" - -# Uncomment the line corresponding to your "runtime type" to run in Google Colab - -# CPU: -# !pip install pydub torch==1.7.0+cpu torchvision==0.8.1+cpu torchaudio==0.7.0 -f https://download.pytorch.org/whl/torch_stable.html - -# GPU: -# !pip install pydub torch==1.7.0+cu101 torchvision==0.8.1+cu101 torchaudio==0.7.0 -f https://download.pytorch.org/whl/torch_stable.html - -import torch -import torch.nn as nn -import torch.nn.functional as F -import torch.optim as optim -import torchaudio -import sys - -import matplotlib.pyplot as plt -import IPython.display as ipd - -from tqdm import tqdm - - -###################################################################### -# Let’s check if a CUDA GPU is available and select our device. Running -# the network on a GPU will greatly decrease the training/testing runtime. -# - -device = torch.device("cuda" if torch.cuda.is_available() else "cpu") -print(device) - - -###################################################################### -# Importing the Dataset -# --------------------- -# -# We use torchaudio to download and represent the dataset. Here we use -# `SpeechCommands `__, which is a -# datasets of 35 commands spoken by different people. The dataset -# ``SPEECHCOMMANDS`` is a ``torch.utils.data.Dataset`` version of the -# dataset. In this dataset, all audio files are about 1 second long (and -# so about 16000 time frames long). -# -# The actual loading and formatting steps happen when a data point is -# being accessed, and torchaudio takes care of converting the audio files -# to tensors. If one wants to load an audio file directly instead, -# ``torchaudio.load()`` can be used. It returns a tuple containing the -# newly created tensor along with the sampling frequency of the audio file -# (16kHz for SpeechCommands). -# -# Going back to the dataset, here we create a subclass that splits it into -# standard training, validation, testing subsets. -# - -from torchaudio.datasets import SPEECHCOMMANDS -import os - - -class SubsetSC(SPEECHCOMMANDS): - def __init__(self, subset: str = None): - super().__init__("./", download=True) - - def load_list(filename): - filepath = os.path.join(self._path, filename) - with open(filepath) as fileobj: - return [os.path.normpath(os.path.join(self._path, line.strip())) for line in fileobj] - - if subset == "validation": - self._walker = load_list("validation_list.txt") - elif subset == "testing": - self._walker = load_list("testing_list.txt") - elif subset == "training": - excludes = load_list("validation_list.txt") + load_list("testing_list.txt") - excludes = set(excludes) - self._walker = [w for w in self._walker if w not in excludes] - - -# Create training and testing split of the data. We do not use validation in this tutorial. -train_set = SubsetSC("training") -test_set = SubsetSC("testing") - -waveform, sample_rate, label, speaker_id, utterance_number = train_set[0] - - -###################################################################### -# A data point in the SPEECHCOMMANDS dataset is a tuple made of a waveform -# (the audio signal), the sample rate, the utterance (label), the ID of -# the speaker, the number of the utterance. -# - -print("Shape of waveform: {}".format(waveform.size())) -print("Sample rate of waveform: {}".format(sample_rate)) - -plt.plot(waveform.t().numpy()); - - -###################################################################### -# Let’s find the list of labels available in the dataset. 
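-# The labels are sorted so that the label-to-index mapping used for training
-# below is deterministic across runs.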
-# - -labels = sorted(list(set(datapoint[2] for datapoint in train_set))) -labels - - -###################################################################### -# The 35 audio labels are commands that are said by users. The first few -# files are people saying “marvin”. -# - -waveform_first, *_ = train_set[0] -ipd.Audio(waveform_first.numpy(), rate=sample_rate) - -waveform_second, *_ = train_set[1] -ipd.Audio(waveform_second.numpy(), rate=sample_rate) - - -###################################################################### -# The last file is someone saying “visual”. -# - -waveform_last, *_ = train_set[-1] -ipd.Audio(waveform_last.numpy(), rate=sample_rate) - - -###################################################################### -# Formatting the Data -# ------------------- -# -# This is a good place to apply transformations to the data. For the -# waveform, we downsample the audio for faster processing without losing -# too much of the classification power. -# -# We don’t need to apply other transformations here. It is common for some -# datasets though to have to reduce the number of channels (say from -# stereo to mono) by either taking the mean along the channel dimension, -# or simply keeping only one of the channels. Since SpeechCommands uses a -# single channel for audio, this is not needed here. -# - -new_sample_rate = 8000 -transform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=new_sample_rate) -transformed = transform(waveform) - -ipd.Audio(transformed.numpy(), rate=new_sample_rate) - - -###################################################################### -# We are encoding each word using its index in the list of labels. -# - - -def label_to_index(word): - # Return the position of the word in labels - return torch.tensor(labels.index(word)) - - -def index_to_label(index): - # Return the word corresponding to the index in labels - # This is the inverse of label_to_index - return labels[index] - - -word_start = "yes" -index = label_to_index(word_start) -word_recovered = index_to_label(index) - -print(word_start, "-->", index, "-->", word_recovered) - - -###################################################################### -# To turn a list of data point made of audio recordings and utterances -# into two batched tensors for the model, we implement a collate function -# which is used by the PyTorch DataLoader that allows us to iterate over a -# dataset by batches. Please see `the -# documentation `__ -# for more information about working with a collate function. -# -# In the collate function, we also apply the resampling, and the text -# encoding. -# - - -def pad_sequence(batch): - # Make all tensor in a batch the same length by padding with zeros - batch = [item.t() for item in batch] - batch = torch.nn.utils.rnn.pad_sequence(batch, batch_first=True, padding_value=0.) 
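-    # pad_sequence returns (batch, time, channel); permute back to (batch, channel, time)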
- return batch.permute(0, 2, 1) - - -def collate_fn(batch): - - # A data tuple has the form: - # waveform, sample_rate, label, speaker_id, utterance_number - - tensors, targets = [], [] - - # Gather in lists, and encode labels as indices - for waveform, _, label, *_ in batch: - tensors += [waveform] - targets += [label_to_index(label)] - - # Group the list of tensors into a batched tensor - tensors = pad_sequence(tensors) - targets = torch.stack(targets) - - return tensors, targets - - -batch_size = 256 - -if device == "cuda": - num_workers = 1 - pin_memory = True -else: - num_workers = 0 - pin_memory = False - -train_loader = torch.utils.data.DataLoader( - train_set, - batch_size=batch_size, - shuffle=True, - collate_fn=collate_fn, - num_workers=num_workers, - pin_memory=pin_memory, -) -test_loader = torch.utils.data.DataLoader( - test_set, - batch_size=batch_size, - shuffle=False, - drop_last=False, - collate_fn=collate_fn, - num_workers=num_workers, - pin_memory=pin_memory, -) - - -###################################################################### -# Define the Network -# ------------------ -# -# For this tutorial we will use a convolutional neural network to process -# the raw audio data. Usually more advanced transforms are applied to the -# audio data, however CNNs can be used to accurately process the raw data. -# The specific architecture is modeled after the M5 network architecture -# described in `this paper `__. An -# important aspect of models processing raw audio data is the receptive -# field of their first layer’s filters. Our model’s first filter is length -# 80 so when processing audio sampled at 8kHz the receptive field is -# around 10ms (and at 4kHz, around 20 ms). This size is similar to speech -# processing applications that often use receptive fields ranging from -# 20ms to 40ms. -# - - -class M5(nn.Module): - def __init__(self, n_input=1, n_output=35, stride=16, n_channel=32): - super().__init__() - self.conv1 = nn.Conv1d(n_input, n_channel, kernel_size=80, stride=stride) - self.bn1 = nn.BatchNorm1d(n_channel) - self.pool1 = nn.MaxPool1d(4) - self.conv2 = nn.Conv1d(n_channel, n_channel, kernel_size=3) - self.bn2 = nn.BatchNorm1d(n_channel) - self.pool2 = nn.MaxPool1d(4) - self.conv3 = nn.Conv1d(n_channel, 2 * n_channel, kernel_size=3) - self.bn3 = nn.BatchNorm1d(2 * n_channel) - self.pool3 = nn.MaxPool1d(4) - self.conv4 = nn.Conv1d(2 * n_channel, 2 * n_channel, kernel_size=3) - self.bn4 = nn.BatchNorm1d(2 * n_channel) - self.pool4 = nn.MaxPool1d(4) - self.fc1 = nn.Linear(2 * n_channel, n_output) - - def forward(self, x): - x = self.conv1(x) - x = F.relu(self.bn1(x)) - x = self.pool1(x) - x = self.conv2(x) - x = F.relu(self.bn2(x)) - x = self.pool2(x) - x = self.conv3(x) - x = F.relu(self.bn3(x)) - x = self.pool3(x) - x = self.conv4(x) - x = F.relu(self.bn4(x)) - x = self.pool4(x) - x = F.avg_pool1d(x, x.shape[-1]) - x = x.permute(0, 2, 1) - x = self.fc1(x) - return F.log_softmax(x, dim=2) - - -model = M5(n_input=transformed.shape[0], n_output=len(labels)) -model.to(device) -print(model) - - -def count_parameters(model): - return sum(p.numel() for p in model.parameters() if p.requires_grad) - - -n = count_parameters(model) -print("Number of parameters: %s" % n) - - -###################################################################### -# We will use the same optimization technique used in the paper, an Adam -# optimizer with weight decay set to 0.0001. 
At first, we will train with -# a learning rate of 0.01, but we will use a ``scheduler`` to decrease it -# to 0.001 during training after 20 epochs. -# - -optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0001) -scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.1) # reduce the learning after 20 epochs by a factor of 10 - - -###################################################################### -# Training and Testing the Network -# -------------------------------- -# -# Now let’s define a training function that will feed our training data -# into the model and perform the backward pass and optimization steps. For -# training, the loss we will use is the negative log-likelihood. The -# network will then be tested after each epoch to see how the accuracy -# varies during the training. -# - - -def train(model, epoch, log_interval): - model.train() - for batch_idx, (data, target) in enumerate(train_loader): - - data = data.to(device) - target = target.to(device) - - # apply transform and model on whole batch directly on device - data = transform(data) - output = model(data) - - # negative log-likelihood for a tensor of size (batch x 1 x n_output) - loss = F.nll_loss(output.squeeze(), target) - - optimizer.zero_grad() - loss.backward() - optimizer.step() - - # print training stats - if batch_idx % log_interval == 0: - print(f"Train Epoch: {epoch} [{batch_idx * len(data)}/{len(train_loader.dataset)} ({100. * batch_idx / len(train_loader):.0f}%)]\tLoss: {loss.item():.6f}") - - # update progress bar - pbar.update(pbar_update) - # record loss - losses.append(loss.item()) - - -###################################################################### -# Now that we have a training function, we need to make one for testing -# the networks accuracy. We will set the model to ``eval()`` mode and then -# run inference on the test dataset. Calling ``eval()`` sets the training -# variable in all modules in the network to false. Certain layers like -# batch normalization and dropout layers behave differently during -# training so this step is crucial for getting correct results. -# - - -def number_of_correct(pred, target): - # count number of correct predictions - return pred.squeeze().eq(target).sum().item() - - -def get_likely_index(tensor): - # find most likely label index for each element in the batch - return tensor.argmax(dim=-1) - - -def test(model, epoch): - model.eval() - correct = 0 - for data, target in test_loader: - - data = data.to(device) - target = target.to(device) - - # apply transform and model on whole batch directly on device - data = transform(data) - output = model(data) - - pred = get_likely_index(output) - correct += number_of_correct(pred, target) - - # update progress bar - pbar.update(pbar_update) - - print(f"\nTest Epoch: {epoch}\tAccuracy: {correct}/{len(test_loader.dataset)} ({100. * correct / len(test_loader.dataset):.0f}%)\n") - - -###################################################################### -# Finally, we can train and test the network. We will train the network -# for ten epochs then reduce the learn rate and train for ten more epochs. -# The network will be tested after each epoch to see how the accuracy -# varies during the training. -# - -log_interval = 20 -n_epoch = 2 - -pbar_update = 1 / (len(train_loader) + len(test_loader)) -losses = [] - -# The transform needs to live on the same device as the model and the data. 
-transform = transform.to(device) -with tqdm(total=n_epoch) as pbar: - for epoch in range(1, n_epoch + 1): - train(model, epoch, log_interval) - test(model, epoch) - scheduler.step() - -# Let's plot the training loss versus the number of iteration. -# plt.plot(losses); -# plt.title("training loss"); - - -###################################################################### -# The network should be more than 65% accurate on the test set after 2 -# epochs, and 85% after 21 epochs. Let’s look at the last words in the -# train set, and see how the model did on it. -# - - -def predict(tensor): - # Use the model to predict the label of the waveform - tensor = tensor.to(device) - tensor = transform(tensor) - tensor = model(tensor.unsqueeze(0)) - tensor = get_likely_index(tensor) - tensor = index_to_label(tensor.squeeze()) - return tensor - - -waveform, sample_rate, utterance, *_ = train_set[-1] -ipd.Audio(waveform.numpy(), rate=sample_rate) - -print(f"Expected: {utterance}. Predicted: {predict(waveform)}.") - - -###################################################################### -# Let’s find an example that isn’t classified correctly, if there is one. -# - -for i, (waveform, sample_rate, utterance, *_) in enumerate(test_set): - output = predict(waveform) - if output != utterance: - ipd.Audio(waveform.numpy(), rate=sample_rate) - print(f"Data point #{i}. Expected: {utterance}. Predicted: {output}.") - break -else: - print("All examples in this dataset were correctly classified!") - print("In this case, let's just look at the last data point") - ipd.Audio(waveform.numpy(), rate=sample_rate) - print(f"Data point #{i}. Expected: {utterance}. Predicted: {output}.") - - -###################################################################### -# Feel free to try with one of your own recordings of one of the labels! -# For example, using Colab, say “Go” while executing the cell below. This -# will record one second of audio and try to classify it. 
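-#
-# If you are not running in Colab, you can get a similar effect by loading a
-# local 16 kHz recording of one of the labels instead (the file name here is
-# just a placeholder) and classifying it with the ``predict`` function defined
-# above:
-#
-# .. code-block:: python
-#
-#    waveform, sample_rate = torchaudio.load("my_recording.wav")
-#    print(f"Predicted: {predict(waveform)}.")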
-# - - -def record(seconds=1): - - from google.colab import output as colab_output - from base64 import b64decode - from io import BytesIO - from pydub import AudioSegment - - RECORD = ( - b"const sleep = time => new Promise(resolve => setTimeout(resolve, time))\n" - b"const b2text = blob => new Promise(resolve => {\n" - b" const reader = new FileReader()\n" - b" reader.onloadend = e => resolve(e.srcElement.result)\n" - b" reader.readAsDataURL(blob)\n" - b"})\n" - b"var record = time => new Promise(async resolve => {\n" - b" stream = await navigator.mediaDevices.getUserMedia({ audio: true })\n" - b" recorder = new MediaRecorder(stream)\n" - b" chunks = []\n" - b" recorder.ondataavailable = e => chunks.push(e.data)\n" - b" recorder.start()\n" - b" await sleep(time)\n" - b" recorder.onstop = async ()=>{\n" - b" blob = new Blob(chunks)\n" - b" text = await b2text(blob)\n" - b" resolve(text)\n" - b" }\n" - b" recorder.stop()\n" - b"})" - ) - RECORD = RECORD.decode("ascii") - - print(f"Recording started for {seconds} seconds.") - display(ipd.Javascript(RECORD)) - s = colab_output.eval_js("record(%d)" % (seconds * 1000)) - print("Recording ended.") - b = b64decode(s.split(",")[1]) - - fileformat = "wav" - filename = f"_audio.{fileformat}" - AudioSegment.from_file(BytesIO(b)).export(filename, format=fileformat) - return torchaudio.load(filename) - - -# Detect whether notebook runs in google colab -if "google.colab" in sys.modules: - waveform, sample_rate = record() - print(f"Predicted: {predict(waveform)}.") - ipd.Audio(waveform.numpy(), rate=sample_rate) - - -###################################################################### -# Conclusion -# ---------- -# -# In this tutorial, we used torchaudio to load a dataset and resample the -# signal. We have then defined a neural network that we trained to -# recognize a given command. There are also other data preprocessing -# methods, such as finding the mel frequency cepstral coefficients (MFCC), -# that can reduce the size of the dataset. This transform is also -# available in torchaudio as ``torchaudio.transforms.MFCC``. -# diff --git a/intermediate_source/speech_recognition_pipeline_tutorial.py b/intermediate_source/speech_recognition_pipeline_tutorial.py deleted file mode 100644 index 9ffbdcaaf0b..00000000000 --- a/intermediate_source/speech_recognition_pipeline_tutorial.py +++ /dev/null @@ -1,302 +0,0 @@ -""" -Speech Recognition with Wav2Vec2 -================================ - -**Author**: `Moto Hira `__ - -This tutorial shows how to perform speech recognition using -pre-trained models from wav2vec 2.0 -[`paper `__]. - -""" - - -###################################################################### -# Overview -# -------- -# -# The process of speech recognition looks like the following. -# -# 1. Extract the acoustic features from audio waveform -# -# 2. Estimate the class of the acoustic features frame-by-frame -# -# 3. Generate hypothesis from the sequence of the class probabilities -# -# Torchaudio provides easy access to the pre-trained weights and -# associated information, such as the expected sample rate and class -# labels. They are bundled together and available under -# :py:func:`torchaudio.pipelines` module. -# - - -###################################################################### -# Preparation -# ----------- -# -# First we import the necessary packages, and fetch data that we work on. 
-# - -# %matplotlib inline - -import os - -import IPython -import matplotlib -import matplotlib.pyplot as plt -import requests -import torch -import torchaudio - -matplotlib.rcParams["figure.figsize"] = [16.0, 4.8] - -torch.random.manual_seed(0) -device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - -print(torch.__version__) -print(torchaudio.__version__) -print(device) - -SPEECH_URL = "https://pytorch-tutorial-assets.s3.amazonaws.com/VOiCES_devkit/source-16k/train/sp0307/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav" # noqa: E501 -SPEECH_FILE = "_assets/speech.wav" - -if not os.path.exists(SPEECH_FILE): - os.makedirs("_assets", exist_ok=True) - with open(SPEECH_FILE, "wb") as file: - file.write(requests.get(SPEECH_URL).content) - - -###################################################################### -# Creating a pipeline -# ------------------- -# -# First, we will create a Wav2Vec2 model that performs the feature -# extraction and the classification. -# -# There are two types of Wav2Vec2 pre-trained weights available in -# torchaudio. The ones fine-tuned for ASR task, and the ones not -# fine-tuned. -# -# Wav2Vec2 (and HuBERT) models are trained in self-supervised manner. They -# are firstly trained with audio only for representation learning, then -# fine-tuned for a specific task with additional labels. -# -# The pre-trained weights without fine-tuning can be fine-tuned -# for other downstream tasks as well, but this tutorial does not -# cover that. -# -# We will use :py:func:`torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H` here. -# -# There are multiple models available as -# :py:mod:`torchaudio.pipelines`. Please check the documentation for -# the detail of how they are trained. -# -# The bundle object provides the interface to instantiate model and other -# information. Sampling rate and the class labels are found as follow. -# - -bundle = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H - -print("Sample Rate:", bundle.sample_rate) - -print("Labels:", bundle.get_labels()) - - -###################################################################### -# Model can be constructed as following. This process will automatically -# fetch the pre-trained weights and load it into the model. -# - -model = bundle.get_model().to(device) - -print(model.__class__) - - -###################################################################### -# Loading data -# ------------ -# -# We will use the speech data from `VOiCES -# dataset `__, which is licensed under -# Creative Commons BY 4.0. -# - -IPython.display.Audio(SPEECH_FILE) - - -###################################################################### -# To load data, we use :py:func:`torchaudio.load`. -# -# If the sampling rate is different from what the pipeline expects, then -# we can use :py:func:`torchaudio.functional.resample` for resampling. -# -# .. note:: -# -# - :py:func:`torchaudio.functional.resample` works on CUDA tensors as well. -# - When performing resampling multiple times on the same set of sample rates, -# using :py:func:`torchaudio.transforms.Resample` might improve the performace. -# - -waveform, sample_rate = torchaudio.load(SPEECH_FILE) -waveform = waveform.to(device) - -if sample_rate != bundle.sample_rate: - waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate) - - -###################################################################### -# Extracting acoustic features -# ---------------------------- -# -# The next step is to extract acoustic features from the audio. -# -# .. 
note:: -# Wav2Vec2 models fine-tuned for ASR task can perform feature -# extraction and classification with one step, but for the sake of the -# tutorial, we also show how to perform feature extraction here. -# - -with torch.inference_mode(): - features, _ = model.extract_features(waveform) - - -###################################################################### -# The returned features is a list of tensors. Each tensor is the output of -# a transformer layer. -# - -fig, ax = plt.subplots(len(features), 1, figsize=(16, 4.3 * len(features))) -for i, feats in enumerate(features): - ax[i].imshow(feats[0].cpu()) - ax[i].set_title(f"Feature from transformer layer {i+1}") - ax[i].set_xlabel("Feature dimension") - ax[i].set_ylabel("Frame (time-axis)") -plt.tight_layout() -plt.show() - - -###################################################################### -# Feature classification -# ---------------------- -# -# Once the acoustic features are extracted, the next step is to classify -# them into a set of categories. -# -# Wav2Vec2 model provides method to perform the feature extraction and -# classification in one step. -# - -with torch.inference_mode(): - emission, _ = model(waveform) - - -###################################################################### -# The output is in the form of logits. It is not in the form of -# probability. -# -# Let’s visualize this. -# - -plt.imshow(emission[0].cpu().T) -plt.title("Classification result") -plt.xlabel("Frame (time-axis)") -plt.ylabel("Class") -plt.show() -print("Class labels:", bundle.get_labels()) - - -###################################################################### -# We can see that there are strong indications to certain labels across -# the time line. -# - - -###################################################################### -# Generating transcripts -# ---------------------- -# -# From the sequence of label probabilities, now we want to generate -# transcripts. The process to generate hypotheses is often called -# “decoding”. -# -# Decoding is more elaborate than simple classification because -# decoding at certain time step can be affected by surrounding -# observations. -# -# For example, take a word like ``night`` and ``knight``. Even if their -# prior probability distribution are differnt (in typical conversations, -# ``night`` would occur way more often than ``knight``), to accurately -# generate transcripts with ``knight``, such as ``a knight with a sword``, -# the decoding process has to postpone the final decision until it sees -# enough context. -# -# There are many decoding techniques proposed, and they require external -# resources, such as word dictionary and language models. -# -# In this tutorial, for the sake of simplicity, we will perform greedy -# decoding which does not depend on such external components, and simply -# pick up the best hypothesis at each time step. Therefore, the context -# information are not used, and only one transcript can be generated. -# -# We start by defining greedy decoding algorithm. -# - - -class GreedyCTCDecoder(torch.nn.Module): - def __init__(self, labels, blank=0): - super().__init__() - self.labels = labels - self.blank = blank - - def forward(self, emission: torch.Tensor) -> str: - """Given a sequence emission over labels, get the best path string - Args: - emission (Tensor): Logit tensors. Shape `[num_seq, num_label]`. 
- - Returns: - str: The resulting transcript - """ - indices = torch.argmax(emission, dim=-1) # [num_seq,] - indices = torch.unique_consecutive(indices, dim=-1) - indices = [i for i in indices if i != self.blank] - return "".join([self.labels[i] for i in indices]) - - -###################################################################### -# Now create the decoder object and decode the transcript. -# - -decoder = GreedyCTCDecoder(labels=bundle.get_labels()) -transcript = decoder(emission[0]) - - -###################################################################### -# Let’s check the result and listen again to the audio. -# - -print(transcript) -IPython.display.Audio(SPEECH_FILE) - - -###################################################################### -# The ASR model is fine-tuned using a loss function called Connectionist Temporal Classification (CTC). -# The detail of CTC loss is explained -# `here `__. In CTC a blank token (ϵ) is a -# special token which represents a repetition of the previous symbol. In -# decoding, these are simply ignored. -# - - -###################################################################### -# Conclusion -# ---------- -# -# In this tutorial, we looked at how to use :py:mod:`torchaudio.pipelines` to -# perform acoustic feature extraction and speech recognition. Constructing -# a model and getting the emission is as short as two lines. -# -# :: -# -# model = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H.get_model() -# emission = model(waveforms, ...) -# diff --git a/intermediate_source/speech_recognition_pipeline_tutorial.rst b/intermediate_source/speech_recognition_pipeline_tutorial.rst new file mode 100644 index 00000000000..4ec497b3bd8 --- /dev/null +++ b/intermediate_source/speech_recognition_pipeline_tutorial.rst @@ -0,0 +1,10 @@ +Speech Recognition with Wav2Vec2 +================================ + +This tutorial has been moved to https://pytorch.org/audio/stable/tutorials/speech_recognition_pipeline_tutorial.html + +It will redirect in 3 seconds. + +.. raw:: html + + diff --git a/intermediate_source/text_to_speech_with_torchaudio.py b/intermediate_source/text_to_speech_with_torchaudio.py deleted file mode 100644 index ade495af6a8..00000000000 --- a/intermediate_source/text_to_speech_with_torchaudio.py +++ /dev/null @@ -1,326 +0,0 @@ -""" -Text-to-speech with torchaudio -============================== - -**Author**: `Yao-Yuan Yang `__, `Moto -Hira `__ - -""" - -###################################################################### -# Overview -# -------- -# -# This tutorial shows how to build text-to-speech pipeline, using the -# pretrained Tacotron2 in torchaudio. -# -# The text-to-speech pipeline goes as follows: 1. Text preprocessing -# -# First, the input text is encoded into a list of symbols. In this -# tutorial, we will use English characters and phonemes as the symbols. -# -# 2. Spectrogram generation -# -# From the encoded text, a spectrogram is generated. We use ``Tacotron2`` -# model for this. -# -# 3. Time-domain conversion -# -# The last step is converting the spectrogram into the waveform. The -# process to generate speech from spectrogram is also called Vocoder. In -# this tutorial, three different vocoders are used, -# ```WaveRNN`` `__, -# ```Griffin-Lim`` `__, -# and -# ```Nvidia's WaveGlow`` `__. -# -# The following figure illustrates the whole process. -# -# .. 
image:: https://download.pytorch.org/torchaudio/tutorial-assets/tacotron2_tts_pipeline.png -# - - -###################################################################### -# Preparation -# ----------- -# -# First, we install the necessary dependencies. In addition to -# ``torchaudio``, ``DeepPhonemizer`` is required to perform phoneme-based -# encoding. -# - -# %% -# .. code-block:: bash -# -# %%bash -# pip3 install deep_phonemizer - -import torch -import torchaudio -import matplotlib.pyplot as plt - -import IPython - -print(torch.__version__) -print(torchaudio.__version__) - -torch.random.manual_seed(0) -device = "cuda" if torch.cuda.is_available() else "cpu" - - - -###################################################################### -# Text Processing -# --------------- -# - - -###################################################################### -# Character-based encoding -# ~~~~~~~~~~~~~~~~~~~~~~~~ -# -# In this section, we will go through how the character-based encoding -# works. -# -# Since the pre-trained Tacotron2 model expects specific set of symbol -# tables, the same functionalities available in ``torchaudio``. This -# section is more for the explanation of the basis of encoding. -# -# Firstly, we define the set of symbols. For example, we can use -# ``'_-!\'(),.:;? abcdefghijklmnopqrstuvwxyz'``. Then, we will map the -# each character of the input text into the index of the corresponding -# symbol in the table. -# -# The following is an example of such processing. In the example, symbols -# that are not in the table are ignored. -# - -symbols = '_-!\'(),.:;? abcdefghijklmnopqrstuvwxyz' -look_up = {s: i for i, s in enumerate(symbols)} -symbols = set(symbols) - -def text_to_sequence(text): - text = text.lower() - return [look_up[s] for s in text if s in symbols] - -text = "Hello world! Text to speech!" -print(text_to_sequence(text)) - - -###################################################################### -# As mentioned in the above, the symbol table and indices must match -# what the pretrained Tacotron2 model expects. ``torchaudio`` provides the -# transform along with the pretrained model. For example, you can -# instantiate and use such transform as follow. -# - -processor = torchaudio.pipelines.TACOTRON2_WAVERNN_CHAR_LJSPEECH.get_text_processor() - -text = "Hello world! Text to speech!" -processed, lengths = processor(text) - -print(processed) -print(lengths) - - -###################################################################### -# The ``processor`` object takes either a text or list of texts as inputs. -# When a list of texts are provided, the returned ``lengths`` variable -# represents the valid length of each processed tokens in the output -# batch. -# -# The intermediate representation can be retrieved as follow. -# - -print([processor.tokens[i] for i in processed[0, :lengths[0]]]) - - -###################################################################### -# Phoneme-based encoding -# ~~~~~~~~~~~~~~~~~~~~~~ -# -# Phoneme-based encoding is similar to character-based encoding, but it -# uses a symbol table based on phonemes and a G2P (Grapheme-to-Phoneme) -# model. -# -# The detail of the G2P model is out of scope of this tutorial, we will -# just look at what the conversion looks like. -# -# Similar to the case of character-based encoding, the encoding process is -# expected to match what a pretrained Tacotron2 model is trained on. -# ``torchaudio`` has an interface to create the process. -# -# The following code illustrates how to make and use the process. 
Behind
-# the scenes, a G2P model is created using the ``DeepPhonemizer`` package, and
-# the pretrained weights published by the author of ``DeepPhonemizer`` are
-# fetched.
-#
-
-bundle = torchaudio.pipelines.TACOTRON2_WAVERNN_PHONE_LJSPEECH
-
-processor = bundle.get_text_processor()
-
-text = "Hello world! Text to speech!"
-with torch.inference_mode():
-    processed, lengths = processor(text)
-
-print(processed)
-print(lengths)
-
-
-######################################################################
-# Notice that the encoded values are different from the example of
-# character-based encoding.
-#
-# The intermediate representation looks like the following.
-#
-
-print([processor.tokens[i] for i in processed[0, :lengths[0]]])
-
-
-######################################################################
-# Spectrogram Generation
-# ----------------------
-#
-# ``Tacotron2`` is the model we use to generate a spectrogram from the
-# encoded text. For the details of the model, please refer to `the
-# paper `__.
-#
-# It is easy to instantiate a Tacotron2 model with pretrained weights;
-# however, note that the input to Tacotron2 models must be processed by the
-# matching text processor.
-#
-# ``torchaudio`` bundles the matching models and processors together so
-# that it is easy to create the pipeline.
-#
-# (For the available bundles and their usage, please refer to `the
-# documentation `__.)
-#
-
-bundle = torchaudio.pipelines.TACOTRON2_WAVERNN_PHONE_LJSPEECH
-processor = bundle.get_text_processor()
-tacotron2 = bundle.get_tacotron2().to(device)
-
-text = "Hello world! Text to speech!"
-
-with torch.inference_mode():
-    processed, lengths = processor(text)
-    processed = processed.to(device)
-    lengths = lengths.to(device)
-    spec, _, _ = tacotron2.infer(processed, lengths)
-
-
-plt.imshow(spec[0].cpu().detach())
-
-
-######################################################################
-# Note that the ``Tacotron2.infer`` method performs multinomial sampling,
-# and therefore the process of generating the spectrogram incurs randomness.
-#
-
-for _ in range(3):
-    with torch.inference_mode():
-        spec, spec_lengths, _ = tacotron2.infer(processed, lengths)
-    plt.imshow(spec[0].cpu().detach())
-    plt.show()
-
-
-######################################################################
-# Waveform Generation
-# -------------------
-#
-# Once the spectrogram is generated, the last step is to recover the
-# waveform from the spectrogram.
-#
-# ``torchaudio`` provides vocoders based on ``GriffinLim`` and
-# ``WaveRNN``.
-#
-
-
-######################################################################
-# WaveRNN
-# ~~~~~~~
-#
-# Continuing from the previous section, we can instantiate the matching
-# WaveRNN model from the same bundle.
-#
-
-bundle = torchaudio.pipelines.TACOTRON2_WAVERNN_PHONE_LJSPEECH
-
-processor = bundle.get_text_processor()
-tacotron2 = bundle.get_tacotron2().to(device)
-vocoder = bundle.get_vocoder().to(device)
-
-text = "Hello world! Text to speech!"
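-
-# Run the full chain under inference mode:
-# text -> tokens (processor) -> spectrogram (tacotron2.infer) -> waveform (vocoder).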
- -with torch.inference_mode(): - processed, lengths = processor(text) - processed = processed.to(device) - lengths = lengths.to(device) - spec, spec_lengths, _ = tacotron2.infer(processed, lengths) - waveforms, lengths = vocoder(spec, spec_lengths) - -torchaudio.save("output_wavernn.wav", waveforms[0:1].cpu(), sample_rate=vocoder.sample_rate) -IPython.display.display(IPython.display.Audio("output_wavernn.wav")) - - -###################################################################### -# Griffin-Lim -# ~~~~~~~~~~~ -# -# Using the Griffin-Lim vocoder is same as WaveRNN. You can instantiate -# the vocode object with ``get_vocoder`` method and pass the spectrogram. -# - -bundle = torchaudio.pipelines.TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH - -processor = bundle.get_text_processor() -tacotron2 = bundle.get_tacotron2().to(device) -vocoder = bundle.get_vocoder().to(device) - -with torch.inference_mode(): - processed, lengths = processor(text) - processed = processed.to(device) - lengths = lengths.to(device) - spec, spec_lengths, _ = tacotron2.infer(processed, lengths) -waveforms, lengths = vocoder(spec, spec_lengths) - -torchaudio.save("output_griffinlim.wav", waveforms[0:1].cpu(), sample_rate=vocoder.sample_rate) -IPython.display.display(IPython.display.Audio("output_griffinlim.wav")) - - -###################################################################### -# Waveglow -# ~~~~~~~~ -# -# Waveglow is a vocoder published by Nvidia. The pretrained weights are -# published on Torch Hub. One can instantiate the model using ``torch.hub`` -# module. -# -if torch.cuda.is_available(): - waveglow = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_waveglow', model_math='fp32') -else: - # Workaround to load model mapped on GPU - # https://stackoverflow.com/a/61840832 - waveglow = torch.hub.load( - "NVIDIA/DeepLearningExamples:torchhub", - "nvidia_waveglow", - model_math="fp32", - pretrained=False, - ) - checkpoint = torch.hub.load_state_dict_from_url( - "https://api.ngc.nvidia.com/v2/models/nvidia/waveglow_ckpt_fp32/versions/19.09.0/files/nvidia_waveglowpyt_fp32_20190427", - progress=False, - map_location=device, - ) - state_dict = {key.replace("module.", ""): value for key, value in checkpoint["state_dict"].items()} - -waveglow = waveglow.remove_weightnorm(waveglow) -waveglow = waveglow.to(device) -waveglow.eval() - -with torch.no_grad(): - waveforms = waveglow.infer(spec) - -torchaudio.save("output_waveglow.wav", waveforms[0:1].cpu(), sample_rate=22050) -IPython.display.display(IPython.display.Audio("output_waveglow.wav")) diff --git a/intermediate_source/text_to_speech_with_torchaudio.rst b/intermediate_source/text_to_speech_with_torchaudio.rst new file mode 100644 index 00000000000..bbb6d7f272d --- /dev/null +++ b/intermediate_source/text_to_speech_with_torchaudio.rst @@ -0,0 +1,10 @@ +Text-to-speech with Tacotron2 +============================= + +This tutorial has been moved to https://pytorch.org/audio/stable/tutorials/tacotron2_pipeline_tutorial.html + +It will redirect in 3 seconds. + +.. raw:: html + +