diff --git a/beginner_source/audio_data_augmentation_tutorial.py b/beginner_source/audio_data_augmentation_tutorial.py deleted file mode 100644 index 933aefc4ef8..00000000000 --- a/beginner_source/audio_data_augmentation_tutorial.py +++ /dev/null @@ -1,443 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Audio Data Augmentation -======================= - -``torchaudio`` provides a variety of ways to augment audio data. - -In this tutorial, we look into a way to apply effects, filters, -RIR (room impulse response) and codecs. - -At the end, we synthesize noisy speech over phone from clean speech. -""" - -import torch -import torchaudio -import torchaudio.functional as F - -print(torch.__version__) -print(torchaudio.__version__) - -###################################################################### -# Preparation -# ----------- -# -# First, we import the modules and download the audio assets we use in this tutorial. -# - -import math - -from IPython.display import Audio -import matplotlib.pyplot as plt - -from torchaudio.utils import download_asset - -SAMPLE_WAV = download_asset("tutorial-assets/steam-train-whistle-daniel_simon.wav") -SAMPLE_RIR = download_asset("tutorial-assets/Lab41-SRI-VOiCES-rm1-impulse-mc01-stu-clo-8000hz.wav") -SAMPLE_SPEECH = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042-8000hz.wav") -SAMPLE_NOISE = download_asset("tutorial-assets/Lab41-SRI-VOiCES-rm1-babb-mc01-stu-clo-8000hz.wav") - - -###################################################################### -# Applying effects and filtering -# ------------------------------ -# -# :py:func:`torchaudio.sox_effects` allows for directly applying filters similar to -# those available in ``sox`` to Tensor objects and file object audio sources. -# -# There are two functions for this: -# -# - :py:func:`torchaudio.sox_effects.apply_effects_tensor` for applying effects -# to Tensor. -# - :py:func:`torchaudio.sox_effects.apply_effects_file` for applying effects to -# other audio sources. -# -# Both functions accept effect definitions in the form -# ``List[List[str]]``. -# This is mostly consistent with how ``sox`` command works, but one caveat is -# that ``sox`` adds some effects automatically, whereas ``torchaudio``’s -# implementation does not. -# -# For the list of available effects, please refer to `the sox -# documentation `__. -# -# **Tip** If you need to load and resample your audio data on the fly, -# then you can use :py:func:`torchaudio.sox_effects.apply_effects_file` -# with effect ``"rate"``. -# -# **Note** :py:func:`torchaudio.sox_effects.apply_effects_file` accepts a -# file-like object or path-like object. -# Similar to :py:func:`torchaudio.load`, when the audio format cannot be -# inferred from either the file extension or header, you can provide -# argument ``format`` to specify the format of the audio source. -# -# **Note** This process is not differentiable. -# - -# Load the data -waveform1, sample_rate1 = torchaudio.load(SAMPLE_WAV) - -# Define effects -effects = [ - ["lowpass", "-1", "300"], # apply single-pole lowpass filter - ["speed", "0.8"], # reduce the speed - # This only changes sample rate, so it is necessary to - # add `rate` effect with original sample rate after this. 
- ["rate", f"{sample_rate1}"], - ["reverb", "-w"], # Reverbration gives some dramatic feeling -] - -# Apply effects -waveform2, sample_rate2 = torchaudio.sox_effects.apply_effects_tensor(waveform1, sample_rate1, effects) - -print(waveform1.shape, sample_rate1) -print(waveform2.shape, sample_rate2) - -###################################################################### -# Note that the number of frames and number of channels are different from -# those of the original after the effects are applied. Let’s listen to the -# audio. -# - -def plot_waveform(waveform, sample_rate, title="Waveform", xlim=None): - waveform = waveform.numpy() - - num_channels, num_frames = waveform.shape - time_axis = torch.arange(0, num_frames) / sample_rate - - figure, axes = plt.subplots(num_channels, 1) - if num_channels == 1: - axes = [axes] - for c in range(num_channels): - axes[c].plot(time_axis, waveform[c], linewidth=1) - axes[c].grid(True) - if num_channels > 1: - axes[c].set_ylabel(f"Channel {c+1}") - if xlim: - axes[c].set_xlim(xlim) - figure.suptitle(title) - plt.show(block=False) - -###################################################################### -# - -def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None): - waveform = waveform.numpy() - - num_channels, _ = waveform.shape - - figure, axes = plt.subplots(num_channels, 1) - if num_channels == 1: - axes = [axes] - for c in range(num_channels): - axes[c].specgram(waveform[c], Fs=sample_rate) - if num_channels > 1: - axes[c].set_ylabel(f"Channel {c+1}") - if xlim: - axes[c].set_xlim(xlim) - figure.suptitle(title) - plt.show(block=False) - -###################################################################### -# Original: -# ~~~~~~~~~ -# - -plot_waveform(waveform1, sample_rate1, title="Original", xlim=(-0.1, 3.2)) -plot_specgram(waveform1, sample_rate1, title="Original", xlim=(0, 3.04)) -Audio(waveform1, rate=sample_rate1) - -###################################################################### -# Effects applied: -# ~~~~~~~~~~~~~~~~ -# - -plot_waveform(waveform2, sample_rate2, title="Effects Applied", xlim=(-0.1, 3.2)) -plot_specgram(waveform2, sample_rate2, title="Effects Applied", xlim=(0, 3.04)) -Audio(waveform2, rate=sample_rate2) - -###################################################################### -# Doesn’t it sound more dramatic? -# - -###################################################################### -# Simulating room reverberation -# ----------------------------- -# -# `Convolution -# reverb `__ is a -# technique that's used to make clean audio sound as though it has been -# produced in a different environment. -# -# Using Room Impulse Response (RIR), for instance, we can make clean speech -# sound as though it has been uttered in a conference room. -# -# For this process, we need RIR data. The following data are from the VOiCES -# dataset, but you can record your own — just turn on your microphone -# and clap your hands. -# - -rir_raw, sample_rate = torchaudio.load(SAMPLE_RIR) -plot_waveform(rir_raw, sample_rate, title="Room Impulse Response (raw)") -plot_specgram(rir_raw, sample_rate, title="Room Impulse Response (raw)") -Audio(rir_raw, rate=sample_rate) - -###################################################################### -# First, we need to clean up the RIR. We extract the main impulse, normalize -# the signal power, then flip along the time axis. 
-# - -rir = rir_raw[:, int(sample_rate * 1.01) : int(sample_rate * 1.3)] -rir = rir / torch.norm(rir, p=2) -RIR = torch.flip(rir, [1]) - -plot_waveform(rir, sample_rate, title="Room Impulse Response") - -###################################################################### -# Then, we convolve the speech signal with the RIR filter. -# - -speech, _ = torchaudio.load(SAMPLE_SPEECH) - -speech_ = torch.nn.functional.pad(speech, (RIR.shape[1] - 1, 0)) -augmented = torch.nn.functional.conv1d(speech_[None, ...], RIR[None, ...])[0] - -###################################################################### -# Original: -# ~~~~~~~~~ -# - -plot_waveform(speech, sample_rate, title="Original") -plot_specgram(speech, sample_rate, title="Original") -Audio(speech, rate=sample_rate) - -###################################################################### -# RIR applied: -# ~~~~~~~~~~~~ -# - -plot_waveform(augmented, sample_rate, title="RIR Applied") -plot_specgram(augmented, sample_rate, title="RIR Applied") -Audio(augmented, rate=sample_rate) - - -###################################################################### -# Adding background noise -# ----------------------- -# -# To add background noise to audio data, you can simply add a noise Tensor to -# the Tensor representing the audio data. A common method to adjust the -# intensity of noise is changing the Signal-to-Noise Ratio (SNR). -# [`wikipedia `__] -# -# $$ \\mathrm{SNR} = \\frac{P_{signal}}{P_{noise}} $$ -# -# $$ \\mathrm{SNR_{dB}} = 10 \\log _{{10}} \\mathrm {SNR} $$ -# - -speech, _ = torchaudio.load(SAMPLE_SPEECH) -noise, _ = torchaudio.load(SAMPLE_NOISE) -noise = noise[:, : speech.shape[1]] - -speech_rms = speech.norm(p=2) -noise_rms = noise.norm(p=2) - -snr_dbs = [20, 10, 3] -noisy_speeches = [] -for snr_db in snr_dbs: - snr = 10 ** (snr_db / 20) - scale = snr * noise_rms / speech_rms - noisy_speeches.append((scale * speech + noise) / 2) - -###################################################################### -# Background noise: -# ~~~~~~~~~~~~~~~~~ -# - -plot_waveform(noise, sample_rate, title="Background noise") -plot_specgram(noise, sample_rate, title="Background noise") -Audio(noise, rate=sample_rate) - -###################################################################### -# SNR 20 dB: -# ~~~~~~~~~~ -# - -snr_db, noisy_speech = snr_dbs[0], noisy_speeches[0] -plot_waveform(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]") -plot_specgram(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]") -Audio(noisy_speech, rate=sample_rate) - -###################################################################### -# SNR 10 dB: -# ~~~~~~~~~~ -# - -snr_db, noisy_speech = snr_dbs[1], noisy_speeches[1] -plot_waveform(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]") -plot_specgram(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]") -Audio(noisy_speech, rate=sample_rate) - -###################################################################### -# SNR 3 dB: -# ~~~~~~~~~ -# - -snr_db, noisy_speech = snr_dbs[2], noisy_speeches[2] -plot_waveform(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]") -plot_specgram(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]") -Audio(noisy_speech, rate=sample_rate) - - -###################################################################### -# Applying codec to Tensor object -# ------------------------------- -# -# :py:func:`torchaudio.functional.apply_codec` can apply codecs to -# a Tensor object. -# -# **Note** This process is not differentiable. 
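######################################################################
# Before applying codecs, a quick sanity check of the noise mixing above
# (an illustrative sketch, not part of the original tutorial): the scaling
# factor was chosen so that the ratio of signal power to noise power matches
# the requested SNR, which can be confirmed in the dB domain.
#

for snr_db in snr_dbs:
    scale = 10 ** (snr_db / 20) * noise_rms / speech_rms
    achieved_db = 20 * torch.log10((scale * speech).norm(p=2) / noise.norm(p=2))
    print(f"Requested SNR: {snr_db} dB, achieved: {achieved_db.item():.1f} dB")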
-# - - -waveform, sample_rate = torchaudio.load(SAMPLE_SPEECH) - -configs = [ - {"format": "wav", "encoding": "ULAW", "bits_per_sample": 8}, - {"format": "gsm"}, - {"format": "vorbis", "compression": -1}, -] -waveforms = [] -for param in configs: - augmented = F.apply_codec(waveform, sample_rate, **param) - waveforms.append(augmented) - -###################################################################### -# Original: -# ~~~~~~~~~ -# - -plot_waveform(waveform, sample_rate, title="Original") -plot_specgram(waveform, sample_rate, title="Original") -Audio(waveform, rate=sample_rate) - -###################################################################### -# 8 bit mu-law: -# ~~~~~~~~~~~~~ -# - -plot_waveform(waveforms[0], sample_rate, title="8 bit mu-law") -plot_specgram(waveforms[0], sample_rate, title="8 bit mu-law") -Audio(waveforms[0], rate=sample_rate) - -###################################################################### -# GSM-FR: -# ~~~~~~~ -# - -plot_waveform(waveforms[1], sample_rate, title="GSM-FR") -plot_specgram(waveforms[1], sample_rate, title="GSM-FR") -Audio(waveforms[1], rate=sample_rate) - -###################################################################### -# Vorbis: -# ~~~~~~~ -# - -plot_waveform(waveforms[2], sample_rate, title="Vorbis") -plot_specgram(waveforms[2], sample_rate, title="Vorbis") -Audio(waveforms[2], rate=sample_rate) - -###################################################################### -# Simulating a phone recoding -# --------------------------- -# -# Combining the previous techniques, we can simulate audio that sounds -# like a person talking over a phone in a echoey room with people talking -# in the background. -# - -sample_rate = 16000 -original_speech, sample_rate = torchaudio.load(SAMPLE_SPEECH) - -plot_specgram(original_speech, sample_rate, title="Original") - -# Apply RIR -speech_ = torch.nn.functional.pad(original_speech, (RIR.shape[1] - 1, 0)) -rir_applied = torch.nn.functional.conv1d(speech_[None, ...], RIR[None, ...])[0] - -plot_specgram(rir_applied, sample_rate, title="RIR Applied") - -# Add background noise -# Because the noise is recorded in the actual environment, we consider that -# the noise contains the acoustic feature of the environment. Therefore, we add -# the noise after RIR application. 
-noise, _ = torchaudio.load(SAMPLE_NOISE) -noise = noise[:, : rir_applied.shape[1]] - -snr_db = 8 -scale = (10 ** (snr_db / 20)) * noise.norm(p=2) / rir_applied.norm(p=2) -bg_added = (scale * rir_applied + noise) / 2 - -plot_specgram(bg_added, sample_rate, title="BG noise added") - -# Apply filtering and change sample rate -filtered, sample_rate2 = torchaudio.sox_effects.apply_effects_tensor( - bg_added, - sample_rate, - effects=[ - ["lowpass", "4000"], - [ - "compand", - "0.02,0.05", - "-60,-60,-30,-10,-20,-8,-5,-8,-2,-8", - "-8", - "-7", - "0.05", - ], - ["rate", "8000"], - ], -) - -plot_specgram(filtered, sample_rate2, title="Filtered") - -# Apply telephony codec -codec_applied = F.apply_codec(filtered, sample_rate2, format="gsm") - -plot_specgram(codec_applied, sample_rate2, title="GSM Codec Applied") - - -###################################################################### -# Original speech: -# ~~~~~~~~~~~~~~~~ -# - -Audio(original_speech, rate=sample_rate) - -###################################################################### -# RIR applied: -# ~~~~~~~~~~~~ -# - -Audio(rir_applied, rate=sample_rate) - -###################################################################### -# Background noise added: -# ~~~~~~~~~~~~~~~~~~~~~~~ -# - -Audio(bg_added, rate=sample_rate) - -###################################################################### -# Filtered: -# ~~~~~~~~~ -# - -Audio(filtered, rate=sample_rate2) - -###################################################################### -# Codec applied: -# ~~~~~~~~~~~~~~ -# - -Audio(codec_applied, rate=sample_rate2) diff --git a/beginner_source/audio_data_augmentation_tutorial.rst b/beginner_source/audio_data_augmentation_tutorial.rst new file mode 100644 index 00000000000..55ba024a590 --- /dev/null +++ b/beginner_source/audio_data_augmentation_tutorial.rst @@ -0,0 +1,10 @@ +Audio Data Augmentation +======================= + +This tutorial has been moved to https://pytorch.org/audio/stable/tutorials/audio_data_augmentation_tutorial.html + +It will redirect in 3 seconds. + +.. raw:: html + + diff --git a/beginner_source/audio_datasets_tutorial.py b/beginner_source/audio_datasets_tutorial.py deleted file mode 100644 index f08ed99e0db..00000000000 --- a/beginner_source/audio_datasets_tutorial.py +++ /dev/null @@ -1,87 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Audio Datasets -============== - -``torchaudio`` provides easy access to common, publicly accessible -datasets. Please refer to the official documentation for the list of -available datasets. -""" - -# When running this tutorial in Google Colab, install the required packages -# with the following. -# !pip install torchaudio - -import torch -import torchaudio - -print(torch.__version__) -print(torchaudio.__version__) - -###################################################################### -# Preparing data and utility functions (skip this section) -# -------------------------------------------------------- -# - -# @title Prepare data and utility functions. {display-mode: "form"} -# @markdown -# @markdown You do not need to look into this cell. -# @markdown Just execute once and you are good to go. - -# ------------------------------------------------------------------------------- -# Preparation of data and helper functions. 
-# ------------------------------------------------------------------------------- -import multiprocessing -import os - -import matplotlib.pyplot as plt -from IPython.display import Audio, display - - -_SAMPLE_DIR = "_assets" -YESNO_DATASET_PATH = os.path.join(_SAMPLE_DIR, "yes_no") -os.makedirs(YESNO_DATASET_PATH, exist_ok=True) - - -def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None): - waveform = waveform.numpy() - - num_channels, num_frames = waveform.shape - - figure, axes = plt.subplots(num_channels, 1) - if num_channels == 1: - axes = [axes] - for c in range(num_channels): - axes[c].specgram(waveform[c], Fs=sample_rate) - if num_channels > 1: - axes[c].set_ylabel(f"Channel {c+1}") - if xlim: - axes[c].set_xlim(xlim) - figure.suptitle(title) - plt.show(block=False) - - -def play_audio(waveform, sample_rate): - waveform = waveform.numpy() - - num_channels, num_frames = waveform.shape - if num_channels == 1: - display(Audio(waveform[0], rate=sample_rate)) - elif num_channels == 2: - display(Audio((waveform[0], waveform[1]), rate=sample_rate)) - else: - raise ValueError("Waveform with more than 2 channels are not supported.") - - -###################################################################### -# Here, we show how to use the -# :py:func:`torchaudio.datasets.YESNO` dataset. -# - - -dataset = torchaudio.datasets.YESNO(YESNO_DATASET_PATH, download=True) - -for i in [1, 3, 5]: - waveform, sample_rate, label = dataset[i] - plot_specgram(waveform, sample_rate, title=f"Sample {i}: {label}") - play_audio(waveform, sample_rate) diff --git a/beginner_source/audio_datasets_tutorial.rst b/beginner_source/audio_datasets_tutorial.rst new file mode 100644 index 00000000000..0bcac44a0a8 --- /dev/null +++ b/beginner_source/audio_datasets_tutorial.rst @@ -0,0 +1,10 @@ +Audio Datasets +============== + +This tutorial has been moved to https://pytorch.org/tutorials/beginner/audio_datasets_tutorial.html + +It will redirect in 3 seconds. + +.. raw:: html + + diff --git a/beginner_source/audio_feature_augmentation_tutorial.py b/beginner_source/audio_feature_augmentation_tutorial.py deleted file mode 100644 index 3961dafbc74..00000000000 --- a/beginner_source/audio_feature_augmentation_tutorial.py +++ /dev/null @@ -1,168 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Audio Feature Augmentation -========================== -""" - -# When running this tutorial in Google Colab, install the required packages -# with the following. -# !pip install torchaudio librosa - -import torch -import torchaudio -import torchaudio.transforms as T - -print(torch.__version__) -print(torchaudio.__version__) - -###################################################################### -# Preparing data and utility functions (skip this section) -# -------------------------------------------------------- -# - -# @title Prepare data and utility functions. {display-mode: "form"} -# @markdown -# @markdown You do not need to look into this cell. -# @markdown Just execute once and you are good to go. -# @markdown -# @markdown In this tutorial, we will use a speech data from [VOiCES dataset](https://iqtlabs.github.io/voices/), -# @markdown which is licensed under Creative Commos BY 4.0. - -# ------------------------------------------------------------------------------- -# Preparation of data and helper functions. 
-# ------------------------------------------------------------------------------- - -import os - -import librosa -import matplotlib.pyplot as plt -import requests - - -_SAMPLE_DIR = "_assets" - -SAMPLE_WAV_SPEECH_URL = "https://pytorch-tutorial-assets.s3.amazonaws.com/VOiCES_devkit/source-16k/train/sp0307/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav" # noqa: E501 -SAMPLE_WAV_SPEECH_PATH = os.path.join(_SAMPLE_DIR, "speech.wav") - -os.makedirs(_SAMPLE_DIR, exist_ok=True) - - -def _fetch_data(): - uri = [ - (SAMPLE_WAV_SPEECH_URL, SAMPLE_WAV_SPEECH_PATH), - ] - for url, path in uri: - with open(path, "wb") as file_: - file_.write(requests.get(url).content) - - -_fetch_data() - - -def _get_sample(path, resample=None): - effects = [["remix", "1"]] - if resample: - effects.extend( - [ - ["lowpass", f"{resample // 2}"], - ["rate", f"{resample}"], - ] - ) - return torchaudio.sox_effects.apply_effects_file(path, effects=effects) - - -def get_speech_sample(*, resample=None): - return _get_sample(SAMPLE_WAV_SPEECH_PATH, resample=resample) - - -def get_spectrogram( - n_fft=400, - win_len=None, - hop_len=None, - power=2.0, -): - waveform, _ = get_speech_sample() - spectrogram = T.Spectrogram( - n_fft=n_fft, - win_length=win_len, - hop_length=hop_len, - center=True, - pad_mode="reflect", - power=power, - ) - return spectrogram(waveform) - - -def plot_spectrogram(spec, title=None, ylabel="freq_bin", aspect="auto", xmax=None): - fig, axs = plt.subplots(1, 1) - axs.set_title(title or "Spectrogram (db)") - axs.set_ylabel(ylabel) - axs.set_xlabel("frame") - im = axs.imshow(librosa.power_to_db(spec), origin="lower", aspect=aspect) - if xmax: - axs.set_xlim((0, xmax)) - fig.colorbar(im, ax=axs) - plt.show(block=False) - - -###################################################################### -# SpecAugment -# ----------- -# -# `SpecAugment `__ -# is a popular spectrogram augmentation technique. -# -# ``torchaudio`` implements :py:func:`torchaudio.transforms.TimeStretch`, -# :py:func:`torchaudio.transforms.TimeMasking` and -# :py:func:`torchaudio.transforms.FrequencyMasking`. 
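######################################################################
# In practice the masking transforms are often chained into a single
# augmentation pipeline. The following sketch (an illustrative addition with
# arbitrary masking parameters) combines only the two masking transforms;
# ``TimeStretch`` is left out because it operates on a complex-valued
# spectrogram and takes a rate argument at call time. The individual
# transforms are demonstrated one by one in the sections below.
#

specaugment = torch.nn.Sequential(
    T.FrequencyMasking(freq_mask_param=30),
    T.TimeMasking(time_mask_param=40),
)
masked_spec = specaugment(get_spectrogram())
plot_spectrogram(masked_spec[0], title="Frequency and time masking combined")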
-# - -###################################################################### -# TimeStretch -# ----------- -# - - -spec = get_spectrogram(power=None) -stretch = T.TimeStretch() - -rate = 1.2 -spec_ = stretch(spec, rate) -plot_spectrogram(torch.abs(spec_[0]), title=f"Stretched x{rate}", aspect="equal", xmax=304) - -plot_spectrogram(torch.abs(spec[0]), title="Original", aspect="equal", xmax=304) - -rate = 0.9 -spec_ = stretch(spec, rate) -plot_spectrogram(torch.abs(spec_[0]), title=f"Stretched x{rate}", aspect="equal", xmax=304) - -###################################################################### -# TimeMasking -# ----------- -# - -torch.random.manual_seed(4) - -spec = get_spectrogram() -plot_spectrogram(spec[0], title="Original") - -masking = T.TimeMasking(time_mask_param=80) -spec = masking(spec) - -plot_spectrogram(spec[0], title="Masked along time axis") - -###################################################################### -# FrequencyMasking -# ---------------- -# - - -torch.random.manual_seed(4) - -spec = get_spectrogram() -plot_spectrogram(spec[0], title="Original") - -masking = T.FrequencyMasking(freq_mask_param=80) -spec = masking(spec) - -plot_spectrogram(spec[0], title="Masked along frequency axis") diff --git a/beginner_source/audio_feature_augmentation_tutorial.rst b/beginner_source/audio_feature_augmentation_tutorial.rst new file mode 100644 index 00000000000..55d3811b3fa --- /dev/null +++ b/beginner_source/audio_feature_augmentation_tutorial.rst @@ -0,0 +1,10 @@ +Audio Feature Augmentation +========================== + +This tutorial has been moved to https://pytorch.org/audio/stable/tutorials/audio_data_augmentation_tutorial.html + +It will redirect in 3 seconds. + +.. raw:: html + + diff --git a/beginner_source/audio_feature_extractions_tutorial.py b/beginner_source/audio_feature_extractions_tutorial.py deleted file mode 100644 index 822c00d97ba..00000000000 --- a/beginner_source/audio_feature_extractions_tutorial.py +++ /dev/null @@ -1,457 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Audio Feature Extractions -========================= - -``torchaudio`` implements feature extractions commonly used in the audio -domain. They are available in ``torchaudio.functional`` and -``torchaudio.transforms``. - -``functional`` implements features as standalone functions. -They are stateless. - -``transforms`` implements features as objects, -using implementations from ``functional`` and ``torch.nn.Module``. -They can be serialized using TorchScript. -""" - -import torch -import torchaudio -import torchaudio.functional as F -import torchaudio.transforms as T - -print(torch.__version__) -print(torchaudio.__version__) - -###################################################################### -# Preparation -# ----------- -# -# .. note:: -# -# When running this tutorial in Google Colab, install the required packages -# -# .. 
code:: -# -# !pip install librosa -# -from IPython.display import Audio -import librosa -import matplotlib.pyplot as plt -from torchaudio.utils import download_asset - -torch.random.manual_seed(0) - -SAMPLE_SPEECH = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav") - - -def plot_waveform(waveform, sr, title="Waveform"): - waveform = waveform.numpy() - - num_channels, num_frames = waveform.shape - time_axis = torch.arange(0, num_frames) / sr - - figure, axes = plt.subplots(num_channels, 1) - axes.plot(time_axis, waveform[0], linewidth=1) - axes.grid(True) - figure.suptitle(title) - plt.show(block=False) - - -def plot_spectrogram(specgram, title=None, ylabel="freq_bin"): - fig, axs = plt.subplots(1, 1) - axs.set_title(title or "Spectrogram (db)") - axs.set_ylabel(ylabel) - axs.set_xlabel("frame") - im = axs.imshow(librosa.power_to_db(specgram), origin="lower", aspect="auto") - fig.colorbar(im, ax=axs) - plt.show(block=False) - - -def plot_fbank(fbank, title=None): - fig, axs = plt.subplots(1, 1) - axs.set_title(title or "Filter bank") - axs.imshow(fbank, aspect="auto") - axs.set_ylabel("frequency bin") - axs.set_xlabel("mel bin") - plt.show(block=False) - - -###################################################################### -# Overview of audio features -# -------------------------- -# -# The following diagram shows the relationship between common audio features -# and torchaudio APIs to generate them. -# -# .. image:: https://download.pytorch.org/torchaudio/tutorial-assets/torchaudio_feature_extractions.png -# -# For the complete list of available features, please refer to the -# documentation. -# - - -###################################################################### -# Spectrogram -# ----------- -# -# To get the frequency make-up of an audio signal as it varies with time, -# you can use :py:func:`torchaudio.transforms.Spectrogram`. -# - -SPEECH_WAVEFORM, SAMPLE_RATE = torchaudio.load(SAMPLE_SPEECH) - -plot_waveform(SPEECH_WAVEFORM, SAMPLE_RATE, title="Original waveform") -Audio(SPEECH_WAVEFORM.numpy(), rate=SAMPLE_RATE) - - -###################################################################### -# - -n_fft = 1024 -win_length = None -hop_length = 512 - -# Define transform -spectrogram = T.Spectrogram( - n_fft=n_fft, - win_length=win_length, - hop_length=hop_length, - center=True, - pad_mode="reflect", - power=2.0, -) - -###################################################################### -# - -# Perform transform -spec = spectrogram(SPEECH_WAVEFORM) - -###################################################################### -# - -plot_spectrogram(spec[0], title="torchaudio") - -###################################################################### -# GriffinLim -# ---------- -# -# To recover a waveform from a spectrogram, you can use ``GriffinLim``. 
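######################################################################
# Griffin-Lim is needed because the power spectrogram discards phase. As a
# side note (a sketch, assuming a torchaudio version that provides
# ``InverseSpectrogram``), a complex-valued spectrogram kept with
# ``power=None`` can be inverted exactly instead:
#

complex_spec = T.Spectrogram(n_fft=1024, hop_length=512, power=None)(SPEECH_WAVEFORM)
exact_waveform = T.InverseSpectrogram(n_fft=1024, hop_length=512)(complex_spec)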
-# - -torch.random.manual_seed(0) - -n_fft = 1024 -win_length = None -hop_length = 512 - -spec = T.Spectrogram( - n_fft=n_fft, - win_length=win_length, - hop_length=hop_length, -)(SPEECH_WAVEFORM) - -###################################################################### -# - -griffin_lim = T.GriffinLim( - n_fft=n_fft, - win_length=win_length, - hop_length=hop_length, -) - -###################################################################### -# - -reconstructed_waveform = griffin_lim(spec) - -###################################################################### -# - -plot_waveform(reconstructed_waveform, SAMPLE_RATE, title="Reconstructed") -Audio(reconstructed_waveform, rate=SAMPLE_RATE) - -###################################################################### -# Mel Filter Bank -# --------------- -# -# :py:func:`torchaudio.functional.melscale_fbanks` generates the filter bank -# for converting frequency bins to mel-scale bins. -# -# Since this function does not require input audio/features, there is no -# equivalent transform in :py:func:`torchaudio.transforms`. -# - -n_fft = 256 -n_mels = 64 -sample_rate = 6000 - -mel_filters = F.melscale_fbanks( - int(n_fft // 2 + 1), - n_mels=n_mels, - f_min=0.0, - f_max=sample_rate / 2.0, - sample_rate=sample_rate, - norm="slaney", -) - -###################################################################### -# - -plot_fbank(mel_filters, "Mel Filter Bank - torchaudio") - -###################################################################### -# Comparison against librosa -# ~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# For reference, here is the equivalent way to get the mel filter bank -# with ``librosa``. -# - -mel_filters_librosa = librosa.filters.mel( - sr=sample_rate, - n_fft=n_fft, - n_mels=n_mels, - fmin=0.0, - fmax=sample_rate / 2.0, - norm="slaney", - htk=True, -).T - -###################################################################### -# - -plot_fbank(mel_filters_librosa, "Mel Filter Bank - librosa") - -mse = torch.square(mel_filters - mel_filters_librosa).mean().item() -print("Mean Square Difference: ", mse) - -###################################################################### -# MelSpectrogram -# -------------- -# -# Generating a mel-scale spectrogram involves generating a spectrogram -# and performing mel-scale conversion. In ``torchaudio``, -# :py:func:`torchaudio.transforms.MelSpectrogram` provides -# this functionality. -# - -n_fft = 1024 -win_length = None -hop_length = 512 -n_mels = 128 - -mel_spectrogram = T.MelSpectrogram( - sample_rate=sample_rate, - n_fft=n_fft, - win_length=win_length, - hop_length=hop_length, - center=True, - pad_mode="reflect", - power=2.0, - norm="slaney", - onesided=True, - n_mels=n_mels, - mel_scale="htk", -) - -melspec = mel_spectrogram(SPEECH_WAVEFORM) - -###################################################################### -# - -plot_spectrogram(melspec[0], title="MelSpectrogram - torchaudio", ylabel="mel freq") - -###################################################################### -# Comparison against librosa -# ~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# For reference, here is the equivalent means of generating mel-scale -# spectrograms with ``librosa``. 
-# - -melspec_librosa = librosa.feature.melspectrogram( - y=SPEECH_WAVEFORM.numpy()[0], - sr=sample_rate, - n_fft=n_fft, - hop_length=hop_length, - win_length=win_length, - center=True, - pad_mode="reflect", - power=2.0, - n_mels=n_mels, - norm="slaney", - htk=True, -) - -###################################################################### -# - -plot_spectrogram(melspec_librosa, title="MelSpectrogram - librosa", ylabel="mel freq") - -mse = torch.square(melspec - melspec_librosa).mean().item() -print("Mean Square Difference: ", mse) - -###################################################################### -# MFCC -# ---- -# - -n_fft = 2048 -win_length = None -hop_length = 512 -n_mels = 256 -n_mfcc = 256 - -mfcc_transform = T.MFCC( - sample_rate=sample_rate, - n_mfcc=n_mfcc, - melkwargs={ - "n_fft": n_fft, - "n_mels": n_mels, - "hop_length": hop_length, - "mel_scale": "htk", - }, -) - -mfcc = mfcc_transform(SPEECH_WAVEFORM) - -###################################################################### -# - -plot_spectrogram(mfcc[0]) - -###################################################################### -# Comparison against librosa -# ~~~~~~~~~~~~~~~~~~~~~~~~~~ -# - -melspec = librosa.feature.melspectrogram( - y=SPEECH_WAVEFORM.numpy()[0], - sr=sample_rate, - n_fft=n_fft, - win_length=win_length, - hop_length=hop_length, - n_mels=n_mels, - htk=True, - norm=None, -) - -mfcc_librosa = librosa.feature.mfcc( - S=librosa.core.spectrum.power_to_db(melspec), - n_mfcc=n_mfcc, - dct_type=2, - norm="ortho", -) - -###################################################################### -# - -plot_spectrogram(mfcc_librosa) - -mse = torch.square(mfcc - mfcc_librosa).mean().item() -print("Mean Square Difference: ", mse) - -###################################################################### -# LFCC -# ---- -# - -n_fft = 2048 -win_length = None -hop_length = 512 -n_lfcc = 256 - -lfcc_transform = T.LFCC( - sample_rate=sample_rate, - n_lfcc=n_lfcc, - speckwargs={ - "n_fft": n_fft, - "win_length": win_length, - "hop_length": hop_length, - }, -) - -lfcc = lfcc_transform(SPEECH_WAVEFORM) -plot_spectrogram(lfcc[0]) - -###################################################################### -# Pitch -# ----- -# - -pitch = F.detect_pitch_frequency(SPEECH_WAVEFORM, SAMPLE_RATE) - -###################################################################### -# - -def plot_pitch(waveform, sr, pitch): - figure, axis = plt.subplots(1, 1) - axis.set_title("Pitch Feature") - axis.grid(True) - - end_time = waveform.shape[1] / sr - time_axis = torch.linspace(0, end_time, waveform.shape[1]) - axis.plot(time_axis, waveform[0], linewidth=1, color="gray", alpha=0.3) - - axis2 = axis.twinx() - time_axis = torch.linspace(0, end_time, pitch.shape[1]) - axis2.plot(time_axis, pitch[0], linewidth=2, label="Pitch", color="green") - - axis2.legend(loc=0) - plt.show(block=False) - - -plot_pitch(SPEECH_WAVEFORM, SAMPLE_RATE, pitch) - -###################################################################### -# Kaldi Pitch (beta) -# ------------------ -# -# Kaldi Pitch feature [1] is a pitch detection mechanism tuned for automatic -# speech recognition (ASR) applications. This is a beta feature in ``torchaudio``, -# and it is available as :py:func:`torchaudio.functional.compute_kaldi_pitch`. -# -# 1. A pitch extraction algorithm tuned for automatic speech recognition -# -# Ghahremani, B. BabaAli, D. Povey, K. Riedhammer, J. Trmal and S. 
-# Khudanpur -# -# 2014 IEEE International Conference on Acoustics, Speech and Signal -# Processing (ICASSP), Florence, 2014, pp. 2494-2498, doi: -# 10.1109/ICASSP.2014.6854049. -# [`abstract `__], -# [`paper `__] -# - -pitch_feature = F.compute_kaldi_pitch(SPEECH_WAVEFORM, SAMPLE_RATE) -pitch, nfcc = pitch_feature[..., 0], pitch_feature[..., 1] - -###################################################################### -# - -def plot_kaldi_pitch(waveform, sr, pitch, nfcc): - _, axis = plt.subplots(1, 1) - axis.set_title("Kaldi Pitch Feature") - axis.grid(True) - - end_time = waveform.shape[1] / sr - time_axis = torch.linspace(0, end_time, waveform.shape[1]) - axis.plot(time_axis, waveform[0], linewidth=1, color="gray", alpha=0.3) - - time_axis = torch.linspace(0, end_time, pitch.shape[1]) - ln1 = axis.plot(time_axis, pitch[0], linewidth=2, label="Pitch", color="green") - axis.set_ylim((-1.3, 1.3)) - - axis2 = axis.twinx() - time_axis = torch.linspace(0, end_time, nfcc.shape[1]) - ln2 = axis2.plot(time_axis, nfcc[0], linewidth=2, label="NFCC", color="blue", linestyle="--") - - lns = ln1 + ln2 - labels = [l.get_label() for l in lns] - axis.legend(lns, labels, loc=0) - plt.show(block=False) - - -plot_kaldi_pitch(SPEECH_WAVEFORM, SAMPLE_RATE, pitch, nfcc) diff --git a/beginner_source/audio_feature_extractions_tutorial.rst b/beginner_source/audio_feature_extractions_tutorial.rst new file mode 100644 index 00000000000..a2a8da4ab75 --- /dev/null +++ b/beginner_source/audio_feature_extractions_tutorial.rst @@ -0,0 +1,10 @@ +Audio Feature Extractions +========================= + +This tutorial has been moved to https://pytorch.org/audio/stable/tutorials/audio_feature_extractions_tutorial.html + +It will redirect in 3 seconds. + +.. raw:: html + + diff --git a/beginner_source/audio_io_tutorial.py b/beginner_source/audio_io_tutorial.py deleted file mode 100644 index 4917f1b1025..00000000000 --- a/beginner_source/audio_io_tutorial.py +++ /dev/null @@ -1,385 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Audio I/O -========= - -This tutorial shows how to use TorchAudio's basic I/O API to load audio files -into PyTorch's Tensor object, and save Tensor objects to audio files. -""" - -import torch -import torchaudio - -print(torch.__version__) -print(torchaudio.__version__) - -###################################################################### -# Preparation -# ----------- -# -# First, we import the modules and download the audio assets we use in this tutorial. -# -# .. note:: -# When running this tutorial in Google Colab, install the required packages -# with the following: -# -# .. code:: -# -# !pip install boto3 - -import io -import os -import tarfile -import tempfile - -import boto3 -import matplotlib.pyplot as plt -import requests -from botocore import UNSIGNED -from botocore.config import Config -from IPython.display import Audio -from torchaudio.utils import download_asset - -SAMPLE_GSM = download_asset("tutorial-assets/steam-train-whistle-daniel_simon.gsm") -SAMPLE_WAV = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav") -SAMPLE_WAV_8000 = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042-8000hz.wav") - - - -###################################################################### -# Querying audio metadata -# ----------------------- -# -# Function :py:func:`torchaudio.info` fetches audio metadata. -# You can provide a path-like object or file-like object. 
-# - -metadata = torchaudio.info(SAMPLE_WAV) -print(metadata) - -###################################################################### -# Where -# -# - ``sample_rate`` is the sampling rate of the audio -# - ``num_channels`` is the number of channels -# - ``num_frames`` is the number of frames per channel -# - ``bits_per_sample`` is bit depth -# - ``encoding`` is the sample coding format -# -# ``encoding`` can take on one of the following values: -# -# - ``"PCM_S"``: Signed integer linear PCM -# - ``"PCM_U"``: Unsigned integer linear PCM -# - ``"PCM_F"``: Floating point linear PCM -# - ``"FLAC"``: Flac, `Free Lossless Audio -# Codec `__ -# - ``"ULAW"``: Mu-law, -# [`wikipedia `__] -# - ``"ALAW"``: A-law -# [`wikipedia `__] -# - ``"MP3"`` : MP3, MPEG-1 Audio Layer III -# - ``"VORBIS"``: OGG Vorbis [`xiph.org `__] -# - ``"AMR_NB"``: Adaptive Multi-Rate -# [`wikipedia `__] -# - ``"AMR_WB"``: Adaptive Multi-Rate Wideband -# [`wikipedia `__] -# - ``"OPUS"``: Opus [`opus-codec.org `__] -# - ``"GSM"``: GSM-FR -# [`wikipedia `__] -# - ``"HTK"``: Single channel 16-bit PCM -# - ``"UNKNOWN"`` None of above -# - -###################################################################### -# **Note** -# -# - ``bits_per_sample`` can be ``0`` for formats with compression and/or -# variable bit rate (such as MP3). -# - ``num_frames`` can be ``0`` for GSM-FR format. -# - -metadata = torchaudio.info(SAMPLE_GSM) -print(metadata) - - -###################################################################### -# Querying file-like object -# ------------------------- -# -# :py:func:`torchaudio.info` works on file-like objects. -# - -url = "https://download.pytorch.org/torchaudio/tutorial-assets/steam-train-whistle-daniel_simon.wav" -with requests.get(url, stream=True) as response: - metadata = torchaudio.info(response.raw) -print(metadata) - -###################################################################### -# .. note:: -# -# When passing a file-like object, ``info`` does not read -# all of the underlying data; rather, it reads only a portion -# of the data from the beginning. -# Therefore, for a given audio format, it may not be able to retrieve the -# correct metadata, including the format itself. In such case, you -# can pass ``format`` argument to specify the format of the audio. - -###################################################################### -# Loading audio data -# ------------------ -# -# To load audio data, you can use :py:func:`torchaudio.load`. -# -# This function accepts a path-like object or file-like object as input. -# -# The returned value is a tuple of waveform (``Tensor``) and sample rate -# (``int``). -# -# By default, the resulting tensor object has ``dtype=torch.float32`` and -# its value range is ``[-1.0, 1.0]``. -# -# For the list of supported format, please refer to `the torchaudio -# documentation `__. 
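######################################################################
# As a side note to the default behavior described above (an illustrative
# sketch): the float32 / [-1.0, 1.0] conversion is controlled by the
# ``normalize`` argument. For WAV files, passing ``normalize=False`` keeps the
# integer samples as stored on disk; for compressed formats the argument has
# no effect and floating-point samples are returned regardless.
#

int_waveform, _ = torchaudio.load(SAMPLE_WAV, normalize=False)
print(int_waveform.dtype)  # typically torch.int16 for 16-bit signed PCM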
-# - -waveform, sample_rate = torchaudio.load(SAMPLE_WAV) - - -###################################################################### -# -def plot_waveform(waveform, sample_rate): - waveform = waveform.numpy() - - num_channels, num_frames = waveform.shape - time_axis = torch.arange(0, num_frames) / sample_rate - - figure, axes = plt.subplots(num_channels, 1) - if num_channels == 1: - axes = [axes] - for c in range(num_channels): - axes[c].plot(time_axis, waveform[c], linewidth=1) - axes[c].grid(True) - if num_channels > 1: - axes[c].set_ylabel(f"Channel {c+1}") - figure.suptitle("waveform") - plt.show(block=False) - - -###################################################################### -# -plot_waveform(waveform, sample_rate) - - -###################################################################### -# -def plot_specgram(waveform, sample_rate, title="Spectrogram"): - waveform = waveform.numpy() - - num_channels, num_frames = waveform.shape - - figure, axes = plt.subplots(num_channels, 1) - if num_channels == 1: - axes = [axes] - for c in range(num_channels): - axes[c].specgram(waveform[c], Fs=sample_rate) - if num_channels > 1: - axes[c].set_ylabel(f"Channel {c+1}") - figure.suptitle(title) - plt.show(block=False) - - -###################################################################### -# -plot_specgram(waveform, sample_rate) - - -###################################################################### -# -Audio(waveform.numpy()[0], rate=sample_rate) - -###################################################################### -# Loading from file-like object -# ----------------------------- -# -# The I/O functions support file-like objects. -# This allows for fetching and decoding audio data from locations -# within and beyond the local file system. -# The following examples illustrate this. -# - -###################################################################### -# - -# Load audio data as HTTP request -url = "https://download.pytorch.org/torchaudio/tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav" -with requests.get(url, stream=True) as response: - waveform, sample_rate = torchaudio.load(response.raw) -plot_specgram(waveform, sample_rate, title="HTTP datasource") - -###################################################################### -# - -# Load audio from tar file -tar_path = download_asset("tutorial-assets/VOiCES_devkit.tar.gz") -tar_item = "VOiCES_devkit/source-16k/train/sp0307/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav" -with tarfile.open(tar_path, mode="r") as tarfile_: - fileobj = tarfile_.extractfile(tar_item) - waveform, sample_rate = torchaudio.load(fileobj) -plot_specgram(waveform, sample_rate, title="TAR file") - -###################################################################### -# - -# Load audio from S3 -bucket = "pytorch-tutorial-assets" -key = "VOiCES_devkit/source-16k/train/sp0307/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav" -client = boto3.client("s3", config=Config(signature_version=UNSIGNED)) -response = client.get_object(Bucket=bucket, Key=key) -waveform, sample_rate = torchaudio.load(response["Body"]) -plot_specgram(waveform, sample_rate, title="From S3") - - -###################################################################### -# Tips on slicing -# --------------- -# -# Providing ``num_frames`` and ``frame_offset`` arguments restricts -# decoding to the corresponding segment of the input. -# -# The same result can be achieved using vanilla Tensor slicing, -# (i.e. ``waveform[:, frame_offset:frame_offset+num_frames]``). 
However, -# providing ``num_frames`` and ``frame_offset`` arguments is more -# efficient. -# -# This is because the function will end data acquisition and decoding -# once it finishes decoding the requested frames. This is advantageous -# when the audio data are transferred via network as the data transfer will -# stop as soon as the necessary amount of data is fetched. -# -# The following example illustrates this. -# - -# Illustration of two different decoding methods. -# The first one will fetch all the data and decode them, while -# the second one will stop fetching data once it completes decoding. -# The resulting waveforms are identical. - -frame_offset, num_frames = 16000, 16000 # Fetch and decode the 1 - 2 seconds - -url = "https://download.pytorch.org/torchaudio/tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav" -print("Fetching all the data...") -with requests.get(url, stream=True) as response: - waveform1, sample_rate1 = torchaudio.load(response.raw) - waveform1 = waveform1[:, frame_offset : frame_offset + num_frames] - print(f" - Fetched {response.raw.tell()} bytes") - -print("Fetching until the requested frames are available...") -with requests.get(url, stream=True) as response: - waveform2, sample_rate2 = torchaudio.load(response.raw, frame_offset=frame_offset, num_frames=num_frames) - print(f" - Fetched {response.raw.tell()} bytes") - -print("Checking the resulting waveform ... ", end="") -assert (waveform1 == waveform2).all() -print("matched!") - -###################################################################### -# Saving audio to file -# -------------------- -# -# To save audio data in formats interpretable by common applications, -# you can use :py:func:`torchaudio.save`. -# -# This function accepts a path-like object or file-like object. -# -# When passing a file-like object, you also need to provide argument ``format`` -# so that the function knows which format it should use. In the -# case of a path-like object, the function will infer the format from -# the extension. If you are saving to a file without an extension, you need -# to provide argument ``format``. -# -# When saving WAV-formatted data, the default encoding for ``float32`` Tensor -# is 32-bit floating-point PCM. You can provide arguments ``encoding`` and -# ``bits_per_sample`` to change this behavior. For example, to save data -# in 16-bit signed integer PCM, you can do the following. -# -# .. note:: -# -# Saving data in encodings with a lower bit depth reduces the -# resulting file size but also precision. -# - -waveform, sample_rate = torchaudio.load(SAMPLE_WAV) - - -###################################################################### -# - -def inspect_file(path): - print("-" * 10) - print("Source:", path) - print("-" * 10) - print(f" - File size: {os.path.getsize(path)} bytes") - print(f" - {torchaudio.info(path)}") - print() - -###################################################################### -# -# Save without any encoding option. 
-# The function will pick up the encoding which -# the provided data fit -with tempfile.TemporaryDirectory() as tempdir: - path = f"{tempdir}/save_example_default.wav" - torchaudio.save(path, waveform, sample_rate) - inspect_file(path) - -###################################################################### -# -# Save as 16-bit signed integer Linear PCM -# The resulting file occupies half the storage but loses precision -with tempfile.TemporaryDirectory() as tempdir: - path = f"{tempdir}/save_example_PCM_S16.wav" - torchaudio.save(path, waveform, sample_rate, encoding="PCM_S", bits_per_sample=16) - inspect_file(path) - - -###################################################################### -# :py:func:`torchaudio.save` can also handle other formats. -# To name a few: -# - -formats = [ - "flac", - "vorbis", - "sph", - "amb", - "amr-nb", - "gsm", -] - -###################################################################### -# -waveform, sample_rate = torchaudio.load(SAMPLE_WAV_8000) -with tempfile.TemporaryDirectory() as tempdir: - for format in formats: - path = f"{tempdir}/save_example.{format}" - torchaudio.save(path, waveform, sample_rate, format=format) - inspect_file(path) - -###################################################################### -# Saving to file-like object -# -------------------------- -# -# Similar to the other I/O functions, you can save audio to file-like -# objects. When saving to a file-like object, argument ``format`` is -# required. -# - - -waveform, sample_rate = torchaudio.load(SAMPLE_WAV) - -# Saving to bytes buffer -buffer_ = io.BytesIO() -torchaudio.save(buffer_, waveform, sample_rate, format="wav") - -buffer_.seek(0) -print(buffer_.read(16)) diff --git a/beginner_source/audio_io_tutorial.rst b/beginner_source/audio_io_tutorial.rst new file mode 100644 index 00000000000..3263ad93a98 --- /dev/null +++ b/beginner_source/audio_io_tutorial.rst @@ -0,0 +1,10 @@ +Audio I/O +========= + +This tutorial has been moved to https://pytorch.org/audio/stable/tutorials/audio_io_tutorial.html + +It will redirect in 3 seconds. + +.. raw:: html + + diff --git a/beginner_source/audio_resampling_tutorial.rst b/beginner_source/audio_resampling_tutorial.rst index 0a611110578..01210830eb7 100644 --- a/beginner_source/audio_resampling_tutorial.rst +++ b/beginner_source/audio_resampling_tutorial.rst @@ -1,3 +1,6 @@ +Audio Resampling +================ + This tutorial has been moved to `a new location `_ You will be redirected in 3 seconds. diff --git a/intermediate_source/forced_alignment_with_torchaudio_tutorial.py b/intermediate_source/forced_alignment_with_torchaudio_tutorial.py deleted file mode 100644 index 1a5e8025d8f..00000000000 --- a/intermediate_source/forced_alignment_with_torchaudio_tutorial.py +++ /dev/null @@ -1,528 +0,0 @@ -""" -Forced Alignment with Wav2Vec2 -============================== - -**Author** `Moto Hira `__ - -This tutorial shows how to align transcript to speech with -``torchaudio``, using CTC segmentation algorithm described in -`CTC-Segmentation of Large Corpora for German End-to-end Speech -Recognition `__. - -""" - -import torch -import torchaudio - -print(torch.__version__) -print(torchaudio.__version__) - - -device = torch.device("cuda" if torch.cuda.is_available() else "cpu") -print(device) - - -###################################################################### -# Overview -# -------- -# -# The process of alignment looks like the following. -# -# 1. Estimate the frame-wise label probability from audio waveform -# 2. 
Generate the trellis matrix which represents the probability of -# labels aligned at time step. -# 3. Find the most likely path from the trellis matrix. -# -# In this example, we use ``torchaudio``\ ’s ``Wav2Vec2`` model for -# acoustic feature extraction. -# - - -###################################################################### -# Preparation -# ----------- -# -# First we import the necessary packages, and fetch data that we work on. -# - -# %matplotlib inline - -from dataclasses import dataclass - -import IPython -import matplotlib -import matplotlib.pyplot as plt - -matplotlib.rcParams["figure.figsize"] = [16.0, 4.8] - -torch.random.manual_seed(0) - -SPEECH_FILE = torchaudio.utils.download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav") - - -###################################################################### -# Generate frame-wise label probability -# ------------------------------------- -# -# The first step is to generate the label class porbability of each aduio -# frame. We can use a Wav2Vec2 model that is trained for ASR. Here we use -# :py:func:`torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H`. -# -# ``torchaudio`` provides easy access to pretrained models with associated -# labels. -# -# .. note:: -# -# In the subsequent sections, we will compute the probability in -# log-domain to avoid numerical instability. For this purpose, we -# normalize the ``emission`` with :py:func:`torch.log_softmax`. -# - -bundle = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H -model = bundle.get_model().to(device) -labels = bundle.get_labels() -with torch.inference_mode(): - waveform, _ = torchaudio.load(SPEECH_FILE) - emissions, _ = model(waveform.to(device)) - emissions = torch.log_softmax(emissions, dim=-1) - -emission = emissions[0].cpu().detach() - -################################################################################ -# Visualization -################################################################################ -print(labels) -plt.imshow(emission.T) -plt.colorbar() -plt.title("Frame-wise class probability") -plt.xlabel("Time") -plt.ylabel("Labels") -plt.show() - - -###################################################################### -# Generate alignment probability (trellis) -# ---------------------------------------- -# -# From the emission matrix, next we generate the trellis which represents -# the probability of transcript labels occur at each time frame. -# -# Trellis is 2D matrix with time axis and label axis. The label axis -# represents the transcript that we are aligning. In the following, we use -# :math:`t` to denote the index in time axis and :math:`j` to denote the -# index in label axis. :math:`c_j` represents the label at label index -# :math:`j`. -# -# To generate, the probability of time step :math:`t+1`, we look at the -# trellis from time step :math:`t` and emission at time step :math:`t+1`. -# There are two path to reach to time step :math:`t+1` with label -# :math:`c_{j+1}`. The first one is the case where the label was -# :math:`c_{j+1}` at :math:`t` and there was no label change from -# :math:`t` to :math:`t+1`. The other case is where the label was -# :math:`c_j` at :math:`t` and it transitioned to the next label -# :math:`c_{j+1}` at :math:`t+1`. -# -# The follwoing diagram illustrates this transition. -# -# .. 
image:: https://download.pytorch.org/torchaudio/tutorial-assets/ctc-forward.png -# -# Since we are looking for the most likely transitions, we take the more -# likely path for the value of :math:`k_{(t+1, j+1)}`, that is -# -# :math:`k_{(t+1, j+1)} = max( k_{(t, j)} p(t+1, c_{j+1}), k_{(t, j+1)} p(t+1, repeat) )` -# -# where :math:`k` represents is trellis matrix, and :math:`p(t, c_j)` -# represents the probability of label :math:`c_j` at time step :math:`t`. -# :math:`repeat` represents the blank token from CTC formulation. (For the -# detail of CTC algorithm, please refer to the *Sequence Modeling with CTC* -# [`distill.pub `__]) -# - -transcript = "I|HAD|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT" -dictionary = {c: i for i, c in enumerate(labels)} - -tokens = [dictionary[c] for c in transcript] -print(list(zip(transcript, tokens))) - - -def get_trellis(emission, tokens, blank_id=0): - num_frame = emission.size(0) - num_tokens = len(tokens) - - # Trellis has extra diemsions for both time axis and tokens. - # The extra dim for tokens represents (start-of-sentence) - # The extra dim for time axis is for simplification of the code. - trellis = torch.empty((num_frame + 1, num_tokens + 1)) - trellis[0, 0] = 0 - trellis[1:, 0] = torch.cumsum(emission[:, 0], 0) - trellis[0, -num_tokens:] = -float("inf") - trellis[-num_tokens:, 0] = float("inf") - - for t in range(num_frame): - trellis[t + 1, 1:] = torch.maximum( - # Score for staying at the same token - trellis[t, 1:] + emission[t, blank_id], - # Score for changing to the next token - trellis[t, :-1] + emission[t, tokens], - ) - return trellis - - -trellis = get_trellis(emission, tokens) - -################################################################################ -# Visualization -################################################################################ -plt.imshow(trellis[1:, 1:].T, origin="lower") -plt.annotate("- Inf", (trellis.size(1) / 5, trellis.size(1) / 1.5)) -plt.colorbar() -plt.show() - -###################################################################### -# In the above visualization, we can see that there is a trace of high -# probability crossing the matrix diagonally. -# - - -###################################################################### -# Find the most likely path (backtracking) -# ---------------------------------------- -# -# Once the trellis is generated, we will traverse it following the -# elements with high probability. -# -# We will start from the last label index with the time step of highest -# probability, then, we traverse back in time, picking stay -# (:math:`c_j \rightarrow c_j`) or transition -# (:math:`c_j \rightarrow c_{j+1}`), based on the post-transition -# probability :math:`k_{t, j} p(t+1, c_{j+1})` or -# :math:`k_{t, j+1} p(t+1, repeat)`. -# -# Transition is done once the label reaches the beginning. -# -# The trellis matrix is used for path-finding, but for the final -# probability of each segment, we take the frame-wise probability from -# emission matrix. -# - - -@dataclass -class Point: - token_index: int - time_index: int - score: float - - -def backtrack(trellis, emission, tokens, blank_id=0): - # Note: - # j and t are indices for trellis, which has extra dimensions - # for time and tokens at the beginning. - # When referring to time frame index `T` in trellis, - # the corresponding index in emission is `T-1`. - # Similarly, when referring to token index `J` in trellis, - # the corresponding index in transcript is `J-1`. 
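    # In short: start from the time frame where the fully-aligned score is
    # highest, then walk backwards one frame at a time, comparing the score of
    # "stayed on the same token" with the score of "just advanced from the
    # previous token", and follow whichever is larger.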
- j = trellis.size(1) - 1 - t_start = torch.argmax(trellis[:, j]).item() - - path = [] - for t in range(t_start, 0, -1): - # 1. Figure out if the current position was stay or change - # Note (again): - # `emission[J-1]` is the emission at time frame `J` of trellis dimension. - # Score for token staying the same from time frame J-1 to T. - stayed = trellis[t - 1, j] + emission[t - 1, blank_id] - # Score for token changing from C-1 at T-1 to J at T. - changed = trellis[t - 1, j - 1] + emission[t - 1, tokens[j - 1]] - - # 2. Store the path with frame-wise probability. - prob = emission[t - 1, tokens[j - 1] if changed > stayed else 0].exp().item() - # Return token index and time index in non-trellis coordinate. - path.append(Point(j - 1, t - 1, prob)) - - # 3. Update the token - if changed > stayed: - j -= 1 - if j == 0: - break - else: - raise ValueError("Failed to align") - return path[::-1] - - -path = backtrack(trellis, emission, tokens) -for p in path: - print(p) - - -################################################################################ -# Visualization -################################################################################ -def plot_trellis_with_path(trellis, path): - # To plot trellis with path, we take advantage of 'nan' value - trellis_with_path = trellis.clone() - for _, p in enumerate(path): - trellis_with_path[p.time_index, p.token_index] = float("nan") - plt.imshow(trellis_with_path[1:, 1:].T, origin="lower") - - -plot_trellis_with_path(trellis, path) -plt.title("The path found by backtracking") -plt.show() - -###################################################################### -# Looking good. Now this path contains repetations for the same labels, so -# let’s merge them to make it close to the original transcript. -# -# When merging the multiple path points, we simply take the average -# probability for the merged segments. 
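######################################################################
# For example, if a label occupies three consecutive frames with frame-wise
# probabilities 0.2, 0.9 and 0.7, the merged segment is assigned the score
# (0.2 + 0.9 + 0.7) / 3 = 0.6.
#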
-#
-
-
-# Merge the labels
-@dataclass
-class Segment:
-    label: str
-    start: int
-    end: int
-    score: float
-
-    def __repr__(self):
-        return f"{self.label}\t({self.score:4.2f}): [{self.start:5d}, {self.end:5d})"
-
-    @property
-    def length(self):
-        return self.end - self.start
-
-
-def merge_repeats(path):
-    i1, i2 = 0, 0
-    segments = []
-    while i1 < len(path):
-        while i2 < len(path) and path[i1].token_index == path[i2].token_index:
-            i2 += 1
-        score = sum(path[k].score for k in range(i1, i2)) / (i2 - i1)
-        segments.append(
-            Segment(
-                transcript[path[i1].token_index],
-                path[i1].time_index,
-                path[i2 - 1].time_index + 1,
-                score,
-            )
-        )
-        i1 = i2
-    return segments
-
-
-segments = merge_repeats(path)
-for seg in segments:
-    print(seg)
-
-
-################################################################################
-# Visualization
-################################################################################
-def plot_trellis_with_segments(trellis, segments, transcript):
-    # To plot the trellis with the path, we take advantage of 'nan' values.
-    trellis_with_path = trellis.clone()
-    for i, seg in enumerate(segments):
-        if seg.label != "|":
-            trellis_with_path[seg.start + 1 : seg.end + 1, i + 1] = float("nan")
-
-    fig, [ax1, ax2] = plt.subplots(2, 1, figsize=(16, 9.5))
-    ax1.set_title("Path, label and probability for each label")
-    ax1.imshow(trellis_with_path.T, origin="lower")
-    ax1.set_xticks([])
-
-    for i, seg in enumerate(segments):
-        if seg.label != "|":
-            ax1.annotate(seg.label, (seg.start + 0.7, i + 0.3), weight="bold")
-            ax1.annotate(f"{seg.score:.2f}", (seg.start - 0.3, i + 4.3))
-
-    ax2.set_title("Label probability with and without repetition")
-    xs, hs, ws = [], [], []
-    for seg in segments:
-        if seg.label != "|":
-            xs.append((seg.end + seg.start) / 2 + 0.4)
-            hs.append(seg.score)
-            ws.append(seg.end - seg.start)
-            ax2.annotate(seg.label, (seg.start + 0.8, -0.07), weight="bold")
-    ax2.bar(xs, hs, width=ws, color="gray", alpha=0.5, edgecolor="black")
-
-    xs, hs = [], []
-    for p in path:
-        label = transcript[p.token_index]
-        if label != "|":
-            xs.append(p.time_index + 1)
-            hs.append(p.score)
-
-    ax2.bar(xs, hs, width=0.5, alpha=0.5)
-    ax2.axhline(0, color="black")
-    ax2.set_xlim(ax1.get_xlim())
-    ax2.set_ylim(-0.1, 1.1)
-
-
-plot_trellis_with_segments(trellis, segments, transcript)
-plt.tight_layout()
-plt.show()
-
-
-######################################################################
-# Looks good. Now let's merge the words. The Wav2Vec2 model uses ``'|'``
-# as the word boundary, so we merge the segments before each occurrence of
-# ``'|'``.
-#
-# Then, finally, we slice the original audio into word segments and
-# listen to them to see if the segmentation is correct.
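-#
-# When the character segments are merged into words below, the word score is
-# the length-weighted average of its character scores; for example, segments
-# of lengths 2 and 3 with scores 0.8 and 0.6 merge into a word with score
-# (2 * 0.8 + 3 * 0.6) / 5 = 0.68.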
-# - -# Merge words -def merge_words(segments, separator="|"): - words = [] - i1, i2 = 0, 0 - while i1 < len(segments): - if i2 >= len(segments) or segments[i2].label == separator: - if i1 != i2: - segs = segments[i1:i2] - word = "".join([seg.label for seg in segs]) - score = sum(seg.score * seg.length for seg in segs) / sum(seg.length for seg in segs) - words.append(Segment(word, segments[i1].start, segments[i2 - 1].end, score)) - i1 = i2 + 1 - i2 = i1 - else: - i2 += 1 - return words - - -word_segments = merge_words(segments) -for word in word_segments: - print(word) - - -################################################################################ -# Visualization -################################################################################ -def plot_alignments(trellis, segments, word_segments, waveform): - trellis_with_path = trellis.clone() - for i, seg in enumerate(segments): - if seg.label != "|": - trellis_with_path[seg.start + 1 : seg.end + 1, i + 1] = float("nan") - - fig, [ax1, ax2] = plt.subplots(2, 1, figsize=(16, 9.5)) - - ax1.imshow(trellis_with_path[1:, 1:].T, origin="lower") - ax1.set_xticks([]) - ax1.set_yticks([]) - - for word in word_segments: - ax1.axvline(word.start - 0.5) - ax1.axvline(word.end - 0.5) - - for i, seg in enumerate(segments): - if seg.label != "|": - ax1.annotate(seg.label, (seg.start, i + 0.3)) - ax1.annotate(f"{seg.score:.2f}", (seg.start, i + 4), fontsize=8) - - # The original waveform - ratio = waveform.size(0) / (trellis.size(0) - 1) - ax2.plot(waveform) - for word in word_segments: - x0 = ratio * word.start - x1 = ratio * word.end - ax2.axvspan(x0, x1, alpha=0.1, color="red") - ax2.annotate(f"{word.score:.2f}", (x0, 0.8)) - - for seg in segments: - if seg.label != "|": - ax2.annotate(seg.label, (seg.start * ratio, 0.9)) - xticks = ax2.get_xticks() - plt.xticks(xticks, xticks / bundle.sample_rate) - ax2.set_xlabel("time [second]") - ax2.set_yticks([]) - ax2.set_ylim(-1.0, 1.0) - ax2.set_xlim(0, waveform.size(-1)) - - -plot_alignments( - trellis, - segments, - word_segments, - waveform[0], -) -plt.show() - -################################################################################ -# - -# A trick to embed the resulting audio to the generated file. -# `IPython.display.Audio` has to be the last call in a cell, -# and there should be only one call par cell. 
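-# The helper below converts word boundaries from trellis frame indices to
-# waveform sample indices (``ratio`` is the number of samples per trellis
-# frame) and returns the corresponding audio clip.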
-def display_segment(i): - ratio = waveform.size(1) / (trellis.size(0) - 1) - word = word_segments[i] - x0 = int(ratio * word.start) - x1 = int(ratio * word.end) - print(f"{word.label} ({word.score:.2f}): {x0 / bundle.sample_rate:.3f} - {x1 / bundle.sample_rate:.3f} sec") - segment = waveform[:, x0:x1] - return IPython.display.Audio(segment.numpy(), rate=bundle.sample_rate) - - -###################################################################### -# - -# Generate the audio for each segment -print(transcript) -IPython.display.Audio(SPEECH_FILE) - - -###################################################################### -# - -display_segment(0) - -###################################################################### -# - -display_segment(1) - -###################################################################### -# - -display_segment(2) - -###################################################################### -# - -display_segment(3) - -###################################################################### -# - -display_segment(4) - -###################################################################### -# - -display_segment(5) - -###################################################################### -# - -display_segment(6) - -###################################################################### -# - -display_segment(7) - -###################################################################### -# - -display_segment(8) - -###################################################################### -# Conclusion -# ---------- -# -# In this tutorial, we looked how to use torchaudio’s Wav2Vec2 model to -# perform CTC segmentation for forced alignment. -# diff --git a/intermediate_source/forced_alignment_with_torchaudio_tutorial.rst b/intermediate_source/forced_alignment_with_torchaudio_tutorial.rst new file mode 100644 index 00000000000..4c9752d016d --- /dev/null +++ b/intermediate_source/forced_alignment_with_torchaudio_tutorial.rst @@ -0,0 +1,11 @@ +Forced Alignment with Wav2Vec2 +============================== + +This tutorial has been moved to https://pytorch.org/audio/stable/tutorials/forced_alignment_tutorial.html + +It will redirect in 3 seconds. + +.. raw:: html + + + diff --git a/intermediate_source/speech_command_classification_with_torchaudio_tutorial.py b/intermediate_source/speech_command_classification_with_torchaudio_tutorial.py deleted file mode 100644 index ba7ff93a875..00000000000 --- a/intermediate_source/speech_command_classification_with_torchaudio_tutorial.py +++ /dev/null @@ -1,545 +0,0 @@ -""" -Speech Command Classification with torchaudio -********************************************* - -This tutorial will show you how to correctly format an audio dataset and -then train/test an audio classifier network on the dataset. - -Colab has GPU option available. In the menu tabs, select “Runtime” then -“Change runtime type”. In the pop-up that follows, you can choose GPU. -After the change, your runtime should automatically restart (which means -information from executed cells disappear). - -First, let’s import the common torch packages such as -`torchaudio `__ that can be installed -by following the instructions on the website. 
- -""" - -# Uncomment the line corresponding to your "runtime type" to run in Google Colab - -# CPU: -# !pip install pydub torch==1.7.0+cpu torchvision==0.8.1+cpu torchaudio==0.7.0 -f https://download.pytorch.org/whl/torch_stable.html - -# GPU: -# !pip install pydub torch==1.7.0+cu101 torchvision==0.8.1+cu101 torchaudio==0.7.0 -f https://download.pytorch.org/whl/torch_stable.html - -import torch -import torch.nn as nn -import torch.nn.functional as F -import torch.optim as optim -import torchaudio -import sys - -import matplotlib.pyplot as plt -import IPython.display as ipd - -from tqdm import tqdm - - -###################################################################### -# Let’s check if a CUDA GPU is available and select our device. Running -# the network on a GPU will greatly decrease the training/testing runtime. -# - -device = torch.device("cuda" if torch.cuda.is_available() else "cpu") -print(device) - - -###################################################################### -# Importing the Dataset -# --------------------- -# -# We use torchaudio to download and represent the dataset. Here we use -# `SpeechCommands `__, which is a -# datasets of 35 commands spoken by different people. The dataset -# ``SPEECHCOMMANDS`` is a ``torch.utils.data.Dataset`` version of the -# dataset. In this dataset, all audio files are about 1 second long (and -# so about 16000 time frames long). -# -# The actual loading and formatting steps happen when a data point is -# being accessed, and torchaudio takes care of converting the audio files -# to tensors. If one wants to load an audio file directly instead, -# ``torchaudio.load()`` can be used. It returns a tuple containing the -# newly created tensor along with the sampling frequency of the audio file -# (16kHz for SpeechCommands). -# -# Going back to the dataset, here we create a subclass that splits it into -# standard training, validation, testing subsets. -# - -from torchaudio.datasets import SPEECHCOMMANDS -import os - - -class SubsetSC(SPEECHCOMMANDS): - def __init__(self, subset: str = None): - super().__init__("./", download=True) - - def load_list(filename): - filepath = os.path.join(self._path, filename) - with open(filepath) as fileobj: - return [os.path.normpath(os.path.join(self._path, line.strip())) for line in fileobj] - - if subset == "validation": - self._walker = load_list("validation_list.txt") - elif subset == "testing": - self._walker = load_list("testing_list.txt") - elif subset == "training": - excludes = load_list("validation_list.txt") + load_list("testing_list.txt") - excludes = set(excludes) - self._walker = [w for w in self._walker if w not in excludes] - - -# Create training and testing split of the data. We do not use validation in this tutorial. -train_set = SubsetSC("training") -test_set = SubsetSC("testing") - -waveform, sample_rate, label, speaker_id, utterance_number = train_set[0] - - -###################################################################### -# A data point in the SPEECHCOMMANDS dataset is a tuple made of a waveform -# (the audio signal), the sample rate, the utterance (label), the ID of -# the speaker, the number of the utterance. -# - -print("Shape of waveform: {}".format(waveform.size())) -print("Sample rate of waveform: {}".format(sample_rate)) - -plt.plot(waveform.t().numpy()); - - -###################################################################### -# Let’s find the list of labels available in the dataset. 
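-# The labels are sorted so that the label-to-index mapping used for training
-# below is deterministic across runs.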
-# - -labels = sorted(list(set(datapoint[2] for datapoint in train_set))) -labels - - -###################################################################### -# The 35 audio labels are commands that are said by users. The first few -# files are people saying “marvin”. -# - -waveform_first, *_ = train_set[0] -ipd.Audio(waveform_first.numpy(), rate=sample_rate) - -waveform_second, *_ = train_set[1] -ipd.Audio(waveform_second.numpy(), rate=sample_rate) - - -###################################################################### -# The last file is someone saying “visual”. -# - -waveform_last, *_ = train_set[-1] -ipd.Audio(waveform_last.numpy(), rate=sample_rate) - - -###################################################################### -# Formatting the Data -# ------------------- -# -# This is a good place to apply transformations to the data. For the -# waveform, we downsample the audio for faster processing without losing -# too much of the classification power. -# -# We don’t need to apply other transformations here. It is common for some -# datasets though to have to reduce the number of channels (say from -# stereo to mono) by either taking the mean along the channel dimension, -# or simply keeping only one of the channels. Since SpeechCommands uses a -# single channel for audio, this is not needed here. -# - -new_sample_rate = 8000 -transform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=new_sample_rate) -transformed = transform(waveform) - -ipd.Audio(transformed.numpy(), rate=new_sample_rate) - - -###################################################################### -# We are encoding each word using its index in the list of labels. -# - - -def label_to_index(word): - # Return the position of the word in labels - return torch.tensor(labels.index(word)) - - -def index_to_label(index): - # Return the word corresponding to the index in labels - # This is the inverse of label_to_index - return labels[index] - - -word_start = "yes" -index = label_to_index(word_start) -word_recovered = index_to_label(index) - -print(word_start, "-->", index, "-->", word_recovered) - - -###################################################################### -# To turn a list of data point made of audio recordings and utterances -# into two batched tensors for the model, we implement a collate function -# which is used by the PyTorch DataLoader that allows us to iterate over a -# dataset by batches. Please see `the -# documentation `__ -# for more information about working with a collate function. -# -# In the collate function, we also apply the resampling, and the text -# encoding. -# - - -def pad_sequence(batch): - # Make all tensor in a batch the same length by padding with zeros - batch = [item.t() for item in batch] - batch = torch.nn.utils.rnn.pad_sequence(batch, batch_first=True, padding_value=0.) 
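-    # pad_sequence returns (batch, time, channel); permute back to (batch, channel, time)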
- return batch.permute(0, 2, 1) - - -def collate_fn(batch): - - # A data tuple has the form: - # waveform, sample_rate, label, speaker_id, utterance_number - - tensors, targets = [], [] - - # Gather in lists, and encode labels as indices - for waveform, _, label, *_ in batch: - tensors += [waveform] - targets += [label_to_index(label)] - - # Group the list of tensors into a batched tensor - tensors = pad_sequence(tensors) - targets = torch.stack(targets) - - return tensors, targets - - -batch_size = 256 - -if device == "cuda": - num_workers = 1 - pin_memory = True -else: - num_workers = 0 - pin_memory = False - -train_loader = torch.utils.data.DataLoader( - train_set, - batch_size=batch_size, - shuffle=True, - collate_fn=collate_fn, - num_workers=num_workers, - pin_memory=pin_memory, -) -test_loader = torch.utils.data.DataLoader( - test_set, - batch_size=batch_size, - shuffle=False, - drop_last=False, - collate_fn=collate_fn, - num_workers=num_workers, - pin_memory=pin_memory, -) - - -###################################################################### -# Define the Network -# ------------------ -# -# For this tutorial we will use a convolutional neural network to process -# the raw audio data. Usually more advanced transforms are applied to the -# audio data, however CNNs can be used to accurately process the raw data. -# The specific architecture is modeled after the M5 network architecture -# described in `this paper `__. An -# important aspect of models processing raw audio data is the receptive -# field of their first layer’s filters. Our model’s first filter is length -# 80 so when processing audio sampled at 8kHz the receptive field is -# around 10ms (and at 4kHz, around 20 ms). This size is similar to speech -# processing applications that often use receptive fields ranging from -# 20ms to 40ms. -# - - -class M5(nn.Module): - def __init__(self, n_input=1, n_output=35, stride=16, n_channel=32): - super().__init__() - self.conv1 = nn.Conv1d(n_input, n_channel, kernel_size=80, stride=stride) - self.bn1 = nn.BatchNorm1d(n_channel) - self.pool1 = nn.MaxPool1d(4) - self.conv2 = nn.Conv1d(n_channel, n_channel, kernel_size=3) - self.bn2 = nn.BatchNorm1d(n_channel) - self.pool2 = nn.MaxPool1d(4) - self.conv3 = nn.Conv1d(n_channel, 2 * n_channel, kernel_size=3) - self.bn3 = nn.BatchNorm1d(2 * n_channel) - self.pool3 = nn.MaxPool1d(4) - self.conv4 = nn.Conv1d(2 * n_channel, 2 * n_channel, kernel_size=3) - self.bn4 = nn.BatchNorm1d(2 * n_channel) - self.pool4 = nn.MaxPool1d(4) - self.fc1 = nn.Linear(2 * n_channel, n_output) - - def forward(self, x): - x = self.conv1(x) - x = F.relu(self.bn1(x)) - x = self.pool1(x) - x = self.conv2(x) - x = F.relu(self.bn2(x)) - x = self.pool2(x) - x = self.conv3(x) - x = F.relu(self.bn3(x)) - x = self.pool3(x) - x = self.conv4(x) - x = F.relu(self.bn4(x)) - x = self.pool4(x) - x = F.avg_pool1d(x, x.shape[-1]) - x = x.permute(0, 2, 1) - x = self.fc1(x) - return F.log_softmax(x, dim=2) - - -model = M5(n_input=transformed.shape[0], n_output=len(labels)) -model.to(device) -print(model) - - -def count_parameters(model): - return sum(p.numel() for p in model.parameters() if p.requires_grad) - - -n = count_parameters(model) -print("Number of parameters: %s" % n) - - -###################################################################### -# We will use the same optimization technique used in the paper, an Adam -# optimizer with weight decay set to 0.0001. 
At first, we will train with -# a learning rate of 0.01, but we will use a ``scheduler`` to decrease it -# to 0.001 during training after 20 epochs. -# - -optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0001) -scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.1) # reduce the learning after 20 epochs by a factor of 10 - - -###################################################################### -# Training and Testing the Network -# -------------------------------- -# -# Now let’s define a training function that will feed our training data -# into the model and perform the backward pass and optimization steps. For -# training, the loss we will use is the negative log-likelihood. The -# network will then be tested after each epoch to see how the accuracy -# varies during the training. -# - - -def train(model, epoch, log_interval): - model.train() - for batch_idx, (data, target) in enumerate(train_loader): - - data = data.to(device) - target = target.to(device) - - # apply transform and model on whole batch directly on device - data = transform(data) - output = model(data) - - # negative log-likelihood for a tensor of size (batch x 1 x n_output) - loss = F.nll_loss(output.squeeze(), target) - - optimizer.zero_grad() - loss.backward() - optimizer.step() - - # print training stats - if batch_idx % log_interval == 0: - print(f"Train Epoch: {epoch} [{batch_idx * len(data)}/{len(train_loader.dataset)} ({100. * batch_idx / len(train_loader):.0f}%)]\tLoss: {loss.item():.6f}") - - # update progress bar - pbar.update(pbar_update) - # record loss - losses.append(loss.item()) - - -###################################################################### -# Now that we have a training function, we need to make one for testing -# the networks accuracy. We will set the model to ``eval()`` mode and then -# run inference on the test dataset. Calling ``eval()`` sets the training -# variable in all modules in the network to false. Certain layers like -# batch normalization and dropout layers behave differently during -# training so this step is crucial for getting correct results. -# - - -def number_of_correct(pred, target): - # count number of correct predictions - return pred.squeeze().eq(target).sum().item() - - -def get_likely_index(tensor): - # find most likely label index for each element in the batch - return tensor.argmax(dim=-1) - - -def test(model, epoch): - model.eval() - correct = 0 - for data, target in test_loader: - - data = data.to(device) - target = target.to(device) - - # apply transform and model on whole batch directly on device - data = transform(data) - output = model(data) - - pred = get_likely_index(output) - correct += number_of_correct(pred, target) - - # update progress bar - pbar.update(pbar_update) - - print(f"\nTest Epoch: {epoch}\tAccuracy: {correct}/{len(test_loader.dataset)} ({100. * correct / len(test_loader.dataset):.0f}%)\n") - - -###################################################################### -# Finally, we can train and test the network. We will train the network -# for ten epochs then reduce the learn rate and train for ten more epochs. -# The network will be tested after each epoch to see how the accuracy -# varies during the training. -# - -log_interval = 20 -n_epoch = 2 - -pbar_update = 1 / (len(train_loader) + len(test_loader)) -losses = [] - -# The transform needs to live on the same device as the model and the data. 
-transform = transform.to(device) -with tqdm(total=n_epoch) as pbar: - for epoch in range(1, n_epoch + 1): - train(model, epoch, log_interval) - test(model, epoch) - scheduler.step() - -# Let's plot the training loss versus the number of iteration. -# plt.plot(losses); -# plt.title("training loss"); - - -###################################################################### -# The network should be more than 65% accurate on the test set after 2 -# epochs, and 85% after 21 epochs. Let’s look at the last words in the -# train set, and see how the model did on it. -# - - -def predict(tensor): - # Use the model to predict the label of the waveform - tensor = tensor.to(device) - tensor = transform(tensor) - tensor = model(tensor.unsqueeze(0)) - tensor = get_likely_index(tensor) - tensor = index_to_label(tensor.squeeze()) - return tensor - - -waveform, sample_rate, utterance, *_ = train_set[-1] -ipd.Audio(waveform.numpy(), rate=sample_rate) - -print(f"Expected: {utterance}. Predicted: {predict(waveform)}.") - - -###################################################################### -# Let’s find an example that isn’t classified correctly, if there is one. -# - -for i, (waveform, sample_rate, utterance, *_) in enumerate(test_set): - output = predict(waveform) - if output != utterance: - ipd.Audio(waveform.numpy(), rate=sample_rate) - print(f"Data point #{i}. Expected: {utterance}. Predicted: {output}.") - break -else: - print("All examples in this dataset were correctly classified!") - print("In this case, let's just look at the last data point") - ipd.Audio(waveform.numpy(), rate=sample_rate) - print(f"Data point #{i}. Expected: {utterance}. Predicted: {output}.") - - -###################################################################### -# Feel free to try with one of your own recordings of one of the labels! -# For example, using Colab, say “Go” while executing the cell below. This -# will record one second of audio and try to classify it. 
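-#
-# If you are not running in Colab, you can get a similar effect by loading a
-# local 16 kHz recording of one of the labels instead (the file name here is
-# just a placeholder) and classifying it with the ``predict`` function defined
-# above:
-#
-# .. code-block:: python
-#
-#    waveform, sample_rate = torchaudio.load("my_recording.wav")
-#    print(f"Predicted: {predict(waveform)}.")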
-# - - -def record(seconds=1): - - from google.colab import output as colab_output - from base64 import b64decode - from io import BytesIO - from pydub import AudioSegment - - RECORD = ( - b"const sleep = time => new Promise(resolve => setTimeout(resolve, time))\n" - b"const b2text = blob => new Promise(resolve => {\n" - b" const reader = new FileReader()\n" - b" reader.onloadend = e => resolve(e.srcElement.result)\n" - b" reader.readAsDataURL(blob)\n" - b"})\n" - b"var record = time => new Promise(async resolve => {\n" - b" stream = await navigator.mediaDevices.getUserMedia({ audio: true })\n" - b" recorder = new MediaRecorder(stream)\n" - b" chunks = []\n" - b" recorder.ondataavailable = e => chunks.push(e.data)\n" - b" recorder.start()\n" - b" await sleep(time)\n" - b" recorder.onstop = async ()=>{\n" - b" blob = new Blob(chunks)\n" - b" text = await b2text(blob)\n" - b" resolve(text)\n" - b" }\n" - b" recorder.stop()\n" - b"})" - ) - RECORD = RECORD.decode("ascii") - - print(f"Recording started for {seconds} seconds.") - display(ipd.Javascript(RECORD)) - s = colab_output.eval_js("record(%d)" % (seconds * 1000)) - print("Recording ended.") - b = b64decode(s.split(",")[1]) - - fileformat = "wav" - filename = f"_audio.{fileformat}" - AudioSegment.from_file(BytesIO(b)).export(filename, format=fileformat) - return torchaudio.load(filename) - - -# Detect whether notebook runs in google colab -if "google.colab" in sys.modules: - waveform, sample_rate = record() - print(f"Predicted: {predict(waveform)}.") - ipd.Audio(waveform.numpy(), rate=sample_rate) - - -###################################################################### -# Conclusion -# ---------- -# -# In this tutorial, we used torchaudio to load a dataset and resample the -# signal. We have then defined a neural network that we trained to -# recognize a given command. There are also other data preprocessing -# methods, such as finding the mel frequency cepstral coefficients (MFCC), -# that can reduce the size of the dataset. This transform is also -# available in torchaudio as ``torchaudio.transforms.MFCC``. -# diff --git a/intermediate_source/speech_recognition_pipeline_tutorial.py b/intermediate_source/speech_recognition_pipeline_tutorial.py deleted file mode 100644 index 9ffbdcaaf0b..00000000000 --- a/intermediate_source/speech_recognition_pipeline_tutorial.py +++ /dev/null @@ -1,302 +0,0 @@ -""" -Speech Recognition with Wav2Vec2 -================================ - -**Author**: `Moto Hira `__ - -This tutorial shows how to perform speech recognition using -pre-trained models from wav2vec 2.0 -[`paper `__]. - -""" - - -###################################################################### -# Overview -# -------- -# -# The process of speech recognition looks like the following. -# -# 1. Extract the acoustic features from audio waveform -# -# 2. Estimate the class of the acoustic features frame-by-frame -# -# 3. Generate hypothesis from the sequence of the class probabilities -# -# Torchaudio provides easy access to the pre-trained weights and -# associated information, such as the expected sample rate and class -# labels. They are bundled together and available under -# :py:func:`torchaudio.pipelines` module. -# - - -###################################################################### -# Preparation -# ----------- -# -# First we import the necessary packages, and fetch data that we work on. 
-# - -# %matplotlib inline - -import os - -import IPython -import matplotlib -import matplotlib.pyplot as plt -import requests -import torch -import torchaudio - -matplotlib.rcParams["figure.figsize"] = [16.0, 4.8] - -torch.random.manual_seed(0) -device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - -print(torch.__version__) -print(torchaudio.__version__) -print(device) - -SPEECH_URL = "https://pytorch-tutorial-assets.s3.amazonaws.com/VOiCES_devkit/source-16k/train/sp0307/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav" # noqa: E501 -SPEECH_FILE = "_assets/speech.wav" - -if not os.path.exists(SPEECH_FILE): - os.makedirs("_assets", exist_ok=True) - with open(SPEECH_FILE, "wb") as file: - file.write(requests.get(SPEECH_URL).content) - - -###################################################################### -# Creating a pipeline -# ------------------- -# -# First, we will create a Wav2Vec2 model that performs the feature -# extraction and the classification. -# -# There are two types of Wav2Vec2 pre-trained weights available in -# torchaudio. The ones fine-tuned for ASR task, and the ones not -# fine-tuned. -# -# Wav2Vec2 (and HuBERT) models are trained in self-supervised manner. They -# are firstly trained with audio only for representation learning, then -# fine-tuned for a specific task with additional labels. -# -# The pre-trained weights without fine-tuning can be fine-tuned -# for other downstream tasks as well, but this tutorial does not -# cover that. -# -# We will use :py:func:`torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H` here. -# -# There are multiple models available as -# :py:mod:`torchaudio.pipelines`. Please check the documentation for -# the detail of how they are trained. -# -# The bundle object provides the interface to instantiate model and other -# information. Sampling rate and the class labels are found as follow. -# - -bundle = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H - -print("Sample Rate:", bundle.sample_rate) - -print("Labels:", bundle.get_labels()) - - -###################################################################### -# Model can be constructed as following. This process will automatically -# fetch the pre-trained weights and load it into the model. -# - -model = bundle.get_model().to(device) - -print(model.__class__) - - -###################################################################### -# Loading data -# ------------ -# -# We will use the speech data from `VOiCES -# dataset `__, which is licensed under -# Creative Commons BY 4.0. -# - -IPython.display.Audio(SPEECH_FILE) - - -###################################################################### -# To load data, we use :py:func:`torchaudio.load`. -# -# If the sampling rate is different from what the pipeline expects, then -# we can use :py:func:`torchaudio.functional.resample` for resampling. -# -# .. note:: -# -# - :py:func:`torchaudio.functional.resample` works on CUDA tensors as well. -# - When performing resampling multiple times on the same set of sample rates, -# using :py:func:`torchaudio.transforms.Resample` might improve the performace. -# - -waveform, sample_rate = torchaudio.load(SPEECH_FILE) -waveform = waveform.to(device) - -if sample_rate != bundle.sample_rate: - waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate) - - -###################################################################### -# Extracting acoustic features -# ---------------------------- -# -# The next step is to extract acoustic features from the audio. -# -# .. 
note:: -# Wav2Vec2 models fine-tuned for ASR task can perform feature -# extraction and classification with one step, but for the sake of the -# tutorial, we also show how to perform feature extraction here. -# - -with torch.inference_mode(): - features, _ = model.extract_features(waveform) - - -###################################################################### -# The returned features is a list of tensors. Each tensor is the output of -# a transformer layer. -# - -fig, ax = plt.subplots(len(features), 1, figsize=(16, 4.3 * len(features))) -for i, feats in enumerate(features): - ax[i].imshow(feats[0].cpu()) - ax[i].set_title(f"Feature from transformer layer {i+1}") - ax[i].set_xlabel("Feature dimension") - ax[i].set_ylabel("Frame (time-axis)") -plt.tight_layout() -plt.show() - - -###################################################################### -# Feature classification -# ---------------------- -# -# Once the acoustic features are extracted, the next step is to classify -# them into a set of categories. -# -# Wav2Vec2 model provides method to perform the feature extraction and -# classification in one step. -# - -with torch.inference_mode(): - emission, _ = model(waveform) - - -###################################################################### -# The output is in the form of logits. It is not in the form of -# probability. -# -# Let’s visualize this. -# - -plt.imshow(emission[0].cpu().T) -plt.title("Classification result") -plt.xlabel("Frame (time-axis)") -plt.ylabel("Class") -plt.show() -print("Class labels:", bundle.get_labels()) - - -###################################################################### -# We can see that there are strong indications to certain labels across -# the time line. -# - - -###################################################################### -# Generating transcripts -# ---------------------- -# -# From the sequence of label probabilities, now we want to generate -# transcripts. The process to generate hypotheses is often called -# “decoding”. -# -# Decoding is more elaborate than simple classification because -# decoding at certain time step can be affected by surrounding -# observations. -# -# For example, take a word like ``night`` and ``knight``. Even if their -# prior probability distribution are differnt (in typical conversations, -# ``night`` would occur way more often than ``knight``), to accurately -# generate transcripts with ``knight``, such as ``a knight with a sword``, -# the decoding process has to postpone the final decision until it sees -# enough context. -# -# There are many decoding techniques proposed, and they require external -# resources, such as word dictionary and language models. -# -# In this tutorial, for the sake of simplicity, we will perform greedy -# decoding which does not depend on such external components, and simply -# pick up the best hypothesis at each time step. Therefore, the context -# information are not used, and only one transcript can be generated. -# -# We start by defining greedy decoding algorithm. -# - - -class GreedyCTCDecoder(torch.nn.Module): - def __init__(self, labels, blank=0): - super().__init__() - self.labels = labels - self.blank = blank - - def forward(self, emission: torch.Tensor) -> str: - """Given a sequence emission over labels, get the best path string - Args: - emission (Tensor): Logit tensors. Shape `[num_seq, num_label]`. 
- - Returns: - str: The resulting transcript - """ - indices = torch.argmax(emission, dim=-1) # [num_seq,] - indices = torch.unique_consecutive(indices, dim=-1) - indices = [i for i in indices if i != self.blank] - return "".join([self.labels[i] for i in indices]) - - -###################################################################### -# Now create the decoder object and decode the transcript. -# - -decoder = GreedyCTCDecoder(labels=bundle.get_labels()) -transcript = decoder(emission[0]) - - -###################################################################### -# Let’s check the result and listen again to the audio. -# - -print(transcript) -IPython.display.Audio(SPEECH_FILE) - - -###################################################################### -# The ASR model is fine-tuned using a loss function called Connectionist Temporal Classification (CTC). -# The detail of CTC loss is explained -# `here `__. In CTC a blank token (ϵ) is a -# special token which represents a repetition of the previous symbol. In -# decoding, these are simply ignored. -# - - -###################################################################### -# Conclusion -# ---------- -# -# In this tutorial, we looked at how to use :py:mod:`torchaudio.pipelines` to -# perform acoustic feature extraction and speech recognition. Constructing -# a model and getting the emission is as short as two lines. -# -# :: -# -# model = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H.get_model() -# emission = model(waveforms, ...) -# diff --git a/intermediate_source/speech_recognition_pipeline_tutorial.rst b/intermediate_source/speech_recognition_pipeline_tutorial.rst new file mode 100644 index 00000000000..4ec497b3bd8 --- /dev/null +++ b/intermediate_source/speech_recognition_pipeline_tutorial.rst @@ -0,0 +1,10 @@ +Speech Recognition with Wav2Vec2 +================================ + +This tutorial has been moved to https://pytorch.org/audio/stable/tutorials/speech_recognition_pipeline_tutorial.html + +It will redirect in 3 seconds. + +.. raw:: html + + diff --git a/intermediate_source/text_to_speech_with_torchaudio.py b/intermediate_source/text_to_speech_with_torchaudio.py deleted file mode 100644 index ade495af6a8..00000000000 --- a/intermediate_source/text_to_speech_with_torchaudio.py +++ /dev/null @@ -1,326 +0,0 @@ -""" -Text-to-speech with torchaudio -============================== - -**Author**: `Yao-Yuan Yang `__, `Moto -Hira `__ - -""" - -###################################################################### -# Overview -# -------- -# -# This tutorial shows how to build text-to-speech pipeline, using the -# pretrained Tacotron2 in torchaudio. -# -# The text-to-speech pipeline goes as follows: 1. Text preprocessing -# -# First, the input text is encoded into a list of symbols. In this -# tutorial, we will use English characters and phonemes as the symbols. -# -# 2. Spectrogram generation -# -# From the encoded text, a spectrogram is generated. We use ``Tacotron2`` -# model for this. -# -# 3. Time-domain conversion -# -# The last step is converting the spectrogram into the waveform. The -# process to generate speech from spectrogram is also called Vocoder. In -# this tutorial, three different vocoders are used, -# ```WaveRNN`` `__, -# ```Griffin-Lim`` `__, -# and -# ```Nvidia's WaveGlow`` `__. -# -# The following figure illustrates the whole process. -# -# .. 
image:: https://download.pytorch.org/torchaudio/tutorial-assets/tacotron2_tts_pipeline.png -# - - -###################################################################### -# Preparation -# ----------- -# -# First, we install the necessary dependencies. In addition to -# ``torchaudio``, ``DeepPhonemizer`` is required to perform phoneme-based -# encoding. -# - -# %% -# .. code-block:: bash -# -# %%bash -# pip3 install deep_phonemizer - -import torch -import torchaudio -import matplotlib.pyplot as plt - -import IPython - -print(torch.__version__) -print(torchaudio.__version__) - -torch.random.manual_seed(0) -device = "cuda" if torch.cuda.is_available() else "cpu" - - - -###################################################################### -# Text Processing -# --------------- -# - - -###################################################################### -# Character-based encoding -# ~~~~~~~~~~~~~~~~~~~~~~~~ -# -# In this section, we will go through how the character-based encoding -# works. -# -# Since the pre-trained Tacotron2 model expects specific set of symbol -# tables, the same functionalities available in ``torchaudio``. This -# section is more for the explanation of the basis of encoding. -# -# Firstly, we define the set of symbols. For example, we can use -# ``'_-!\'(),.:;? abcdefghijklmnopqrstuvwxyz'``. Then, we will map the -# each character of the input text into the index of the corresponding -# symbol in the table. -# -# The following is an example of such processing. In the example, symbols -# that are not in the table are ignored. -# - -symbols = '_-!\'(),.:;? abcdefghijklmnopqrstuvwxyz' -look_up = {s: i for i, s in enumerate(symbols)} -symbols = set(symbols) - -def text_to_sequence(text): - text = text.lower() - return [look_up[s] for s in text if s in symbols] - -text = "Hello world! Text to speech!" -print(text_to_sequence(text)) - - -###################################################################### -# As mentioned in the above, the symbol table and indices must match -# what the pretrained Tacotron2 model expects. ``torchaudio`` provides the -# transform along with the pretrained model. For example, you can -# instantiate and use such transform as follow. -# - -processor = torchaudio.pipelines.TACOTRON2_WAVERNN_CHAR_LJSPEECH.get_text_processor() - -text = "Hello world! Text to speech!" -processed, lengths = processor(text) - -print(processed) -print(lengths) - - -###################################################################### -# The ``processor`` object takes either a text or list of texts as inputs. -# When a list of texts are provided, the returned ``lengths`` variable -# represents the valid length of each processed tokens in the output -# batch. -# -# The intermediate representation can be retrieved as follow. -# - -print([processor.tokens[i] for i in processed[0, :lengths[0]]]) - - -###################################################################### -# Phoneme-based encoding -# ~~~~~~~~~~~~~~~~~~~~~~ -# -# Phoneme-based encoding is similar to character-based encoding, but it -# uses a symbol table based on phonemes and a G2P (Grapheme-to-Phoneme) -# model. -# -# The detail of the G2P model is out of scope of this tutorial, we will -# just look at what the conversion looks like. -# -# Similar to the case of character-based encoding, the encoding process is -# expected to match what a pretrained Tacotron2 model is trained on. -# ``torchaudio`` has an interface to create the process. -# -# The following code illustrates how to make and use the process. 
Behind
-# the scenes, a G2P model is created using the ``DeepPhonemizer`` package, and
-# the pretrained weights published by the author of ``DeepPhonemizer`` are
-# fetched.
-#
-
-bundle = torchaudio.pipelines.TACOTRON2_WAVERNN_PHONE_LJSPEECH
-
-processor = bundle.get_text_processor()
-
-text = "Hello world! Text to speech!"
-with torch.inference_mode():
-    processed, lengths = processor(text)
-
-print(processed)
-print(lengths)
-
-
-######################################################################
-# Notice that the encoded values are different from the example of
-# character-based encoding.
-#
-# The intermediate representation looks like the following.
-#
-
-print([processor.tokens[i] for i in processed[0, :lengths[0]]])
-
-
-######################################################################
-# Spectrogram Generation
-# ----------------------
-#
-# ``Tacotron2`` is the model we use to generate a spectrogram from the
-# encoded text. For the details of the model, please refer to `the
-# paper `__.
-#
-# It is easy to instantiate a Tacotron2 model with pretrained weights;
-# however, note that the input to Tacotron2 models must be processed by the
-# matching text processor.
-#
-# ``torchaudio`` bundles the matching models and processors together so
-# that it is easy to create the pipeline.
-#
-# (For the available bundles and their usage, please refer to `the
-# documentation `__.)
-#
-
-bundle = torchaudio.pipelines.TACOTRON2_WAVERNN_PHONE_LJSPEECH
-processor = bundle.get_text_processor()
-tacotron2 = bundle.get_tacotron2().to(device)
-
-text = "Hello world! Text to speech!"
-
-with torch.inference_mode():
-    processed, lengths = processor(text)
-    processed = processed.to(device)
-    lengths = lengths.to(device)
-    spec, _, _ = tacotron2.infer(processed, lengths)
-
-
-plt.imshow(spec[0].cpu().detach())
-
-
-######################################################################
-# Note that the ``Tacotron2.infer`` method performs multinomial sampling,
-# and therefore the process of generating the spectrogram incurs randomness.
-#
-
-for _ in range(3):
-    with torch.inference_mode():
-        spec, spec_lengths, _ = tacotron2.infer(processed, lengths)
-    plt.imshow(spec[0].cpu().detach())
-    plt.show()
-
-
-######################################################################
-# Waveform Generation
-# -------------------
-#
-# Once the spectrogram is generated, the last step is to recover the
-# waveform from the spectrogram.
-#
-# ``torchaudio`` provides vocoders based on ``GriffinLim`` and
-# ``WaveRNN``.
-#
-
-
-######################################################################
-# WaveRNN
-# ~~~~~~~
-#
-# Continuing from the previous section, we can instantiate the matching
-# WaveRNN model from the same bundle.
-#
-
-bundle = torchaudio.pipelines.TACOTRON2_WAVERNN_PHONE_LJSPEECH
-
-processor = bundle.get_text_processor()
-tacotron2 = bundle.get_tacotron2().to(device)
-vocoder = bundle.get_vocoder().to(device)
-
-text = "Hello world! Text to speech!"
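-
-# Run the full chain under inference mode:
-# text -> tokens (processor) -> spectrogram (tacotron2.infer) -> waveform (vocoder).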
- -with torch.inference_mode(): - processed, lengths = processor(text) - processed = processed.to(device) - lengths = lengths.to(device) - spec, spec_lengths, _ = tacotron2.infer(processed, lengths) - waveforms, lengths = vocoder(spec, spec_lengths) - -torchaudio.save("output_wavernn.wav", waveforms[0:1].cpu(), sample_rate=vocoder.sample_rate) -IPython.display.display(IPython.display.Audio("output_wavernn.wav")) - - -###################################################################### -# Griffin-Lim -# ~~~~~~~~~~~ -# -# Using the Griffin-Lim vocoder is same as WaveRNN. You can instantiate -# the vocode object with ``get_vocoder`` method and pass the spectrogram. -# - -bundle = torchaudio.pipelines.TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH - -processor = bundle.get_text_processor() -tacotron2 = bundle.get_tacotron2().to(device) -vocoder = bundle.get_vocoder().to(device) - -with torch.inference_mode(): - processed, lengths = processor(text) - processed = processed.to(device) - lengths = lengths.to(device) - spec, spec_lengths, _ = tacotron2.infer(processed, lengths) -waveforms, lengths = vocoder(spec, spec_lengths) - -torchaudio.save("output_griffinlim.wav", waveforms[0:1].cpu(), sample_rate=vocoder.sample_rate) -IPython.display.display(IPython.display.Audio("output_griffinlim.wav")) - - -###################################################################### -# Waveglow -# ~~~~~~~~ -# -# Waveglow is a vocoder published by Nvidia. The pretrained weights are -# published on Torch Hub. One can instantiate the model using ``torch.hub`` -# module. -# -if torch.cuda.is_available(): - waveglow = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_waveglow', model_math='fp32') -else: - # Workaround to load model mapped on GPU - # https://stackoverflow.com/a/61840832 - waveglow = torch.hub.load( - "NVIDIA/DeepLearningExamples:torchhub", - "nvidia_waveglow", - model_math="fp32", - pretrained=False, - ) - checkpoint = torch.hub.load_state_dict_from_url( - "https://api.ngc.nvidia.com/v2/models/nvidia/waveglow_ckpt_fp32/versions/19.09.0/files/nvidia_waveglowpyt_fp32_20190427", - progress=False, - map_location=device, - ) - state_dict = {key.replace("module.", ""): value for key, value in checkpoint["state_dict"].items()} - -waveglow = waveglow.remove_weightnorm(waveglow) -waveglow = waveglow.to(device) -waveglow.eval() - -with torch.no_grad(): - waveforms = waveglow.infer(spec) - -torchaudio.save("output_waveglow.wav", waveforms[0:1].cpu(), sample_rate=22050) -IPython.display.display(IPython.display.Audio("output_waveglow.wav")) diff --git a/intermediate_source/text_to_speech_with_torchaudio.rst b/intermediate_source/text_to_speech_with_torchaudio.rst new file mode 100644 index 00000000000..bbb6d7f272d --- /dev/null +++ b/intermediate_source/text_to_speech_with_torchaudio.rst @@ -0,0 +1,10 @@ +Text-to-speech with Tacotron2 +============================= + +This tutorial has been moved to https://pytorch.org/audio/stable/tutorials/tacotron2_pipeline_tutorial.html + +It will redirect in 3 seconds. + +.. raw:: html + +