|
2 | 2 | """
|
3 | 3 | Audio Feature Augmentation
|
4 | 4 | ==========================
|
5 |
| -""" |
6 |
| - |
7 |
| -# When running this tutorial in Google Colab, install the required packages |
8 |
| -# with the following. |
9 |
| -# !pip install torchaudio librosa |
10 |
| - |
11 |
| -import torch |
12 |
| -import torchaudio |
13 |
| -import torchaudio.transforms as T |
14 |
| - |
15 |
| -print(torch.__version__) |
16 |
| -print(torchaudio.__version__) |
17 |
| - |
18 |
| -###################################################################### |
19 |
| -# Preparing data and utility functions (skip this section) |
20 |
| -# -------------------------------------------------------- |
21 |
| -# |
22 |
| - |
23 |
| -# @title Prepare data and utility functions. {display-mode: "form"} |
24 |
| -# @markdown |
25 |
| -# @markdown You do not need to look into this cell. |
26 |
| -# @markdown Just execute once and you are good to go. |
27 |
| -# @markdown |
28 |
| -# @markdown In this tutorial, we will use speech data from the [VOiCES dataset](https://iqtlabs.github.io/voices/), |
29 |
| -# @markdown which is licensed under Creative Commons BY 4.0. |
30 |
| - |
31 |
| -# ------------------------------------------------------------------------------- |
32 |
| -# Preparation of data and helper functions. |
33 |
| -# ------------------------------------------------------------------------------- |
34 |
| - |
35 |
| -import os |
36 |
| - |
37 |
| -import librosa |
38 |
| -import matplotlib.pyplot as plt |
39 |
| -import requests |
40 |
| - |
41 |
| - |
42 |
| -_SAMPLE_DIR = "_assets" |
43 |
| - |
44 |
| -SAMPLE_WAV_SPEECH_URL = "https://pytorch-tutorial-assets.s3.amazonaws.com/VOiCES_devkit/source-16k/train/sp0307/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav" # noqa: E501 |
45 |
| -SAMPLE_WAV_SPEECH_PATH = os.path.join(_SAMPLE_DIR, "speech.wav") |
46 |
| - |
47 |
| -os.makedirs(_SAMPLE_DIR, exist_ok=True) |
48 |
| - |
49 |
| - |
50 |
| -def _fetch_data(): |
51 |
| - uri = [ |
52 |
| - (SAMPLE_WAV_SPEECH_URL, SAMPLE_WAV_SPEECH_PATH), |
53 |
| - ] |
54 |
| - for url, path in uri: |
55 |
| - with open(path, "wb") as file_: |
56 |
| - file_.write(requests.get(url).content) |
57 |
| - |
58 |
| - |
59 |
| -_fetch_data() |
60 |
| - |
61 |
| - |
62 |
| -def _get_sample(path, resample=None): |
63 |
| - effects = [["remix", "1"]] |
64 |
| - if resample: |
65 |
| - effects.extend( |
66 |
| - [ |
67 |
| - ["lowpass", f"{resample // 2}"], |
68 |
| - ["rate", f"{resample}"], |
69 |
| - ] |
70 |
| - ) |
71 |
| - return torchaudio.sox_effects.apply_effects_file(path, effects=effects) |
72 |
| - |
73 |
| - |
74 |
| -def get_speech_sample(*, resample=None): |
75 |
| - return _get_sample(SAMPLE_WAV_SPEECH_PATH, resample=resample) |
76 |
| - |
77 | 5 |
|
78 |
| -def get_spectrogram( |
79 |
| - n_fft=400, |
80 |
| - win_len=None, |
81 |
| - hop_len=None, |
82 |
| - power=2.0, |
83 |
| -): |
84 |
| - waveform, _ = get_speech_sample() |
85 |
| - spectrogram = T.Spectrogram( |
86 |
| - n_fft=n_fft, |
87 |
| - win_length=win_len, |
88 |
| - hop_length=hop_len, |
89 |
| - center=True, |
90 |
| - pad_mode="reflect", |
91 |
| - power=power, |
92 |
| - ) |
93 |
| - return spectrogram(waveform) |
| 6 | +This tutorial has been moved to https://pytorch.org/audio/stable/tutorials/audio_data_augmentation_tutorial.html |
94 | 7 |
|
95 |
| - |
96 |
| -def plot_spectrogram(spec, title=None, ylabel="freq_bin", aspect="auto", xmax=None): |
97 |
| - fig, axs = plt.subplots(1, 1) |
98 |
| - axs.set_title(title or "Spectrogram (db)") |
99 |
| - axs.set_ylabel(ylabel) |
100 |
| - axs.set_xlabel("frame") |
101 |
| - im = axs.imshow(librosa.power_to_db(spec), origin="lower", aspect=aspect) |
102 |
| - if xmax: |
103 |
| - axs.set_xlim((0, xmax)) |
104 |
| - fig.colorbar(im, ax=axs) |
105 |
| - plt.show(block=False) |
106 |
| - |
107 |
| - |
108 |
| -###################################################################### |
109 |
| -# SpecAugment |
110 |
| -# ----------- |
111 |
| -# |
112 |
| -# `SpecAugment <https://ai.googleblog.com/2019/04/specaugment-new-data-augmentation.html>`__ |
113 |
| -# is a popular spectrogram augmentation technique. |
114 |
| -# |
115 |
| -# ``torchaudio`` implements :py:func:`torchaudio.transforms.TimeStretch`, |
116 |
| -# :py:func:`torchaudio.transforms.TimeMasking` and |
117 |
| -# :py:func:`torchaudio.transforms.FrequencyMasking`. |
118 |
| -# |
119 |
| - |
120 |
| -###################################################################### |
121 |
| -# TimeStretch |
122 |
| -# ----------- |
123 |
| -# |
124 |
| - |
125 |
| - |
126 |
| -spec = get_spectrogram(power=None) |
127 |
| -stretch = T.TimeStretch() |
128 |
| - |
129 |
| -rate = 1.2 |
130 |
| -spec_ = stretch(spec, rate) |
131 |
| -plot_spectrogram(torch.abs(spec_[0]), title=f"Stretched x{rate}", aspect="equal", xmax=304) |
132 |
| - |
133 |
| -plot_spectrogram(torch.abs(spec[0]), title="Original", aspect="equal", xmax=304) |
134 |
| - |
135 |
| -rate = 0.9 |
136 |
| -spec_ = stretch(spec, rate) |
137 |
| -plot_spectrogram(torch.abs(spec_[0]), title=f"Stretched x{rate}", aspect="equal", xmax=304) |
138 |
| - |
139 |
| -###################################################################### |
140 |
| -# TimeMasking |
141 |
| -# ----------- |
142 |
| -# |
143 |
| - |
144 |
| -torch.random.manual_seed(4) |
145 |
| - |
146 |
| -spec = get_spectrogram() |
147 |
| -plot_spectrogram(spec[0], title="Original") |
148 |
| - |
149 |
| -masking = T.TimeMasking(time_mask_param=80) |
150 |
| -spec = masking(spec) |
151 |
| - |
152 |
| -plot_spectrogram(spec[0], title="Masked along time axis") |
153 |
| - |
154 |
| -###################################################################### |
155 |
| -# FrequencyMasking |
156 |
| -# ---------------- |
157 |
| -# |
158 |
| - |
159 |
| - |
160 |
| -torch.random.manual_seed(4) |
161 |
| - |
162 |
| -spec = get_spectrogram() |
163 |
| -plot_spectrogram(spec[0], title="Original") |
164 |
| - |
165 |
| -masking = T.FrequencyMasking(freq_mask_param=80) |
166 |
| -spec = masking(spec) |
167 |
| - |
168 |
| -plot_spectrogram(spec[0], title="Masked along frequency axis") |
| 8 | +It will redirect in 3 seconds. |
| 9 | +.. raw:: html
| 10 | + <meta http-equiv="Refresh" content="3; url='https://pytorch.org/audio/stable/tutorials/audio_data_augmentation_tutorial.html'" /> |
| 11 | +""" |
0 commit comments