Text-to-speech with torchaudio
==============================

This tutorial has been moved to https://pytorch.org/audio/stable/tutorials/tacotron2_pipeline_tutorial.html
It will redirect in 3 seconds.

.. raw:: html

   <meta http-equiv="Refresh" content="3; url='https://pytorch.org/audio/stable/tutorials/tacotron2_pipeline_tutorial.html'" />

**Author**: `Yao-Yuan Yang <https://github.com/yangarbiter>`__, `Moto
Hira <moto@fb.com>`__

"""

######################################################################
# Overview
# --------
#
# This tutorial shows how to build a text-to-speech pipeline, using the
# pretrained Tacotron2 in torchaudio.
#
# The text-to-speech pipeline goes as follows:
#
# 1. Text preprocessing
#
#    First, the input text is encoded into a list of symbols. In this
#    tutorial, we will use English characters and phonemes as the symbols.
#
# 2. Spectrogram generation
#
#    From the encoded text, a spectrogram is generated. We use the
#    ``Tacotron2`` model for this.
#
# 3. Time-domain conversion
#
#    The last step is converting the spectrogram into the waveform. The
#    model that generates speech from the spectrogram is called a vocoder.
#    In this tutorial, three different vocoders are used:
#    `WaveRNN <https://pytorch.org/audio/stable/models/wavernn.html>`__,
#    `Griffin-Lim <https://pytorch.org/audio/stable/transforms.html#griffinlim>`__, and
#    `Nvidia's WaveGlow <https://pytorch.org/hub/nvidia_deeplearningexamples_tacotron2/>`__.
#
# The following figure illustrates the whole process.
#
# .. image:: https://download.pytorch.org/torchaudio/tutorial-assets/tacotron2_tts_pipeline.png
#
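
######################################################################
# Before diving into each stage, the following snippet sketches the whole
# pipeline end to end using the bundle API that the rest of this tutorial
# explains step by step. It is only a bird's-eye-view sketch: it assumes
# the phoneme-based WaveRNN bundle used later on, and that the
# dependencies installed in the next section are available.
#
# .. code-block:: python
#
#    import torch
#    import torchaudio
#
#    bundle = torchaudio.pipelines.TACOTRON2_WAVERNN_PHONE_LJSPEECH
#    device = "cuda" if torch.cuda.is_available() else "cpu"
#
#    processor = bundle.get_text_processor()        # 1. text -> symbol IDs
#    tacotron2 = bundle.get_tacotron2().to(device)  # 2. symbol IDs -> spectrogram
#    vocoder = bundle.get_vocoder().to(device)      # 3. spectrogram -> waveform
#
#    with torch.inference_mode():
#        tokens, token_lengths = processor("Hello world!")
#        spec, spec_lengths, _ = tacotron2.infer(tokens.to(device), token_lengths.to(device))
#        waveforms, _ = vocoder(spec, spec_lengths)
#
#    torchaudio.save("hello.wav", waveforms[0:1].cpu(), sample_rate=vocoder.sample_rate)
#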

######################################################################
# Preparation
# -----------
#
# First, we install the necessary dependencies. In addition to
# ``torchaudio``, ``DeepPhonemizer`` is required to perform phoneme-based
# encoding.
#

# %%
# .. code-block:: bash
#
#    %%bash
#    pip3 install deep_phonemizer

import torch
import torchaudio
import matplotlib.pyplot as plt

import IPython

print(torch.__version__)
print(torchaudio.__version__)

torch.random.manual_seed(0)
device = "cuda" if torch.cuda.is_available() else "cpu"


######################################################################
# Text Processing
# ---------------
#

######################################################################
# Character-based encoding
# ~~~~~~~~~~~~~~~~~~~~~~~~
#
# In this section, we will go through how the character-based encoding
# works.
#
# Since the pre-trained Tacotron2 model expects a specific set of symbols,
# the same functionality is available in ``torchaudio``. This section is
# more of an explanation of the basics of the encoding.
#
# First, we define the set of symbols. For example, we can use
# ``'_-!\'(),.:;? abcdefghijklmnopqrstuvwxyz'``. Then, we map each
# character of the input text to the index of the corresponding symbol
# in the table.
#
# The following is an example of such processing. In the example, symbols
# that are not in the table are ignored.
#

symbols = '_-!\'(),.:;? abcdefghijklmnopqrstuvwxyz'
look_up = {s: i for i, s in enumerate(symbols)}
symbols = set(symbols)


def text_to_sequence(text):
    text = text.lower()
    return [look_up[s] for s in text if s in symbols]


text = "Hello world! Text to speech!"
print(text_to_sequence(text))
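
######################################################################
# To see the "ignored symbols" behavior in action, the sketch below
# encodes a string that contains characters outside the table (the
# accented letter, the digits, and the slash are simply dropped). The
# sample string is an arbitrary example.
#

sample = "Café 24/7!"
sequence = text_to_sequence(sample)
index_to_symbol = {index: symbol for symbol, index in look_up.items()}

print(sequence)
print("".join(index_to_symbol[i] for i in sequence))  # prints "caf !"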


######################################################################
# As mentioned above, the symbol table and indices must match what the
# pretrained Tacotron2 model expects. ``torchaudio`` provides the
# transform along with the pretrained model. For example, you can
# instantiate and use such a transform as follows.
#

processor = torchaudio.pipelines.TACOTRON2_WAVERNN_CHAR_LJSPEECH.get_text_processor()

text = "Hello world! Text to speech!"
processed, lengths = processor(text)

print(processed)
print(lengths)


######################################################################
# The ``processor`` object takes either a text or a list of texts as input.
# When a list of texts is provided, the returned ``lengths`` variable
# represents the valid length of each processed token sequence in the
# output batch.
#
# The intermediate representation can be retrieved as follows.
#

print([processor.tokens[i] for i in processed[0, :lengths[0]]])
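
######################################################################
# As noted above, the processor also accepts a list of texts. The
# following is a minimal sketch of batched processing; the two example
# sentences are arbitrary and everything else reuses the ``processor``
# instantiated above. Rows of the output are padded to the longest
# sequence, and ``lengths`` reports the valid length of each row.
#

texts = ["Hello world!", "Text to speech!"]
processed_batch, lengths_batch = processor(texts)

print(processed_batch.shape)
print(lengths_batch)
for row, num in zip(processed_batch, lengths_batch):
    # Decode only the valid (non-padded) part of each row.
    print([processor.tokens[i] for i in row[:num]])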

######################################################################
# Phoneme-based encoding
# ~~~~~~~~~~~~~~~~~~~~~~
#
# Phoneme-based encoding is similar to character-based encoding, but it
# uses a symbol table based on phonemes and a G2P (Grapheme-to-Phoneme)
# model.
#
# The details of the G2P model are out of the scope of this tutorial; we
# will just look at what the conversion looks like.
#
# As in the case of character-based encoding, the encoding process is
# expected to match what the pretrained Tacotron2 model was trained on.
# ``torchaudio`` has an interface to create the processor.
#
# The following code illustrates how to make and use the processor. Behind
# the scenes, a G2P model is created using the ``DeepPhonemizer`` package,
# and the pretrained weights published by the author of ``DeepPhonemizer``
# are fetched.
#

bundle = torchaudio.pipelines.TACOTRON2_WAVERNN_PHONE_LJSPEECH

processor = bundle.get_text_processor()

text = "Hello world! Text to speech!"
with torch.inference_mode():
    processed, lengths = processor(text)

print(processed)
print(lengths)


######################################################################
# Notice that the encoded values are different from the example of
# character-based encoding.
#
# The intermediate representation looks like the following.
#

print([processor.tokens[i] for i in processed[0, :lengths[0]]])


######################################################################
# Spectrogram Generation
# ----------------------
#
# ``Tacotron2`` is the model we use to generate a spectrogram from the
# encoded text. For the details of the model, please refer to `the
# paper <https://arxiv.org/abs/1712.05884>`__.
#
# It is easy to instantiate a Tacotron2 model with pretrained weights;
# however, note that the input to the Tacotron2 model has to be processed
# by the matching text processor.
#
# ``torchaudio`` bundles the matching models and processors together so
# that it is easy to create the pipeline.
#
# (For the available bundles and their usage, please refer to `the
# documentation <https://pytorch.org/audio/stable/pipelines.html#tacotron2-text-to-speech>`__.)
#

bundle = torchaudio.pipelines.TACOTRON2_WAVERNN_PHONE_LJSPEECH
processor = bundle.get_text_processor()
tacotron2 = bundle.get_tacotron2().to(device)

text = "Hello world! Text to speech!"

with torch.inference_mode():
    processed, lengths = processor(text)
    processed = processed.to(device)
    lengths = lengths.to(device)
    spec, _, _ = tacotron2.infer(processed, lengths)


plt.imshow(spec[0].cpu().detach())


######################################################################
# Note that the ``Tacotron2.infer`` method performs multinomial sampling;
# therefore, the process of generating the spectrogram incurs randomness.
#

for _ in range(3):
    with torch.inference_mode():
        spec, spec_lengths, _ = tacotron2.infer(processed, lengths)
    plt.imshow(spec[0].cpu().detach())
    plt.show()
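
######################################################################
# If reproducible spectrograms are needed, one option is to fix the random
# seed right before calling ``infer``. The following is a minimal sketch
# (any fixed seed value works); on the same device, re-seeding should make
# repeated calls return the same spectrogram.
#

with torch.inference_mode():
    torch.random.manual_seed(0)
    spec_a, _, _ = tacotron2.infer(processed, lengths)
    torch.random.manual_seed(0)
    spec_b, _, _ = tacotron2.infer(processed, lengths)

# Expected to print True when both calls run on the same device.
print(torch.equal(spec_a, spec_b))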


######################################################################
# Waveform Generation
# -------------------
#
# Once the spectrogram is generated, the last process is to recover the
# waveform from the spectrogram.
#
# ``torchaudio`` provides vocoders based on ``GriffinLim`` and
# ``WaveRNN``.
#

######################################################################
# WaveRNN
# ~~~~~~~
#
# Continuing from the previous section, we can instantiate the matching
# WaveRNN model from the same bundle.
#

bundle = torchaudio.pipelines.TACOTRON2_WAVERNN_PHONE_LJSPEECH

processor = bundle.get_text_processor()
tacotron2 = bundle.get_tacotron2().to(device)
vocoder = bundle.get_vocoder().to(device)

text = "Hello world! Text to speech!"

with torch.inference_mode():
    processed, lengths = processor(text)
    processed = processed.to(device)
    lengths = lengths.to(device)
    spec, spec_lengths, _ = tacotron2.infer(processed, lengths)
    waveforms, lengths = vocoder(spec, spec_lengths)

torchaudio.save("output_wavernn.wav", waveforms[0:1].cpu(), sample_rate=vocoder.sample_rate)
IPython.display.display(IPython.display.Audio("output_wavernn.wav"))
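
######################################################################
# Besides listening to it, the generated waveform can be inspected
# visually. This is an optional sketch that reuses the ``matplotlib``
# import from the Preparation section and the ``waveforms`` tensor
# produced above.
#

plt.figure()
plt.plot(waveforms[0].cpu().detach())
plt.title("WaveRNN output waveform")
plt.show()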


######################################################################
# Griffin-Lim
# ~~~~~~~~~~~
#
# Using the Griffin-Lim vocoder is the same as using WaveRNN. You can
# instantiate the vocoder object with the ``get_vocoder`` method and pass
# the spectrogram.
#

bundle = torchaudio.pipelines.TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH

processor = bundle.get_text_processor()
tacotron2 = bundle.get_tacotron2().to(device)
vocoder = bundle.get_vocoder().to(device)

with torch.inference_mode():
    processed, lengths = processor(text)
    processed = processed.to(device)
    lengths = lengths.to(device)
    spec, spec_lengths, _ = tacotron2.infer(processed, lengths)
waveforms, lengths = vocoder(spec, spec_lengths)

torchaudio.save("output_griffinlim.wav", waveforms[0:1].cpu(), sample_rate=vocoder.sample_rate)
IPython.display.display(IPython.display.Audio("output_griffinlim.wav"))


######################################################################
# WaveGlow
# ~~~~~~~~
#
# WaveGlow is a vocoder published by Nvidia. The pretrained weights are
# published on Torch Hub. One can instantiate the model using the
# ``torch.hub`` module.
#

if torch.cuda.is_available():
    waveglow = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_waveglow', model_math='fp32')
else:
    # Workaround to load model mapped on GPU
    # https://stackoverflow.com/a/61840832
    waveglow = torch.hub.load(
        "NVIDIA/DeepLearningExamples:torchhub",
        "nvidia_waveglow",
        model_math="fp32",
        pretrained=False,
    )
    checkpoint = torch.hub.load_state_dict_from_url(
        "https://api.ngc.nvidia.com/v2/models/nvidia/waveglow_ckpt_fp32/versions/19.09.0/files/nvidia_waveglowpyt_fp32_20190427",
        progress=False,
        map_location=device,
    )
    state_dict = {key.replace("module.", ""): value for key, value in checkpoint["state_dict"].items()}
    # Load the re-mapped weights into the model.
    waveglow.load_state_dict(state_dict)

waveglow = waveglow.remove_weightnorm(waveglow)
waveglow = waveglow.to(device)
waveglow.eval()

with torch.no_grad():
    waveforms = waveglow.infer(spec)

torchaudio.save("output_waveglow.wav", waveforms[0:1].cpu(), sample_rate=22050)
IPython.display.display(IPython.display.Audio("output_waveglow.wav"))