음성 처리(TTS) - Nvidia Tacotron2(pytorch)
Tacotron2(pytorch)
In [None]:
%%bash
pip install librosa scipy unidecode inflect
In [None]:
%%bash
# Refresh the apt package index, then install the libsndfile1 system package
# without prompting (-y). Presumably required by the audio packages installed
# in the previous cell — TODO confirm.
apt-get update
apt-get install -y libsndfile1
In [3]:
import torch
# Load the `nvidia_tacotron2` entry point from NVIDIA's torch.hub repository,
# requesting fp16 math, and place the resulting module on the GPU.
tacotron2 = torch.hub.load(
    "NVIDIA/DeepLearningExamples:torchhub",
    "nvidia_tacotron2",
    model_math="fp16",
).to("cuda")

# Switch to evaluation mode. eval() is the cell's last expression, so the
# notebook displays the module structure as the cell output.
tacotron2.eval()
Out [3]:
Tacotron2(
(embedding): Embedding(148, 512)
(encoder): Encoder(
(convolutions): ModuleList(
(0): Sequential(
(0): ConvNorm(
(conv): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,))
)
(1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
(1): Sequential(
(0): ConvNorm(
(conv): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,))
)
(1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
(2): Sequential(
(0): ConvNorm(
(conv): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,))
)
(1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(lstm): LSTM(512, 256, batch_first=True, bidirectional=True)
)
(decoder): Decoder(
(prenet): Prenet(
(layers): ModuleList(
(0): LinearNorm(
(linear_layer): Linear(in_features=80, out_features=256, bias=False)
)
(1): LinearNorm(
(linear_layer): Linear(in_features=256, out_features=256, bias=False)
)
)
)
(attention_rnn): LSTMCell(768, 1024)
(attention_layer): Attention(
(query_layer): LinearNorm(
(linear_layer): Linear(in_features=1024, out_features=128, bias=False)
)
(memory_layer): LinearNorm(
(linear_layer): Linear(in_features=512, out_features=128, bias=False)
)
(v): LinearNorm(
(linear_layer): Linear(in_features=128, out_features=1, bias=False)
)
(location_layer): LocationLayer(
(location_conv): ConvNorm(
(conv): Conv1d(2, 32, kernel_size=(31,), stride=(1,), padding=(15,), bias=False)
)
(location_dense): LinearNorm(
(linear_layer): Linear(in_features=32, out_features=128, bias=False)
)
)
)
(decoder_rnn): LSTMCell(1536, 1024, bias=1)
(linear_projection): LinearNorm(
(linear_layer): Linear(in_features=1536, out_features=80, bias=True)
)
(gate_layer): LinearNorm(
(linear_layer): Linear(in_features=1536, out_features=1, bias=True)
)
)
(postnet): Postnet(
(convolutions): ModuleList(
(0): Sequential(
(0): ConvNorm(
(conv): Conv1d(80, 512, kernel_size=(5,), stride=(1,), padding=(2,))
)
(1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
(1): Sequential(
(0): ConvNorm(
(conv): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,))
)
(1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
(2): Sequential(
(0): ConvNorm(
(conv): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,))
)
(1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
(3): Sequential(
(0): ConvNorm(
(conv): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,))
)
(1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
(4): Sequential(
(0): ConvNorm(
(conv): Conv1d(512, 80, kernel_size=(5,), stride=(1,), padding=(2,))
)
(1): BatchNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
)
)
In [4]:
# Load the `nvidia_waveglow` entry point from the same torch.hub repository,
# again requesting fp16 math.
waveglow = torch.hub.load(
    "NVIDIA/DeepLearningExamples:torchhub",
    "nvidia_waveglow",
    model_math="fp16",
)

# Strip weight normalization via the model's own helper, then move it to the GPU.
waveglow = waveglow.remove_weightnorm(waveglow).to("cuda")

# Evaluation mode; as the last expression this also displays the module structure.
waveglow.eval()
Out [4]:
WaveGlow(
(upsample): ConvTranspose1d(80, 80, kernel_size=(1024,), stride=(256,))
(WN): ModuleList(
(0): WN(
(in_layers): ModuleList(
(0): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
(1): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
(2): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(4,), dilation=(4,))
(3): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(8,), dilation=(8,))
(4): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(16,), dilation=(16,))
(5): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(32,), dilation=(32,))
(6): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(64,), dilation=(64,))
(7): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(128,), dilation=(128,))
)
(res_skip_layers): ModuleList(
(0): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(1): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(2): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(3): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(4): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(5): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(6): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(7): Conv1d(512, 512, kernel_size=(1,), stride=(1,))
)
(cond_layers): ModuleList(
(0): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(1): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(2): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(3): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(4): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(5): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(6): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(7): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
)
(start): Conv1d(4, 512, kernel_size=(1,), stride=(1,))
(end): Conv1d(512, 8, kernel_size=(1,), stride=(1,))
)
(1): WN(
(in_layers): ModuleList(
(0): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
(1): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
(2): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(4,), dilation=(4,))
(3): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(8,), dilation=(8,))
(4): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(16,), dilation=(16,))
(5): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(32,), dilation=(32,))
(6): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(64,), dilation=(64,))
(7): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(128,), dilation=(128,))
)
(res_skip_layers): ModuleList(
(0): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(1): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(2): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(3): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(4): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(5): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(6): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(7): Conv1d(512, 512, kernel_size=(1,), stride=(1,))
)
(cond_layers): ModuleList(
(0): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(1): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(2): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(3): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(4): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(5): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(6): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(7): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
)
(start): Conv1d(4, 512, kernel_size=(1,), stride=(1,))
(end): Conv1d(512, 8, kernel_size=(1,), stride=(1,))
)
(2): WN(
(in_layers): ModuleList(
(0): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
(1): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
(2): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(4,), dilation=(4,))
(3): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(8,), dilation=(8,))
(4): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(16,), dilation=(16,))
(5): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(32,), dilation=(32,))
(6): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(64,), dilation=(64,))
(7): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(128,), dilation=(128,))
)
(res_skip_layers): ModuleList(
(0): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(1): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(2): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(3): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(4): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(5): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(6): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(7): Conv1d(512, 512, kernel_size=(1,), stride=(1,))
)
(cond_layers): ModuleList(
(0): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(1): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(2): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(3): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(4): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(5): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(6): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(7): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
)
(start): Conv1d(4, 512, kernel_size=(1,), stride=(1,))
(end): Conv1d(512, 8, kernel_size=(1,), stride=(1,))
)
(3): WN(
(in_layers): ModuleList(
(0): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
(1): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
(2): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(4,), dilation=(4,))
(3): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(8,), dilation=(8,))
(4): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(16,), dilation=(16,))
(5): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(32,), dilation=(32,))
(6): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(64,), dilation=(64,))
(7): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(128,), dilation=(128,))
)
(res_skip_layers): ModuleList(
(0): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(1): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(2): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(3): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(4): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(5): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(6): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(7): Conv1d(512, 512, kernel_size=(1,), stride=(1,))
)
(cond_layers): ModuleList(
(0): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(1): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(2): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(3): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(4): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(5): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(6): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(7): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
)
(start): Conv1d(4, 512, kernel_size=(1,), stride=(1,))
(end): Conv1d(512, 8, kernel_size=(1,), stride=(1,))
)
(4): WN(
(in_layers): ModuleList(
(0): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
(1): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
(2): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(4,), dilation=(4,))
(3): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(8,), dilation=(8,))
(4): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(16,), dilation=(16,))
(5): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(32,), dilation=(32,))
(6): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(64,), dilation=(64,))
(7): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(128,), dilation=(128,))
)
(res_skip_layers): ModuleList(
(0): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(1): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(2): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(3): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(4): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(5): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(6): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(7): Conv1d(512, 512, kernel_size=(1,), stride=(1,))
)
(cond_layers): ModuleList(
(0): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(1): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(2): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(3): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(4): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(5): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(6): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(7): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
)
(start): Conv1d(3, 512, kernel_size=(1,), stride=(1,))
(end): Conv1d(512, 6, kernel_size=(1,), stride=(1,))
)
(5): WN(
(in_layers): ModuleList(
(0): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
(1): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
(2): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(4,), dilation=(4,))
(3): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(8,), dilation=(8,))
(4): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(16,), dilation=(16,))
(5): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(32,), dilation=(32,))
(6): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(64,), dilation=(64,))
(7): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(128,), dilation=(128,))
)
(res_skip_layers): ModuleList(
(0): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(1): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(2): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(3): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(4): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(5): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(6): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(7): Conv1d(512, 512, kernel_size=(1,), stride=(1,))
)
(cond_layers): ModuleList(
(0): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(1): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(2): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(3): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(4): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(5): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(6): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(7): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
)
(start): Conv1d(3, 512, kernel_size=(1,), stride=(1,))
(end): Conv1d(512, 6, kernel_size=(1,), stride=(1,))
)
(6): WN(
(in_layers): ModuleList(
(0): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
(1): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
(2): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(4,), dilation=(4,))
(3): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(8,), dilation=(8,))
(4): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(16,), dilation=(16,))
(5): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(32,), dilation=(32,))
(6): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(64,), dilation=(64,))
(7): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(128,), dilation=(128,))
)
(res_skip_layers): ModuleList(
(0): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(1): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(2): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(3): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(4): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(5): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(6): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(7): Conv1d(512, 512, kernel_size=(1,), stride=(1,))
)
(cond_layers): ModuleList(
(0): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(1): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(2): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(3): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(4): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(5): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(6): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(7): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
)
(start): Conv1d(3, 512, kernel_size=(1,), stride=(1,))
(end): Conv1d(512, 6, kernel_size=(1,), stride=(1,))
)
(7): WN(
(in_layers): ModuleList(
(0): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
(1): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
(2): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(4,), dilation=(4,))
(3): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(8,), dilation=(8,))
(4): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(16,), dilation=(16,))
(5): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(32,), dilation=(32,))
(6): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(64,), dilation=(64,))
(7): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(128,), dilation=(128,))
)
(res_skip_layers): ModuleList(
(0): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(1): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(2): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(3): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(4): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(5): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(6): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(7): Conv1d(512, 512, kernel_size=(1,), stride=(1,))
)
(cond_layers): ModuleList(
(0): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(1): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(2): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(3): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(4): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(5): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(6): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(7): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
)
(start): Conv1d(3, 512, kernel_size=(1,), stride=(1,))
(end): Conv1d(512, 6, kernel_size=(1,), stride=(1,))
)
(8): WN(
(in_layers): ModuleList(
(0): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
(1): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
(2): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(4,), dilation=(4,))
(3): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(8,), dilation=(8,))
(4): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(16,), dilation=(16,))
(5): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(32,), dilation=(32,))
(6): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(64,), dilation=(64,))
(7): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(128,), dilation=(128,))
)
(res_skip_layers): ModuleList(
(0): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(1): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(2): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(3): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(4): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(5): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(6): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(7): Conv1d(512, 512, kernel_size=(1,), stride=(1,))
)
(cond_layers): ModuleList(
(0): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(1): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(2): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(3): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(4): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(5): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(6): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(7): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
)
(start): Conv1d(2, 512, kernel_size=(1,), stride=(1,))
(end): Conv1d(512, 4, kernel_size=(1,), stride=(1,))
)
(9): WN(
(in_layers): ModuleList(
(0): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
(1): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
(2): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(4,), dilation=(4,))
(3): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(8,), dilation=(8,))
(4): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(16,), dilation=(16,))
(5): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(32,), dilation=(32,))
(6): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(64,), dilation=(64,))
(7): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(128,), dilation=(128,))
)
(res_skip_layers): ModuleList(
(0): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(1): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(2): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(3): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(4): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(5): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(6): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(7): Conv1d(512, 512, kernel_size=(1,), stride=(1,))
)
(cond_layers): ModuleList(
(0): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(1): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(2): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(3): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(4): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(5): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(6): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(7): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
)
(start): Conv1d(2, 512, kernel_size=(1,), stride=(1,))
(end): Conv1d(512, 4, kernel_size=(1,), stride=(1,))
)
(10): WN(
(in_layers): ModuleList(
(0): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
(1): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
(2): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(4,), dilation=(4,))
(3): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(8,), dilation=(8,))
(4): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(16,), dilation=(16,))
(5): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(32,), dilation=(32,))
(6): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(64,), dilation=(64,))
(7): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(128,), dilation=(128,))
)
(res_skip_layers): ModuleList(
(0): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(1): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(2): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(3): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(4): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(5): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(6): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(7): Conv1d(512, 512, kernel_size=(1,), stride=(1,))
)
(cond_layers): ModuleList(
(0): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(1): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(2): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(3): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(4): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(5): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(6): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(7): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
)
(start): Conv1d(2, 512, kernel_size=(1,), stride=(1,))
(end): Conv1d(512, 4, kernel_size=(1,), stride=(1,))
)
(11): WN(
(in_layers): ModuleList(
(0): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
(1): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
(2): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(4,), dilation=(4,))
(3): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(8,), dilation=(8,))
(4): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(16,), dilation=(16,))
(5): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(32,), dilation=(32,))
(6): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(64,), dilation=(64,))
(7): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(128,), dilation=(128,))
)
(res_skip_layers): ModuleList(
(0): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(1): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(2): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(3): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(4): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(5): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(6): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
(7): Conv1d(512, 512, kernel_size=(1,), stride=(1,))
)
(cond_layers): ModuleList(
(0): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(1): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(2): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(3): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(4): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(5): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(6): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
(7): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
)
(start): Conv1d(2, 512, kernel_size=(1,), stride=(1,))
(end): Conv1d(512, 4, kernel_size=(1,), stride=(1,))
)
)
(convinv): ModuleList(
(0): Invertible1x1Conv(
(conv): Conv1d(8, 8, kernel_size=(1,), stride=(1,), bias=False)
)
(1): Invertible1x1Conv(
(conv): Conv1d(8, 8, kernel_size=(1,), stride=(1,), bias=False)
)
(2): Invertible1x1Conv(
(conv): Conv1d(8, 8, kernel_size=(1,), stride=(1,), bias=False)
)
(3): Invertible1x1Conv(
(conv): Conv1d(8, 8, kernel_size=(1,), stride=(1,), bias=False)
)
(4): Invertible1x1Conv(
(conv): Conv1d(6, 6, kernel_size=(1,), stride=(1,), bias=False)
)
(5): Invertible1x1Conv(
(conv): Conv1d(6, 6, kernel_size=(1,), stride=(1,), bias=False)
)
(6): Invertible1x1Conv(
(conv): Conv1d(6, 6, kernel_size=(1,), stride=(1,), bias=False)
)
(7): Invertible1x1Conv(
(conv): Conv1d(6, 6, kernel_size=(1,), stride=(1,), bias=False)
)
(8): Invertible1x1Conv(
(conv): Conv1d(4, 4, kernel_size=(1,), stride=(1,), bias=False)
)
(9): Invertible1x1Conv(
(conv): Conv1d(4, 4, kernel_size=(1,), stride=(1,), bias=False)
)
(10): Invertible1x1Conv(
(conv): Conv1d(4, 4, kernel_size=(1,), stride=(1,), bias=False)
)
(11): Invertible1x1Conv(
(conv): Conv1d(4, 4, kernel_size=(1,), stride=(1,), bias=False)
)
)
)
In [5]:
# Sample input text; it is encoded into symbol IDs in the next cell.
text = 'Hello'
In [6]:
# Load the `nvidia_tts_utils` entry point, which provides the text
# preprocessing helper used below.
utils = torch.hub.load(
    "NVIDIA/DeepLearningExamples:torchhub",
    "nvidia_tts_utils",
)

# Preprocessing: turn the one-item batch of text into the tensors the model consumes.
sequences, lengths = utils.prepare_input_sequence([text])
sequences  # each input character is mapped to one integer ID (see the output)
Out [6]:
tensor([[45, 42, 49, 49, 52]], device='cuda:0')
In [7]:
# Same word with a space between every letter, to show how spaces are encoded.
text = "H e l l o"
sequences, lengths = utils.prepare_input_sequence([text])
sequences # spaces carry meaning too, so each one is also converted to an ID
Out [7]:
tensor([[45, 11, 42, 11, 49, 11, 49, 11, 52]], device='cuda:0')
In [8]:
# Encode a full sentence; the resulting `sequences` / `lengths` are the inputs
# passed to the synthesis cell that follows.
text = "Hello everyone, Let's take this in peace"
sequences, lengths = utils.prepare_input_sequence([text])
sequences
Out [8]:
tensor([[45, 42, 49, 49, 52, 11, 42, 59, 42, 55, 62, 52, 51, 42, 6, 11, 49, 42,
57, 3, 56, 11, 57, 38, 48, 42, 11, 57, 45, 46, 56, 11, 46, 51, 11, 53,
42, 38, 40, 42]], device='cuda:0')
In [9]:
# Inference only: run both models with autograd disabled.
with torch.no_grad():
    # Encoded text -> mel-spectrogram (the two extra return values are unused).
    mel, _, _ = tacotron2.infer(sequences, lengths)
    # Mel-spectrogram -> audible waveform.
    audio = waveglow.infer(mel)

# Take the first item of the batch and copy it to host memory as a NumPy array.
audio_rst = audio[0].data.cpu().numpy()

# Preset sampling rate used when playing the audio back.
sr = 22050
In [10]:
from IPython.display import Audio
# Render an inline audio player for the synthesized waveform at the given rate.
Audio(data=audio_rst, rate=sr)
Out [10]:
In [11]:
# Run a different phrase (looks like a romanized Korean greeting) through the
# same encode -> Tacotron2 -> WaveGlow pipeline.
text = "aa nyong haa sae yo-u"
sequences, lengths = utils.prepare_input_sequence([text])

# Inference only: run both models with autograd disabled.
with torch.no_grad():
    mel, _, _ = tacotron2.infer(sequences, lengths)  # encoded text -> mel-spectrogram
    audio = waveglow.infer(mel)  # mel-spectrogram -> waveform

# First item of the batch, copied to host memory as a NumPy array.
audio_rst = audio[0].data.cpu().numpy()
sr = 22050  # same preset sampling rate as the earlier synthesis cell

# Last expression of the cell: displays an inline audio player.
Audio(data=audio_rst, rate=sr)
Out [11]:
댓글남기기