10 분 소요


Tacotron2 (PyTorch)

In [None]:
%%bash
pip install librosa scipy unidecode inflect
In [None]:
%%bash
# Refresh the apt package index, then install the libsndfile1 system library
# without prompting (-y).
# NOTE(review): presumably required by the audio Python packages (e.g. librosa) — confirm.
apt-get update
apt-get install -y libsndfile1
In [3]:
import torch
# Download the Tacotron 2 entry point from NVIDIA's torch.hub repository,
# requesting the fp16 variant, and place the model on the GPU.
tacotron2 = torch.hub.load(
    "NVIDIA/DeepLearningExamples:torchhub", "nvidia_tacotron2", model_math="fp16"
).to("cuda")
# Switch to evaluation mode; as the last expression this also displays the module tree.
tacotron2.eval()
Out [3]:
Tacotron2(
  (embedding): Embedding(148, 512)
  (encoder): Encoder(
    (convolutions): ModuleList(
      (0): Sequential(
        (0): ConvNorm(
          (conv): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,))
        )
        (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): Sequential(
        (0): ConvNorm(
          (conv): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,))
        )
        (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (2): Sequential(
        (0): ConvNorm(
          (conv): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,))
        )
        (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (lstm): LSTM(512, 256, batch_first=True, bidirectional=True)
  )
  (decoder): Decoder(
    (prenet): Prenet(
      (layers): ModuleList(
        (0): LinearNorm(
          (linear_layer): Linear(in_features=80, out_features=256, bias=False)
        )
        (1): LinearNorm(
          (linear_layer): Linear(in_features=256, out_features=256, bias=False)
        )
      )
    )
    (attention_rnn): LSTMCell(768, 1024)
    (attention_layer): Attention(
      (query_layer): LinearNorm(
        (linear_layer): Linear(in_features=1024, out_features=128, bias=False)
      )
      (memory_layer): LinearNorm(
        (linear_layer): Linear(in_features=512, out_features=128, bias=False)
      )
      (v): LinearNorm(
        (linear_layer): Linear(in_features=128, out_features=1, bias=False)
      )
      (location_layer): LocationLayer(
        (location_conv): ConvNorm(
          (conv): Conv1d(2, 32, kernel_size=(31,), stride=(1,), padding=(15,), bias=False)
        )
        (location_dense): LinearNorm(
          (linear_layer): Linear(in_features=32, out_features=128, bias=False)
        )
      )
    )
    (decoder_rnn): LSTMCell(1536, 1024, bias=1)
    (linear_projection): LinearNorm(
      (linear_layer): Linear(in_features=1536, out_features=80, bias=True)
    )
    (gate_layer): LinearNorm(
      (linear_layer): Linear(in_features=1536, out_features=1, bias=True)
    )
  )
  (postnet): Postnet(
    (convolutions): ModuleList(
      (0): Sequential(
        (0): ConvNorm(
          (conv): Conv1d(80, 512, kernel_size=(5,), stride=(1,), padding=(2,))
        )
        (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): Sequential(
        (0): ConvNorm(
          (conv): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,))
        )
        (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (2): Sequential(
        (0): ConvNorm(
          (conv): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,))
        )
        (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (3): Sequential(
        (0): ConvNorm(
          (conv): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,))
        )
        (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (4): Sequential(
        (0): ConvNorm(
          (conv): Conv1d(512, 80, kernel_size=(5,), stride=(1,), padding=(2,))
        )
        (1): BatchNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
  )
)
In [4]:
# Download the WaveGlow entry point from the same torch.hub repository (fp16 variant).
waveglow = torch.hub.load(
    "NVIDIA/DeepLearningExamples:torchhub", "nvidia_waveglow", model_math="fp16"
)
# Strip weight normalization from the loaded model, then move it to the GPU.
waveglow = waveglow.remove_weightnorm(waveglow).to("cuda")
# Switch to evaluation mode; as the last expression this also displays the module tree.
waveglow.eval()
Out [4]:
WaveGlow(
  (upsample): ConvTranspose1d(80, 80, kernel_size=(1024,), stride=(256,))
  (WN): ModuleList(
    (0): WN(
      (in_layers): ModuleList(
        (0): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
        (1): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
        (2): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(4,), dilation=(4,))
        (3): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(8,), dilation=(8,))
        (4): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(16,), dilation=(16,))
        (5): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(32,), dilation=(32,))
        (6): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(64,), dilation=(64,))
        (7): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(128,), dilation=(128,))
      )
      (res_skip_layers): ModuleList(
        (0): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (1): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (2): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (3): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (4): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (5): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (6): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (7): Conv1d(512, 512, kernel_size=(1,), stride=(1,))
      )
      (cond_layers): ModuleList(
        (0): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (1): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (2): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (3): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (4): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (5): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (6): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (7): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
      )
      (start): Conv1d(4, 512, kernel_size=(1,), stride=(1,))
      (end): Conv1d(512, 8, kernel_size=(1,), stride=(1,))
    )
    (1): WN(
      (in_layers): ModuleList(
        (0): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
        (1): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
        (2): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(4,), dilation=(4,))
        (3): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(8,), dilation=(8,))
        (4): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(16,), dilation=(16,))
        (5): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(32,), dilation=(32,))
        (6): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(64,), dilation=(64,))
        (7): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(128,), dilation=(128,))
      )
      (res_skip_layers): ModuleList(
        (0): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (1): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (2): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (3): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (4): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (5): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (6): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (7): Conv1d(512, 512, kernel_size=(1,), stride=(1,))
      )
      (cond_layers): ModuleList(
        (0): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (1): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (2): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (3): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (4): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (5): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (6): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (7): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
      )
      (start): Conv1d(4, 512, kernel_size=(1,), stride=(1,))
      (end): Conv1d(512, 8, kernel_size=(1,), stride=(1,))
    )
    (2): WN(
      (in_layers): ModuleList(
        (0): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
        (1): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
        (2): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(4,), dilation=(4,))
        (3): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(8,), dilation=(8,))
        (4): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(16,), dilation=(16,))
        (5): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(32,), dilation=(32,))
        (6): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(64,), dilation=(64,))
        (7): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(128,), dilation=(128,))
      )
      (res_skip_layers): ModuleList(
        (0): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (1): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (2): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (3): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (4): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (5): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (6): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (7): Conv1d(512, 512, kernel_size=(1,), stride=(1,))
      )
      (cond_layers): ModuleList(
        (0): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (1): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (2): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (3): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (4): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (5): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (6): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (7): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
      )
      (start): Conv1d(4, 512, kernel_size=(1,), stride=(1,))
      (end): Conv1d(512, 8, kernel_size=(1,), stride=(1,))
    )
    (3): WN(
      (in_layers): ModuleList(
        (0): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
        (1): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
        (2): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(4,), dilation=(4,))
        (3): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(8,), dilation=(8,))
        (4): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(16,), dilation=(16,))
        (5): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(32,), dilation=(32,))
        (6): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(64,), dilation=(64,))
        (7): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(128,), dilation=(128,))
      )
      (res_skip_layers): ModuleList(
        (0): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (1): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (2): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (3): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (4): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (5): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (6): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (7): Conv1d(512, 512, kernel_size=(1,), stride=(1,))
      )
      (cond_layers): ModuleList(
        (0): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (1): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (2): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (3): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (4): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (5): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (6): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (7): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
      )
      (start): Conv1d(4, 512, kernel_size=(1,), stride=(1,))
      (end): Conv1d(512, 8, kernel_size=(1,), stride=(1,))
    )
    (4): WN(
      (in_layers): ModuleList(
        (0): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
        (1): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
        (2): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(4,), dilation=(4,))
        (3): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(8,), dilation=(8,))
        (4): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(16,), dilation=(16,))
        (5): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(32,), dilation=(32,))
        (6): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(64,), dilation=(64,))
        (7): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(128,), dilation=(128,))
      )
      (res_skip_layers): ModuleList(
        (0): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (1): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (2): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (3): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (4): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (5): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (6): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (7): Conv1d(512, 512, kernel_size=(1,), stride=(1,))
      )
      (cond_layers): ModuleList(
        (0): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (1): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (2): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (3): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (4): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (5): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (6): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (7): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
      )
      (start): Conv1d(3, 512, kernel_size=(1,), stride=(1,))
      (end): Conv1d(512, 6, kernel_size=(1,), stride=(1,))
    )
    (5): WN(
      (in_layers): ModuleList(
        (0): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
        (1): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
        (2): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(4,), dilation=(4,))
        (3): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(8,), dilation=(8,))
        (4): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(16,), dilation=(16,))
        (5): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(32,), dilation=(32,))
        (6): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(64,), dilation=(64,))
        (7): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(128,), dilation=(128,))
      )
      (res_skip_layers): ModuleList(
        (0): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (1): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (2): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (3): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (4): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (5): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (6): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (7): Conv1d(512, 512, kernel_size=(1,), stride=(1,))
      )
      (cond_layers): ModuleList(
        (0): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (1): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (2): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (3): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (4): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (5): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (6): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (7): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
      )
      (start): Conv1d(3, 512, kernel_size=(1,), stride=(1,))
      (end): Conv1d(512, 6, kernel_size=(1,), stride=(1,))
    )
    (6): WN(
      (in_layers): ModuleList(
        (0): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
        (1): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
        (2): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(4,), dilation=(4,))
        (3): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(8,), dilation=(8,))
        (4): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(16,), dilation=(16,))
        (5): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(32,), dilation=(32,))
        (6): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(64,), dilation=(64,))
        (7): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(128,), dilation=(128,))
      )
      (res_skip_layers): ModuleList(
        (0): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (1): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (2): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (3): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (4): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (5): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (6): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (7): Conv1d(512, 512, kernel_size=(1,), stride=(1,))
      )
      (cond_layers): ModuleList(
        (0): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (1): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (2): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (3): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (4): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (5): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (6): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (7): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
      )
      (start): Conv1d(3, 512, kernel_size=(1,), stride=(1,))
      (end): Conv1d(512, 6, kernel_size=(1,), stride=(1,))
    )
    (7): WN(
      (in_layers): ModuleList(
        (0): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
        (1): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
        (2): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(4,), dilation=(4,))
        (3): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(8,), dilation=(8,))
        (4): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(16,), dilation=(16,))
        (5): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(32,), dilation=(32,))
        (6): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(64,), dilation=(64,))
        (7): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(128,), dilation=(128,))
      )
      (res_skip_layers): ModuleList(
        (0): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (1): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (2): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (3): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (4): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (5): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (6): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (7): Conv1d(512, 512, kernel_size=(1,), stride=(1,))
      )
      (cond_layers): ModuleList(
        (0): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (1): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (2): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (3): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (4): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (5): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (6): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (7): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
      )
      (start): Conv1d(3, 512, kernel_size=(1,), stride=(1,))
      (end): Conv1d(512, 6, kernel_size=(1,), stride=(1,))
    )
    (8): WN(
      (in_layers): ModuleList(
        (0): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
        (1): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
        (2): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(4,), dilation=(4,))
        (3): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(8,), dilation=(8,))
        (4): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(16,), dilation=(16,))
        (5): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(32,), dilation=(32,))
        (6): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(64,), dilation=(64,))
        (7): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(128,), dilation=(128,))
      )
      (res_skip_layers): ModuleList(
        (0): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (1): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (2): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (3): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (4): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (5): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (6): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (7): Conv1d(512, 512, kernel_size=(1,), stride=(1,))
      )
      (cond_layers): ModuleList(
        (0): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (1): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (2): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (3): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (4): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (5): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (6): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (7): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
      )
      (start): Conv1d(2, 512, kernel_size=(1,), stride=(1,))
      (end): Conv1d(512, 4, kernel_size=(1,), stride=(1,))
    )
    (9): WN(
      (in_layers): ModuleList(
        (0): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
        (1): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
        (2): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(4,), dilation=(4,))
        (3): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(8,), dilation=(8,))
        (4): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(16,), dilation=(16,))
        (5): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(32,), dilation=(32,))
        (6): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(64,), dilation=(64,))
        (7): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(128,), dilation=(128,))
      )
      (res_skip_layers): ModuleList(
        (0): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (1): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (2): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (3): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (4): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (5): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (6): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (7): Conv1d(512, 512, kernel_size=(1,), stride=(1,))
      )
      (cond_layers): ModuleList(
        (0): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (1): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (2): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (3): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (4): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (5): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (6): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (7): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
      )
      (start): Conv1d(2, 512, kernel_size=(1,), stride=(1,))
      (end): Conv1d(512, 4, kernel_size=(1,), stride=(1,))
    )
    (10): WN(
      (in_layers): ModuleList(
        (0): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
        (1): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
        (2): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(4,), dilation=(4,))
        (3): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(8,), dilation=(8,))
        (4): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(16,), dilation=(16,))
        (5): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(32,), dilation=(32,))
        (6): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(64,), dilation=(64,))
        (7): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(128,), dilation=(128,))
      )
      (res_skip_layers): ModuleList(
        (0): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (1): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (2): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (3): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (4): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (5): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (6): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (7): Conv1d(512, 512, kernel_size=(1,), stride=(1,))
      )
      (cond_layers): ModuleList(
        (0): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (1): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (2): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (3): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (4): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (5): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (6): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (7): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
      )
      (start): Conv1d(2, 512, kernel_size=(1,), stride=(1,))
      (end): Conv1d(512, 4, kernel_size=(1,), stride=(1,))
    )
    (11): WN(
      (in_layers): ModuleList(
        (0): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
        (1): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
        (2): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(4,), dilation=(4,))
        (3): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(8,), dilation=(8,))
        (4): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(16,), dilation=(16,))
        (5): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(32,), dilation=(32,))
        (6): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(64,), dilation=(64,))
        (7): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(128,), dilation=(128,))
      )
      (res_skip_layers): ModuleList(
        (0): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (1): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (2): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (3): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (4): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (5): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (6): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (7): Conv1d(512, 512, kernel_size=(1,), stride=(1,))
      )
      (cond_layers): ModuleList(
        (0): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (1): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (2): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (3): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (4): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (5): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (6): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
        (7): Conv1d(640, 1024, kernel_size=(1,), stride=(1,))
      )
      (start): Conv1d(2, 512, kernel_size=(1,), stride=(1,))
      (end): Conv1d(512, 4, kernel_size=(1,), stride=(1,))
    )
  )
  (convinv): ModuleList(
    (0): Invertible1x1Conv(
      (conv): Conv1d(8, 8, kernel_size=(1,), stride=(1,), bias=False)
    )
    (1): Invertible1x1Conv(
      (conv): Conv1d(8, 8, kernel_size=(1,), stride=(1,), bias=False)
    )
    (2): Invertible1x1Conv(
      (conv): Conv1d(8, 8, kernel_size=(1,), stride=(1,), bias=False)
    )
    (3): Invertible1x1Conv(
      (conv): Conv1d(8, 8, kernel_size=(1,), stride=(1,), bias=False)
    )
    (4): Invertible1x1Conv(
      (conv): Conv1d(6, 6, kernel_size=(1,), stride=(1,), bias=False)
    )
    (5): Invertible1x1Conv(
      (conv): Conv1d(6, 6, kernel_size=(1,), stride=(1,), bias=False)
    )
    (6): Invertible1x1Conv(
      (conv): Conv1d(6, 6, kernel_size=(1,), stride=(1,), bias=False)
    )
    (7): Invertible1x1Conv(
      (conv): Conv1d(6, 6, kernel_size=(1,), stride=(1,), bias=False)
    )
    (8): Invertible1x1Conv(
      (conv): Conv1d(4, 4, kernel_size=(1,), stride=(1,), bias=False)
    )
    (9): Invertible1x1Conv(
      (conv): Conv1d(4, 4, kernel_size=(1,), stride=(1,), bias=False)
    )
    (10): Invertible1x1Conv(
      (conv): Conv1d(4, 4, kernel_size=(1,), stride=(1,), bias=False)
    )
    (11): Invertible1x1Conv(
      (conv): Conv1d(4, 4, kernel_size=(1,), stride=(1,), bias=False)
    )
  )
)
In [5]:
# Sample input string that the next cell converts into integer ids.
text = 'Hello'
In [6]:
# Load NVIDIA's TTS helper utilities from the same torch.hub repository.
utils = torch.hub.load("NVIDIA/DeepLearningExamples:torchhub", "nvidia_tts_utils")

# Preprocessing: convert the raw text (passed as a one-element list) into
# integer id sequences and their lengths.
sequences, lengths = utils.prepare_input_sequence([text])
# Display the ids; the cell output shows one integer per symbol of the input text.
sequences
Out [6]:
tensor([[45, 42, 49, 49, 52]], device='cuda:0')
In [7]:
# Same letters separated by spaces, to see how blanks are encoded.
text = "H e l l o"
sequences, lengths = utils.prepare_input_sequence([text])
sequences # blanks are meaningful too, so they are also converted to an id
Out [7]:
tensor([[45, 11, 42, 11, 49, 11, 49, 11, 52]], device='cuda:0')
In [8]:
# Encode a full sentence; these ids are what the synthesis cell below consumes.
text = "Hello everyone, Let's take this in peace"
sequences, lengths = utils.prepare_input_sequence([text])
sequences
Out [8]:
tensor([[45, 42, 49, 49, 52, 11, 42, 59, 42, 55, 62, 52, 51, 42,  6, 11, 49, 42,
         57,  3, 56, 11, 57, 38, 48, 42, 11, 57, 45, 46, 56, 11, 46, 51, 11, 53,
         42, 38, 40, 42]], device='cuda:0')
In [9]:
# Run both models with gradient tracking disabled (inference only).
with torch.no_grad():
    mel, _, _ = tacotron2.infer(sequences, lengths) # Tacotron 2: ids -> mel spectrogram (other two outputs unused)
    audio = waveglow.infer(mel) # WaveGlow: mel spectrogram -> audible waveform
# Take the first item of the result, copy it to the CPU and convert it to a NumPy array.
audio_rst = audio[0].data.cpu().numpy()
sr = 22050 # sampling rate fixed in advance; presumably the rate the pretrained models expect — confirm
In [10]:
from IPython.display import Audio
# Render an inline audio player for the synthesized waveform, played back at `sr` Hz.
Audio(audio_rst, rate=sr)
Out [10]:
In [11]:
# Repeat the whole pipeline (text -> ids -> mel spectrogram -> waveform -> player)
# for a new input string. `Audio` comes from the import in the earlier playback cell.
# NOTE(review): the text looks like a phonetic spelling of a Korean greeting — confirm intent.
text = "aa nyong haa sae yo-u"
sequences, lengths = utils.prepare_input_sequence([text])
# Inference only, so gradient tracking is disabled.
with torch.no_grad():
    mel, _, _ = tacotron2.infer(sequences, lengths)
    audio = waveglow.infer(mel)
# First item of the result, moved to the CPU as a NumPy array for playback.
audio_rst = audio[0].data.cpu().numpy()
sr = 22050
Audio(audio_rst, rate=sr)
Out [11]:

댓글남기기