From 8195a55a254d3fdd8ff4b30189a0342a65864a61 Mon Sep 17 00:00:00 2001 From: peijiyang Date: Tue, 7 Sep 2021 21:41:16 +0800 Subject: [PATCH] add hifigan vocoder --- hifigan/config_16k_.json | 37 +++++ hifigan/env.py | 15 ++ hifigan/inference.py | 98 ++++++++++++++ hifigan/meldataset.py | 178 ++++++++++++++++++++++++ hifigan/models.py | 286 +++++++++++++++++++++++++++++++++++++++ hifigan/test.py | 58 ++++++++ hifigan/utils.py | 58 ++++++++ toolbox/__init__.py | 16 ++- 8 files changed, 745 insertions(+), 1 deletion(-) create mode 100644 hifigan/config_16k_.json create mode 100644 hifigan/env.py create mode 100644 hifigan/inference.py create mode 100644 hifigan/meldataset.py create mode 100644 hifigan/models.py create mode 100644 hifigan/test.py create mode 100644 hifigan/utils.py diff --git a/hifigan/config_16k_.json b/hifigan/config_16k_.json new file mode 100644 index 0000000..c2f30fe --- /dev/null +++ b/hifigan/config_16k_.json @@ -0,0 +1,37 @@ +{ + "resblock": "1", + "num_gpus": 0, + "batch_size": 16, + "learning_rate": 0.0002, + "adam_b1": 0.8, + "adam_b2": 0.99, + "lr_decay": 0.999, + "seed": 1234, + + "upsample_rates": [5,5,4,2], + "upsample_kernel_sizes": [10,10,8,4], + "upsample_initial_channel": 512, + "resblock_kernel_sizes": [3,7,11], + "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + + "segment_size": 6400, + "num_mels": 80, + "num_freq": 1025, + "n_fft": 1024, + "hop_size": 200, + "win_size": 800, + + "sampling_rate": 16000, + + "fmin": 0, + "fmax": 7600, + "fmax_for_loss": null, + + "num_workers": 4, + + "dist_config": { + "dist_backend": "nccl", + "dist_url": "tcp://localhost:54321", + "world_size": 1 + } +} diff --git a/hifigan/env.py b/hifigan/env.py new file mode 100644 index 0000000..2bdbc95 --- /dev/null +++ b/hifigan/env.py @@ -0,0 +1,15 @@ +import os +import shutil + + +class AttrDict(dict): + def __init__(self, *args, **kwargs): + super(AttrDict, self).__init__(*args, **kwargs) + self.__dict__ = self + + +def build_env(config, 
config_name, path): + t_path = os.path.join(path, config_name) + if config != t_path: + os.makedirs(path, exist_ok=True) + shutil.copyfile(config, os.path.join(path, config_name)) diff --git a/hifigan/inference.py b/hifigan/inference.py new file mode 100644 index 0000000..b663e63 --- /dev/null +++ b/hifigan/inference.py @@ -0,0 +1,98 @@ +from __future__ import absolute_import, division, print_function, unicode_literals + +import glob +import os +import argparse +import json +import torch +import numpy as np +from scipy.io.wavfile import write +from hifigan.env import AttrDict +from hifigan.meldataset import mel_spectrogram, MAX_WAV_VALUE, load_wav +from hifigan.models import Generator +import soundfile as sf + + +generator = None # type: Generator +_device = None + + +def load_checkpoint(filepath, device): + assert os.path.isfile(filepath) + print("Loading '{}'".format(filepath)) + checkpoint_dict = torch.load(filepath, map_location=device) + print("Complete.") + return checkpoint_dict + + +def load_model(weights_fpath, verbose=True): + global generator, _device + + if verbose: + print("Building hifigan") + + with open("./hifigan/config_16k_.json") as f: + data = f.read() + json_config = json.loads(data) + h = AttrDict(json_config) + torch.manual_seed(h.seed) + + if torch.cuda.is_available(): + # _model = _model.cuda() + _device = torch.device('cuda') + else: + _device = torch.device('cpu') + + generator = Generator(h).to(_device) + state_dict_g = load_checkpoint( + weights_fpath, _device + ) + generator.load_state_dict(state_dict_g['generator']) + generator.eval() + generator.remove_weight_norm() + + +def is_loaded(): + return generator is not None + + +def infer_waveform(mel, progress_callback=None): + + if generator is None: + raise Exception("Please load hifi-gan in memory before using it") + + mel = torch.FloatTensor(mel).to(_device) + mel = mel.unsqueeze(0) + + with torch.no_grad(): + y_g_hat = generator(mel) + audio = y_g_hat.squeeze() + audio = 
audio.cpu().numpy() + + return audio + + + +# if __name__ == "__main__": + +# mel = np.load("./mel-T0055G0184S0349.wav_00.npy") +# # mel = torch.FloatTensor(mel.T).to(device) +# # mel = mel.unsqueeze(0) + +# load_model("../../../TTS/Vocoder/outputs/hifi-gan/models/g_00930000") +# audio = infer_waveform(mel) + +# sf.write("b.wav", audio, samplerate=16000) + + + # with torch.no_grad(): + # y_g_hat = generator(mel) + # audio = y_g_hat.squeeze() + + + # audio = audio.cpu().numpy() + # sf.write("a.wav", audio, samplerate=16000) + + +# import IPython.display as ipd +# ipd.Audio(audio, rate=16000) \ No newline at end of file diff --git a/hifigan/meldataset.py b/hifigan/meldataset.py new file mode 100644 index 0000000..54a6a88 --- /dev/null +++ b/hifigan/meldataset.py @@ -0,0 +1,178 @@ +import math +import os +import random +import torch +import torch.utils.data +import numpy as np +from librosa.util import normalize +from scipy.io.wavfile import read +from librosa.filters import mel as librosa_mel_fn + +MAX_WAV_VALUE = 32768.0 + + +def load_wav(full_path): + sampling_rate, data = read(full_path) + return data, sampling_rate + + +def dynamic_range_compression(x, C=1, clip_val=1e-5): + return np.log(np.clip(x, a_min=clip_val, a_max=None) * C) + + +def dynamic_range_decompression(x, C=1): + return np.exp(x) / C + + +def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): + return torch.log(torch.clamp(x, min=clip_val) * C) + + +def dynamic_range_decompression_torch(x, C=1): + return torch.exp(x) / C + + +def spectral_normalize_torch(magnitudes): + output = dynamic_range_compression_torch(magnitudes) + return output + + +def spectral_de_normalize_torch(magnitudes): + output = dynamic_range_decompression_torch(magnitudes) + return output + + +mel_basis = {} +hann_window = {} + + +def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False): + if torch.min(y) < -1.: + print('min value is ', torch.min(y)) + if torch.max(y) > 1.: + 
print('max value is ', torch.max(y)) + + global mel_basis, hann_window + if fmax not in mel_basis: + mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax) + mel_basis[str(fmax)+'_'+str(y.device)] = torch.from_numpy(mel).float().to(y.device) + hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device) + + y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') + y = y.squeeze(1) + + spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[str(y.device)], + center=center, pad_mode='reflect', normalized=False, onesided=True) + + spec = torch.sqrt(spec.pow(2).sum(-1)+(1e-9)) + + spec = torch.matmul(mel_basis[str(fmax)+'_'+str(y.device)], spec) + spec = spectral_normalize_torch(spec) + + return spec + + +def get_dataset_filelist(a): + # with open(a.input_training_file, 'r', encoding='utf-8') as fi: + # training_files = [os.path.join(a.input_wavs_dir, x.split('|')[0] + '.wav') + # for x in fi.read().split('\n') if len(x) > 0] + + # with open(a.input_validation_file, 'r', encoding='utf-8') as fi: + # validation_files = [os.path.join(a.input_wavs_dir, x.split('|')[0] + '.wav') + # for x in fi.read().split('\n') if len(x) > 0] + + files = os.listdir(a.input_wavs_dir) + random.shuffle(files) + files = [os.path.join(a.input_wavs_dir, f) for f in files] + training_files = files[: -500] + validation_files = files[-500: ] + + return training_files, validation_files + + +class MelDataset(torch.utils.data.Dataset): + def __init__(self, training_files, segment_size, n_fft, num_mels, + hop_size, win_size, sampling_rate, fmin, fmax, split=True, shuffle=True, n_cache_reuse=1, + device=None, fmax_loss=None, fine_tuning=False, base_mels_path=None): + self.audio_files = training_files + random.seed(1234) + if shuffle: + random.shuffle(self.audio_files) + self.segment_size = segment_size + self.sampling_rate = sampling_rate + self.split = split + self.n_fft = n_fft + self.num_mels 
= num_mels + self.hop_size = hop_size + self.win_size = win_size + self.fmin = fmin + self.fmax = fmax + self.fmax_loss = fmax_loss + self.cached_wav = None + self.n_cache_reuse = n_cache_reuse + self._cache_ref_count = 0 + self.device = device + self.fine_tuning = fine_tuning + self.base_mels_path = base_mels_path + + def __getitem__(self, index): + filename = self.audio_files[index] + if self._cache_ref_count == 0: + # audio, sampling_rate = load_wav(filename) + # audio = audio / MAX_WAV_VALUE + audio = np.load(filename) + if not self.fine_tuning: + audio = normalize(audio) * 0.95 + self.cached_wav = audio + # if sampling_rate != self.sampling_rate: + # raise ValueError("{} SR doesn't match target {} SR".format( + # sampling_rate, self.sampling_rate)) + self._cache_ref_count = self.n_cache_reuse + else: + audio = self.cached_wav + self._cache_ref_count -= 1 + + audio = torch.FloatTensor(audio) + audio = audio.unsqueeze(0) + + if not self.fine_tuning: + if self.split: + if audio.size(1) >= self.segment_size: + max_audio_start = audio.size(1) - self.segment_size + audio_start = random.randint(0, max_audio_start) + audio = audio[:, audio_start:audio_start+self.segment_size] + else: + audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), 'constant') + + mel = mel_spectrogram(audio, self.n_fft, self.num_mels, + self.sampling_rate, self.hop_size, self.win_size, self.fmin, self.fmax, + center=False) + else: + mel_path = os.path.join(self.base_mels_path, "mel" + "-" + filename.split("/")[-1].split("-")[-1]) + mel = np.load(mel_path).T + # mel = np.load( + # os.path.join(self.base_mels_path, os.path.splitext(os.path.split(filename)[-1])[0] + '.npy')) + mel = torch.from_numpy(mel) + + if len(mel.shape) < 3: + mel = mel.unsqueeze(0) + + if self.split: + frames_per_seg = math.ceil(self.segment_size / self.hop_size) + + if audio.size(1) >= self.segment_size: + mel_start = random.randint(0, mel.size(2) - frames_per_seg - 1) + mel = mel[:, :, 
mel_start:mel_start + frames_per_seg] + audio = audio[:, mel_start * self.hop_size:(mel_start + frames_per_seg) * self.hop_size] + else: + mel = torch.nn.functional.pad(mel, (0, frames_per_seg - mel.size(2)), 'constant') + audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), 'constant') + + mel_loss = mel_spectrogram(audio, self.n_fft, self.num_mels, + self.sampling_rate, self.hop_size, self.win_size, self.fmin, self.fmax_loss, + center=False) + + return (mel.squeeze(), audio.squeeze(0), filename, mel_loss.squeeze()) + + def __len__(self): + return len(self.audio_files) diff --git a/hifigan/models.py b/hifigan/models.py new file mode 100644 index 0000000..725de4e --- /dev/null +++ b/hifigan/models.py @@ -0,0 +1,286 @@ +import torch +import torch.nn.functional as F +import torch.nn as nn +from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d +from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm +from hifigan.utils import init_weights, get_padding + +LRELU_SLOPE = 0.1 + + +class ResBlock1(torch.nn.Module): + def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)): + super(ResBlock1, self).__init__() + self.h = h + self.convs1 = nn.ModuleList([ + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], + padding=get_padding(kernel_size, dilation[2]))) + ]) + self.convs1.apply(init_weights) + + self.convs2 = nn.ModuleList([ + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + 
padding=get_padding(kernel_size, 1))) + ]) + self.convs2.apply(init_weights) + + def forward(self, x): + for c1, c2 in zip(self.convs1, self.convs2): + xt = F.leaky_relu(x, LRELU_SLOPE) + xt = c1(xt) + xt = F.leaky_relu(xt, LRELU_SLOPE) + xt = c2(xt) + x = xt + x + return x + + def remove_weight_norm(self): + for l in self.convs1: + remove_weight_norm(l) + for l in self.convs2: + remove_weight_norm(l) + + +class ResBlock2(torch.nn.Module): + def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)): + super(ResBlock2, self).__init__() + self.h = h + self.convs = nn.ModuleList([ + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]))) + ]) + self.convs.apply(init_weights) + + def forward(self, x): + for c in self.convs: + xt = F.leaky_relu(x, LRELU_SLOPE) + xt = c(xt) + x = xt + x + return x + + def remove_weight_norm(self): + for l in self.convs: + remove_weight_norm(l) + + +class Generator(torch.nn.Module): + def __init__(self, h): + super(Generator, self).__init__() + self.h = h + self.num_kernels = len(h.resblock_kernel_sizes) + self.num_upsamples = len(h.upsample_rates) + self.conv_pre = weight_norm(Conv1d(80, h.upsample_initial_channel, 7, 1, padding=3)) + resblock = ResBlock1 if h.resblock == '1' else ResBlock2 + + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)): +# self.ups.append(weight_norm( +# ConvTranspose1d(h.upsample_initial_channel//(2**i), h.upsample_initial_channel//(2**(i+1)), +# k, u, padding=(k-u)//2))) + self.ups.append(weight_norm(ConvTranspose1d(h.upsample_initial_channel//(2**i), + h.upsample_initial_channel//(2**(i+1)), + k, u, padding=(u//2 + u%2), output_padding=u%2))) + + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = 
h.upsample_initial_channel//(2**(i+1)) + for j, (k, d) in enumerate(zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)): + self.resblocks.append(resblock(h, ch, k, d)) + + self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3)) + self.ups.apply(init_weights) + self.conv_post.apply(init_weights) + + def forward(self, x): + x = self.conv_pre(x) + for i in range(self.num_upsamples): + x = F.leaky_relu(x, LRELU_SLOPE) + x = self.ups[i](x) + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i*self.num_kernels+j](x) + else: + xs += self.resblocks[i*self.num_kernels+j](x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + + return x + + def remove_weight_norm(self): + print('Removing weight norm...') + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + remove_weight_norm(self.conv_pre) + remove_weight_norm(self.conv_post) + + +class DiscriminatorP(torch.nn.Module): + def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): + super(DiscriminatorP, self).__init__() + self.period = period + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList([ + norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), + norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), + norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), + norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), + norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))), + ]) + self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) + + def forward(self, x): + fmap = [] + + # 1d to 2d + b, c, t = x.shape + if t % self.period != 0: # pad first + n_pad = self.period - (t % self.period) + x = F.pad(x, (0, n_pad), "reflect") + t = t + n_pad + x = x.view(b, c, t // 
self.period, self.period) + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class MultiPeriodDiscriminator(torch.nn.Module): + def __init__(self): + super(MultiPeriodDiscriminator, self).__init__() + self.discriminators = nn.ModuleList([ + DiscriminatorP(2), + DiscriminatorP(3), + DiscriminatorP(5), + DiscriminatorP(7), + DiscriminatorP(11), + ]) + + def forward(self, y, y_hat): + y_d_rs = [] + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + y_d_rs.append(y_d_r) + fmap_rs.append(fmap_r) + y_d_gs.append(y_d_g) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class DiscriminatorS(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(DiscriminatorS, self).__init__() + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList([ + norm_f(Conv1d(1, 128, 15, 1, padding=7)), + norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)), + norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)), + norm_f(Conv1d(256, 512, 41, 4, groups=16, padding=20)), + norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)), + norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)), + norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), + ]) + self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) + + def forward(self, x): + fmap = [] + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class MultiScaleDiscriminator(torch.nn.Module): + def __init__(self): + super(MultiScaleDiscriminator, self).__init__() + self.discriminators = nn.ModuleList([ + DiscriminatorS(use_spectral_norm=True), + DiscriminatorS(), + DiscriminatorS(), + ]) + self.meanpools = nn.ModuleList([ + 
AvgPool1d(4, 2, padding=2), + AvgPool1d(4, 2, padding=2) + ]) + + def forward(self, y, y_hat): + y_d_rs = [] + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + if i != 0: + y = self.meanpools[i-1](y) + y_hat = self.meanpools[i-1](y_hat) + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + y_d_rs.append(y_d_r) + fmap_rs.append(fmap_r) + y_d_gs.append(y_d_g) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +def feature_loss(fmap_r, fmap_g): + loss = 0 + for dr, dg in zip(fmap_r, fmap_g): + for rl, gl in zip(dr, dg): + loss += torch.mean(torch.abs(rl - gl)) + + return loss*2 + + +def discriminator_loss(disc_real_outputs, disc_generated_outputs): + loss = 0 + r_losses = [] + g_losses = [] + for dr, dg in zip(disc_real_outputs, disc_generated_outputs): + r_loss = torch.mean((1-dr)**2) + g_loss = torch.mean(dg**2) + loss += (r_loss + g_loss) + r_losses.append(r_loss.item()) + g_losses.append(g_loss.item()) + + return loss, r_losses, g_losses + + +def generator_loss(disc_outputs): + loss = 0 + gen_losses = [] + for dg in disc_outputs: + l = torch.mean((1-dg)**2) + gen_losses.append(l) + loss += l + + return loss, gen_losses + diff --git a/hifigan/test.py b/hifigan/test.py new file mode 100644 index 0000000..bdbcaf0 --- /dev/null +++ b/hifigan/test.py @@ -0,0 +1,58 @@ +from __future__ import absolute_import, division, print_function, unicode_literals + +import glob +import os +import argparse +import json +import torch +import numpy as np +from scipy.io.wavfile import write +from env import AttrDict +from meldataset import mel_spectrogram, MAX_WAV_VALUE, load_wav +from models import Generator +import soundfile as sf + + +def load_checkpoint(filepath, device): + assert os.path.isfile(filepath) + print("Loading '{}'".format(filepath)) + checkpoint_dict = torch.load(filepath, map_location=device) + print("Complete.") + return checkpoint_dict + + +h = None +device = None + + +with open("config_16k_.json") as f: + 
data = f.read() +json_config = json.loads(data) +h = AttrDict(json_config) +torch.manual_seed(h.seed) +device = torch.device("cpu") + + +generator = Generator(h).to(device) +state_dict_g = load_checkpoint("../../../TTS/Vocoder/outputs/hifi-gan/models/g_00930000", device) +generator.load_state_dict(state_dict_g['generator']) +generator.eval() +generator.remove_weight_norm() + + +mel = np.load("./mel-T0055G0184S0349.wav_00.npy") +mel = torch.FloatTensor(mel.T).to(device) +mel = mel.unsqueeze(0) + + +with torch.no_grad(): + y_g_hat = generator(mel) + audio = y_g_hat.squeeze() + + +audio = audio.cpu().numpy() +sf.write("a.wav", audio, samplerate=16000) + + +# import IPython.display as ipd +# ipd.Audio(audio, rate=16000) \ No newline at end of file diff --git a/hifigan/utils.py b/hifigan/utils.py new file mode 100644 index 0000000..aa2a536 --- /dev/null +++ b/hifigan/utils.py @@ -0,0 +1,58 @@ +import glob +import os +import matplotlib +import torch +from torch.nn.utils import weight_norm +matplotlib.use("Agg") +import matplotlib.pylab as plt + + +def plot_spectrogram(spectrogram): + fig, ax = plt.subplots(figsize=(10, 2)) + im = ax.imshow(spectrogram, aspect="auto", origin="lower", + interpolation='none') + plt.colorbar(im, ax=ax) + + fig.canvas.draw() + plt.close() + + return fig + + +def init_weights(m, mean=0.0, std=0.01): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + m.weight.data.normal_(mean, std) + + +def apply_weight_norm(m): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + weight_norm(m) + + +def get_padding(kernel_size, dilation=1): + return int((kernel_size*dilation - dilation)/2) + + +def load_checkpoint(filepath, device): + assert os.path.isfile(filepath) + print("Loading '{}'".format(filepath)) + checkpoint_dict = torch.load(filepath, map_location=device) + print("Complete.") + return checkpoint_dict + + +def save_checkpoint(filepath, obj): + print("Saving checkpoint to {}".format(filepath)) + 
torch.save(obj, filepath) + print("Complete.") + + +def scan_checkpoint(cp_dir, prefix): + pattern = os.path.join(cp_dir, prefix + '????????') + cp_list = glob.glob(pattern) + if len(cp_list) == 0: + return None + return sorted(cp_list)[-1] + diff --git a/toolbox/__init__.py b/toolbox/__init__.py index c763cb9..090c961 100644 --- a/toolbox/__init__.py +++ b/toolbox/__init__.py @@ -1,7 +1,8 @@ from toolbox.ui import UI from encoder import inference as encoder from synthesizer.inference import Synthesizer -from vocoder import inference as vocoder +from vocoder import inference as rnn_vocoder +from hifigan import inference as gan_vocoder from pathlib import Path from time import perf_counter as timer from toolbox.utterance import Utterance @@ -13,6 +14,9 @@ import librosa import re from audioread.exceptions import NoBackendError +# Use WaveRNN by default +vocoder = rnn_vocoder + # Use this directory structure for your datasets, or modify it to fit your needs recognized_datasets = [ "LibriSpeech/dev-clean", @@ -353,10 +357,20 @@ class Toolbox: self.ui.set_loading(0) def init_vocoder(self): + + global vocoder model_fpath = self.ui.current_vocoder_fpath # Case of Griffin-lim if model_fpath is None: return + + + # Select vocoder based on model name + if model_fpath.name[0] == "g": + vocoder = gan_vocoder + self.ui.log("vocoder is hifigan") + else: + vocoder = rnn_vocoder self.ui.log("Loading the vocoder %s... " % model_fpath) self.ui.set_loading(1)