diff --git a/synthesizer/audio.py b/synthesizer/audio.py
index 83dc96c..2e03ae5 100644
--- a/synthesizer/audio.py
+++ b/synthesizer/audio.py
@@ -167,7 +167,7 @@ def _mel_to_linear(mel_spectrogram, hparams):
 
 def _build_mel_basis(hparams):
     assert hparams.fmax <= hparams.sample_rate // 2
-    return librosa.filters.mel(hparams.sample_rate, hparams.n_fft, n_mels=hparams.num_mels,
+    return librosa.filters.mel(sr=hparams.sample_rate, n_fft=hparams.n_fft, n_mels=hparams.num_mels,
                                fmin=hparams.fmin, fmax=hparams.fmax)
 
 def _amp_to_db(x, hparams):
diff --git a/synthesizer/inference.py b/synthesizer/inference.py
index 2b4d15b..3ff856b 100644
--- a/synthesizer/inference.py
+++ b/synthesizer/inference.py
@@ -149,7 +149,7 @@ class Synthesizer:
         Loads and preprocesses an audio file under the same conditions the audio files were used to
         train the synthesizer.
         """
-        wav = librosa.load(str(fpath), hparams.sample_rate)[0]
+        wav = librosa.load(path=str(fpath), sr=hparams.sample_rate)[0]
         if hparams.rescale:
             wav = wav / np.abs(wav).max() * hparams.rescaling_max
         # denoise
diff --git a/synthesizer/models/global_style_token.py b/synthesizer/models/global_style_token.py
index 229b9ef..21ce07e 100644
--- a/synthesizer/models/global_style_token.py
+++ b/synthesizer/models/global_style_token.py
@@ -97,7 +97,7 @@ class STL(nn.Module):
     def forward(self, inputs):
         N = inputs.size(0)
         query = inputs.unsqueeze(1)  # [N, 1, E//2]
-        keys = tFunctional.tanh(self.embed).unsqueeze(0).expand(N, -1, -1)  # [N, token_num, E // num_heads]
+        keys = torch.tanh(self.embed).unsqueeze(0).expand(N, -1, -1)  # [N, token_num, E // num_heads]
         style_embed = self.attention(query, keys)
 
         return style_embed
diff --git a/synthesizer/preprocess_speaker.py b/synthesizer/preprocess_speaker.py
index 88fad38..28ddad4 100644
--- a/synthesizer/preprocess_speaker.py
+++ b/synthesizer/preprocess_speaker.py
@@ -63,7 +63,7 @@ def _process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
 
 def _split_on_silences(wav_fpath, words, hparams):
     # Load the audio waveform
-    wav, _ = librosa.load(wav_fpath, hparams.sample_rate)
+    wav, _ = librosa.load(wav_fpath, sr=hparams.sample_rate)
     wav = librosa.effects.trim(wav, top_db= 40, frame_length=2048, hop_length=512)[0]
     if hparams.rescale:
         wav = wav / np.abs(wav).max() * hparams.rescaling_max
diff --git a/vocoder/wavernn/models/deepmind_version.py b/vocoder/wavernn/models/deepmind_version.py
index 1d973d9..17b33b2 100644
--- a/vocoder/wavernn/models/deepmind_version.py
+++ b/vocoder/wavernn/models/deepmind_version.py
@@ -59,7 +59,7 @@ class WaveRNN(nn.Module) :
         # Compute all gates for coarse and fine
         u = F.sigmoid(R_u + I_u + self.bias_u)
         r = F.sigmoid(R_r + I_r + self.bias_r)
-        e = F.tanh(r * R_e + I_e + self.bias_e)
+        e = torch.tanh(r * R_e + I_e + self.bias_e)
         hidden = u * prev_hidden + (1. - u) * e
 
         # Split the hidden state
@@ -118,7 +118,7 @@ class WaveRNN(nn.Module) :
                 # Compute the coarse gates
                 u = F.sigmoid(R_coarse_u + I_coarse_u + b_coarse_u)
                 r = F.sigmoid(R_coarse_r + I_coarse_r + b_coarse_r)
-                e = F.tanh(r * R_coarse_e + I_coarse_e + b_coarse_e)
+                e = torch.tanh(r * R_coarse_e + I_coarse_e + b_coarse_e)
                 hidden_coarse = u * hidden_coarse + (1. - u) * e
 
                 # Compute the coarse output
@@ -138,7 +138,7 @@ class WaveRNN(nn.Module) :
                 # Compute the fine gates
                 u = F.sigmoid(R_fine_u + I_fine_u + b_fine_u)
                 r = F.sigmoid(R_fine_r + I_fine_r + b_fine_r)
-                e = F.tanh(r * R_fine_e + I_fine_e + b_fine_e)
+                e = torch.tanh(r * R_fine_e + I_fine_e + b_fine_e)
                 hidden_fine = u * hidden_fine + (1. - u) * e
 
                 # Compute the fine output
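
Note on the changes above: librosa 0.10 made sr and n_fft keyword-only in librosa.filters.mel and keyword-only (sr) in librosa.load, and torch.nn.functional.tanh is deprecated in favour of torch.tanh, which is what this patch switches to. Below is a minimal standalone sketch of the updated call signatures; the parameter values and the "example.wav" path are placeholders for illustration, not values taken from the project's hparams.

import librosa
import torch

# Placeholder values standing in for hparams.* (assumed, not from the repo).
sample_rate, n_fft, num_mels, fmin, fmax = 16000, 800, 80, 55, 7600

# librosa >= 0.10: sr and n_fft must be passed as keywords.
mel_basis = librosa.filters.mel(sr=sample_rate, n_fft=n_fft, n_mels=num_mels,
                                fmin=fmin, fmax=fmax)
print(mel_basis.shape)  # (num_mels, n_fft // 2 + 1)

# librosa.load likewise takes the target rate as sr= rather than positionally:
# wav = librosa.load(path="example.wav", sr=sample_rate)[0]  # "example.wav" is a placeholder

# torch.tanh is the supported replacement for the deprecated F.tanh / tFunctional.tanh.
x = torch.randn(4)
print(torch.allclose(torch.tanh(x), torch.nn.functional.tanh(x)))  # True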