MockingBird/synthesizer/inference.py

import torch
from synthesizer import audio
from synthesizer.hparams import hparams
from synthesizer.models.tacotron import Tacotron
from synthesizer.utils.symbols import symbols
from synthesizer.utils.text import text_to_sequence
from vocoder.display import simple_table
from pathlib import Path
from typing import Union, List
import numpy as np
import librosa
from utils import logmmse
from pypinyin import lazy_pinyin, Style

class Synthesizer:
    """
    Lazily-loaded wrapper around a Tacotron model that turns texts and speaker embeddings into
    mel spectrograms, plus static helpers for audio loading and mel (de)construction.
    """
    sample_rate = hparams.sample_rate
    hparams = hparams

    def __init__(self, model_fpath: Path, verbose=True):
        """
        The model isn't instantiated and loaded in memory until needed or until load() is called.

        :param model_fpath: path to the trained model file
        :param verbose: if False, prints less information when using the model
        """
        self.model_fpath = model_fpath
        self.verbose = verbose

        # Check for GPU
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        if self.verbose:
            print("Synthesizer using device:", self.device)

        # Tacotron model will be instantiated later on first use.
        self._model = None

    def is_loaded(self):
        """
        Whether the model is loaded in memory.
        """
        return self._model is not None

    def load(self):
        """
        Instantiates and loads the model given the weights file that was passed in the constructor.
        """
        self._model = Tacotron(embed_dims=hparams.tts_embed_dims,
                               num_chars=len(symbols),
                               encoder_dims=hparams.tts_encoder_dims,
                               decoder_dims=hparams.tts_decoder_dims,
                               n_mels=hparams.num_mels,
                               fft_bins=hparams.num_mels,
                               postnet_dims=hparams.tts_postnet_dims,
                               encoder_K=hparams.tts_encoder_K,
                               lstm_dims=hparams.tts_lstm_dims,
                               postnet_K=hparams.tts_postnet_K,
                               num_highways=hparams.tts_num_highways,
                               dropout=hparams.tts_dropout,
                               stop_threshold=hparams.tts_stop_threshold,
                               speaker_embedding_size=hparams.speaker_embedding_size).to(self.device)

        self._model.load(self.model_fpath)
        self._model.eval()

        if self.verbose:
            # Path(...) so that a model path given as a plain string still has a .name
            model_name = Path(self.model_fpath).name
            print("Loaded synthesizer \"%s\" trained to step %d" % (model_name, self._model.state_dict()["step"]))

    def synthesize_spectrograms(self, texts: List[str],
                                embeddings: Union[np.ndarray, List[np.ndarray]],
                                return_alignments=False, style_idx=0, min_stop_token=5, steps=2000):
        """
        Synthesizes mel spectrograms from texts and speaker embeddings.

        Each text is first converted to tone-numbered pinyin with pypinyin, then to a symbol
        sequence with text_to_sequence, and synthesized in batches of
        hparams.synthesis_batch_size.

        :param texts: a list of N text prompts to be synthesized
        :param embeddings: a list of N speaker embeddings, a 2D numpy array of shape (N, 256)
        holding one embedding per row, or a single embedding (which is treated as a list of one)
        :param return_alignments: if True, the alignments produced by the model for the last
        batch are returned along with the spectrograms (None if there was nothing to synthesize)
        :param style_idx: forwarded unchanged to the model's generate()
        :param min_stop_token: forwarded unchanged to the model's generate()
        :param steps: forwarded unchanged to the model's generate()
        (presumably the maximum number of decoder steps -- confirm against Tacotron.generate)
        :return: a list of N melspectrograms as numpy arrays of shape (80, Mi), where Mi is the
        sequence length of spectrogram i, and possibly the alignments.
        """
        # Load the model on the first request.
        if not self.is_loaded():
            self.load()

            # Print some info about the model when it is loaded
            tts_k = self._model.get_step() // 1000

            simple_table([("Tacotron", str(tts_k) + "k"),
                        ("r", self._model.r)])

        print("Read " + str(texts))
        texts = [" ".join(lazy_pinyin(v, style=Style.TONE3, neutral_tone_with_five=True)) for v in texts]
        print("Synthesizing " + str(texts))
        # Preprocess text inputs
        inputs = [text_to_sequence(text, hparams.tts_cleaner_names) for text in texts]

        # Normalise the embeddings to a list with one entry per text.
        if isinstance(embeddings, np.ndarray) and embeddings.ndim == 2:
            # An (N, dim) array holds one embedding per row; wrapping it whole would stack
            # into a 3D tensor further down.
            embeddings = list(embeddings)
        elif not isinstance(embeddings, list):
            embeddings = [embeddings]

        # Batch inputs
        batch_size = hparams.synthesis_batch_size
        batched_inputs = [inputs[i:i+batch_size]
                             for i in range(0, len(inputs), batch_size)]
        batched_embeds = [embeddings[i:i+batch_size]
                             for i in range(0, len(embeddings), batch_size)]

        specs = []
        # Stays None when there is nothing to synthesize, so the return below cannot hit an
        # unbound name.
        alignments = None
        for i, batch in enumerate(batched_inputs, 1):
            if self.verbose:
                print(f"\n| Generating {i}/{len(batched_inputs)}")

            # Pad texts so they are all the same length
            max_text_len = max(len(text) for text in batch)
            chars = np.stack([pad1d(text, max_text_len) for text in batch])

            # Stack speaker embeddings into 2D array for batch processing
            speaker_embeds = np.stack(batched_embeds[i-1])

            # Convert to tensor
            chars = torch.tensor(chars).long().to(self.device)
            speaker_embeddings = torch.tensor(speaker_embeds).float().to(self.device)

            # Inference. No gradients are needed here, so avoid building the autograd graph.
            with torch.no_grad():
                _, mels, alignments = self._model.generate(chars, speaker_embeddings, style_idx=style_idx, min_stop_token=min_stop_token, steps=steps)
            mels = mels.detach().cpu().numpy()
            for m in mels:
                # Trim silence from end of each spectrogram. The size check ends the loop when
                # every frame is below the threshold instead of indexing an empty array.
                while m.shape[-1] > 0 and np.max(m[:, -1]) < hparams.tts_stop_threshold:
                    m = m[:, :-1]
                specs.append(m)

        if self.verbose:
            print("\n\nDone.\n")
        return (specs, alignments) if return_alignments else specs

    @staticmethod
    def load_preprocess_wav(fpath):
        """
        Loads and preprocesses an audio file under the same conditions the audio files were used to
        train the synthesizer.

        The audio is loaded at hparams.sample_rate, optionally peak-normalised to
        hparams.rescaling_max, and denoised with logmmse when it is longer than 0.4 seconds.

        :param fpath: path to the audio file
        :return: the preprocessed waveform as a numpy array
        """
        # sr must be passed by keyword: recent librosa releases reject it as a positional argument.
        wav = librosa.load(str(fpath), sr=hparams.sample_rate)[0]
        if hparams.rescale:
            peak = np.abs(wav).max() if wav.size else 0.0
            # An empty or all-zero clip has no peak to normalise against; dividing would
            # raise or fill the waveform with NaNs.
            if peak > 0:
                wav = wav / peak * hparams.rescaling_max
        # denoise
        if len(wav) > hparams.sample_rate*(0.3+0.1):
            # The noise profile is built from the first and last 0.15 seconds of the clip.
            edge = int(hparams.sample_rate*0.15)
            noise_wav = np.concatenate([wav[:edge], wav[-edge:]])
            profile = logmmse.profile_noise(noise_wav, hparams.sample_rate)
            wav = logmmse.denoise(wav, profile)
        return wav

    @staticmethod
    def make_spectrogram(fpath_or_wav: Union[str, Path, np.ndarray]):
        """
        Creates a mel spectrogram from an audio file in the same manner as the mel spectrograms that
        were fed to the synthesizer when training.

        :param fpath_or_wav: either a path to an audio file (loaded through load_preprocess_wav)
        or an already loaded waveform, which is used as is
        :return: the mel spectrogram as a float32 numpy array
        """
        if isinstance(fpath_or_wav, (str, Path)):
            wav = Synthesizer.load_preprocess_wav(fpath_or_wav)
        else:
            wav = fpath_or_wav

        return audio.melspectrogram(wav, hparams).astype(np.float32)

    @staticmethod
    def griffin_lim(mel):
        """
        Inverts a mel spectrogram using Griffin-Lim. The mel spectrogram is expected to have been built
        with the same parameters present in hparams.py.
        """
        return audio.inv_mel_spectrogram(mel, hparams)


def pad1d(x, max_len, pad_value=0):
    """
    Right-pads a 1D sequence with a constant value up to a given length.

    :param x: the 1D sequence to pad
    :param max_len: the length of the returned array
    :param pad_value: the value appended after the end of x
    :return: a numpy array of length max_len starting with the items of x
    """
    n_missing = max_len - len(x)
    pad_widths = (0, n_missing)  # nothing in front, everything at the end
    return np.pad(x, pad_widths, mode="constant", constant_values=pad_value)
Init to support Chinese Dataset. 2021-08-07 11:56:00 +08:00			`import torch`
			`from synthesizer import audio`
			`from synthesizer.hparams import hparams`
			`from synthesizer.models.tacotron import Tacotron`
			`from synthesizer.utils.symbols import symbols`
			`from synthesizer.utils.text import text_to_sequence`
			`from vocoder.display import simple_table`
			`from pathlib import Path`
			`from typing import Union, List`
			`import numpy as np`
			`import librosa`
Add quick path to preprocess audio, denoise audio when loading in toolbox 2021-08-30 22:22:06 +08:00			`from utils import logmmse`
Add Chinese character input support 2021-08-15 22:53:40 +08:00			`from pypinyin import lazy_pinyin, Style`
Init to support Chinese Dataset. 2021-08-07 11:56:00 +08:00
			`class Synthesizer:`
			`sample_rate = hparams.sample_rate`
			`hparams = hparams`

			`def __init__(self, model_fpath: Path, verbose=True):`
			`"""`
			`The model isn't instantiated and loaded in memory until needed or until load() is called.`

			`:param model_fpath: path to the trained model file`
			`:param verbose: if False, prints less information when using the model`
			`"""`
			`self.model_fpath = model_fpath`
			`self.verbose = verbose`

			`# Check for GPU`
			`if torch.cuda.is_available():`
			`self.device = torch.device("cuda")`
			`else:`
			`self.device = torch.device("cpu")`
			`if self.verbose:`
			`print("Synthesizer using device:", self.device)`

			`# Tacotron model will be instantiated later on first use.`
			`self._model = None`

			`def is_loaded(self):`
			`"""`
			`Whether the model is loaded in memory.`
			`"""`
			`return self._model is not None`

			`def load(self):`
			`"""`
			`Instantiates and loads the model given the weights file that was passed in the constructor.`
			`"""`
			`self._model = Tacotron(embed_dims=hparams.tts_embed_dims,`
			`num_chars=len(symbols),`
			`encoder_dims=hparams.tts_encoder_dims,`
			`decoder_dims=hparams.tts_decoder_dims,`
			`n_mels=hparams.num_mels,`
			`fft_bins=hparams.num_mels,`
			`postnet_dims=hparams.tts_postnet_dims,`
			`encoder_K=hparams.tts_encoder_K,`
			`lstm_dims=hparams.tts_lstm_dims,`
			`postnet_K=hparams.tts_postnet_K,`
			`num_highways=hparams.tts_num_highways,`
			`dropout=hparams.tts_dropout,`
			`stop_threshold=hparams.tts_stop_threshold,`
			`speaker_embedding_size=hparams.speaker_embedding_size).to(self.device)`

			`self._model.load(self.model_fpath)`
			`self._model.eval()`

			`if self.verbose:`
			`print("Loaded synthesizer \"%s\" trained to step %d" % (self.model_fpath.name, self._model.state_dict()["step"]))`

			`def synthesize_spectrograms(self, texts: List[str],`
			`embeddings: Union[np.ndarray, List[np.ndarray]],`
Add max steps control in toolbox 2021-11-06 13:27:11 +08:00			`return_alignments=False, style_idx=0, min_stop_token=5, steps=2000):`
Init to support Chinese Dataset. 2021-08-07 11:56:00 +08:00			`"""`
			`Synthesizes mel spectrograms from texts and speaker embeddings.`

			`:param texts: a list of N text prompts to be synthesized`
			`:param embeddings: a numpy array or list of speaker embeddings of shape (N, 256)`
			`:param return_alignments: if True, a matrix representing the alignments between the`
			`characters`
			`and each decoder output step will be returned for each spectrogram`
			`:return: a list of N melspectrograms as numpy arrays of shape (80, Mi), where Mi is the`
			`sequence length of spectrogram i, and possibly the alignments.`
			`"""`
			`# Load the model on the first request.`
			`if not self.is_loaded():`
			`self.load()`

			`# Print some info about the model when it is loaded`
			`tts_k = self._model.get_step() // 1000`

			`simple_table([("Tacotron", str(tts_k) + "k"),`
			`("r", self._model.r)])`
Add quick path to preprocess audio, denoise audio when loading in toolbox 2021-08-30 22:22:06 +08:00
			`print("Read " + str(texts))`
			`texts = [" ".join(lazy_pinyin(v, style=Style.TONE3, neutral_tone_with_five=True)) for v in texts]`
			`print("Synthesizing " + str(texts))`
Init to support Chinese Dataset. 2021-08-07 11:56:00 +08:00			`# Preprocess text inputs`
【bugfix】 fix bug causing non-sense output for long texts 修复多段文字发音错误 2021-08-22 23:44:25 +08:00			`inputs = [text_to_sequence(text, hparams.tts_cleaner_names) for text in texts]`
Init to support Chinese Dataset. 2021-08-07 11:56:00 +08:00			`if not isinstance(embeddings, list):`
			`embeddings = [embeddings]`

			`# Batch inputs`
			`batched_inputs = [inputs[i:i+hparams.synthesis_batch_size]`
			`for i in range(0, len(inputs), hparams.synthesis_batch_size)]`
			`batched_embeds = [embeddings[i:i+hparams.synthesis_batch_size]`
			`for i in range(0, len(embeddings), hparams.synthesis_batch_size)]`

			`specs = []`
			`for i, batch in enumerate(batched_inputs, 1):`
			`if self.verbose:`
			`print(f"\n| Generating {i}/{len(batched_inputs)}")`

			`# Pad texts so they are all the same length`
			`text_lens = [len(text) for text in batch]`
			`max_text_len = max(text_lens)`
			`chars = [pad1d(text, max_text_len) for text in batch]`
			`chars = np.stack(chars)`

			`# Stack speaker embeddings into 2D array for batch processing`
			`speaker_embeds = np.stack(batched_embeds[i-1])`

			`# Convert to tensor`
			`chars = torch.tensor(chars).long().to(self.device)`
			`speaker_embeddings = torch.tensor(speaker_embeds).float().to(self.device)`

			`# Inference`
Add max steps control in toolbox 2021-11-06 13:27:11 +08:00			`_, mels, alignments = self._model.generate(chars, speaker_embeddings, style_idx=style_idx, min_stop_token=min_stop_token, steps=steps)`
Init to support Chinese Dataset. 2021-08-07 11:56:00 +08:00			`mels = mels.detach().cpu().numpy()`
			`for m in mels:`
			`# Trim silence from end of each spectrogram`
			`while np.max(m[:, -1]) < hparams.tts_stop_threshold:`
			`m = m[:, :-1]`
			`specs.append(m)`

			`if self.verbose:`
			`print("\n\nDone.\n")`
			`return (specs, alignments) if return_alignments else specs`

			`@staticmethod`
			`def load_preprocess_wav(fpath):`
			`"""`
			`Loads and preprocesses an audio file under the same conditions the audio files were used to`
			`train the synthesizer.`
			`"""`
			`wav = librosa.load(str(fpath), hparams.sample_rate)[0]`
			`if hparams.rescale:`
			`wav = wav / np.abs(wav).max() * hparams.rescaling_max`
Add quick path to preprocess audio, denoise audio when loading in toolbox 2021-08-30 22:22:06 +08:00			`# denoise`
			`if len(wav) > hparams.sample_rate*(0.3+0.1):`
			`noise_wav = np.concatenate([wav[:int(hparams.sample_rate*0.15)],`
			`wav[-int(hparams.sample_rate*0.15):]])`
			`profile = logmmse.profile_noise(noise_wav, hparams.sample_rate)`
			`wav = logmmse.denoise(wav, profile)`
Init to support Chinese Dataset. 2021-08-07 11:56:00 +08:00			`return wav`

			`@staticmethod`
			`def make_spectrogram(fpath_or_wav: Union[str, Path, np.ndarray]):`
			`"""`
			`Creates a mel spectrogram from an audio file in the same manner as the mel spectrograms that`
			`were fed to the synthesizer when training.`
			`"""`
			`if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path):`
			`wav = Synthesizer.load_preprocess_wav(fpath_or_wav)`
			`else:`
			`wav = fpath_or_wav`

			`mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)`
			`return mel_spectrogram`

			`@staticmethod`
			`def griffin_lim(mel):`
			`"""`
			`Inverts a mel spectrogram using Griffin-Lim. The mel spectrogram is expected to have been built`
			`with the same parameters present in hparams.py.`
			`"""`
			`return audio.inv_mel_spectrogram(mel, hparams)`


			`def pad1d(x, max_len, pad_value=0):`
			`return np.pad(x, (0, max_len - len(x)), mode="constant", constant_values=pad_value)`