diff --git a/synthesizer_tacotron2/.DS_Store b/synthesizer_tacotron2/.DS_Store new file mode 100644 index 0000000..645d813 Binary files /dev/null and b/synthesizer_tacotron2/.DS_Store differ diff --git a/synthesizer_tacotron2/LICENSE.txt b/synthesizer_tacotron2/LICENSE.txt new file mode 100644 index 0000000..3337d45 --- /dev/null +++ b/synthesizer_tacotron2/LICENSE.txt @@ -0,0 +1,24 @@ +MIT License + +Original work Copyright (c) 2018 Rayhane Mama (https://github.com/Rayhane-mamah) +Original work Copyright (c) 2019 fatchord (https://github.com/fatchord) +Modified work Copyright (c) 2019 Corentin Jemine (https://github.com/CorentinJ) +Modified work Copyright (c) 2020 blue-fish (https://github.com/blue-fish) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/synthesizer_tacotron2/__init__.py b/synthesizer_tacotron2/__init__.py new file mode 100644 index 0000000..4287ca8 --- /dev/null +++ b/synthesizer_tacotron2/__init__.py @@ -0,0 +1 @@ +# \ No newline at end of file diff --git a/synthesizer_tacotron2/audio.py b/synthesizer_tacotron2/audio.py new file mode 100644 index 0000000..83dc96c --- /dev/null +++ b/synthesizer_tacotron2/audio.py @@ -0,0 +1,206 @@ +import librosa +import librosa.filters +import numpy as np +from scipy import signal +from scipy.io import wavfile +import soundfile as sf + + +def load_wav(path, sr): + return librosa.core.load(path, sr=sr)[0] + +def save_wav(wav, path, sr): + wav *= 32767 / max(0.01, np.max(np.abs(wav))) + #proposed by @dsmiller + wavfile.write(path, sr, wav.astype(np.int16)) + +def save_wavenet_wav(wav, path, sr): + sf.write(path, wav.astype(np.float32), sr) + +def preemphasis(wav, k, preemphasize=True): + if preemphasize: + return signal.lfilter([1, -k], [1], wav) + return wav + +def inv_preemphasis(wav, k, inv_preemphasize=True): + if inv_preemphasize: + return signal.lfilter([1], [1, -k], wav) + return wav + +#From https://github.com/r9y9/wavenet_vocoder/blob/master/audio.py +def start_and_end_indices(quantized, silence_threshold=2): + for start in range(quantized.size): + if abs(quantized[start] - 127) > silence_threshold: + break + for end in range(quantized.size - 1, 1, -1): + if abs(quantized[end] - 127) > silence_threshold: + break + + assert abs(quantized[start] - 127) > silence_threshold + assert abs(quantized[end] - 127) > silence_threshold + + return start, end + +def get_hop_size(hparams): + hop_size = hparams.hop_size + if hop_size is None: + assert hparams.frame_shift_ms is not None + hop_size = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate) + return hop_size + +def linearspectrogram(wav, hparams): + D = _stft(preemphasis(wav, hparams.preemphasis, hparams.preemphasize), hparams) + S = _amp_to_db(np.abs(D), hparams) - hparams.ref_level_db + + if 
hparams.signal_normalization: + return _normalize(S, hparams) + return S + +def melspectrogram(wav, hparams): + D = _stft(preemphasis(wav, hparams.preemphasis, hparams.preemphasize), hparams) + S = _amp_to_db(_linear_to_mel(np.abs(D), hparams), hparams) - hparams.ref_level_db + + if hparams.signal_normalization: + return _normalize(S, hparams) + return S + +def inv_linear_spectrogram(linear_spectrogram, hparams): + """Converts linear spectrogram to waveform using librosa""" + if hparams.signal_normalization: + D = _denormalize(linear_spectrogram, hparams) + else: + D = linear_spectrogram + + S = _db_to_amp(D + hparams.ref_level_db) #Convert back to linear + + if hparams.use_lws: + processor = _lws_processor(hparams) + D = processor.run_lws(S.astype(np.float64).T ** hparams.power) + y = processor.istft(D).astype(np.float32) + return inv_preemphasis(y, hparams.preemphasis, hparams.preemphasize) + else: + return inv_preemphasis(_griffin_lim(S ** hparams.power, hparams), hparams.preemphasis, hparams.preemphasize) + +def inv_mel_spectrogram(mel_spectrogram, hparams): + """Converts mel spectrogram to waveform using librosa""" + if hparams.signal_normalization: + D = _denormalize(mel_spectrogram, hparams) + else: + D = mel_spectrogram + + S = _mel_to_linear(_db_to_amp(D + hparams.ref_level_db), hparams) # Convert back to linear + + if hparams.use_lws: + processor = _lws_processor(hparams) + D = processor.run_lws(S.astype(np.float64).T ** hparams.power) + y = processor.istft(D).astype(np.float32) + return inv_preemphasis(y, hparams.preemphasis, hparams.preemphasize) + else: + return inv_preemphasis(_griffin_lim(S ** hparams.power, hparams), hparams.preemphasis, hparams.preemphasize) + +def _lws_processor(hparams): + import lws + return lws.lws(hparams.n_fft, get_hop_size(hparams), fftsize=hparams.win_size, mode="speech") + +def _griffin_lim(S, hparams): + """librosa implementation of Griffin-Lim + Based on https://github.com/librosa/librosa/issues/434 + """ + angles = 
np.exp(2j * np.pi * np.random.rand(*S.shape)) + S_complex = np.abs(S).astype(np.complex) + y = _istft(S_complex * angles, hparams) + for i in range(hparams.griffin_lim_iters): + angles = np.exp(1j * np.angle(_stft(y, hparams))) + y = _istft(S_complex * angles, hparams) + return y + +def _stft(y, hparams): + if hparams.use_lws: + return _lws_processor(hparams).stft(y).T + else: + return librosa.stft(y=y, n_fft=hparams.n_fft, hop_length=get_hop_size(hparams), win_length=hparams.win_size) + +def _istft(y, hparams): + return librosa.istft(y, hop_length=get_hop_size(hparams), win_length=hparams.win_size) + +########################################################## +#Those are only correct when using lws!!! (This was messing with Wavenet quality for a long time!) +def num_frames(length, fsize, fshift): + """Compute number of time frames of spectrogram + """ + pad = (fsize - fshift) + if length % fshift == 0: + M = (length + pad * 2 - fsize) // fshift + 1 + else: + M = (length + pad * 2 - fsize) // fshift + 2 + return M + + +def pad_lr(x, fsize, fshift): + """Compute left and right padding + """ + M = num_frames(len(x), fsize, fshift) + pad = (fsize - fshift) + T = len(x) + 2 * pad + r = (M - 1) * fshift + fsize - T + return pad, pad + r +########################################################## +#Librosa correct padding +def librosa_pad_lr(x, fsize, fshift): + return 0, (x.shape[0] // fshift + 1) * fshift - x.shape[0] + +# Conversions +_mel_basis = None +_inv_mel_basis = None + +def _linear_to_mel(spectogram, hparams): + global _mel_basis + if _mel_basis is None: + _mel_basis = _build_mel_basis(hparams) + return np.dot(_mel_basis, spectogram) + +def _mel_to_linear(mel_spectrogram, hparams): + global _inv_mel_basis + if _inv_mel_basis is None: + _inv_mel_basis = np.linalg.pinv(_build_mel_basis(hparams)) + return np.maximum(1e-10, np.dot(_inv_mel_basis, mel_spectrogram)) + +def _build_mel_basis(hparams): + assert hparams.fmax <= hparams.sample_rate // 2 + return 
librosa.filters.mel(hparams.sample_rate, hparams.n_fft, n_mels=hparams.num_mels, + fmin=hparams.fmin, fmax=hparams.fmax) + +def _amp_to_db(x, hparams): + min_level = np.exp(hparams.min_level_db / 20 * np.log(10)) + return 20 * np.log10(np.maximum(min_level, x)) + +def _db_to_amp(x): + return np.power(10.0, (x) * 0.05) + +def _normalize(S, hparams): + if hparams.allow_clipping_in_normalization: + if hparams.symmetric_mels: + return np.clip((2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value, + -hparams.max_abs_value, hparams.max_abs_value) + else: + return np.clip(hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db)), 0, hparams.max_abs_value) + + assert S.max() <= 0 and S.min() - hparams.min_level_db >= 0 + if hparams.symmetric_mels: + return (2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value + else: + return hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db)) + +def _denormalize(D, hparams): + if hparams.allow_clipping_in_normalization: + if hparams.symmetric_mels: + return (((np.clip(D, -hparams.max_abs_value, + hparams.max_abs_value) + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value)) + + hparams.min_level_db) + else: + return ((np.clip(D, 0, hparams.max_abs_value) * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db) + + if hparams.symmetric_mels: + return (((D + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value)) + hparams.min_level_db) + else: + return ((D * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db) diff --git a/synthesizer_tacotron2/feeder.py b/synthesizer_tacotron2/feeder.py new file mode 100644 index 0000000..6fc1b20 --- /dev/null +++ b/synthesizer_tacotron2/feeder.py @@ -0,0 +1,272 @@ +from sklearn.model_selection import train_test_split +from synthesizer.utils.text import text_to_sequence 
+from synthesizer.infolog import log +import tensorflow as tf +import numpy as np +import threading +import time +import os + +_batches_per_group = 64 + +class Feeder: + """ + Feeds batches of data into queue on a background thread. + """ + + def __init__(self, coordinator, metadata_filename, hparams): + super(Feeder, self).__init__() + self._coord = coordinator + self._hparams = hparams + self._cleaner_names = [x.strip() for x in hparams.cleaners.split(",")] + self._train_offset = 0 + self._test_offset = 0 + + # Load metadata + self._mel_dir = os.path.join(os.path.dirname(metadata_filename), "mels") + self._embed_dir = os.path.join(os.path.dirname(metadata_filename), "embeds") + with open(metadata_filename, encoding="utf-8") as f: + self._metadata = [line.strip().split("|") for line in f] + frame_shift_ms = hparams.hop_size / hparams.sample_rate + hours = sum([int(x[4]) for x in self._metadata]) * frame_shift_ms / (3600) + log("Loaded metadata for {} examples ({:.2f} hours)".format(len(self._metadata), hours)) + + #Train test split + if hparams.tacotron_test_size is None: + assert hparams.tacotron_test_batches is not None + + test_size = (hparams.tacotron_test_size if hparams.tacotron_test_size is not None + else hparams.tacotron_test_batches * hparams.tacotron_batch_size) + indices = np.arange(len(self._metadata)) + train_indices, test_indices = train_test_split(indices, + test_size=test_size, random_state=hparams.tacotron_data_random_state) + + #Make sure test_indices is a multiple of batch_size else round up + len_test_indices = self._round_down(len(test_indices), hparams.tacotron_batch_size) + extra_test = test_indices[len_test_indices:] + test_indices = test_indices[:len_test_indices] + train_indices = np.concatenate([train_indices, extra_test]) + + self._train_meta = list(np.array(self._metadata)[train_indices]) + self._test_meta = list(np.array(self._metadata)[test_indices]) + + self.test_steps = len(self._test_meta) // hparams.tacotron_batch_size + + if 
hparams.tacotron_test_size is None: + assert hparams.tacotron_test_batches == self.test_steps + + #pad input sequences with the 0 ( _ ) + self._pad = 0 + #explicitely setting the padding to a value that doesn"t originally exist in the spectogram + #to avoid any possible conflicts, without affecting the output range of the model too much + if hparams.symmetric_mels: + self._target_pad = -hparams.max_abs_value + else: + self._target_pad = 0. + #Mark finished sequences with 1s + self._token_pad = 1. + + with tf.device("/cpu:0"): + # Create placeholders for inputs and targets. Don"t specify batch size because we want + # to be able to feed different batch sizes at eval time. + self._placeholders = [ + tf.placeholder(tf.int32, shape=(None, None), name="inputs"), + tf.placeholder(tf.int32, shape=(None, ), name="input_lengths"), + tf.placeholder(tf.float32, shape=(None, None, hparams.num_mels), + name="mel_targets"), + tf.placeholder(tf.float32, shape=(None, None), name="token_targets"), + tf.placeholder(tf.int32, shape=(None, ), name="targets_lengths"), + tf.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None), + name="split_infos"), + + # SV2TTS + tf.placeholder(tf.float32, shape=(None, hparams.speaker_embedding_size), + name="speaker_embeddings") + ] + + # Create queue for buffering data + queue = tf.FIFOQueue(8, [tf.int32, tf.int32, tf.float32, tf.float32, + tf.int32, tf.int32, tf.float32], name="input_queue") + self._enqueue_op = queue.enqueue(self._placeholders) + self.inputs, self.input_lengths, self.mel_targets, self.token_targets, \ + self.targets_lengths, self.split_infos, self.speaker_embeddings = queue.dequeue() + + self.inputs.set_shape(self._placeholders[0].shape) + self.input_lengths.set_shape(self._placeholders[1].shape) + self.mel_targets.set_shape(self._placeholders[2].shape) + self.token_targets.set_shape(self._placeholders[3].shape) + self.targets_lengths.set_shape(self._placeholders[4].shape) + 
self.split_infos.set_shape(self._placeholders[5].shape) + self.speaker_embeddings.set_shape(self._placeholders[6].shape) + + # Create eval queue for buffering eval data + eval_queue = tf.FIFOQueue(1, [tf.int32, tf.int32, tf.float32, tf.float32, + tf.int32, tf.int32, tf.float32], name="eval_queue") + self._eval_enqueue_op = eval_queue.enqueue(self._placeholders) + self.eval_inputs, self.eval_input_lengths, self.eval_mel_targets, \ + self.eval_token_targets, self.eval_targets_lengths, \ + self.eval_split_infos, self.eval_speaker_embeddings = eval_queue.dequeue() + + self.eval_inputs.set_shape(self._placeholders[0].shape) + self.eval_input_lengths.set_shape(self._placeholders[1].shape) + self.eval_mel_targets.set_shape(self._placeholders[2].shape) + self.eval_token_targets.set_shape(self._placeholders[3].shape) + self.eval_targets_lengths.set_shape(self._placeholders[4].shape) + self.eval_split_infos.set_shape(self._placeholders[5].shape) + self.eval_speaker_embeddings.set_shape(self._placeholders[6].shape) + + + def start_threads(self, session): + self._session = session + thread = threading.Thread(name="background", target=self._enqueue_next_train_group) + thread.daemon = True #Thread will close when parent quits + thread.start() + + thread = threading.Thread(name="background", target=self._enqueue_next_test_group) + thread.daemon = True #Thread will close when parent quits + thread.start() + + def _get_test_groups(self): + meta = self._test_meta[self._test_offset] + self._test_offset += 1 + + text = meta[5] + + input_data = np.asarray(text_to_sequence(text, self._cleaner_names), dtype=np.int32) + mel_target = np.load(os.path.join(self._mel_dir, meta[1])) + #Create parallel sequences containing zeros to represent a non finished sequence + token_target = np.asarray([0.] 
* (len(mel_target) - 1)) + embed_target = np.load(os.path.join(self._embed_dir, meta[2])) + return input_data, mel_target, token_target, embed_target, len(mel_target) + + def make_test_batches(self): + start = time.time() + + # Read a group of examples + n = self._hparams.tacotron_batch_size + r = self._hparams.outputs_per_step + + #Test on entire test set + examples = [self._get_test_groups() for i in range(len(self._test_meta))] + + # Bucket examples based on similar output sequence length for efficiency + examples.sort(key=lambda x: x[-1]) + batches = [examples[i: i+n] for i in range(0, len(examples), n)] + np.random.shuffle(batches) + + log("\nGenerated %d test batches of size %d in %.3f sec" % (len(batches), n, time.time() - start)) + return batches, r + + def _enqueue_next_train_group(self): + while not self._coord.should_stop(): + start = time.time() + + # Read a group of examples + n = self._hparams.tacotron_batch_size + r = self._hparams.outputs_per_step + examples = [self._get_next_example() for i in range(n * _batches_per_group)] + + # Bucket examples based on similar output sequence length for efficiency + examples.sort(key=lambda x: x[-1]) + batches = [examples[i: i+n] for i in range(0, len(examples), n)] + np.random.shuffle(batches) + + log("\nGenerated {} train batches of size {} in {:.3f} sec".format(len(batches), n, time.time() - start)) + for batch in batches: + feed_dict = dict(zip(self._placeholders, self._prepare_batch(batch, r))) + self._session.run(self._enqueue_op, feed_dict=feed_dict) + + def _enqueue_next_test_group(self): + #Create test batches once and evaluate on them for all test steps + test_batches, r = self.make_test_batches() + while not self._coord.should_stop(): + for batch in test_batches: + feed_dict = dict(zip(self._placeholders, self._prepare_batch(batch, r))) + self._session.run(self._eval_enqueue_op, feed_dict=feed_dict) + + def _get_next_example(self): + """Gets a single example (input, mel_target, token_target, 
linear_target, mel_length) from_ disk + """ + if self._train_offset >= len(self._train_meta): + self._train_offset = 0 + np.random.shuffle(self._train_meta) + + meta = self._train_meta[self._train_offset] + self._train_offset += 1 + + text = meta[5] + + input_data = np.asarray(text_to_sequence(text, self._cleaner_names), dtype=np.int32) + mel_target = np.load(os.path.join(self._mel_dir, meta[1])) + #Create parallel sequences containing zeros to represent a non finished sequence + token_target = np.asarray([0.] * (len(mel_target) - 1)) + embed_target = np.load(os.path.join(self._embed_dir, meta[2])) + return input_data, mel_target, token_target, embed_target, len(mel_target) + + def _prepare_batch(self, batches, outputs_per_step): + assert 0 == len(batches) % self._hparams.tacotron_num_gpus + size_per_device = int(len(batches) / self._hparams.tacotron_num_gpus) + np.random.shuffle(batches) + + inputs = None + mel_targets = None + token_targets = None + targets_lengths = None + split_infos = [] + + targets_lengths = np.asarray([x[-1] for x in batches], dtype=np.int32) #Used to mask loss + input_lengths = np.asarray([len(x[0]) for x in batches], dtype=np.int32) + + for i in range(self._hparams.tacotron_num_gpus): + batch = batches[size_per_device*i:size_per_device*(i+1)] + input_cur_device, input_max_len = self._prepare_inputs([x[0] for x in batch]) + inputs = np.concatenate((inputs, input_cur_device), axis=1) if inputs is not None else input_cur_device + mel_target_cur_device, mel_target_max_len = self._prepare_targets([x[1] for x in batch], outputs_per_step) + mel_targets = np.concatenate(( mel_targets, mel_target_cur_device), axis=1) if mel_targets is not None else mel_target_cur_device + + #Pad sequences with 1 to infer that the sequence is done + token_target_cur_device, token_target_max_len = self._prepare_token_targets([x[2] for x in batch], outputs_per_step) + token_targets = np.concatenate((token_targets, token_target_cur_device),axis=1) if token_targets is 
not None else token_target_cur_device + split_infos.append([input_max_len, mel_target_max_len, token_target_max_len]) + + split_infos = np.asarray(split_infos, dtype=np.int32) + + ### SV2TTS ### + + embed_targets = np.asarray([x[3] for x in batches]) + + ############## + + return inputs, input_lengths, mel_targets, token_targets, targets_lengths, \ + split_infos, embed_targets + + def _prepare_inputs(self, inputs): + max_len = max([len(x) for x in inputs]) + return np.stack([self._pad_input(x, max_len) for x in inputs]), max_len + + def _prepare_targets(self, targets, alignment): + max_len = max([len(t) for t in targets]) + data_len = self._round_up(max_len, alignment) + return np.stack([self._pad_target(t, data_len) for t in targets]), data_len + + def _prepare_token_targets(self, targets, alignment): + max_len = max([len(t) for t in targets]) + 1 + data_len = self._round_up(max_len, alignment) + return np.stack([self._pad_token_target(t, data_len) for t in targets]), data_len + + def _pad_input(self, x, length): + return np.pad(x, (0, length - x.shape[0]), mode="constant", constant_values=self._pad) + + def _pad_target(self, t, length): + return np.pad(t, [(0, length - t.shape[0]), (0, 0)], mode="constant", constant_values=self._target_pad) + + def _pad_token_target(self, t, length): + return np.pad(t, (0, length - t.shape[0]), mode="constant", constant_values=self._token_pad) + + def _round_up(self, x, multiple): + remainder = x % multiple + return x if remainder == 0 else x + multiple - remainder + + def _round_down(self, x, multiple): + remainder = x % multiple + return x if remainder == 0 else x - remainder diff --git a/synthesizer_tacotron2/hparams.py b/synthesizer_tacotron2/hparams.py new file mode 100644 index 0000000..7d912ee --- /dev/null +++ b/synthesizer_tacotron2/hparams.py @@ -0,0 +1,272 @@ +import ast +import pprint +from tensorflow.contrib.training import HParams + + + +hparams = HParams( + cleaners="basic_cleaners", + tacotron_gpu_start_idx=0, # 
idx of the first GPU to be used for Tacotron training. + tacotron_num_gpus=1, # Determines the number of gpus in use for Tacotron training. + split_on_cpu=True, + + ### Signal Processing (used in both synthesizer and vocoder) + sample_rate = 16000, + n_fft = 800, + num_mels = 80, + hop_size = 200, # Tacotron uses 12.5 ms frame shift (set to sample_rate * 0.0125) + win_size = 800, # Tacotron uses 50 ms frame length (set to sample_rate * 0.050) + fmin = 55, + min_level_db = -100, + ref_level_db = 20, + max_abs_value = 4., # Gradient explodes if too big, premature convergence if too small. + preemphasis = 0.97, # Filter coefficient to use if preemphasize is True + preemphasize = True, + frame_shift_ms=None, + normalize_for_wavenet=True, + # whether to rescale to [0, 1] for wavenet. (better audio quality) + clip_for_wavenet=True, + + + + ### Tacotron Text-to-Speech (TTS) + tts_embed_dims = 512, # Embedding dimension for the graphemes/phoneme inputs + tts_encoder_dims = 256, + tts_decoder_dims = 128, + tts_postnet_dims = 512, + tts_encoder_K = 5, + tts_lstm_dims = 1024, + tts_postnet_K = 5, + tts_num_highways = 4, + tts_dropout = 0.5, + tts_cleaner_names = ["basic_cleaners"], + tts_stop_threshold = -3.4, # Value below which audio generation ends. 
+ # For example, for a range of [-4, 4], this + # will terminate the sequence at the first + # frame that has all values < -3.4 + + ### Tacotron Training + tts_schedule = [(2, 1e-3, 20_000, 12), # Progressive training schedule + (2, 5e-4, 40_000, 12), # (r, lr, step, batch_size) + (2, 2e-4, 80_000, 12), # + (2, 1e-4, 160_000, 12), # r = reduction factor (# of mel frames + (2, 3e-5, 320_000, 12), # synthesized for each decoder iteration) + (2, 1e-5, 640_000, 12)], # lr = learning rate + + tts_clip_grad_norm = 1.0, # clips the gradient norm to prevent explosion - set to None if not needed + tts_eval_interval = 500, # Number of steps between model evaluation (sample generation) + # Set to -1 to generate after completing epoch, or 0 to disable + + tts_eval_num_samples = 1, # Makes this number of samples + + ### Data Preprocessing + max_mel_frames = 900, + rescale = True, + rescaling_max = 0.9, + synthesis_batch_size = 16, # For vocoder preprocessing and inference. + + ### Mel Visualization and Griffin-Lim + signal_normalization = True, + power = 1.5, + griffin_lim_iters = 60, + + ### Audio processing options + fmax = 7600, # Should not exceed (sample_rate // 2) + allow_clipping_in_normalization = True, # Used when signal_normalization = True + clip_mels_length = True, # If true, discards samples exceeding max_mel_frames + use_lws = False, # "Fast spectrogram phase recovery using local weighted sums" + symmetric_mels = True, # Sets mel range to [-max_abs_value, max_abs_value] if True, + # and [0, max_abs_value] if False + trim_silence = True, # Use with sample_rate of 16000 for best results + silence_threshold=2, + trim_fft_size=512, + trim_hop_size=128, + trim_top_db=23, + + ### SV2TTS + speaker_embedding_size = 256, # Dimension for the speaker embedding + silence_min_duration_split = 0.4, # Duration in seconds of a silence for an utterance to be split + utterance_min_duration = 1.6, # Duration in seconds below which utterances are discarded + + # Tacotron + 
outputs_per_step=2, # Was 1 + # number of frames to generate at each decoding step (increase to speed up computation and + # allows for higher batch size, decreases G&L audio quality) + stop_at_any=True, + # Determines whether the decoder should stop when predicting to any frame or to all of + # them (True works pretty well) + + embedding_dim=512, # dimension of embedding space (these are NOT the speaker embeddings) + + # Encoder parameters + enc_conv_num_layers=3, # number of encoder convolutional layers + enc_conv_kernel_size=(5,), # size of encoder convolution filters for each layer + enc_conv_channels=512, # number of encoder convolutions filters for each layer + encoder_lstm_units=256, # number of lstm units for each direction (forward and backward) + + # Attention mechanism + smoothing=False, # Whether to smooth the attention normalization function + attention_dim=128, # dimension of attention space + attention_filters=32, # number of attention convolution filters + attention_kernel=(31,), # kernel size of attention convolution + cumulative_weights=True, + # Whether to cumulate (sum) all previous attention weights or simply feed previous weights ( + # Recommended: True) + + # Decoder + prenet_layers=[256, 256], # number of layers and number of units of prenet + decoder_layers=2, # number of decoder lstm layers + decoder_lstm_units=1024, # number of decoder lstm units on each layer + max_iters=2000, + # Max decoder steps during inference (Just for safety from infinite loop cases) + + # Residual postnet + postnet_num_layers=5, # number of postnet convolutional layers + postnet_kernel_size=(5,), # size of postnet convolution filters for each layer + postnet_channels=512, # number of postnet convolution filters for each layer + + # CBHG mel->linear postnet + cbhg_kernels=8, + # All kernel sizes from 1 to cbhg_kernels will be used in the convolution bank of CBHG to act + # as "K-grams" + cbhg_conv_channels=128, # Channels of the convolution bank + 
cbhg_pool_size=2, # pooling size of the CBHG + cbhg_projection=256, + # projection channels of the CBHG (1st projection, 2nd is automatically set to num_mels) + cbhg_projection_kernel_size=3, # kernel_size of the CBHG projections + cbhg_highwaynet_layers=4, # Number of HighwayNet layers + cbhg_highway_units=128, # Number of units used in HighwayNet fully connected layers + cbhg_rnn_units=128, + # Number of GRU units used in bidirectional RNN of CBHG block. CBHG output is 2x rnn_units in + # shape + + # Loss params + mask_encoder=True, + # whether to mask encoder padding while computing attention. Set to True for better prosody + # but slower convergence. + mask_decoder=False, + # Whether to use loss mask for padded sequences (if False, loss function will not + # be weighted, else recommended pos_weight = 20) + cross_entropy_pos_weight=20, + # Use class weights to reduce the stop token classes imbalance (by adding more penalty on + # False Negatives (FN)) (1 = disabled) + predict_linear=False, + # Whether to add a post-processing network to the Tacotron to predict linear spectrograms ( + # True mode Not tested!!) + ########################################################################################################################################### + + # Tacotron Training + # Reproduction seeds + tacotron_random_seed=5339, + # Determines initial graph and operations (i.e: model) random state for reproducibility + tacotron_data_random_state=1234, # random state for train test split repeatability + + # performance parameters + tacotron_swap_with_cpu=False, + # Whether to use cpu as support to gpu for decoder computation (Not recommended: may cause + # major slowdowns! Only use when critical!) + + # train/test split ratios, mini-batches sizes + tacotron_batch_size=36, # number of training samples on each training steps (was 32) + # Tacotron Batch synthesis supports ~16x the training batch size (no gradients during + # testing). 
+ # Training Tacotron with unmasked paddings makes it aware of them, which makes synthesis times + # different from training. We thus recommend masking the encoder. + tacotron_synthesis_batch_size=128, + # DO NOT MAKE THIS BIGGER THAN 1 IF YOU DIDN"T TRAIN TACOTRON WITH "mask_encoder=True"!! + tacotron_test_size=0.05, + # % of data to keep as test data, if None, tacotron_test_batches must be not None. (5% is + # enough to have a good idea about overfit) + tacotron_test_batches=None, # number of test batches. + + # Learning rate schedule + tacotron_decay_learning_rate=True, + # boolean, determines if the learning rate will follow an exponential decay + tacotron_start_decay=50000, # Step at which learning decay starts + tacotron_decay_steps=50000, # Determines the learning rate decay slope (UNDER TEST) + tacotron_decay_rate=0.5, # learning rate decay rate (UNDER TEST) + tacotron_initial_learning_rate=1e-3, # starting learning rate + tacotron_final_learning_rate=1e-5, # minimal learning rate + + # Optimization parameters + tacotron_adam_beta1=0.9, # AdamOptimizer beta1 parameter + tacotron_adam_beta2=0.999, # AdamOptimizer beta2 parameter + tacotron_adam_epsilon=1e-6, # AdamOptimizer Epsilon parameter + + # Regularization parameters + tacotron_reg_weight=1e-7, # regularization weight (for L2 regularization) + tacotron_scale_regularization=False, + # Whether to rescale regularization weight to adapt for outputs range (used when reg_weight is + # high and biasing the model) + tacotron_zoneout_rate=0.1, # zoneout rate for all LSTM cells in the network + tacotron_dropout_rate=0.5, # dropout rate for all convolutional layers + prenet + tacotron_clip_gradients=True, # whether to clip gradients + + # Evaluation parameters + natural_eval=False, + # Whether to use 100% natural eval (to evaluate Curriculum Learning performance) or with same + # teacher-forcing ratio as in training (just for overfit) + + # Decoder RNN learning can take be done in one of two ways: + # Teacher 
Forcing: vanilla teacher forcing (usually with ratio = 1). mode="constant" + # Curriculum Learning Scheme: From Teacher-Forcing to sampling from previous outputs is + # function of global step. (teacher forcing ratio decay) mode="scheduled" + # The second approach is inspired by: + # Bengio et al. 2015: Scheduled Sampling for Sequence Prediction with Recurrent Neural Networks. + # Can be found under: https://arxiv.org/pdf/1506.03099.pdf + tacotron_teacher_forcing_mode="constant", + # Can be ("constant" or "scheduled"). "scheduled" mode applies a cosine teacher forcing ratio + # decay. (Preference: scheduled) + tacotron_teacher_forcing_ratio=1., + # Value from [0., 1.], 0.=0%, 1.=100%, determines the % of times we force next decoder + # inputs, Only relevant if mode="constant" + tacotron_teacher_forcing_init_ratio=1., + # initial teacher forcing ratio. Relevant if mode="scheduled" + tacotron_teacher_forcing_final_ratio=0., + # final teacher forcing ratio. Relevant if mode="scheduled" + tacotron_teacher_forcing_start_decay=10000, + # starting point of teacher forcing ratio decay. Relevant if mode="scheduled" + tacotron_teacher_forcing_decay_steps=280000, + # Determines the teacher forcing ratio decay slope. Relevant if mode="scheduled" + tacotron_teacher_forcing_decay_alpha=0., + # teacher forcing ratio decay rate. Relevant if mode="scheduled" + ########################################################################################################################################### + + # Tacotron-2 integration parameters + train_with_GTA=False, + # Whether to use GTA mels to train WaveNet instead of ground truth mels. 
+ ########################################################################################################################################### + + # Eval sentences (if no eval text file was specified during synthesis, these sentences are + # used for eval) + sentences=[ + # From July 8, 2017 New York Times: + "Scientists at the CERN laboratory say they have discovered a new particle.", + "There\"s a way to measure the acute emotional intelligence that has never gone out of " + "style.", + "President Trump met with other leaders at the Group of 20 conference.", + "The Senate\"s bill to repeal and replace the Affordable Care Act is now imperiled.", + # From Google"s Tacotron example page: + "Generative adversarial network or variational auto-encoder.", + "Basilar membrane and otolaryngology are not auto-correlations.", + "He has read the whole thing.", + "He reads books.", + "He thought it was time to present the present.", + "Thisss isrealy awhsome.", + "Punctuation sensitivity, is working.", + "Punctuation sensitivity is working.", + "Peter Piper picked a peck of pickled peppers. How many pickled peppers did Peter Piper pick?", + "She sells sea-shells on the sea-shore. The shells she sells are sea-shells I'm sure.", + "Tajima Airport serves Toyooka.", + # From The web (random long utterance) + "Sequence to sequence models have enjoyed great success in a variety of tasks such as machine translation, speech recognition, and text summarization.\ + This project covers a sequence to sequence model trained to predict a speech representation from an input sequence of characters. 
def hparams_debug_string():
    """Return a printable, alphabetically sorted listing of the hyperparameters.

    The ``sentences`` entry is left out of the listing.
    """
    settings = hparams.values()
    rows = []
    for key in sorted(settings):
        if key == "sentences":
            continue
        rows.append(" %s: %s" % (key, settings[key]))
    return "Hyperparameters:\n" + "\n".join(rows)
def synthesize_spectrograms(self, texts: List[str],
                            embeddings: Union[np.ndarray, List[np.ndarray]],
                            return_alignments=False):
    """
    Synthesizes mel spectrograms from texts and speaker embeddings.

    Every text is first rewritten as space-separated pinyin syllables (tone-number style, the
    neutral tone being written as 5). Texts and embeddings are then cut into batches of
    hparams.synthesis_batch_size, and each batch of texts is synthesized together with the
    embeddings found at the same positions.

    :param texts: a list of N text prompts to be synthesized
    :param embeddings: a list of N speaker embeddings (numpy arrays), or a single embedding,
    which is wrapped in a one-element list
    :param return_alignments: if True, the alignments produced by the model are returned next
    to the spectrograms
    :return: the spectrograms produced by the model and, if requested, the alignments. When
    everything fits in a single batch the model outputs are returned untouched; with several
    batches they are flattened into lists holding one entry per text. Empty input yields
    empty lists.
    :raises IndexError: if there are fewer batches of embeddings than batches of texts
    """
    # Load the model on the first request.
    if not self.is_loaded():
        self.load()

    print("Read " + str(texts))
    texts = [" ".join(lazy_pinyin(v, style=Style.TONE3, neutral_tone_with_five=True)) for v in texts]
    print("Synthesizing " + str(texts))

    if not isinstance(embeddings, list):
        embeddings = [embeddings]

    # NOTE(review): the hparams section of this package visibly declares
    # tacotron_synthesis_batch_size; confirm that synthesis_batch_size is declared as well.
    batch_size = hparams.synthesis_batch_size
    batched_texts = [texts[i:i + batch_size] for i in range(0, len(texts), batch_size)]
    batched_embeds = [embeddings[i:i + batch_size]
                      for i in range(0, len(embeddings), batch_size)]

    batch_outputs = []
    for i, text_batch in enumerate(batched_texts, 1):
        if self.verbose:
            print(f"\n| Generating {i}/{len(batched_texts)}")

        # Stack speaker embeddings into a 2D array for batch processing
        speaker_embeds = np.stack(batched_embeds[i - 1])

        # The model only receives the texts that belong to this batch of embeddings.
        # NOTE(review): the original author was unsure whether the model expects the stacked
        # batch (speaker_embeds) or the raw embeddings list here; confirm against
        # Tacotron2.my_synthesize.
        batch_outputs.append(self._model.my_synthesize(speaker_embeds, text_batch))

    if len(batch_outputs) == 1:
        # Single batch: hand the model outputs back exactly as they were produced
        specs, alignments = batch_outputs[0]
    else:
        # No batch or several batches: one entry per text, in input order.
        # NOTE(review): assumes iterating over either model output yields per-text items.
        specs = [spec for batch_specs, _ in batch_outputs for spec in batch_specs]
        alignments = [alignment for _, batch_alignments in batch_outputs
                      for alignment in batch_alignments]

    if self.verbose:
        print("\n\nDone.\n")
    return (specs, alignments) if return_alignments else specs
@staticmethod
def make_spectrogram(fpath_or_wav: Union[str, Path, np.ndarray]):
    """
    Builds the float32 mel spectrogram of an utterance with the synthesizer hyperparameters.

    A str or Path argument is loaded and preprocessed through load_preprocess_wav; anything
    else is used directly as the waveform.
    """
    given_a_path = isinstance(fpath_or_wav, (str, Path))
    if given_a_path:
        waveform = Synthesizer.load_preprocess_wav(fpath_or_wav)
    else:
        waveform = fpath_or_wav

    return audio.melspectrogram(waveform, hparams).astype(np.float32)
"""Small run logger: echoes messages to stdout, appends them to a log file and can forward
them to a Slack webhook."""
import atexit
import json
from datetime import datetime
from threading import Thread
from urllib.request import Request, urlopen

# Timestamp layout of log-file entries; log() trims the last three digits of %f
_format = "%Y-%m-%d %H:%M:%S.%f"
# Currently open log file, or None when file logging is off
_file = None
# Name of the run, used in the file banner and in Slack messages
_run_name = None
# Slack webhook URL, or None when Slack forwarding is off
_slack_url = None


def init(filename, run_name, slack_url=None):
    """Start logging a new run.

    Closes the log file of a previous run if there is one, opens `filename` in append mode and
    writes a banner announcing the run.

    :param filename: path of the log file to append to
    :param run_name: name of the run, written in the banner and sent along to Slack
    :param slack_url: optional Slack webhook URL used by log(..., slack=True)
    """
    global _file, _run_name, _slack_url
    _close_logfile()
    # Open the file exactly once: a second open() would drop the first handle unclosed
    _file = open(filename, "a")
    _file.write("\n-----------------------------------------------------------------\n")
    _file.write("Starting new {} training run\n".format(run_name))
    _file.write("-----------------------------------------------------------------\n")
    _run_name = run_name
    _slack_url = slack_url


def log(msg, end="\n", slack=False):
    """Print `msg`, append it with a timestamp to the log file and optionally post it to Slack.

    :param msg: message to log
    :param end: line terminator passed to print(); file entries always end with a newline
    :param slack: if True and a webhook URL was given to init(), the message is posted to
    Slack from a separate thread
    """
    print(msg, end=end)
    if _file is not None:
        _file.write("[%s] %s\n" % (datetime.now().strftime(_format)[:-3], msg))
    if slack and _slack_url is not None:
        Thread(target=_send_slack, args=(msg,)).start()


def _close_logfile():
    """Close the current log file, if any, and turn file logging off."""
    global _file
    if _file is not None:
        _file.close()
        _file = None


def _send_slack(msg):
    """Post `msg`, prefixed with the run name, to the configured Slack webhook."""
    req = Request(_slack_url)
    req.add_header("Content-Type", "application/json")
    payload = json.dumps({
        "username": "tacotron",
        "icon_emoji": ":taco:",
        "text": "*%s*: %s" % (_run_name, msg)
    }).encode()
    # The context manager closes the HTTP response instead of leaving it to the GC
    with urlopen(req, payload):
        pass


# Make sure buffered log lines reach the disk when the interpreter exits
atexit.register(_close_logfile)
class TacotronEncoderCell(RNNCell):
    """Tacotron 2 encoder.

    Feeds the inputs to a block of convolutional layers and the result to a bidirectional LSTM
    layer, whose output is the hidden representation (memory) handed to the decoder.
    """

    def __init__(self, convolutional_layers, lstm_layer):
        """Store the two stages of the encoder.

        Args:
            convolutional_layers: callable implementing the encoder convolutional block
            lstm_layer: callable implementing the encoder bidirectional LSTM layer; it is
                called with the convolution output and the input lengths
        """
        super().__init__()
        self._convolutions = convolutional_layers
        self._cell = lstm_layer

    def __call__(self, inputs, input_lengths=None):
        """Encode `inputs` and return the output of the LSTM layer.

        The shape of the convolution output is kept in `self.conv_output_shape` so that it can
        be inspected afterwards.
        """
        # Convolutional block first
        conv_features = self._convolutions(inputs)

        # Then the recurrent layer, which also receives the sequence lengths
        encoded = self._cell(conv_features, input_lengths)

        # For shape visualization
        self.conv_output_shape = conv_features.shape
        return encoded
"time", "alignments", + "alignment_history"))): + """`namedtuple` storing the state of a `TacotronDecoderCell`. + Contains: + - `cell_state`: The state of the wrapped `RNNCell` at the previous time + step. + - `attention`: The attention emitted at the previous time step. + - `time`: int32 scalar containing the current time step. + - `alignments`: A single or tuple of `Tensor`(s) containing the alignments + emitted at the previous time step for each attention mechanism. + - `alignment_history`: a single or tuple of `TensorArray`(s) + containing alignment matrices from all time steps for each attention + mechanism. Call `stack()` on each to convert to a `Tensor`. + """ + def replace(self, **kwargs): + """Clones the current state while overwriting components provided by kwargs. + """ + return super(TacotronDecoderCellState, self)._replace(**kwargs) + +class TacotronDecoderCell(RNNCell): + """Tactron 2 Decoder Cell + Decodes encoder output and previous mel frames into next r frames + + Decoder Step i: + 1) Prenet to compress last output information + 2) Concat compressed inputs with previous context vector (input feeding) * + 3) Decoder RNN (actual decoding) to predict current state s_{i} * + 4) Compute new context vector c_{i} based on s_{i} and a cumulative sum of previous alignments * + 5) Predict new output y_{i} using s_{i} and c_{i} (concatenated) + 6) Predict output ys_{i} using s_{i} and c_{i} (concatenated) + + * : This is typically taking a vanilla LSTM, wrapping it using tensorflow"s attention wrapper, + and wrap that with the prenet before doing an input feeding, and with the prediction layer + that uses RNN states to project on output space. Actions marked with (*) can be replaced with + tensorflow"s attention wrapper call if it was using cumulative alignments instead of previous alignments only. 
def __init__(self, prenet, attention_mechanism, rnn_cell, frame_projection, stop_projection):
    """Store the decoder sub-layers and record the size of the attention (context) vector.

    Args:
        prenet: callable applied to the decoder inputs at every step (the decoder pre-net)
        attention_mechanism: attention mechanism queried at every step; the trailing
            dimension of its `values` tensor gives the size of the attention vector kept in
            the cell state
        rnn_cell: recurrent cell forming the main body of the decoder
        frame_projection: callable projecting the concatenated RNN output and context vector
            to the predicted frames
        stop_projection: callable projecting the same concatenation to the stop-token
            predictions
    """
    super(TacotronDecoderCell, self).__init__()
    #Decoder layers
    self._prenet = prenet
    self._attention_mechanism = attention_mechanism
    self._cell = rnn_cell
    self._frame_projection = frame_projection
    self._stop_projection = stop_projection

    #Static trailing dimension of the attention memory
    memory_shape = self._attention_mechanism.values.get_shape()
    self._attention_layer_size = memory_shape[-1].value
def __call__(self, inputs, state):
    """Run one decoder step.

    Args:
        inputs: decoder inputs of this step, fed to the prenet
        state: `TacotronDecoderCellState` holding the RNN state, the previous context vector,
            the step counter, the alignments handed to the attention mechanism and the
            alignment history
    Returns:
        `((cell_outputs, stop_tokens), next_state)`: the output of the frame projection, the
        output of the stop projection and the `TacotronDecoderCellState` for the next step.
    """
    #Prenet: information bottleneck (essential for learning attention)
    compressed_inputs = self._prenet(inputs)

    #Input feeding: the previous context vector is appended to the prenet output
    rnn_input = tf.concat([compressed_inputs, state.attention], axis=-1)

    #Unidirectional LSTM layers
    rnn_output, rnn_state = self._cell(rnn_input, state.cell_state)

    #The new decoder hidden state is the attention query, following Luong et al. (2015):
    #https://arxiv.org/pdf/1508.04025.pdf
    #The alignments carried in the state are what the mechanism uses for location features
    context_vector, alignments, cumulated_alignments = _compute_attention(
        self._attention_mechanism,
        rnn_output,
        state.alignments,
        attention_layer=None)

    #Both projections read the LSTM output concatenated with the context vector
    projections_input = tf.concat([rnn_output, context_vector], axis=-1)
    cell_outputs = self._frame_projection(projections_input)
    stop_tokens = self._stop_projection(projections_input)

    #Record the alignments of this step at index `time`
    alignment_history = state.alignment_history.write(state.time, alignments)

    #State handed to the next decoder step
    next_state = TacotronDecoderCellState(
        time=state.time + 1,
        cell_state=rnn_state,
        attention=context_vector,
        alignments=cumulated_alignments,
        alignment_history=alignment_history)

    return (cell_outputs, stop_tokens), next_state
def _location_sensitive_score(W_query, W_fil, W_keys):
    """Compute the hybrid (content-based + location-based) attention energies.

    Additive scoring extended with location features, as described in Chorowski et al. (2015),
    "Attention-based models for speech recognition":

        energy = sum(v_a * tanh(W_keys + W_query + W_fil + b_a), axis=2)

    where v_a and b_a are trainable vectors created by this function.

    Args:
        W_query: processed decoder query, expected shape "[batch_size, 1, attention_dim]".
        W_fil: previous alignments processed into location features, expected shape
            "[batch_size, max_time, attention_dim]".
        W_keys: processed memory (typically the encoder outputs), expected shape
            "[batch_size, max_time, attention_dim]".
    Returns:
        The energies with axis 2 summed out, i.e. "[batch_size, max_time]" for the shapes above.
    """
    variable_dtype = W_query.dtype
    # Size of the trainable vectors: the static trailing dimension of the keys when it is
    # known, the dynamic one otherwise
    attention_dim = W_keys.shape[-1].value or array_ops.shape(W_keys)[-1]

    projection = tf.get_variable(
        "attention_variable_projection", shape=[attention_dim], dtype=variable_dtype,
        initializer=tf.contrib.layers.xavier_initializer())
    bias = tf.get_variable(
        "attention_bias", shape=[attention_dim], dtype=variable_dtype,
        initializer=tf.zeros_initializer())

    activated = tf.tanh(W_keys + W_query + W_fil + bias)
    return tf.reduce_sum(projection * activated, [2])
Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben- + gio, “Attention-based models for speech recognition,” in Ad- + vances in Neural Information Processing Systems, 2015, pp. + 577–585. + """ + + def __init__(self, + num_units, + memory, + hparams, + mask_encoder=True, + memory_sequence_length=None, + smoothing=False, + cumulate_weights=True, + name="LocationSensitiveAttention"): + """Construct the Attention mechanism. + Args: + num_units: The depth of the query mechanism. + memory: The memory to query; usually the output of an RNN encoder. This + tensor should be shaped `[batch_size, max_time, ...]`. + mask_encoder (optional): Boolean, whether to mask encoder paddings. + memory_sequence_length (optional): Sequence lengths for the batch entries + in memory. If provided, the memory tensor rows are masked with zeros + for values past the respective sequence lengths. Only relevant if mask_encoder = True. + smoothing (optional): Boolean. Determines which normalization function to use. + Default normalization function (probablity_fn) is softmax. If smoothing is + enabled, we replace softmax with: + a_{i, j} = sigmoid(e_{i, j}) / sum_j(sigmoid(e_{i, j})) + Introduced in: + J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben- + gio, “Attention-based models for speech recognition,” in Ad- + vances in Neural Information Processing Systems, 2015, pp. + 577–585. + This is mainly used if the model wants to attend to multiple input parts + at the same decoding step. We probably won"t be using it since multiple sound + frames may depend on the same character/phone, probably not the way around. + Note: + We still keep it implemented in case we want to test it. They used it in the + paper in the context of speech recognition, where one phoneme may depend on + multiple subsequent sound frames. + name: Name to use when creating ops. 
def __call__(self, query, state):
    """Score the query against the memory and return the alignments and the next state.

    Args:
        query: Tensor of dtype matching `self.values` and shape
            `[batch_size, query_depth]`.
        state: the previous alignments, a Tensor of dtype matching `self.values` and shape
            `[batch_size, alignments_size]` (`alignments_size` is memory's `max_time`).
    Returns:
        alignments: output of the probability function applied to the energies, shaped like
            `state`.
        next_state: `alignments + state` when weights are cumulated, `alignments` otherwise.
    """
    with variable_scope.variable_scope(None, "Location_Sensitive_Attention", [query]):
        # [batch_size, query_depth] -> [batch_size, attention_dim], when a query layer is set
        if self.query_layer:
            projected_query = self.query_layer(query)
        else:
            projected_query = query
        # -> [batch_size, 1, attention_dim]
        projected_query = tf.expand_dims(projected_query, 1)

        # [batch_size, max_time] -> [batch_size, max_time, 1]
        alignments_as_channels = tf.expand_dims(state, axis=2)
        # Convolution over the previous alignments: [batch_size, max_time, filters]
        location_features = self.location_convolution(alignments_as_channels)
        # Projection of the location features: [batch_size, max_time, attention_dim]
        projected_location_features = self.location_layer(location_features)

        # [batch_size, max_time]
        energy = _location_sensitive_score(projected_query, projected_location_features, self.keys)

    # Same shape as the energies
    alignments = self._probability_fn(energy, state)

    # The state accumulates the alignments only when requested at construction time
    next_state = alignments + state if self._cumulate else alignments

    return alignments, next_state
def _rnn_output_size(self):
    """Return the output size of the cell, as seen after the optional output layer.

    Without an output layer this is the cell's own `output_size`. Otherwise every entry of
    that structure gets an unknown batch dimension prepended, is passed through the layer's
    output shape computation, and has the batch dimension stripped again.
    """
    cell_output_size = self._cell.output_size
    if self._output_layer is None:
        return cell_output_size

    def with_unknown_batch(shape):
        return tensor_shape.TensorShape([None]).concatenate(shape)

    def without_batch(shape):
        return shape[1:]

    batched_shapes = nest.map_structure(with_unknown_batch, cell_output_size)
    projected_shapes = self._output_layer._compute_output_shape(  # pylint: disable=protected-access
        batched_shapes)
    return nest.map_structure(without_batch, projected_shapes)
+ """ + with ops.name_scope(name, "CustomDecoderStep", (time, inputs, state)): + #Call outputprojection wrapper cell + (cell_outputs, stop_token), cell_state = self._cell(inputs, state) + + #apply output_layer (if existant) + if self._output_layer is not None: + cell_outputs = self._output_layer(cell_outputs) + sample_ids = self._helper.sample( + time=time, outputs=cell_outputs, state=cell_state) + + (finished, next_inputs, next_state) = self._helper.next_inputs( + time=time, + outputs=cell_outputs, + state=cell_state, + sample_ids=sample_ids, + stop_token_prediction=stop_token) + + outputs = CustomDecoderOutput(cell_outputs, stop_token, sample_ids) + return (outputs, next_state, next_inputs, finished) diff --git a/synthesizer_tacotron2/models/helpers.py b/synthesizer_tacotron2/models/helpers.py new file mode 100644 index 0000000..eec0699 --- /dev/null +++ b/synthesizer_tacotron2/models/helpers.py @@ -0,0 +1,161 @@ +import numpy as np +import tensorflow as tf +from tensorflow.contrib.seq2seq import Helper + + +class TacoTestHelper(Helper): + def __init__(self, batch_size, hparams): + with tf.name_scope("TacoTestHelper"): + self._batch_size = batch_size + self._output_dim = hparams.num_mels + self._reduction_factor = hparams.outputs_per_step + self.stop_at_any = hparams.stop_at_any + + @property + def batch_size(self): + return self._batch_size + + @property + def token_output_size(self): + return self._reduction_factor + + @property + def sample_ids_shape(self): + return tf.TensorShape([]) + + @property + def sample_ids_dtype(self): + return np.int32 + + def initialize(self, name=None): + return (tf.tile([False], [self._batch_size]), _go_frames(self._batch_size, self._output_dim)) + + def sample(self, time, outputs, state, name=None): + return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them + + def next_inputs(self, time, outputs, state, sample_ids, stop_token_prediction, name=None): + """Stop on EOS. 
Otherwise, pass the last output as the next input and pass through state.""" + with tf.name_scope("TacoTestHelper"): + #A sequence is finished when the output probability is > 0.5 + finished = tf.cast(tf.round(stop_token_prediction), tf.bool) + + #Since we are predicting r frames at each step, two modes are + #then possible: + # Stop when the model outputs a p > 0.5 for any frame between r frames (Recommended) + # Stop when the model outputs a p > 0.5 for all r frames (Safer) + #Note: + # With enough training steps, the model should be able to predict when to stop correctly + # and the use of stop_at_any = True would be recommended. If however the model didn"t + # learn to stop correctly yet, (stops too soon) one could choose to use the safer option + # to get a correct synthesis + if self.stop_at_any: + finished = tf.reduce_any(tf.reduce_all(finished, axis=0)) #Recommended + else: + finished = tf.reduce_all(tf.reduce_all(finished, axis=0)) #Safer option + + # Feed last output frame as next input. 
outputs is [N, output_dim * r] + next_inputs = outputs[:, -self._output_dim:] + next_state = state + return (finished, next_inputs, next_state) + + +class TacoTrainingHelper(Helper): + def __init__(self, batch_size, targets, hparams, gta, evaluating, global_step): + # inputs is [N, T_in], targets is [N, T_out, D] + with tf.name_scope("TacoTrainingHelper"): + self._batch_size = batch_size + self._output_dim = hparams.num_mels + self._reduction_factor = hparams.outputs_per_step + self._ratio = tf.convert_to_tensor(hparams.tacotron_teacher_forcing_ratio) + self.gta = gta + self.eval = evaluating + self._hparams = hparams + self.global_step = global_step + + r = self._reduction_factor + # Feed every r-th target frame as input + self._targets = targets[:, r-1::r, :] + + #Maximal sequence length + self._lengths = tf.tile([tf.shape(self._targets)[1]], [self._batch_size]) + + @property + def batch_size(self): + return self._batch_size + + @property + def token_output_size(self): + return self._reduction_factor + + @property + def sample_ids_shape(self): + return tf.TensorShape([]) + + @property + def sample_ids_dtype(self): + return np.int32 + + def initialize(self, name=None): + #Compute teacher forcing ratio for this global step. + #In GTA mode, override teacher forcing scheme to work with full teacher forcing + if self.gta: + self._ratio = tf.convert_to_tensor(1.) #Force GTA model to always feed ground-truth + elif self.eval and self._hparams.natural_eval: + self._ratio = tf.convert_to_tensor(0.) 
#Force eval model to always feed predictions + else: + if self._hparams.tacotron_teacher_forcing_mode == "scheduled": + self._ratio = _teacher_forcing_ratio_decay(self._hparams.tacotron_teacher_forcing_init_ratio, + self.global_step, self._hparams) + + return (tf.tile([False], [self._batch_size]), _go_frames(self._batch_size, self._output_dim)) + + def sample(self, time, outputs, state, name=None): + return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them + + def next_inputs(self, time, outputs, state, sample_ids, stop_token_prediction, name=None): + with tf.name_scope(name or "TacoTrainingHelper"): + #synthesis stop (we let the model see paddings as we mask them when computing loss functions) + finished = (time + 1 >= self._lengths) + + #Pick previous outputs randomly with respect to teacher forcing ratio + next_inputs = tf.cond( + tf.less(tf.random_uniform([], minval=0, maxval=1, dtype=tf.float32), self._ratio), + lambda: self._targets[:, time, :], #Teacher-forcing: return true frame + lambda: outputs[:,-self._output_dim:]) + + #Pass on state + next_state = state + return (finished, next_inputs, next_state) + + +def _go_frames(batch_size, output_dim): + """Returns all-zero frames for a given batch size and output dimension""" + return tf.tile([[0.0]], [batch_size, output_dim]) + +def _teacher_forcing_ratio_decay(init_tfr, global_step, hparams): + ################################################################# + # Narrow Cosine Decay: + + # Phase 1: tfr = 1 + # We only start learning rate decay after 10k steps + + # Phase 2: tfr in ]0, 1[ + # decay reach minimal value at step ~280k + + # Phase 3: tfr = 0 + # clip by minimal teacher forcing ratio value (step >~ 280k) + ################################################################# + #Compute natural cosine decay + tfr = tf.train.cosine_decay(init_tfr, + global_step=global_step - hparams.tacotron_teacher_forcing_start_decay, #tfr = 1 at step 10k + 
class HighwayNet:
    """Highway layer: a gated mix of a ReLU transform of the inputs and the inputs themselves."""

    def __init__(self, units, name=None):
        """
        Args:
            units: integer, number of units of both the transform and the gate layers
            name: variable scope used when the layer is called. Defaults to "HighwayNet"
        """
        self.units = units
        self.scope = name if name is not None else "HighwayNet"

        # Transform branch
        self.H_layer = tf.layers.Dense(units=self.units, activation=tf.nn.relu, name="H")
        # Gate branch; its bias is initialised to -1
        self.T_layer = tf.layers.Dense(
            units=self.units,
            activation=tf.nn.sigmoid,
            name="T",
            bias_initializer=tf.constant_initializer(-1.))

    def __call__(self, inputs):
        """Return `H(inputs) * T(inputs) + inputs * (1 - T(inputs))`."""
        with tf.variable_scope(self.scope):
            transformed = self.H_layer(inputs)
            gate = self.T_layer(inputs)

            gated_transform = transformed * gate
            carried_inputs = inputs * (1. - gate)
            return gated_transform + carried_inputs
class ZoneoutLSTMCell(tf.nn.rnn_cell.RNNCell):
    """Wrapper for tf LSTM to create Zoneout LSTM Cell

    inspired by:
    https://github.com/teganmaharaj/zoneout/blob/master/zoneout_tensorflow.py

    Published by one of "https://arxiv.org/pdf/1606.01305.pdf" paper writers.

    Many thanks to @Ondal90 for pointing this out. You sir are a hero!
    """

    def __init__(self, num_units, is_training, zoneout_factor_cell=0., zoneout_factor_output=0.,
                 state_is_tuple=True, name=None):
        """Initializer with possibility to set different zoneout values for cell/hidden states.

        Args:
            num_units: integer, number of units of the wrapped LSTM cell
            is_training: Boolean, selects the stochastic (training) or the deterministic
                (inference) zoneout formula in `__call__`
            zoneout_factor_cell: zoneout factor applied to the cell state, in [0, 1]
            zoneout_factor_output: zoneout factor applied to the hidden state, in [0, 1]
            state_is_tuple: Boolean, whether states are LSTMStateTuple or one concatenated tensor
            name: name given to the wrapped LSTM cell
        Raises:
            ValueError: if either zoneout factor lies outside [0, 1]
        """
        zm = min(zoneout_factor_output, zoneout_factor_cell)
        zs = max(zoneout_factor_output, zoneout_factor_cell)

        if zm < 0. or zs > 1.:
            raise ValueError("One/both provided Zoneout factors are not in [0, 1]")

        self._cell = tf.nn.rnn_cell.LSTMCell(num_units, state_is_tuple=state_is_tuple, name=name)
        self._zoneout_cell = zoneout_factor_cell
        self._zoneout_outputs = zoneout_factor_output
        self.is_training = is_training
        self.state_is_tuple = state_is_tuple

    @property
    def state_size(self):
        """State size of the wrapped LSTM cell."""
        return self._cell.state_size

    @property
    def output_size(self):
        """Output size of the wrapped LSTM cell."""
        return self._cell.output_size

    def __call__(self, inputs, state, scope=None):
        """Runs vanilla LSTM Cell and applies zoneout.

        Args:
            inputs: input tensor for this step
            state: previous state, an LSTMStateTuple (c, h) when state_is_tuple is True,
                otherwise a single tensor holding c followed by h on axis 1
            scope: scope forwarded to the wrapped LSTM cell
        Returns:
            `(output, new_state)`: the wrapped cell's output and the zoned-out state, in the
            same form (tuple or concatenated tensor) as the input state.
        """
        # Apply vanilla LSTM
        output, new_state = self._cell(inputs, state, scope)

        if self.state_is_tuple:
            (prev_c, prev_h) = state
            (new_c, new_h) = new_state
        else:
            # Concatenated state: c occupies the first num_units columns, h the rest
            num_proj = self._cell._num_units if self._cell._num_proj is None else \
                self._cell._num_proj
            prev_c = tf.slice(state, [0, 0], [-1, self._cell._num_units])
            prev_h = tf.slice(state, [0, self._cell._num_units], [-1, num_proj])
            new_c = tf.slice(new_state, [0, 0], [-1, self._cell._num_units])
            new_h = tf.slice(new_state, [0, self._cell._num_units], [-1, num_proj])

        # Apply zoneout
        if self.is_training:
            # nn.dropout takes keep_prob (probability to keep activations) not drop_prob (
            # probability to mask activations)!
            # The (1 - factor) multiplier cancels dropout's 1 / keep_prob scaling, so each
            # unit either takes its new value or keeps its previous one.
            c = (1 - self._zoneout_cell) * tf.nn.dropout(new_c - prev_c,
                                                         (1 - self._zoneout_cell)) + prev_c
            h = (1 - self._zoneout_outputs) * tf.nn.dropout(new_h - prev_h,
                                                            (1 - self._zoneout_outputs)) + prev_h

        else:
            # Inference: deterministic interpolation between new and previous state
            c = (1 - self._zoneout_cell) * new_c + self._zoneout_cell * prev_c
            h = (1 - self._zoneout_outputs) * new_h + self._zoneout_outputs * prev_h

        if self.state_is_tuple:
            new_state = tf.nn.rnn_cell.LSTMStateTuple(c, h)
        else:
            # tf.concat takes (values, axis); the former (axis, values) order is pre-1.0 API
            new_state = tf.concat([c, h], 1)

        return output, new_state
+ """ + super(EncoderConvolutions, self).__init__() + self.is_training = is_training + + self.kernel_size = hparams.enc_conv_kernel_size + self.channels = hparams.enc_conv_channels + self.activation = activation + self.scope = "enc_conv_layers" if scope is None else scope + self.drop_rate = hparams.tacotron_dropout_rate + self.enc_conv_num_layers = hparams.enc_conv_num_layers + + def __call__(self, inputs): + with tf.variable_scope(self.scope): + x = inputs + for i in range(self.enc_conv_num_layers): + x = conv1d(x, self.kernel_size, self.channels, self.activation, + self.is_training, self.drop_rate, + "conv_layer_{}_".format(i + 1) + self.scope) + return x + + +class EncoderRNN: + """Encoder bidirectional one layer LSTM + """ + + def __init__(self, is_training, size=256, zoneout=0.1, scope=None): + """ + Args: + is_training: Boolean, determines if the model is training or in inference to control + zoneout + size: integer, the number of LSTM units for each direction + zoneout: the zoneout factor + scope: EncoderRNN scope. 
class Prenet:
    """Two fully connected layers used as an information bottleneck for the attention.
    """

    def __init__(self, is_training, layers_sizes=None, drop_rate=0.5, activation=tf.nn.relu,
                 scope=None):
        """
        Args:
            is_training: Boolean, stored on the instance. Note that `__call__` applies
                dropout regardless of this flag.
            layers_sizes: list of integers, the length of the list represents the number of
                pre-net layers and the list values represent the layers number of units.
                Defaults to [256, 256].
            drop_rate: float, dropout rate applied after every dense layer
            activation: callable, activation functions of the prenet layers.
            scope: Prenet scope.
        """
        super(Prenet, self).__init__()
        self.drop_rate = drop_rate

        # The default is built per instance so no list object is shared between instances
        self.layers_sizes = [256, 256] if layers_sizes is None else layers_sizes
        self.activation = activation
        self.is_training = is_training

        self.scope = "prenet" if scope is None else scope

    def __call__(self, inputs):
        """Apply every dense + dropout layer to `inputs` and return the result."""
        x = inputs

        with tf.variable_scope(self.scope):
            for i, size in enumerate(self.layers_sizes):
                dense = tf.layers.dense(x, units=size, activation=self.activation,
                                        name="dense_{}".format(i + 1))
                # The paper discussed introducing diversity in generation at inference time
                # by using a dropout of 0.5 only in prenet layers (in both training and
                # inference), hence training=True instead of self.is_training.
                x = tf.layers.dropout(dense, rate=self.drop_rate, training=True,
                                      name="dropout_{}".format(i + 1) + self.scope)
        return x
class StopProjection:
    """Dense projection of the decoder output to stop token values.

    Returns raw logits in training mode and activated values otherwise.
    """

    def __init__(self, is_training, shape=1, activation=tf.nn.sigmoid, scope=None):
        """
        Args:
            is_training: Boolean, when True the activation is skipped because it is already
                integrated inside the sigmoid cross-entropy loss
            shape: integer, dimensionality of output space. Defaults to 1 (scalar)
            activation: callable, activation function. Only applied when not training
            scope: StopProjection scope.
        """
        super(StopProjection, self).__init__()
        self.scope = scope if scope is not None else "stop_token_projection"
        self.is_training = is_training
        self.shape = shape
        self.activation = activation

    def __call__(self, inputs):
        """Project `inputs` to `self.shape` units; apply the activation only outside training."""
        with tf.variable_scope(self.scope):
            layer_name = "projection_{}".format(self.scope)
            logits = tf.layers.dense(inputs, units=self.shape, activation=None,
                                     name=layer_name)

            # Outside training the caller gets activated values instead of logits
            if not self.is_training:
                return self.activation(logits)
            return logits
def conv1d(inputs, kernel_size, channels, activation, is_training, drop_rate, scope):
    """1-D convolution followed by batch normalization, activation and dropout.

    Args:
        inputs: input tensor of the convolution
        kernel_size: size of the convolution kernel
        channels: number of convolution filters
        activation: callable applied to the batch-normalized output
        is_training: Boolean, passed to both batch normalization and dropout
        drop_rate: dropout rate
        scope: variable scope wrapping the layers; also used in the dropout name
    Returns:
        The output tensor of the dropout layer.
    """
    with tf.variable_scope(scope):
        # "same" padding, no activation inside the convolution itself
        convolved = tf.layers.conv1d(
            inputs,
            filters=channels,
            kernel_size=kernel_size,
            activation=None,
            padding="same")
        normalized = tf.layers.batch_normalization(convolved, training=is_training)
        return tf.layers.dropout(
            activation(normalized),
            rate=drop_rate,
            training=is_training,
            name="dropout_{}".format(scope))
def MaskedMSE(targets, outputs, targets_lengths, hparams, mask=None):
    """Computes a Mean Squared Error weighted by a sequence mask.

    Args:
        targets: target tensor, used as labels
        outputs: predicted tensor, used as predictions
        targets_lengths: lengths used to build the mask when `mask` is None
        hparams: hyper-parameters; outputs_per_step is read when building the mask
        mask: optional precomputed mask with a trailing axis of size 1
    Returns:
        The value of tf.losses.mean_squared_error with the broadcast mask as weights.
    """
    # Build the [batch_size, time_dimension, 1] mask from the lengths when none is given.
    # sequence_mask rounds the time dimension up to a multiple of r, which masks the
    # extra paddings caused by r>1.
    if mask is None:
        mask = sequence_mask(targets_lengths, hparams.outputs_per_step, True)

    # Broadcast the mask over the channel dimension of the targets:
    # [batch_size, time_dimension, channel_dimension(mels)]
    batch = tf.shape(mask)[0]
    steps = tf.shape(mask)[1]
    n_channels = tf.shape(targets)[-1]
    full_mask = mask * tf.ones(shape=[batch, steps, n_channels], dtype=tf.float32)

    # The loss is only evaluated once targets and mask are known to share a shape
    shapes_match = tf.assert_equal(tf.shape(targets), tf.shape(full_mask))
    with tf.control_dependencies([shapes_match]):
        return tf.losses.mean_squared_error(labels=targets, predictions=outputs,
                                            weights=full_mask)
def MaskedLinearLoss(targets, outputs, targets_lengths, hparams, mask=None):
    """Computes a masked MAE loss with priority to low frequencies

    The loss is the average of two masked means of |targets - outputs|: one over all
    frequency bins and one over the bins below 2000 Hz.

    Args:
        targets: target linear spectrograms
        outputs: predicted linear spectrograms
        targets_lengths: lengths used to build the mask when `mask` is None
        hparams: hyper-parameters; reads outputs_per_step, sample_rate and num_freq
        mask: optional precomputed mask with a trailing axis of size 1
    Returns:
        Scalar tensor `0.5 * mean_l1 + 0.5 * mean_l1_low`.
    """

    # [batch_size, time_dimension, 1]
    # example:
    # sequence_mask([1, 3, 2], 5) = [[[1., 0., 0., 0., 0.]],
    #                                [[1., 1., 1., 0., 0.]],
    #                                [[1., 1., 0., 0., 0.]]]
    # Note the maxlen argument that ensures mask shape is compatible with r>1
    # This will by default mask the extra paddings caused by r>1
    if mask is None:
        mask = sequence_mask(targets_lengths, hparams.outputs_per_step, True)

    # [batch_size, time_dimension, channel_dimension(freq)]
    ones = tf.ones(shape=[tf.shape(mask)[0], tf.shape(mask)[1], tf.shape(targets)[-1]],
                   dtype=tf.float32)
    mask_ = mask * ones

    l1 = tf.abs(targets - outputs)
    # Number of frequency bins below 2000 Hz (bins span 0 .. sample_rate / 2)
    n_priority_freq = int(2000 / (hparams.sample_rate * 0.5) * hparams.num_freq)

    with tf.control_dependencies([tf.assert_equal(tf.shape(targets), tf.shape(mask_))]):
        masked_l1 = l1 * mask_
        masked_l1_low = masked_l1[:, :, 0:n_priority_freq]
        mask_low = mask_[:, :, 0:n_priority_freq]

    mean_l1 = tf.reduce_sum(masked_l1) / tf.reduce_sum(mask_)
    # Normalize the low band by its own mask so this term is a true mean over that band;
    # dividing by the full-band mask sum scaled it down by n_priority_freq / num_freq.
    mean_l1_low = tf.reduce_sum(masked_l1_low) / tf.reduce_sum(mask_low)

    return 0.5 * mean_l1 + 0.5 * mean_l1_low
def split_func(x, split_pos):
    """Split `x` along axis 1 into consecutive chunks of the given widths.

    Args:
        x: array indexable as `x[:, a:b]` (a numpy array when called through tf.py_func)
        split_pos: 1-D sequence of chunk widths (numpy array, list or tuple)
    Returns:
        A list with one slice of `x` per width, in order. Columns beyond the sum of
        the widths are not returned.
    """
    pieces = []
    start = 0
    # np.asarray lets plain lists/tuples be used as well as numpy arrays
    for width in np.asarray(split_pos):
        stop = start + int(width)
        pieces.append(x[:, start:stop])
        start = stop
    return pieces
+ """ + if mel_targets is None and stop_token_targets is not None: + raise ValueError("no multi targets were provided but token_targets were given") + if mel_targets is not None and stop_token_targets is None and not gta: + raise ValueError("Mel targets are provided without corresponding token_targets") + if not gta and self._hparams.predict_linear == True and linear_targets is None and \ + is_training: + raise ValueError( + "Model is set to use post processing to predict linear spectrograms in training " + "but no linear targets given!") + if gta and linear_targets is not None: + raise ValueError("Linear spectrogram prediction is not supported in GTA mode!") + if is_training and self._hparams.mask_decoder and targets_lengths is None: + raise RuntimeError( + "Model set to mask paddings but no targets lengths provided for the mask!") + if is_training and is_evaluating: + raise RuntimeError( + "Model can not be in training and evaluation modes at the same time!") + + split_device = "/cpu:0" if self._hparams.tacotron_num_gpus > 1 or \ + self._hparams.split_on_cpu else "/gpu:{}".format( + self._hparams.tacotron_gpu_start_idx) + with tf.device(split_device): + hp = self._hparams + lout_int = [tf.int32] * hp.tacotron_num_gpus + lout_float = [tf.float32] * hp.tacotron_num_gpus + + tower_input_lengths = tf.split(input_lengths, num_or_size_splits=hp.tacotron_num_gpus, + axis=0) + tower_targets_lengths = \ + tf.split(targets_lengths, num_or_size_splits=hp.tacotron_num_gpus, axis=0) if \ + targets_lengths is not None else targets_lengths + + ### SV2TTS ### + + tower_embed_targets = tf.split(embed_targets, num_or_size_splits=hp.tacotron_num_gpus, + axis=0) + + ############## + + p_inputs = tf.py_func(split_func, [inputs, split_infos[:, 0]], lout_int) + p_mel_targets = tf.py_func(split_func, [mel_targets, split_infos[:, 1]], + lout_float) if mel_targets is not None else mel_targets + p_stop_token_targets = tf.py_func(split_func, [stop_token_targets, split_infos[:, 2]], + 
lout_float) if stop_token_targets is not None else \ + stop_token_targets + + tower_inputs = [] + tower_mel_targets = [] + tower_stop_token_targets = [] + + batch_size = tf.shape(inputs)[0] + mel_channels = hp.num_mels + for i in range(hp.tacotron_num_gpus): + tower_inputs.append(tf.reshape(p_inputs[i], [batch_size, -1])) + if p_mel_targets is not None: + tower_mel_targets.append( + tf.reshape(p_mel_targets[i], [batch_size, -1, mel_channels])) + if p_stop_token_targets is not None: + tower_stop_token_targets.append( + tf.reshape(p_stop_token_targets[i], [batch_size, -1])) + + self.tower_decoder_output = [] + self.tower_alignments = [] + self.tower_stop_token_prediction = [] + self.tower_mel_outputs = [] + + tower_embedded_inputs = [] + tower_enc_conv_output_shape = [] + tower_encoder_cond_outputs = [] + tower_residual = [] + tower_projected_residual = [] + + # 1. Declare GPU Devices + gpus = ["/gpu:{}".format(i) for i in + range(hp.tacotron_gpu_start_idx, hp.tacotron_gpu_start_idx + hp.tacotron_num_gpus)] + for i in range(hp.tacotron_num_gpus): + with tf.device(tf.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0", + worker_device=gpus[i])): + with tf.variable_scope("inference") as scope: + assert hp.tacotron_teacher_forcing_mode in ("constant", "scheduled") + if hp.tacotron_teacher_forcing_mode == "scheduled" and is_training: + assert global_step is not None + + # GTA is only used for predicting mels to train Wavenet vocoder, so we ommit + # post processing when doing GTA synthesis + post_condition = hp.predict_linear and not gta + + # Embeddings ==> [batch_size, sequence_length, embedding_dim] + self.embedding_table = tf.get_variable( + "inputs_embedding", [len(symbols), hp.embedding_dim], dtype=tf.float32) + embedded_inputs = tf.nn.embedding_lookup(self.embedding_table, tower_inputs[i]) + + # Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units] + encoder_cell = TacotronEncoderCell( + EncoderConvolutions(is_training, hparams=hp, 
scope="encoder_convolutions"), + EncoderRNN(is_training, size=hp.encoder_lstm_units, + zoneout=hp.tacotron_zoneout_rate, scope="encoder_LSTM")) + + encoder_outputs = encoder_cell(embedded_inputs, tower_input_lengths[i]) + + # For shape visualization purpose + enc_conv_output_shape = encoder_cell.conv_output_shape + + + ### SV2TT2 ### + + # Append the speaker embedding to the encoder output at each timestep + tileable_shape = [-1, 1, self._hparams.speaker_embedding_size] + tileable_embed_targets = tf.reshape(tower_embed_targets[i], tileable_shape) + tiled_embed_targets = tf.tile(tileable_embed_targets, + [1, tf.shape(encoder_outputs)[1], 1]) + encoder_cond_outputs = tf.concat((encoder_outputs, tiled_embed_targets), 2) + + ############## + + + # Decoder Parts + # Attention Decoder Prenet + prenet = Prenet(is_training, layers_sizes=hp.prenet_layers, + drop_rate=hp.tacotron_dropout_rate, scope="decoder_prenet") + # Attention Mechanism + attention_mechanism = LocationSensitiveAttention(hp.attention_dim, + encoder_cond_outputs, + hparams=hp, + mask_encoder=hp.mask_encoder, + memory_sequence_length=tf.reshape( + tower_input_lengths[i], + [-1]), + smoothing=hp.smoothing, + cumulate_weights=hp.cumulative_weights) + # Decoder LSTM Cells + decoder_lstm = DecoderRNN(is_training, layers=hp.decoder_layers, + size=hp.decoder_lstm_units, + zoneout=hp.tacotron_zoneout_rate, + scope="decoder_LSTM") + # Frames Projection layer + frame_projection = FrameProjection(hp.num_mels * hp.outputs_per_step, + scope="linear_transform_projection") + # projection layer + stop_projection = StopProjection(is_training or is_evaluating, shape=hp + .outputs_per_step, + scope="stop_token_projection") + + # Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding) + decoder_cell = TacotronDecoderCell( + prenet, + attention_mechanism, + decoder_lstm, + frame_projection, + stop_projection) + + # Define the helper for our decoder + if is_training or is_evaluating or gta: + self.helper = 
TacoTrainingHelper(batch_size, tower_mel_targets[i], hp, gta, + is_evaluating, global_step) + else: + self.helper = TacoTestHelper(batch_size, hp) + + # initial decoder state + decoder_init_state = decoder_cell.zero_state(batch_size=batch_size, + dtype=tf.float32) + + # Only use max iterations at synthesis time + max_iters = hp.max_iters if not (is_training or is_evaluating) else None + + # Decode + (frames_prediction, stop_token_prediction, + _), final_decoder_state, _ = dynamic_decode( + CustomDecoder(decoder_cell, self.helper, decoder_init_state), + impute_finished=False, + maximum_iterations=max_iters, + swap_memory=hp.tacotron_swap_with_cpu) + + # Reshape outputs to be one output per entry + # ==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels] + decoder_output = tf.reshape(frames_prediction, [batch_size, -1, hp.num_mels]) + stop_token_prediction = tf.reshape(stop_token_prediction, [batch_size, -1]) + + # Postnet + postnet = Postnet(is_training, hparams=hp, scope="postnet_convolutions") + + # Compute residual using post-net ==> [batch_size, decoder_steps * r, + # postnet_channels] + residual = postnet(decoder_output) + + # Project residual to same dimension as mel spectrogram + # ==> [batch_size, decoder_steps * r, num_mels] + residual_projection = FrameProjection(hp.num_mels, scope="postnet_projection") + projected_residual = residual_projection(residual) + + # Compute the mel spectrogram + mel_outputs = decoder_output + projected_residual + + if post_condition: + # Add post-processing CBHG. This does a great job at extracting features + # from mels before projection to Linear specs. 
+ post_cbhg = CBHG(hp.cbhg_kernels, hp.cbhg_conv_channels, hp.cbhg_pool_size, + [hp.cbhg_projection, hp.num_mels], + hp.cbhg_projection_kernel_size, hp.cbhg_highwaynet_layers, + hp.cbhg_highway_units, hp.cbhg_rnn_units, is_training, + name="CBHG_postnet") + + # [batch_size, decoder_steps(mel_frames), cbhg_channels] + post_outputs = post_cbhg(mel_outputs, None) + + # Linear projection of extracted features to make linear spectrogram + linear_specs_projection = FrameProjection(hp.num_freq, + scope="cbhg_linear_specs_projection") + + # [batch_size, decoder_steps(linear_frames), num_freq] + linear_outputs = linear_specs_projection(post_outputs) + + # Grab alignments from the final decoder state + alignments = tf.transpose(final_decoder_state.alignment_history.stack(), + [1, 2, 0]) + + self.tower_decoder_output.append(decoder_output) + self.tower_alignments.append(alignments) + self.tower_stop_token_prediction.append(stop_token_prediction) + self.tower_mel_outputs.append(mel_outputs) + tower_embedded_inputs.append(embedded_inputs) + tower_enc_conv_output_shape.append(enc_conv_output_shape) + tower_encoder_cond_outputs.append(encoder_cond_outputs) + tower_residual.append(residual) + tower_projected_residual.append(projected_residual) + + if post_condition: + self.tower_linear_outputs.append(linear_outputs) + log("initialisation done {}".format(gpus[i])) + + if is_training: + self.ratio = self.helper._ratio + self.tower_inputs = tower_inputs + self.tower_input_lengths = tower_input_lengths + self.tower_mel_targets = tower_mel_targets + # self.tower_linear_targets = tower_linear_targets + self.tower_targets_lengths = tower_targets_lengths + self.tower_stop_token_targets = tower_stop_token_targets + + self.all_vars = tf.trainable_variables() + + log("Initialized Tacotron model. Dimensions (? 
def add_loss(self):
    """Build the per-tower losses and their averages over all towers.

    Fills the ``tower_*_loss`` lists (one entry per GPU tower) and sets the
    averaged ``before_loss``, ``after_loss``, ``stop_token_loss``,
    ``regularization_loss``, ``linear_loss`` and ``loss`` fields.
    ``initialize`` must have been called first: the tower targets, tower
    outputs and ``all_vars`` it stores are read here.
    """
    hparams = self._hparams
    n_towers = hparams.tacotron_num_gpus

    self.tower_before_loss = []
    self.tower_after_loss = []
    self.tower_stop_token_loss = []
    self.tower_regularization_loss = []
    self.tower_linear_loss = []
    self.tower_loss = []

    first_gpu = hparams.tacotron_gpu_start_idx
    devices = ["/gpu:{}".format(idx) for idx in range(first_gpu, first_gpu + n_towers)]

    # Variables whose name contains one of these tags are excluded from the
    # L2 penalty: biases, projection layers, the input embedding and
    # recurrent (RNN/LSTM) weights.
    unregularized_tags = ("bias", "Bias", "_projection", "inputs_embedding", "RNN", "LSTM")

    for tower, device in enumerate(devices):
        device_setter = tf.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0",
                                                       worker_device=device)
        with tf.device(device_setter):
            with tf.variable_scope("loss"):
                mel_targets = self.tower_mel_targets[tower]
                decoder_output = self.tower_decoder_output[tower]

                if hparams.mask_decoder:
                    lengths = self.tower_targets_lengths[tower]
                    # Loss of the predictions before the postnet
                    before = MaskedMSE(mel_targets, decoder_output, lengths,
                                       hparams=self._hparams)
                    # Loss of the predictions after the postnet
                    after = MaskedMSE(mel_targets, self.tower_mel_outputs[tower], lengths,
                                      hparams=self._hparams)
                    # Stop token loss (learns when to stop generating)
                    stop_token_loss = MaskedSigmoidCrossEntropy(
                        self.tower_stop_token_targets[tower],
                        self.tower_stop_token_prediction[tower],
                        lengths,
                        hparams=self._hparams)
                    # The extra L1 term is disabled in the masked setting
                    linear_loss = 0.
                else:
                    before = tf.losses.mean_squared_error(mel_targets, decoder_output)
                    after = tf.losses.mean_squared_error(mel_targets,
                                                         self.tower_mel_outputs[tower])
                    stop_token_loss = tf.reduce_mean(
                        tf.nn.sigmoid_cross_entropy_with_logits(
                            labels=self.tower_stop_token_targets[tower],
                            logits=self.tower_stop_token_prediction[tower]))
                    # Extra L1 loss on the pre-postnet prediction
                    linear_loss = tf.reduce_mean(tf.abs(mel_targets - decoder_output))

                # Optionally rescale the regularization weight by the mel value range
                reg_weight = hparams.tacotron_reg_weight
                if hparams.tacotron_scale_regularization:
                    if hparams.symmetric_mels:
                        scaler = 1. / (2 * hparams.max_abs_value)
                    else:
                        scaler = 1. / (hparams.max_abs_value)
                    reg_weight = hparams.tacotron_reg_weight * scaler

                l2_terms = [tf.nn.l2_loss(var) for var in self.all_vars
                            if not any(tag in var.name for tag in unregularized_tags)]
                regularization = tf.add_n(l2_terms) * reg_weight

                # Record this tower's terms and their sum
                self.tower_before_loss.append(before)
                self.tower_after_loss.append(after)
                self.tower_stop_token_loss.append(stop_token_loss)
                self.tower_regularization_loss.append(regularization)
                self.tower_linear_loss.append(linear_loss)

                loss = before + after + stop_token_loss + regularization + linear_loss
                self.tower_loss.append(loss)

    # Sum every term across towers, then average
    sum_before = sum_after = sum_stop = sum_reg = sum_linear = sum_loss = 0
    per_tower_terms = zip(self.tower_before_loss, self.tower_after_loss,
                          self.tower_stop_token_loss, self.tower_regularization_loss,
                          self.tower_linear_loss, self.tower_loss)
    for before, after, stop_token_loss, regularization, linear_loss, loss in per_tower_terms:
        sum_before += before
        sum_after += after
        sum_stop += stop_token_loss
        sum_reg += regularization
        sum_linear += linear_loss
        sum_loss += loss

    self.before_loss = sum_before / n_towers
    self.after_loss = sum_after / n_towers
    self.stop_token_loss = sum_stop / n_towers
    self.regularization_loss = sum_reg / n_towers
    self.linear_loss = sum_linear / n_towers
    self.loss = sum_loss / n_towers
Declare GPU Devices + gpus = ["/gpu:{}".format(i) for i in + range(hp.tacotron_gpu_start_idx, hp.tacotron_gpu_start_idx + hp.tacotron_num_gpus)] + + grad_device = "/cpu:0" if hp.tacotron_num_gpus > 1 else gpus[0] + + with tf.device(grad_device): + with tf.variable_scope("optimizer") as scope: + if hp.tacotron_decay_learning_rate: + self.decay_steps = hp.tacotron_decay_steps + self.decay_rate = hp.tacotron_decay_rate + self.learning_rate = self._learning_rate_decay( + hp.tacotron_initial_learning_rate, global_step) + else: + self.learning_rate = tf.convert_to_tensor(hp.tacotron_initial_learning_rate) + + optimizer = tf.train.AdamOptimizer(self.learning_rate, hp.tacotron_adam_beta1, + hp.tacotron_adam_beta2, hp.tacotron_adam_epsilon) + + # 2. Compute Gradient + for i in range(hp.tacotron_num_gpus): + # Device placement + with tf.device(tf.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0", + worker_device=gpus[i])): + # agg_loss += self.tower_loss[i] + with tf.variable_scope("optimizer") as scope: + gradients = optimizer.compute_gradients(self.tower_loss[i]) + tower_gradients.append(gradients) + + # 3. Average Gradient + with tf.device(grad_device): + avg_grads = [] + vars = [] + for grad_and_vars in zip(*tower_gradients): + # grads_vars = [(grad1, var), (grad2, var), ...] + grads = [] + for g, _ in grad_and_vars: + expanded_g = tf.expand_dims(g, 0) + # Append on a "tower" dimension which we will average over below. + grads.append(expanded_g) + # Average over the "tower" dimension. + grad = tf.concat(axis=0, values=grads) + grad = tf.reduce_mean(grad, 0) + + v = grad_and_vars[0][1] + avg_grads.append(grad) + vars.append(v) + + self.gradients = avg_grads + # Just for causion + # https://github.com/Rayhane-mamah/Tacotron-2/issues/11 + if hp.tacotron_clip_gradients: + clipped_gradients, _ = tf.clip_by_global_norm(avg_grads, 1.) # __mark 0.5 refer + else: + clipped_gradients = avg_grads + + # Add dependency on UPDATE_OPS; otherwise batchnorm won"t work correctly. 
def _learning_rate_decay(self, init_lr, global_step):
    """Return an exponentially decayed learning rate clipped to a fixed range.

    The decay is driven by ``global_step - hparams.tacotron_start_decay`` and
    uses ``self.decay_steps`` / ``self.decay_rate``. The result is bounded
    below by ``hparams.tacotron_final_learning_rate`` and above by
    ``init_lr``.

    Args:
        init_lr: initial learning rate; also the upper bound of the result.
        global_step: current global training step.
    """
    hparams = self._hparams

    # Shift the step so that decay is measured from the configured start step.
    # NOTE(review): for a decay rate below 1 a negative shifted step yields a
    # value above init_lr, which the upper clip below brings back to init_lr.
    steps_since_start = global_step - hparams.tacotron_start_decay
    decayed = tf.train.exponential_decay(init_lr,
                                         steps_since_start,
                                         self.decay_steps,
                                         self.decay_rate,
                                         name="lr_exponential_decay")

    # Clip to [final learning rate, initial learning rate]
    floored = tf.maximum(decayed, hparams.tacotron_final_learning_rate)
    return tf.minimum(floored, init_lr)
def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
                       skip_existing: bool, hparams, no_alignments: bool,
                       dataset: str):
    """Preprocess one dataset into mels/audio files plus a ``train.txt`` index.

    Args:
        datasets_root: directory that contains the ``dataset`` folder.
        out_dir: output directory; ``mels``, ``audio`` and ``train.txt`` are
            created inside it.
        n_processes: number of worker processes used to process speakers.
        skip_existing: when true, ``train.txt`` is appended to instead of
            being overwritten; the flag is also forwarded to the speaker
            function.
        hparams: hyperparameters; ``sample_rate`` is read here and the object
            is forwarded to the speaker function.
        no_alignments: forwarded unchanged to the speaker function.
        dataset: key into ``data_info`` selecting the dataset layout.

    Raises:
        KeyError: if ``dataset`` is not a key of ``data_info``.
        AssertionError: if an input directory or the transcript file is
            missing.
    """
    dataset_info = data_info[dataset]

    # Gather the input directories
    dataset_root = datasets_root.joinpath(dataset)
    input_dirs = [dataset_root.joinpath(subfolder.strip())
                  for subfolder in dataset_info["subfolders"]]
    print("\n ".join(map(str, ["Using data from:"] + input_dirs)))
    assert all(input_dir.exists() for input_dir in input_dirs)

    # Load the transcripts BEFORE touching any output file, so that a missing
    # transcript cannot truncate an existing train.txt.
    transcript_fpath = dataset_root.joinpath(dataset_info["trans_filepath"])
    assert transcript_fpath.exists(), str(transcript_fpath)+" not exist."
    dict_info = {}
    with open(transcript_fpath, "r", encoding="utf-8") as dict_transcript:
        if "transcript_func" in dataset_info:
            # Dataset-specific transcript parser
            dataset_info["transcript_func"](dict_info, dict_transcript)
        else:
            # Generic format: "<utterance id> <text...>" separated by spaces/tabs
            for line in dict_transcript:
                line = line.strip()
                if not line:
                    # Raw lines still carry "\n", so emptiness must be tested
                    # after stripping.
                    continue
                fields = line.replace("\n", "").replace("\t", " ").split(" ")
                dict_info[fields[0]] = " ".join(fields[1:])

    # Create the output directories for each output file type
    out_dir.joinpath("mels").mkdir(parents=True, exist_ok=True)
    out_dir.joinpath("audio").mkdir(parents=True, exist_ok=True)

    speaker_dirs = list(chain.from_iterable(input_dir.glob("*") for input_dir in input_dirs))
    func = partial(dataset_info["speak_func"], out_dir=out_dir, skip_existing=skip_existing,
                   hparams=hparams, dict_info=dict_info, no_alignments=no_alignments)

    # Write the metadata file. Both the file and the pool are context-managed
    # so they are released even if a worker raises; the imap iterator is fully
    # consumed inside the block, before the pool is torn down.
    metadata_fpath = out_dir.joinpath("train.txt")
    write_mode = "a" if skip_existing else "w"
    with metadata_fpath.open(write_mode, encoding="utf-8") as metadata_file, \
            Pool(n_processes) as pool:
        job = pool.imap(func, speaker_dirs)
        for speaker_metadata in tqdm(job, dataset, len(speaker_dirs), unit="speakers"):
            for metadatum in speaker_metadata:
                metadata_file.write("|".join(str(x) for x in metadatum) + "\n")

    # Verify the contents of the metadata file
    with metadata_fpath.open("r", encoding="utf-8") as metadata_file:
        # Drop the line terminator so it is not counted as a text character
        metadata = [line.rstrip("\n").split("|") for line in metadata_file]
    mel_frames = sum(int(m[4]) for m in metadata)
    timesteps = sum(int(m[3]) for m in metadata)
    sample_rate = hparams.sample_rate
    hours = (timesteps / sample_rate) / 3600
    print("The dataset consists of %d utterances, %d mel frames, %d audio timesteps (%.2f hours)." %
          (len(metadata), mel_frames, timesteps, hours))
    # default=0 keeps the summary from crashing when no utterance was kept
    print("Max input length (text chars): %d" % max((len(m[5]) for m in metadata), default=0))
    print("Max mel frames length: %d" % max((int(m[4]) for m in metadata), default=0))
    print("Max audio timesteps length: %d" % max((int(m[3]) for m in metadata), default=0))
+ # Embed the utterances in separate threads + func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath) + job = Pool(n_processes).imap(func, fpaths) + list(tqdm(job, "Embedding", len(fpaths), unit="utterances")) diff --git a/synthesizer_tacotron2/preprocess_speaker.py b/synthesizer_tacotron2/preprocess_speaker.py new file mode 100644 index 0000000..88fad38 --- /dev/null +++ b/synthesizer_tacotron2/preprocess_speaker.py @@ -0,0 +1,99 @@ +import librosa +import numpy as np + +from encoder import inference as encoder +from utils import logmmse +from synthesizer import audio +from pathlib import Path +from pypinyin import Style +from pypinyin.contrib.neutral_tone import NeutralToneWith5Mixin +from pypinyin.converter import DefaultConverter +from pypinyin.core import Pinyin + +class PinyinConverter(NeutralToneWith5Mixin, DefaultConverter): + pass + +pinyin = Pinyin(PinyinConverter()).pinyin + + +def _process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str, + skip_existing: bool, hparams): + ## FOR REFERENCE: + # For you not to lose your head if you ever wish to change things here or implement your own + # synthesizer. + # - Both the audios and the mel spectrograms are saved as numpy arrays + # - There is no processing done to the audios that will be saved to disk beyond volume + # normalization (in split_on_silences) + # - However, pre-emphasis is applied to the audios before computing the mel spectrogram. This + # is why we re-apply it on the audio on the side of the vocoder. + # - Librosa pads the waveform before computing the mel spectrogram. Here, the waveform is saved + # without extra padding. This means that you won't have an exact relation between the length + # of the wav and of the mel spectrogram. See the vocoder data loader. 
+ + + # Skip existing utterances if needed + mel_fpath = out_dir.joinpath("mels", "mel-%s.npy" % basename) + wav_fpath = out_dir.joinpath("audio", "audio-%s.npy" % basename) + if skip_existing and mel_fpath.exists() and wav_fpath.exists(): + return None + + # Trim silence + if hparams.trim_silence: + wav = encoder.preprocess_wav(wav, normalize=False, trim_silence=True) + + # Skip utterances that are too short + if len(wav) < hparams.utterance_min_duration * hparams.sample_rate: + return None + + # Compute the mel spectrogram + mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32) + mel_frames = mel_spectrogram.shape[1] + + # Skip utterances that are too long + if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length: + return None + + # Write the spectrogram, embed and audio to disk + np.save(mel_fpath, mel_spectrogram.T, allow_pickle=False) + np.save(wav_fpath, wav, allow_pickle=False) + + # Return a tuple describing this training example + return wav_fpath.name, mel_fpath.name, "embed-%s.npy" % basename, len(wav), mel_frames, text + + +def _split_on_silences(wav_fpath, words, hparams): + # Load the audio waveform + wav, _ = librosa.load(wav_fpath, hparams.sample_rate) + wav = librosa.effects.trim(wav, top_db= 40, frame_length=2048, hop_length=512)[0] + if hparams.rescale: + wav = wav / np.abs(wav).max() * hparams.rescaling_max + # denoise, we may not need it here. 
def preprocess_transcript_aishell3(dict_info, dict_transcript):
    """Fill ``dict_info`` from AISHELL-3 style transcript lines.

    Each line is split on spaces/tabs; the first field is the key and every
    second field starting at index 2 is kept, joined by single spaces.
    Blank or whitespace-only lines are skipped.

    Args:
        dict_info: dict updated in place (key -> transcript string).
        dict_transcript: iterable of transcript lines (e.g. an open file).
    """
    for line in dict_transcript:
        line = line.strip()
        if not line:
            # Raw file lines still carry "\n", so test emptiness after stripping
            continue
        fields = line.replace("\n", "").replace("\t", " ").split(" ")
        dict_info[fields[0]] = " ".join(fields[2::2])


def preprocess_transcript_magicdata(dict_info, dict_transcript):
    """Fill ``dict_info`` from MAGICDATA style transcript lines.

    Each line is split on spaces/tabs; the first field is the key, the second
    field is dropped and the remaining fields are joined by single spaces.
    Blank or whitespace-only lines are skipped.

    Args:
        dict_info: dict updated in place (key -> transcript string).
        dict_transcript: iterable of transcript lines (e.g. an open file).
    """
    for line in dict_transcript:
        line = line.strip()
        if not line:
            continue
        fields = line.replace("\n", "").replace("\t", " ").split(" ")
        dict_info[fields[0]] = " ".join(fields[2:])
def run_synthesis(in_dir, out_dir, model_dir, hparams):
    """Generate ground truth-aligned (GTA) mels for vocoder training.

    Reads ``train.txt`` from ``in_dir``, runs the Tacotron model in GTA mode
    batch by batch, and writes the metadata rows of every synthesized batch to
    ``synthesized.txt`` in ``out_dir``.

    Args:
        in_dir: synthesizer data directory holding ``train.txt``, ``mels`` and
            ``embeds``.
        out_dir: output directory; GTA mels are written under ``mels_gta``.
        model_dir: directory containing the ``taco_pretrained`` checkpoints.
        hparams: hyperparameters; ``hop_size``, ``sample_rate`` and
            ``tacotron_synthesis_batch_size`` are read here.

    Returns:
        Path of the written ``synthesized.txt`` file.

    Raises:
        FileNotFoundError: if no checkpoint state is found in the weights
            directory.
    """
    synth_dir = os.path.join(out_dir, "mels_gta")
    os.makedirs(synth_dir, exist_ok=True)
    metadata_filename = os.path.join(in_dir, "train.txt")
    print(hparams_debug_string())

    # Load the model in memory. get_checkpoint_state returns None when no
    # checkpoint is available, so fail with a clear message instead of an
    # AttributeError on None.
    weights_dir = os.path.join(model_dir, "taco_pretrained")
    checkpoint_state = tf.train.get_checkpoint_state(weights_dir)
    if checkpoint_state is None:
        raise FileNotFoundError("No Tacotron checkpoint found in {}".format(weights_dir))
    synth = Tacotron2(checkpoint_state.model_checkpoint_path, hparams, gta=True)

    # Load the metadata, ignoring blank lines
    with open(metadata_filename, encoding="utf-8") as f:
        metadata = [line.strip().split("|") for line in f if line.strip()]
    seconds_per_frame = hparams.hop_size / hparams.sample_rate
    hours = sum(int(x[4]) for x in metadata) * seconds_per_frame / 3600
    print("Loaded metadata for {} examples ({:.2f} hours)".format(len(metadata), hours))

    # Set inputs batch wise
    batch_size = hparams.tacotron_synthesis_batch_size
    batches = [metadata[i: i + batch_size] for i in range(0, len(metadata), batch_size)]
    # All batches must have the same size: drop the last one only when it is
    # actually incomplete, instead of always discarding it.
    if batches and len(batches[-1]) < batch_size:
        dropped = batches.pop()
        print("Dropping last incomplete batch of {} examples".format(len(dropped)))

    print("Starting Synthesis")
    mel_dir = os.path.join(in_dir, "mels")
    embed_dir = os.path.join(in_dir, "embeds")
    meta_out_fpath = os.path.join(out_dir, "synthesized.txt")
    # Written as UTF-8 to match how train.txt is read above
    with open(meta_out_fpath, "w", encoding="utf-8") as file:
        for meta in tqdm(batches):
            texts = [m[5] for m in meta]
            mel_filenames = [os.path.join(mel_dir, m[1]) for m in meta]
            embed_filenames = [os.path.join(embed_dir, m[2]) for m in meta]
            basenames = [os.path.basename(m).replace(".npy", "").replace("mel-", "")
                         for m in mel_filenames]
            synth.synthesize(texts, basenames, synth_dir, None, mel_filenames, embed_filenames)

            for elems in meta:
                file.write("|".join([str(x) for x in elems]) + "\n")

    print("Synthesized mel spectrograms at {}".format(synth_dir))
    return meta_out_fpath
class SynthesizerDataset(Dataset):
    """Dataset of (text sequence, mel spectrogram, speaker embedding) samples.

    Samples are listed in a ``|``-separated metadata file; field 1 is the mel
    filename, field 2 the embedding filename, field 4 the mel frame count and
    field 5 the text. Rows whose frame count is zero are excluded from the
    samples but kept in ``self.metadata``.
    """

    def __init__(self, metadata_fpath: Path, mel_dir: Path, embed_dir: Path, hparams):
        """Index the samples described by ``metadata_fpath``.

        Args:
            metadata_fpath: path of the metadata file.
            mel_dir: directory containing the mel ``.npy`` files.
            embed_dir: directory containing the embedding ``.npy`` files.
            hparams: hyperparameters; ``tts_cleaner_names`` is read when an
                item is loaded.
        """
        print("Using inputs from:\n\t%s\n\t%s\n\t%s" % (metadata_fpath, mel_dir, embed_dir))

        with metadata_fpath.open("r", encoding="utf-8") as metadata_file:
            metadata = [line.split("|") for line in metadata_file]

        # Keep only rows with a non-zero mel frame count (single filtering pass)
        samples = [row for row in metadata if int(row[4])]
        self.samples_fpaths = [(mel_dir.joinpath(row[1]), embed_dir.joinpath(row[2]))
                               for row in samples]
        self.samples_texts = [row[5].strip() for row in samples]
        self.metadata = metadata
        self.hparams = hparams

        print("Found %d samples" % len(self.samples_fpaths))

    def __getitem__(self, index):
        """Return ``(text, mel, embed, index)`` for one sample.

        ``text`` is an int32 array, ``mel`` (loaded transposed) and ``embed``
        are float32 arrays. If ``index`` arrives as a list or tuple, its first
        element is used.
        """
        # The index is sometimes delivered wrapped in a sequence; unwrap it.
        # (The previous `index is list` test compared against the type object
        # itself and therefore never matched.)
        if isinstance(index, (list, tuple)):
            index = index[0]

        mel_path, embed_path = self.samples_fpaths[index]
        mel = np.load(mel_path).T.astype(np.float32)

        # Load the embed
        embed = np.load(embed_path)

        # Get the text and clean it
        text = text_to_sequence(self.samples_texts[index], self.hparams.tts_cleaner_names)

        # Convert the list returned by text_to_sequence to a numpy array
        text = np.asarray(text).astype(np.int32)

        return text, mel, embed.astype(np.float32), index

    def __len__(self):
        """Return the number of usable samples."""
        return len(self.samples_fpaths)
def pad1d(x, max_len, pad_value=0):
    """Right-pad the 1-D sequence ``x`` with ``pad_value`` up to ``max_len``."""
    n_missing = max_len - len(x)
    pad_width = (0, n_missing)
    return np.pad(x, pad_width, mode="constant", constant_values=pad_value)


def pad2d(x, max_len, pad_value=0):
    """Right-pad the last axis of the 2-D array ``x`` with ``pad_value`` up to ``max_len``.

    The first axis is left untouched.
    """
    n_missing = max_len - x.shape[-1]
    pad_width = ((0, 0), (0, n_missing))
    return np.pad(x, pad_width, mode="constant", constant_values=pad_value)
(None, None), name="inputs") + input_lengths = tf.placeholder(tf.int32, (None,), name="input_lengths") + speaker_embeddings = tf.placeholder(tf.float32, (None, hparams.speaker_embedding_size), + name="speaker_embeddings") + targets = tf.placeholder(tf.float32, (None, None, hparams.num_mels), name="mel_targets") + split_infos = tf.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None), name="split_infos") + with tf.variable_scope("Tacotron_model") as scope: + self.model = create_model(model_name, hparams) + if gta: + self.model.initialize(inputs, input_lengths, speaker_embeddings, targets, gta=gta, + split_infos=split_infos) + else: + self.model.initialize(inputs, input_lengths, speaker_embeddings, + split_infos=split_infos) + + self.mel_outputs = self.model.tower_mel_outputs + self.linear_outputs = self.model.tower_linear_outputs if (hparams.predict_linear and not gta) else None + self.alignments = self.model.tower_alignments + self.stop_token_prediction = self.model.tower_stop_token_prediction + self.targets = targets + + self.gta = gta + self._hparams = hparams + #pad input sequences with the 0 ( _ ) + self._pad = 0 + #explicitely setting the padding to a value that doesn"t originally exist in the spectogram + #to avoid any possible conflicts, without affecting the output range of the model too much + if hparams.symmetric_mels: + self._target_pad = -hparams.max_abs_value + else: + self._target_pad = 0. 
def my_synthesize(self, speaker_embeds, texts):
    """
    Lighter synthesis function that directly returns the mel spectrograms.

    Only the first tower's outputs are used. Each mel is cut at the first
    frame whose rounded stop-token value equals 1; when there is no such
    frame the mel is returned untrimmed.

    Returns:
        A pair ``(mels, alignments)`` where ``mels`` is a list of transposed
        mel arrays, one per input text.
    """
    print(texts)

    # Prepare the input
    cleaner_names = [name.strip() for name in self._hparams.cleaners.split(",")]
    seqs = []
    for text in texts:
        seqs.append(np.asarray(text_to_sequence(text, cleaner_names)))
    input_lengths = [len(seq) for seq in seqs]
    input_seqs, max_seq_len = self._prepare_inputs(seqs)
    split_infos = [[max_seq_len, 0, 0, 0]]

    feed_dict = {
        self.inputs: input_seqs,
        self.input_lengths: np.asarray(input_lengths, dtype=np.int32),
        self.split_infos: np.asarray(split_infos, dtype=np.int32),
        self.speaker_embeddings: speaker_embeds
    }

    # Forward it
    fetches = [self.mel_outputs, self.alignments, self.stop_token_prediction]
    tower_mels, tower_alignments, tower_stop_tokens = self.session.run(
        fetches, feed_dict=feed_dict)
    mels = list(tower_mels[0])
    alignments = tower_alignments[0]
    stop_tokens = tower_stop_tokens[0]

    # Trim the output
    for idx, mel in enumerate(mels):
        try:
            stop_frame = list(np.round(stop_tokens[idx])).index(1)
            mels[idx] = mel[:stop_frame, :]
        except ValueError:
            # If no token is generated, we simply do not trim the output
            continue

    transposed = [mel.T for mel in mels]
    return transposed, alignments
self._hparams.tacotron_num_gpus + seqs = [np.asarray(text_to_sequence(text, cleaner_names)) for text in texts] + input_lengths = [len(seq) for seq in seqs] + + size_per_device = len(seqs) // self._hparams.tacotron_num_gpus + + #Pad inputs according to each GPU max length + input_seqs = None + split_infos = [] + for i in range(self._hparams.tacotron_num_gpus): + device_input = seqs[size_per_device*i: size_per_device*(i+1)] + device_input, max_seq_len = self._prepare_inputs(device_input) + input_seqs = np.concatenate((input_seqs, device_input), axis=1) if input_seqs is not None else device_input + split_infos.append([max_seq_len, 0, 0, 0]) + + feed_dict = { + self.inputs: input_seqs, + self.input_lengths: np.asarray(input_lengths, dtype=np.int32), + } + + if self.gta: + np_targets = [np.load(mel_filename) for mel_filename in mel_filenames] + target_lengths = [len(np_target) for np_target in np_targets] + + #pad targets according to each GPU max length + target_seqs = None + for i in range(self._hparams.tacotron_num_gpus): + device_target = np_targets[size_per_device*i: size_per_device*(i+1)] + device_target, max_target_len = self._prepare_targets(device_target, self._hparams.outputs_per_step) + target_seqs = np.concatenate((target_seqs, device_target), axis=1) if target_seqs is not None else device_target + split_infos[i][1] = max_target_len #Not really used but setting it in case for future development maybe? 
+ + feed_dict[self.targets] = target_seqs + assert len(np_targets) == len(texts) + + feed_dict[self.split_infos] = np.asarray(split_infos, dtype=np.int32) + feed_dict[self.speaker_embeddings] = [np.load(f) for f in embed_filenames] + + if self.gta or not hparams.predict_linear: + mels, alignments, stop_tokens = self.session.run( + [self.mel_outputs, self.alignments, self.stop_token_prediction], + feed_dict=feed_dict) + #Linearize outputs (1D arrays) + mels = [mel for gpu_mels in mels for mel in gpu_mels] + alignments = [align for gpu_aligns in alignments for align in gpu_aligns] + stop_tokens = [token for gpu_token in stop_tokens for token in gpu_token] + + if not self.gta: + #Natural batch synthesis + #Get Mel lengths for the entire batch from stop_tokens predictions + target_lengths = self._get_output_lengths(stop_tokens) + + #Take off the batch wise padding + mels = [mel[:target_length, :] for mel, target_length in zip(mels, target_lengths)] + assert len(mels) == len(texts) + + else: + linears, mels, alignments, stop_tokens = self.session.run( + [self.linear_outputs, self.mel_outputs, self.alignments, + self.stop_token_prediction], + feed_dict=feed_dict) + #Linearize outputs (1D arrays) + linears = [linear for gpu_linear in linears for linear in gpu_linear] + mels = [mel for gpu_mels in mels for mel in gpu_mels] + alignments = [align for gpu_aligns in alignments for align in gpu_aligns] + stop_tokens = [token for gpu_token in stop_tokens for token in gpu_token] + + #Natural batch synthesis + #Get Mel/Linear lengths for the entire batch from stop_tokens predictions + # target_lengths = self._get_output_lengths(stop_tokens) + target_lengths = [9999] + + #Take off the batch wise padding + mels = [mel[:target_length, :] for mel, target_length in zip(mels, target_lengths)] + linears = [linear[:target_length, :] for linear, target_length in zip(linears, target_lengths)] + assert len(mels) == len(linears) == len(texts) + + if basenames is None: + raise NotImplemented() 
+ + saved_mels_paths = [] + for i, mel in enumerate(mels): + # Write the spectrogram to disk + # Note: outputs mel-spectrogram files and target ones have same names, just different folders + mel_filename = os.path.join(out_dir, "mel-{}.npy".format(basenames[i])) + np.save(mel_filename, mel, allow_pickle=False) + saved_mels_paths.append(mel_filename) + + if log_dir is not None: + #save wav (mel -> wav) + wav = audio.inv_mel_spectrogram(mel.T, hparams) + audio.save_wav(wav, os.path.join(log_dir, "wavs/wav-{}-mel.wav".format(basenames[i])), sr=hparams.sample_rate) + + #save alignments + plot.plot_alignment(alignments[i], os.path.join(log_dir, "plots/alignment-{}.png".format(basenames[i])), + title="{}".format(texts[i]), split_title=True, max_len=target_lengths[i]) + + #save mel spectrogram plot + plot.plot_spectrogram(mel, os.path.join(log_dir, "plots/mel-{}.png".format(basenames[i])), + title="{}".format(texts[i]), split_title=True) + + if hparams.predict_linear: + #save wav (linear -> wav) + wav = audio.inv_linear_spectrogram(linears[i].T, hparams) + audio.save_wav(wav, os.path.join(log_dir, "wavs/wav-{}-linear.wav".format(basenames[i])), sr=hparams.sample_rate) + + #save linear spectrogram plot + plot.plot_spectrogram(linears[i], os.path.join(log_dir, "plots/linear-{}.png".format(basenames[i])), + title="{}".format(texts[i]), split_title=True, auto_aspect=True) + + return saved_mels_paths + + def _round_up(self, x, multiple): + remainder = x % multiple + return x if remainder == 0 else x + multiple - remainder + + def _prepare_inputs(self, inputs): + max_len = max([len(x) for x in inputs]) + return np.stack([self._pad_input(x, max_len) for x in inputs]), max_len + + def _pad_input(self, x, length): + return np.pad(x, (0, length - x.shape[0]), mode="constant", constant_values=self._pad) + + def _prepare_targets(self, targets, alignment): + max_len = max([len(t) for t in targets]) + data_len = self._round_up(max_len, alignment) + return np.stack([self._pad_target(t, 
data_len) for t in targets]), data_len + + def _pad_target(self, t, length): + return np.pad(t, [(0, length - t.shape[0]), (0, 0)], mode="constant", constant_values=self._target_pad) + + def _get_output_lengths(self, stop_tokens): + #Determine each mel length by the stop token predictions. (len = first occurence of 1 in stop_tokens row wise) + output_lengths = [row.index(1) for row in np.round(stop_tokens).tolist()] + return output_lengths diff --git a/synthesizer_tacotron2/train.py b/synthesizer_tacotron2/train.py new file mode 100644 index 0000000..6ecf878 --- /dev/null +++ b/synthesizer_tacotron2/train.py @@ -0,0 +1,393 @@ +from synthesizer.utils.symbols import symbols +from synthesizer.utils.text import sequence_to_text +from synthesizer.hparams import hparams_debug_string +from synthesizer.feeder import Feeder +from synthesizer.models import create_model +from synthesizer.utils import ValueWindow, plot +from synthesizer import infolog, audio +from datetime import datetime +from tqdm import tqdm +import tensorflow as tf +import numpy as np +import traceback +import time +import os + +log = infolog.log + + +def add_embedding_stats(summary_writer, embedding_names, paths_to_meta, checkpoint_path): + # Create tensorboard projector + config = tf.contrib.tensorboard.plugins.projector.ProjectorConfig() + config.model_checkpoint_path = checkpoint_path + + for embedding_name, path_to_meta in zip(embedding_names, paths_to_meta): + # Initialize config + embedding = config.embeddings.add() + # Specifiy the embedding variable and the metadata + embedding.tensor_name = embedding_name + embedding.metadata_path = path_to_meta + + # Project the embeddings to space dimensions for visualization + tf.contrib.tensorboard.plugins.projector.visualize_embeddings(summary_writer, config) + + +def add_train_stats(model, hparams): + with tf.variable_scope("stats") as scope: + for i in range(hparams.tacotron_num_gpus): + tf.summary.histogram("mel_outputs %d" % i, 
def add_eval_stats(summary_writer, step, linear_loss, before_loss, after_loss, stop_token_loss,
                   loss):
    """Write the evaluation losses to summary_writer as scalar summaries for step.

    linear_loss may be None, in which case no linear-loss scalar is written.
    """
    tag_prefix = "Tacotron_eval_model/eval_stats/"
    scalars = [
        ("eval_before_loss", before_loss),
        ("eval_after_loss", after_loss),
        ("stop_token_loss", stop_token_loss),
        ("eval_loss", loss),
    ]
    if linear_loss is not None:
        scalars.append(("eval_linear_loss", linear_loss))

    summary = tf.Summary(value=[
        tf.Summary.Value(tag=tag_prefix + name, simple_value=value)
        for name, value in scalars
    ])
    summary_writer.add_summary(summary, step)
def train(log_dir, args, hparams):
    """Train the Tacotron model, periodically evaluating, checkpointing and logging.

    Output directories are created under log_dir (checkpoints in "taco_pretrained",
    plots, wavs, mel spectrograms, eval outputs, tensorboard events and metadata).

    Args:
        log_dir: root directory for every file written during training.
        args: options object; this function reads synthesizer_root, restore,
            tacotron_train_steps, checkpoint_interval, summary_interval,
            eval_interval and embedding_interval from it.
        hparams: hyperparameters; read here for the random seed, predict_linear,
            outputs_per_step and sample_rate, and passed on to the feeder, the
            models and the audio helpers.

    Returns:
        The checkpoint directory once the training loop ends. If an exception is
        raised inside the session it is logged, its traceback is printed, the
        coordinator is asked to stop, and the function falls through without an
        explicit return value (None).
    """
    save_dir = os.path.join(log_dir, "taco_pretrained")
    plot_dir = os.path.join(log_dir, "plots")
    wav_dir = os.path.join(log_dir, "wavs")
    mel_dir = os.path.join(log_dir, "mel-spectrograms")
    eval_dir = os.path.join(log_dir, "eval-dir")
    eval_plot_dir = os.path.join(eval_dir, "plots")
    eval_wav_dir = os.path.join(eval_dir, "wavs")
    tensorboard_dir = os.path.join(log_dir, "tacotron_events")
    meta_folder = os.path.join(log_dir, "metas")
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(plot_dir, exist_ok=True)
    os.makedirs(wav_dir, exist_ok=True)
    os.makedirs(mel_dir, exist_ok=True)
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(eval_plot_dir, exist_ok=True)
    os.makedirs(eval_wav_dir, exist_ok=True)
    os.makedirs(tensorboard_dir, exist_ok=True)
    os.makedirs(meta_folder, exist_ok=True)


    checkpoint_fpath = os.path.join(save_dir, "tacotron_model.ckpt")
    metadat_fpath = os.path.join(args.synthesizer_root, "train.txt")

    log("Checkpoint path: {}".format(checkpoint_fpath))
    log("Loading training data from: {}".format(metadat_fpath))
    log("Using model: Tacotron")
    log(hparams_debug_string())

    # Start by setting a seed for repeatability
    tf.set_random_seed(hparams.tacotron_random_seed)

    # Set up data feeder
    coord = tf.train.Coordinator()
    with tf.variable_scope("datafeeder") as scope:
        feeder = Feeder(coord, metadat_fpath, hparams)

    # Set up model:
    global_step = tf.Variable(0, name="global_step", trainable=False)
    model, stats = model_train_mode(args, feeder, hparams, global_step)
    eval_model = model_test_mode(args, feeder, hparams, global_step)

    # Embeddings metadata
    # Written once: one symbol per line, for the tensorboard embedding projector.
    char_embedding_meta = os.path.join(meta_folder, "CharacterEmbeddings.tsv")
    if not os.path.isfile(char_embedding_meta):
        with open(char_embedding_meta, "w", encoding="utf-8") as f:
            for symbol in symbols:
                if symbol == " ":
                    symbol = "\\s"  # For visual purposes, swap space with \s

                f.write("{}\n".format(symbol))

    # Replace the log_dir prefix with ".." before handing the path to the projector config.
    char_embedding_meta = char_embedding_meta.replace(log_dir, "..")

    # Book keeping
    step = 0
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)
    saver = tf.train.Saver(max_to_keep=5)

    log("Tacotron training set to a maximum of {} steps".format(args.tacotron_train_steps))

    # Memory allocation on the GPU as needed
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True

    # Train
    with tf.Session(config=config) as sess:
        try:
            summary_writer = tf.summary.FileWriter(tensorboard_dir, sess.graph)

            sess.run(tf.global_variables_initializer())

            # saved model restoring
            if args.restore:
                # Restore saved model if the user requested it, default = True
                try:
                    checkpoint_state = tf.train.get_checkpoint_state(save_dir)

                    if checkpoint_state and checkpoint_state.model_checkpoint_path:
                        log("Loading checkpoint {}".format(checkpoint_state.model_checkpoint_path),
                            slack=True)
                        saver.restore(sess, checkpoint_state.model_checkpoint_path)

                    else:
                        # Nothing to restore: save an initial checkpoint instead.
                        log("No model to load at {}".format(save_dir), slack=True)
                        saver.save(sess, checkpoint_fpath, global_step=global_step)

                except tf.errors.OutOfRangeError as e:
                    log("Cannot restore checkpoint: {}".format(e), slack=True)
            else:
                log("Starting new training!", slack=True)
                saver.save(sess, checkpoint_fpath, global_step=global_step)

            # initializing feeder
            feeder.start_threads(sess)

            # Training loop
            while not coord.should_stop() and step < args.tacotron_train_steps:
                start_time = time.time()
                # One optimisation step; `step` is refreshed from the global_step variable.
                step, loss, opt = sess.run([global_step, model.loss, model.optimize])
                time_window.append(time.time() - start_time)
                loss_window.append(loss)
                message = "Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]".format(
                    step, time_window.average, loss, loss_window.average)
                log(message, end="\r", slack=(step % args.checkpoint_interval == 0))
                print(message)

                # Abort on divergence; the exception is handled by the outer except below.
                if loss > 100 or np.isnan(loss):
                    log("Loss exploded to {:.5f} at step {}".format(loss, step))
                    raise Exception("Loss exploded")

                if step % args.summary_interval == 0:
                    log("\nWriting summary at step {}".format(step))
                    summary_writer.add_summary(sess.run(stats), step)

                if step % args.eval_interval == 0:
                    # Run eval and save eval stats
                    log("\nRunning evaluation at step {}".format(step))

                    eval_losses = []
                    before_losses = []
                    after_losses = []
                    stop_token_losses = []
                    linear_losses = []
                    linear_loss = None

                    # Losses are averaged over feeder.test_steps runs; the spectrograms and
                    # alignment kept for logging are those of the last run.
                    if hparams.predict_linear:
                        for i in tqdm(range(feeder.test_steps)):
                            eloss, before_loss, after_loss, stop_token_loss, linear_loss, mel_p, \
                            mel_t, t_len, align, lin_p, lin_t = sess.run(
                                [
                                    eval_model.tower_loss[0], eval_model.tower_before_loss[0],
                                    eval_model.tower_after_loss[0],
                                    eval_model.tower_stop_token_loss[0],
                                    eval_model.tower_linear_loss[0],
                                    eval_model.tower_mel_outputs[0][0],
                                    eval_model.tower_mel_targets[0][0],
                                    eval_model.tower_targets_lengths[0][0],
                                    eval_model.tower_alignments[0][0],
                                    eval_model.tower_linear_outputs[0][0],
                                    eval_model.tower_linear_targets[0][0],
                                ])
                            eval_losses.append(eloss)
                            before_losses.append(before_loss)
                            after_losses.append(after_loss)
                            stop_token_losses.append(stop_token_loss)
                            linear_losses.append(linear_loss)
                        linear_loss = sum(linear_losses) / len(linear_losses)

                        wav = audio.inv_linear_spectrogram(lin_p.T, hparams)
                        audio.save_wav(wav, os.path.join(eval_wav_dir,
                                                         "step-{}-eval-wave-from-linear.wav".format(
                                                             step)), sr=hparams.sample_rate)

                    else:
                        for i in tqdm(range(feeder.test_steps)):
                            eloss, before_loss, after_loss, stop_token_loss, mel_p, mel_t, t_len, \
                            align = sess.run(
                                [
                                    eval_model.tower_loss[0], eval_model.tower_before_loss[0],
                                    eval_model.tower_after_loss[0],
                                    eval_model.tower_stop_token_loss[0],
                                    eval_model.tower_mel_outputs[0][0],
                                    eval_model.tower_mel_targets[0][0],
                                    eval_model.tower_targets_lengths[0][0],
                                    eval_model.tower_alignments[0][0]
                                ])
                            eval_losses.append(eloss)
                            before_losses.append(before_loss)
                            after_losses.append(after_loss)
                            stop_token_losses.append(stop_token_loss)

                    eval_loss = sum(eval_losses) / len(eval_losses)
                    before_loss = sum(before_losses) / len(before_losses)
                    after_loss = sum(after_losses) / len(after_losses)
                    stop_token_loss = sum(stop_token_losses) / len(stop_token_losses)

                    log("Saving eval log to {}..".format(eval_dir))
                    # Save some log to monitor model improvement on same unseen sequence
                    wav = audio.inv_mel_spectrogram(mel_p.T, hparams)
                    audio.save_wav(wav, os.path.join(eval_wav_dir,
                                                     "step-{}-eval-wave-from-mel.wav".format(step)),
                                   sr=hparams.sample_rate)

                    plot.plot_alignment(align, os.path.join(eval_plot_dir,
                                                            "step-{}-eval-align.png".format(step)),
                                        title="{}, {}, step={}, loss={:.5f}".format("Tacotron",
                                                                                    time_string(),
                                                                                    step,
                                                                                    eval_loss),
                                        max_len=t_len // hparams.outputs_per_step)
                    plot.plot_spectrogram(mel_p, os.path.join(eval_plot_dir,
                                                              "step-{"
                                                              "}-eval-mel-spectrogram.png".format(
                                                                  step)),
                                          title="{}, {}, step={}, loss={:.5f}".format("Tacotron",
                                                                                      time_string(),
                                                                                      step,
                                                                                      eval_loss),
                                          target_spectrogram=mel_t,
                                          max_len=t_len)

                    if hparams.predict_linear:
                        plot.plot_spectrogram(lin_p, os.path.join(eval_plot_dir,
                                                                  "step-{}-eval-linear-spectrogram.png".format(
                                                                      step)),
                                              title="{}, {}, step={}, loss={:.5f}".format(
                                                  "Tacotron", time_string(), step, eval_loss),
                                              target_spectrogram=lin_t,
                                              max_len=t_len, auto_aspect=True)

                    log("Eval loss for global step {}: {:.3f}".format(step, eval_loss))
                    log("Writing eval summary!")
                    add_eval_stats(summary_writer, step, linear_loss, before_loss, after_loss,
                                   stop_token_loss, eval_loss)

                # Checkpoint on the regular interval, on the final step, and at step 300.
                if step % args.checkpoint_interval == 0 or step == args.tacotron_train_steps or \
                        step == 300:
                    # Save model and current global step
                    saver.save(sess, checkpoint_fpath, global_step=global_step)

                    log("\nSaving alignment, Mel-Spectrograms and griffin-lim inverted waveform..")
                    input_seq, mel_prediction, alignment, target, target_length = sess.run([
                        model.tower_inputs[0][0],
                        model.tower_mel_outputs[0][0],
                        model.tower_alignments[0][0],
                        model.tower_mel_targets[0][0],
                        model.tower_targets_lengths[0][0],
                    ])

                    # save predicted mel spectrogram to disk (debug)
                    mel_filename = "mel-prediction-step-{}.npy".format(step)
                    np.save(os.path.join(mel_dir, mel_filename), mel_prediction.T,
                            allow_pickle=False)

                    # save griffin lim inverted wav for debug (mel -> wav)
                    wav = audio.inv_mel_spectrogram(mel_prediction.T, hparams)
                    audio.save_wav(wav,
                                   os.path.join(wav_dir, "step-{}-wave-from-mel.wav".format(step)),
                                   sr=hparams.sample_rate)

                    # save alignment plot to disk (control purposes)
                    plot.plot_alignment(alignment,
                                        os.path.join(plot_dir, "step-{}-align.png".format(step)),
                                        title="{}, {}, step={}, loss={:.5f}".format("Tacotron",
                                                                                    time_string(),
                                                                                    step, loss),
                                        max_len=target_length // hparams.outputs_per_step)
                    # save real and predicted mel-spectrogram plot to disk (control purposes)
                    plot.plot_spectrogram(mel_prediction, os.path.join(plot_dir,
                                                                       "step-{}-mel-spectrogram.png".format(
                                                                           step)),
                                          title="{}, {}, step={}, loss={:.5f}".format("Tacotron",
                                                                                      time_string(),
                                                                                      step, loss),
                                          target_spectrogram=target,
                                          max_len=target_length)
                    log("Input at step {}: {}".format(step, sequence_to_text(input_seq)))

                if step % args.embedding_interval == 0 or step == args.tacotron_train_steps or step == 1:
                    # Get current checkpoint state
                    checkpoint_state = tf.train.get_checkpoint_state(save_dir)

                    # Update Projector
                    log("\nSaving Model Character Embeddings visualization..")
                    add_embedding_stats(summary_writer, [model.embedding_table.name],
                                        [char_embedding_meta],
                                        checkpoint_state.model_checkpoint_path)
                    log("Tacotron Character embeddings have been updated on tensorboard!")

            log("Tacotron training complete after {} global steps!".format(
                args.tacotron_train_steps), slack=True)
            return save_dir

        except Exception as e:
            # Log the failure and ask the coordinator to stop.
            log("Exiting due to exception: {}".format(e), slack=True)
            traceback.print_exc()
            coord.request_stop(e)
class ValueWindow():
    """Sliding window over the most recently appended values.

    Holds at most window_size values (at least one) and exposes their sum, count
    and average.
    """

    def __init__(self, window_size=100):
        self._window_size = window_size
        self._values = []

    def append(self, x):
        """Add x and drop the oldest values beyond the window size."""
        # Append first, then keep the tail. Slicing with -(window_size - 1) before
        # appending kept the whole list when window_size was 1 (because -0 == 0),
        # so the window grew without bound.
        keep = max(1, self._window_size)
        self._values = (self._values + [x])[-keep:]

    @property
    def sum(self):
        """Sum of the values currently in the window."""
        return sum(self._values)

    @property
    def count(self):
        """Number of values currently in the window."""
        return len(self._values)

    @property
    def average(self):
        """Mean of the values in the window; 0 when the window is empty."""
        return self.sum / max(1, self.count)

    def reset(self):
        """Empty the window."""
        self._values = []
def _parse_cmudict(file):
    """Parse CMUdict-style lines into a dict mapping each word to its pronunciations.

    Only lines starting with an uppercase ASCII letter or an apostrophe are read.
    Alternate-pronunciation markers such as "(1)" are removed from the word, so all
    variants of a word end up in the same list. Entries whose pronunciation is
    rejected by _get_pronunciation, and lines with no pronunciation at all, are
    skipped.
    """
    cmudict = {}
    for line in file:
        if not line or not ("A" <= line[0] <= "Z" or line[0] == "'"):
            continue
        # Split on the first whitespace run only: the word comes first and the rest of
        # the line is the whole pronunciation, however many spaces separate the two.
        parts = line.split(None, 1)
        if len(parts) < 2:
            continue
        word = re.sub(_alt_re, "", parts[0])
        pronunciation = _get_pronunciation(parts[1])
        if pronunciation:
            cmudict.setdefault(word, []).append(pronunciation)
    return cmudict
# Compiled (pattern, expansion) pairs. Each pattern matches the abbreviation at a word
# boundary, followed by a period, ignoring case.
_abbreviations = [
    (re.compile("\\b%s\\." % abbreviation, re.IGNORECASE), expansion)
    for abbreviation, expansion in (
        ("mrs", "misess"),
        ("mr", "mister"),
        ("dr", "doctor"),
        ("st", "saint"),
        ("co", "company"),
        ("jr", "junior"),
        ("maj", "major"),
        ("gen", "general"),
        ("drs", "doctors"),
        ("rev", "reverend"),
        ("lt", "lieutenant"),
        ("hon", "honorable"),
        ("sgt", "sergeant"),
        ("capt", "captain"),
        ("esq", "esquire"),
        ("ltd", "limited"),
        ("col", "colonel"),
        ("ft", "fort"),
    )
]


def expand_abbreviations(text):
    """Replace every known abbreviation (including its period) in text with its expansion."""
    for pattern, expansion in _abbreviations:
        text = pattern.sub(expansion, text)
    return text
a/synthesizer_tacotron2/utils/numbers.py b/synthesizer_tacotron2/utils/numbers.py new file mode 100644 index 0000000..75020a0 --- /dev/null +++ b/synthesizer_tacotron2/utils/numbers.py @@ -0,0 +1,68 @@ +import re +import inflect + +_inflect = inflect.engine() +_comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])") +_decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)") +_pounds_re = re.compile(r"£([0-9\,]*[0-9]+)") +_dollars_re = re.compile(r"\$([0-9\.\,]*[0-9]+)") +_ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)") +_number_re = re.compile(r"[0-9]+") + + +def _remove_commas(m): + return m.group(1).replace(",", "") + + +def _expand_decimal_point(m): + return m.group(1).replace(".", " point ") + + +def _expand_dollars(m): + match = m.group(1) + parts = match.split(".") + if len(parts) > 2: + return match + " dollars" # Unexpected format + dollars = int(parts[0]) if parts[0] else 0 + cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 + if dollars and cents: + dollar_unit = "dollar" if dollars == 1 else "dollars" + cent_unit = "cent" if cents == 1 else "cents" + return "%s %s, %s %s" % (dollars, dollar_unit, cents, cent_unit) + elif dollars: + dollar_unit = "dollar" if dollars == 1 else "dollars" + return "%s %s" % (dollars, dollar_unit) + elif cents: + cent_unit = "cent" if cents == 1 else "cents" + return "%s %s" % (cents, cent_unit) + else: + return "zero dollars" + + +def _expand_ordinal(m): + return _inflect.number_to_words(m.group(0)) + + +def _expand_number(m): + num = int(m.group(0)) + if num > 1000 and num < 3000: + if num == 2000: + return "two thousand" + elif num > 2000 and num < 2010: + return "two thousand " + _inflect.number_to_words(num % 100) + elif num % 100 == 0: + return _inflect.number_to_words(num // 100) + " hundred" + else: + return _inflect.number_to_words(num, andword="", zero="oh", group=2).replace(", ", " ") + else: + return _inflect.number_to_words(num, andword="") + + +def normalize_numbers(text): + text = re.sub(_comma_number_re, 
def plot_spectrogram(pred_spectrogram, path, title=None, split_title=False, target_spectrogram=None, max_len=None, auto_aspect=False):
    """Save a PNG of a predicted spectrogram, optionally above its target.

    Args:
        pred_spectrogram: predicted spectrogram; rotated with np.rot90 for display.
        path: destination of the PNG file.
        title: caption drawn on the figure.
        split_title: when True, the title is wrapped with split_title_line.
        target_spectrogram: optional target drawn in a subplot above the prediction.
        max_len: when given, both spectrograms are cut to their first max_len rows.
        auto_aspect: when True, the images are drawn with aspect="auto".
    """
    if max_len is not None:
        # The target is optional: only slice it when one was actually provided.
        if target_spectrogram is not None:
            target_spectrogram = target_spectrogram[:max_len]
        pred_spectrogram = pred_spectrogram[:max_len]

    # split_title_line calls title.split(), which fails on the default title of None.
    if split_title and title is not None:
        title = split_title_line(title)

    fig = plt.figure(figsize=(10, 8))
    # Set common labels
    fig.text(0.5, 0.18, title, horizontalalignment="center", fontsize=16)

    #target spectrogram subplot
    if target_spectrogram is not None:
        ax1 = fig.add_subplot(311)
        ax2 = fig.add_subplot(312)

        if auto_aspect:
            im = ax1.imshow(np.rot90(target_spectrogram), aspect="auto", interpolation="none")
        else:
            im = ax1.imshow(np.rot90(target_spectrogram), interpolation="none")
        ax1.set_title("Target Mel-Spectrogram")
        fig.colorbar(mappable=im, shrink=0.65, orientation="horizontal", ax=ax1)
        ax2.set_title("Predicted Mel-Spectrogram")
    else:
        ax2 = fig.add_subplot(211)

    if auto_aspect:
        im = ax2.imshow(np.rot90(pred_spectrogram), aspect="auto", interpolation="none")
    else:
        im = ax2.imshow(np.rot90(pred_spectrogram), interpolation="none")
    fig.colorbar(mappable=im, shrink=0.65, orientation="horizontal", ax=ax2)

    plt.tight_layout()
    plt.savefig(path, format="png")
    plt.close()
# Mappings from symbol to numeric ID and vice versa:
_symbol_to_id = {s: i for i, s in enumerate(symbols)}
_id_to_symbol = dict(enumerate(symbols))

# Regular expression matching text enclosed in curly braces:
_curly_re = re.compile(r"(.*?)\{(.+?)\}(.*)")


def text_to_sequence(text, cleaner_names):
    """Converts a string of text to a sequence of IDs corresponding to the symbols in the text.

    The text can optionally have ARPAbet sequences enclosed in curly braces embedded
    in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street."

    Text outside curly braces is passed through the cleaners; text inside is
    split on whitespace and each token is looked up with an "@" prefix.
    Symbols rejected by `_should_keep_symbol` are silently dropped.

    Args:
        text: string to convert to a sequence
        cleaner_names: names of the cleaner functions to run the text through

    Returns:
        List of integers corresponding to the symbols in the text, always
        terminated by the ID of the end-of-sequence symbol "~"

    Raises:
        Exception: if a name in cleaner_names is not a function of the
            `cleaners` module
    """
    sequence = []

    # Check for curly braces and treat their contents as ARPAbet:
    while len(text):
        m = _curly_re.match(text)
        if not m:
            # No braces left: the remainder is ordinary text.
            sequence += _symbols_to_sequence(_clean_text(text, cleaner_names))
            break
        sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names))
        sequence += _arpabet_to_sequence(m.group(2))
        # Continue with whatever follows the closing brace.
        text = m.group(3)

    # Append EOS token
    sequence.append(_symbol_to_id["~"])
    return sequence


def sequence_to_text(sequence):
    """Converts a sequence of IDs back to a string.

    IDs that are not present in the symbol table are skipped. Symbols that
    start with "@" are written back inside curly braces, and adjacent braced
    symbols are merged into one space-separated group.
    """
    pieces = []
    for symbol_id in sequence:
        s = _id_to_symbol.get(symbol_id)
        if s is None:
            continue
        # Enclose ARPAbet back in curly braces:
        if len(s) > 1 and s[0] == "@":
            s = "{%s}" % s[1:]
        pieces.append(s)
    return "".join(pieces).replace("}{", " ")


def _clean_text(text, cleaner_names):
    """Runs text through each named cleaner from the `cleaners` module, in order.

    Raises:
        Exception: if a name does not refer to a callable in `cleaners`.
    """
    for name in cleaner_names:
        # Use a default so a missing name reaches the explicit error below
        # instead of escaping as a bare AttributeError from getattr.
        cleaner = getattr(cleaners, name, None)
        if not callable(cleaner):
            raise Exception("Unknown cleaner: %s" % name)
        text = cleaner(text)
    return text


def _symbols_to_sequence(symbols):
    """Maps symbols to their IDs, dropping those rejected by `_should_keep_symbol`."""
    return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)]


def _arpabet_to_sequence(text):
    """Converts whitespace-separated ARPAbet tokens to IDs via their "@"-prefixed symbols."""
    return _symbols_to_sequence(["@" + s for s in text.split()])
def _should_keep_symbol(s):
    """Tell whether a symbol may appear in an encoded sequence.

    A symbol is kept only if it has an entry in `_symbol_to_id` and is
    neither "_" nor "~".
    """
    if s not in _symbol_to_id:
        return False
    return s not in ("_", "~")