mirror of https://github.com/babysor/MockingBird.git
synced 2024-03-22 13:11:31 +08:00

commit de4e525a0d
parent b56ec5ee1b

    modified tacotron to tacotron2
BIN  synthesizer_tacotron2/.DS_Store  vendored  Normal file

Binary file not shown.
24  synthesizer_tacotron2/LICENSE.txt  Normal file

@@ -0,0 +1,24 @@
MIT License

Original work Copyright (c) 2018 Rayhane Mama (https://github.com/Rayhane-mamah)
Original work Copyright (c) 2019 fatchord (https://github.com/fatchord)
Modified work Copyright (c) 2019 Corentin Jemine (https://github.com/CorentinJ)
Modified work Copyright (c) 2020 blue-fish (https://github.com/blue-fish)

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
1  synthesizer_tacotron2/__init__.py  Normal file

@@ -0,0 +1 @@
#
206  synthesizer_tacotron2/audio.py  Normal file

@@ -0,0 +1,206 @@
import librosa
import librosa.filters
import numpy as np
from scipy import signal
from scipy.io import wavfile
import soundfile as sf


def load_wav(path, sr):
    return librosa.core.load(path, sr=sr)[0]

def save_wav(wav, path, sr):
    wav *= 32767 / max(0.01, np.max(np.abs(wav)))
    #proposed by @dsmiller
    wavfile.write(path, sr, wav.astype(np.int16))

def save_wavenet_wav(wav, path, sr):
    sf.write(path, wav.astype(np.float32), sr)

def preemphasis(wav, k, preemphasize=True):
    if preemphasize:
        return signal.lfilter([1, -k], [1], wav)
    return wav

def inv_preemphasis(wav, k, inv_preemphasize=True):
    if inv_preemphasize:
        return signal.lfilter([1], [1, -k], wav)
    return wav

#From https://github.com/r9y9/wavenet_vocoder/blob/master/audio.py
def start_and_end_indices(quantized, silence_threshold=2):
    for start in range(quantized.size):
        if abs(quantized[start] - 127) > silence_threshold:
            break
    for end in range(quantized.size - 1, 1, -1):
        if abs(quantized[end] - 127) > silence_threshold:
            break

    assert abs(quantized[start] - 127) > silence_threshold
    assert abs(quantized[end] - 127) > silence_threshold

    return start, end

def get_hop_size(hparams):
    hop_size = hparams.hop_size
    if hop_size is None:
        assert hparams.frame_shift_ms is not None
        hop_size = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
    return hop_size

def linearspectrogram(wav, hparams):
    D = _stft(preemphasis(wav, hparams.preemphasis, hparams.preemphasize), hparams)
    S = _amp_to_db(np.abs(D), hparams) - hparams.ref_level_db

    if hparams.signal_normalization:
        return _normalize(S, hparams)
    return S

def melspectrogram(wav, hparams):
    D = _stft(preemphasis(wav, hparams.preemphasis, hparams.preemphasize), hparams)
    S = _amp_to_db(_linear_to_mel(np.abs(D), hparams), hparams) - hparams.ref_level_db

    if hparams.signal_normalization:
        return _normalize(S, hparams)
    return S

def inv_linear_spectrogram(linear_spectrogram, hparams):
    """Converts a linear spectrogram back to a waveform using librosa"""
    if hparams.signal_normalization:
        D = _denormalize(linear_spectrogram, hparams)
    else:
        D = linear_spectrogram

    S = _db_to_amp(D + hparams.ref_level_db)  #Convert back to linear

    if hparams.use_lws:
        processor = _lws_processor(hparams)
        D = processor.run_lws(S.astype(np.float64).T ** hparams.power)
        y = processor.istft(D).astype(np.float32)
        return inv_preemphasis(y, hparams.preemphasis, hparams.preemphasize)
    else:
        return inv_preemphasis(_griffin_lim(S ** hparams.power, hparams), hparams.preemphasis, hparams.preemphasize)

def inv_mel_spectrogram(mel_spectrogram, hparams):
    """Converts a mel spectrogram back to a waveform using librosa"""
    if hparams.signal_normalization:
        D = _denormalize(mel_spectrogram, hparams)
    else:
        D = mel_spectrogram

    S = _mel_to_linear(_db_to_amp(D + hparams.ref_level_db), hparams)  # Convert back to linear

    if hparams.use_lws:
        processor = _lws_processor(hparams)
        D = processor.run_lws(S.astype(np.float64).T ** hparams.power)
        y = processor.istft(D).astype(np.float32)
        return inv_preemphasis(y, hparams.preemphasis, hparams.preemphasize)
    else:
        return inv_preemphasis(_griffin_lim(S ** hparams.power, hparams), hparams.preemphasis, hparams.preemphasize)

def _lws_processor(hparams):
    import lws
    return lws.lws(hparams.n_fft, get_hop_size(hparams), fftsize=hparams.win_size, mode="speech")

def _griffin_lim(S, hparams):
    """librosa implementation of Griffin-Lim
    Based on https://github.com/librosa/librosa/issues/434
    """
    angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
    S_complex = np.abs(S).astype(complex)  # was np.complex, which newer numpy removed
    y = _istft(S_complex * angles, hparams)
    for i in range(hparams.griffin_lim_iters):
        angles = np.exp(1j * np.angle(_stft(y, hparams)))
        y = _istft(S_complex * angles, hparams)
    return y

def _stft(y, hparams):
    if hparams.use_lws:
        return _lws_processor(hparams).stft(y).T
    else:
        return librosa.stft(y=y, n_fft=hparams.n_fft, hop_length=get_hop_size(hparams), win_length=hparams.win_size)

def _istft(y, hparams):
    return librosa.istft(y, hop_length=get_hop_size(hparams), win_length=hparams.win_size)

##########################################################
#These are only correct when using lws!!! (This was messing with WaveNet quality for a long time!)
def num_frames(length, fsize, fshift):
    """Compute the number of time frames of a spectrogram
    """
    pad = (fsize - fshift)
    if length % fshift == 0:
        M = (length + pad * 2 - fsize) // fshift + 1
    else:
        M = (length + pad * 2 - fsize) // fshift + 2
    return M


def pad_lr(x, fsize, fshift):
    """Compute left and right padding
    """
    M = num_frames(len(x), fsize, fshift)
    pad = (fsize - fshift)
    T = len(x) + 2 * pad
    r = (M - 1) * fshift + fsize - T
    return pad, pad + r
##########################################################
#Librosa-correct padding
def librosa_pad_lr(x, fsize, fshift):
    return 0, (x.shape[0] // fshift + 1) * fshift - x.shape[0]

# Conversions
_mel_basis = None
_inv_mel_basis = None

def _linear_to_mel(spectogram, hparams):
    global _mel_basis
    if _mel_basis is None:
        _mel_basis = _build_mel_basis(hparams)
    return np.dot(_mel_basis, spectogram)

def _mel_to_linear(mel_spectrogram, hparams):
    global _inv_mel_basis
    if _inv_mel_basis is None:
        _inv_mel_basis = np.linalg.pinv(_build_mel_basis(hparams))
    return np.maximum(1e-10, np.dot(_inv_mel_basis, mel_spectrogram))

def _build_mel_basis(hparams):
    assert hparams.fmax <= hparams.sample_rate // 2
    return librosa.filters.mel(sr=hparams.sample_rate, n_fft=hparams.n_fft, n_mels=hparams.num_mels,
                               fmin=hparams.fmin, fmax=hparams.fmax)

def _amp_to_db(x, hparams):
    min_level = np.exp(hparams.min_level_db / 20 * np.log(10))
    return 20 * np.log10(np.maximum(min_level, x))

def _db_to_amp(x):
    return np.power(10.0, (x) * 0.05)

def _normalize(S, hparams):
    if hparams.allow_clipping_in_normalization:
        if hparams.symmetric_mels:
            return np.clip((2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value,
                           -hparams.max_abs_value, hparams.max_abs_value)
        else:
            return np.clip(hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db)), 0, hparams.max_abs_value)

    assert S.max() <= 0 and S.min() - hparams.min_level_db >= 0
    if hparams.symmetric_mels:
        return (2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value
    else:
        return hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db))

def _denormalize(D, hparams):
    if hparams.allow_clipping_in_normalization:
        if hparams.symmetric_mels:
            return (((np.clip(D, -hparams.max_abs_value,
                              hparams.max_abs_value) + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value))
                    + hparams.min_level_db)
        else:
            return ((np.clip(D, 0, hparams.max_abs_value) * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db)

    if hparams.symmetric_mels:
        return (((D + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value)) + hparams.min_level_db)
    else:
        return ((D * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db)
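As a sanity check on the pipeline above, the helpers round-trip like this. This is a minimal sketch assuming the package layout of this commit and a hypothetical 16 kHz sample file; Griffin-Lim inversion is lossy, so the result is only an approximation of the input:

from synthesizer_tacotron2 import audio
from synthesizer_tacotron2.hparams import hparams

# Load a waveform at the sample rate the synthesizer was trained on
wav = audio.load_wav("samples/demo.wav", sr=hparams.sample_rate)  # hypothetical path

# Waveform -> normalized mel spectrogram of shape (num_mels, frames)
mel = audio.melspectrogram(wav, hparams)

# Mel spectrogram -> waveform via Griffin-Lim (hparams.use_lws is False by default)
recovered = audio.inv_mel_spectrogram(mel, hparams)
audio.save_wav(recovered, "samples/demo_gl.wav", sr=hparams.sample_rate)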
272  synthesizer_tacotron2/feeder.py  Normal file

@@ -0,0 +1,272 @@
from sklearn.model_selection import train_test_split
from synthesizer.utils.text import text_to_sequence
from synthesizer.infolog import log
import tensorflow as tf
import numpy as np
import threading
import time
import os

_batches_per_group = 64

class Feeder:
    """
    Feeds batches of data into a queue on a background thread.
    """

    def __init__(self, coordinator, metadata_filename, hparams):
        super(Feeder, self).__init__()
        self._coord = coordinator
        self._hparams = hparams
        self._cleaner_names = [x.strip() for x in hparams.cleaners.split(",")]
        self._train_offset = 0
        self._test_offset = 0

        # Load metadata
        self._mel_dir = os.path.join(os.path.dirname(metadata_filename), "mels")
        self._embed_dir = os.path.join(os.path.dirname(metadata_filename), "embeds")
        with open(metadata_filename, encoding="utf-8") as f:
            self._metadata = [line.strip().split("|") for line in f]
            frame_shift_ms = hparams.hop_size / hparams.sample_rate
            hours = sum([int(x[4]) for x in self._metadata]) * frame_shift_ms / (3600)
            log("Loaded metadata for {} examples ({:.2f} hours)".format(len(self._metadata), hours))

        #Train test split
        if hparams.tacotron_test_size is None:
            assert hparams.tacotron_test_batches is not None

        test_size = (hparams.tacotron_test_size if hparams.tacotron_test_size is not None
                     else hparams.tacotron_test_batches * hparams.tacotron_batch_size)
        indices = np.arange(len(self._metadata))
        train_indices, test_indices = train_test_split(indices,
                                                       test_size=test_size, random_state=hparams.tacotron_data_random_state)

        #Make sure the test set size is a multiple of batch_size, else round down and
        #move the extra examples to the training set
        len_test_indices = self._round_down(len(test_indices), hparams.tacotron_batch_size)
        extra_test = test_indices[len_test_indices:]
        test_indices = test_indices[:len_test_indices]
        train_indices = np.concatenate([train_indices, extra_test])

        self._train_meta = list(np.array(self._metadata)[train_indices])
        self._test_meta = list(np.array(self._metadata)[test_indices])

        self.test_steps = len(self._test_meta) // hparams.tacotron_batch_size

        if hparams.tacotron_test_size is None:
            assert hparams.tacotron_test_batches == self.test_steps

        #pad input sequences with the <pad_token> 0 ( _ )
        self._pad = 0
        #explicitly setting the padding to a value that doesn't originally exist in the spectrogram
        #to avoid any possible conflicts, without affecting the output range of the model too much
        if hparams.symmetric_mels:
            self._target_pad = -hparams.max_abs_value
        else:
            self._target_pad = 0.
        #Mark finished sequences with 1s
        self._token_pad = 1.

        with tf.device("/cpu:0"):
            # Create placeholders for inputs and targets. Don't specify batch size because we want
            # to be able to feed different batch sizes at eval time.
            self._placeholders = [
                tf.placeholder(tf.int32, shape=(None, None), name="inputs"),
                tf.placeholder(tf.int32, shape=(None, ), name="input_lengths"),
                tf.placeholder(tf.float32, shape=(None, None, hparams.num_mels),
                               name="mel_targets"),
                tf.placeholder(tf.float32, shape=(None, None), name="token_targets"),
                tf.placeholder(tf.int32, shape=(None, ), name="targets_lengths"),
                tf.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None),
                               name="split_infos"),

                # SV2TTS
                tf.placeholder(tf.float32, shape=(None, hparams.speaker_embedding_size),
                               name="speaker_embeddings")
            ]

            # Create queue for buffering data
            queue = tf.FIFOQueue(8, [tf.int32, tf.int32, tf.float32, tf.float32,
                                     tf.int32, tf.int32, tf.float32], name="input_queue")
            self._enqueue_op = queue.enqueue(self._placeholders)
            self.inputs, self.input_lengths, self.mel_targets, self.token_targets, \
                self.targets_lengths, self.split_infos, self.speaker_embeddings = queue.dequeue()

            self.inputs.set_shape(self._placeholders[0].shape)
            self.input_lengths.set_shape(self._placeholders[1].shape)
            self.mel_targets.set_shape(self._placeholders[2].shape)
            self.token_targets.set_shape(self._placeholders[3].shape)
            self.targets_lengths.set_shape(self._placeholders[4].shape)
            self.split_infos.set_shape(self._placeholders[5].shape)
            self.speaker_embeddings.set_shape(self._placeholders[6].shape)

            # Create eval queue for buffering eval data
            eval_queue = tf.FIFOQueue(1, [tf.int32, tf.int32, tf.float32, tf.float32,
                                          tf.int32, tf.int32, tf.float32], name="eval_queue")
            self._eval_enqueue_op = eval_queue.enqueue(self._placeholders)
            self.eval_inputs, self.eval_input_lengths, self.eval_mel_targets, \
                self.eval_token_targets, self.eval_targets_lengths, \
                self.eval_split_infos, self.eval_speaker_embeddings = eval_queue.dequeue()

            self.eval_inputs.set_shape(self._placeholders[0].shape)
            self.eval_input_lengths.set_shape(self._placeholders[1].shape)
            self.eval_mel_targets.set_shape(self._placeholders[2].shape)
            self.eval_token_targets.set_shape(self._placeholders[3].shape)
            self.eval_targets_lengths.set_shape(self._placeholders[4].shape)
            self.eval_split_infos.set_shape(self._placeholders[5].shape)
            self.eval_speaker_embeddings.set_shape(self._placeholders[6].shape)


    def start_threads(self, session):
        self._session = session
        thread = threading.Thread(name="background", target=self._enqueue_next_train_group)
        thread.daemon = True  #Thread will close when parent quits
        thread.start()

        thread = threading.Thread(name="background", target=self._enqueue_next_test_group)
        thread.daemon = True  #Thread will close when parent quits
        thread.start()

    def _get_test_groups(self):
        meta = self._test_meta[self._test_offset]
        self._test_offset += 1

        text = meta[5]

        input_data = np.asarray(text_to_sequence(text, self._cleaner_names), dtype=np.int32)
        mel_target = np.load(os.path.join(self._mel_dir, meta[1]))
        #Create parallel sequences containing zeros to represent a non finished sequence
        token_target = np.asarray([0.] * (len(mel_target) - 1))
        embed_target = np.load(os.path.join(self._embed_dir, meta[2]))
        return input_data, mel_target, token_target, embed_target, len(mel_target)

    def make_test_batches(self):
        start = time.time()

        # Read a group of examples
        n = self._hparams.tacotron_batch_size
        r = self._hparams.outputs_per_step

        #Test on entire test set
        examples = [self._get_test_groups() for i in range(len(self._test_meta))]

        # Bucket examples based on similar output sequence length for efficiency
        examples.sort(key=lambda x: x[-1])
        batches = [examples[i: i+n] for i in range(0, len(examples), n)]
        np.random.shuffle(batches)

        log("\nGenerated %d test batches of size %d in %.3f sec" % (len(batches), n, time.time() - start))
        return batches, r

    def _enqueue_next_train_group(self):
        while not self._coord.should_stop():
            start = time.time()

            # Read a group of examples
            n = self._hparams.tacotron_batch_size
            r = self._hparams.outputs_per_step
            examples = [self._get_next_example() for i in range(n * _batches_per_group)]

            # Bucket examples based on similar output sequence length for efficiency
            examples.sort(key=lambda x: x[-1])
            batches = [examples[i: i+n] for i in range(0, len(examples), n)]
            np.random.shuffle(batches)

            log("\nGenerated {} train batches of size {} in {:.3f} sec".format(len(batches), n, time.time() - start))
            for batch in batches:
                feed_dict = dict(zip(self._placeholders, self._prepare_batch(batch, r)))
                self._session.run(self._enqueue_op, feed_dict=feed_dict)

    def _enqueue_next_test_group(self):
        #Create test batches once and evaluate on them for all test steps
        test_batches, r = self.make_test_batches()
        while not self._coord.should_stop():
            for batch in test_batches:
                feed_dict = dict(zip(self._placeholders, self._prepare_batch(batch, r)))
                self._session.run(self._eval_enqueue_op, feed_dict=feed_dict)

    def _get_next_example(self):
        """Gets a single example (input, mel_target, token_target, embed_target, mel_length) from disk
        """
        if self._train_offset >= len(self._train_meta):
            self._train_offset = 0
            np.random.shuffle(self._train_meta)

        meta = self._train_meta[self._train_offset]
        self._train_offset += 1

        text = meta[5]

        input_data = np.asarray(text_to_sequence(text, self._cleaner_names), dtype=np.int32)
        mel_target = np.load(os.path.join(self._mel_dir, meta[1]))
        #Create parallel sequences containing zeros to represent a non finished sequence
        token_target = np.asarray([0.] * (len(mel_target) - 1))
        embed_target = np.load(os.path.join(self._embed_dir, meta[2]))
        return input_data, mel_target, token_target, embed_target, len(mel_target)

    def _prepare_batch(self, batches, outputs_per_step):
        assert 0 == len(batches) % self._hparams.tacotron_num_gpus
        size_per_device = int(len(batches) / self._hparams.tacotron_num_gpus)
        np.random.shuffle(batches)

        inputs = None
        mel_targets = None
        token_targets = None
        targets_lengths = None
        split_infos = []

        targets_lengths = np.asarray([x[-1] for x in batches], dtype=np.int32)  #Used to mask loss
        input_lengths = np.asarray([len(x[0]) for x in batches], dtype=np.int32)

        for i in range(self._hparams.tacotron_num_gpus):
            batch = batches[size_per_device*i: size_per_device*(i+1)]
            input_cur_device, input_max_len = self._prepare_inputs([x[0] for x in batch])
            inputs = np.concatenate((inputs, input_cur_device), axis=1) if inputs is not None else input_cur_device
            mel_target_cur_device, mel_target_max_len = self._prepare_targets([x[1] for x in batch], outputs_per_step)
            mel_targets = np.concatenate((mel_targets, mel_target_cur_device), axis=1) if mel_targets is not None else mel_target_cur_device

            #Pad sequences with 1 to infer that the sequence is done
            token_target_cur_device, token_target_max_len = self._prepare_token_targets([x[2] for x in batch], outputs_per_step)
            token_targets = np.concatenate((token_targets, token_target_cur_device), axis=1) if token_targets is not None else token_target_cur_device
            split_infos.append([input_max_len, mel_target_max_len, token_target_max_len])

        split_infos = np.asarray(split_infos, dtype=np.int32)

        ### SV2TTS ###

        embed_targets = np.asarray([x[3] for x in batches])

        ##############

        return inputs, input_lengths, mel_targets, token_targets, targets_lengths, \
               split_infos, embed_targets

    def _prepare_inputs(self, inputs):
        max_len = max([len(x) for x in inputs])
        return np.stack([self._pad_input(x, max_len) for x in inputs]), max_len

    def _prepare_targets(self, targets, alignment):
        max_len = max([len(t) for t in targets])
        data_len = self._round_up(max_len, alignment)
        return np.stack([self._pad_target(t, data_len) for t in targets]), data_len

    def _prepare_token_targets(self, targets, alignment):
        max_len = max([len(t) for t in targets]) + 1
        data_len = self._round_up(max_len, alignment)
        return np.stack([self._pad_token_target(t, data_len) for t in targets]), data_len

    def _pad_input(self, x, length):
        return np.pad(x, (0, length - x.shape[0]), mode="constant", constant_values=self._pad)

    def _pad_target(self, t, length):
        return np.pad(t, [(0, length - t.shape[0]), (0, 0)], mode="constant", constant_values=self._target_pad)

    def _pad_token_target(self, t, length):
        return np.pad(t, (0, length - t.shape[0]), mode="constant", constant_values=self._token_pad)

    def _round_up(self, x, multiple):
        remainder = x % multiple
        return x if remainder == 0 else x + multiple - remainder

    def _round_down(self, x, multiple):
        remainder = x % multiple
        return x if remainder == 0 else x - remainder
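The sort-then-chunk bucketing used in make_test_batches and _enqueue_next_train_group is easier to see in isolation. The standalone sketch below uses fake utterance lengths rather than repository data; sorting by mel length before chunking keeps each batch's padding waste small, and shuffling the batch order keeps training stochastic:

import numpy as np

np.random.seed(0)
n = 4  # batch size, standing in for hparams.tacotron_batch_size

# Fake examples mirroring the (..., len(mel_target)) tuples returned above:
# here just (utterance_id, mel_length)
examples = [("utt%d" % i, int(np.random.randint(100, 900))) for i in range(16)]

# Bucket: sort by output length so each batch pads to a similar size,
# then shuffle the order of the batches themselves
examples.sort(key=lambda x: x[-1])
batches = [examples[i: i + n] for i in range(0, len(examples), n)]
np.random.shuffle(batches)

for batch in batches:
    lengths = [length for _, length in batch]
    print(lengths, "frames wasted on padding:", max(lengths) * len(batch) - sum(lengths))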
272  synthesizer_tacotron2/hparams.py  Normal file

@@ -0,0 +1,272 @@
import ast
import pprint
from tensorflow.contrib.training import HParams


hparams = HParams(
    cleaners="basic_cleaners",
    tacotron_gpu_start_idx=0,  # idx of the first GPU to be used for Tacotron training.
    tacotron_num_gpus=1,  # Determines the number of gpus in use for Tacotron training.
    split_on_cpu=True,

    ### Signal Processing (used in both synthesizer and vocoder)
    sample_rate = 16000,
    n_fft = 800,
    num_mels = 80,
    hop_size = 200,                             # Tacotron uses 12.5 ms frame shift (set to sample_rate * 0.0125)
    win_size = 800,                             # Tacotron uses 50 ms frame length (set to sample_rate * 0.050)
    fmin = 55,
    min_level_db = -100,
    ref_level_db = 20,
    max_abs_value = 4.,                         # Gradient explodes if too big, premature convergence if too small.
    preemphasis = 0.97,                         # Filter coefficient to use if preemphasize is True
    preemphasize = True,
    frame_shift_ms=None,
    normalize_for_wavenet=True,
    # whether to rescale to [0, 1] for wavenet. (better audio quality)
    clip_for_wavenet=True,

    ### Tacotron Text-to-Speech (TTS)
    tts_embed_dims = 512,                       # Embedding dimension for the graphemes/phoneme inputs
    tts_encoder_dims = 256,
    tts_decoder_dims = 128,
    tts_postnet_dims = 512,
    tts_encoder_K = 5,
    tts_lstm_dims = 1024,
    tts_postnet_K = 5,
    tts_num_highways = 4,
    tts_dropout = 0.5,
    tts_cleaner_names = ["basic_cleaners"],
    tts_stop_threshold = -3.4,                  # Value below which audio generation ends.
                                                # For example, for a range of [-4, 4], this
                                                # will terminate the sequence at the first
                                                # frame that has all values < -3.4

    ### Tacotron Training
    tts_schedule = [(2,  1e-3,  20_000,  12),   # Progressive training schedule
                    (2,  5e-4,  40_000,  12),   # (r, lr, step, batch_size)
                    (2,  2e-4,  80_000,  12),   #
                    (2,  1e-4, 160_000,  12),   # r = reduction factor (# of mel frames
                    (2,  3e-5, 320_000,  12),   #     synthesized for each decoder iteration)
                    (2,  1e-5, 640_000,  12)],  # lr = learning rate

    tts_clip_grad_norm = 1.0,                   # clips the gradient norm to prevent explosion - set to None if not needed
    tts_eval_interval = 500,                    # Number of steps between model evaluation (sample generation)
                                                # Set to -1 to generate after completing epoch, or 0 to disable
    tts_eval_num_samples = 1,                   # Makes this number of samples

    ### Data Preprocessing
    max_mel_frames = 900,
    rescale = True,
    rescaling_max = 0.9,
    synthesis_batch_size = 16,                  # For vocoder preprocessing and inference.

    ### Mel Visualization and Griffin-Lim
    signal_normalization = True,
    power = 1.5,
    griffin_lim_iters = 60,

    ### Audio processing options
    fmax = 7600,                                # Should not exceed (sample_rate // 2)
    allow_clipping_in_normalization = True,     # Used when signal_normalization = True
    clip_mels_length = True,                    # If true, discards samples exceeding max_mel_frames
    use_lws = False,                            # "Fast spectrogram phase recovery using local weighted sums"
    symmetric_mels = True,                      # Sets mel range to [-max_abs_value, max_abs_value] if True,
                                                # and [0, max_abs_value] if False
    trim_silence = True,                        # Use with sample_rate of 16000 for best results
    silence_threshold=2,
    trim_fft_size=512,
    trim_hop_size=128,
    trim_top_db=23,

    ### SV2TTS
    speaker_embedding_size = 256,               # Dimension for the speaker embedding
    silence_min_duration_split = 0.4,           # Duration in seconds of a silence for an utterance to be split
    utterance_min_duration = 1.6,               # Duration in seconds below which utterances are discarded

    # Tacotron
    outputs_per_step=2,  # Was 1
    # number of frames to generate at each decoding step (increase to speed up computation and
    # allow for a higher batch size; decreases G&L audio quality)
    stop_at_any=True,
    # Determines whether the decoder should stop when predicting <stop> for any frame or for all of
    # them (True works pretty well)

    embedding_dim=512,  # dimension of embedding space (these are NOT the speaker embeddings)

    # Encoder parameters
    enc_conv_num_layers=3,  # number of encoder convolutional layers
    enc_conv_kernel_size=(5,),  # size of encoder convolution filters for each layer
    enc_conv_channels=512,  # number of encoder convolution filters for each layer
    encoder_lstm_units=256,  # number of lstm units for each direction (forward and backward)

    # Attention mechanism
    smoothing=False,  # Whether to smooth the attention normalization function
    attention_dim=128,  # dimension of attention space
    attention_filters=32,  # number of attention convolution filters
    attention_kernel=(31,),  # kernel size of attention convolution
    cumulative_weights=True,
    # Whether to cumulate (sum) all previous attention weights or simply feed previous weights
    # (Recommended: True)

    # Decoder
    prenet_layers=[256, 256],  # number of layers and number of units of prenet
    decoder_layers=2,  # number of decoder lstm layers
    decoder_lstm_units=1024,  # number of decoder lstm units on each layer
    max_iters=2000,
    # Max decoder steps during inference (just for safety from infinite loop cases)

    # Residual postnet
    postnet_num_layers=5,  # number of postnet convolutional layers
    postnet_kernel_size=(5,),  # size of postnet convolution filters for each layer
    postnet_channels=512,  # number of postnet convolution filters for each layer

    # CBHG mel->linear postnet
    cbhg_kernels=8,
    # All kernel sizes from 1 to cbhg_kernels will be used in the convolution bank of CBHG to act
    # as "K-grams"
    cbhg_conv_channels=128,  # Channels of the convolution bank
    cbhg_pool_size=2,  # pooling size of the CBHG
    cbhg_projection=256,
    # projection channels of the CBHG (1st projection, 2nd is automatically set to num_mels)
    cbhg_projection_kernel_size=3,  # kernel_size of the CBHG projections
    cbhg_highwaynet_layers=4,  # Number of HighwayNet layers
    cbhg_highway_units=128,  # Number of units used in HighwayNet fully connected layers
    cbhg_rnn_units=128,
    # Number of GRU units used in bidirectional RNN of CBHG block. CBHG output is 2x rnn_units in
    # shape

    # Loss params
    mask_encoder=True,
    # whether to mask encoder padding while computing attention. Set to True for better prosody
    # but slower convergence.
    mask_decoder=False,
    # Whether to use loss mask for padded sequences (if False, <stop_token> loss function will not
    # be weighted, else recommended pos_weight = 20)
    cross_entropy_pos_weight=20,
    # Use class weights to reduce the stop token classes imbalance (by adding more penalty on
    # False Negatives (FN)) (1 = disabled)
    predict_linear=False,
    # Whether to add a post-processing network to the Tacotron to predict linear spectrograms
    # (True mode not tested!!)
    ###########################################################################################################################################

    # Tacotron Training
    # Reproduction seeds
    tacotron_random_seed=5339,
    # Determines initial graph and operations (i.e: model) random state for reproducibility
    tacotron_data_random_state=1234,  # random state for train test split repeatability

    # performance parameters
    tacotron_swap_with_cpu=False,
    # Whether to use cpu as support to gpu for decoder computation (Not recommended: may cause
    # major slowdowns! Only use when critical!)

    # train/test split ratios, mini-batch sizes
    tacotron_batch_size=36,  # number of training samples on each training step (was 32)
    # Tacotron batch synthesis supports ~16x the training batch size (no gradients during
    # testing).
    # Training Tacotron with unmasked paddings makes it aware of them, which makes synthesis times
    # different from training. We thus recommend masking the encoder.
    tacotron_synthesis_batch_size=128,
    # DO NOT MAKE THIS BIGGER THAN 1 IF YOU DIDN'T TRAIN TACOTRON WITH "mask_encoder=True"!!
    tacotron_test_size=0.05,
    # % of data to keep as test data, if None, tacotron_test_batches must be not None. (5% is
    # enough to have a good idea about overfit)
    tacotron_test_batches=None,  # number of test batches.

    # Learning rate schedule
    tacotron_decay_learning_rate=True,
    # boolean, determines if the learning rate will follow an exponential decay
    tacotron_start_decay=50000,  # Step at which learning decay starts
    tacotron_decay_steps=50000,  # Determines the learning rate decay slope (UNDER TEST)
    tacotron_decay_rate=0.5,  # learning rate decay rate (UNDER TEST)
    tacotron_initial_learning_rate=1e-3,  # starting learning rate
    tacotron_final_learning_rate=1e-5,  # minimal learning rate

    # Optimization parameters
    tacotron_adam_beta1=0.9,  # AdamOptimizer beta1 parameter
    tacotron_adam_beta2=0.999,  # AdamOptimizer beta2 parameter
    tacotron_adam_epsilon=1e-6,  # AdamOptimizer Epsilon parameter

    # Regularization parameters
    tacotron_reg_weight=1e-7,  # regularization weight (for L2 regularization)
    tacotron_scale_regularization=False,
    # Whether to rescale regularization weight to adapt for outputs range (used when reg_weight is
    # high and biasing the model)
    tacotron_zoneout_rate=0.1,  # zoneout rate for all LSTM cells in the network
    tacotron_dropout_rate=0.5,  # dropout rate for all convolutional layers + prenet
    tacotron_clip_gradients=True,  # whether to clip gradients

    # Evaluation parameters
    natural_eval=False,
    # Whether to use 100% natural eval (to evaluate Curriculum Learning performance) or the same
    # teacher-forcing ratio as in training (just for overfit)

    # Decoder RNN learning can be done in one of two ways:
    #   Teacher Forcing: vanilla teacher forcing (usually with ratio = 1). mode="constant"
    #   Curriculum Learning Scheme: the shift from teacher forcing to sampling from previous
    #   outputs is a function of the global step. (teacher forcing ratio decay) mode="scheduled"
    # The second approach is inspired by:
    # Bengio et al. 2015: Scheduled Sampling for Sequence Prediction with Recurrent Neural Networks.
    # Can be found under: https://arxiv.org/pdf/1506.03099.pdf
    tacotron_teacher_forcing_mode="constant",
    # Can be ("constant" or "scheduled"). "scheduled" mode applies a cosine teacher forcing ratio
    # decay. (Preference: scheduled)
    tacotron_teacher_forcing_ratio=1.,
    # Value from [0., 1.], 0.=0%, 1.=100%, determines the % of times we force next decoder
    # inputs. Only relevant if mode="constant"
    tacotron_teacher_forcing_init_ratio=1.,
    # initial teacher forcing ratio. Relevant if mode="scheduled"
    tacotron_teacher_forcing_final_ratio=0.,
    # final teacher forcing ratio. Relevant if mode="scheduled"
    tacotron_teacher_forcing_start_decay=10000,
    # starting point of teacher forcing ratio decay. Relevant if mode="scheduled"
    tacotron_teacher_forcing_decay_steps=280000,
    # Determines the teacher forcing ratio decay slope. Relevant if mode="scheduled"
    tacotron_teacher_forcing_decay_alpha=0.,
    # teacher forcing ratio decay rate. Relevant if mode="scheduled"
    ###########################################################################################################################################

    # Tacotron-2 integration parameters
    train_with_GTA=False,
    # Whether to use GTA mels to train WaveNet instead of ground truth mels.
    ###########################################################################################################################################

    # Eval sentences (if no eval text file was specified during synthesis, these sentences are
    # used for eval)
    sentences=[
        # From July 8, 2017 New York Times:
        "Scientists at the CERN laboratory say they have discovered a new particle.",
        "There's a way to measure the acute emotional intelligence that has never gone out of "
        "style.",
        "President Trump met with other leaders at the Group of 20 conference.",
        "The Senate's bill to repeal and replace the Affordable Care Act is now imperiled.",
        # From Google's Tacotron example page:
        "Generative adversarial network or variational auto-encoder.",
        "Basilar membrane and otolaryngology are not auto-correlations.",
        "He has read the whole thing.",
        "He reads books.",
        "He thought it was time to present the present.",
        "Thisss isrealy awhsome.",
        "Punctuation sensitivity, is working.",
        "Punctuation sensitivity is working.",
        "Peter Piper picked a peck of pickled peppers. How many pickled peppers did Peter Piper pick?",
        "She sells sea-shells on the sea-shore. The shells she sells are sea-shells I'm sure.",
        "Tajima Airport serves Toyooka.",
        # From the web (random long utterance)
        "Sequence to sequence models have enjoyed great success in a variety of tasks such as machine translation, speech recognition, and text summarization. "
        "This project covers a sequence to sequence model trained to predict a speech representation from an input sequence of characters. We show that "
        "the adopted architecture is able to perform this task with wild success.",
        "Thank you so much for your support!",
    ],
)


def hparams_debug_string():
    values = hparams.values()
    hp = ["  %s: %s" % (name, values[name]) for name in sorted(values) if name != "sentences"]
    return "Hyperparameters:\n" + "\n".join(hp)
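Since hparams is a tf.contrib.training.HParams object, individual values can be overridden at run time with parse() instead of editing this file. A minimal sketch, assuming the TF 1.x stack this module already requires:

from synthesizer_tacotron2.hparams import hparams, hparams_debug_string

# Override a few values with command-line style "name=value" pairs
hparams.parse("tacotron_batch_size=12,outputs_per_step=1")
print(hparams_debug_string())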
165  synthesizer_tacotron2/inference.py  Normal file

@@ -0,0 +1,165 @@
from synthesizer.tacotron2 import Tacotron2
import torch
from synthesizer import audio
from synthesizer.hparams import hparams
from synthesizer.models.tacotron import Tacotron
from synthesizer.utils.symbols import symbols
from synthesizer.utils.text import text_to_sequence
from vocoder.display import simple_table
from pathlib import Path
from typing import Union, List
import numpy as np
import librosa
from utils import logmmse
from pypinyin import lazy_pinyin, Style
import os
import tensorflow as tf


class Synthesizer:
    sample_rate = hparams.sample_rate
    hparams = hparams

    def __init__(self, checkpoints_dir: Path, verbose=True, low_mem=False):
        """
        The model isn't instantiated and loaded in memory until needed or until load() is called.

        :param checkpoints_dir: path to the directory holding the trained model checkpoints
        :param verbose: if False, prints less information when using the model
        """
        self.verbose = verbose
        self._low_mem = low_mem

        # Prepare the model
        self._model = None  # type: Tacotron2
        checkpoint_state = tf.train.get_checkpoint_state(checkpoints_dir)
        if checkpoint_state is None:
            raise Exception("Could not find any synthesizer weights under %s" % checkpoints_dir)
        self.checkpoint_fpath = checkpoint_state.model_checkpoint_path
        if verbose:
            model_name = checkpoints_dir.parent.name.replace("logs-", "")
            step = int(self.checkpoint_fpath[self.checkpoint_fpath.rfind('-') + 1:])
            print("Found synthesizer \"%s\" trained to step %d" % (model_name, step))

    def is_loaded(self):
        """
        Whether the model is loaded in memory.
        """
        return self._model is not None

    def load(self):
        """
        Instantiates and loads the model given the weights file that was passed in the constructor.
        """
        if self._low_mem:
            raise Exception("Cannot load the synthesizer permanently in low mem mode")
        tf.reset_default_graph()
        self._model = Tacotron2(self.checkpoint_fpath, hparams)

    def synthesize_spectrograms(self, texts: List[str],
                                embeddings: Union[np.ndarray, List[np.ndarray]],
                                return_alignments=False):
        """
        Synthesizes mel spectrograms from texts and speaker embeddings.

        :param texts: a list of N text prompts to be synthesized
        :param embeddings: a numpy array or list of speaker embeddings of shape (N, 256)
        :param return_alignments: if True, a matrix representing the alignments between the
        characters and each decoder output step will be returned for each spectrogram
        :return: a list of N melspectrograms as numpy arrays of shape (80, Mi), where Mi is the
        sequence length of spectrogram i, and possibly the alignments.
        """
        # Load the model on the first request.
        if not self.is_loaded():
            self.load()

        print("Read " + str(texts))
        texts = [" ".join(lazy_pinyin(v, style=Style.TONE3, neutral_tone_with_five=True)) for v in texts]
        print("Synthesizing " + str(texts))
        # Preprocess text inputs
        inputs = [text_to_sequence(text, hparams.tts_cleaner_names) for text in texts]
        if not isinstance(embeddings, list):
            embeddings = [embeddings]

        # Batch inputs
        batched_inputs = [inputs[i:i+hparams.synthesis_batch_size]
                          for i in range(0, len(inputs), hparams.synthesis_batch_size)]
        batched_embeds = [embeddings[i:i+hparams.synthesis_batch_size]
                          for i in range(0, len(embeddings), hparams.synthesis_batch_size)]

        specs = []
        for i, batch in enumerate(batched_inputs, 1):
            if self.verbose:
                print(f"\n| Generating {i}/{len(batched_inputs)}")

            # Pad texts so they are all the same length
            text_lens = [len(text) for text in batch]
            max_text_len = max(text_lens)
            # chars = [pad1d(text, max_text_len) for text in batch]
            # chars = np.stack(chars)

            # Stack speaker embeddings into 2D array for batch processing
            speaker_embeds = np.stack(batched_embeds[i-1])

            # Convert to tensor (unused in the TF path):
            # chars = torch.tensor(chars).long().to(self.device)
            # speaker_embeddings = torch.tensor(speaker_embeds).float().to(self.device)

            # Inference
            specs, alignments = self._model.my_synthesize(speaker_embeds, texts)  # TODO: not yet settled whether to pass embeddings or speaker_embeds here

        if self.verbose:
            print("\n\nDone.\n")
        return (specs, alignments) if return_alignments else specs

    @staticmethod
    def load_preprocess_wav(fpath):
        """
        Loads and preprocesses an audio file under the same conditions the audio files were used to
        train the synthesizer.
        """
        wav = librosa.load(str(fpath), sr=hparams.sample_rate)[0]
        if hparams.rescale:
            wav = wav / np.abs(wav).max() * hparams.rescaling_max
        # denoise
        if len(wav) > hparams.sample_rate*(0.3+0.1):
            noise_wav = np.concatenate([wav[:int(hparams.sample_rate*0.15)],
                                        wav[-int(hparams.sample_rate*0.15):]])
            profile = logmmse.profile_noise(noise_wav, hparams.sample_rate)
            wav = logmmse.denoise(wav, profile)
        return wav

    @staticmethod
    def make_spectrogram(fpath_or_wav: Union[str, Path, np.ndarray]):
        """
        Creates a mel spectrogram from an audio file in the same manner as the mel spectrograms that
        were fed to the synthesizer when training.
        """
        if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path):
            wav = Synthesizer.load_preprocess_wav(fpath_or_wav)
        else:
            wav = fpath_or_wav

        mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
        return mel_spectrogram

    @staticmethod
    def griffin_lim(mel):
        """
        Inverts a mel spectrogram using Griffin-Lim. The mel spectrogram is expected to have been built
        with the same parameters present in hparams.py.
        """
        return audio.inv_mel_spectrogram(mel, hparams)


def pad1d(x, max_len, pad_value=0):
    return np.pad(x, (0, max_len - len(x)), mode="constant", constant_values=pad_value)
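End to end, the class above is meant to be driven roughly as follows. This is a sketch only: the checkpoint directory and input text are hypothetical, and the embedding is a random stand-in for one produced by the repository's speaker encoder:

import numpy as np
from pathlib import Path
from synthesizer_tacotron2.inference import Synthesizer

# Hypothetical checkpoint directory containing a TF checkpoint state file
synth = Synthesizer(Path("synthesizer/saved_models/logs-mandarin/taco_pretrained"))

# Stand-in for a real speaker embedding of shape (256,)
embed = np.random.rand(256).astype(np.float32)
embed /= np.linalg.norm(embed)

# Text is converted to pinyin internally before synthesis
specs = synth.synthesize_spectrograms(["欢迎使用语音克隆工具"], [embed])
wav = Synthesizer.griffin_lim(specs[0])  # quick Griffin-Lim preview; a neural vocoder sounds better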
50  synthesizer_tacotron2/infolog.py  Normal file

@@ -0,0 +1,50 @@
import atexit
import json
from datetime import datetime
from threading import Thread
from urllib.request import Request, urlopen

_format = "%Y-%m-%d %H:%M:%S.%f"
_file = None
_run_name = None
_slack_url = None


def init(filename, run_name, slack_url=None):
    global _file, _run_name, _slack_url
    _close_logfile()
    _file = open(filename, "a")
    _file.write("\n-----------------------------------------------------------------\n")
    _file.write("Starting new {} training run\n".format(run_name))
    _file.write("-----------------------------------------------------------------\n")
    _run_name = run_name
    _slack_url = slack_url


def log(msg, end="\n", slack=False):
    print(msg, end=end)
    if _file is not None:
        _file.write("[%s] %s\n" % (datetime.now().strftime(_format)[:-3], msg))
    if slack and _slack_url is not None:
        Thread(target=_send_slack, args=(msg,)).start()


def _close_logfile():
    global _file
    if _file is not None:
        _file.close()
        _file = None


def _send_slack(msg):
    req = Request(_slack_url)
    req.add_header("Content-Type", "application/json")
    urlopen(req, json.dumps({
        "username": "tacotron",
        "icon_emoji": ":taco:",
        "text": "*%s*: %s" % (_run_name, msg)
    }).encode())


atexit.register(_close_logfile)
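Typical usage from a training script; the log file path here is hypothetical and the Slack webhook is optional:

from synthesizer_tacotron2 import infolog

infolog.init("logs-tacotron2/train.log", run_name="tacotron2")  # hypothetical path
infolog.log("Step 100: loss = 0.473")          # goes to stdout and the log file
infolog.log("Training finished", slack=True)   # also posted to Slack if a webhook was passed to init()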
8  synthesizer_tacotron2/models/__init__.py  Normal file

@@ -0,0 +1,8 @@
from .tacotron import Tacotron


def create_model(name, hparams):
    if name == "Tacotron":
        return Tacotron(hparams)
    else:
        raise Exception("Unknown model: " + name)
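A training script would normally go through this factory rather than importing Tacotron directly; a minimal sketch, assuming the hparams object from synthesizer_tacotron2/hparams.py:

from synthesizer_tacotron2.hparams import hparams
from synthesizer_tacotron2.models import create_model

model = create_model("Tacotron", hparams)  # any other name raises "Unknown model"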
207  synthesizer_tacotron2/models/architecture_wrappers.py  Normal file

@@ -0,0 +1,207 @@
"""A set of wrappers useful for the tacotron 2 architecture
All notations and variable names were used in concordance with the original tensorflow implementation
"""
import collections
import tensorflow as tf
from synthesizer.models.attention import _compute_attention
from tensorflow.contrib.rnn import RNNCell
from tensorflow.python.framework import ops, tensor_shape
from tensorflow.python.ops import array_ops, check_ops, rnn_cell_impl, tensor_array_ops
from tensorflow.python.util import nest

_zero_state_tensors = rnn_cell_impl._zero_state_tensors


class TacotronEncoderCell(RNNCell):
    """Tacotron 2 Encoder Cell
    Passes inputs through a stack of convolutional layers then through a bidirectional LSTM
    layer to predict the hidden representation vector (or memory)
    """

    def __init__(self, convolutional_layers, lstm_layer):
        """Initialize encoder parameters

        Args:
            convolutional_layers: Encoder convolutional block class
            lstm_layer: encoder bidirectional lstm layer class
        """
        super(TacotronEncoderCell, self).__init__()
        #Initialize encoder layers
        self._convolutions = convolutional_layers
        self._cell = lstm_layer

    def __call__(self, inputs, input_lengths=None):
        #Pass input sequence through a stack of convolutional layers
        conv_output = self._convolutions(inputs)

        #Extract hidden representation from encoder lstm cells
        hidden_representation = self._cell(conv_output, input_lengths)

        #For shape visualization
        self.conv_output_shape = conv_output.shape
        return hidden_representation


class TacotronDecoderCellState(
    collections.namedtuple("TacotronDecoderCellState",
                           ("cell_state", "attention", "time", "alignments",
                            "alignment_history"))):
    """`namedtuple` storing the state of a `TacotronDecoderCell`.
    Contains:
      - `cell_state`: The state of the wrapped `RNNCell` at the previous time
        step.
      - `attention`: The attention emitted at the previous time step.
      - `time`: int32 scalar containing the current time step.
      - `alignments`: A single or tuple of `Tensor`(s) containing the alignments
        emitted at the previous time step for each attention mechanism.
      - `alignment_history`: a single or tuple of `TensorArray`(s)
        containing alignment matrices from all time steps for each attention
        mechanism. Call `stack()` on each to convert to a `Tensor`.
    """
    def replace(self, **kwargs):
        """Clones the current state while overwriting components provided by kwargs.
        """
        return super(TacotronDecoderCellState, self)._replace(**kwargs)


class TacotronDecoderCell(RNNCell):
    """Tacotron 2 Decoder Cell
    Decodes encoder output and previous mel frames into the next r frames

    Decoder Step i:
        1) Prenet to compress last output information
        2) Concat compressed inputs with previous context vector (input feeding) *
        3) Decoder RNN (actual decoding) to predict current state s_{i} *
        4) Compute new context vector c_{i} based on s_{i} and a cumulative sum of previous alignments *
        5) Predict new output y_{i} using s_{i} and c_{i} (concatenated)
        6) Predict <stop_token> output ys_{i} using s_{i} and c_{i} (concatenated)

    * : This is typically done by taking a vanilla LSTM, wrapping it with tensorflow's attention wrapper,
    and wrapping that with the prenet before doing the input feeding, and with the prediction layer
    that uses RNN states to project onto the output space. Actions marked with (*) could be replaced with
    tensorflow's attention wrapper call if it used cumulative alignments instead of previous alignments only.
    """

    def __init__(self, prenet, attention_mechanism, rnn_cell, frame_projection, stop_projection):
        """Initialize decoder parameters

        Args:
            prenet: A tensorflow fully connected layer acting as the decoder pre-net
            attention_mechanism: A _BaseAttentionMechanism instance, useful to
                learn encoder-decoder alignments
            rnn_cell: Instance of RNNCell, main body of the decoder
            frame_projection: tensorflow fully connected layer with r * num_mels output units
            stop_projection: tensorflow fully connected layer, expected to project to a scalar
                and through a sigmoid activation
            mask_finished: Boolean, whether to mask decoder frames after the <stop_token>
        """
        super(TacotronDecoderCell, self).__init__()
        #Initialize decoder layers
        self._prenet = prenet
        self._attention_mechanism = attention_mechanism
        self._cell = rnn_cell
        self._frame_projection = frame_projection
        self._stop_projection = stop_projection

        self._attention_layer_size = self._attention_mechanism.values.get_shape()[-1].value

    def _batch_size_checks(self, batch_size, error_message):
        return [check_ops.assert_equal(batch_size,
                                       self._attention_mechanism.batch_size,
                                       message=error_message)]

    @property
    def output_size(self):
        return self._frame_projection.shape

    @property
    def state_size(self):
        """The `state_size` property of `TacotronDecoderCell`.

        Returns:
            A `TacotronDecoderCellState` tuple containing shapes used by this object.
        """
        return TacotronDecoderCellState(
            cell_state=self._cell._cell.state_size,
            time=tensor_shape.TensorShape([]),
            attention=self._attention_layer_size,
            alignments=self._attention_mechanism.alignments_size,
            alignment_history=())

    def zero_state(self, batch_size, dtype):
        """Return an initial (zero) state tuple for this `AttentionWrapper`.

        Args:
            batch_size: `0D` integer tensor: the batch size.
            dtype: The internal state data type.
        Returns:
            A `TacotronDecoderCellState` tuple containing zeroed out tensors and,
            possibly, empty `TensorArray` objects.
        Raises:
            ValueError: (or, possibly at runtime, InvalidArgument), if
                `batch_size` does not match the output size of the encoder passed
                to the wrapper object at initialization time.
        """
        with ops.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]):
            cell_state = self._cell._cell.zero_state(batch_size, dtype)
            error_message = (
                "When calling zero_state of TacotronDecoderCell %s: " % self._base_name +
                "Non-matching batch sizes between the memory "
                "(encoder output) and the requested batch size.")
            with ops.control_dependencies(
                self._batch_size_checks(batch_size, error_message)):
                cell_state = nest.map_structure(
                    lambda s: array_ops.identity(s, name="checked_cell_state"),
                    cell_state)
            return TacotronDecoderCellState(
                cell_state=cell_state,
                time=array_ops.zeros([], dtype=tf.int32),
                attention=_zero_state_tensors(self._attention_layer_size, batch_size,
                                              dtype),
                alignments=self._attention_mechanism.initial_alignments(batch_size, dtype),
                alignment_history=tensor_array_ops.TensorArray(dtype=dtype, size=0,
                                                               dynamic_size=True))

    def __call__(self, inputs, state):
        #Information bottleneck (essential for learning attention)
        prenet_output = self._prenet(inputs)

        #Concat context vector and prenet output to form LSTM cells input (input feeding)
        LSTM_input = tf.concat([prenet_output, state.attention], axis=-1)

        #Unidirectional LSTM layers
        LSTM_output, next_cell_state = self._cell(LSTM_input, state.cell_state)

        #Compute the attention (context) vector and alignments using
        #the new decoder cell hidden state as the query vector
        #and cumulative alignments to extract location features
        #The choice of the new cell hidden state (s_{i}) of the last
        #decoder RNN Cell is based on Luong et al. (2015):
        #https://arxiv.org/pdf/1508.04025.pdf
        previous_alignments = state.alignments
        previous_alignment_history = state.alignment_history
        context_vector, alignments, cumulated_alignments = _compute_attention(self._attention_mechanism,
                                                                              LSTM_output,
                                                                              previous_alignments,
                                                                              attention_layer=None)

        #Concat LSTM outputs and context vector to form projections inputs
        projections_input = tf.concat([LSTM_output, context_vector], axis=-1)

        #Compute predicted frames and predicted <stop_token>
        cell_outputs = self._frame_projection(projections_input)
        stop_tokens = self._stop_projection(projections_input)

        #Save alignment history
        alignment_history = previous_alignment_history.write(state.time, alignments)

        #Prepare next decoder state
        next_state = TacotronDecoderCellState(
            time=state.time + 1,
            cell_state=next_cell_state,
            attention=context_vector,
            alignments=cumulated_alignments,
            alignment_history=alignment_history)

        return (cell_outputs, stop_tokens), next_state
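The shape bookkeeping of one decoder step is easy to lose in the TF wrapper code. The plain-numpy sketch below traces steps 2 through 6 of the docstring; the dimensions are assumptions taken from hparams.py, and only shapes are exercised, not the actual math:

import numpy as np

B, T_enc = 8, 60          # batch size, encoder time steps (memory length)
prenet_out = 256          # last prenet layer size (hparams.prenet_layers[-1])
enc_dim = 512             # encoder output = 2 * encoder_lstm_units
lstm_units = 1024         # decoder_lstm_units
r, num_mels = 2, 80       # outputs_per_step, num_mels

prenet_output = np.zeros((B, prenet_out))
attention_context = np.zeros((B, enc_dim))           # state.attention

# 2) input feeding: concat prenet output with the previous context vector
lstm_input = np.concatenate([prenet_output, attention_context], axis=-1)
assert lstm_input.shape == (B, prenet_out + enc_dim)

# 3) decoder LSTM produces the query s_i
lstm_output = np.zeros((B, lstm_units))

# 4) attention: alignments over the memory, context = alignments @ memory
alignments = np.full((B, T_enc), 1.0 / T_enc)        # a softmax over encoder steps
memory = np.zeros((B, T_enc, enc_dim))
context = np.einsum("bt,btd->bd", alignments, memory)

# 5) + 6) both projections consume [s_i; c_i]
projections_input = np.concatenate([lstm_output, context], axis=-1)
frames = np.zeros((B, r * num_mels))                 # frame_projection output
stop_tokens = np.zeros((B, r))                       # stop_projection output
print(projections_input.shape, frames.shape, stop_tokens.shape)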
207
synthesizer_tacotron2/models/attention.py
Normal file
207
synthesizer_tacotron2/models/attention.py
Normal file
|
@ -0,0 +1,207 @@
|
|||
"""Attention file for location based attention (compatible with tensorflow attention wrapper)"""
|
||||
|
||||
import tensorflow as tf
|
||||
from tensorflow.contrib.seq2seq.python.ops.attention_wrapper import BahdanauAttention
|
||||
from tensorflow.python.layers import core as layers_core
|
||||
from tensorflow.python.ops import array_ops, math_ops, nn_ops, variable_scope
|
||||
|
||||
|
||||
#From https://github.com/tensorflow/tensorflow/blob/r1.7/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
|
||||
def _compute_attention(attention_mechanism, cell_output, attention_state,
|
||||
attention_layer):
|
||||
"""Computes the attention and alignments for a given attention_mechanism."""
|
||||
alignments, next_attention_state = attention_mechanism(
|
||||
cell_output, state=attention_state)
|
||||
|
||||
# Reshape from [batch_size, memory_time] to [batch_size, 1, memory_time]
|
||||
expanded_alignments = array_ops.expand_dims(alignments, 1)
|
||||
# Context is the inner product of alignments and values along the
|
||||
# memory time dimension.
|
||||
# alignments shape is
|
||||
# [batch_size, 1, memory_time]
|
||||
# attention_mechanism.values shape is
|
||||
# [batch_size, memory_time, memory_size]
|
||||
# the batched matmul is over memory_time, so the output shape is
|
||||
# [batch_size, 1, memory_size].
|
||||
# we then squeeze out the singleton dim.
|
||||
context = math_ops.matmul(expanded_alignments, attention_mechanism.values)
|
||||
context = array_ops.squeeze(context, [1])
|
||||
|
||||
if attention_layer is not None:
|
||||
attention = attention_layer(array_ops.concat([cell_output, context], 1))
|
||||
else:
|
||||
attention = context
|
||||
|
||||
return attention, alignments, next_attention_state
|
||||
|
||||
|
||||
def _location_sensitive_score(W_query, W_fil, W_keys):
    """Implements the Bahdanau-style (cumulative) scoring function.
    This attention is described in:
        J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Bengio,
        "Attention-based models for speech recognition," in Advances in
        Neural Information Processing Systems, 2015, pp. 577-585.

    #############################################################################
              hybrid attention (content-based + location-based)
                               f = F * α_{i-1}
       energy = dot(v_a, tanh(W_keys(h_enc) + W_query(h_dec) + W_fil(f) + b_a))
    #############################################################################

    Args:
        W_query: Tensor, shape "[batch_size, 1, attention_dim]" to compare to location features.
        W_fil: processed previous alignments turned into location features, shape
            "[batch_size, max_time, attention_dim]"
        W_keys: Tensor, shape "[batch_size, max_time, attention_dim]", typically the encoder outputs.
    Returns:
        A "[batch_size, max_time]" attention score (energy)
    """
    # Get the number of hidden units from the trailing dimension of keys
    dtype = W_query.dtype
    num_units = W_keys.shape[-1].value or array_ops.shape(W_keys)[-1]

    v_a = tf.get_variable(
        "attention_variable_projection", shape=[num_units], dtype=dtype,
        initializer=tf.contrib.layers.xavier_initializer())
    b_a = tf.get_variable(
        "attention_bias", shape=[num_units], dtype=dtype,
        initializer=tf.zeros_initializer())

    return tf.reduce_sum(v_a * tf.tanh(W_keys + W_query + W_fil + b_a), [2])


def _smoothing_normalization(e):
    """Applies a smoothing normalization function instead of softmax
    Introduced in:
        J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Bengio,
        "Attention-based models for speech recognition," in Advances in
        Neural Information Processing Systems, 2015, pp. 577-585.

    ############################################################################
                        Smoothing normalization function
               a_{i, j} = sigmoid(e_{i, j}) / sum_j(sigmoid(e_{i, j}))
    ############################################################################

    Args:
        e: matrix [batch_size, max_time(memory_time)]: expected to be the energy (score)
            values of an attention mechanism
    Returns:
        matrix [batch_size, max_time]: [0, 1] normalized alignments with possible
            attendance to multiple memory time steps.
    """
    return tf.nn.sigmoid(e) / tf.reduce_sum(tf.nn.sigmoid(e), axis=-1, keepdims=True)
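
# Minimal numeric sketch (assumed values) contrasting this smoothing
# normalization with softmax; both rows sum to 1, but the sigmoid variant
# spreads mass more evenly across several memory steps:
#
#   import numpy as np
#   e = np.array([[2.0, 1.0, -1.0]])
#   softmax = np.exp(e) / np.exp(e).sum(axis=-1, keepdims=True)  # ~[0.71, 0.26, 0.04]
#   sig = 1 / (1 + np.exp(-e))
#   smoothed = sig / sig.sum(axis=-1, keepdims=True)             # ~[0.47, 0.39, 0.14]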


class LocationSensitiveAttention(BahdanauAttention):
    """Implements the Bahdanau-style (cumulative) scoring function.
    Usually referred to as "hybrid" attention (content-based + location-based).
    Extends the additive attention described in:
        "D. Bahdanau, K. Cho, and Y. Bengio, "Neural machine translation by jointly
        learning to align and translate," in Proceedings of ICLR, 2015."
    to use previous alignments as additional location features.

    This attention is described in:
        J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Bengio,
        "Attention-based models for speech recognition," in Advances in
        Neural Information Processing Systems, 2015, pp. 577-585.
    """

    def __init__(self,
                 num_units,
                 memory,
                 hparams,
                 mask_encoder=True,
                 memory_sequence_length=None,
                 smoothing=False,
                 cumulate_weights=True,
                 name="LocationSensitiveAttention"):
        """Construct the Attention mechanism.
        Args:
            num_units: The depth of the query mechanism.
            memory: The memory to query; usually the output of an RNN encoder. This
                tensor should be shaped `[batch_size, max_time, ...]`.
            hparams: hyperparameters object, providing attention_filters and attention_kernel.
            mask_encoder (optional): Boolean, whether to mask encoder paddings.
            memory_sequence_length (optional): Sequence lengths for the batch entries
                in memory. If provided, the memory tensor rows are masked with zeros
                for values past the respective sequence lengths. Only relevant if mask_encoder = True.
            smoothing (optional): Boolean. Determines which normalization function to use.
                The default normalization function (probability_fn) is softmax. If smoothing is
                enabled, we replace softmax with:
                        a_{i, j} = sigmoid(e_{i, j}) / sum_j(sigmoid(e_{i, j}))
                Introduced in:
                    J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Bengio,
                    "Attention-based models for speech recognition," in Advances in
                    Neural Information Processing Systems, 2015, pp. 577-585.
                This is mainly used if the model wants to attend to multiple input parts
                at the same decoding step. We probably won't be using it, since multiple sound
                frames may depend on the same character/phone, but probably not the other way
                around.
                Note:
                    We keep it implemented in case we want to test it. It was used in the
                    paper in the context of speech recognition, where one phoneme may depend on
                    multiple subsequent sound frames.
            name: Name to use when creating ops.
        """
        # Create normalization function
        # Setting it to None defaults to using softmax
        normalization_function = _smoothing_normalization if smoothing else None
        memory_length = memory_sequence_length if mask_encoder else None
        super(LocationSensitiveAttention, self).__init__(
            num_units=num_units,
            memory=memory,
            memory_sequence_length=memory_length,
            probability_fn=normalization_function,
            name=name)

        self.location_convolution = tf.layers.Conv1D(filters=hparams.attention_filters,
                                                     kernel_size=hparams.attention_kernel,
                                                     padding="same", use_bias=True,
                                                     bias_initializer=tf.zeros_initializer(),
                                                     name="location_features_convolution")
        self.location_layer = tf.layers.Dense(units=num_units, use_bias=False,
                                              dtype=tf.float32, name="location_features_layer")
        self._cumulate = cumulate_weights

    def __call__(self, query, state):
        """Score the query based on the keys and values.
        Args:
            query: Tensor of dtype matching `self.values` and shape
                `[batch_size, query_depth]`.
            state (previous alignments): Tensor of dtype matching `self.values` and shape
                `[batch_size, alignments_size]`
                (`alignments_size` is memory's `max_time`).
        Returns:
            alignments: Tensor of dtype matching `self.values` and shape
                `[batch_size, alignments_size]` (`alignments_size` is memory's
                `max_time`).
        """
        previous_alignments = state
        with variable_scope.variable_scope(None, "Location_Sensitive_Attention", [query]):

            # processed_query shape [batch_size, query_depth] -> [batch_size, attention_dim]
            processed_query = self.query_layer(query) if self.query_layer else query
            # -> [batch_size, 1, attention_dim]
            processed_query = tf.expand_dims(processed_query, 1)

            # processed_location_features shape [batch_size, max_time, attention dimension]
            # [batch_size, max_time] -> [batch_size, max_time, 1]
            expanded_alignments = tf.expand_dims(previous_alignments, axis=2)
            # location features [batch_size, max_time, filters]
            f = self.location_convolution(expanded_alignments)
            # Projected location features [batch_size, max_time, attention_dim]
            processed_location_features = self.location_layer(f)

            # energy shape [batch_size, max_time]
            energy = _location_sensitive_score(processed_query, processed_location_features,
                                               self.keys)

        # alignments shape = energy shape = [batch_size, max_time]
        alignments = self._probability_fn(energy, previous_alignments)

        # Cumulate alignments
        if self._cumulate:
            next_state = alignments + previous_alignments
        else:
            next_state = alignments

        return alignments, next_state
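
# A minimal construction sketch (illustrative; the concrete numbers are
# assumptions, and hp is assumed to carry the fields this class reads):
#
#   attention_mechanism = LocationSensitiveAttention(
#       num_units=128,                        # attention depth
#       memory=encoder_outputs,               # [N, T_in, encoder_dim]
#       hparams=hp,                           # provides hp.attention_filters / hp.attention_kernel
#       memory_sequence_length=input_lengths, # masks encoder paddings
#       smoothing=False,                      # keep the default softmax
#       cumulate_weights=True)                # feed cumulative alignments as location features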
132
synthesizer_tacotron2/models/custom_decoder.py
Normal file
@ -0,0 +1,132 @@
from __future__ import absolute_import, division, print_function

import collections

import tensorflow as tf
from synthesizer.models.helpers import TacoTestHelper, TacoTrainingHelper
from tensorflow.contrib.seq2seq.python.ops import decoder
from tensorflow.contrib.seq2seq.python.ops import helper as helper_py
from tensorflow.python.framework import ops, tensor_shape
from tensorflow.python.layers import base as layers_base
from tensorflow.python.ops import rnn_cell_impl
from tensorflow.python.util import nest


class CustomDecoderOutput(
        collections.namedtuple("CustomDecoderOutput", ("rnn_output", "token_output", "sample_id"))):
    pass


class CustomDecoder(decoder.Decoder):
    """Custom sampling decoder.

    Allows for stop token prediction at inference time
    and returns the equivalent loss at training time.

    Note:
        Only use this decoder with Tacotron 2, as it only accepts tacotron custom helpers
    """

    def __init__(self, cell, helper, initial_state, output_layer=None):
        """Initialize CustomDecoder.
        Args:
            cell: An `RNNCell` instance.
            helper: A `Helper` instance.
            initial_state: A (possibly nested tuple of...) tensors and TensorArrays.
                The initial state of the RNNCell.
            output_layer: (Optional) An instance of `tf.layers.Layer`, i.e.,
                `tf.layers.Dense`. Optional layer to apply to the RNN output prior
                to storing the result or sampling.
        Raises:
            TypeError: if `cell`, `helper` or `output_layer` have an incorrect type.
        """
        rnn_cell_impl.assert_like_rnncell(type(cell), cell)
        if not isinstance(helper, helper_py.Helper):
            raise TypeError("helper must be a Helper, received: %s" % type(helper))
        if (output_layer is not None
                and not isinstance(output_layer, layers_base.Layer)):
            raise TypeError(
                "output_layer must be a Layer, received: %s" % type(output_layer))
        self._cell = cell
        self._helper = helper
        self._initial_state = initial_state
        self._output_layer = output_layer

    @property
    def batch_size(self):
        return self._helper.batch_size

    def _rnn_output_size(self):
        size = self._cell.output_size
        if self._output_layer is None:
            return size
        else:
            # To use the layer's compute_output_shape, we need to convert the
            # RNNCell's output_size entries into shapes with an unknown
            # batch size. We then pass this through the layer's
            # compute_output_shape and read off all but the first (batch)
            # dimensions to get the output size of the rnn with the layer
            # applied to the top.
            output_shape_with_unknown_batch = nest.map_structure(
                lambda s: tensor_shape.TensorShape([None]).concatenate(s),
                size)
            layer_output_shape = self._output_layer._compute_output_shape(  # pylint: disable=protected-access
                output_shape_with_unknown_batch)
            return nest.map_structure(lambda s: s[1:], layer_output_shape)

    @property
    def output_size(self):
        # Return the cell output and the id
        return CustomDecoderOutput(
            rnn_output=self._rnn_output_size(),
            token_output=self._helper.token_output_size,
            sample_id=self._helper.sample_ids_shape)

    @property
    def output_dtype(self):
        # Assume the dtype of the cell is the output_size structure
        # containing the input_state's first component's dtype.
        # Return that structure and the sample_ids_dtype from the helper.
        dtype = nest.flatten(self._initial_state)[0].dtype
        return CustomDecoderOutput(
            nest.map_structure(lambda _: dtype, self._rnn_output_size()),
            tf.float32,
            self._helper.sample_ids_dtype)

    def initialize(self, name=None):
        """Initialize the decoder.
        Args:
            name: Name scope for any created operations.
        Returns:
            `(finished, first_inputs, initial_state)`.
        """
        return self._helper.initialize() + (self._initial_state,)

    def step(self, time, inputs, state, name=None):
        """Perform a custom decoding step.
        Enables dynamic <stop_token> prediction
        Args:
            time: scalar `int32` tensor.
            inputs: A (structure of) input tensors.
            state: A (structure of) state tensors and TensorArrays.
            name: Name scope for any created operations.
        Returns:
            `(outputs, next_state, next_inputs, finished)`.
        """
        with ops.name_scope(name, "CustomDecoderStep", (time, inputs, state)):
            # Call the output projection wrapper cell
            (cell_outputs, stop_token), cell_state = self._cell(inputs, state)

            # Apply the output_layer (if it exists)
            if self._output_layer is not None:
                cell_outputs = self._output_layer(cell_outputs)
            sample_ids = self._helper.sample(
                time=time, outputs=cell_outputs, state=cell_state)

            (finished, next_inputs, next_state) = self._helper.next_inputs(
                time=time,
                outputs=cell_outputs,
                state=cell_state,
                sample_ids=sample_ids,
                stop_token_prediction=stop_token)

        outputs = CustomDecoderOutput(cell_outputs, stop_token, sample_ids)
        return (outputs, next_state, next_inputs, finished)
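
# Sketch of how this decoder is typically driven (this mirrors the call made
# in tacotron.py later in this commit; the maximum_iterations value here is an
# assumed example, not a fixed constant):
#
#   (frames, stop_tokens, _), final_state, _ = dynamic_decode(
#       CustomDecoder(decoder_cell, helper, decoder_init_state),
#       impute_finished=False,
#       maximum_iterations=1000)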
161
synthesizer_tacotron2/models/helpers.py
Normal file
@ -0,0 +1,161 @@
import numpy as np
import tensorflow as tf
from tensorflow.contrib.seq2seq import Helper


class TacoTestHelper(Helper):
    def __init__(self, batch_size, hparams):
        with tf.name_scope("TacoTestHelper"):
            self._batch_size = batch_size
            self._output_dim = hparams.num_mels
            self._reduction_factor = hparams.outputs_per_step
            self.stop_at_any = hparams.stop_at_any

    @property
    def batch_size(self):
        return self._batch_size

    @property
    def token_output_size(self):
        return self._reduction_factor

    @property
    def sample_ids_shape(self):
        return tf.TensorShape([])

    @property
    def sample_ids_dtype(self):
        return np.int32

    def initialize(self, name=None):
        return (tf.tile([False], [self._batch_size]),
                _go_frames(self._batch_size, self._output_dim))

    def sample(self, time, outputs, state, name=None):
        return tf.tile([0], [self._batch_size])  # Return all 0; we ignore them

    def next_inputs(self, time, outputs, state, sample_ids, stop_token_prediction, name=None):
        """Stop on EOS. Otherwise, pass the last output as the next input and pass through state."""
        with tf.name_scope("TacoTestHelper"):
            # A sequence is finished when the output probability is > 0.5
            finished = tf.cast(tf.round(stop_token_prediction), tf.bool)

            # Since we are predicting r frames at each step, two modes are
            # then possible:
            #   Stop when the model outputs p > 0.5 for any frame among the r frames (Recommended)
            #   Stop when the model outputs p > 0.5 for all r frames (Safer)
            # Note:
            #   With enough training steps, the model should be able to predict when to stop
            #   correctly, and the use of stop_at_any = True would be recommended. If however the
            #   model didn't learn to stop correctly yet (stops too soon), one could choose to use
            #   the safer option to get a correct synthesis
            if self.stop_at_any:
                finished = tf.reduce_any(tf.reduce_all(finished, axis=0))  # Recommended
            else:
                finished = tf.reduce_all(tf.reduce_all(finished, axis=0))  # Safer option

            # Feed the last output frame as the next input. outputs is [N, output_dim * r]
            next_inputs = outputs[:, -self._output_dim:]
            next_state = state
            return (finished, next_inputs, next_state)


class TacoTrainingHelper(Helper):
    def __init__(self, batch_size, targets, hparams, gta, evaluating, global_step):
        # inputs is [N, T_in], targets is [N, T_out, D]
        with tf.name_scope("TacoTrainingHelper"):
            self._batch_size = batch_size
            self._output_dim = hparams.num_mels
            self._reduction_factor = hparams.outputs_per_step
            self._ratio = tf.convert_to_tensor(hparams.tacotron_teacher_forcing_ratio)
            self.gta = gta
            self.eval = evaluating
            self._hparams = hparams
            self.global_step = global_step

            r = self._reduction_factor
            # Feed every r-th target frame as input
            self._targets = targets[:, r-1::r, :]

            # Maximum sequence length
            self._lengths = tf.tile([tf.shape(self._targets)[1]], [self._batch_size])

    @property
    def batch_size(self):
        return self._batch_size

    @property
    def token_output_size(self):
        return self._reduction_factor

    @property
    def sample_ids_shape(self):
        return tf.TensorShape([])

    @property
    def sample_ids_dtype(self):
        return np.int32

    def initialize(self, name=None):
        # Compute the teacher forcing ratio for this global step.
        # In GTA mode, override the teacher forcing scheme to work with full teacher forcing
        if self.gta:
            self._ratio = tf.convert_to_tensor(1.)  # Force GTA model to always feed ground-truth
        elif self.eval and self._hparams.natural_eval:
            self._ratio = tf.convert_to_tensor(0.)  # Force eval model to always feed predictions
        else:
            if self._hparams.tacotron_teacher_forcing_mode == "scheduled":
                self._ratio = _teacher_forcing_ratio_decay(
                    self._hparams.tacotron_teacher_forcing_init_ratio,
                    self.global_step, self._hparams)

        return (tf.tile([False], [self._batch_size]),
                _go_frames(self._batch_size, self._output_dim))

    def sample(self, time, outputs, state, name=None):
        return tf.tile([0], [self._batch_size])  # Return all 0; we ignore them

    def next_inputs(self, time, outputs, state, sample_ids, stop_token_prediction, name=None):
        with tf.name_scope(name or "TacoTrainingHelper"):
            # Synthesis stop (we let the model see paddings, as we mask them when computing the
            # loss functions)
            finished = (time + 1 >= self._lengths)

            # Pick previous outputs randomly with respect to the teacher forcing ratio
            next_inputs = tf.cond(
                tf.less(tf.random_uniform([], minval=0, maxval=1, dtype=tf.float32), self._ratio),
                lambda: self._targets[:, time, :],  # Teacher-forcing: return the true frame
                lambda: outputs[:, -self._output_dim:])

            # Pass on state
            next_state = state
            return (finished, next_inputs, next_state)


def _go_frames(batch_size, output_dim):
    """Returns all-zero <GO> frames for a given batch size and output dimension"""
    return tf.tile([[0.0]], [batch_size, output_dim])


def _teacher_forcing_ratio_decay(init_tfr, global_step, hparams):
    #################################################################
    # Narrow Cosine Decay:

    # Phase 1: tfr = 1
    # We only start the teacher forcing decay after 10k steps

    # Phase 2: tfr in ]0, 1[
    # The decay reaches its minimal value at step ~280k

    # Phase 3: tfr = 0
    # Clip by the minimal teacher forcing ratio value (step >~ 280k)
    #################################################################
    # Compute natural cosine decay
    tfr = tf.train.cosine_decay(init_tfr,
                                global_step=global_step - hparams.tacotron_teacher_forcing_start_decay,  # tfr = 1 at step 10k
                                decay_steps=hparams.tacotron_teacher_forcing_decay_steps,  # tfr = 0 at step ~280k
                                alpha=hparams.tacotron_teacher_forcing_decay_alpha,  # tfr = 0% of init_tfr as final value
                                name="tfr_cosine_decay")

    # Force the teacher forcing ratio to take its initial value when global step < start decay step.
    narrow_tfr = tf.cond(
        tf.less(global_step, tf.convert_to_tensor(hparams.tacotron_teacher_forcing_start_decay)),
        lambda: tf.convert_to_tensor(init_tfr),
        lambda: tfr)

    return narrow_tfr
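
# Illustrative schedule (assuming start_decay=10k, decay_steps=270k, alpha=0.
# and init_tfr=1.0, which are plausible hparams values rather than ones fixed
# here): tfr stays at 1.0 until step 10k, follows a half cosine down to 0.0
# around step 280k, and stays clipped at 0.0 afterwards.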
528
synthesizer_tacotron2/models/modules.py
Normal file
@ -0,0 +1,528 @@
import tensorflow as tf


class HighwayNet:
    def __init__(self, units, name=None):
        self.units = units
        self.scope = "HighwayNet" if name is None else name

        self.H_layer = tf.layers.Dense(units=self.units, activation=tf.nn.relu, name="H")
        self.T_layer = tf.layers.Dense(units=self.units, activation=tf.nn.sigmoid, name="T",
                                       bias_initializer=tf.constant_initializer(-1.))

    def __call__(self, inputs):
        with tf.variable_scope(self.scope):
            H = self.H_layer(inputs)
            T = self.T_layer(inputs)
            return H * T + inputs * (1. - T)
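
    # Highway output: y = H(x) * T(x) + x * (1 - T(x)), where T is a sigmoid
    # "transform gate". Its bias is initialized to -1 so that early in training
    # T(x) is small and the layer mostly passes x through unchanged.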


class CBHG:
    def __init__(self, K, conv_channels, pool_size, projections, projection_kernel_size,
                 n_highwaynet_layers, highway_units, rnn_units, is_training, name=None):
        self.K = K
        self.conv_channels = conv_channels
        self.pool_size = pool_size

        self.projections = projections
        self.projection_kernel_size = projection_kernel_size

        self.is_training = is_training
        self.scope = "CBHG" if name is None else name

        self.highway_units = highway_units
        self.highwaynet_layers = [
            HighwayNet(highway_units, name="{}_highwaynet_{}".format(self.scope, i + 1)) for i in
            range(n_highwaynet_layers)]
        self._fw_cell = tf.nn.rnn_cell.GRUCell(rnn_units, name="{}_forward_RNN".format(self.scope))
        self._bw_cell = tf.nn.rnn_cell.GRUCell(rnn_units, name="{}_backward_RNN".format(self.scope))

    def __call__(self, inputs, input_lengths):
        with tf.variable_scope(self.scope):
            with tf.variable_scope("conv_bank"):
                # Convolution bank: concatenate on the last axis to stack channels from all
                # convolutions.
                # The convolution bank uses multiple different kernel sizes to get many views
                # of the input sequence.
                # This is one of the strengths of the CBHG block on sequences.
                conv_outputs = tf.concat(
                    [conv1d(inputs, k, self.conv_channels, tf.nn.relu, self.is_training, 0.,
                            "conv1d_{}".format(k)) for k in range(1, self.K + 1)],
                    axis=-1
                )

            # Maxpooling (dimension reduction; using max instead of average helps find "edges"
            # in mels)
            maxpool_output = tf.layers.max_pooling1d(
                conv_outputs,
                pool_size=self.pool_size,
                strides=1,
                padding="same")

            # Two projection layers
            proj1_output = conv1d(maxpool_output, self.projection_kernel_size, self.projections[0],
                                  tf.nn.relu, self.is_training, 0., "proj1")
            proj2_output = conv1d(proj1_output, self.projection_kernel_size, self.projections[1],
                                  lambda _: _, self.is_training, 0., "proj2")

            # Residual connection
            highway_input = proj2_output + inputs

            # Additional projection in case of dimension mismatch (for the HighwayNet "residual"
            # connection)
            if highway_input.shape[2] != self.highway_units:
                highway_input = tf.layers.dense(highway_input, self.highway_units)

            # 4-layer HighwayNet
            for highwaynet in self.highwaynet_layers:
                highway_input = highwaynet(highway_input)
            rnn_input = highway_input

            # Bidirectional RNN
            outputs, states = tf.nn.bidirectional_dynamic_rnn(
                self._fw_cell,
                self._bw_cell,
                rnn_input,
                sequence_length=input_lengths,
                dtype=tf.float32)
            return tf.concat(outputs, axis=2)  # Concat forward and backward outputs
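
    # Shape sketch (assumed example sizes, not values fixed by this class):
    # inputs [N, T, 80] with K=8 and conv_channels=128 gives a conv bank concat
    # of [N, T, 8*128]; after the projections and highway layers, the
    # bidirectional GRU returns [N, T, 2*rnn_units].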


class ZoneoutLSTMCell(tf.nn.rnn_cell.RNNCell):
    """Wrapper for the tf LSTM to create a Zoneout LSTM Cell

    inspired by:
    https://github.com/teganmaharaj/zoneout/blob/master/zoneout_tensorflow.py

    Published by one of the "https://arxiv.org/pdf/1606.01305.pdf" paper writers.

    Many thanks to @Ondal90 for pointing this out. You sir are a hero!
    """

    def __init__(self, num_units, is_training, zoneout_factor_cell=0., zoneout_factor_output=0.,
                 state_is_tuple=True, name=None):
        """Initializer with the possibility to set different zoneout values for cell/hidden states.
        """
        zm = min(zoneout_factor_output, zoneout_factor_cell)
        zs = max(zoneout_factor_output, zoneout_factor_cell)

        if zm < 0. or zs > 1.:
            raise ValueError("One/both provided Zoneout factors are not in [0, 1]")

        self._cell = tf.nn.rnn_cell.LSTMCell(num_units, state_is_tuple=state_is_tuple, name=name)
        self._zoneout_cell = zoneout_factor_cell
        self._zoneout_outputs = zoneout_factor_output
        self.is_training = is_training
        self.state_is_tuple = state_is_tuple

    @property
    def state_size(self):
        return self._cell.state_size

    @property
    def output_size(self):
        return self._cell.output_size

    def __call__(self, inputs, state, scope=None):
        """Runs the vanilla LSTM Cell and applies zoneout.
        """
        # Apply vanilla LSTM
        output, new_state = self._cell(inputs, state, scope)

        if self.state_is_tuple:
            (prev_c, prev_h) = state
            (new_c, new_h) = new_state
        else:
            num_proj = self._cell._num_units if self._cell._num_proj is None else \
                self._cell._num_proj
            prev_c = tf.slice(state, [0, 0], [-1, self._cell._num_units])
            prev_h = tf.slice(state, [0, self._cell._num_units], [-1, num_proj])
            new_c = tf.slice(new_state, [0, 0], [-1, self._cell._num_units])
            new_h = tf.slice(new_state, [0, self._cell._num_units], [-1, num_proj])

        # Apply zoneout
        if self.is_training:
            # nn.dropout takes keep_prob (probability to keep activations), not drop_prob
            # (probability to mask activations)!
            c = (1 - self._zoneout_cell) * tf.nn.dropout(new_c - prev_c,
                                                         (1 - self._zoneout_cell)) + prev_c
            h = (1 - self._zoneout_outputs) * tf.nn.dropout(new_h - prev_h,
                                                            (1 - self._zoneout_outputs)) + prev_h

        else:
            c = (1 - self._zoneout_cell) * new_c + self._zoneout_cell * prev_c
            h = (1 - self._zoneout_outputs) * new_h + self._zoneout_outputs * prev_h

        # (fixed from the legacy tf.concat(1, [c, h]) argument order)
        new_state = tf.nn.rnn_cell.LSTMStateTuple(c, h) if self.state_is_tuple \
            else tf.concat([c, h], axis=1)

        return output, new_state
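
    # Zoneout in brief: during training, each state unit keeps its previous
    # value with probability z (implemented above via dropout on the state
    # delta, which also rescales the kept deltas); at eval time the states
    # are mixed deterministically instead:
    #   c_t = (1 - z_c) * c_t_new + z_c * c_{t-1}
    #   h_t = (1 - z_o) * h_t_new + z_o * h_{t-1}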


class EncoderConvolutions:
    """Encoder convolutional layers used to find local dependencies in input characters.
    """

    def __init__(self, is_training, hparams, activation=tf.nn.relu, scope=None):
        """
        Args:
            is_training: Boolean, determines if the model is training or in inference, to control
                dropout
            hparams: hyperparameters object; provides the kernel size (enc_conv_kernel_size),
                number of kernels (enc_conv_channels) and number of layers (enc_conv_num_layers)
            activation: callable, activation function for each convolutional layer
            scope: EncoderConvolutions scope.
        """
        super(EncoderConvolutions, self).__init__()
        self.is_training = is_training

        self.kernel_size = hparams.enc_conv_kernel_size
        self.channels = hparams.enc_conv_channels
        self.activation = activation
        self.scope = "enc_conv_layers" if scope is None else scope
        self.drop_rate = hparams.tacotron_dropout_rate
        self.enc_conv_num_layers = hparams.enc_conv_num_layers

    def __call__(self, inputs):
        with tf.variable_scope(self.scope):
            x = inputs
            for i in range(self.enc_conv_num_layers):
                x = conv1d(x, self.kernel_size, self.channels, self.activation,
                           self.is_training, self.drop_rate,
                           "conv_layer_{}_".format(i + 1) + self.scope)
        return x


class EncoderRNN:
    """Encoder bidirectional one-layer LSTM
    """

    def __init__(self, is_training, size=256, zoneout=0.1, scope=None):
        """
        Args:
            is_training: Boolean, determines if the model is training or in inference, to control
                zoneout
            size: integer, the number of LSTM units for each direction
            zoneout: the zoneout factor
            scope: EncoderRNN scope.
        """
        super(EncoderRNN, self).__init__()
        self.is_training = is_training

        self.size = size
        self.zoneout = zoneout
        self.scope = "encoder_LSTM" if scope is None else scope

        # Create forward LSTM Cell
        self._fw_cell = ZoneoutLSTMCell(size, is_training,
                                        zoneout_factor_cell=zoneout,
                                        zoneout_factor_output=zoneout,
                                        name="encoder_fw_LSTM")

        # Create backward LSTM Cell
        self._bw_cell = ZoneoutLSTMCell(size, is_training,
                                        zoneout_factor_cell=zoneout,
                                        zoneout_factor_output=zoneout,
                                        name="encoder_bw_LSTM")

    def __call__(self, inputs, input_lengths):
        with tf.variable_scope(self.scope):
            outputs, (fw_state, bw_state) = tf.nn.bidirectional_dynamic_rnn(
                self._fw_cell,
                self._bw_cell,
                inputs,
                sequence_length=input_lengths,
                dtype=tf.float32,
                swap_memory=True)

            return tf.concat(outputs, axis=2)  # Concat and return forward + backward outputs


class Prenet:
    """Two fully connected layers used as an information bottleneck for the attention.
    """

    def __init__(self, is_training, layers_sizes=[256, 256], drop_rate=0.5, activation=tf.nn.relu,
                 scope=None):
        """
        Args:
            is_training: Boolean, kept for API consistency; prenet dropout stays active in both
                training and inference (see the note in __call__)
            layers_sizes: list of integers, the length of the list represents the number of
                pre-net layers and the list values represent each layer's number of units
            drop_rate: float, the dropout rate applied in every prenet layer
            activation: callable, activation function of the prenet layers.
            scope: Prenet scope.
        """
        super(Prenet, self).__init__()
        self.drop_rate = drop_rate

        self.layers_sizes = layers_sizes
        self.activation = activation
        self.is_training = is_training

        self.scope = "prenet" if scope is None else scope

    def __call__(self, inputs):
        x = inputs

        with tf.variable_scope(self.scope):
            for i, size in enumerate(self.layers_sizes):
                dense = tf.layers.dense(x, units=size, activation=self.activation,
                                        name="dense_{}".format(i + 1))
                # The paper discussed introducing diversity in generation at inference time
                # by using a dropout of 0.5 only in the prenet layers (in both training and
                # inference), hence training=True below.
                x = tf.layers.dropout(dense, rate=self.drop_rate, training=True,
                                      name="dropout_{}".format(i + 1) + self.scope)
        return x


class DecoderRNN:
    """Decoder: two unidirectional LSTM Cells
    """

    def __init__(self, is_training, layers=2, size=1024, zoneout=0.1, scope=None):
        """
        Args:
            is_training: Boolean, determines if the model is in training or inference, to control
                zoneout
            layers: integer, the number of LSTM layers in the decoder
            size: integer, the number of LSTM units in each layer
            zoneout: the zoneout factor
            scope: DecoderRNN scope.
        """
        super(DecoderRNN, self).__init__()
        self.is_training = is_training

        self.layers = layers
        self.size = size
        self.zoneout = zoneout
        self.scope = "decoder_rnn" if scope is None else scope

        # Create a set of LSTM layers
        self.rnn_layers = [ZoneoutLSTMCell(size, is_training,
                                           zoneout_factor_cell=zoneout,
                                           zoneout_factor_output=zoneout,
                                           name="decoder_LSTM_{}".format(i + 1)) for i in
                           range(layers)]

        self._cell = tf.contrib.rnn.MultiRNNCell(self.rnn_layers, state_is_tuple=True)

    def __call__(self, inputs, states):
        with tf.variable_scope(self.scope):
            return self._cell(inputs, states)


class FrameProjection:
    """Projection layer to r * num_mels dimensions or num_mels dimensions
    """

    def __init__(self, shape=80, activation=None, scope=None):
        """
        Args:
            shape: integer, dimensionality of the output space (r*n_mels for the decoder or
                n_mels for the postnet)
            activation: callable, activation function
            scope: FrameProjection scope.
        """
        super(FrameProjection, self).__init__()

        self.shape = shape
        self.activation = activation

        self.scope = "Linear_projection" if scope is None else scope
        self.dense = tf.layers.Dense(units=shape, activation=activation,
                                     name="projection_{}".format(self.scope))

    def __call__(self, inputs):
        with tf.variable_scope(self.scope):
            # If activation==None, this returns a simple linear projection;
            # else the projection is passed through an activation function
            # output = tf.layers.dense(inputs, units=self.shape, activation=self.activation,
            #                          name="projection_{}".format(self.scope))
            output = self.dense(inputs)

            return output


class StopProjection:
    """Projection to a scalar passed through a sigmoid activation
    """

    def __init__(self, is_training, shape=1, activation=tf.nn.sigmoid, scope=None):
        """
        Args:
            is_training: Boolean, used to skip the sigmoid at training time, since the sigmoid
                is already integrated into the sigmoid_cross_entropy loss
            shape: integer, dimensionality of the output space. Defaults to 1 (scalar)
            activation: callable, activation function; only used during inference
            scope: StopProjection scope.
        """
        super(StopProjection, self).__init__()
        self.is_training = is_training

        self.shape = shape
        self.activation = activation
        self.scope = "stop_token_projection" if scope is None else scope

    def __call__(self, inputs):
        with tf.variable_scope(self.scope):
            output = tf.layers.dense(inputs, units=self.shape,
                                     activation=None, name="projection_{}".format(self.scope))

            # During training, don't use the activation, as it is integrated into the
            # sigmoid_cross_entropy loss function
            if self.is_training:
                return output
            return self.activation(output)


class Postnet:
    """Postnet that takes the final decoder output and fine-tunes it (using vision on past and
    future frames)
    """

    def __init__(self, is_training, hparams, activation=tf.nn.tanh, scope=None):
        """
        Args:
            is_training: Boolean, determines if the model is training or in inference, to control
                dropout
            hparams: hyperparameters object; provides the kernel size (postnet_kernel_size),
                number of kernels (postnet_channels) and number of layers (postnet_num_layers)
            activation: callable, postnet activation function for each convolutional layer
            scope: Postnet scope.
        """
        super(Postnet, self).__init__()
        self.is_training = is_training

        self.kernel_size = hparams.postnet_kernel_size
        self.channels = hparams.postnet_channels
        self.activation = activation
        self.scope = "postnet_convolutions" if scope is None else scope
        self.postnet_num_layers = hparams.postnet_num_layers
        self.drop_rate = hparams.tacotron_dropout_rate

    def __call__(self, inputs):
        with tf.variable_scope(self.scope):
            x = inputs
            for i in range(self.postnet_num_layers - 1):
                x = conv1d(x, self.kernel_size, self.channels, self.activation,
                           self.is_training, self.drop_rate,
                           "conv_layer_{}_".format(i + 1) + self.scope)
            # Last layer without a non-linearity; note the hardcoded layer index 5,
            # which assumes the default postnet_num_layers of 5
            x = conv1d(x, self.kernel_size, self.channels, lambda _: _, self.is_training,
                       self.drop_rate,
                       "conv_layer_{}_".format(5) + self.scope)
        return x


def conv1d(inputs, kernel_size, channels, activation, is_training, drop_rate, scope):
    with tf.variable_scope(scope):
        conv1d_output = tf.layers.conv1d(
            inputs,
            filters=channels,
            kernel_size=kernel_size,
            activation=None,
            padding="same")
        batched = tf.layers.batch_normalization(conv1d_output, training=is_training)
        activated = activation(batched)
        return tf.layers.dropout(activated, rate=drop_rate, training=is_training,
                                 name="dropout_{}".format(scope))


def _round_up_tf(x, multiple):
    # Tf version of: remainder = x % multiple
    remainder = tf.mod(x, multiple)
    # Tf version of: return x if remainder == 0 else x + multiple - remainder
    x_round = tf.cond(tf.equal(remainder, tf.zeros(tf.shape(remainder), dtype=tf.int32)),
                      lambda: x,
                      lambda: x + multiple - remainder)

    return x_round


def sequence_mask(lengths, r, expand=True):
    """Returns a 2-D or 3-D tensorflow sequence mask depending on the argument "expand"
    """
    max_len = tf.reduce_max(lengths)
    max_len = _round_up_tf(max_len, tf.convert_to_tensor(r))
    if expand:
        return tf.expand_dims(tf.sequence_mask(lengths, maxlen=max_len, dtype=tf.float32), axis=-1)
    return tf.sequence_mask(lengths, maxlen=max_len, dtype=tf.float32)
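
# Example (assumed values): lengths=[1, 3, 2] with r=2 rounds max_len up from
# 3 to 4, so sequence_mask(lengths, 2, expand=False) evaluates to
#   [[1., 0., 0., 0.],
#    [1., 1., 1., 0.],
#    [1., 1., 0., 0.]]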


def MaskedMSE(targets, outputs, targets_lengths, hparams, mask=None):
    """Computes a masked Mean Squared Error
    """

    # [batch_size, time_dimension, 1]
    # example:
    # sequence_mask([1, 3, 2], 5) = [[[1., 0., 0., 0., 0.]],
    #                                [[1., 1., 1., 0., 0.]],
    #                                [[1., 1., 0., 0., 0.]]]
    # Note the maxlen argument that ensures the mask shape is compatible with r > 1.
    # This will by default mask the extra paddings caused by r > 1
    if mask is None:
        mask = sequence_mask(targets_lengths, hparams.outputs_per_step, True)

    # [batch_size, time_dimension, channel_dimension(mels)]
    ones = tf.ones(shape=[tf.shape(mask)[0], tf.shape(mask)[1], tf.shape(targets)[-1]],
                   dtype=tf.float32)
    mask_ = mask * ones

    with tf.control_dependencies([tf.assert_equal(tf.shape(targets), tf.shape(mask_))]):
        return tf.losses.mean_squared_error(labels=targets, predictions=outputs, weights=mask_)


def MaskedSigmoidCrossEntropy(targets, outputs, targets_lengths, hparams, mask=None):
    """Computes a masked sigmoid cross-entropy with logits
    """

    # [batch_size, time_dimension]
    # example:
    # sequence_mask([1, 3, 2], 5) = [[1., 0., 0., 0., 0.],
    #                                [1., 1., 1., 0., 0.],
    #                                [1., 1., 0., 0., 0.]]
    # Note the maxlen argument that ensures the mask shape is compatible with r > 1.
    # This will by default mask the extra paddings caused by r > 1
    if mask is None:
        mask = sequence_mask(targets_lengths, hparams.outputs_per_step, False)

    with tf.control_dependencies([tf.assert_equal(tf.shape(targets), tf.shape(mask))]):
        # Use a weighted sigmoid cross-entropy to measure the <stop_token> loss. Setting
        # hparams.cross_entropy_pos_weight to 1 has the same effect as vanilla
        # tf.nn.sigmoid_cross_entropy_with_logits.
        losses = tf.nn.weighted_cross_entropy_with_logits(targets=targets, logits=outputs,
                                                          pos_weight=hparams.cross_entropy_pos_weight)

    with tf.control_dependencies([tf.assert_equal(tf.shape(mask), tf.shape(losses))]):
        masked_loss = losses * mask

    return tf.reduce_sum(masked_loss) / tf.count_nonzero(masked_loss, dtype=tf.float32)
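
# Note on pos_weight: values > 1 up-weight the positive (<stop_token> = 1)
# frames, which are rare compared to non-final frames; e.g. with
# hparams.cross_entropy_pos_weight = 20 (an assumed example value) a missed
# stop frame costs 20x more than a false alarm of the same magnitude.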


def MaskedLinearLoss(targets, outputs, targets_lengths, hparams, mask=None):
    """Computes a masked MAE loss with priority to low frequencies
    """

    # [batch_size, time_dimension, 1]
    # example:
    # sequence_mask([1, 3, 2], 5) = [[[1., 0., 0., 0., 0.]],
    #                                [[1., 1., 1., 0., 0.]],
    #                                [[1., 1., 0., 0., 0.]]]
    # Note the maxlen argument that ensures the mask shape is compatible with r > 1.
    # This will by default mask the extra paddings caused by r > 1
    if mask is None:
        mask = sequence_mask(targets_lengths, hparams.outputs_per_step, True)

    # [batch_size, time_dimension, channel_dimension(freq)]
    ones = tf.ones(shape=[tf.shape(mask)[0], tf.shape(mask)[1], tf.shape(targets)[-1]],
                   dtype=tf.float32)
    mask_ = mask * ones

    l1 = tf.abs(targets - outputs)
    n_priority_freq = int(2000 / (hparams.sample_rate * 0.5) * hparams.num_freq)

    with tf.control_dependencies([tf.assert_equal(tf.shape(targets), tf.shape(mask_))]):
        masked_l1 = l1 * mask_
        masked_l1_low = masked_l1[:, :, 0:n_priority_freq]

    mean_l1 = tf.reduce_sum(masked_l1) / tf.reduce_sum(mask_)
    mean_l1_low = tf.reduce_sum(masked_l1_low) / tf.reduce_sum(mask_)

    return 0.5 * mean_l1 + 0.5 * mean_l1_low
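
# Worked example of the priority band (assumed hparams, not fixed here): with
# sample_rate=16000 and num_freq=1025, n_priority_freq = int(2000/8000 * 1025)
# = 256, so the first 256 frequency bins (roughly 0-2 kHz, where most speech
# energy lives) are counted twice in the final loss.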
521
synthesizer_tacotron2/models/tacotron.py
Normal file
@ -0,0 +1,521 @@
import tensorflow as tf
from synthesizer.utils.symbols import symbols
from synthesizer.infolog import log
from synthesizer.models.helpers import TacoTrainingHelper, TacoTestHelper
from synthesizer.models.modules import *
from tensorflow.contrib.seq2seq import dynamic_decode
from synthesizer.models.architecture_wrappers import TacotronEncoderCell, TacotronDecoderCell
from synthesizer.models.custom_decoder import CustomDecoder
from synthesizer.models.attention import LocationSensitiveAttention

import numpy as np


def split_func(x, split_pos):
    rst = []
    start = 0
    # x will be a numpy array with the contents of the placeholder below
    for i in range(split_pos.shape[0]):
        rst.append(x[:, start:start + split_pos[i]])
        start += split_pos[i]
    return rst
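
# Example (assumed values): x of shape [N, 7] with split_pos=[3, 4] returns
# [x[:, 0:3], x[:, 3:7]] -- one slice per GPU, since per-GPU batches are
# packed along the second axis before being fed through tf.py_func below.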
|
||||
|
||||
|
||||
class Tacotron():
|
||||
"""Tacotron-2 Feature prediction Model.
|
||||
"""
|
||||
|
||||
def __init__(self, hparams):
|
||||
self._hparams = hparams
|
||||
|
||||
def initialize(self, inputs, input_lengths, embed_targets, mel_targets=None,
|
||||
stop_token_targets=None, linear_targets=None, targets_lengths=None, gta=False,
|
||||
global_step=None, is_training=False, is_evaluating=False, split_infos=None):
|
||||
"""
|
||||
Initializes the model for inference sets "mel_outputs" and "alignments" fields.
|
||||
Args:
|
||||
- inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
|
||||
steps in the input time series, and values are character IDs
|
||||
- input_lengths: int32 Tensor with shape [N] where N is batch size and values are the
|
||||
lengths of each sequence in inputs.
|
||||
- embed_targets: float32 Tensor with shape [N, E] where E is the speaker
|
||||
embedding size.
|
||||
- mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size,
|
||||
T_out is number of steps in the output time series, M is num_mels, and values are
|
||||
entries in the mel spectrogram. Only needed for training.
|
||||
"""
|
||||
if mel_targets is None and stop_token_targets is not None:
|
||||
raise ValueError("no multi targets were provided but token_targets were given")
|
||||
if mel_targets is not None and stop_token_targets is None and not gta:
|
||||
raise ValueError("Mel targets are provided without corresponding token_targets")
|
||||
if not gta and self._hparams.predict_linear == True and linear_targets is None and \
|
||||
is_training:
|
||||
raise ValueError(
|
||||
"Model is set to use post processing to predict linear spectrograms in training "
|
||||
"but no linear targets given!")
|
||||
if gta and linear_targets is not None:
|
||||
raise ValueError("Linear spectrogram prediction is not supported in GTA mode!")
|
||||
if is_training and self._hparams.mask_decoder and targets_lengths is None:
|
||||
raise RuntimeError(
|
||||
"Model set to mask paddings but no targets lengths provided for the mask!")
|
||||
if is_training and is_evaluating:
|
||||
raise RuntimeError(
|
||||
"Model can not be in training and evaluation modes at the same time!")
|
||||
|
||||
split_device = "/cpu:0" if self._hparams.tacotron_num_gpus > 1 or \
|
||||
self._hparams.split_on_cpu else "/gpu:{}".format(
|
||||
self._hparams.tacotron_gpu_start_idx)
|
||||
with tf.device(split_device):
|
||||
hp = self._hparams
|
||||
lout_int = [tf.int32] * hp.tacotron_num_gpus
|
||||
lout_float = [tf.float32] * hp.tacotron_num_gpus
|
||||
|
||||
tower_input_lengths = tf.split(input_lengths, num_or_size_splits=hp.tacotron_num_gpus,
|
||||
axis=0)
|
||||
tower_targets_lengths = \
|
||||
tf.split(targets_lengths, num_or_size_splits=hp.tacotron_num_gpus, axis=0) if \
|
||||
targets_lengths is not None else targets_lengths
|
||||
|
||||
### SV2TTS ###
|
||||
|
||||
tower_embed_targets = tf.split(embed_targets, num_or_size_splits=hp.tacotron_num_gpus,
|
||||
axis=0)
|
||||
|
||||
##############
|
||||
|
||||
p_inputs = tf.py_func(split_func, [inputs, split_infos[:, 0]], lout_int)
|
||||
p_mel_targets = tf.py_func(split_func, [mel_targets, split_infos[:, 1]],
|
||||
lout_float) if mel_targets is not None else mel_targets
|
||||
p_stop_token_targets = tf.py_func(split_func, [stop_token_targets, split_infos[:, 2]],
|
||||
lout_float) if stop_token_targets is not None else \
|
||||
stop_token_targets
|
||||
|
||||
tower_inputs = []
|
||||
tower_mel_targets = []
|
||||
tower_stop_token_targets = []
|
||||
|
||||
batch_size = tf.shape(inputs)[0]
|
||||
mel_channels = hp.num_mels
|
||||
for i in range(hp.tacotron_num_gpus):
|
||||
tower_inputs.append(tf.reshape(p_inputs[i], [batch_size, -1]))
|
||||
if p_mel_targets is not None:
|
||||
tower_mel_targets.append(
|
||||
tf.reshape(p_mel_targets[i], [batch_size, -1, mel_channels]))
|
||||
if p_stop_token_targets is not None:
|
||||
tower_stop_token_targets.append(
|
||||
tf.reshape(p_stop_token_targets[i], [batch_size, -1]))
|
||||
|
||||
self.tower_decoder_output = []
|
||||
self.tower_alignments = []
|
||||
self.tower_stop_token_prediction = []
|
||||
self.tower_mel_outputs = []
|
||||
|
||||
tower_embedded_inputs = []
|
||||
tower_enc_conv_output_shape = []
|
||||
tower_encoder_cond_outputs = []
|
||||
tower_residual = []
|
||||
tower_projected_residual = []
|
||||
|
||||
# 1. Declare GPU Devices
|
||||
gpus = ["/gpu:{}".format(i) for i in
|
||||
range(hp.tacotron_gpu_start_idx, hp.tacotron_gpu_start_idx + hp.tacotron_num_gpus)]
|
||||
for i in range(hp.tacotron_num_gpus):
|
||||
with tf.device(tf.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0",
|
||||
worker_device=gpus[i])):
|
||||
with tf.variable_scope("inference") as scope:
|
||||
assert hp.tacotron_teacher_forcing_mode in ("constant", "scheduled")
|
||||
if hp.tacotron_teacher_forcing_mode == "scheduled" and is_training:
|
||||
assert global_step is not None
|
||||
|
||||
# GTA is only used for predicting mels to train Wavenet vocoder, so we ommit
|
||||
# post processing when doing GTA synthesis
|
||||
post_condition = hp.predict_linear and not gta
|
||||
|
||||
# Embeddings ==> [batch_size, sequence_length, embedding_dim]
|
||||
self.embedding_table = tf.get_variable(
|
||||
"inputs_embedding", [len(symbols), hp.embedding_dim], dtype=tf.float32)
|
||||
embedded_inputs = tf.nn.embedding_lookup(self.embedding_table, tower_inputs[i])
|
||||
|
||||
# Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
|
||||
encoder_cell = TacotronEncoderCell(
|
||||
EncoderConvolutions(is_training, hparams=hp, scope="encoder_convolutions"),
|
||||
EncoderRNN(is_training, size=hp.encoder_lstm_units,
|
||||
zoneout=hp.tacotron_zoneout_rate, scope="encoder_LSTM"))
|
||||
|
||||
encoder_outputs = encoder_cell(embedded_inputs, tower_input_lengths[i])
|
||||
|
||||
# For shape visualization purpose
|
||||
enc_conv_output_shape = encoder_cell.conv_output_shape
|
||||
|
||||
|
||||
### SV2TT2 ###
|
||||
|
||||
# Append the speaker embedding to the encoder output at each timestep
|
||||
tileable_shape = [-1, 1, self._hparams.speaker_embedding_size]
|
||||
tileable_embed_targets = tf.reshape(tower_embed_targets[i], tileable_shape)
|
||||
tiled_embed_targets = tf.tile(tileable_embed_targets,
|
||||
[1, tf.shape(encoder_outputs)[1], 1])
|
||||
encoder_cond_outputs = tf.concat((encoder_outputs, tiled_embed_targets), 2)
|
||||
|
||||
##############
|
||||
|
||||
|
||||
# Decoder Parts
|
||||
# Attention Decoder Prenet
|
||||
prenet = Prenet(is_training, layers_sizes=hp.prenet_layers,
|
||||
drop_rate=hp.tacotron_dropout_rate, scope="decoder_prenet")
|
||||
# Attention Mechanism
|
||||
attention_mechanism = LocationSensitiveAttention(hp.attention_dim,
|
||||
encoder_cond_outputs,
|
||||
hparams=hp,
|
||||
mask_encoder=hp.mask_encoder,
|
||||
memory_sequence_length=tf.reshape(
|
||||
tower_input_lengths[i],
|
||||
[-1]),
|
||||
smoothing=hp.smoothing,
|
||||
cumulate_weights=hp.cumulative_weights)
|
||||
# Decoder LSTM Cells
|
||||
decoder_lstm = DecoderRNN(is_training, layers=hp.decoder_layers,
|
||||
size=hp.decoder_lstm_units,
|
||||
zoneout=hp.tacotron_zoneout_rate,
|
||||
scope="decoder_LSTM")
|
||||
# Frames Projection layer
|
||||
frame_projection = FrameProjection(hp.num_mels * hp.outputs_per_step,
|
||||
scope="linear_transform_projection")
|
||||
# <stop_token> projection layer
|
||||
stop_projection = StopProjection(is_training or is_evaluating, shape=hp
|
||||
.outputs_per_step,
|
||||
scope="stop_token_projection")
|
||||
|
||||
# Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
|
||||
decoder_cell = TacotronDecoderCell(
|
||||
prenet,
|
||||
attention_mechanism,
|
||||
decoder_lstm,
|
||||
frame_projection,
|
||||
stop_projection)
|
||||
|
||||
# Define the helper for our decoder
|
||||
if is_training or is_evaluating or gta:
|
||||
self.helper = TacoTrainingHelper(batch_size, tower_mel_targets[i], hp, gta,
|
||||
is_evaluating, global_step)
|
||||
else:
|
||||
self.helper = TacoTestHelper(batch_size, hp)
|
||||
|
||||
# initial decoder state
|
||||
decoder_init_state = decoder_cell.zero_state(batch_size=batch_size,
|
||||
dtype=tf.float32)
|
||||
|
||||
# Only use max iterations at synthesis time
|
||||
max_iters = hp.max_iters if not (is_training or is_evaluating) else None
|
||||
|
||||
# Decode
|
||||
(frames_prediction, stop_token_prediction,
|
||||
_), final_decoder_state, _ = dynamic_decode(
|
||||
CustomDecoder(decoder_cell, self.helper, decoder_init_state),
|
||||
impute_finished=False,
|
||||
maximum_iterations=max_iters,
|
||||
swap_memory=hp.tacotron_swap_with_cpu)
|
||||
|
||||
# Reshape outputs to be one output per entry
|
||||
# ==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
|
||||
decoder_output = tf.reshape(frames_prediction, [batch_size, -1, hp.num_mels])
|
||||
stop_token_prediction = tf.reshape(stop_token_prediction, [batch_size, -1])
|
||||
|
||||
# Postnet
|
||||
postnet = Postnet(is_training, hparams=hp, scope="postnet_convolutions")
|
||||
|
||||
# Compute residual using post-net ==> [batch_size, decoder_steps * r,
|
||||
# postnet_channels]
|
||||
residual = postnet(decoder_output)
|
||||
|
||||
# Project residual to same dimension as mel spectrogram
|
||||
                # ==> [batch_size, decoder_steps * r, num_mels]
                residual_projection = FrameProjection(hp.num_mels, scope="postnet_projection")
                projected_residual = residual_projection(residual)

                # Compute the mel spectrogram
                mel_outputs = decoder_output + projected_residual

                if post_condition:
                    # Add post-processing CBHG. This does a great job at extracting features
                    # from mels before projection to Linear specs.
                    post_cbhg = CBHG(hp.cbhg_kernels, hp.cbhg_conv_channels, hp.cbhg_pool_size,
                                     [hp.cbhg_projection, hp.num_mels],
                                     hp.cbhg_projection_kernel_size, hp.cbhg_highwaynet_layers,
                                     hp.cbhg_highway_units, hp.cbhg_rnn_units, is_training,
                                     name="CBHG_postnet")

                    # [batch_size, decoder_steps(mel_frames), cbhg_channels]
                    post_outputs = post_cbhg(mel_outputs, None)

                    # Linear projection of extracted features to make linear spectrogram
                    linear_specs_projection = FrameProjection(hp.num_freq,
                                                              scope="cbhg_linear_specs_projection")

                    # [batch_size, decoder_steps(linear_frames), num_freq]
                    linear_outputs = linear_specs_projection(post_outputs)

                # Grab alignments from the final decoder state
                alignments = tf.transpose(final_decoder_state.alignment_history.stack(),
                                          [1, 2, 0])

                self.tower_decoder_output.append(decoder_output)
                self.tower_alignments.append(alignments)
                self.tower_stop_token_prediction.append(stop_token_prediction)
                self.tower_mel_outputs.append(mel_outputs)
                tower_embedded_inputs.append(embedded_inputs)
                tower_enc_conv_output_shape.append(enc_conv_output_shape)
                tower_encoder_cond_outputs.append(encoder_cond_outputs)
                tower_residual.append(residual)
                tower_projected_residual.append(projected_residual)

                if post_condition:
                    self.tower_linear_outputs.append(linear_outputs)
            log("initialisation done {}".format(gpus[i]))

        if is_training:
            self.ratio = self.helper._ratio
        self.tower_inputs = tower_inputs
        self.tower_input_lengths = tower_input_lengths
        self.tower_mel_targets = tower_mel_targets
        # self.tower_linear_targets = tower_linear_targets
        self.tower_targets_lengths = tower_targets_lengths
        self.tower_stop_token_targets = tower_stop_token_targets

        self.all_vars = tf.trainable_variables()

        log("Initialized Tacotron model. Dimensions (? = dynamic shape): ")
        log("  Train mode:               {}".format(is_training))
        log("  Eval mode:                {}".format(is_evaluating))
        log("  GTA mode:                 {}".format(gta))
        log("  Synthesis mode:           {}".format(not (is_training or is_evaluating)))
        log("  Input:                    {}".format(inputs.shape))
        for i in range(hp.tacotron_num_gpus + hp.tacotron_gpu_start_idx):
            log("  device:                   {}".format(i))
            log("  embedding:                {}".format(tower_embedded_inputs[i].shape))
            log("  enc conv out:             {}".format(tower_enc_conv_output_shape[i]))
            log("  encoder out (cond):       {}".format(tower_encoder_cond_outputs[i].shape))
            log("  decoder out:              {}".format(self.tower_decoder_output[i].shape))
            log("  residual out:             {}".format(tower_residual[i].shape))
            log("  projected residual out:   {}".format(tower_projected_residual[i].shape))
            log("  mel out:                  {}".format(self.tower_mel_outputs[i].shape))
            if post_condition:
                log("  linear out:               {}".format(self.tower_linear_outputs[i].shape))
            log("  <stop_token> out:         {}".format(self.tower_stop_token_prediction[i].shape))

        # 1_000_000 is causing syntax problems for some people?! Python please :)
        log("  Tacotron Parameters       {:.3f} Million.".format(
            np.sum([np.prod(v.get_shape().as_list()) for v in self.all_vars]) / 1000000))

    def add_loss(self):
        """Adds loss to the model. Sets "loss" field. initialize must have been called."""
        hp = self._hparams

        self.tower_before_loss = []
        self.tower_after_loss = []
        self.tower_stop_token_loss = []
        self.tower_regularization_loss = []
        self.tower_linear_loss = []
        self.tower_loss = []

        total_before_loss = 0
        total_after_loss = 0
        total_stop_token_loss = 0
        total_regularization_loss = 0
        total_linear_loss = 0
        total_loss = 0

        gpus = ["/gpu:{}".format(i) for i in
                range(hp.tacotron_gpu_start_idx, hp.tacotron_gpu_start_idx + hp.tacotron_num_gpus)]

        for i in range(hp.tacotron_num_gpus):
            with tf.device(tf.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0",
                                                          worker_device=gpus[i])):
                with tf.variable_scope("loss") as scope:
                    if hp.mask_decoder:
                        # Compute loss of predictions before postnet
                        before = MaskedMSE(self.tower_mel_targets[i], self.tower_decoder_output[i],
                                           self.tower_targets_lengths[i],
                                           hparams=self._hparams)
                        # Compute loss after postnet
                        after = MaskedMSE(self.tower_mel_targets[i], self.tower_mel_outputs[i],
                                          self.tower_targets_lengths[i],
                                          hparams=self._hparams)
                        # Compute <stop_token> loss (for learning dynamic generation stop)
                        stop_token_loss = MaskedSigmoidCrossEntropy(
                            self.tower_stop_token_targets[i],
                            self.tower_stop_token_prediction[i], self.tower_targets_lengths[i],
                            hparams=self._hparams)
                        # SV2TTS extra L1 loss (disabled for now)
                        # linear_loss = MaskedLinearLoss(self.tower_mel_targets[i],
                        #                                self.tower_decoder_output[i],
                        #                                self.tower_targets_lengths[i],
                        #                                hparams=self._hparams)
                        linear_loss = 0.
                    else:
                        # Compute loss of predictions before postnet
                        before = tf.losses.mean_squared_error(self.tower_mel_targets[i],
                                                              self.tower_decoder_output[i])
                        # Compute loss after postnet
                        after = tf.losses.mean_squared_error(self.tower_mel_targets[i],
                                                             self.tower_mel_outputs[i])
                        # Compute <stop_token> loss (for learning dynamic generation stop)
                        stop_token_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
                            labels=self.tower_stop_token_targets[i],
                            logits=self.tower_stop_token_prediction[i]))

                        # SV2TTS extra L1 loss
                        l1 = tf.abs(self.tower_mel_targets[i] - self.tower_decoder_output[i])
                        linear_loss = tf.reduce_mean(l1)

                        # if hp.predict_linear:
                        #     # Compute linear loss
                        #     # From https://github.com/keithito/tacotron/blob/tacotron2-work-in
                        #     # -progress/models/tacotron.py
                        #     # Prioritize loss for frequencies under 2000 Hz.
                        #     l1 = tf.abs(self.tower_linear_targets[i] - self.tower_linear_outputs[i])
                        #     n_priority_freq = int(2000 / (hp.sample_rate * 0.5) * hp.num_freq)
                        #     linear_loss = 0.5 * tf.reduce_mean(l1) + 0.5 * tf.reduce_mean(
                        #         l1[:, :, 0:n_priority_freq])
                        # else:
                        #     linear_loss = 0.

                    # Compute the regularization weight
                    if hp.tacotron_scale_regularization:
                        reg_weight_scaler = 1. / (
                                2 * hp.max_abs_value) if hp.symmetric_mels else 1. / (
                            hp.max_abs_value)
                        reg_weight = hp.tacotron_reg_weight * reg_weight_scaler
                    else:
                        reg_weight = hp.tacotron_reg_weight

                    # Regularize variables
                    # Exclude all types of bias, RNN (Bengio et al. "On the difficulty of training
                    # recurrent neural networks"), embeddings and prediction projection layers.
                    # Note that we consider the attention mechanism v_a weights as a prediction
                    # projection layer and we don't regularize it. (This gave better stability)
                    regularization = tf.add_n([tf.nn.l2_loss(v) for v in self.all_vars
                                               if not (
                                "bias" in v.name or "Bias" in v.name or "_projection" in v.name or "inputs_embedding" in v.name
                                or "RNN" in v.name or "LSTM" in v.name)]) * reg_weight

                    # Compute final loss term
                    self.tower_before_loss.append(before)
                    self.tower_after_loss.append(after)
                    self.tower_stop_token_loss.append(stop_token_loss)
                    self.tower_regularization_loss.append(regularization)
                    self.tower_linear_loss.append(linear_loss)

                    loss = before + after + stop_token_loss + regularization + linear_loss
                    self.tower_loss.append(loss)

        for i in range(hp.tacotron_num_gpus):
            total_before_loss += self.tower_before_loss[i]
            total_after_loss += self.tower_after_loss[i]
            total_stop_token_loss += self.tower_stop_token_loss[i]
            total_regularization_loss += self.tower_regularization_loss[i]
            total_linear_loss += self.tower_linear_loss[i]
            total_loss += self.tower_loss[i]

        self.before_loss = total_before_loss / hp.tacotron_num_gpus
        self.after_loss = total_after_loss / hp.tacotron_num_gpus
        self.stop_token_loss = total_stop_token_loss / hp.tacotron_num_gpus
        self.regularization_loss = total_regularization_loss / hp.tacotron_num_gpus
        self.linear_loss = total_linear_loss / hp.tacotron_num_gpus
        self.loss = total_loss / hp.tacotron_num_gpus

    def add_optimizer(self, global_step):
        """Adds optimizer. Sets "gradients" and "optimize" fields. add_loss must have been called.
        Args:
            global_step: int32 scalar Tensor representing current global step in training
        """
        hp = self._hparams
        tower_gradients = []

        # 1. Declare GPU Devices
        gpus = ["/gpu:{}".format(i) for i in
                range(hp.tacotron_gpu_start_idx, hp.tacotron_gpu_start_idx + hp.tacotron_num_gpus)]

        grad_device = "/cpu:0" if hp.tacotron_num_gpus > 1 else gpus[0]

        with tf.device(grad_device):
            with tf.variable_scope("optimizer") as scope:
                if hp.tacotron_decay_learning_rate:
                    self.decay_steps = hp.tacotron_decay_steps
                    self.decay_rate = hp.tacotron_decay_rate
                    self.learning_rate = self._learning_rate_decay(
                        hp.tacotron_initial_learning_rate, global_step)
                else:
                    self.learning_rate = tf.convert_to_tensor(hp.tacotron_initial_learning_rate)

                optimizer = tf.train.AdamOptimizer(self.learning_rate, hp.tacotron_adam_beta1,
                                                   hp.tacotron_adam_beta2, hp.tacotron_adam_epsilon)

        # 2. Compute Gradient
        for i in range(hp.tacotron_num_gpus):
            # Device placement
            with tf.device(tf.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0",
                                                          worker_device=gpus[i])):
                # agg_loss += self.tower_loss[i]
                with tf.variable_scope("optimizer") as scope:
                    gradients = optimizer.compute_gradients(self.tower_loss[i])
                    tower_gradients.append(gradients)

        # 3. Average Gradient
        with tf.device(grad_device):
            avg_grads = []
            vars = []
            for grad_and_vars in zip(*tower_gradients):
                # grads_vars = [(grad1, var), (grad2, var), ...]
                grads = []
                for g, _ in grad_and_vars:
                    expanded_g = tf.expand_dims(g, 0)
                    # Append on a "tower" dimension which we will average over below.
                    grads.append(expanded_g)
                # Average over the "tower" dimension.
                grad = tf.concat(axis=0, values=grads)
                grad = tf.reduce_mean(grad, 0)

                v = grad_and_vars[0][1]
                avg_grads.append(grad)
                vars.append(v)

            self.gradients = avg_grads
            # Just for caution
            # https://github.com/Rayhane-mamah/Tacotron-2/issues/11
            if hp.tacotron_clip_gradients:
                clipped_gradients, _ = tf.clip_by_global_norm(avg_grads, 1.)  # __mark 0.5 refer
            else:
                clipped_gradients = avg_grads

            # Add dependency on UPDATE_OPS; otherwise batchnorm won't work correctly. See:
            # https://github.com/tensorflow/tensorflow/issues/1122
            with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
                self.optimize = optimizer.apply_gradients(zip(clipped_gradients, vars),
                                                          global_step=global_step)

    def _learning_rate_decay(self, init_lr, global_step):
        #################################################################
        # Narrow Exponential Decay:

        # Phase 1: lr = 1e-3
        # We only start learning rate decay after 50k steps

        # Phase 2: lr in ]1e-5, 1e-3[
        # decay reaches its minimal value at step 310k

        # Phase 3: lr = 1e-5
        # clip by minimal learning rate value (step > 310k)
        #################################################################
        hp = self._hparams

        # Compute natural exponential decay
        lr = tf.train.exponential_decay(init_lr,
                                        global_step - hp.tacotron_start_decay,
                                        # lr = 1e-3 at step 50k
                                        self.decay_steps,
                                        self.decay_rate,  # lr = 1e-5 around step 310k
                                        name="lr_exponential_decay")

        # Clip learning rate by max and min values (initial and final values)
        return tf.minimum(tf.maximum(lr, hp.tacotron_final_learning_rate), init_lr)
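
For a quick sanity check of the schedule documented above, here is a hedged NumPy sketch. The 50k/310k step marks come from the comments; the 40k decay_steps and 0.5 decay_rate values are assumptions standing in for hp.tacotron_decay_steps and hp.tacotron_decay_rate:

import numpy as np

def lr_schedule(step, init_lr=1e-3, final_lr=1e-5, start_decay=50_000,
                decay_steps=40_000, decay_rate=0.5):
    # Same curve as tf.train.exponential_decay, then clipped to [final_lr, init_lr]
    lr = init_lr * decay_rate ** ((step - start_decay) / decay_steps)
    return float(np.clip(lr, final_lr, init_lr))

# lr_schedule(0) -> 1e-3 (phase 1); lr_schedule(310_000) -> ~1e-5 (phase 3)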
120
synthesizer_tacotron2/preprocess.py
Normal file
@@ -0,0 +1,120 @@
from multiprocessing.pool import Pool

from functools import partial
from itertools import chain
from pathlib import Path
from tqdm import tqdm
import numpy as np
from encoder import inference as encoder
from synthesizer.preprocess_speaker import preprocess_speaker_general
from synthesizer.preprocess_transcript import preprocess_transcript_aishell3, preprocess_transcript_magicdata

data_info = {
    "aidatatang_200zh": {
        "subfolders": ["corpus/train"],
        "trans_filepath": "transcript/aidatatang_200_zh_transcript.txt",
        "speak_func": preprocess_speaker_general
    },
    "magicdata": {
        "subfolders": ["train"],
        "trans_filepath": "train/TRANS.txt",
        "speak_func": preprocess_speaker_general,
        "transcript_func": preprocess_transcript_magicdata,
    },
    "aishell3": {
        "subfolders": ["train/wav"],
        "trans_filepath": "train/content.txt",
        "speak_func": preprocess_speaker_general,
        "transcript_func": preprocess_transcript_aishell3,
    },
    "data_aishell": {
        "subfolders": ["wav/train"],
        "trans_filepath": "transcript/aishell_transcript_v0.8.txt",
        "speak_func": preprocess_speaker_general
    }
}

def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
                       skip_existing: bool, hparams, no_alignments: bool,
                       dataset: str):
    dataset_info = data_info[dataset]
    # Gather the input directories
    dataset_root = datasets_root.joinpath(dataset)
    input_dirs = [dataset_root.joinpath(subfolder.strip()) for subfolder in dataset_info["subfolders"]]
    print("\n    ".join(map(str, ["Using data from:"] + input_dirs)))
    assert all(input_dir.exists() for input_dir in input_dirs)

    # Create the output directories for each output file type
    out_dir.joinpath("mels").mkdir(exist_ok=True)
    out_dir.joinpath("audio").mkdir(exist_ok=True)

    # Create a metadata file
    metadata_fpath = out_dir.joinpath("train.txt")
    metadata_file = metadata_fpath.open("a" if skip_existing else "w", encoding="utf-8")

    # Preprocess the dataset
    dict_info = {}
    transcript_dirs = dataset_root.joinpath(dataset_info["trans_filepath"])
    assert transcript_dirs.exists(), str(transcript_dirs) + " does not exist."
    with open(transcript_dirs, "r", encoding="utf-8") as dict_transcript:
        # Process with the dataset-specific transcript function if there is one
        if "transcript_func" in dataset_info:
            dataset_info["transcript_func"](dict_info, dict_transcript)
        else:
            for v in dict_transcript:
                if not v:
                    continue
                v = v.strip().replace("\n", "").replace("\t", " ").split(" ")
                dict_info[v[0]] = " ".join(v[1:])

    speaker_dirs = list(chain.from_iterable(input_dir.glob("*") for input_dir in input_dirs))
    func = partial(dataset_info["speak_func"], out_dir=out_dir, skip_existing=skip_existing,
                   hparams=hparams, dict_info=dict_info, no_alignments=no_alignments)
    job = Pool(n_processes).imap(func, speaker_dirs)
    for speaker_metadata in tqdm(job, dataset, len(speaker_dirs), unit="speakers"):
        for metadatum in speaker_metadata:
            metadata_file.write("|".join(str(x) for x in metadatum) + "\n")
    metadata_file.close()

    # Verify the contents of the metadata file
    with metadata_fpath.open("r", encoding="utf-8") as metadata_file:
        metadata = [line.split("|") for line in metadata_file]
        mel_frames = sum([int(m[4]) for m in metadata])
        timesteps = sum([int(m[3]) for m in metadata])
        sample_rate = hparams.sample_rate
        hours = (timesteps / sample_rate) / 3600
        print("The dataset consists of %d utterances, %d mel frames, %d audio timesteps (%.2f hours)." %
              (len(metadata), mel_frames, timesteps, hours))
        print("Max input length (text chars): %d" % max(len(m[5]) for m in metadata))
        print("Max mel frames length: %d" % max(int(m[4]) for m in metadata))
        print("Max audio timesteps length: %d" % max(int(m[3]) for m in metadata))


def embed_utterance(fpaths, encoder_model_fpath):
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    # Compute the speaker embedding of the utterance
    wav_fpath, embed_fpath = fpaths
    wav = np.load(wav_fpath)
    wav = encoder.preprocess_wav(wav)
    embed = encoder.embed_utterance(wav)
    np.save(embed_fpath, embed, allow_pickle=False)


def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_processes: int):
    wav_dir = synthesizer_root.joinpath("audio")
    metadata_fpath = synthesizer_root.joinpath("train.txt")
    assert wav_dir.exists() and metadata_fpath.exists()
    embed_dir = synthesizer_root.joinpath("embeds")
    embed_dir.mkdir(exist_ok=True)

    # Gather the input wave filepath and the target output embed filepath
    with metadata_fpath.open("r", encoding="utf-8") as metadata_file:
        metadata = [line.split("|") for line in metadata_file]
        fpaths = [(wav_dir.joinpath(m[0]), embed_dir.joinpath(m[2])) for m in metadata]

    # TODO: improve on the multiprocessing, it's terrible. Disk I/O is the bottleneck here.
    # Embed the utterances in separate threads
    func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)
    job = Pool(n_processes).imap(func, fpaths)
    list(tqdm(job, "Embedding", len(fpaths), unit="utterances"))
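
Each train.txt row written above is the tuple returned by _process_utterance in preprocess_speaker.py; a minimal parsing sketch, with field names inferred from that return value:

def parse_metadata_line(line: str):
    # audio fname | mel fname | embed fname | wav length (samples) | mel frames | text
    wav_fname, mel_fname, embed_fname, timesteps, mel_frames, text = line.strip().split("|")
    return wav_fname, mel_fname, embed_fname, int(timesteps), int(mel_frames), text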
99
synthesizer_tacotron2/preprocess_speaker.py
Normal file
@@ -0,0 +1,99 @@
import librosa
import numpy as np

from encoder import inference as encoder
from utils import logmmse
from synthesizer import audio
from pathlib import Path
from pypinyin import Style
from pypinyin.contrib.neutral_tone import NeutralToneWith5Mixin
from pypinyin.converter import DefaultConverter
from pypinyin.core import Pinyin

class PinyinConverter(NeutralToneWith5Mixin, DefaultConverter):
    pass

pinyin = Pinyin(PinyinConverter()).pinyin


def _process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
                       skip_existing: bool, hparams):
    ## FOR REFERENCE:
    # For you not to lose your head if you ever wish to change things here or implement your own
    # synthesizer.
    # - Both the audios and the mel spectrograms are saved as numpy arrays
    # - There is no processing done to the audios that will be saved to disk beyond volume
    #   normalization (in split_on_silences)
    # - However, pre-emphasis is applied to the audios before computing the mel spectrogram. This
    #   is why we re-apply it on the audio on the side of the vocoder.
    # - Librosa pads the waveform before computing the mel spectrogram. Here, the waveform is saved
    #   without extra padding. This means that you won't have an exact relation between the length
    #   of the wav and of the mel spectrogram. See the vocoder data loader.

    # Skip existing utterances if needed
    mel_fpath = out_dir.joinpath("mels", "mel-%s.npy" % basename)
    wav_fpath = out_dir.joinpath("audio", "audio-%s.npy" % basename)
    if skip_existing and mel_fpath.exists() and wav_fpath.exists():
        return None

    # Trim silence
    if hparams.trim_silence:
        wav = encoder.preprocess_wav(wav, normalize=False, trim_silence=True)

    # Skip utterances that are too short
    if len(wav) < hparams.utterance_min_duration * hparams.sample_rate:
        return None

    # Compute the mel spectrogram
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    # Skip utterances that are too long
    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    # Write the spectrogram, embed and audio to disk
    np.save(mel_fpath, mel_spectrogram.T, allow_pickle=False)
    np.save(wav_fpath, wav, allow_pickle=False)

    # Return a tuple describing this training example
    return wav_fpath.name, mel_fpath.name, "embed-%s.npy" % basename, len(wav), mel_frames, text


def _split_on_silences(wav_fpath, words, hparams):
    # Load the audio waveform
    wav, _ = librosa.load(wav_fpath, hparams.sample_rate)
    wav = librosa.effects.trim(wav, top_db=40, frame_length=2048, hop_length=512)[0]
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max
    # Denoise; we may not need it here.
    if len(wav) > hparams.sample_rate * (0.3 + 0.1):
        noise_wav = np.concatenate([wav[:int(hparams.sample_rate * 0.15)],
                                    wav[-int(hparams.sample_rate * 0.15):]])
        profile = logmmse.profile_noise(noise_wav, hparams.sample_rate)
        wav = logmmse.denoise(wav, profile, eta=0)

    resp = pinyin(words, style=Style.TONE3)
    res = [v[0] for v in resp if v[0].strip()]
    res = " ".join(res)

    return wav, res


def preprocess_speaker_general(speaker_dir, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool):
    metadata = []
    extensions = ["*.wav", "*.flac", "*.mp3"]
    for extension in extensions:
        wav_fpath_list = speaker_dir.glob(extension)
        # Iterate over each wav
        for wav_fpath in wav_fpath_list:
            words = dict_info.get(wav_fpath.name.split(".")[0])
            words = dict_info.get(wav_fpath.name) if not words else words  # try with the extension
            if not words:
                print("no words found for %s" % wav_fpath.name)
                continue
            sub_basename = "%s_%02d" % (wav_fpath.name, 0)
            wav, text = _split_on_silences(wav_fpath, words, hparams)
            metadata.append(_process_utterance(wav, text, out_dir, sub_basename,
                                               skip_existing, hparams))
    return [m for m in metadata if m is not None]
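
As a quick check of the conversion performed in _split_on_silences: with the NeutralToneWith5Mixin converter above, neutral tones are rendered with a trailing 5. The expected outputs are shown as comments; exact results may vary with the pypinyin version:

resp = pinyin("你好吗", style=Style.TONE3)            # e.g. [["ni3"], ["hao3"], ["ma5"]]
text = " ".join(v[0] for v in resp if v[0].strip())  # "ni3 hao3 ma5"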
18
synthesizer_tacotron2/preprocess_transcript.py
Normal file
@@ -0,0 +1,18 @@
def preprocess_transcript_aishell3(dict_info, dict_transcript):
    for v in dict_transcript:
        if not v:
            continue
        v = v.strip().replace("\n", "").replace("\t", " ").split(" ")
        transList = []
        for i in range(2, len(v), 2):
            transList.append(v[i])
        dict_info[v[0]] = " ".join(transList)


def preprocess_transcript_magicdata(dict_info, dict_transcript):
    for v in dict_transcript:
        if not v:
            continue
        v = v.strip().replace("\n", "").replace("\t", " ").split(" ")
        dict_info[v[0]] = " ".join(v[2:])
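
Illustrative input lines for the two parsers. The layouts are assumptions based on the usual AISHELL-3 content.txt and MAGICDATA TRANS.txt conventions, and the sample values are made up:

# aishell3: hanzi and pinyin alternate after the filename, so v[2], v[4], ... are pinyin:
#   "SSB00050001.wav 广 guang3 州 zhou1 女 nv3" -> dict_info["SSB00050001.wav"] = "guang3 zhou1 nv3"
# magicdata: fields are filename, speaker id, transcript (tabs replaced by spaces above):
#   "A_0001.wav A_0001 今天 天气" -> dict_info["A_0001.wav"] = "今天 天气"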
92
synthesizer_tacotron2/synthesize.py
Normal file
@@ -0,0 +1,92 @@
import torch
from torch.utils.data import DataLoader
from synthesizer.synthesizer_dataset import SynthesizerDataset, collate_synthesizer
from synthesizer.models.tacotron import Tacotron
from synthesizer.hparams import hparams_debug_string
from synthesizer.utils.text import text_to_sequence
from synthesizer.utils.symbols import symbols
import numpy as np
from pathlib import Path
from tqdm import tqdm
import sys
from synthesizer.infolog import log
import os
from synthesizer.tacotron2 import Tacotron2
import time
import tensorflow as tf


def run_eval(args, checkpoint_path, output_dir, hparams, sentences):
    eval_dir = os.path.join(output_dir, "eval")
    log_dir = os.path.join(output_dir, "logs-eval")

    # Create output path if it doesn't exist
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(os.path.join(log_dir, "wavs"), exist_ok=True)
    os.makedirs(os.path.join(log_dir, "plots"), exist_ok=True)

    log(hparams_debug_string())
    synth = Tacotron2(checkpoint_path, hparams)

    # Set inputs batch wise
    sentences = [sentences[i: i + hparams.tacotron_synthesis_batch_size] for i
                 in range(0, len(sentences), hparams.tacotron_synthesis_batch_size)]

    log("Starting Synthesis")
    with open(os.path.join(eval_dir, "map.txt"), "w") as file:
        for i, texts in enumerate(tqdm(sentences)):
            start = time.time()
            basenames = ["batch_{}_sentence_{}".format(i, j) for j in range(len(texts))]
            # NOTE: this call does not match the signature of Tacotron2.synthesize(texts,
            # basenames, out_dir, log_dir, mel_filenames, embed_filenames) defined in
            # tacotron2.py, which also returns a single list of saved mel paths.
            mel_filenames, speaker_ids = synth.synthesize(texts, basenames, eval_dir, log_dir, None)

            for elems in zip(texts, mel_filenames, speaker_ids):
                file.write("|".join([str(x) for x in elems]) + "\n")
    log("synthesized mel spectrograms at {}".format(eval_dir))
    return eval_dir


def run_synthesis(in_dir, out_dir, model_dir, hparams):
    # This generates ground truth-aligned mels for vocoder training
    synth_dir = os.path.join(out_dir, "mels_gta")
    os.makedirs(synth_dir, exist_ok=True)
    metadata_filename = os.path.join(in_dir, "train.txt")
    print(hparams_debug_string())

    # Load the model in memory
    weights_dir = os.path.join(model_dir, "taco_pretrained")
    checkpoint_fpath = tf.train.get_checkpoint_state(weights_dir).model_checkpoint_path
    synth = Tacotron2(checkpoint_fpath, hparams, gta=True)

    # Load the metadata
    with open(metadata_filename, encoding="utf-8") as f:
        metadata = [line.strip().split("|") for line in f]
        frame_shift_ms = hparams.hop_size / hparams.sample_rate
        hours = sum([int(x[4]) for x in metadata]) * frame_shift_ms / 3600
        print("Loaded metadata for {} examples ({:.2f} hours)".format(len(metadata), hours))

    # Set inputs batch wise
    metadata = [metadata[i: i + hparams.tacotron_synthesis_batch_size] for i in
                range(0, len(metadata), hparams.tacotron_synthesis_batch_size)]
    # TODO: come on big boy, fix this
    # Quick and dirty fix to make sure that all batches have the same size
    metadata = metadata[:-1]

    print("Starting Synthesis")
    mel_dir = os.path.join(in_dir, "mels")
    embed_dir = os.path.join(in_dir, "embeds")
    meta_out_fpath = os.path.join(out_dir, "synthesized.txt")
    with open(meta_out_fpath, "w") as file:
        for i, meta in enumerate(tqdm(metadata)):
            texts = [m[5] for m in meta]
            mel_filenames = [os.path.join(mel_dir, m[1]) for m in meta]
            embed_filenames = [os.path.join(embed_dir, m[2]) for m in meta]
            basenames = [os.path.basename(m).replace(".npy", "").replace("mel-", "")
                         for m in mel_filenames]
            synth.synthesize(texts, basenames, synth_dir, None, mel_filenames, embed_filenames)

            for elems in meta:
                file.write("|".join([str(x) for x in elems]) + "\n")

    print("Synthesized mel spectrograms at {}".format(synth_dir))
    return meta_out_fpath
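
The metadata = metadata[:-1] line above silently drops the final ragged batch. A hedged alternative sketch (not in this diff) that keeps every utterance by padding the last batch with repeats of its final element:

def make_uniform_batches(items, batch_size):
    batches = [items[i:i + batch_size] for i in range(0, len(items), batch_size)]
    if batches and len(batches[-1]) < batch_size:
        # Repeat the last element so every batch has the same size
        batches[-1] = batches[-1] + [batches[-1][-1]] * (batch_size - len(batches[-1]))
    return batches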
92
synthesizer_tacotron2/synthesizer_dataset.py
Normal file
@@ -0,0 +1,92 @@
import torch
from torch.utils.data import Dataset
import numpy as np
from pathlib import Path
from synthesizer.utils.text import text_to_sequence


class SynthesizerDataset(Dataset):
    def __init__(self, metadata_fpath: Path, mel_dir: Path, embed_dir: Path, hparams):
        print("Using inputs from:\n\t%s\n\t%s\n\t%s" % (metadata_fpath, mel_dir, embed_dir))

        with metadata_fpath.open("r", encoding="utf-8") as metadata_file:
            metadata = [line.split("|") for line in metadata_file]

        mel_fnames = [x[1] for x in metadata if int(x[4])]
        mel_fpaths = [mel_dir.joinpath(fname) for fname in mel_fnames]
        embed_fnames = [x[2] for x in metadata if int(x[4])]
        embed_fpaths = [embed_dir.joinpath(fname) for fname in embed_fnames]
        self.samples_fpaths = list(zip(mel_fpaths, embed_fpaths))
        self.samples_texts = [x[5].strip() for x in metadata if int(x[4])]
        self.metadata = metadata
        self.hparams = hparams

        print("Found %d samples" % len(self.samples_fpaths))

    def __getitem__(self, index):
        # Sometimes index may be a list of 2 (not sure why this happens)
        # If that is the case, return a single item corresponding to the first element in index
        if isinstance(index, list):
            index = index[0]

        mel_path, embed_path = self.samples_fpaths[index]
        mel = np.load(mel_path).T.astype(np.float32)

        # Load the embed
        embed = np.load(embed_path)

        # Get the text and clean it
        text = text_to_sequence(self.samples_texts[index], self.hparams.tts_cleaner_names)

        # Convert the list returned by text_to_sequence to a numpy array
        text = np.asarray(text).astype(np.int32)

        return text, mel.astype(np.float32), embed.astype(np.float32), index

    def __len__(self):
        return len(self.samples_fpaths)


def collate_synthesizer(batch):
    # Text
    x_lens = [len(x[0]) for x in batch]
    max_x_len = max(x_lens)

    chars = [pad1d(x[0], max_x_len) for x in batch]
    chars = np.stack(chars)

    # Mel spectrogram
    spec_lens = [x[1].shape[-1] for x in batch]
    max_spec_len = max(spec_lens) + 1
    if max_spec_len % 2 != 0:  # FIXIT: Hardcoded due to incompatibility with Windows (no lambda)
        max_spec_len += 2 - max_spec_len % 2

    # WaveRNN mel spectrograms are normalized to [0, 1] so zero padding adds silence
    # By default, SV2TTS uses symmetric mels, where -1 * max_abs_value is silence.
    # if hparams.symmetric_mels:
    #     mel_pad_value = -1 * hparams.max_abs_value
    # else:
    #     mel_pad_value = 0
    mel_pad_value = -4  # FIXIT: Hardcoded due to incompatibility with Windows (no lambda)
    mel = [pad2d(x[1], max_spec_len, pad_value=mel_pad_value) for x in batch]
    mel = np.stack(mel)

    # Speaker embedding (SV2TTS)
    embeds = [x[2] for x in batch]

    # Index (for vocoder preprocessing)
    indices = [x[3] for x in batch]

    # Convert all to tensor
    chars = torch.tensor(chars).long()
    mel = torch.tensor(mel)
    embeds = torch.tensor(embeds)

    return chars, mel, embeds, indices


def pad1d(x, max_len, pad_value=0):
    return np.pad(x, (0, max_len - len(x)), mode="constant", constant_values=pad_value)


def pad2d(x, max_len, pad_value=0):
    return np.pad(x, ((0, 0), (0, max_len - x.shape[-1])), mode="constant", constant_values=pad_value)
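
A minimal usage sketch for this dataset; the paths, batch size, and hparams object are placeholders:

from torch.utils.data import DataLoader

dataset = SynthesizerDataset(Path("train.txt"), Path("mels"), Path("embeds"), hparams)
loader = DataLoader(dataset, batch_size=16, shuffle=True, collate_fn=collate_synthesizer)
for chars, mel, embeds, indices in loader:
    # chars: (B, max_text_len) int64; mel: (B, num_mels, max_spec_len); embeds: (B, embed_dim)
    break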
238
synthesizer_tacotron2/tacotron2.py
Normal file
@@ -0,0 +1,238 @@
from synthesizer.utils.text import text_to_sequence
from synthesizer.infolog import log
from synthesizer.models import create_model
from synthesizer.utils import plot
from synthesizer import audio
import tensorflow as tf
import numpy as np
import os


class Tacotron2:
    def __init__(self, checkpoint_path, hparams, gta=False, model_name="Tacotron"):
        log("Constructing model: %s" % model_name)
        # Force the batch size to be known in order to use attention masking in batch synthesis
        inputs = tf.placeholder(tf.int32, (None, None), name="inputs")
        input_lengths = tf.placeholder(tf.int32, (None,), name="input_lengths")
        speaker_embeddings = tf.placeholder(tf.float32, (None, hparams.speaker_embedding_size),
                                            name="speaker_embeddings")
        targets = tf.placeholder(tf.float32, (None, None, hparams.num_mels), name="mel_targets")
        split_infos = tf.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None), name="split_infos")
        with tf.variable_scope("Tacotron_model") as scope:
            self.model = create_model(model_name, hparams)
            if gta:
                self.model.initialize(inputs, input_lengths, speaker_embeddings, targets, gta=gta,
                                      split_infos=split_infos)
            else:
                self.model.initialize(inputs, input_lengths, speaker_embeddings,
                                      split_infos=split_infos)

            self.mel_outputs = self.model.tower_mel_outputs
            self.linear_outputs = self.model.tower_linear_outputs if (hparams.predict_linear and not gta) else None
            self.alignments = self.model.tower_alignments
            self.stop_token_prediction = self.model.tower_stop_token_prediction
            self.targets = targets

        self.gta = gta
        self._hparams = hparams
        # Pad input sequences with the <pad_token> 0 ( _ )
        self._pad = 0
        # Explicitly setting the padding to a value that doesn't originally exist in the spectrogram
        # to avoid any possible conflicts, without affecting the output range of the model too much
        if hparams.symmetric_mels:
            self._target_pad = -hparams.max_abs_value
        else:
            self._target_pad = 0.

        self.inputs = inputs
        self.input_lengths = input_lengths
        self.speaker_embeddings = speaker_embeddings
        self.targets = targets
        self.split_infos = split_infos

        log("Loading checkpoint: %s" % checkpoint_path)
        # Memory allocation on the GPUs as needed
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True

        self.session = tf.Session(config=config)
        self.session.run(tf.global_variables_initializer())

        saver = tf.train.Saver()
        saver.restore(self.session, checkpoint_path)

    def my_synthesize(self, speaker_embeds, texts):
        """
        Lighter synthesis function that directly returns the mel spectrograms.
        """
        print(texts)
        # Prepare the input
        cleaner_names = [x.strip() for x in self._hparams.cleaners.split(",")]
        seqs = [np.asarray(text_to_sequence(text, cleaner_names)) for text in texts]
        input_lengths = [len(seq) for seq in seqs]
        input_seqs, max_seq_len = self._prepare_inputs(seqs)
        split_infos = [[max_seq_len, 0, 0, 0]]
        feed_dict = {
            self.inputs: input_seqs,
            self.input_lengths: np.asarray(input_lengths, dtype=np.int32),
            self.split_infos: np.asarray(split_infos, dtype=np.int32),
            self.speaker_embeddings: speaker_embeds
        }

        # Forward it
        mels, alignments, stop_tokens = self.session.run(
            [self.mel_outputs, self.alignments, self.stop_token_prediction],
            feed_dict=feed_dict)
        mels, alignments, stop_tokens = list(mels[0]), alignments[0], stop_tokens[0]

        # Trim the output
        for i in range(len(mels)):
            try:
                target_length = list(np.round(stop_tokens[i])).index(1)
                mels[i] = mels[i][:target_length, :]
            except ValueError:
                # If no <stop_token> is generated, we simply do not trim the output
                continue

        return [mel.T for mel in mels], alignments

    def synthesize(self, texts, basenames, out_dir, log_dir, mel_filenames, embed_filenames):
        hparams = self._hparams
        cleaner_names = [x.strip() for x in hparams.cleaners.split(",")]

        assert 0 == len(texts) % self._hparams.tacotron_num_gpus
        seqs = [np.asarray(text_to_sequence(text, cleaner_names)) for text in texts]
        input_lengths = [len(seq) for seq in seqs]

        size_per_device = len(seqs) // self._hparams.tacotron_num_gpus

        # Pad inputs according to each GPU max length
        input_seqs = None
        split_infos = []
        for i in range(self._hparams.tacotron_num_gpus):
            device_input = seqs[size_per_device * i: size_per_device * (i + 1)]
            device_input, max_seq_len = self._prepare_inputs(device_input)
            input_seqs = np.concatenate((input_seqs, device_input), axis=1) if input_seqs is not None else device_input
            split_infos.append([max_seq_len, 0, 0, 0])

        feed_dict = {
            self.inputs: input_seqs,
            self.input_lengths: np.asarray(input_lengths, dtype=np.int32),
        }

        if self.gta:
            np_targets = [np.load(mel_filename) for mel_filename in mel_filenames]
            target_lengths = [len(np_target) for np_target in np_targets]

            # Pad targets according to each GPU max length
            target_seqs = None
            for i in range(self._hparams.tacotron_num_gpus):
                device_target = np_targets[size_per_device * i: size_per_device * (i + 1)]
                device_target, max_target_len = self._prepare_targets(device_target, self._hparams.outputs_per_step)
                target_seqs = np.concatenate((target_seqs, device_target), axis=1) if target_seqs is not None else device_target
                split_infos[i][1] = max_target_len  # Not really used, but set in case of future development

            feed_dict[self.targets] = target_seqs
            assert len(np_targets) == len(texts)

        feed_dict[self.split_infos] = np.asarray(split_infos, dtype=np.int32)
        feed_dict[self.speaker_embeddings] = [np.load(f) for f in embed_filenames]

        if self.gta or not hparams.predict_linear:
            mels, alignments, stop_tokens = self.session.run(
                [self.mel_outputs, self.alignments, self.stop_token_prediction],
                feed_dict=feed_dict)
            # Linearize outputs (1D arrays)
            mels = [mel for gpu_mels in mels for mel in gpu_mels]
            alignments = [align for gpu_aligns in alignments for align in gpu_aligns]
            stop_tokens = [token for gpu_token in stop_tokens for token in gpu_token]

            if not self.gta:
                # Natural batch synthesis
                # Get Mel lengths for the entire batch from stop_tokens predictions
                target_lengths = self._get_output_lengths(stop_tokens)

            # Take off the batch wise padding
            mels = [mel[:target_length, :] for mel, target_length in zip(mels, target_lengths)]
            assert len(mels) == len(texts)

        else:
            linears, mels, alignments, stop_tokens = self.session.run(
                [self.linear_outputs, self.mel_outputs, self.alignments,
                 self.stop_token_prediction],
                feed_dict=feed_dict)
            # Linearize outputs (1D arrays)
            linears = [linear for gpu_linear in linears for linear in gpu_linear]
            mels = [mel for gpu_mels in mels for mel in gpu_mels]
            alignments = [align for gpu_aligns in alignments for align in gpu_aligns]
            stop_tokens = [token for gpu_token in stop_tokens for token in gpu_token]

            # Natural batch synthesis
            # Get Mel/Linear lengths for the entire batch from stop_tokens predictions
            # target_lengths = self._get_output_lengths(stop_tokens)
            target_lengths = [9999]

            # Take off the batch wise padding
            mels = [mel[:target_length, :] for mel, target_length in zip(mels, target_lengths)]
            linears = [linear[:target_length, :] for linear, target_length in zip(linears, target_lengths)]
            assert len(mels) == len(linears) == len(texts)

        if basenames is None:
            raise NotImplementedError()

        saved_mels_paths = []
        for i, mel in enumerate(mels):
            # Write the spectrogram to disk
            # Note: output mel-spectrogram files and target ones have the same names, just different folders
            mel_filename = os.path.join(out_dir, "mel-{}.npy".format(basenames[i]))
            np.save(mel_filename, mel, allow_pickle=False)
            saved_mels_paths.append(mel_filename)

            if log_dir is not None:
                # Save wav (mel -> wav)
                wav = audio.inv_mel_spectrogram(mel.T, hparams)
                audio.save_wav(wav, os.path.join(log_dir, "wavs/wav-{}-mel.wav".format(basenames[i])), sr=hparams.sample_rate)

                # Save alignments
                plot.plot_alignment(alignments[i], os.path.join(log_dir, "plots/alignment-{}.png".format(basenames[i])),
                                    title="{}".format(texts[i]), split_title=True, max_len=target_lengths[i])

                # Save mel spectrogram plot
                plot.plot_spectrogram(mel, os.path.join(log_dir, "plots/mel-{}.png".format(basenames[i])),
                                      title="{}".format(texts[i]), split_title=True)

                if hparams.predict_linear:
                    # Save wav (linear -> wav)
                    wav = audio.inv_linear_spectrogram(linears[i].T, hparams)
                    audio.save_wav(wav, os.path.join(log_dir, "wavs/wav-{}-linear.wav".format(basenames[i])), sr=hparams.sample_rate)

                    # Save linear spectrogram plot
                    plot.plot_spectrogram(linears[i], os.path.join(log_dir, "plots/linear-{}.png".format(basenames[i])),
                                          title="{}".format(texts[i]), split_title=True, auto_aspect=True)

        return saved_mels_paths

    def _round_up(self, x, multiple):
        remainder = x % multiple
        return x if remainder == 0 else x + multiple - remainder

    def _prepare_inputs(self, inputs):
        max_len = max([len(x) for x in inputs])
        return np.stack([self._pad_input(x, max_len) for x in inputs]), max_len

    def _pad_input(self, x, length):
        return np.pad(x, (0, length - x.shape[0]), mode="constant", constant_values=self._pad)

    def _prepare_targets(self, targets, alignment):
        max_len = max([len(t) for t in targets])
        data_len = self._round_up(max_len, alignment)
        return np.stack([self._pad_target(t, data_len) for t in targets]), data_len

    def _pad_target(self, t, length):
        return np.pad(t, [(0, length - t.shape[0]), (0, 0)], mode="constant", constant_values=self._target_pad)

    def _get_output_lengths(self, stop_tokens):
        # Determine each mel length by the stop token predictions.
        # (len = first occurrence of 1 in stop_tokens, row wise)
        output_lengths = [row.index(1) for row in np.round(stop_tokens).tolist()]
        return output_lengths
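
A hedged usage sketch for my_synthesize; the checkpoint path, embedding file, and input text are placeholders:

# synth = Tacotron2("taco_pretrained/tacotron_model.ckpt-278000", hparams)
# embed = np.load("embeds/embed-xxx.npy")                 # (speaker_embedding_size,)
# specs, alignments = synth.my_synthesize([embed], ["ni3 hao3"])
# specs[0].shape == (num_mels, T) -- my_synthesize returns each mel transposed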
393
synthesizer_tacotron2/train.py
Normal file
@@ -0,0 +1,393 @@
from synthesizer.utils.symbols import symbols
from synthesizer.utils.text import sequence_to_text
from synthesizer.hparams import hparams_debug_string
from synthesizer.feeder import Feeder
from synthesizer.models import create_model
from synthesizer.utils import ValueWindow, plot
from synthesizer import infolog, audio
from datetime import datetime
from tqdm import tqdm
import tensorflow as tf
import numpy as np
import traceback
import time
import os

log = infolog.log


def add_embedding_stats(summary_writer, embedding_names, paths_to_meta, checkpoint_path):
    # Create tensorboard projector
    config = tf.contrib.tensorboard.plugins.projector.ProjectorConfig()
    config.model_checkpoint_path = checkpoint_path

    for embedding_name, path_to_meta in zip(embedding_names, paths_to_meta):
        # Initialize config
        embedding = config.embeddings.add()
        # Specify the embedding variable and the metadata
        embedding.tensor_name = embedding_name
        embedding.metadata_path = path_to_meta

    # Project the embeddings to space dimensions for visualization
    tf.contrib.tensorboard.plugins.projector.visualize_embeddings(summary_writer, config)


def add_train_stats(model, hparams):
    with tf.variable_scope("stats") as scope:
        for i in range(hparams.tacotron_num_gpus):
            tf.summary.histogram("mel_outputs %d" % i, model.tower_mel_outputs[i])
            tf.summary.histogram("mel_targets %d" % i, model.tower_mel_targets[i])
        tf.summary.scalar("before_loss", model.before_loss)
        tf.summary.scalar("after_loss", model.after_loss)

        if hparams.predict_linear:
            tf.summary.scalar("linear_loss", model.linear_loss)
            for i in range(hparams.tacotron_num_gpus):
                tf.summary.histogram("mel_outputs %d" % i, model.tower_linear_outputs[i])
                tf.summary.histogram("mel_targets %d" % i, model.tower_linear_targets[i])

        tf.summary.scalar("regularization_loss", model.regularization_loss)
        tf.summary.scalar("stop_token_loss", model.stop_token_loss)
        tf.summary.scalar("loss", model.loss)
        tf.summary.scalar("learning_rate", model.learning_rate)  # Control learning rate decay speed
        if hparams.tacotron_teacher_forcing_mode == "scheduled":
            tf.summary.scalar("teacher_forcing_ratio", model.ratio)  # Control teacher forcing
            # ratio decay when mode = "scheduled"
        gradient_norms = [tf.norm(grad) for grad in model.gradients]
        tf.summary.histogram("gradient_norm", gradient_norms)
        tf.summary.scalar("max_gradient_norm", tf.reduce_max(gradient_norms))  # visualize
        # gradients (in case of explosion)
        return tf.summary.merge_all()


def add_eval_stats(summary_writer, step, linear_loss, before_loss, after_loss, stop_token_loss,
                   loss):
    values = [
        tf.Summary.Value(tag="Tacotron_eval_model/eval_stats/eval_before_loss",
                         simple_value=before_loss),
        tf.Summary.Value(tag="Tacotron_eval_model/eval_stats/eval_after_loss",
                         simple_value=after_loss),
        tf.Summary.Value(tag="Tacotron_eval_model/eval_stats/stop_token_loss",
                         simple_value=stop_token_loss),
        tf.Summary.Value(tag="Tacotron_eval_model/eval_stats/eval_loss", simple_value=loss),
    ]
    if linear_loss is not None:
        values.append(tf.Summary.Value(tag="Tacotron_eval_model/eval_stats/eval_linear_loss",
                                       simple_value=linear_loss))
    test_summary = tf.Summary(value=values)
    summary_writer.add_summary(test_summary, step)


def time_string():
    return datetime.now().strftime("%Y-%m-%d %H:%M")


def model_train_mode(args, feeder, hparams, global_step):
    with tf.variable_scope("Tacotron_model", reuse=tf.AUTO_REUSE) as scope:
        model = create_model("Tacotron", hparams)
        model.initialize(feeder.inputs, feeder.input_lengths, feeder.speaker_embeddings,
                         feeder.mel_targets, feeder.token_targets,
                         targets_lengths=feeder.targets_lengths, global_step=global_step,
                         is_training=True, split_infos=feeder.split_infos)
        model.add_loss()
        model.add_optimizer(global_step)
        stats = add_train_stats(model, hparams)
        return model, stats


def model_test_mode(args, feeder, hparams, global_step):
    with tf.variable_scope("Tacotron_model", reuse=tf.AUTO_REUSE) as scope:
        model = create_model("Tacotron", hparams)
        model.initialize(feeder.eval_inputs, feeder.eval_input_lengths,
                         feeder.eval_speaker_embeddings, feeder.eval_mel_targets,
                         feeder.eval_token_targets, targets_lengths=feeder.eval_targets_lengths,
                         global_step=global_step, is_training=False, is_evaluating=True,
                         split_infos=feeder.eval_split_infos)
        model.add_loss()
        return model


def train(log_dir, args, hparams):
    save_dir = os.path.join(log_dir, "taco_pretrained")
    plot_dir = os.path.join(log_dir, "plots")
    wav_dir = os.path.join(log_dir, "wavs")
    mel_dir = os.path.join(log_dir, "mel-spectrograms")
    eval_dir = os.path.join(log_dir, "eval-dir")
    eval_plot_dir = os.path.join(eval_dir, "plots")
    eval_wav_dir = os.path.join(eval_dir, "wavs")
    tensorboard_dir = os.path.join(log_dir, "tacotron_events")
    meta_folder = os.path.join(log_dir, "metas")
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(plot_dir, exist_ok=True)
    os.makedirs(wav_dir, exist_ok=True)
    os.makedirs(mel_dir, exist_ok=True)
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(eval_plot_dir, exist_ok=True)
    os.makedirs(eval_wav_dir, exist_ok=True)
    os.makedirs(tensorboard_dir, exist_ok=True)
    os.makedirs(meta_folder, exist_ok=True)

    checkpoint_fpath = os.path.join(save_dir, "tacotron_model.ckpt")
    metadat_fpath = os.path.join(args.synthesizer_root, "train.txt")

    log("Checkpoint path: {}".format(checkpoint_fpath))
    log("Loading training data from: {}".format(metadat_fpath))
    log("Using model: Tacotron")
    log(hparams_debug_string())

    # Start by setting a seed for repeatability
    tf.set_random_seed(hparams.tacotron_random_seed)

    # Set up data feeder
    coord = tf.train.Coordinator()
    with tf.variable_scope("datafeeder") as scope:
        feeder = Feeder(coord, metadat_fpath, hparams)

    # Set up model:
    global_step = tf.Variable(0, name="global_step", trainable=False)
    model, stats = model_train_mode(args, feeder, hparams, global_step)
    eval_model = model_test_mode(args, feeder, hparams, global_step)

    # Embeddings metadata
    char_embedding_meta = os.path.join(meta_folder, "CharacterEmbeddings.tsv")
    if not os.path.isfile(char_embedding_meta):
        with open(char_embedding_meta, "w", encoding="utf-8") as f:
            for symbol in symbols:
                if symbol == " ":
                    symbol = "\\s"  # For visual purposes, swap space with \s

                f.write("{}\n".format(symbol))

    char_embedding_meta = char_embedding_meta.replace(log_dir, "..")

    # Book keeping
    step = 0
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)
    saver = tf.train.Saver(max_to_keep=5)

    log("Tacotron training set to a maximum of {} steps".format(args.tacotron_train_steps))

    # Memory allocation on the GPU as needed
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True

    # Train
    with tf.Session(config=config) as sess:
        try:
            summary_writer = tf.summary.FileWriter(tensorboard_dir, sess.graph)

            sess.run(tf.global_variables_initializer())

            # saved model restoring
            if args.restore:
                # Restore saved model if the user requested it, default = True
                try:
                    checkpoint_state = tf.train.get_checkpoint_state(save_dir)

                    if checkpoint_state and checkpoint_state.model_checkpoint_path:
                        log("Loading checkpoint {}".format(checkpoint_state.model_checkpoint_path),
                            slack=True)
                        saver.restore(sess, checkpoint_state.model_checkpoint_path)

                    else:
                        log("No model to load at {}".format(save_dir), slack=True)
                        saver.save(sess, checkpoint_fpath, global_step=global_step)

                except tf.errors.OutOfRangeError as e:
                    log("Cannot restore checkpoint: {}".format(e), slack=True)
            else:
                log("Starting new training!", slack=True)
                saver.save(sess, checkpoint_fpath, global_step=global_step)

            # initializing feeder
            feeder.start_threads(sess)

            # Training loop
            while not coord.should_stop() and step < args.tacotron_train_steps:
                start_time = time.time()
                step, loss, opt = sess.run([global_step, model.loss, model.optimize])
                time_window.append(time.time() - start_time)
                loss_window.append(loss)
                message = "Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]".format(
                    step, time_window.average, loss, loss_window.average)
                log(message, end="\r", slack=(step % args.checkpoint_interval == 0))
                print(message)

                if loss > 100 or np.isnan(loss):
                    log("Loss exploded to {:.5f} at step {}".format(loss, step))
                    raise Exception("Loss exploded")

                if step % args.summary_interval == 0:
                    log("\nWriting summary at step {}".format(step))
                    summary_writer.add_summary(sess.run(stats), step)

                if step % args.eval_interval == 0:
                    # Run eval and save eval stats
                    log("\nRunning evaluation at step {}".format(step))

                    eval_losses = []
                    before_losses = []
                    after_losses = []
                    stop_token_losses = []
                    linear_losses = []
                    linear_loss = None

                    if hparams.predict_linear:
                        for i in tqdm(range(feeder.test_steps)):
                            eloss, before_loss, after_loss, stop_token_loss, linear_loss, mel_p, \
                                mel_t, t_len, align, lin_p, lin_t = sess.run(
                                [
                                    eval_model.tower_loss[0], eval_model.tower_before_loss[0],
                                    eval_model.tower_after_loss[0],
                                    eval_model.tower_stop_token_loss[0],
                                    eval_model.tower_linear_loss[0],
                                    eval_model.tower_mel_outputs[0][0],
                                    eval_model.tower_mel_targets[0][0],
                                    eval_model.tower_targets_lengths[0][0],
                                    eval_model.tower_alignments[0][0],
                                    eval_model.tower_linear_outputs[0][0],
                                    eval_model.tower_linear_targets[0][0],
                                ])
                            eval_losses.append(eloss)
                            before_losses.append(before_loss)
                            after_losses.append(after_loss)
                            stop_token_losses.append(stop_token_loss)
                            linear_losses.append(linear_loss)
                        linear_loss = sum(linear_losses) / len(linear_losses)

                        wav = audio.inv_linear_spectrogram(lin_p.T, hparams)
                        audio.save_wav(wav, os.path.join(eval_wav_dir,
                                                         "step-{}-eval-wave-from-linear.wav".format(
                                                             step)), sr=hparams.sample_rate)

                    else:
                        for i in tqdm(range(feeder.test_steps)):
                            eloss, before_loss, after_loss, stop_token_loss, mel_p, mel_t, t_len, \
                                align = sess.run(
                                [
                                    eval_model.tower_loss[0], eval_model.tower_before_loss[0],
                                    eval_model.tower_after_loss[0],
                                    eval_model.tower_stop_token_loss[0],
                                    eval_model.tower_mel_outputs[0][0],
                                    eval_model.tower_mel_targets[0][0],
                                    eval_model.tower_targets_lengths[0][0],
                                    eval_model.tower_alignments[0][0]
                                ])
                            eval_losses.append(eloss)
                            before_losses.append(before_loss)
                            after_losses.append(after_loss)
                            stop_token_losses.append(stop_token_loss)

                    eval_loss = sum(eval_losses) / len(eval_losses)
                    before_loss = sum(before_losses) / len(before_losses)
                    after_loss = sum(after_losses) / len(after_losses)
                    stop_token_loss = sum(stop_token_losses) / len(stop_token_losses)

                    log("Saving eval log to {}..".format(eval_dir))
                    # Save some log to monitor model improvement on same unseen sequence
                    wav = audio.inv_mel_spectrogram(mel_p.T, hparams)
                    audio.save_wav(wav, os.path.join(eval_wav_dir,
                                                     "step-{}-eval-wave-from-mel.wav".format(step)),
                                   sr=hparams.sample_rate)

                    plot.plot_alignment(align, os.path.join(eval_plot_dir,
                                                            "step-{}-eval-align.png".format(step)),
                                        title="{}, {}, step={}, loss={:.5f}".format("Tacotron",
                                                                                    time_string(),
                                                                                    step,
                                                                                    eval_loss),
                                        max_len=t_len // hparams.outputs_per_step)
                    plot.plot_spectrogram(mel_p, os.path.join(eval_plot_dir,
                                                              "step-{}-eval-mel-spectrogram.png".format(step)),
                                          title="{}, {}, step={}, loss={:.5f}".format("Tacotron",
                                                                                      time_string(),
                                                                                      step,
                                                                                      eval_loss),
                                          target_spectrogram=mel_t,
                                          max_len=t_len)

                    if hparams.predict_linear:
                        plot.plot_spectrogram(lin_p, os.path.join(eval_plot_dir,
                                                                  "step-{}-eval-linear-spectrogram.png".format(step)),
                                              title="{}, {}, step={}, loss={:.5f}".format(
                                                  "Tacotron", time_string(), step, eval_loss),
                                              target_spectrogram=lin_t,
                                              max_len=t_len, auto_aspect=True)

                    log("Eval loss for global step {}: {:.3f}".format(step, eval_loss))
                    log("Writing eval summary!")
                    add_eval_stats(summary_writer, step, linear_loss, before_loss, after_loss,
                                   stop_token_loss, eval_loss)

                if step % args.checkpoint_interval == 0 or step == args.tacotron_train_steps or \
                        step == 300:
                    # Save model and current global step
                    saver.save(sess, checkpoint_fpath, global_step=global_step)

                    log("\nSaving alignment, Mel-Spectrograms and griffin-lim inverted waveform..")
                    input_seq, mel_prediction, alignment, target, target_length = sess.run([
                        model.tower_inputs[0][0],
                        model.tower_mel_outputs[0][0],
                        model.tower_alignments[0][0],
                        model.tower_mel_targets[0][0],
                        model.tower_targets_lengths[0][0],
                    ])

                    # save predicted mel spectrogram to disk (debug)
                    mel_filename = "mel-prediction-step-{}.npy".format(step)
                    np.save(os.path.join(mel_dir, mel_filename), mel_prediction.T,
                            allow_pickle=False)

                    # save griffin lim inverted wav for debug (mel -> wav)
                    wav = audio.inv_mel_spectrogram(mel_prediction.T, hparams)
                    audio.save_wav(wav,
                                   os.path.join(wav_dir, "step-{}-wave-from-mel.wav".format(step)),
                                   sr=hparams.sample_rate)

                    # save alignment plot to disk (control purposes)
                    plot.plot_alignment(alignment,
                                        os.path.join(plot_dir, "step-{}-align.png".format(step)),
                                        title="{}, {}, step={}, loss={:.5f}".format("Tacotron",
                                                                                    time_string(),
                                                                                    step, loss),
                                        max_len=target_length // hparams.outputs_per_step)
                    # save real and predicted mel-spectrogram plot to disk (control purposes)
                    plot.plot_spectrogram(mel_prediction, os.path.join(plot_dir,
                                                                       "step-{}-mel-spectrogram.png".format(step)),
                                          title="{}, {}, step={}, loss={:.5f}".format("Tacotron",
                                                                                      time_string(),
                                                                                      step, loss),
                                          target_spectrogram=target,
                                          max_len=target_length)
                    log("Input at step {}: {}".format(step, sequence_to_text(input_seq)))

                if step % args.embedding_interval == 0 or step == args.tacotron_train_steps or step == 1:
                    # Get current checkpoint state
                    checkpoint_state = tf.train.get_checkpoint_state(save_dir)

                    # Update Projector
                    log("\nSaving Model Character Embeddings visualization..")
                    add_embedding_stats(summary_writer, [model.embedding_table.name],
                                        [char_embedding_meta],
                                        checkpoint_state.model_checkpoint_path)
                    log("Tacotron Character embeddings have been updated on tensorboard!")

            log("Tacotron training complete after {} global steps!".format(
                args.tacotron_train_steps), slack=True)
            return save_dir

        except Exception as e:
            log("Exiting due to exception: {}".format(e), slack=True)
            traceback.print_exc()
            coord.request_stop(e)


def tacotron_train(args, log_dir, hparams):
    return train(log_dir, args, hparams)
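
A hedged sketch of the argument object train() consumes. The field names are taken from the usages above; the values are placeholders, and the real CLI wiring lives elsewhere in the repo:

from types import SimpleNamespace

args = SimpleNamespace(synthesizer_root="datasets/SV2TTS/synthesizer", restore=True,
                       tacotron_train_steps=2_000_000, summary_interval=2500,
                       eval_interval=5000, checkpoint_interval=2000,
                       embedding_interval=10000)
# tacotron_train(args, log_dir="synthesizer/saved_models/logs-run1", hparams=hparams)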
45
synthesizer_tacotron2/utils/__init__.py
Normal file
@@ -0,0 +1,45 @@
import torch


_output_ref = None
_replicas_ref = None

def data_parallel_workaround(model, *input):
    global _output_ref
    global _replicas_ref
    device_ids = list(range(torch.cuda.device_count()))
    output_device = device_ids[0]
    replicas = torch.nn.parallel.replicate(model, device_ids)
    # input.shape = (num_args, batch, ...)
    inputs = torch.nn.parallel.scatter(input, device_ids)
    # inputs.shape = (num_gpus, num_args, batch/num_gpus, ...)
    replicas = replicas[:len(inputs)]
    outputs = torch.nn.parallel.parallel_apply(replicas, inputs)
    y_hat = torch.nn.parallel.gather(outputs, output_device)
    _output_ref = outputs
    _replicas_ref = replicas
    return y_hat


class ValueWindow():
    def __init__(self, window_size=100):
        self._window_size = window_size
        self._values = []

    def append(self, x):
        self._values = self._values[-(self._window_size - 1):] + [x]

    @property
    def sum(self):
        return sum(self._values)

    @property
    def count(self):
        return len(self._values)

    @property
    def average(self):
        return self.sum / max(1, self.count)

    def reset(self):
        self._values = []
|
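ValueWindow is the usual tool for smoothing noisy per-step metrics when logging. A minimal usage sketch (the loss values are made up):

loss_window = ValueWindow(window_size=100)
for step, loss in enumerate([0.91, 0.87, 0.84], start=1):
    loss_window.append(loss)
    print("step {} | loss {:.3f} | avg loss {:.3f}".format(step, loss, loss_window.average))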
62
synthesizer_tacotron2/utils/_cmudict.py
Normal file
@@ -0,0 +1,62 @@
import re

valid_symbols = [
    "AA", "AA0", "AA1", "AA2", "AE", "AE0", "AE1", "AE2", "AH", "AH0", "AH1", "AH2",
    "AO", "AO0", "AO1", "AO2", "AW", "AW0", "AW1", "AW2", "AY", "AY0", "AY1", "AY2",
    "B", "CH", "D", "DH", "EH", "EH0", "EH1", "EH2", "ER", "ER0", "ER1", "ER2", "EY",
    "EY0", "EY1", "EY2", "F", "G", "HH", "IH", "IH0", "IH1", "IH2", "IY", "IY0", "IY1",
    "IY2", "JH", "K", "L", "M", "N", "NG", "OW", "OW0", "OW1", "OW2", "OY", "OY0",
    "OY1", "OY2", "P", "R", "S", "SH", "T", "TH", "UH", "UH0", "UH1", "UH2", "UW",
    "UW0", "UW1", "UW2", "V", "W", "Y", "Z", "ZH"
]

_valid_symbol_set = set(valid_symbols)


class CMUDict:
    """Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict"""
    def __init__(self, file_or_path, keep_ambiguous=True):
        if isinstance(file_or_path, str):
            with open(file_or_path, encoding="latin-1") as f:
                entries = _parse_cmudict(f)
        else:
            entries = _parse_cmudict(file_or_path)
        if not keep_ambiguous:
            entries = {word: pron for word, pron in entries.items() if len(pron) == 1}
        self._entries = entries

    def __len__(self):
        return len(self._entries)

    def lookup(self, word):
        """Returns list of ARPAbet pronunciations of the given word."""
        return self._entries.get(word.upper())


_alt_re = re.compile(r"\([0-9]+\)")


def _parse_cmudict(file):
    cmudict = {}
    for line in file:
        if len(line) and (line[0] >= "A" and line[0] <= "Z" or line[0] == "'"):
            # CMUDict separates the word from its pronunciation with two spaces.
            parts = line.split("  ")
            word = re.sub(_alt_re, "", parts[0])
            pronunciation = _get_pronunciation(parts[1])
            if pronunciation:
                if word in cmudict:
                    cmudict[word].append(pronunciation)
                else:
                    cmudict[word] = [pronunciation]
    return cmudict


def _get_pronunciation(s):
    parts = s.strip().split(" ")
    for part in parts:
        if part not in _valid_symbol_set:
            return None
    return " ".join(parts)
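A usage sketch, assuming a local copy of the CMU dictionary file (the filename below is hypothetical, and the lookup result is illustrative):

cmudict = CMUDict("cmudict-0.7b.txt", keep_ambiguous=False)  # hypothetical path
print(len(cmudict))              # number of unambiguous entries
print(cmudict.lookup("hello"))   # e.g. ["HH AH0 L OW1"]; None if the word is absent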
88
synthesizer_tacotron2/utils/cleaners.py
Normal file
@@ -0,0 +1,88 @@
"""
|
||||
Cleaners are transformations that run over the input text at both training and eval time.
|
||||
|
||||
Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
|
||||
hyperparameter. Some cleaners are English-specific. You"ll typically want to use:
|
||||
1. "english_cleaners" for English text
|
||||
2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
|
||||
the Unidecode library (https://pypi.python.org/pypi/Unidecode)
|
||||
3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
|
||||
the symbols in symbols.py to match your data).
|
||||
"""
|
||||
|
||||
import re
|
||||
from unidecode import unidecode
|
||||
from .numbers import normalize_numbers
|
||||
|
||||
# Regular expression matching whitespace:
|
||||
_whitespace_re = re.compile(r"\s+")
|
||||
|
||||
# List of (regular expression, replacement) pairs for abbreviations:
|
||||
_abbreviations = [(re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) for x in [
|
||||
("mrs", "misess"),
|
||||
("mr", "mister"),
|
||||
("dr", "doctor"),
|
||||
("st", "saint"),
|
||||
("co", "company"),
|
||||
("jr", "junior"),
|
||||
("maj", "major"),
|
||||
("gen", "general"),
|
||||
("drs", "doctors"),
|
||||
("rev", "reverend"),
|
||||
("lt", "lieutenant"),
|
||||
("hon", "honorable"),
|
||||
("sgt", "sergeant"),
|
||||
("capt", "captain"),
|
||||
("esq", "esquire"),
|
||||
("ltd", "limited"),
|
||||
("col", "colonel"),
|
||||
("ft", "fort"),
|
||||
]]
|
||||
|
||||
|
||||
def expand_abbreviations(text):
|
||||
for regex, replacement in _abbreviations:
|
||||
text = re.sub(regex, replacement, text)
|
||||
return text
|
||||
|
||||
|
||||
def expand_numbers(text):
|
||||
return normalize_numbers(text)
|
||||
|
||||
|
||||
def lowercase(text):
|
||||
"""lowercase input tokens."""
|
||||
return text.lower()
|
||||
|
||||
|
||||
def collapse_whitespace(text):
|
||||
return re.sub(_whitespace_re, " ", text)
|
||||
|
||||
|
||||
def convert_to_ascii(text):
|
||||
return unidecode(text)
|
||||
|
||||
|
||||
def basic_cleaners(text):
|
||||
"""Basic pipeline that lowercases and collapses whitespace without transliteration."""
|
||||
text = lowercase(text)
|
||||
text = collapse_whitespace(text)
|
||||
return text
|
||||
|
||||
|
||||
def transliteration_cleaners(text):
|
||||
"""Pipeline for non-English text that transliterates to ASCII."""
|
||||
text = convert_to_ascii(text)
|
||||
text = lowercase(text)
|
||||
text = collapse_whitespace(text)
|
||||
return text
|
||||
|
||||
|
||||
def english_cleaners(text):
|
||||
"""Pipeline for English text, including number and abbreviation expansion."""
|
||||
text = convert_to_ascii(text)
|
||||
text = lowercase(text)
|
||||
text = expand_numbers(text)
|
||||
text = expand_abbreviations(text)
|
||||
text = collapse_whitespace(text)
|
||||
return text
|
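To illustrate the english_cleaners pipeline end to end, here is a small sketch; the expected output is reasoned from the rules above (lowercasing happens before abbreviation matching, and numbers expand before abbreviations), not captured from a run:

print(english_cleaners("Dr. Smith paid $3.50 on the 3rd."))
# -> "doctor smith paid three dollars, fifty cents on the third."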
68
synthesizer_tacotron2/utils/numbers.py
Normal file
@@ -0,0 +1,68 @@
import re
import inflect

_inflect = inflect.engine()
_comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
_decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)")
_pounds_re = re.compile(r"£([0-9\,]*[0-9]+)")
_dollars_re = re.compile(r"\$([0-9\.\,]*[0-9]+)")
_ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")
_number_re = re.compile(r"[0-9]+")


def _remove_commas(m):
    return m.group(1).replace(",", "")


def _expand_decimal_point(m):
    return m.group(1).replace(".", " point ")


def _expand_dollars(m):
    match = m.group(1)
    parts = match.split(".")
    if len(parts) > 2:
        return match + " dollars"  # Unexpected format
    dollars = int(parts[0]) if parts[0] else 0
    cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
    if dollars and cents:
        dollar_unit = "dollar" if dollars == 1 else "dollars"
        cent_unit = "cent" if cents == 1 else "cents"
        return "%s %s, %s %s" % (dollars, dollar_unit, cents, cent_unit)
    elif dollars:
        dollar_unit = "dollar" if dollars == 1 else "dollars"
        return "%s %s" % (dollars, dollar_unit)
    elif cents:
        cent_unit = "cent" if cents == 1 else "cents"
        return "%s %s" % (cents, cent_unit)
    else:
        return "zero dollars"


def _expand_ordinal(m):
    return _inflect.number_to_words(m.group(0))


def _expand_number(m):
    num = int(m.group(0))
    if num > 1000 and num < 3000:
        if num == 2000:
            return "two thousand"
        elif num > 2000 and num < 2010:
            return "two thousand " + _inflect.number_to_words(num % 100)
        elif num % 100 == 0:
            return _inflect.number_to_words(num // 100) + " hundred"
        else:
            return _inflect.number_to_words(num, andword="", zero="oh", group=2).replace(", ", " ")
    else:
        return _inflect.number_to_words(num, andword="")


def normalize_numbers(text):
    text = re.sub(_comma_number_re, _remove_commas, text)
    text = re.sub(_pounds_re, r"\1 pounds", text)
    text = re.sub(_dollars_re, _expand_dollars, text)
    text = re.sub(_decimal_number_re, _expand_decimal_point, text)
    text = re.sub(_ordinal_re, _expand_ordinal, text)
    text = re.sub(_number_re, _expand_number, text)
    return text
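A worked example of the substitution order in normalize_numbers (commas first, then currency, decimals, ordinals, and plain numbers; the expected output is reasoned from the code above, not captured from a run):

print(normalize_numbers("He spent $1,000 in 1984."))
# Commas: "$1,000" -> "$1000"; dollars: -> "1000 dollars";
# plain numbers: "1000" -> "one thousand", "1984" -> "nineteen eighty-four"
# (years between 1000 and 3000 are read in two-digit groups).
# -> "He spent one thousand dollars in nineteen eighty-four."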
76
synthesizer_tacotron2/utils/plot.py
Normal file
@@ -0,0 +1,76 @@
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import numpy as np


def split_title_line(title_text, max_words=5):
    """Splits a title string into lines of at most max_words words each."""
    seq = title_text.split()
    return "\n".join([" ".join(seq[i:i + max_words]) for i in range(0, len(seq), max_words)])


def plot_alignment(alignment, path, title=None, split_title=False, max_len=None):
    if max_len is not None:
        alignment = alignment[:, :max_len]

    fig = plt.figure(figsize=(8, 6))
    ax = fig.add_subplot(111)

    im = ax.imshow(
        alignment,
        aspect="auto",
        origin="lower",
        interpolation="none")
    fig.colorbar(im, ax=ax)
    xlabel = "Decoder timestep"

    if split_title:
        title = split_title_line(title)

    plt.xlabel(xlabel)
    plt.title(title)
    plt.ylabel("Encoder timestep")
    plt.tight_layout()
    plt.savefig(path, format="png")
    plt.close()


def plot_spectrogram(pred_spectrogram, path, title=None, split_title=False, target_spectrogram=None, max_len=None, auto_aspect=False):
    if max_len is not None:
        # Guard against slicing a missing target (None is not subscriptable).
        if target_spectrogram is not None:
            target_spectrogram = target_spectrogram[:max_len]
        pred_spectrogram = pred_spectrogram[:max_len]

    if split_title:
        title = split_title_line(title)

    fig = plt.figure(figsize=(10, 8))
    # Set common labels
    fig.text(0.5, 0.18, title, horizontalalignment="center", fontsize=16)

    # Target spectrogram subplot
    if target_spectrogram is not None:
        ax1 = fig.add_subplot(311)
        ax2 = fig.add_subplot(312)

        if auto_aspect:
            im = ax1.imshow(np.rot90(target_spectrogram), aspect="auto", interpolation="none")
        else:
            im = ax1.imshow(np.rot90(target_spectrogram), interpolation="none")
        ax1.set_title("Target Mel-Spectrogram")
        fig.colorbar(mappable=im, shrink=0.65, orientation="horizontal", ax=ax1)
        ax2.set_title("Predicted Mel-Spectrogram")
    else:
        ax2 = fig.add_subplot(211)

    if auto_aspect:
        im = ax2.imshow(np.rot90(pred_spectrogram), aspect="auto", interpolation="none")
    else:
        im = ax2.imshow(np.rot90(pred_spectrogram), interpolation="none")
    fig.colorbar(mappable=im, shrink=0.65, orientation="horizontal", ax=ax2)

    plt.tight_layout()
    plt.savefig(path, format="png")
    plt.close()
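A quick sanity-check sketch with synthetic data; shapes follow the conventions above (alignments are (encoder, decoder) matrices, spectrograms are (frames, mel bins)), and the file names are arbitrary:

align = np.random.rand(60, 180)    # fake attention matrix
plot_alignment(align, "demo-align.png", title="demo alignment")

mel = np.random.rand(400, 80)      # fake mel spectrogram
plot_spectrogram(mel, "demo-mel.png", title="demo mel", auto_aspect=True)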
18
synthesizer_tacotron2/utils/symbols.py
Normal file
@@ -0,0 +1,18 @@
"""
|
||||
Defines the set of symbols used in text input to the model.
|
||||
|
||||
The default is a set of ASCII characters that works well for English or text that has been run
|
||||
through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details.
|
||||
"""
|
||||
# from . import cmudict
|
||||
|
||||
_pad = "_"
|
||||
_eos = "~"
|
||||
_characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz12340!\'(),-.:;? '
|
||||
|
||||
#_characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz12340!\'(),-.:;? ' # use this old one if you want to train old model
|
||||
# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
|
||||
#_arpabet = ["@' + s for s in cmudict.valid_symbols]
|
||||
|
||||
# Export all symbols:
|
||||
symbols = [_pad, _eos] + list(_characters) #+ _arpabet
|
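As a sanity check, the exported inventory is just the pad and EOS markers followed by the character set; a quick sketch (the import path is assumed, and the count is reasoned from the string above):

from synthesizer_tacotron2.utils.symbols import symbols  # assumed import path

print(symbols[:2])    # ["_", "~"]
print(len(symbols))   # 2 markers + 68 characters = 70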
74
synthesizer_tacotron2/utils/text.py
Normal file
@@ -0,0 +1,74 @@
from .symbols import symbols
from . import cleaners
import re

# Mappings from symbol to numeric ID and vice versa:
_symbol_to_id = {s: i for i, s in enumerate(symbols)}
_id_to_symbol = {i: s for i, s in enumerate(symbols)}

# Regular expression matching text enclosed in curly braces:
_curly_re = re.compile(r"(.*?)\{(.+?)\}(.*)")


def text_to_sequence(text, cleaner_names):
    """Converts a string of text to a sequence of IDs corresponding to the symbols in the text.

    The text can optionally have ARPAbet sequences enclosed in curly braces embedded
    in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street."

    Args:
      text: string to convert to a sequence
      cleaner_names: names of the cleaner functions to run the text through

    Returns:
      List of integers corresponding to the symbols in the text
    """
    sequence = []

    # Check for curly braces and treat their contents as ARPAbet:
    while len(text):
        m = _curly_re.match(text)
        if not m:
            sequence += _symbols_to_sequence(_clean_text(text, cleaner_names))
            break
        sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names))
        sequence += _arpabet_to_sequence(m.group(2))
        text = m.group(3)

    # Append EOS token
    sequence.append(_symbol_to_id["~"])
    return sequence


def sequence_to_text(sequence):
    """Converts a sequence of IDs back to a string"""
    result = ""
    for symbol_id in sequence:
        if symbol_id in _id_to_symbol:
            s = _id_to_symbol[symbol_id]
            # Enclose ARPAbet back in curly braces:
            if len(s) > 1 and s[0] == "@":
                s = "{%s}" % s[1:]
            result += s
    return result.replace("}{", " ")


def _clean_text(text, cleaner_names):
    for name in cleaner_names:
        # Use a default so an unknown name raises our error, not AttributeError.
        cleaner = getattr(cleaners, name, None)
        if not cleaner:
            raise Exception("Unknown cleaner: %s" % name)
        text = cleaner(text)
    return text


def _symbols_to_sequence(symbols):
    return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)]


def _arpabet_to_sequence(text):
    return _symbols_to_sequence(["@" + s for s in text.split()])


def _should_keep_symbol(s):
    return s in _symbol_to_id and s not in ("_", "~")
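A round-trip sketch; note that with _arpabet commented out in symbols.py, ARPAbet IDs would be silently dropped by _should_keep_symbol, so this example sticks to plain text (output reasoned from the code above):

seq = text_to_sequence("Hello, World!", ["english_cleaners"])
print(sequence_to_text(seq))  # -> "hello, world!~" (cleaned text plus the EOS marker)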