Compare commits

...

3 Commits

Author        SHA1        Message                           Date
Rita-ritally  529dca2892  Merge de4e525a0d into 156723e37c  2023-09-11 12:35:10 -06:00
Vega          156723e37c  Skip embedding (#950)             2023-09-05 23:15:04 +08:00
                          * Skip embedding
                          * Skip earlier
                          * Remove unused paramater
                          * Pass param
Rita          de4e525a0d  modified tacotron to tacotron2    2021-12-26 12:37:35 +08:00
32 changed files with 4272 additions and 63 deletions

View File

@@ -39,6 +39,9 @@ data_info = {
}
}
def should_skip(fpath: Path, skip_existing: bool) -> bool:
return skip_existing and fpath.exists()
def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
skip_existing: bool, hparams, no_alignments: bool,
dataset: str, emotion_extract = False, encoder_model_fpath=None):
@@ -99,7 +102,7 @@ def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
print("Max mel frames length: %d" % max(int(m[4]) for m in metadata))
print("Max audio timesteps length: %d" % max(int(m[3]) for m in metadata))
def embed_utterance(fpaths, encoder_model_fpath):
def _embed_utterance(fpaths: str, encoder_model_fpath: str):
if not encoder.is_loaded():
encoder.load_model(encoder_model_fpath)
@@ -110,15 +113,13 @@ def embed_utterance(fpaths, encoder_model_fpath):
embed = encoder.embed_utterance(wav)
np.save(embed_fpath, embed, allow_pickle=False)
def _emo_extract_from_utterance(fpaths, hparams, skip_existing=False):
if skip_existing and fpaths.exists():
return
def _emo_extract_from_utterance(fpaths, hparams):
wav_fpath, emo_fpath = fpaths
wav = np.load(wav_fpath)
emo = extract_emo(np.expand_dims(wav, 0), hparams.sample_rate, True)
np.save(emo_fpath, emo.squeeze(0), allow_pickle=False)
def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_processes: int):
def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_processes: int, skip_existing: bool):
wav_dir = synthesizer_root.joinpath("audio")
metadata_fpath = synthesizer_root.joinpath("train.txt")
assert wav_dir.exists() and metadata_fpath.exists()
@@ -128,11 +129,11 @@ def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_proce
# Gather the input wave filepath and the target output embed filepath
with metadata_fpath.open("r", encoding="utf-8") as metadata_file:
metadata = [line.split("|") for line in metadata_file]
fpaths = [(wav_dir.joinpath(m[0]), embed_dir.joinpath(m[2])) for m in metadata]
fpaths = [(wav_dir.joinpath(m[0]), embed_dir.joinpath(m[2])) for m in metadata if not should_skip(embed_dir.joinpath(m[2]), skip_existing)]
# TODO: improve on the multiprocessing, it's terrible. Disk I/O is the bottleneck here.
# Embed the utterances in separate threads
func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)
func = partial(_embed_utterance, encoder_model_fpath=encoder_model_fpath)
job = Pool(n_processes).imap(func, fpaths)
tuple(tqdm(job, "Embedding", len(fpaths), unit="utterances"))
@@ -142,14 +143,14 @@ def create_emo(synthesizer_root: Path, n_processes: int, skip_existing: bool, hp
assert wav_dir.exists() and metadata_fpath.exists()
emo_dir = synthesizer_root.joinpath("emo")
emo_dir.mkdir(exist_ok=True)
# Gather the input wave filepath and the target output embed filepath
with metadata_fpath.open("r", encoding="utf-8") as metadata_file:
metadata = [line.split("|") for line in metadata_file]
fpaths = [(wav_dir.joinpath(m[0]), emo_dir.joinpath(m[0].replace("audio-", "emo-"))) for m in metadata]
fpaths = [(wav_dir.joinpath(m[0]), emo_dir.joinpath(m[0].replace("audio-", "emo-"))) for m in metadata if not should_skip(emo_dir.joinpath(m[0].replace("audio-", "emo-")), skip_existing)]
# TODO: improve on the multiprocessing, it's terrible. Disk I/O is the bottleneck here.
# Embed the utterances in separate threads
func = partial(_emo_extract_from_utterance, hparams=hparams, skip_existing=skip_existing)
func = partial(_emo_extract_from_utterance, hparams=hparams)
job = Pool(n_processes).imap(func, fpaths)
tuple(tqdm(job, "Emo", len(fpaths), unit="utterances"))
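
The core of this commit is the new should_skip helper: embedding and emotion targets that already exist on disk are filtered out of fpaths before the worker pool starts, instead of being checked inside each worker. An illustrative sketch of that filtering pattern (not from the repository; the directory and metadata values below are placeholders):

from pathlib import Path

def should_skip(fpath: Path, skip_existing: bool) -> bool:
    # Skip only when the caller asked for it AND the output file is already there.
    return skip_existing and fpath.exists()

embed_dir = Path("synthesizer/embeds")    # placeholder output directory
metadata = [
    ["audio-1_00.npy", "mel-1_00.npy", "embed-1_00.npy"],
    ["audio-2_00.npy", "mel-2_00.npy", "embed-2_00.npy"],
]
todo = [m for m in metadata if not should_skip(embed_dir.joinpath(m[2]), skip_existing=True)]
print(len(todo), "utterances still need an embedding")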

View File

@@ -45,7 +45,7 @@ def extract_emo(
return y
def _process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
skip_existing: bool, hparams, encoder_model_fpath):
mel_fpath: str, wav_fpath: str, hparams, encoder_model_fpath):
## FOR REFERENCE:
# For you not to lose your head if you ever wish to change things here or implement your own
# synthesizer.
@@ -58,13 +58,6 @@ def _process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
# without extra padding. This means that you won't have an exact relation between the length
# of the wav and of the mel spectrogram. See the vocoder data loader.
# Skip existing utterances if needed
mel_fpath = out_dir.joinpath("mels", "mel-%s.npy" % basename)
wav_fpath = out_dir.joinpath("audio", "audio-%s.npy" % basename)
if skip_existing and mel_fpath.exists() and wav_fpath.exists():
return None
# Trim silence
if hparams.trim_silence:
if not encoder.is_loaded():
@@ -112,50 +105,28 @@ def _split_on_silences(wav_fpath, words, hparams):
def preprocess_general(speaker_dir, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool, encoder_model_fpath: Path):
metadata = []
extensions = ("*.wav", "*.flac", "*.mp3")
if skip_existing:
for extension in extensions:
wav_fpath_list = speaker_dir.glob(extension)
# Iterate over each wav
for wav_fpath in wav_fpath_list:
words = dict_info.get(wav_fpath.name.split(".")[0])
for extension in extensions:
wav_fpath_list = speaker_dir.glob(extension)
# Iterate over each wav
for wav_fpath in wav_fpath_list:
words = dict_info.get(wav_fpath.name.split(".")[0])
if not words:
words = dict_info.get(wav_fpath.name) # try with extension
if not words:
words = dict_info.get(wav_fpath.name) # try with extension
if not words:
print("no wordS")
continue
sub_basename = "%s_%02d" % (wav_fpath.name, 0)
mel_fpath = out_dir.joinpath("mels", f"mel-{sub_basename}.npy")
wav_fpath_ = out_dir.joinpath("audio", f"audio-{sub_basename}.npy")
if mel_fpath.exists() and wav_fpath_.exists():
print(f"No word found in dict_info for {wav_fpath.name}, skip it")
continue
sub_basename = "%s_%02d" % (wav_fpath.name, 0)
mel_fpath = out_dir.joinpath("mels", f"mel-{sub_basename}.npy")
wav_fpath = out_dir.joinpath("audio", f"audio-{sub_basename}.npy")
if skip_existing and mel_fpath.exists() and wav_fpath.exists():
continue
wav, text = _split_on_silences(wav_fpath, words, hparams)
result = _process_utterance(wav, text, out_dir, sub_basename,
False, hparams, encoder_model_fpath) # accelarate
if result is None:
continue
wav_fpath_name, mel_fpath_name, embed_fpath_name, wav, mel_frames, text = result
metadata.append ((wav_fpath_name, mel_fpath_name, embed_fpath_name, len(wav), mel_frames, text))
wav, text = _split_on_silences(wav_fpath, words, hparams)
result = _process_utterance(wav, text, out_dir, sub_basename,
False, hparams, encoder_model_fpath) # accelarate
if result is None:
continue
wav_fpath_name, mel_fpath_name, embed_fpath_name, wav, mel_frames, text = result
metadata.append ((wav_fpath_name, mel_fpath_name, embed_fpath_name, len(wav), mel_frames, text))
else:
for extension in extensions:
wav_fpath_list = speaker_dir.glob(extension)
# Iterate over each wav
for wav_fpath in wav_fpath_list:
words = dict_info.get(wav_fpath.name.split(".")[0])
if not words:
words = dict_info.get(wav_fpath.name) # try with extension
if not words:
print("no wordS")
continue
sub_basename = "%s_%02d" % (wav_fpath.name, 0)
wav, text = _split_on_silences(wav_fpath, words, hparams)
result = _process_utterance(wav, text, out_dir, sub_basename,
False, hparams, encoder_model_fpath)
if result is None:
continue
wav_fpath_name, mel_fpath_name, embed_fpath_name, wav, mel_frames, text = result
metadata.append ((wav_fpath_name, mel_fpath_name, embed_fpath_name, len(wav), mel_frames, text))
return metadata
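
For orientation, the tuples appended to metadata above appear to be written out as "|"-separated lines; that is how create_embeddings and the Feeder read them back (m[0] audio filename, m[2] embed filename, m[4] mel frames, m[5] text). A small illustrative round trip with made-up values, not taken from the repository:

row = ("audio-foo.wav_00.npy", "mel-foo.wav_00.npy", "embed-foo.wav_00.npy", 48000, 300, "ni3 hao3")
line = "|".join(map(str, row))      # one hypothetical train.txt line
m = line.split("|")                 # how create_embeddings / the Feeder parse it back
print(m[0], m[2], int(m[4]), m[5])  # audio filename, embed filename, mel frames, text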

pre.py
View File

@@ -71,7 +71,7 @@ if __name__ == "__main__":
del args.n_processes_embed
preprocess_dataset(**vars(args))
create_embeddings(synthesizer_root=args.out_dir, n_processes=n_processes_embed, encoder_model_fpath=encoder_model_fpath)
create_embeddings(synthesizer_root=args.out_dir, n_processes=n_processes_embed, encoder_model_fpath=encoder_model_fpath, skip_existing=args.skip_existing)
if args.emotion_extract:
create_emo(synthesizer_root=args.out_dir, n_processes=n_processes_embed, skip_existing=args.skip_existing, hparams=args.hparams)

BIN
synthesizer_tacotron2/.DS_Store vendored Normal file

Binary file not shown.

View File

@@ -0,0 +1,24 @@
MIT License
Original work Copyright (c) 2018 Rayhane Mama (https://github.com/Rayhane-mamah)
Original work Copyright (c) 2019 fatchord (https://github.com/fatchord)
Modified work Copyright (c) 2019 Corentin Jemine (https://github.com/CorentinJ)
Modified work Copyright (c) 2020 blue-fish (https://github.com/blue-fish)
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@@ -0,0 +1 @@
#

View File

@@ -0,0 +1,206 @@
import librosa
import librosa.filters
import numpy as np
from scipy import signal
from scipy.io import wavfile
import soundfile as sf
def load_wav(path, sr):
return librosa.core.load(path, sr=sr)[0]
def save_wav(wav, path, sr):
wav *= 32767 / max(0.01, np.max(np.abs(wav)))
#proposed by @dsmiller
wavfile.write(path, sr, wav.astype(np.int16))
def save_wavenet_wav(wav, path, sr):
sf.write(path, wav.astype(np.float32), sr)
def preemphasis(wav, k, preemphasize=True):
if preemphasize:
return signal.lfilter([1, -k], [1], wav)
return wav
def inv_preemphasis(wav, k, inv_preemphasize=True):
if inv_preemphasize:
return signal.lfilter([1], [1, -k], wav)
return wav
#From https://github.com/r9y9/wavenet_vocoder/blob/master/audio.py
def start_and_end_indices(quantized, silence_threshold=2):
for start in range(quantized.size):
if abs(quantized[start] - 127) > silence_threshold:
break
for end in range(quantized.size - 1, 1, -1):
if abs(quantized[end] - 127) > silence_threshold:
break
assert abs(quantized[start] - 127) > silence_threshold
assert abs(quantized[end] - 127) > silence_threshold
return start, end
def get_hop_size(hparams):
hop_size = hparams.hop_size
if hop_size is None:
assert hparams.frame_shift_ms is not None
hop_size = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
return hop_size
def linearspectrogram(wav, hparams):
D = _stft(preemphasis(wav, hparams.preemphasis, hparams.preemphasize), hparams)
S = _amp_to_db(np.abs(D), hparams) - hparams.ref_level_db
if hparams.signal_normalization:
return _normalize(S, hparams)
return S
def melspectrogram(wav, hparams):
D = _stft(preemphasis(wav, hparams.preemphasis, hparams.preemphasize), hparams)
S = _amp_to_db(_linear_to_mel(np.abs(D), hparams), hparams) - hparams.ref_level_db
if hparams.signal_normalization:
return _normalize(S, hparams)
return S
def inv_linear_spectrogram(linear_spectrogram, hparams):
"""Converts linear spectrogram to waveform using librosa"""
if hparams.signal_normalization:
D = _denormalize(linear_spectrogram, hparams)
else:
D = linear_spectrogram
S = _db_to_amp(D + hparams.ref_level_db) #Convert back to linear
if hparams.use_lws:
processor = _lws_processor(hparams)
D = processor.run_lws(S.astype(np.float64).T ** hparams.power)
y = processor.istft(D).astype(np.float32)
return inv_preemphasis(y, hparams.preemphasis, hparams.preemphasize)
else:
return inv_preemphasis(_griffin_lim(S ** hparams.power, hparams), hparams.preemphasis, hparams.preemphasize)
def inv_mel_spectrogram(mel_spectrogram, hparams):
"""Converts mel spectrogram to waveform using librosa"""
if hparams.signal_normalization:
D = _denormalize(mel_spectrogram, hparams)
else:
D = mel_spectrogram
S = _mel_to_linear(_db_to_amp(D + hparams.ref_level_db), hparams) # Convert back to linear
if hparams.use_lws:
processor = _lws_processor(hparams)
D = processor.run_lws(S.astype(np.float64).T ** hparams.power)
y = processor.istft(D).astype(np.float32)
return inv_preemphasis(y, hparams.preemphasis, hparams.preemphasize)
else:
return inv_preemphasis(_griffin_lim(S ** hparams.power, hparams), hparams.preemphasis, hparams.preemphasize)
def _lws_processor(hparams):
import lws
return lws.lws(hparams.n_fft, get_hop_size(hparams), fftsize=hparams.win_size, mode="speech")
def _griffin_lim(S, hparams):
"""librosa implementation of Griffin-Lim
Based on https://github.com/librosa/librosa/issues/434
"""
angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
S_complex = np.abs(S).astype(np.complex)
y = _istft(S_complex * angles, hparams)
for i in range(hparams.griffin_lim_iters):
angles = np.exp(1j * np.angle(_stft(y, hparams)))
y = _istft(S_complex * angles, hparams)
return y
def _stft(y, hparams):
if hparams.use_lws:
return _lws_processor(hparams).stft(y).T
else:
return librosa.stft(y=y, n_fft=hparams.n_fft, hop_length=get_hop_size(hparams), win_length=hparams.win_size)
def _istft(y, hparams):
return librosa.istft(y, hop_length=get_hop_size(hparams), win_length=hparams.win_size)
##########################################################
#Those are only correct when using lws!!! (This was messing with Wavenet quality for a long time!)
def num_frames(length, fsize, fshift):
"""Compute number of time frames of spectrogram
"""
pad = (fsize - fshift)
if length % fshift == 0:
M = (length + pad * 2 - fsize) // fshift + 1
else:
M = (length + pad * 2 - fsize) // fshift + 2
return M
def pad_lr(x, fsize, fshift):
"""Compute left and right padding
"""
M = num_frames(len(x), fsize, fshift)
pad = (fsize - fshift)
T = len(x) + 2 * pad
r = (M - 1) * fshift + fsize - T
return pad, pad + r
##########################################################
#Librosa correct padding
def librosa_pad_lr(x, fsize, fshift):
return 0, (x.shape[0] // fshift + 1) * fshift - x.shape[0]
# Conversions
_mel_basis = None
_inv_mel_basis = None
def _linear_to_mel(spectogram, hparams):
global _mel_basis
if _mel_basis is None:
_mel_basis = _build_mel_basis(hparams)
return np.dot(_mel_basis, spectogram)
def _mel_to_linear(mel_spectrogram, hparams):
global _inv_mel_basis
if _inv_mel_basis is None:
_inv_mel_basis = np.linalg.pinv(_build_mel_basis(hparams))
return np.maximum(1e-10, np.dot(_inv_mel_basis, mel_spectrogram))
def _build_mel_basis(hparams):
assert hparams.fmax <= hparams.sample_rate // 2
return librosa.filters.mel(hparams.sample_rate, hparams.n_fft, n_mels=hparams.num_mels,
fmin=hparams.fmin, fmax=hparams.fmax)
def _amp_to_db(x, hparams):
min_level = np.exp(hparams.min_level_db / 20 * np.log(10))
return 20 * np.log10(np.maximum(min_level, x))
def _db_to_amp(x):
return np.power(10.0, (x) * 0.05)
def _normalize(S, hparams):
if hparams.allow_clipping_in_normalization:
if hparams.symmetric_mels:
return np.clip((2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value,
-hparams.max_abs_value, hparams.max_abs_value)
else:
return np.clip(hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db)), 0, hparams.max_abs_value)
assert S.max() <= 0 and S.min() - hparams.min_level_db >= 0
if hparams.symmetric_mels:
return (2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value
else:
return hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db))
def _denormalize(D, hparams):
if hparams.allow_clipping_in_normalization:
if hparams.symmetric_mels:
return (((np.clip(D, -hparams.max_abs_value,
hparams.max_abs_value) + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value))
+ hparams.min_level_db)
else:
return ((np.clip(D, 0, hparams.max_abs_value) * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db)
if hparams.symmetric_mels:
return (((D + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value)) + hparams.min_level_db)
else:
return ((D * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db)
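
Taken together, this module is the mel/linear spectrogram front end shared by the synthesizer and vocoder. A short usage sketch (illustrative only; it assumes the TF1-era hparams added later in this diff and a wav file at a placeholder path):

from synthesizer import audio                  # import paths as used elsewhere in this diff
from synthesizer.hparams import hparams

wav = audio.load_wav("example.wav", sr=hparams.sample_rate)    # placeholder input file
mel = audio.melspectrogram(wav, hparams)                       # (num_mels, frames), normalized dB
print(mel.shape)

# Rough inverse for quick listening checks only (Griffin-Lim, no neural vocoder).
wav_hat = audio.inv_mel_spectrogram(mel, hparams)
audio.save_wav(wav_hat, "example_gl.wav", sr=hparams.sample_rate)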

View File

@@ -0,0 +1,272 @@
from sklearn.model_selection import train_test_split
from synthesizer.utils.text import text_to_sequence
from synthesizer.infolog import log
import tensorflow as tf
import numpy as np
import threading
import time
import os
_batches_per_group = 64
class Feeder:
"""
Feeds batches of data into queue on a background thread.
"""
def __init__(self, coordinator, metadata_filename, hparams):
super(Feeder, self).__init__()
self._coord = coordinator
self._hparams = hparams
self._cleaner_names = [x.strip() for x in hparams.cleaners.split(",")]
self._train_offset = 0
self._test_offset = 0
# Load metadata
self._mel_dir = os.path.join(os.path.dirname(metadata_filename), "mels")
self._embed_dir = os.path.join(os.path.dirname(metadata_filename), "embeds")
with open(metadata_filename, encoding="utf-8") as f:
self._metadata = [line.strip().split("|") for line in f]
frame_shift_ms = hparams.hop_size / hparams.sample_rate
hours = sum([int(x[4]) for x in self._metadata]) * frame_shift_ms / (3600)
log("Loaded metadata for {} examples ({:.2f} hours)".format(len(self._metadata), hours))
#Train test split
if hparams.tacotron_test_size is None:
assert hparams.tacotron_test_batches is not None
test_size = (hparams.tacotron_test_size if hparams.tacotron_test_size is not None
else hparams.tacotron_test_batches * hparams.tacotron_batch_size)
indices = np.arange(len(self._metadata))
train_indices, test_indices = train_test_split(indices,
test_size=test_size, random_state=hparams.tacotron_data_random_state)
#Make sure test_indices is a multiple of batch_size else round up
len_test_indices = self._round_down(len(test_indices), hparams.tacotron_batch_size)
extra_test = test_indices[len_test_indices:]
test_indices = test_indices[:len_test_indices]
train_indices = np.concatenate([train_indices, extra_test])
self._train_meta = list(np.array(self._metadata)[train_indices])
self._test_meta = list(np.array(self._metadata)[test_indices])
self.test_steps = len(self._test_meta) // hparams.tacotron_batch_size
if hparams.tacotron_test_size is None:
assert hparams.tacotron_test_batches == self.test_steps
#pad input sequences with the <pad_token> 0 ( _ )
self._pad = 0
#explicitly setting the padding to a value that doesn't originally exist in the spectrogram
#to avoid any possible conflicts, without affecting the output range of the model too much
if hparams.symmetric_mels:
self._target_pad = -hparams.max_abs_value
else:
self._target_pad = 0.
#Mark finished sequences with 1s
self._token_pad = 1.
with tf.device("/cpu:0"):
# Create placeholders for inputs and targets. Don"t specify batch size because we want
# to be able to feed different batch sizes at eval time.
self._placeholders = [
tf.placeholder(tf.int32, shape=(None, None), name="inputs"),
tf.placeholder(tf.int32, shape=(None, ), name="input_lengths"),
tf.placeholder(tf.float32, shape=(None, None, hparams.num_mels),
name="mel_targets"),
tf.placeholder(tf.float32, shape=(None, None), name="token_targets"),
tf.placeholder(tf.int32, shape=(None, ), name="targets_lengths"),
tf.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None),
name="split_infos"),
# SV2TTS
tf.placeholder(tf.float32, shape=(None, hparams.speaker_embedding_size),
name="speaker_embeddings")
]
# Create queue for buffering data
queue = tf.FIFOQueue(8, [tf.int32, tf.int32, tf.float32, tf.float32,
tf.int32, tf.int32, tf.float32], name="input_queue")
self._enqueue_op = queue.enqueue(self._placeholders)
self.inputs, self.input_lengths, self.mel_targets, self.token_targets, \
self.targets_lengths, self.split_infos, self.speaker_embeddings = queue.dequeue()
self.inputs.set_shape(self._placeholders[0].shape)
self.input_lengths.set_shape(self._placeholders[1].shape)
self.mel_targets.set_shape(self._placeholders[2].shape)
self.token_targets.set_shape(self._placeholders[3].shape)
self.targets_lengths.set_shape(self._placeholders[4].shape)
self.split_infos.set_shape(self._placeholders[5].shape)
self.speaker_embeddings.set_shape(self._placeholders[6].shape)
# Create eval queue for buffering eval data
eval_queue = tf.FIFOQueue(1, [tf.int32, tf.int32, tf.float32, tf.float32,
tf.int32, tf.int32, tf.float32], name="eval_queue")
self._eval_enqueue_op = eval_queue.enqueue(self._placeholders)
self.eval_inputs, self.eval_input_lengths, self.eval_mel_targets, \
self.eval_token_targets, self.eval_targets_lengths, \
self.eval_split_infos, self.eval_speaker_embeddings = eval_queue.dequeue()
self.eval_inputs.set_shape(self._placeholders[0].shape)
self.eval_input_lengths.set_shape(self._placeholders[1].shape)
self.eval_mel_targets.set_shape(self._placeholders[2].shape)
self.eval_token_targets.set_shape(self._placeholders[3].shape)
self.eval_targets_lengths.set_shape(self._placeholders[4].shape)
self.eval_split_infos.set_shape(self._placeholders[5].shape)
self.eval_speaker_embeddings.set_shape(self._placeholders[6].shape)
def start_threads(self, session):
self._session = session
thread = threading.Thread(name="background", target=self._enqueue_next_train_group)
thread.daemon = True #Thread will close when parent quits
thread.start()
thread = threading.Thread(name="background", target=self._enqueue_next_test_group)
thread.daemon = True #Thread will close when parent quits
thread.start()
def _get_test_groups(self):
meta = self._test_meta[self._test_offset]
self._test_offset += 1
text = meta[5]
input_data = np.asarray(text_to_sequence(text, self._cleaner_names), dtype=np.int32)
mel_target = np.load(os.path.join(self._mel_dir, meta[1]))
#Create parallel sequences containing zeros to represent a non finished sequence
token_target = np.asarray([0.] * (len(mel_target) - 1))
embed_target = np.load(os.path.join(self._embed_dir, meta[2]))
return input_data, mel_target, token_target, embed_target, len(mel_target)
def make_test_batches(self):
start = time.time()
# Read a group of examples
n = self._hparams.tacotron_batch_size
r = self._hparams.outputs_per_step
#Test on entire test set
examples = [self._get_test_groups() for i in range(len(self._test_meta))]
# Bucket examples based on similar output sequence length for efficiency
examples.sort(key=lambda x: x[-1])
batches = [examples[i: i+n] for i in range(0, len(examples), n)]
np.random.shuffle(batches)
log("\nGenerated %d test batches of size %d in %.3f sec" % (len(batches), n, time.time() - start))
return batches, r
def _enqueue_next_train_group(self):
while not self._coord.should_stop():
start = time.time()
# Read a group of examples
n = self._hparams.tacotron_batch_size
r = self._hparams.outputs_per_step
examples = [self._get_next_example() for i in range(n * _batches_per_group)]
# Bucket examples based on similar output sequence length for efficiency
examples.sort(key=lambda x: x[-1])
batches = [examples[i: i+n] for i in range(0, len(examples), n)]
np.random.shuffle(batches)
log("\nGenerated {} train batches of size {} in {:.3f} sec".format(len(batches), n, time.time() - start))
for batch in batches:
feed_dict = dict(zip(self._placeholders, self._prepare_batch(batch, r)))
self._session.run(self._enqueue_op, feed_dict=feed_dict)
def _enqueue_next_test_group(self):
#Create test batches once and evaluate on them for all test steps
test_batches, r = self.make_test_batches()
while not self._coord.should_stop():
for batch in test_batches:
feed_dict = dict(zip(self._placeholders, self._prepare_batch(batch, r)))
self._session.run(self._eval_enqueue_op, feed_dict=feed_dict)
def _get_next_example(self):
"""Gets a single example (input, mel_target, token_target, embed_target, mel_length) from disk
"""
if self._train_offset >= len(self._train_meta):
self._train_offset = 0
np.random.shuffle(self._train_meta)
meta = self._train_meta[self._train_offset]
self._train_offset += 1
text = meta[5]
input_data = np.asarray(text_to_sequence(text, self._cleaner_names), dtype=np.int32)
mel_target = np.load(os.path.join(self._mel_dir, meta[1]))
#Create parallel sequences containing zeros to represent a non finished sequence
token_target = np.asarray([0.] * (len(mel_target) - 1))
embed_target = np.load(os.path.join(self._embed_dir, meta[2]))
return input_data, mel_target, token_target, embed_target, len(mel_target)
def _prepare_batch(self, batches, outputs_per_step):
assert 0 == len(batches) % self._hparams.tacotron_num_gpus
size_per_device = int(len(batches) / self._hparams.tacotron_num_gpus)
np.random.shuffle(batches)
inputs = None
mel_targets = None
token_targets = None
targets_lengths = None
split_infos = []
targets_lengths = np.asarray([x[-1] for x in batches], dtype=np.int32) #Used to mask loss
input_lengths = np.asarray([len(x[0]) for x in batches], dtype=np.int32)
for i in range(self._hparams.tacotron_num_gpus):
batch = batches[size_per_device*i:size_per_device*(i+1)]
input_cur_device, input_max_len = self._prepare_inputs([x[0] for x in batch])
inputs = np.concatenate((inputs, input_cur_device), axis=1) if inputs is not None else input_cur_device
mel_target_cur_device, mel_target_max_len = self._prepare_targets([x[1] for x in batch], outputs_per_step)
mel_targets = np.concatenate(( mel_targets, mel_target_cur_device), axis=1) if mel_targets is not None else mel_target_cur_device
#Pad sequences with 1 to infer that the sequence is done
token_target_cur_device, token_target_max_len = self._prepare_token_targets([x[2] for x in batch], outputs_per_step)
token_targets = np.concatenate((token_targets, token_target_cur_device),axis=1) if token_targets is not None else token_target_cur_device
split_infos.append([input_max_len, mel_target_max_len, token_target_max_len])
split_infos = np.asarray(split_infos, dtype=np.int32)
### SV2TTS ###
embed_targets = np.asarray([x[3] for x in batches])
##############
return inputs, input_lengths, mel_targets, token_targets, targets_lengths, \
split_infos, embed_targets
def _prepare_inputs(self, inputs):
max_len = max([len(x) for x in inputs])
return np.stack([self._pad_input(x, max_len) for x in inputs]), max_len
def _prepare_targets(self, targets, alignment):
max_len = max([len(t) for t in targets])
data_len = self._round_up(max_len, alignment)
return np.stack([self._pad_target(t, data_len) for t in targets]), data_len
def _prepare_token_targets(self, targets, alignment):
max_len = max([len(t) for t in targets]) + 1
data_len = self._round_up(max_len, alignment)
return np.stack([self._pad_token_target(t, data_len) for t in targets]), data_len
def _pad_input(self, x, length):
return np.pad(x, (0, length - x.shape[0]), mode="constant", constant_values=self._pad)
def _pad_target(self, t, length):
return np.pad(t, [(0, length - t.shape[0]), (0, 0)], mode="constant", constant_values=self._target_pad)
def _pad_token_target(self, t, length):
return np.pad(t, (0, length - t.shape[0]), mode="constant", constant_values=self._token_pad)
def _round_up(self, x, multiple):
remainder = x % multiple
return x if remainder == 0 else x + multiple - remainder
def _round_down(self, x, multiple):
remainder = x % multiple
return x if remainder == 0 else x - remainder
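
The batching above buckets utterances by length, pads mel targets to a multiple of outputs_per_step, and trims the test split to whole batches using the two rounding helpers. A standalone sketch of that arithmetic (illustrative; the concrete lengths are hypothetical, while outputs_per_step=2 and tacotron_batch_size=36 come from the hparams later in this diff):

def round_up(x, multiple):
    r = x % multiple
    return x if r == 0 else x + multiple - r

def round_down(x, multiple):
    r = x % multiple
    return x if r == 0 else x - r

print(round_up(301, 2))      # 302: mel targets padded to a multiple of r frames
print(round_down(100, 36))   # 72:  test indices trimmed to a whole number of batches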

View File

@@ -0,0 +1,272 @@
import ast
import pprint
from tensorflow.contrib.training import HParams
hparams = HParams(
cleaners="basic_cleaners",
tacotron_gpu_start_idx=0, # idx of the first GPU to be used for Tacotron training.
tacotron_num_gpus=1, # Determines the number of gpus in use for Tacotron training.
split_on_cpu=True,
### Signal Processing (used in both synthesizer and vocoder)
sample_rate = 16000,
n_fft = 800,
num_mels = 80,
hop_size = 200, # Tacotron uses 12.5 ms frame shift (set to sample_rate * 0.0125)
win_size = 800, # Tacotron uses 50 ms frame length (set to sample_rate * 0.050)
fmin = 55,
min_level_db = -100,
ref_level_db = 20,
max_abs_value = 4., # Gradient explodes if too big, premature convergence if too small.
preemphasis = 0.97, # Filter coefficient to use if preemphasize is True
preemphasize = True,
frame_shift_ms=None,
normalize_for_wavenet=True,
# whether to rescale to [0, 1] for wavenet. (better audio quality)
clip_for_wavenet=True,
### Tacotron Text-to-Speech (TTS)
tts_embed_dims = 512, # Embedding dimension for the graphemes/phoneme inputs
tts_encoder_dims = 256,
tts_decoder_dims = 128,
tts_postnet_dims = 512,
tts_encoder_K = 5,
tts_lstm_dims = 1024,
tts_postnet_K = 5,
tts_num_highways = 4,
tts_dropout = 0.5,
tts_cleaner_names = ["basic_cleaners"],
tts_stop_threshold = -3.4, # Value below which audio generation ends.
# For example, for a range of [-4, 4], this
# will terminate the sequence at the first
# frame that has all values < -3.4
### Tacotron Training
tts_schedule = [(2, 1e-3, 20_000, 12), # Progressive training schedule
(2, 5e-4, 40_000, 12), # (r, lr, step, batch_size)
(2, 2e-4, 80_000, 12), #
(2, 1e-4, 160_000, 12), # r = reduction factor (# of mel frames
(2, 3e-5, 320_000, 12), # synthesized for each decoder iteration)
(2, 1e-5, 640_000, 12)], # lr = learning rate
tts_clip_grad_norm = 1.0, # clips the gradient norm to prevent explosion - set to None if not needed
tts_eval_interval = 500, # Number of steps between model evaluation (sample generation)
# Set to -1 to generate after completing epoch, or 0 to disable
tts_eval_num_samples = 1, # Makes this number of samples
### Data Preprocessing
max_mel_frames = 900,
rescale = True,
rescaling_max = 0.9,
synthesis_batch_size = 16, # For vocoder preprocessing and inference.
### Mel Visualization and Griffin-Lim
signal_normalization = True,
power = 1.5,
griffin_lim_iters = 60,
### Audio processing options
fmax = 7600, # Should not exceed (sample_rate // 2)
allow_clipping_in_normalization = True, # Used when signal_normalization = True
clip_mels_length = True, # If true, discards samples exceeding max_mel_frames
use_lws = False, # "Fast spectrogram phase recovery using local weighted sums"
symmetric_mels = True, # Sets mel range to [-max_abs_value, max_abs_value] if True,
# and [0, max_abs_value] if False
trim_silence = True, # Use with sample_rate of 16000 for best results
silence_threshold=2,
trim_fft_size=512,
trim_hop_size=128,
trim_top_db=23,
### SV2TTS
speaker_embedding_size = 256, # Dimension for the speaker embedding
silence_min_duration_split = 0.4, # Duration in seconds of a silence for an utterance to be split
utterance_min_duration = 1.6, # Duration in seconds below which utterances are discarded
# Tacotron
outputs_per_step=2, # Was 1
# number of frames to generate at each decoding step (increase to speed up computation and
# allows for higher batch size, decreases G&L audio quality)
stop_at_any=True,
# Determines whether the decoder should stop when predicting <stop> to any frame or to all of
# them (True works pretty well)
embedding_dim=512, # dimension of embedding space (these are NOT the speaker embeddings)
# Encoder parameters
enc_conv_num_layers=3, # number of encoder convolutional layers
enc_conv_kernel_size=(5,), # size of encoder convolution filters for each layer
enc_conv_channels=512, # number of encoder convolutions filters for each layer
encoder_lstm_units=256, # number of lstm units for each direction (forward and backward)
# Attention mechanism
smoothing=False, # Whether to smooth the attention normalization function
attention_dim=128, # dimension of attention space
attention_filters=32, # number of attention convolution filters
attention_kernel=(31,), # kernel size of attention convolution
cumulative_weights=True,
# Whether to cumulate (sum) all previous attention weights or simply feed previous weights (
# Recommended: True)
# Decoder
prenet_layers=[256, 256], # number of layers and number of units of prenet
decoder_layers=2, # number of decoder lstm layers
decoder_lstm_units=1024, # number of decoder lstm units on each layer
max_iters=2000,
# Max decoder steps during inference (Just for safety from infinite loop cases)
# Residual postnet
postnet_num_layers=5, # number of postnet convolutional layers
postnet_kernel_size=(5,), # size of postnet convolution filters for each layer
postnet_channels=512, # number of postnet convolution filters for each layer
# CBHG mel->linear postnet
cbhg_kernels=8,
# All kernel sizes from 1 to cbhg_kernels will be used in the convolution bank of CBHG to act
# as "K-grams"
cbhg_conv_channels=128, # Channels of the convolution bank
cbhg_pool_size=2, # pooling size of the CBHG
cbhg_projection=256,
# projection channels of the CBHG (1st projection, 2nd is automatically set to num_mels)
cbhg_projection_kernel_size=3, # kernel_size of the CBHG projections
cbhg_highwaynet_layers=4, # Number of HighwayNet layers
cbhg_highway_units=128, # Number of units used in HighwayNet fully connected layers
cbhg_rnn_units=128,
# Number of GRU units used in bidirectional RNN of CBHG block. CBHG output is 2x rnn_units in
# shape
# Loss params
mask_encoder=True,
# whether to mask encoder padding while computing attention. Set to True for better prosody
# but slower convergence.
mask_decoder=False,
# Whether to use loss mask for padded sequences (if False, <stop_token> loss function will not
# be weighted, else recommended pos_weight = 20)
cross_entropy_pos_weight=20,
# Use class weights to reduce the stop token classes imbalance (by adding more penalty on
# False Negatives (FN)) (1 = disabled)
predict_linear=False,
# Whether to add a post-processing network to the Tacotron to predict linear spectrograms (
# True mode Not tested!!)
###########################################################################################################################################
# Tacotron Training
# Reproduction seeds
tacotron_random_seed=5339,
# Determines initial graph and operations (i.e: model) random state for reproducibility
tacotron_data_random_state=1234, # random state for train test split repeatability
# performance parameters
tacotron_swap_with_cpu=False,
# Whether to use cpu as support to gpu for decoder computation (Not recommended: may cause
# major slowdowns! Only use when critical!)
# train/test split ratios, mini-batches sizes
tacotron_batch_size=36, # number of training samples on each training steps (was 32)
# Tacotron Batch synthesis supports ~16x the training batch size (no gradients during
# testing).
# Training Tacotron with unmasked paddings makes it aware of them, which makes synthesis times
# different from training. We thus recommend masking the encoder.
tacotron_synthesis_batch_size=128,
# DO NOT MAKE THIS BIGGER THAN 1 IF YOU DIDN"T TRAIN TACOTRON WITH "mask_encoder=True"!!
tacotron_test_size=0.05,
# % of data to keep as test data, if None, tacotron_test_batches must be not None. (5% is
# enough to have a good idea about overfit)
tacotron_test_batches=None, # number of test batches.
# Learning rate schedule
tacotron_decay_learning_rate=True,
# boolean, determines if the learning rate will follow an exponential decay
tacotron_start_decay=50000, # Step at which learning decay starts
tacotron_decay_steps=50000, # Determines the learning rate decay slope (UNDER TEST)
tacotron_decay_rate=0.5, # learning rate decay rate (UNDER TEST)
tacotron_initial_learning_rate=1e-3, # starting learning rate
tacotron_final_learning_rate=1e-5, # minimal learning rate
# Optimization parameters
tacotron_adam_beta1=0.9, # AdamOptimizer beta1 parameter
tacotron_adam_beta2=0.999, # AdamOptimizer beta2 parameter
tacotron_adam_epsilon=1e-6, # AdamOptimizer Epsilon parameter
# Regularization parameters
tacotron_reg_weight=1e-7, # regularization weight (for L2 regularization)
tacotron_scale_regularization=False,
# Whether to rescale regularization weight to adapt for outputs range (used when reg_weight is
# high and biasing the model)
tacotron_zoneout_rate=0.1, # zoneout rate for all LSTM cells in the network
tacotron_dropout_rate=0.5, # dropout rate for all convolutional layers + prenet
tacotron_clip_gradients=True, # whether to clip gradients
# Evaluation parameters
natural_eval=False,
# Whether to use 100% natural eval (to evaluate Curriculum Learning performance) or with same
# teacher-forcing ratio as in training (just for overfit)
# Decoder RNN learning can take be done in one of two ways:
# Teacher Forcing: vanilla teacher forcing (usually with ratio = 1). mode="constant"
# Curriculum Learning Scheme: From Teacher-Forcing to sampling from previous outputs is
# function of global step. (teacher forcing ratio decay) mode="scheduled"
# The second approach is inspired by:
# Bengio et al. 2015: Scheduled Sampling for Sequence Prediction with Recurrent Neural Networks.
# Can be found under: https://arxiv.org/pdf/1506.03099.pdf
tacotron_teacher_forcing_mode="constant",
# Can be ("constant" or "scheduled"). "scheduled" mode applies a cosine teacher forcing ratio
# decay. (Preference: scheduled)
tacotron_teacher_forcing_ratio=1.,
# Value from [0., 1.], 0.=0%, 1.=100%, determines the % of times we force next decoder
# inputs, Only relevant if mode="constant"
tacotron_teacher_forcing_init_ratio=1.,
# initial teacher forcing ratio. Relevant if mode="scheduled"
tacotron_teacher_forcing_final_ratio=0.,
# final teacher forcing ratio. Relevant if mode="scheduled"
tacotron_teacher_forcing_start_decay=10000,
# starting point of teacher forcing ratio decay. Relevant if mode="scheduled"
tacotron_teacher_forcing_decay_steps=280000,
# Determines the teacher forcing ratio decay slope. Relevant if mode="scheduled"
tacotron_teacher_forcing_decay_alpha=0.,
# teacher forcing ratio decay rate. Relevant if mode="scheduled"
###########################################################################################################################################
# Tacotron-2 integration parameters
train_with_GTA=False,
# Whether to use GTA mels to train WaveNet instead of ground truth mels.
###########################################################################################################################################
# Eval sentences (if no eval text file was specified during synthesis, these sentences are
# used for eval)
sentences=[
# From July 8, 2017 New York Times:
"Scientists at the CERN laboratory say they have discovered a new particle.",
"There\"s a way to measure the acute emotional intelligence that has never gone out of "
"style.",
"President Trump met with other leaders at the Group of 20 conference.",
"The Senate\"s bill to repeal and replace the Affordable Care Act is now imperiled.",
# From Google"s Tacotron example page:
"Generative adversarial network or variational auto-encoder.",
"Basilar membrane and otolaryngology are not auto-correlations.",
"He has read the whole thing.",
"He reads books.",
"He thought it was time to present the present.",
"Thisss isrealy awhsome.",
"Punctuation sensitivity, is working.",
"Punctuation sensitivity is working.",
"Peter Piper picked a peck of pickled peppers. How many pickled peppers did Peter Piper pick?",
"She sells sea-shells on the sea-shore. The shells she sells are sea-shells I'm sure.",
"Tajima Airport serves Toyooka.",
# From The web (random long utterance)
"Sequence to sequence models have enjoyed great success in a variety of tasks such as machine translation, speech recognition, and text summarization.\
This project covers a sequence to sequence model trained to predict a speech representation from an input sequence of characters. We show that\
the adopted architecture is able to perform this task with wild success.",
"Thank you so much for your support!",
],
)
def hparams_debug_string():
values = hparams.values()
hp = [" %s: %s" % (name, values[name]) for name in sorted(values) if name != "sentences"]
return "Hyperparameters:\n" + "\n".join(hp)
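
A quick sanity check of the signal-processing comments above: with sample_rate = 16000, the stated 12.5 ms frame shift and 50 ms frame length give exactly hop_size = 200 and win_size = 800 (illustrative arithmetic only):

sample_rate = 16000
hop_size = int(sample_rate * 0.0125)    # 12.5 ms frame shift -> 200
win_size = int(sample_rate * 0.050)     # 50 ms frame length  -> 800
print(hop_size, win_size, hop_size / sample_rate * 1000)   # 200 800 12.5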

View File

@@ -0,0 +1,165 @@
from synthesizer.tacotron2 import Tacotron2
import torch
from synthesizer import audio
from synthesizer.hparams import hparams
from synthesizer.models.tacotron import Tacotron
from synthesizer.utils.symbols import symbols
from synthesizer.utils.text import text_to_sequence
from vocoder.display import simple_table
from pathlib import Path
from typing import Union, List
import numpy as np
import librosa
from utils import logmmse
from pypinyin import lazy_pinyin, Style
import os
import tensorflow as tf
class Synthesizer:
sample_rate = hparams.sample_rate
hparams = hparams
def __init__(self, checkpoints_dir: Path, verbose=True, low_mem=False):
"""
The model isn't instantiated and loaded in memory until needed or until load() is called.
:param model_fpath: path to the trained model file
:param verbose: if False, prints less information when using the model
"""
self.verbose = verbose
self._low_mem = low_mem
# Prepare the model
self._model = None # type: Tacotron2
checkpoint_state = tf.train.get_checkpoint_state(checkpoints_dir)
if checkpoint_state is None:
raise Exception("Could not find any synthesizer weights under %s" % checkpoints_dir)
self.checkpoint_fpath = checkpoint_state.model_checkpoint_path
if verbose:
model_name = checkpoints_dir.parent.name.replace("logs-", "")
step = int(self.checkpoint_fpath[self.checkpoint_fpath.rfind('-') + 1:])
print("Found synthesizer \"%s\" trained to step %d" % (model_name, step))
def is_loaded(self):
"""
Whether the model is loaded in memory.
"""
return self._model is not None
def load(self):
"""
Instantiates and loads the model given the weights file that was passed in the constructor.
"""
if self._low_mem:
raise Exception("Cannot load the synthesizer permanently in low mem mode")
tf.reset_default_graph()
self._model = Tacotron2(self.checkpoint_fpath, hparams)
def synthesize_spectrograms(self, texts: List[str],
embeddings: Union[np.ndarray, List[np.ndarray]],
return_alignments=False):
"""
Synthesizes mel spectrograms from texts and speaker embeddings.
:param texts: a list of N text prompts to be synthesized
:param embeddings: a numpy array or list of speaker embeddings of shape (N, 256)
:param return_alignments: if True, a matrix representing the alignments between the
characters
and each decoder output step will be returned for each spectrogram
:return: a list of N melspectrograms as numpy arrays of shape (80, Mi), where Mi is the
sequence length of spectrogram i, and possibly the alignments.
"""
# Load the model on the first request.
if not self.is_loaded():
self.load()
print("Read " + str(texts))
texts = [" ".join(lazy_pinyin(v, style=Style.TONE3, neutral_tone_with_five=True)) for v in texts]
print("Synthesizing " + str(texts))
# Preprocess text inputs
inputs = [text_to_sequence(text, hparams.tts_cleaner_names) for text in texts]
if not isinstance(embeddings, list):
embeddings = [embeddings]
# Batch inputs
batched_inputs = [inputs[i:i+hparams.synthesis_batch_size]
for i in range(0, len(inputs), hparams.synthesis_batch_size)]
batched_embeds = [embeddings[i:i+hparams.synthesis_batch_size]
for i in range(0, len(embeddings), hparams.synthesis_batch_size)]
specs = []
for i, batch in enumerate(batched_inputs, 1):
if self.verbose:
print(f"\n| Generating {i}/{len(batched_inputs)}")
# Pad texts so they are all the same length
text_lens = [len(text) for text in batch]
max_text_len = max(text_lens)
# chars = [pad1d(text, max_text_len) for text in batch]
# chars = np.stack(chars)
#
# # Stack speaker embeddings into 2D array for batch processing
speaker_embeds = np.stack(batched_embeds[i-1])
#
# # Convert to tensor
# chars = torch.tensor(chars).long().to(self.device)
# speaker_embeddings = torch.tensor(speaker_embeds).float().to(self.device)
# Inference
#print(texts)
specs, alignments = self._model.my_synthesize(speaker_embeds, texts) # not yet settled whether embeddings or speaker_embeds should be passed here
if self.verbose:
print("\n\nDone.\n")
return (specs, alignments) if return_alignments else specs
@staticmethod
def load_preprocess_wav(fpath):
"""
Loads and preprocesses an audio file under the same conditions the audio files were used to
train the synthesizer.
"""
wav = librosa.load(str(fpath), hparams.sample_rate)[0]
if hparams.rescale:
wav = wav / np.abs(wav).max() * hparams.rescaling_max
# denoise
if len(wav) > hparams.sample_rate*(0.3+0.1):
noise_wav = np.concatenate([wav[:int(hparams.sample_rate*0.15)],
wav[-int(hparams.sample_rate*0.15):]])
profile = logmmse.profile_noise(noise_wav, hparams.sample_rate)
wav = logmmse.denoise(wav, profile)
return wav
@staticmethod
def make_spectrogram(fpath_or_wav: Union[str, Path, np.ndarray]):
"""
Creates a mel spectrogram from an audio file in the same manner as the mel spectrograms that
were fed to the synthesizer when training.
"""
if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path):
wav = Synthesizer.load_preprocess_wav(fpath_or_wav)
else:
wav = fpath_or_wav
mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
return mel_spectrogram
@staticmethod
def griffin_lim(mel):
"""
Inverts a mel spectrogram using Griffin-Lim. The mel spectrogram is expected to have been built
with the same parameters present in hparams.py.
"""
return audio.inv_mel_spectrogram(mel, hparams)
def pad1d(x, max_len, pad_value=0):
return np.pad(x, (0, max_len - len(x)), mode="constant", constant_values=pad_value)
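
A hedged usage sketch of the Synthesizer wrapper above (illustrative only; the import path, checkpoint directory, and the speaker embedding are placeholders — in the real pipeline the 256-dim embedding comes from the project's speaker encoder):

from pathlib import Path
import numpy as np
from synthesizer.inference import Synthesizer      # module path assumed

synth = Synthesizer(Path("path/to/taco_checkpoints"))          # placeholder checkpoint dir
embed = np.random.rand(256).astype(np.float32)                 # stand-in for a real speaker embedding
specs = synth.synthesize_spectrograms(["你好，世界"], [embed])   # text is pinyin-ized internally
wav = Synthesizer.griffin_lim(specs[0])                        # quick audible check without a vocoder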

View File

@@ -0,0 +1,50 @@
import atexit
import json
from datetime import datetime
from threading import Thread
from urllib.request import Request, urlopen
_format = "%Y-%m-%d %H:%M:%S.%f"
_file = None
_run_name = None
_slack_url = None
def init(filename, run_name, slack_url=None):
global _file, _run_name, _slack_url
_close_logfile()
_file = open(filename, "a")
_file.write("\n-----------------------------------------------------------------\n")
_file.write("Starting new {} training run\n".format(run_name))
_file.write("-----------------------------------------------------------------\n")
_run_name = run_name
_slack_url = slack_url
def log(msg, end="\n", slack=False):
print(msg, end=end)
if _file is not None:
_file.write("[%s] %s\n" % (datetime.now().strftime(_format)[:-3], msg))
if slack and _slack_url is not None:
Thread(target=_send_slack, args=(msg,)).start()
def _close_logfile():
global _file
if _file is not None:
_file.close()
_file = None
def _send_slack(msg):
req = Request(_slack_url)
req.add_header("Content-Type", "application/json")
urlopen(req, json.dumps({
"username": "tacotron",
"icon_emoji": ":taco:",
"text": "*%s*: %s" % (_run_name, msg)
}).encode())
atexit.register(_close_logfile)
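
A minimal usage sketch of this logger (illustrative; the file name and run name are placeholders, and the import path is the one the Feeder above uses):

from synthesizer.infolog import init, log

init("tacotron2_train.log", run_name="demo_run")   # writes a run header to the log file
log("Step 100: loss = 0.4321")                     # prints and appends a timestamped line
log("Training finished", slack=True)               # posted to Slack only if init got a slack_url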

View File

@@ -0,0 +1,8 @@
from .tacotron import Tacotron
def create_model(name, hparams):
if name == "Tacotron":
return Tacotron(hparams)
else:
raise Exception("Unknown model: " + name)
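
For reference, the factory above is typically called as below (illustrative; import paths assumed from the package layout used elsewhere in this diff):

from synthesizer.models import create_model
from synthesizer.hparams import hparams

model = create_model("Tacotron", hparams)   # any other name raises the Exception above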

View File

@@ -0,0 +1,207 @@
"""A set of wrappers useful for tacotron 2 architecture
All notations and variable names were used in concordance with the original tensorflow implementation
"""
import collections
import tensorflow as tf
from synthesizer.models.attention import _compute_attention
from tensorflow.contrib.rnn import RNNCell
from tensorflow.python.framework import ops, tensor_shape
from tensorflow.python.ops import array_ops, check_ops, rnn_cell_impl, tensor_array_ops
from tensorflow.python.util import nest
_zero_state_tensors = rnn_cell_impl._zero_state_tensors
class TacotronEncoderCell(RNNCell):
"""Tacotron 2 Encoder Cell
Passes inputs through a stack of convolutional layers then through a bidirectional LSTM
layer to predict the hidden representation vector (or memory)
"""
def __init__(self, convolutional_layers, lstm_layer):
"""Initialize encoder parameters
Args:
convolutional_layers: Encoder convolutional block class
lstm_layer: encoder bidirectional lstm layer class
"""
super(TacotronEncoderCell, self).__init__()
#Initialize encoder layers
self._convolutions = convolutional_layers
self._cell = lstm_layer
def __call__(self, inputs, input_lengths=None):
#Pass input sequence through a stack of convolutional layers
conv_output = self._convolutions(inputs)
#Extract hidden representation from encoder lstm cells
hidden_representation = self._cell(conv_output, input_lengths)
#For shape visualization
self.conv_output_shape = conv_output.shape
return hidden_representation
class TacotronDecoderCellState(
collections.namedtuple("TacotronDecoderCellState",
("cell_state", "attention", "time", "alignments",
"alignment_history"))):
"""`namedtuple` storing the state of a `TacotronDecoderCell`.
Contains:
- `cell_state`: The state of the wrapped `RNNCell` at the previous time
step.
- `attention`: The attention emitted at the previous time step.
- `time`: int32 scalar containing the current time step.
- `alignments`: A single or tuple of `Tensor`(s) containing the alignments
emitted at the previous time step for each attention mechanism.
- `alignment_history`: a single or tuple of `TensorArray`(s)
containing alignment matrices from all time steps for each attention
mechanism. Call `stack()` on each to convert to a `Tensor`.
"""
def replace(self, **kwargs):
"""Clones the current state while overwriting components provided by kwargs.
"""
return super(TacotronDecoderCellState, self)._replace(**kwargs)
class TacotronDecoderCell(RNNCell):
"""Tacotron 2 Decoder Cell
Decodes encoder output and previous mel frames into next r frames
Decoder Step i:
1) Prenet to compress last output information
2) Concat compressed inputs with previous context vector (input feeding) *
3) Decoder RNN (actual decoding) to predict current state s_{i} *
4) Compute new context vector c_{i} based on s_{i} and a cumulative sum of previous alignments *
5) Predict new output y_{i} using s_{i} and c_{i} (concatenated)
6) Predict <stop_token> output ys_{i} using s_{i} and c_{i} (concatenated)
* : This is typically taking a vanilla LSTM, wrapping it using tensorflow"s attention wrapper,
and wrap that with the prenet before doing an input feeding, and with the prediction layer
that uses RNN states to project on output space. Actions marked with (*) can be replaced with
tensorflow"s attention wrapper call if it was using cumulative alignments instead of previous alignments only.
"""
def __init__(self, prenet, attention_mechanism, rnn_cell, frame_projection, stop_projection):
"""Initialize decoder parameters
Args:
prenet: A tensorflow fully connected layer acting as the decoder pre-net
attention_mechanism: A _BaseAttentionMechanism instance, useful to
learn encoder-decoder alignments
rnn_cell: Instance of RNNCell, main body of the decoder
frame_projection: tensorflow fully connected layer with r * num_mels output units
stop_projection: tensorflow fully connected layer, expected to project to a scalar
and through a sigmoid activation
mask_finished: Boolean, Whether to mask decoder frames after the <stop_token>
"""
super(TacotronDecoderCell, self).__init__()
#Initialize decoder layers
self._prenet = prenet
self._attention_mechanism = attention_mechanism
self._cell = rnn_cell
self._frame_projection = frame_projection
self._stop_projection = stop_projection
self._attention_layer_size = self._attention_mechanism.values.get_shape()[-1].value
def _batch_size_checks(self, batch_size, error_message):
return [check_ops.assert_equal(batch_size,
self._attention_mechanism.batch_size,
message=error_message)]
@property
def output_size(self):
return self._frame_projection.shape
@property
def state_size(self):
"""The `state_size` property of `TacotronDecoderCell`.
Returns:
An `TacotronDecoderCell` tuple containing shapes used by this object.
"""
return TacotronDecoderCellState(
cell_state=self._cell._cell.state_size,
time=tensor_shape.TensorShape([]),
attention=self._attention_layer_size,
alignments=self._attention_mechanism.alignments_size,
alignment_history=())
def zero_state(self, batch_size, dtype):
"""Return an initial (zero) state tuple for this `AttentionWrapper`.
Args:
batch_size: `0D` integer tensor: the batch size.
dtype: The internal state data type.
Returns:
An `TacotronDecoderCellState` tuple containing zeroed out tensors and,
possibly, empty `TensorArray` objects.
Raises:
ValueError: (or, possibly at runtime, InvalidArgument), if
`batch_size` does not match the output size of the encoder passed
to the wrapper object at initialization time.
"""
with ops.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]):
cell_state = self._cell._cell.zero_state(batch_size, dtype)
error_message = (
"When calling zero_state of TacotronDecoderCell %s: " % self._base_name +
"Non-matching batch sizes between the memory "
"(encoder output) and the requested batch size.")
with ops.control_dependencies(
self._batch_size_checks(batch_size, error_message)):
cell_state = nest.map_structure(
lambda s: array_ops.identity(s, name="checked_cell_state"),
cell_state)
return TacotronDecoderCellState(
cell_state=cell_state,
time=array_ops.zeros([], dtype=tf.int32),
attention=_zero_state_tensors(self._attention_layer_size, batch_size,
dtype),
alignments=self._attention_mechanism.initial_alignments(batch_size, dtype),
alignment_history=tensor_array_ops.TensorArray(dtype=dtype, size=0,
dynamic_size=True))
def __call__(self, inputs, state):
#Information bottleneck (essential for learning attention)
prenet_output = self._prenet(inputs)
#Concat context vector and prenet output to form LSTM cells input (input feeding)
LSTM_input = tf.concat([prenet_output, state.attention], axis=-1)
#Unidirectional LSTM layers
LSTM_output, next_cell_state = self._cell(LSTM_input, state.cell_state)
#Compute the attention (context) vector and alignments using
#the new decoder cell hidden state as query vector
#and cumulative alignments to extract location features
#The choice of the new cell hidden state (s_{i}) of the last
#decoder RNN Cell is based on Luong et Al. (2015):
#https://arxiv.org/pdf/1508.04025.pdf
previous_alignments = state.alignments
previous_alignment_history = state.alignment_history
context_vector, alignments, cumulated_alignments = _compute_attention(self._attention_mechanism,
LSTM_output,
previous_alignments,
attention_layer=None)
#Concat LSTM outputs and context vector to form projections inputs
projections_input = tf.concat([LSTM_output, context_vector], axis=-1)
#Compute predicted frames and predicted <stop_token>
cell_outputs = self._frame_projection(projections_input)
stop_tokens = self._stop_projection(projections_input)
#Save alignment history
alignment_history = previous_alignment_history.write(state.time, alignments)
#Prepare next decoder state
next_state = TacotronDecoderCellState(
time=state.time + 1,
cell_state=next_cell_state,
attention=context_vector,
alignments=cumulated_alignments,
alignment_history=alignment_history)
return (cell_outputs, stop_tokens), next_state
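
The TacotronDecoderCell docstring lists the six operations performed at every decoder step. A toy, framework-free sketch of that loop body (illustrative only: untrained random dense layers stand in for the prenet, stacked LSTMs, and projection layers, and plain content attention replaces the location-sensitive mechanism):

import numpy as np

rng = np.random.default_rng(0)
B, T_enc, enc_dim, prenet_dim, rnn_dim, r, num_mels = 2, 7, 512, 256, 1024, 2, 80

memory = rng.standard_normal((B, T_enc, enc_dim))   # encoder outputs (the "memory")
prev_frame = np.zeros((B, num_mels))                # last predicted mel frame (<go> frame)
prev_context = np.zeros((B, enc_dim))               # previous attention context vector

def dense(x, out_dim):                              # untrained stand-in for a learned layer
    W = rng.standard_normal((x.shape[-1], out_dim)) * 0.01
    return np.tanh(x @ W)

prenet_out = dense(prev_frame, prenet_dim)                          # 1) prenet bottleneck
rnn_in = np.concatenate([prenet_out, prev_context], axis=-1)        # 2) input feeding
s_i = dense(rnn_in, rnn_dim)                                        # 3) decoder RNN state s_i
energies = np.einsum("bd,btd->bt", dense(s_i, enc_dim), memory)     # 4) attention energies
alignments = np.exp(energies) / np.exp(energies).sum(-1, keepdims=True)
context = np.einsum("bt,btd->bd", alignments, memory)               #    new context vector c_i
proj_in = np.concatenate([s_i, context], axis=-1)
frames = dense(proj_in, r * num_mels)                               # 5) next r mel frames
stop = 1 / (1 + np.exp(-dense(proj_in, 1)))                         # 6) <stop_token> probability
print(frames.shape, stop.shape, alignments.shape)                   # (2, 160) (2, 1) (2, 7)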

View File

@@ -0,0 +1,207 @@
"""Attention file for location based attention (compatible with tensorflow attention wrapper)"""
import tensorflow as tf
from tensorflow.contrib.seq2seq.python.ops.attention_wrapper import BahdanauAttention
from tensorflow.python.layers import core as layers_core
from tensorflow.python.ops import array_ops, math_ops, nn_ops, variable_scope
#From https://github.com/tensorflow/tensorflow/blob/r1.7/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
def _compute_attention(attention_mechanism, cell_output, attention_state,
attention_layer):
"""Computes the attention and alignments for a given attention_mechanism."""
alignments, next_attention_state = attention_mechanism(
cell_output, state=attention_state)
# Reshape from [batch_size, memory_time] to [batch_size, 1, memory_time]
expanded_alignments = array_ops.expand_dims(alignments, 1)
# Context is the inner product of alignments and values along the
# memory time dimension.
# alignments shape is
# [batch_size, 1, memory_time]
# attention_mechanism.values shape is
# [batch_size, memory_time, memory_size]
# the batched matmul is over memory_time, so the output shape is
# [batch_size, 1, memory_size].
# we then squeeze out the singleton dim.
context = math_ops.matmul(expanded_alignments, attention_mechanism.values)
context = array_ops.squeeze(context, [1])
if attention_layer is not None:
attention = attention_layer(array_ops.concat([cell_output, context], 1))
else:
attention = context
return attention, alignments, next_attention_state
def _location_sensitive_score(W_query, W_fil, W_keys):
"""Implements Bahdanau-style (cumulative) scoring function.
This attention is described in:
J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben-
gio, Attention-based models for speech recognition, in Ad-
vances in Neural Information Processing Systems, 2015, pp.
577-585.
#############################################################################
hybrid attention (content-based + location-based)
f = F * α_{i-1}
energy = dot(v_a, tanh(W_keys(h_enc) + W_query(h_dec) + W_fil(f) + b_a))
#############################################################################
Args:
W_query: Tensor, shape "[batch_size, 1, attention_dim]" to compare to location features.
W_location: processed previous alignments into location features, shape "[batch_size, max_time, attention_dim]"
W_keys: Tensor, shape "[batch_size, max_time, attention_dim]", typically the encoder outputs.
Returns:
A "[batch_size, max_time]" attention score (energy)
"""
# Get the number of hidden units from the trailing dimension of keys
dtype = W_query.dtype
num_units = W_keys.shape[-1].value or array_ops.shape(W_keys)[-1]
v_a = tf.get_variable(
"attention_variable_projection", shape=[num_units], dtype=dtype,
initializer=tf.contrib.layers.xavier_initializer())
b_a = tf.get_variable(
"attention_bias", shape=[num_units], dtype=dtype,
initializer=tf.zeros_initializer())
return tf.reduce_sum(v_a * tf.tanh(W_keys + W_query + W_fil + b_a), [2])
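# Broadcasting note: W_query has shape [batch_size, 1, attention_dim] and is broadcast
# against W_keys and W_fil of shape [batch_size, max_time, attention_dim], so the tanh
# argument is [batch_size, max_time, attention_dim] and the reduce_sum over the last
# axis yields the [batch_size, max_time] energy.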
def _smoothing_normalization(e):
"""Applies a smoothing normalization function instead of softmax
Introduced in:
J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Bengio,
"Attention-based models for speech recognition", in Advances in
Neural Information Processing Systems, 2015, pp. 577-585.
############################################################################
Smoothing normalization function
a_{i, j} = sigmoid(e_{i, j}) / sum_j(sigmoid(e_{i, j}))
############################################################################
Args:
e: matrix [batch_size, max_time(memory_time)]: expected to be energy (score)
values of an attention mechanism
Returns:
matrix [batch_size, max_time]: [0, 1] normalized alignments with possible
attendance to multiple memory time steps.
"""
return tf.nn.sigmoid(e) / tf.reduce_sum(tf.nn.sigmoid(e), axis=-1, keepdims=True)
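# Toy comparison (illustrative sketch, not part of the original file): for energies
# e = [[2.0, 2.0, -3.0]],
#   softmax(e)                  ~ [[0.50, 0.50, 0.003]]
#   _smoothing_normalization(e) ~ [[0.49, 0.49, 0.026]]
# i.e. the sigmoid-based normalization spreads mass over several memory steps more easily,
# which is what allows attending to multiple input parts at once.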
class LocationSensitiveAttention(BahdanauAttention):
"""Impelements Bahdanau-style (cumulative) scoring function.
Usually referred to as "hybrid" attention (content-based + location-based)
Extends the additive attention described in:
"D. Bahdanau, K. Cho, and Y. Bengio, “Neural machine transla-
tion by jointly learning to align and translate, in Proceedings
of ICLR, 2015."
to use previous alignments as additional location features.
This attention is described in:
J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben-
gio, Attention-based models for speech recognition, in Ad-
vances in Neural Information Processing Systems, 2015, pp.
577585.
"""
def __init__(self,
num_units,
memory,
hparams,
mask_encoder=True,
memory_sequence_length=None,
smoothing=False,
cumulate_weights=True,
name="LocationSensitiveAttention"):
"""Construct the Attention mechanism.
Args:
num_units: The depth of the query mechanism.
memory: The memory to query; usually the output of an RNN encoder. This
tensor should be shaped `[batch_size, max_time, ...]`.
mask_encoder (optional): Boolean, whether to mask encoder paddings.
memory_sequence_length (optional): Sequence lengths for the batch entries
in memory. If provided, the memory tensor rows are masked with zeros
for values past the respective sequence lengths. Only relevant if mask_encoder = True.
smoothing (optional): Boolean. Determines which normalization function to use.
Default normalization function (probability_fn) is softmax. If smoothing is
enabled, we replace softmax with:
a_{i, j} = sigmoid(e_{i, j}) / sum_j(sigmoid(e_{i, j}))
Introduced in:
J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Bengio,
"Attention-based models for speech recognition", in Advances in
Neural Information Processing Systems, 2015, pp. 577-585.
This is mainly used if the model wants to attend to multiple input parts
at the same decoding step. We probably won't be using it, since multiple sound
frames may depend on the same character/phone, not the other way around.
Note:
We still keep it implemented in case we want to test it. They used it in the
paper in the context of speech recognition, where one phoneme may depend on
multiple subsequent sound frames.
name: Name to use when creating ops.
"""
#Create normalization function
#Setting it to None defaults in using softmax
normalization_function = _smoothing_normalization if smoothing else None
memory_length = memory_sequence_length if mask_encoder else None
super(LocationSensitiveAttention, self).__init__(
num_units=num_units,
memory=memory,
memory_sequence_length=memory_length,
probability_fn=normalization_function,
name=name)
self.location_convolution = tf.layers.Conv1D(filters=hparams.attention_filters,
kernel_size=hparams.attention_kernel, padding="same", use_bias=True,
bias_initializer=tf.zeros_initializer(), name="location_features_convolution")
self.location_layer = tf.layers.Dense(units=num_units, use_bias=False,
dtype=tf.float32, name="location_features_layer")
self._cumulate = cumulate_weights
def __call__(self, query, state):
"""Score the query based on the keys and values.
Args:
query: Tensor of dtype matching `self.values` and shape
`[batch_size, query_depth]`.
state (previous alignments): Tensor of dtype matching `self.values` and shape
`[batch_size, alignments_size]`
(`alignments_size` is memory's `max_time`).
Returns:
alignments: Tensor of dtype matching `self.values` and shape
`[batch_size, alignments_size]` (`alignments_size` is memory's
`max_time`).
"""
previous_alignments = state
with variable_scope.variable_scope(None, "Location_Sensitive_Attention", [query]):
# processed_query shape [batch_size, query_depth] -> [batch_size, attention_dim]
processed_query = self.query_layer(query) if self.query_layer else query
# -> [batch_size, 1, attention_dim]
processed_query = tf.expand_dims(processed_query, 1)
# processed_location_features shape [batch_size, max_time, attention dimension]
# [batch_size, max_time] -> [batch_size, max_time, 1]
expanded_alignments = tf.expand_dims(previous_alignments, axis=2)
# location features [batch_size, max_time, filters]
f = self.location_convolution(expanded_alignments)
# Projected location features [batch_size, max_time, attention_dim]
processed_location_features = self.location_layer(f)
# energy shape [batch_size, max_time]
energy = _location_sensitive_score(processed_query, processed_location_features, self.keys)
# alignments shape = energy shape = [batch_size, max_time]
alignments = self._probability_fn(energy, previous_alignments)
# Cumulate alignments
if self._cumulate:
next_state = alignments + previous_alignments
else:
next_state = alignments
return alignments, next_state
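# Illustrative usage sketch (not part of the original file; the hparams fields below are
# assumptions mirroring how tacotron.py wires this attention):
#
#   attention_mechanism = LocationSensitiveAttention(
#       hp.attention_dim, encoder_cond_outputs, hparams=hp,
#       mask_encoder=hp.mask_encoder,
#       memory_sequence_length=tf.reshape(input_lengths, [-1]),
#       smoothing=hp.smoothing, cumulate_weights=hp.cumulative_weights)
#   alignments, next_state = attention_mechanism(lstm_output, previous_alignments)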

View File

@ -0,0 +1,132 @@
from __future__ import absolute_import, division, print_function
import collections
import tensorflow as tf
from synthesizer.models.helpers import TacoTestHelper, TacoTrainingHelper
from tensorflow.contrib.seq2seq.python.ops import decoder
from tensorflow.contrib.seq2seq.python.ops import helper as helper_py
from tensorflow.python.framework import ops, tensor_shape
from tensorflow.python.layers import base as layers_base
from tensorflow.python.ops import rnn_cell_impl
from tensorflow.python.util import nest
class CustomDecoderOutput(
collections.namedtuple("CustomDecoderOutput", ("rnn_output", "token_output", "sample_id"))):
pass
class CustomDecoder(decoder.Decoder):
"""Custom sampling decoder.
Allows for stop token prediction at inference time
and returns equivalent loss in training time.
Note:
Only use this decoder with Tacotron 2 as it only accepts tacotron custom helpers
"""
def __init__(self, cell, helper, initial_state, output_layer=None):
"""Initialize CustomDecoder.
Args:
cell: An `RNNCell` instance.
helper: A `Helper` instance.
initial_state: A (possibly nested tuple of...) tensors and TensorArrays.
The initial state of the RNNCell.
output_layer: (Optional) An instance of `tf.layers.Layer`, i.e.,
`tf.layers.Dense`. Optional layer to apply to the RNN output prior
to storing the result or sampling.
Raises:
TypeError: if `cell`, `helper` or `output_layer` have an incorrect type.
"""
rnn_cell_impl.assert_like_rnncell(type(cell), cell)
if not isinstance(helper, helper_py.Helper):
raise TypeError("helper must be a Helper, received: %s" % type(helper))
if (output_layer is not None
and not isinstance(output_layer, layers_base.Layer)):
raise TypeError(
"output_layer must be a Layer, received: %s" % type(output_layer))
self._cell = cell
self._helper = helper
self._initial_state = initial_state
self._output_layer = output_layer
@property
def batch_size(self):
return self._helper.batch_size
def _rnn_output_size(self):
size = self._cell.output_size
if self._output_layer is None:
return size
else:
# To use layer"s compute_output_shape, we need to convert the
# RNNCell"s output_size entries into shapes with an unknown
# batch size. We then pass this through the layer"s
# compute_output_shape and read off all but the first (batch)
# dimensions to get the output size of the rnn with the layer
# applied to the top.
output_shape_with_unknown_batch = nest.map_structure(
lambda s: tensor_shape.TensorShape([None]).concatenate(s),
size)
layer_output_shape = self._output_layer._compute_output_shape( # pylint: disable=protected-access
output_shape_with_unknown_batch)
return nest.map_structure(lambda s: s[1:], layer_output_shape)
@property
def output_size(self):
# Return the cell output and the id
return CustomDecoderOutput(
rnn_output=self._rnn_output_size(),
token_output=self._helper.token_output_size,
sample_id=self._helper.sample_ids_shape)
@property
def output_dtype(self):
# Assume the dtype of the cell is the output_size structure
# containing the input_state"s first component's dtype.
# Return that structure and the sample_ids_dtype from the helper.
dtype = nest.flatten(self._initial_state)[0].dtype
return CustomDecoderOutput(
nest.map_structure(lambda _: dtype, self._rnn_output_size()),
tf.float32,
self._helper.sample_ids_dtype)
def initialize(self, name=None):
"""Initialize the decoder.
Args:
name: Name scope for any created operations.
Returns:
`(finished, first_inputs, initial_state)`.
"""
return self._helper.initialize() + (self._initial_state,)
def step(self, time, inputs, state, name=None):
"""Perform a custom decoding step.
Enables dynamic <stop_token> prediction
Args:
time: scalar `int32` tensor.
inputs: A (structure of) input tensors.
state: A (structure of) state tensors and TensorArrays.
name: Name scope for any created operations.
Returns:
`(outputs, next_state, next_inputs, finished)`.
"""
with ops.name_scope(name, "CustomDecoderStep", (time, inputs, state)):
#Call the output projection wrapper cell
(cell_outputs, stop_token), cell_state = self._cell(inputs, state)
#apply output_layer (if it exists)
if self._output_layer is not None:
cell_outputs = self._output_layer(cell_outputs)
sample_ids = self._helper.sample(
time=time, outputs=cell_outputs, state=cell_state)
(finished, next_inputs, next_state) = self._helper.next_inputs(
time=time,
outputs=cell_outputs,
state=cell_state,
sample_ids=sample_ids,
stop_token_prediction=stop_token)
outputs = CustomDecoderOutput(cell_outputs, stop_token, sample_ids)
return (outputs, next_state, next_inputs, finished)
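# Illustrative wiring sketch (assumption; mirrors how tacotron.py drives this decoder):
#
#   (frames, stop_tokens, _), final_state, _ = dynamic_decode(
#       CustomDecoder(decoder_cell, helper, decoder_init_state),
#       impute_finished=False, maximum_iterations=max_iters,
#       swap_memory=hparams.tacotron_swap_with_cpu)
#
# where decoder_cell is a TacotronDecoderCell and helper is a TacoTrainingHelper or
# TacoTestHelper, so the (cell_outputs, stop_token) pair produced in step() flows into
# the helper's next_inputs call that decides when decoding is finished.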

View File

@ -0,0 +1,161 @@
import numpy as np
import tensorflow as tf
from tensorflow.contrib.seq2seq import Helper
class TacoTestHelper(Helper):
def __init__(self, batch_size, hparams):
with tf.name_scope("TacoTestHelper"):
self._batch_size = batch_size
self._output_dim = hparams.num_mels
self._reduction_factor = hparams.outputs_per_step
self.stop_at_any = hparams.stop_at_any
@property
def batch_size(self):
return self._batch_size
@property
def token_output_size(self):
return self._reduction_factor
@property
def sample_ids_shape(self):
return tf.TensorShape([])
@property
def sample_ids_dtype(self):
return np.int32
def initialize(self, name=None):
return (tf.tile([False], [self._batch_size]), _go_frames(self._batch_size, self._output_dim))
def sample(self, time, outputs, state, name=None):
return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them
def next_inputs(self, time, outputs, state, sample_ids, stop_token_prediction, name=None):
"""Stop on EOS. Otherwise, pass the last output as the next input and pass through state."""
with tf.name_scope("TacoTestHelper"):
#A sequence is finished when the output probability is > 0.5
finished = tf.cast(tf.round(stop_token_prediction), tf.bool)
#Since we are predicting r frames at each step, two modes are
#then possible:
# Stop when the model outputs a p > 0.5 for any frame between r frames (Recommended)
# Stop when the model outputs a p > 0.5 for all r frames (Safer)
#Note:
# With enough training steps, the model should be able to predict when to stop correctly
# and the use of stop_at_any = True would be recommended. If however the model didn"t
# learn to stop correctly yet, (stops too soon) one could choose to use the safer option
# to get a correct synthesis
if self.stop_at_any:
finished = tf.reduce_any(tf.reduce_all(finished, axis=0)) #Recommended
else:
finished = tf.reduce_all(tf.reduce_all(finished, axis=0)) #Safer option
# Feed last output frame as next input. outputs is [N, output_dim * r]
next_inputs = outputs[:, -self._output_dim:]
next_state = state
return (finished, next_inputs, next_state)
class TacoTrainingHelper(Helper):
def __init__(self, batch_size, targets, hparams, gta, evaluating, global_step):
# inputs is [N, T_in], targets is [N, T_out, D]
with tf.name_scope("TacoTrainingHelper"):
self._batch_size = batch_size
self._output_dim = hparams.num_mels
self._reduction_factor = hparams.outputs_per_step
self._ratio = tf.convert_to_tensor(hparams.tacotron_teacher_forcing_ratio)
self.gta = gta
self.eval = evaluating
self._hparams = hparams
self.global_step = global_step
r = self._reduction_factor
# Feed every r-th target frame as input
self._targets = targets[:, r-1::r, :]
#Maximal sequence length
self._lengths = tf.tile([tf.shape(self._targets)[1]], [self._batch_size])
@property
def batch_size(self):
return self._batch_size
@property
def token_output_size(self):
return self._reduction_factor
@property
def sample_ids_shape(self):
return tf.TensorShape([])
@property
def sample_ids_dtype(self):
return np.int32
def initialize(self, name=None):
#Compute teacher forcing ratio for this global step.
#In GTA mode, override teacher forcing scheme to work with full teacher forcing
if self.gta:
self._ratio = tf.convert_to_tensor(1.) #Force GTA model to always feed ground-truth
elif self.eval and self._hparams.natural_eval:
self._ratio = tf.convert_to_tensor(0.) #Force eval model to always feed predictions
else:
if self._hparams.tacotron_teacher_forcing_mode == "scheduled":
self._ratio = _teacher_forcing_ratio_decay(self._hparams.tacotron_teacher_forcing_init_ratio,
self.global_step, self._hparams)
return (tf.tile([False], [self._batch_size]), _go_frames(self._batch_size, self._output_dim))
def sample(self, time, outputs, state, name=None):
return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them
def next_inputs(self, time, outputs, state, sample_ids, stop_token_prediction, name=None):
with tf.name_scope(name or "TacoTrainingHelper"):
#synthesis stop (we let the model see paddings as we mask them when computing loss functions)
finished = (time + 1 >= self._lengths)
#Pick previous outputs randomly with respect to teacher forcing ratio
next_inputs = tf.cond(
tf.less(tf.random_uniform([], minval=0, maxval=1, dtype=tf.float32), self._ratio),
lambda: self._targets[:, time, :], #Teacher-forcing: return true frame
lambda: outputs[:,-self._output_dim:])
#Pass on state
next_state = state
return (finished, next_inputs, next_state)
def _go_frames(batch_size, output_dim):
"""Returns all-zero <GO> frames for a given batch size and output dimension"""
return tf.tile([[0.0]], [batch_size, output_dim])
def _teacher_forcing_ratio_decay(init_tfr, global_step, hparams):
#################################################################
# Narrow Cosine Decay:
# Phase 1: tfr = 1
# We only start teacher forcing decay after 10k steps
# Phase 2: tfr in ]0, 1[
# decay reach minimal value at step ~280k
# Phase 3: tfr = 0
# clip by minimal teacher forcing ratio value (step >~ 280k)
#################################################################
#Compute natural cosine decay
tfr = tf.train.cosine_decay(init_tfr,
global_step=global_step - hparams.tacotron_teacher_forcing_start_decay, #tfr = 1 at step 10k
decay_steps=hparams.tacotron_teacher_forcing_decay_steps, #tfr = 0 at step ~280k
alpha=hparams.tacotron_teacher_forcing_decay_alpha, #tfr = 0% of init_tfr as final value
name="tfr_cosine_decay")
#force teacher forcing ratio to take initial value when global step < start decay step.
narrow_tfr = tf.cond(
tf.less(global_step, tf.convert_to_tensor(hparams.tacotron_teacher_forcing_start_decay)),
lambda: tf.convert_to_tensor(init_tfr),
lambda: tfr)
return narrow_tfr
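# Numeric sketch of the schedule (illustrative; assumes start_decay = 10_000,
# decay_steps = 270_000 and decay_alpha = 0., which are example values only):
#   step <= 10k  -> tfr = init_tfr                (phase 1, narrow_tfr branch)
#   step = 145k  -> tfr ~ 0.5 * init_tfr          (halfway point of the cosine)
#   step >= 280k -> tfr ~ 0.                      (phase 3, clipped by alpha)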

View File

@ -0,0 +1,528 @@
import tensorflow as tf
class HighwayNet:
def __init__(self, units, name=None):
self.units = units
self.scope = "HighwayNet" if name is None else name
self.H_layer = tf.layers.Dense(units=self.units, activation=tf.nn.relu, name="H")
self.T_layer = tf.layers.Dense(units=self.units, activation=tf.nn.sigmoid, name="T",
bias_initializer=tf.constant_initializer(-1.))
def __call__(self, inputs):
with tf.variable_scope(self.scope):
H = self.H_layer(inputs)
T = self.T_layer(inputs)
return H * T + inputs * (1. - T)
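# Note (illustrative): this is the standard highway gating y = H(x) * T(x) + x * (1 - T(x)).
# With T's bias initialized to -1, the transform gate starts mostly closed (sigmoid(-1) ~ 0.27),
# so the block initially passes its input largely unchanged and learns to open the gate.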
class CBHG:
def __init__(self, K, conv_channels, pool_size, projections, projection_kernel_size,
n_highwaynet_layers, highway_units, rnn_units, is_training, name=None):
self.K = K
self.conv_channels = conv_channels
self.pool_size = pool_size
self.projections = projections
self.projection_kernel_size = projection_kernel_size
self.is_training = is_training
self.scope = "CBHG" if name is None else name
self.highway_units = highway_units
self.highwaynet_layers = [
HighwayNet(highway_units, name="{}_highwaynet_{}".format(self.scope, i + 1)) for i in
range(n_highwaynet_layers)]
self._fw_cell = tf.nn.rnn_cell.GRUCell(rnn_units, name="{}_forward_RNN".format(self.scope))
self._bw_cell = tf.nn.rnn_cell.GRUCell(rnn_units, name="{}_backward_RNN".format(self.scope))
def __call__(self, inputs, input_lengths):
with tf.variable_scope(self.scope):
with tf.variable_scope("conv_bank"):
# Convolution bank: concatenate on the last axis to stack channels from all
# convolutions
# The convolution bank uses multiple different kernel sizes to have many insights
# of the input sequence
# This makes one of the strengths of the CBHG block on sequences.
conv_outputs = tf.concat(
[conv1d(inputs, k, self.conv_channels, tf.nn.relu, self.is_training, 0.,
"conv1d_{}".format(k)) for k in range(1, self.K + 1)],
axis=-1
)
# Maxpooling (dimension reduction; using max instead of average helps find "edges"
# in mels)
maxpool_output = tf.layers.max_pooling1d(
conv_outputs,
pool_size=self.pool_size,
strides=1,
padding="same")
# Two projection layers
proj1_output = conv1d(maxpool_output, self.projection_kernel_size, self.projections[0],
tf.nn.relu, self.is_training, 0., "proj1")
proj2_output = conv1d(proj1_output, self.projection_kernel_size, self.projections[1],
lambda _: _, self.is_training, 0., "proj2")
# Residual connection
highway_input = proj2_output + inputs
# Additional projection in case of dimension mismatch (for HighwayNet "residual"
# connection)
if highway_input.shape[2] != self.highway_units:
highway_input = tf.layers.dense(highway_input, self.highway_units)
# 4-layer HighwayNet
for highwaynet in self.highwaynet_layers:
highway_input = highwaynet(highway_input)
rnn_input = highway_input
# Bidirectional RNN
outputs, states = tf.nn.bidirectional_dynamic_rnn(
self._fw_cell,
self._bw_cell,
rnn_input,
sequence_length=input_lengths,
dtype=tf.float32)
return tf.concat(outputs, axis=2) # Concat forward and backward outputs
class ZoneoutLSTMCell(tf.nn.rnn_cell.RNNCell):
"""Wrapper for tf LSTM to create Zoneout LSTM Cell
inspired by:
https://github.com/teganmaharaj/zoneout/blob/master/zoneout_tensorflow.py
Published by one of the authors of https://arxiv.org/pdf/1606.01305.pdf.
Many thanks to @Ondal90 for pointing this out. You sir are a hero!
"""
def __init__(self, num_units, is_training, zoneout_factor_cell=0., zoneout_factor_output=0.,
state_is_tuple=True, name=None):
"""Initializer with possibility to set different zoneout values for cell/hidden states.
"""
zm = min(zoneout_factor_output, zoneout_factor_cell)
zs = max(zoneout_factor_output, zoneout_factor_cell)
if zm < 0. or zs > 1.:
raise ValueError("One/both provided Zoneout factors are not in [0, 1]")
self._cell = tf.nn.rnn_cell.LSTMCell(num_units, state_is_tuple=state_is_tuple, name=name)
self._zoneout_cell = zoneout_factor_cell
self._zoneout_outputs = zoneout_factor_output
self.is_training = is_training
self.state_is_tuple = state_is_tuple
@property
def state_size(self):
return self._cell.state_size
@property
def output_size(self):
return self._cell.output_size
def __call__(self, inputs, state, scope=None):
"""Runs vanilla LSTM Cell and applies zoneout.
"""
# Apply vanilla LSTM
output, new_state = self._cell(inputs, state, scope)
if self.state_is_tuple:
(prev_c, prev_h) = state
(new_c, new_h) = new_state
else:
num_proj = self._cell._num_units if self._cell._num_proj is None else \
self._cell._num_proj
prev_c = tf.slice(state, [0, 0], [-1, self._cell._num_units])
prev_h = tf.slice(state, [0, self._cell._num_units], [-1, num_proj])
new_c = tf.slice(new_state, [0, 0], [-1, self._cell._num_units])
new_h = tf.slice(new_state, [0, self._cell._num_units], [-1, num_proj])
# Apply zoneout
if self.is_training:
# nn.dropout takes keep_prob (probability to keep activations) not drop_prob (
# probability to mask activations)!
c = (1 - self._zoneout_cell) * tf.nn.dropout(new_c - prev_c,
(1 - self._zoneout_cell)) + prev_c
h = (1 - self._zoneout_outputs) * tf.nn.dropout(new_h - prev_h,
(1 - self._zoneout_outputs)) + prev_h
else:
c = (1 - self._zoneout_cell) * new_c + self._zoneout_cell * prev_c
h = (1 - self._zoneout_outputs) * new_h + self._zoneout_outputs * prev_h
new_state = tf.nn.rnn_cell.LSTMStateTuple(c, h) if self.state_is_tuple else tf.concat(1, [c,
h])
return output, new_state
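# Zoneout recap (descriptive note): during training, tf.nn.dropout on (new - prev) zeroes
# the update for a random subset of units, so those units keep their previous value
# (prev_c / prev_h); multiplying by (1 - zoneout_factor) undoes dropout's 1/keep_prob rescaling.
# At inference, the update is a deterministic interpolation between new and prev states.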
class EncoderConvolutions:
"""Encoder convolutional layers used to find local dependencies in inputs characters.
"""
def __init__(self, is_training, hparams, activation=tf.nn.relu, scope=None):
"""
Args:
is_training: Boolean, determines if the model is training or in inference to control
dropout
hparams: hyper parameters object; provides enc_conv_kernel_size, enc_conv_channels,
enc_conv_num_layers and tacotron_dropout_rate
activation: callable, activation function for each convolutional layer
scope: EncoderConvolutions scope.
"""
super(EncoderConvolutions, self).__init__()
self.is_training = is_training
self.kernel_size = hparams.enc_conv_kernel_size
self.channels = hparams.enc_conv_channels
self.activation = activation
self.scope = "enc_conv_layers" if scope is None else scope
self.drop_rate = hparams.tacotron_dropout_rate
self.enc_conv_num_layers = hparams.enc_conv_num_layers
def __call__(self, inputs):
with tf.variable_scope(self.scope):
x = inputs
for i in range(self.enc_conv_num_layers):
x = conv1d(x, self.kernel_size, self.channels, self.activation,
self.is_training, self.drop_rate,
"conv_layer_{}_".format(i + 1) + self.scope)
return x
class EncoderRNN:
"""Encoder bidirectional one layer LSTM
"""
def __init__(self, is_training, size=256, zoneout=0.1, scope=None):
"""
Args:
is_training: Boolean, determines if the model is training or in inference to control
zoneout
size: integer, the number of LSTM units for each direction
zoneout: the zoneout factor
scope: EncoderRNN scope.
"""
super(EncoderRNN, self).__init__()
self.is_training = is_training
self.size = size
self.zoneout = zoneout
self.scope = "encoder_LSTM" if scope is None else scope
# Create forward LSTM Cell
self._fw_cell = ZoneoutLSTMCell(size, is_training,
zoneout_factor_cell=zoneout,
zoneout_factor_output=zoneout,
name="encoder_fw_LSTM")
# Create backward LSTM Cell
self._bw_cell = ZoneoutLSTMCell(size, is_training,
zoneout_factor_cell=zoneout,
zoneout_factor_output=zoneout,
name="encoder_bw_LSTM")
def __call__(self, inputs, input_lengths):
with tf.variable_scope(self.scope):
outputs, (fw_state, bw_state) = tf.nn.bidirectional_dynamic_rnn(
self._fw_cell,
self._bw_cell,
inputs,
sequence_length=input_lengths,
dtype=tf.float32,
swap_memory=True)
return tf.concat(outputs, axis=2) # Concat and return forward + backward outputs
class Prenet:
"""Two fully connected layers used as an information bottleneck for the attention.
"""
def __init__(self, is_training, layers_sizes=[256, 256], drop_rate=0.5, activation=tf.nn.relu,
scope=None):
"""
Args:
layers_sizes: list of integers, the length of the list represents the number of pre-net
layers and the list values represent the layers number of units
activation: callable, activation functions of the prenet layers.
scope: Prenet scope.
"""
super(Prenet, self).__init__()
self.drop_rate = drop_rate
self.layers_sizes = layers_sizes
self.activation = activation
self.is_training = is_training
self.scope = "prenet" if scope is None else scope
def __call__(self, inputs):
x = inputs
with tf.variable_scope(self.scope):
for i, size in enumerate(self.layers_sizes):
dense = tf.layers.dense(x, units=size, activation=self.activation,
name="dense_{}".format(i + 1))
# The paper discussed introducing diversity in generation at inference time
# by using a dropout of 0.5 only in prenet layers (in both training and inference).
x = tf.layers.dropout(dense, rate=self.drop_rate, training=True,
name="dropout_{}".format(i + 1) + self.scope)
return x
class DecoderRNN:
"""Decoder two uni directional LSTM Cells
"""
def __init__(self, is_training, layers=2, size=1024, zoneout=0.1, scope=None):
"""
Args:
is_training: Boolean, determines if the model is in training or inference to control
zoneout
layers: integer, the number of LSTM layers in the decoder
size: integer, the number of LSTM units in each layer
zoneout: the zoneout factor
"""
super(DecoderRNN, self).__init__()
self.is_training = is_training
self.layers = layers
self.size = size
self.zoneout = zoneout
self.scope = "decoder_rnn" if scope is None else scope
# Create a set of LSTM layers
self.rnn_layers = [ZoneoutLSTMCell(size, is_training,
zoneout_factor_cell=zoneout,
zoneout_factor_output=zoneout,
name="decoder_LSTM_{}".format(i + 1)) for i in
range(layers)]
self._cell = tf.contrib.rnn.MultiRNNCell(self.rnn_layers, state_is_tuple=True)
def __call__(self, inputs, states):
with tf.variable_scope(self.scope):
return self._cell(inputs, states)
class FrameProjection:
"""Projection layer to r * num_mels dimensions or num_mels dimensions
"""
def __init__(self, shape=80, activation=None, scope=None):
"""
Args:
shape: integer, dimensionality of output space (r*n_mels for decoder or n_mels for
postnet)
activation: callable, activation function
scope: FrameProjection scope.
"""
super(FrameProjection, self).__init__()
self.shape = shape
self.activation = activation
self.scope = "Linear_projection" if scope is None else scope
self.dense = tf.layers.Dense(units=shape, activation=activation,
name="projection_{}".format(self.scope))
def __call__(self, inputs):
with tf.variable_scope(self.scope):
# If activation==None, this returns a simple Linear projection
# else the projection will be passed through an activation function
# output = tf.layers.dense(inputs, units=self.shape, activation=self.activation,
# name="projection_{}".format(self.scope))
output = self.dense(inputs)
return output
class StopProjection:
"""Projection to a scalar and through a sigmoid activation
"""
def __init__(self, is_training, shape=1, activation=tf.nn.sigmoid, scope=None):
"""
Args:
is_training: Boolean, to control the use of sigmoid function as it is useless to use it
during training since it is integrated inside the sigmoid cross-entropy loss
shape: integer, dimensionality of output space. Defaults to 1 (scalar)
activation: callable, activation function. only used during inference
scope: StopProjection scope.
"""
super(StopProjection, self).__init__()
self.is_training = is_training
self.shape = shape
self.activation = activation
self.scope = "stop_token_projection" if scope is None else scope
def __call__(self, inputs):
with tf.variable_scope(self.scope):
output = tf.layers.dense(inputs, units=self.shape,
activation=None, name="projection_{}".format(self.scope))
# During training, don"t use activation as it is integrated inside the
# sigmoid_cross_entropy loss function
if self.is_training:
return output
return self.activation(output)
class Postnet:
"""Postnet that takes final decoder output and fine tunes it (using vision on past and future
frames)
"""
def __init__(self, is_training, hparams, activation=tf.nn.tanh, scope=None):
"""
Args:
is_training: Boolean, determines if the model is training or in inference to control
dropout
hparams: hyper parameters object; provides postnet_kernel_size, postnet_channels,
postnet_num_layers and tacotron_dropout_rate
activation: callable, postnet activation function for each convolutional layer
scope: Postnet scope.
"""
super(Postnet, self).__init__()
self.is_training = is_training
self.kernel_size = hparams.postnet_kernel_size
self.channels = hparams.postnet_channels
self.activation = activation
self.scope = "postnet_convolutions" if scope is None else scope
self.postnet_num_layers = hparams.postnet_num_layers
self.drop_rate = hparams.tacotron_dropout_rate
def __call__(self, inputs):
with tf.variable_scope(self.scope):
x = inputs
for i in range(self.postnet_num_layers - 1):
x = conv1d(x, self.kernel_size, self.channels, self.activation,
self.is_training, self.drop_rate,
"conv_layer_{}_".format(i + 1) + self.scope)
x = conv1d(x, self.kernel_size, self.channels, lambda _: _, self.is_training,
self.drop_rate,
"conv_layer_{}_".format(self.postnet_num_layers) + self.scope)
return x
def conv1d(inputs, kernel_size, channels, activation, is_training, drop_rate, scope):
with tf.variable_scope(scope):
conv1d_output = tf.layers.conv1d(
inputs,
filters=channels,
kernel_size=kernel_size,
activation=None,
padding="same")
batched = tf.layers.batch_normalization(conv1d_output, training=is_training)
activated = activation(batched)
return tf.layers.dropout(activated, rate=drop_rate, training=is_training,
name="dropout_{}".format(scope))
def _round_up_tf(x, multiple):
# Tf version of remainder = x % multiple
remainder = tf.mod(x, multiple)
# Tf version of return x if remainder == 0 else x + multiple - remainder
x_round = tf.cond(tf.equal(remainder, tf.zeros(tf.shape(remainder), dtype=tf.int32)),
lambda: x,
lambda: x + multiple - remainder)
return x_round
def sequence_mask(lengths, r, expand=True):
"""Returns a 2-D or 3-D tensorflow sequence mask depending on the argument "expand"
"""
max_len = tf.reduce_max(lengths)
max_len = _round_up_tf(max_len, tf.convert_to_tensor(r))
if expand:
return tf.expand_dims(tf.sequence_mask(lengths, maxlen=max_len, dtype=tf.float32), axis=-1)
return tf.sequence_mask(lengths, maxlen=max_len, dtype=tf.float32)
def MaskedMSE(targets, outputs, targets_lengths, hparams, mask=None):
"""Computes a masked Mean Squared Error
"""
# [batch_size, time_dimension, 1]
# example:
# sequence_mask([1, 3, 2], 5) = [[[1., 0., 0., 0., 0.]],
# [[1., 1., 1., 0., 0.]],
# [[1., 1., 0., 0., 0.]]]
# Note the maxlen argument that ensures mask shape is compatible with r>1
# This will by default mask the extra paddings caused by r>1
if mask is None:
mask = sequence_mask(targets_lengths, hparams.outputs_per_step, True)
# [batch_size, time_dimension, channel_dimension(mels)]
ones = tf.ones(shape=[tf.shape(mask)[0], tf.shape(mask)[1], tf.shape(targets)[-1]],
dtype=tf.float32)
mask_ = mask * ones
with tf.control_dependencies([tf.assert_equal(tf.shape(targets), tf.shape(mask_))]):
return tf.losses.mean_squared_error(labels=targets, predictions=outputs, weights=mask_)
def MaskedSigmoidCrossEntropy(targets, outputs, targets_lengths, hparams, mask=None):
"""Computes a masked SigmoidCrossEntropy with logits
"""
# [batch_size, time_dimension]
# example:
# sequence_mask([1, 3, 2], 5) = [[1., 0., 0., 0., 0.],
# [1., 1., 1., 0., 0.],
# [1., 1., 0., 0., 0.]]
# Note the maxlen argument that ensures mask shape is compatible with r>1
# This will by default mask the extra paddings caused by r>1
if mask is None:
mask = sequence_mask(targets_lengths, hparams.outputs_per_step, False)
with tf.control_dependencies([tf.assert_equal(tf.shape(targets), tf.shape(mask))]):
# Use a weighted sigmoid cross entropy to measure the <stop_token> loss. Set
# hparams.cross_entropy_pos_weight to 1
# will have the same effect as vanilla tf.nn.sigmoid_cross_entropy_with_logits.
losses = tf.nn.weighted_cross_entropy_with_logits(targets=targets, logits=outputs,
pos_weight=hparams.cross_entropy_pos_weight)
with tf.control_dependencies([tf.assert_equal(tf.shape(mask), tf.shape(losses))]):
masked_loss = losses * mask
return tf.reduce_sum(masked_loss) / tf.count_nonzero(masked_loss, dtype=tf.float32)
def MaskedLinearLoss(targets, outputs, targets_lengths, hparams, mask=None):
"""Computes a masked MAE loss with priority to low frequencies
"""
# [batch_size, time_dimension, 1]
# example:
# sequence_mask([1, 3, 2], 5) = [[[1., 0., 0., 0., 0.]],
# [[1., 1., 1., 0., 0.]],
# [[1., 1., 0., 0., 0.]]]
# Note the maxlen argument that ensures mask shape is compatible with r>1
# This will by default mask the extra paddings caused by r>1
if mask is None:
mask = sequence_mask(targets_lengths, hparams.outputs_per_step, True)
# [batch_size, time_dimension, channel_dimension(freq)]
ones = tf.ones(shape=[tf.shape(mask)[0], tf.shape(mask)[1], tf.shape(targets)[-1]],
dtype=tf.float32)
mask_ = mask * ones
l1 = tf.abs(targets - outputs)
n_priority_freq = int(2000 / (hparams.sample_rate * 0.5) * hparams.num_freq)
with tf.control_dependencies([tf.assert_equal(tf.shape(targets), tf.shape(mask_))]):
masked_l1 = l1 * mask_
masked_l1_low = masked_l1[:, :, 0:n_priority_freq]
mean_l1 = tf.reduce_sum(masked_l1) / tf.reduce_sum(mask_)
mean_l1_low = tf.reduce_sum(masked_l1_low) / tf.reduce_sum(mask_)
return 0.5 * mean_l1 + 0.5 * mean_l1_low
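# Illustrative arithmetic (assuming hparams.sample_rate = 16000 and hparams.num_freq = 1025,
# example values only): n_priority_freq = int(2000 / 8000 * 1025) = 256, i.e. the first ~256
# frequency bins (everything under 2 kHz) receive the extra 0.5 weight in the loss.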

View File

@ -0,0 +1,521 @@
import tensorflow as tf
from synthesizer.utils.symbols import symbols
from synthesizer.infolog import log
from synthesizer.models.helpers import TacoTrainingHelper, TacoTestHelper
from synthesizer.models.modules import *
from tensorflow.contrib.seq2seq import dynamic_decode
from synthesizer.models.architecture_wrappers import TacotronEncoderCell, TacotronDecoderCell
from synthesizer.models.custom_decoder import CustomDecoder
from synthesizer.models.attention import LocationSensitiveAttention
import numpy as np
def split_func(x, split_pos):
rst = []
start = 0
# x will be a numpy array with the contents of the placeholder below
for i in range(split_pos.shape[0]):
rst.append(x[:, start:start + split_pos[i]])
start += split_pos[i]
return rst
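# Example (illustrative): with x of shape [2, 10] and split_pos = np.array([4, 6]),
# split_func returns [x[:, 0:4], x[:, 4:10]], i.e. one slice of the padded batch per GPU.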
class Tacotron():
"""Tacotron-2 Feature prediction Model.
"""
def __init__(self, hparams):
self._hparams = hparams
def initialize(self, inputs, input_lengths, embed_targets, mel_targets=None,
stop_token_targets=None, linear_targets=None, targets_lengths=None, gta=False,
global_step=None, is_training=False, is_evaluating=False, split_infos=None):
"""
Initializes the model for inference. Sets the "mel_outputs" and "alignments" fields.
Args:
- inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
steps in the input time series, and values are character IDs
- input_lengths: int32 Tensor with shape [N] where N is batch size and values are the
lengths of each sequence in inputs.
- embed_targets: float32 Tensor with shape [N, E] where E is the speaker
embedding size.
- mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size,
T_out is number of steps in the output time series, M is num_mels, and values are
entries in the mel spectrogram. Only needed for training.
"""
if mel_targets is None and stop_token_targets is not None:
raise ValueError("no multi targets were provided but token_targets were given")
if mel_targets is not None and stop_token_targets is None and not gta:
raise ValueError("Mel targets are provided without corresponding token_targets")
if not gta and self._hparams.predict_linear == True and linear_targets is None and \
is_training:
raise ValueError(
"Model is set to use post processing to predict linear spectrograms in training "
"but no linear targets given!")
if gta and linear_targets is not None:
raise ValueError("Linear spectrogram prediction is not supported in GTA mode!")
if is_training and self._hparams.mask_decoder and targets_lengths is None:
raise RuntimeError(
"Model set to mask paddings but no targets lengths provided for the mask!")
if is_training and is_evaluating:
raise RuntimeError(
"Model can not be in training and evaluation modes at the same time!")
split_device = "/cpu:0" if self._hparams.tacotron_num_gpus > 1 or \
self._hparams.split_on_cpu else "/gpu:{}".format(
self._hparams.tacotron_gpu_start_idx)
with tf.device(split_device):
hp = self._hparams
lout_int = [tf.int32] * hp.tacotron_num_gpus
lout_float = [tf.float32] * hp.tacotron_num_gpus
tower_input_lengths = tf.split(input_lengths, num_or_size_splits=hp.tacotron_num_gpus,
axis=0)
tower_targets_lengths = \
tf.split(targets_lengths, num_or_size_splits=hp.tacotron_num_gpus, axis=0) if \
targets_lengths is not None else targets_lengths
### SV2TTS ###
tower_embed_targets = tf.split(embed_targets, num_or_size_splits=hp.tacotron_num_gpus,
axis=0)
##############
p_inputs = tf.py_func(split_func, [inputs, split_infos[:, 0]], lout_int)
p_mel_targets = tf.py_func(split_func, [mel_targets, split_infos[:, 1]],
lout_float) if mel_targets is not None else mel_targets
p_stop_token_targets = tf.py_func(split_func, [stop_token_targets, split_infos[:, 2]],
lout_float) if stop_token_targets is not None else \
stop_token_targets
tower_inputs = []
tower_mel_targets = []
tower_stop_token_targets = []
batch_size = tf.shape(inputs)[0]
mel_channels = hp.num_mels
for i in range(hp.tacotron_num_gpus):
tower_inputs.append(tf.reshape(p_inputs[i], [batch_size, -1]))
if p_mel_targets is not None:
tower_mel_targets.append(
tf.reshape(p_mel_targets[i], [batch_size, -1, mel_channels]))
if p_stop_token_targets is not None:
tower_stop_token_targets.append(
tf.reshape(p_stop_token_targets[i], [batch_size, -1]))
self.tower_decoder_output = []
self.tower_alignments = []
self.tower_stop_token_prediction = []
self.tower_mel_outputs = []
self.tower_linear_outputs = []
tower_embedded_inputs = []
tower_enc_conv_output_shape = []
tower_encoder_cond_outputs = []
tower_residual = []
tower_projected_residual = []
# 1. Declare GPU Devices
gpus = ["/gpu:{}".format(i) for i in
range(hp.tacotron_gpu_start_idx, hp.tacotron_gpu_start_idx + hp.tacotron_num_gpus)]
for i in range(hp.tacotron_num_gpus):
with tf.device(tf.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0",
worker_device=gpus[i])):
with tf.variable_scope("inference") as scope:
assert hp.tacotron_teacher_forcing_mode in ("constant", "scheduled")
if hp.tacotron_teacher_forcing_mode == "scheduled" and is_training:
assert global_step is not None
# GTA is only used for predicting mels to train Wavenet vocoder, so we omit
# post processing when doing GTA synthesis
post_condition = hp.predict_linear and not gta
# Embeddings ==> [batch_size, sequence_length, embedding_dim]
self.embedding_table = tf.get_variable(
"inputs_embedding", [len(symbols), hp.embedding_dim], dtype=tf.float32)
embedded_inputs = tf.nn.embedding_lookup(self.embedding_table, tower_inputs[i])
# Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
encoder_cell = TacotronEncoderCell(
EncoderConvolutions(is_training, hparams=hp, scope="encoder_convolutions"),
EncoderRNN(is_training, size=hp.encoder_lstm_units,
zoneout=hp.tacotron_zoneout_rate, scope="encoder_LSTM"))
encoder_outputs = encoder_cell(embedded_inputs, tower_input_lengths[i])
# For shape visualization purpose
enc_conv_output_shape = encoder_cell.conv_output_shape
### SV2TTS ###
# Append the speaker embedding to the encoder output at each timestep
tileable_shape = [-1, 1, self._hparams.speaker_embedding_size]
tileable_embed_targets = tf.reshape(tower_embed_targets[i], tileable_shape)
tiled_embed_targets = tf.tile(tileable_embed_targets,
[1, tf.shape(encoder_outputs)[1], 1])
encoder_cond_outputs = tf.concat((encoder_outputs, tiled_embed_targets), 2)
##############
# Decoder Parts
# Attention Decoder Prenet
prenet = Prenet(is_training, layers_sizes=hp.prenet_layers,
drop_rate=hp.tacotron_dropout_rate, scope="decoder_prenet")
# Attention Mechanism
attention_mechanism = LocationSensitiveAttention(hp.attention_dim,
encoder_cond_outputs,
hparams=hp,
mask_encoder=hp.mask_encoder,
memory_sequence_length=tf.reshape(
tower_input_lengths[i],
[-1]),
smoothing=hp.smoothing,
cumulate_weights=hp.cumulative_weights)
# Decoder LSTM Cells
decoder_lstm = DecoderRNN(is_training, layers=hp.decoder_layers,
size=hp.decoder_lstm_units,
zoneout=hp.tacotron_zoneout_rate,
scope="decoder_LSTM")
# Frames Projection layer
frame_projection = FrameProjection(hp.num_mels * hp.outputs_per_step,
scope="linear_transform_projection")
# <stop_token> projection layer
stop_projection = StopProjection(is_training or is_evaluating, shape=hp
.outputs_per_step,
scope="stop_token_projection")
# Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
decoder_cell = TacotronDecoderCell(
prenet,
attention_mechanism,
decoder_lstm,
frame_projection,
stop_projection)
# Define the helper for our decoder
if is_training or is_evaluating or gta:
self.helper = TacoTrainingHelper(batch_size, tower_mel_targets[i], hp, gta,
is_evaluating, global_step)
else:
self.helper = TacoTestHelper(batch_size, hp)
# initial decoder state
decoder_init_state = decoder_cell.zero_state(batch_size=batch_size,
dtype=tf.float32)
# Only use max iterations at synthesis time
max_iters = hp.max_iters if not (is_training or is_evaluating) else None
# Decode
(frames_prediction, stop_token_prediction,
_), final_decoder_state, _ = dynamic_decode(
CustomDecoder(decoder_cell, self.helper, decoder_init_state),
impute_finished=False,
maximum_iterations=max_iters,
swap_memory=hp.tacotron_swap_with_cpu)
# Reshape outputs to be one output per entry
# ==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
decoder_output = tf.reshape(frames_prediction, [batch_size, -1, hp.num_mels])
stop_token_prediction = tf.reshape(stop_token_prediction, [batch_size, -1])
# Postnet
postnet = Postnet(is_training, hparams=hp, scope="postnet_convolutions")
# Compute residual using post-net ==> [batch_size, decoder_steps * r,
# postnet_channels]
residual = postnet(decoder_output)
# Project residual to same dimension as mel spectrogram
# ==> [batch_size, decoder_steps * r, num_mels]
residual_projection = FrameProjection(hp.num_mels, scope="postnet_projection")
projected_residual = residual_projection(residual)
# Compute the mel spectrogram
mel_outputs = decoder_output + projected_residual
if post_condition:
# Add post-processing CBHG. This does a great job at extracting features
# from mels before projection to Linear specs.
post_cbhg = CBHG(hp.cbhg_kernels, hp.cbhg_conv_channels, hp.cbhg_pool_size,
[hp.cbhg_projection, hp.num_mels],
hp.cbhg_projection_kernel_size, hp.cbhg_highwaynet_layers,
hp.cbhg_highway_units, hp.cbhg_rnn_units, is_training,
name="CBHG_postnet")
# [batch_size, decoder_steps(mel_frames), cbhg_channels]
post_outputs = post_cbhg(mel_outputs, None)
# Linear projection of extracted features to make linear spectrogram
linear_specs_projection = FrameProjection(hp.num_freq,
scope="cbhg_linear_specs_projection")
# [batch_size, decoder_steps(linear_frames), num_freq]
linear_outputs = linear_specs_projection(post_outputs)
# Grab alignments from the final decoder state
alignments = tf.transpose(final_decoder_state.alignment_history.stack(),
[1, 2, 0])
self.tower_decoder_output.append(decoder_output)
self.tower_alignments.append(alignments)
self.tower_stop_token_prediction.append(stop_token_prediction)
self.tower_mel_outputs.append(mel_outputs)
tower_embedded_inputs.append(embedded_inputs)
tower_enc_conv_output_shape.append(enc_conv_output_shape)
tower_encoder_cond_outputs.append(encoder_cond_outputs)
tower_residual.append(residual)
tower_projected_residual.append(projected_residual)
if post_condition:
self.tower_linear_outputs.append(linear_outputs)
log("initialisation done {}".format(gpus[i]))
if is_training:
self.ratio = self.helper._ratio
self.tower_inputs = tower_inputs
self.tower_input_lengths = tower_input_lengths
self.tower_mel_targets = tower_mel_targets
# self.tower_linear_targets = tower_linear_targets
self.tower_targets_lengths = tower_targets_lengths
self.tower_stop_token_targets = tower_stop_token_targets
self.all_vars = tf.trainable_variables()
log("Initialized Tacotron model. Dimensions (? = dynamic shape): ")
log(" Train mode: {}".format(is_training))
log(" Eval mode: {}".format(is_evaluating))
log(" GTA mode: {}".format(gta))
log(" Synthesis mode: {}".format(not (is_training or is_evaluating)))
log(" Input: {}".format(inputs.shape))
for i in range(hp.tacotron_num_gpus + hp.tacotron_gpu_start_idx):
log(" device: {}".format(i))
log(" embedding: {}".format(tower_embedded_inputs[i].shape))
log(" enc conv out: {}".format(tower_enc_conv_output_shape[i]))
log(" encoder out (cond): {}".format(tower_encoder_cond_outputs[i].shape))
log(" decoder out: {}".format(self.tower_decoder_output[i].shape))
log(" residual out: {}".format(tower_residual[i].shape))
log(" projected residual out: {}".format(tower_projected_residual[i].shape))
log(" mel out: {}".format(self.tower_mel_outputs[i].shape))
if post_condition:
log(" linear out: {}".format(self.tower_linear_outputs[i].shape))
log(" <stop_token> out: {}".format(self.tower_stop_token_prediction[i].shape))
# 1_000_000 is causing syntax problems for some people?! Python please :)
log(" Tacotron Parameters {:.3f} Million.".format(
np.sum([np.prod(v.get_shape().as_list()) for v in self.all_vars]) / 1000000))
def add_loss(self):
"""Adds loss to the model. Sets "loss" field. initialize must have been called."""
hp = self._hparams
self.tower_before_loss = []
self.tower_after_loss = []
self.tower_stop_token_loss = []
self.tower_regularization_loss = []
self.tower_linear_loss = []
self.tower_loss = []
total_before_loss = 0
total_after_loss = 0
total_stop_token_loss = 0
total_regularization_loss = 0
total_linear_loss = 0
total_loss = 0
gpus = ["/gpu:{}".format(i) for i in
range(hp.tacotron_gpu_start_idx, hp.tacotron_gpu_start_idx + hp.tacotron_num_gpus)]
for i in range(hp.tacotron_num_gpus):
with tf.device(tf.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0",
worker_device=gpus[i])):
with tf.variable_scope("loss") as scope:
if hp.mask_decoder:
# Compute loss of predictions before postnet
before = MaskedMSE(self.tower_mel_targets[i], self.tower_decoder_output[i],
self.tower_targets_lengths[i],
hparams=self._hparams)
# Compute loss after postnet
after = MaskedMSE(self.tower_mel_targets[i], self.tower_mel_outputs[i],
self.tower_targets_lengths[i],
hparams=self._hparams)
# Compute <stop_token> loss (for learning dynamic generation stop)
stop_token_loss = MaskedSigmoidCrossEntropy(
self.tower_stop_token_targets[i],
self.tower_stop_token_prediction[i], self.tower_targets_lengths[i],
hparams=self._hparams)
# SV2TTS extra L1 loss (disabled for now)
# linear_loss = MaskedLinearLoss(self.tower_mel_targets[i],
# self.tower_decoder_output[i],
# self.tower_targets_lengths[i],
# hparams=self._hparams)
linear_loss = 0.
else:
# Compute loss of predictions before postnet
before = tf.losses.mean_squared_error(self.tower_mel_targets[i],
self.tower_decoder_output[i])
# Compute loss after postnet
after = tf.losses.mean_squared_error(self.tower_mel_targets[i],
self.tower_mel_outputs[i])
# Compute <stop_token> loss (for learning dynamic generation stop)
stop_token_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
labels=self.tower_stop_token_targets[i],
logits=self.tower_stop_token_prediction[i]))
# SV2TTS extra L1 loss
l1 = tf.abs(self.tower_mel_targets[i] - self.tower_decoder_output[i])
linear_loss = tf.reduce_mean(l1)
# if hp.predict_linear:
# # Compute linear loss
# # From https://github.com/keithito/tacotron/blob/tacotron2-work-in
# # -progress/models/tacotron.py
# # Prioritize loss for frequencies under 2000 Hz.
# l1 = tf.abs(self.tower_linear_targets[i] - self.tower_linear_outputs[i])
# n_priority_freq = int(2000 / (hp.sample_rate * 0.5) * hp.num_freq)
# linear_loss = 0.5 * tf.reduce_mean(l1) + 0.5 * tf.reduce_mean(
# l1[:, :, 0:n_priority_freq])
# else:
# linear_loss = 0.
# Compute the regularization weight
if hp.tacotron_scale_regularization:
reg_weight_scaler = 1. / (
2 * hp.max_abs_value) if hp.symmetric_mels else 1. / (
hp.max_abs_value)
reg_weight = hp.tacotron_reg_weight * reg_weight_scaler
else:
reg_weight = hp.tacotron_reg_weight
# Regularize variables
# Exclude all types of bias, RNN (Bengio et al., "On the difficulty of training recurrent neural networks"), embeddings and prediction projection layers.
# Note that we consider attention mechanism v_a weights as a prediction projection layer and we don't regularize it. (This gave better stability)
regularization = tf.add_n([tf.nn.l2_loss(v) for v in self.all_vars
if not (
"bias" in v.name or "Bias" in v.name or "_projection" in v.name or "inputs_embedding" in v.name
or "RNN" in v.name or "LSTM" in v.name)]) * reg_weight
# Compute final loss term
self.tower_before_loss.append(before)
self.tower_after_loss.append(after)
self.tower_stop_token_loss.append(stop_token_loss)
self.tower_regularization_loss.append(regularization)
self.tower_linear_loss.append(linear_loss)
loss = before + after + stop_token_loss + regularization + linear_loss
self.tower_loss.append(loss)
for i in range(hp.tacotron_num_gpus):
total_before_loss += self.tower_before_loss[i]
total_after_loss += self.tower_after_loss[i]
total_stop_token_loss += self.tower_stop_token_loss[i]
total_regularization_loss += self.tower_regularization_loss[i]
total_linear_loss += self.tower_linear_loss[i]
total_loss += self.tower_loss[i]
self.before_loss = total_before_loss / hp.tacotron_num_gpus
self.after_loss = total_after_loss / hp.tacotron_num_gpus
self.stop_token_loss = total_stop_token_loss / hp.tacotron_num_gpus
self.regularization_loss = total_regularization_loss / hp.tacotron_num_gpus
self.linear_loss = total_linear_loss / hp.tacotron_num_gpus
self.loss = total_loss / hp.tacotron_num_gpus
def add_optimizer(self, global_step):
"""Adds optimizer. Sets "gradients" and "optimize" fields. add_loss must have been called.
Args:
global_step: int32 scalar Tensor representing current global step in training
"""
hp = self._hparams
tower_gradients = []
# 1. Declare GPU Devices
gpus = ["/gpu:{}".format(i) for i in
range(hp.tacotron_gpu_start_idx, hp.tacotron_gpu_start_idx + hp.tacotron_num_gpus)]
grad_device = "/cpu:0" if hp.tacotron_num_gpus > 1 else gpus[0]
with tf.device(grad_device):
with tf.variable_scope("optimizer") as scope:
if hp.tacotron_decay_learning_rate:
self.decay_steps = hp.tacotron_decay_steps
self.decay_rate = hp.tacotron_decay_rate
self.learning_rate = self._learning_rate_decay(
hp.tacotron_initial_learning_rate, global_step)
else:
self.learning_rate = tf.convert_to_tensor(hp.tacotron_initial_learning_rate)
optimizer = tf.train.AdamOptimizer(self.learning_rate, hp.tacotron_adam_beta1,
hp.tacotron_adam_beta2, hp.tacotron_adam_epsilon)
# 2. Compute Gradient
for i in range(hp.tacotron_num_gpus):
# Device placement
with tf.device(tf.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0",
worker_device=gpus[i])):
# agg_loss += self.tower_loss[i]
with tf.variable_scope("optimizer") as scope:
gradients = optimizer.compute_gradients(self.tower_loss[i])
tower_gradients.append(gradients)
# 3. Average Gradient
with tf.device(grad_device):
avg_grads = []
vars = []
for grad_and_vars in zip(*tower_gradients):
# grads_vars = [(grad1, var), (grad2, var), ...]
grads = []
for g, _ in grad_and_vars:
expanded_g = tf.expand_dims(g, 0)
# Append on a "tower" dimension which we will average over below.
grads.append(expanded_g)
# Average over the "tower" dimension.
grad = tf.concat(axis=0, values=grads)
grad = tf.reduce_mean(grad, 0)
v = grad_and_vars[0][1]
avg_grads.append(grad)
vars.append(v)
self.gradients = avg_grads
# Just for caution
# https://github.com/Rayhane-mamah/Tacotron-2/issues/11
if hp.tacotron_clip_gradients:
clipped_gradients, _ = tf.clip_by_global_norm(avg_grads, 1.) # __mark 0.5 refer
else:
clipped_gradients = avg_grads
# Add dependency on UPDATE_OPS; otherwise batchnorm won't work correctly. See:
# https://github.com/tensorflow/tensorflow/issues/1122
with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
self.optimize = optimizer.apply_gradients(zip(clipped_gradients, vars),
global_step=global_step)
def _learning_rate_decay(self, init_lr, global_step):
#################################################################
# Narrow Exponential Decay:
# Phase 1: lr = 1e-3
# We only start learning rate decay after 50k steps
# Phase 2: lr in ]1e-5, 1e-3[
# decay reach minimal value at step 310k
# Phase 3: lr = 1e-5
# clip by minimal learning rate value (step > 310k)
#################################################################
hp = self._hparams
# Compute natural exponential decay
lr = tf.train.exponential_decay(init_lr,
global_step - hp.tacotron_start_decay,
# lr = 1e-3 at step 50k
self.decay_steps,
self.decay_rate, # lr = 1e-5 around step 310k
name="lr_exponential_decay")
# clip learning rate by max and min values (initial and final values)
return tf.minimum(tf.maximum(lr, hp.tacotron_final_learning_rate), init_lr)
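# Typical call order (illustrative sketch; the argument plumbing lives in the training
# script, which is not part of this diff):
#
#   model = Tacotron(hparams)
#   model.initialize(inputs, input_lengths, embed_targets, mel_targets=mel_targets,
#                    stop_token_targets=stop_token_targets, targets_lengths=targets_lengths,
#                    global_step=global_step, is_training=True, split_infos=split_infos)
#   model.add_loss()
#   model.add_optimizer(global_step)
#   # then evaluate model.optimize / model.loss inside a tf.Session training loop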

View File

@ -0,0 +1,120 @@
from multiprocessing.pool import Pool
from functools import partial
from itertools import chain
from pathlib import Path
from tqdm import tqdm
import numpy as np
from encoder import inference as encoder
from synthesizer.preprocess_speaker import preprocess_speaker_general
from synthesizer.preprocess_transcript import preprocess_transcript_aishell3, preprocess_transcript_magicdata
data_info = {
"aidatatang_200zh": {
"subfolders": ["corpus/train"],
"trans_filepath": "transcript/aidatatang_200_zh_transcript.txt",
"speak_func": preprocess_speaker_general
},
"magicdata": {
"subfolders": ["train"],
"trans_filepath": "train/TRANS.txt",
"speak_func": preprocess_speaker_general,
"transcript_func": preprocess_transcript_magicdata,
},
"aishell3":{
"subfolders": ["train/wav"],
"trans_filepath": "train/content.txt",
"speak_func": preprocess_speaker_general,
"transcript_func": preprocess_transcript_aishell3,
},
"data_aishell":{
"subfolders": ["wav/train"],
"trans_filepath": "transcript/aishell_transcript_v0.8.txt",
"speak_func": preprocess_speaker_general
}
}
def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
skip_existing: bool, hparams, no_alignments: bool,
dataset: str):
dataset_info = data_info[dataset]
# Gather the input directories
dataset_root = datasets_root.joinpath(dataset)
input_dirs = [dataset_root.joinpath(subfolder.strip()) for subfolder in dataset_info["subfolders"]]
print("\n ".join(map(str, ["Using data from:"] + input_dirs)))
assert all(input_dir.exists() for input_dir in input_dirs)
# Create the output directories for each output file type
out_dir.joinpath("mels").mkdir(exist_ok=True)
out_dir.joinpath("audio").mkdir(exist_ok=True)
# Create a metadata file
metadata_fpath = out_dir.joinpath("train.txt")
metadata_file = metadata_fpath.open("a" if skip_existing else "w", encoding="utf-8")
# Preprocess the dataset
dict_info = {}
transcript_dirs = dataset_root.joinpath(dataset_info["trans_filepath"])
assert transcript_dirs.exists(), str(transcript_dirs) + " does not exist."
with open(transcript_dirs, "r", encoding="utf-8") as dict_transcript:
# process with specific function for your dataset
if "transcript_func" in dataset_info:
dataset_info["transcript_func"](dict_info, dict_transcript)
else:
for v in dict_transcript:
if not v:
continue
v = v.strip().replace("\n","").replace("\t"," ").split(" ")
dict_info[v[0]] = " ".join(v[1:])
speaker_dirs = list(chain.from_iterable(input_dir.glob("*") for input_dir in input_dirs))
func = partial(dataset_info["speak_func"], out_dir=out_dir, skip_existing=skip_existing,
hparams=hparams, dict_info=dict_info, no_alignments=no_alignments)
job = Pool(n_processes).imap(func, speaker_dirs)
for speaker_metadata in tqdm(job, dataset, len(speaker_dirs), unit="speakers"):
for metadatum in speaker_metadata:
metadata_file.write("|".join(str(x) for x in metadatum) + "\n")
metadata_file.close()
# Verify the contents of the metadata file
with metadata_fpath.open("r", encoding="utf-8") as metadata_file:
metadata = [line.split("|") for line in metadata_file]
mel_frames = sum([int(m[4]) for m in metadata])
timesteps = sum([int(m[3]) for m in metadata])
sample_rate = hparams.sample_rate
hours = (timesteps / sample_rate) / 3600
print("The dataset consists of %d utterances, %d mel frames, %d audio timesteps (%.2f hours)." %
(len(metadata), mel_frames, timesteps, hours))
print("Max input length (text chars): %d" % max(len(m[5]) for m in metadata))
print("Max mel frames length: %d" % max(int(m[4]) for m in metadata))
print("Max audio timesteps length: %d" % max(int(m[3]) for m in metadata))
def embed_utterance(fpaths, encoder_model_fpath):
if not encoder.is_loaded():
encoder.load_model(encoder_model_fpath)
# Compute the speaker embedding of the utterance
wav_fpath, embed_fpath = fpaths
wav = np.load(wav_fpath)
wav = encoder.preprocess_wav(wav)
embed = encoder.embed_utterance(wav)
np.save(embed_fpath, embed, allow_pickle=False)
def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_processes: int):
wav_dir = synthesizer_root.joinpath("audio")
metadata_fpath = synthesizer_root.joinpath("train.txt")
assert wav_dir.exists() and metadata_fpath.exists()
embed_dir = synthesizer_root.joinpath("embeds")
embed_dir.mkdir(exist_ok=True)
# Gather the input wave filepath and the target output embed filepath
with metadata_fpath.open("r", encoding="utf-8") as metadata_file:
metadata = [line.split("|") for line in metadata_file]
fpaths = [(wav_dir.joinpath(m[0]), embed_dir.joinpath(m[2])) for m in metadata]
# TODO: improve on the multiprocessing, it's terrible. Disk I/O is the bottleneck here.
# Embed the utterances in separate threads
func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)
job = Pool(n_processes).imap(func, fpaths)
list(tqdm(job, "Embedding", len(fpaths), unit="utterances"))
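# --- Illustrative driver (a minimal sketch, not part of the file above) ---
# Assumes a synthesizer/hparams.py exposing an `hparams` object, a pretrained encoder
# checkpoint, and dataset folders laid out as in data_info; all paths are hypothetical.
if __name__ == "__main__":
    from synthesizer.hparams import hparams  # assumed to exist in this repo
    datasets_root = Path("datasets")                   # hypothetical location
    out_dir = Path("datasets/SV2TTS/synthesizer")      # hypothetical location
    preprocess_dataset(datasets_root, out_dir, n_processes=4, skip_existing=True,
                       hparams=hparams, no_alignments=False, dataset="aidatatang_200zh")
    create_embeddings(out_dir, Path("encoder/saved_models/pretrained.pt"), n_processes=4)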

View File

@ -0,0 +1,99 @@
import librosa
import numpy as np
from encoder import inference as encoder
from utils import logmmse
from synthesizer import audio
from pathlib import Path
from pypinyin import Style
from pypinyin.contrib.neutral_tone import NeutralToneWith5Mixin
from pypinyin.converter import DefaultConverter
from pypinyin.core import Pinyin
class PinyinConverter(NeutralToneWith5Mixin, DefaultConverter):
pass
pinyin = Pinyin(PinyinConverter()).pinyin
def _process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
skip_existing: bool, hparams):
## FOR REFERENCE:
# For you not to lose your head if you ever wish to change things here or implement your own
# synthesizer.
# - Both the audios and the mel spectrograms are saved as numpy arrays
# - There is no processing done to the audios that will be saved to disk beyond volume
# normalization (in split_on_silences)
# - However, pre-emphasis is applied to the audios before computing the mel spectrogram. This
# is why we re-apply it on the audio on the side of the vocoder.
# - Librosa pads the waveform before computing the mel spectrogram. Here, the waveform is saved
# without extra padding. This means that you won't have an exact relation between the length
# of the wav and of the mel spectrogram. See the vocoder data loader.
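    #   (A sketch, not an exact identity: the number of mel frames is roughly
    #   len(wav) // hparams.hop_size, so downstream code should derive per-frame
    #   audio spans from hop_size rather than assuming equal lengths.)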
# Skip existing utterances if needed
mel_fpath = out_dir.joinpath("mels", "mel-%s.npy" % basename)
wav_fpath = out_dir.joinpath("audio", "audio-%s.npy" % basename)
if skip_existing and mel_fpath.exists() and wav_fpath.exists():
return None
# Trim silence
if hparams.trim_silence:
wav = encoder.preprocess_wav(wav, normalize=False, trim_silence=True)
# Skip utterances that are too short
if len(wav) < hparams.utterance_min_duration * hparams.sample_rate:
return None
# Compute the mel spectrogram
mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
mel_frames = mel_spectrogram.shape[1]
# Skip utterances that are too long
if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
return None
# Write the spectrogram, embed and audio to disk
np.save(mel_fpath, mel_spectrogram.T, allow_pickle=False)
np.save(wav_fpath, wav, allow_pickle=False)
# Return a tuple describing this training example
return wav_fpath.name, mel_fpath.name, "embed-%s.npy" % basename, len(wav), mel_frames, text
def _split_on_silences(wav_fpath, words, hparams):
# Load the audio waveform
    wav, _ = librosa.load(wav_fpath, sr=hparams.sample_rate)
    wav = librosa.effects.trim(wav, top_db=40, frame_length=2048, hop_length=512)[0]
if hparams.rescale:
wav = wav / np.abs(wav).max() * hparams.rescaling_max
    # Denoise with logmmse; this step may not be strictly necessary here
if len(wav) > hparams.sample_rate*(0.3+0.1):
noise_wav = np.concatenate([wav[:int(hparams.sample_rate*0.15)],
wav[-int(hparams.sample_rate*0.15):]])
profile = logmmse.profile_noise(noise_wav, hparams.sample_rate)
wav = logmmse.denoise(wav, profile, eta=0)
resp = pinyin(words, style=Style.TONE3)
res = [v[0] for v in resp if v[0].strip()]
res = " ".join(res)
return wav, res
def preprocess_speaker_general(speaker_dir, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool):
metadata = []
extensions = ["*.wav", "*.flac", "*.mp3"]
for extension in extensions:
wav_fpath_list = speaker_dir.glob(extension)
# Iterate over each wav
for wav_fpath in wav_fpath_list:
words = dict_info.get(wav_fpath.name.split(".")[0])
            words = dict_info.get(wav_fpath.name) if not words else words  # fall back to the full file name
            if not words:
                print("No transcript found for %s, skipping" % wav_fpath.name)
continue
sub_basename = "%s_%02d" % (wav_fpath.name, 0)
wav, text = _split_on_silences(wav_fpath, words, hparams)
metadata.append(_process_utterance(wav, text, out_dir, sub_basename,
skip_existing, hparams))
return [m for m in metadata if m is not None]
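# Illustrative check of the pinyin conversion used in _split_on_silences (a sketch;
# the exact output depends on the installed pypinyin version):
#     pinyin("你好", style=Style.TONE3)   # -> [["ni3"], ["hao3"]]
# so the transcript text returned alongside the wav would be "ni3 hao3".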

View File

@ -0,0 +1,18 @@
def preprocess_transcript_aishell3(dict_info, dict_transcript):
for v in dict_transcript:
if not v:
continue
v = v.strip().replace("\n","").replace("\t"," ").split(" ")
transList = []
for i in range(2, len(v), 2):
transList.append(v[i])
dict_info[v[0]] = " ".join(transList)
def preprocess_transcript_magicdata(dict_info, dict_transcript):
for v in dict_transcript:
if not v:
continue
v = v.strip().replace("\n","").replace("\t"," ").split(" ")
dict_info[v[0]] = " ".join(v[2:])
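# Illustrative input/output for the two parsers above (file names are hypothetical):
#   AISHELL-3 content.txt line:  "SSB00050001.wav\t广 guang3 州 zhou1 女 nv3"
#     -> dict_info["SSB00050001.wav"] = "guang3 zhou1 nv3"    (pinyin tokens only)
#   MAGICDATA TRANS.txt line:    "A0001.wav\tSPK001\t今天 天气 不错"
#     -> dict_info["A0001.wav"] = "今天 天气 不错"             (fields from index 2 onwards)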

View File

@ -0,0 +1,92 @@
import torch
from torch.utils.data import DataLoader
from synthesizer.synthesizer_dataset import SynthesizerDataset, collate_synthesizer
from synthesizer.models.tacotron import Tacotron
from synthesizer.hparams import hparams_debug_string
from synthesizer.utils.text import text_to_sequence
from synthesizer.utils.symbols import symbols
import numpy as np
from pathlib import Path
from tqdm import tqdm
import sys
from synthesizer.infolog import log
import os
from synthesizer.tacotron2 import Tacotron2
import time
import tensorflow as tf
def run_eval(args, checkpoint_path, output_dir, hparams, sentences):
eval_dir = os.path.join(output_dir, "eval")
log_dir = os.path.join(output_dir, "logs-eval")
    # Create output path if it doesn't exist
os.makedirs(eval_dir, exist_ok=True)
os.makedirs(log_dir, exist_ok=True)
os.makedirs(os.path.join(log_dir, "wavs"), exist_ok=True)
os.makedirs(os.path.join(log_dir, "plots"), exist_ok=True)
log(hparams_debug_string())
synth = Tacotron2(checkpoint_path, hparams)
# Set inputs batch wise
sentences = [sentences[i: i + hparams.tacotron_synthesis_batch_size] for i
in range(0, len(sentences), hparams.tacotron_synthesis_batch_size)]
log("Starting Synthesis")
with open(os.path.join(eval_dir, "map.txt"), "w") as file:
for i, texts in enumerate(tqdm(sentences)):
start = time.time()
basenames = ["batch_{}_sentence_{}".format(i, j) for j in range(len(texts))]
mel_filenames, speaker_ids = synth.synthesize(texts, basenames, eval_dir, log_dir, None)
for elems in zip(texts, mel_filenames, speaker_ids):
file.write("|".join([str(x) for x in elems]) + "\n")
log("synthesized mel spectrograms at {}".format(eval_dir))
return eval_dir
def run_synthesis(in_dir, out_dir, model_dir, hparams):
# This generates ground truth-aligned mels for vocoder training
synth_dir = os.path.join(out_dir, "mels_gta")
os.makedirs(synth_dir, exist_ok=True)
metadata_filename = os.path.join(in_dir, "train.txt")
print(hparams_debug_string())
# Load the model in memory
weights_dir = os.path.join(model_dir, "taco_pretrained")
checkpoint_fpath = tf.train.get_checkpoint_state(weights_dir).model_checkpoint_path
synth = Tacotron2(checkpoint_fpath, hparams, gta=True)
# Load the metadata
with open(metadata_filename, encoding="utf-8") as f:
metadata = [line.strip().split("|") for line in f]
frame_shift_ms = hparams.hop_size / hparams.sample_rate
hours = sum([int(x[4]) for x in metadata]) * frame_shift_ms / 3600
print("Loaded metadata for {} examples ({:.2f} hours)".format(len(metadata), hours))
# Set inputs batch wise
metadata = [metadata[i: i + hparams.tacotron_synthesis_batch_size] for i in
range(0, len(metadata), hparams.tacotron_synthesis_batch_size)]
# TODO: come on big boy, fix this
# Quick and dirty fix to make sure that all batches have the same size
metadata = metadata[:-1]
print("Starting Synthesis")
mel_dir = os.path.join(in_dir, "mels")
embed_dir = os.path.join(in_dir, "embeds")
meta_out_fpath = os.path.join(out_dir, "synthesized.txt")
with open(meta_out_fpath, "w") as file:
for i, meta in enumerate(tqdm(metadata)):
texts = [m[5] for m in meta]
mel_filenames = [os.path.join(mel_dir, m[1]) for m in meta]
embed_filenames = [os.path.join(embed_dir, m[2]) for m in meta]
basenames = [os.path.basename(m).replace(".npy", "").replace("mel-", "")
for m in mel_filenames]
synth.synthesize(texts, basenames, synth_dir, None, mel_filenames, embed_filenames)
for elems in meta:
file.write("|".join([str(x) for x in elems]) + "\n")
print("Synthesized mel spectrograms at {}".format(synth_dir))
return meta_out_fpath
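# Minimal GTA-synthesis sketch (illustrative; the paths below are assumptions and
# hparams is assumed to come from synthesizer.hparams):
#     from synthesizer.hparams import hparams
#     run_synthesis(in_dir="datasets/SV2TTS/synthesizer",
#                   out_dir="synthesizer/saved_models/logs",
#                   model_dir="synthesizer/saved_models/logs", hparams=hparams)
# run_synthesis looks for a TensorFlow checkpoint under <model_dir>/taco_pretrained.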

View File

@ -0,0 +1,92 @@
import torch
from torch.utils.data import Dataset
import numpy as np
from pathlib import Path
from synthesizer.utils.text import text_to_sequence
class SynthesizerDataset(Dataset):
def __init__(self, metadata_fpath: Path, mel_dir: Path, embed_dir: Path, hparams):
print("Using inputs from:\n\t%s\n\t%s\n\t%s" % (metadata_fpath, mel_dir, embed_dir))
with metadata_fpath.open("r", encoding="utf-8") as metadata_file:
metadata = [line.split("|") for line in metadata_file]
mel_fnames = [x[1] for x in metadata if int(x[4])]
mel_fpaths = [mel_dir.joinpath(fname) for fname in mel_fnames]
embed_fnames = [x[2] for x in metadata if int(x[4])]
embed_fpaths = [embed_dir.joinpath(fname) for fname in embed_fnames]
self.samples_fpaths = list(zip(mel_fpaths, embed_fpaths))
self.samples_texts = [x[5].strip() for x in metadata if int(x[4])]
self.metadata = metadata
self.hparams = hparams
print("Found %d samples" % len(self.samples_fpaths))
def __getitem__(self, index):
# Sometimes index may be a list of 2 (not sure why this happens)
# If that is the case, return a single item corresponding to first element in index
        if isinstance(index, list):
            index = index[0]
mel_path, embed_path = self.samples_fpaths[index]
mel = np.load(mel_path).T.astype(np.float32)
# Load the embed
embed = np.load(embed_path)
# Get the text and clean it
text = text_to_sequence(self.samples_texts[index], self.hparams.tts_cleaner_names)
# Convert the list returned by text_to_sequence to a numpy array
text = np.asarray(text).astype(np.int32)
return text, mel.astype(np.float32), embed.astype(np.float32), index
def __len__(self):
return len(self.samples_fpaths)
def collate_synthesizer(batch):
# Text
x_lens = [len(x[0]) for x in batch]
max_x_len = max(x_lens)
chars = [pad1d(x[0], max_x_len) for x in batch]
chars = np.stack(chars)
# Mel spectrogram
spec_lens = [x[1].shape[-1] for x in batch]
max_spec_len = max(spec_lens) + 1
if max_spec_len % 2 != 0: # FIXIT: Hardcoded due to incompatibility with Windows (no lambda)
max_spec_len += 2 - max_spec_len % 2
# WaveRNN mel spectrograms are normalized to [0, 1] so zero padding adds silence
# By default, SV2TTS uses symmetric mels, where -1*max_abs_value is silence.
# if hparams.symmetric_mels:
# mel_pad_value = -1 * hparams.max_abs_value
# else:
# mel_pad_value = 0
mel_pad_value = -4 # FIXIT: Hardcoded due to incompatibility with Windows (no lambda)
mel = [pad2d(x[1], max_spec_len, pad_value=mel_pad_value) for x in batch]
mel = np.stack(mel)
# Speaker embedding (SV2TTS)
embeds = [x[2] for x in batch]
# Index (for vocoder preprocessing)
indices = [x[3] for x in batch]
# Convert all to tensor
chars = torch.tensor(chars).long()
mel = torch.tensor(mel)
embeds = torch.tensor(embeds)
return chars, mel, embeds, indices
def pad1d(x, max_len, pad_value=0):
return np.pad(x, (0, max_len - len(x)), mode="constant", constant_values=pad_value)
def pad2d(x, max_len, pad_value=0):
return np.pad(x, ((0, 0), (0, max_len - x.shape[-1])), mode="constant", constant_values=pad_value)
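# Illustrative DataLoader wiring for this dataset (a sketch; the data root below is
# an assumption and hparams is assumed to come from synthesizer.hparams):
if __name__ == "__main__":
    from torch.utils.data import DataLoader
    from synthesizer.hparams import hparams
    root = Path("datasets/SV2TTS/synthesizer")  # hypothetical preprocessed data root
    dataset = SynthesizerDataset(root.joinpath("train.txt"), root.joinpath("mels"),
                                 root.joinpath("embeds"), hparams)
    loader = DataLoader(dataset, batch_size=16, shuffle=True,
                        collate_fn=collate_synthesizer)
    chars, mels, embeds, indices = next(iter(loader))
    print(chars.shape, mels.shape, embeds.shape, len(indices))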

View File

@ -0,0 +1,238 @@
from synthesizer.utils.text import text_to_sequence
from synthesizer.infolog import log
from synthesizer.models import create_model
from synthesizer.utils import plot
from synthesizer import audio
import tensorflow as tf
import numpy as np
import os
class Tacotron2:
def __init__(self, checkpoint_path, hparams, gta=False, model_name="Tacotron"):
log("Constructing model: %s" % model_name)
#Force the batch size to be known in order to use attention masking in batch synthesis
inputs = tf.placeholder(tf.int32, (None, None), name="inputs")
input_lengths = tf.placeholder(tf.int32, (None,), name="input_lengths")
speaker_embeddings = tf.placeholder(tf.float32, (None, hparams.speaker_embedding_size),
name="speaker_embeddings")
targets = tf.placeholder(tf.float32, (None, None, hparams.num_mels), name="mel_targets")
split_infos = tf.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None), name="split_infos")
with tf.variable_scope("Tacotron_model") as scope:
self.model = create_model(model_name, hparams)
if gta:
self.model.initialize(inputs, input_lengths, speaker_embeddings, targets, gta=gta,
split_infos=split_infos)
else:
self.model.initialize(inputs, input_lengths, speaker_embeddings,
split_infos=split_infos)
self.mel_outputs = self.model.tower_mel_outputs
self.linear_outputs = self.model.tower_linear_outputs if (hparams.predict_linear and not gta) else None
self.alignments = self.model.tower_alignments
self.stop_token_prediction = self.model.tower_stop_token_prediction
self.targets = targets
self.gta = gta
self._hparams = hparams
#pad input sequences with the <pad_token> 0 ( _ )
self._pad = 0
        # explicitly set the padding to a value that doesn't originally exist in the spectrogram
        # to avoid any possible conflicts, without affecting the output range of the model too much
if hparams.symmetric_mels:
self._target_pad = -hparams.max_abs_value
else:
self._target_pad = 0.
self.inputs = inputs
self.input_lengths = input_lengths
self.speaker_embeddings = speaker_embeddings
self.targets = targets
self.split_infos = split_infos
log("Loading checkpoint: %s" % checkpoint_path)
#Memory allocation on the GPUs as needed
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.allow_soft_placement = True
self.session = tf.Session(config=config)
self.session.run(tf.global_variables_initializer())
saver = tf.train.Saver()
saver.restore(self.session, checkpoint_path)
def my_synthesize(self, speaker_embeds, texts):
"""
Lighter synthesis function that directly returns the mel spectrograms.
"""
print(texts)
# Prepare the input
cleaner_names = [x.strip() for x in self._hparams.cleaners.split(",")]
seqs = [np.asarray(text_to_sequence(text, cleaner_names)) for text in texts]
input_lengths = [len(seq) for seq in seqs]
input_seqs, max_seq_len = self._prepare_inputs(seqs)
split_infos = [[max_seq_len, 0, 0, 0]]
feed_dict = {
self.inputs: input_seqs,
self.input_lengths: np.asarray(input_lengths, dtype=np.int32),
self.split_infos: np.asarray(split_infos, dtype=np.int32),
self.speaker_embeddings: speaker_embeds
}
# Forward it
mels, alignments, stop_tokens = self.session.run(
[self.mel_outputs, self.alignments, self.stop_token_prediction],
feed_dict=feed_dict)
mels, alignments, stop_tokens = list(mels[0]), alignments[0], stop_tokens[0]
# Trim the output
for i in range(len(mels)):
try:
target_length = list(np.round(stop_tokens[i])).index(1)
mels[i] = mels[i][:target_length, :]
except ValueError:
# If no token is generated, we simply do not trim the output
continue
return [mel.T for mel in mels], alignments
def synthesize(self, texts, basenames, out_dir, log_dir, mel_filenames, embed_filenames):
hparams = self._hparams
cleaner_names = [x.strip() for x in hparams.cleaners.split(",")]
assert 0 == len(texts) % self._hparams.tacotron_num_gpus
seqs = [np.asarray(text_to_sequence(text, cleaner_names)) for text in texts]
input_lengths = [len(seq) for seq in seqs]
size_per_device = len(seqs) // self._hparams.tacotron_num_gpus
#Pad inputs according to each GPU max length
input_seqs = None
split_infos = []
for i in range(self._hparams.tacotron_num_gpus):
device_input = seqs[size_per_device*i: size_per_device*(i+1)]
device_input, max_seq_len = self._prepare_inputs(device_input)
input_seqs = np.concatenate((input_seqs, device_input), axis=1) if input_seqs is not None else device_input
split_infos.append([max_seq_len, 0, 0, 0])
feed_dict = {
self.inputs: input_seqs,
self.input_lengths: np.asarray(input_lengths, dtype=np.int32),
}
if self.gta:
np_targets = [np.load(mel_filename) for mel_filename in mel_filenames]
target_lengths = [len(np_target) for np_target in np_targets]
#pad targets according to each GPU max length
target_seqs = None
for i in range(self._hparams.tacotron_num_gpus):
device_target = np_targets[size_per_device*i: size_per_device*(i+1)]
device_target, max_target_len = self._prepare_targets(device_target, self._hparams.outputs_per_step)
target_seqs = np.concatenate((target_seqs, device_target), axis=1) if target_seqs is not None else device_target
split_infos[i][1] = max_target_len #Not really used but setting it in case for future development maybe?
feed_dict[self.targets] = target_seqs
assert len(np_targets) == len(texts)
feed_dict[self.split_infos] = np.asarray(split_infos, dtype=np.int32)
feed_dict[self.speaker_embeddings] = [np.load(f) for f in embed_filenames]
if self.gta or not hparams.predict_linear:
mels, alignments, stop_tokens = self.session.run(
[self.mel_outputs, self.alignments, self.stop_token_prediction],
feed_dict=feed_dict)
#Linearize outputs (1D arrays)
mels = [mel for gpu_mels in mels for mel in gpu_mels]
alignments = [align for gpu_aligns in alignments for align in gpu_aligns]
stop_tokens = [token for gpu_token in stop_tokens for token in gpu_token]
if not self.gta:
#Natural batch synthesis
#Get Mel lengths for the entire batch from stop_tokens predictions
target_lengths = self._get_output_lengths(stop_tokens)
#Take off the batch wise padding
mels = [mel[:target_length, :] for mel, target_length in zip(mels, target_lengths)]
assert len(mels) == len(texts)
else:
linears, mels, alignments, stop_tokens = self.session.run(
[self.linear_outputs, self.mel_outputs, self.alignments,
self.stop_token_prediction],
feed_dict=feed_dict)
#Linearize outputs (1D arrays)
linears = [linear for gpu_linear in linears for linear in gpu_linear]
mels = [mel for gpu_mels in mels for mel in gpu_mels]
alignments = [align for gpu_aligns in alignments for align in gpu_aligns]
stop_tokens = [token for gpu_token in stop_tokens for token in gpu_token]
#Natural batch synthesis
#Get Mel/Linear lengths for the entire batch from stop_tokens predictions
# target_lengths = self._get_output_lengths(stop_tokens)
target_lengths = [9999]
#Take off the batch wise padding
mels = [mel[:target_length, :] for mel, target_length in zip(mels, target_lengths)]
linears = [linear[:target_length, :] for linear, target_length in zip(linears, target_lengths)]
assert len(mels) == len(linears) == len(texts)
if basenames is None:
            raise NotImplementedError()
saved_mels_paths = []
for i, mel in enumerate(mels):
# Write the spectrogram to disk
# Note: outputs mel-spectrogram files and target ones have same names, just different folders
mel_filename = os.path.join(out_dir, "mel-{}.npy".format(basenames[i]))
np.save(mel_filename, mel, allow_pickle=False)
saved_mels_paths.append(mel_filename)
if log_dir is not None:
#save wav (mel -> wav)
wav = audio.inv_mel_spectrogram(mel.T, hparams)
audio.save_wav(wav, os.path.join(log_dir, "wavs/wav-{}-mel.wav".format(basenames[i])), sr=hparams.sample_rate)
#save alignments
plot.plot_alignment(alignments[i], os.path.join(log_dir, "plots/alignment-{}.png".format(basenames[i])),
title="{}".format(texts[i]), split_title=True, max_len=target_lengths[i])
#save mel spectrogram plot
plot.plot_spectrogram(mel, os.path.join(log_dir, "plots/mel-{}.png".format(basenames[i])),
title="{}".format(texts[i]), split_title=True)
if hparams.predict_linear:
#save wav (linear -> wav)
wav = audio.inv_linear_spectrogram(linears[i].T, hparams)
audio.save_wav(wav, os.path.join(log_dir, "wavs/wav-{}-linear.wav".format(basenames[i])), sr=hparams.sample_rate)
#save linear spectrogram plot
plot.plot_spectrogram(linears[i], os.path.join(log_dir, "plots/linear-{}.png".format(basenames[i])),
title="{}".format(texts[i]), split_title=True, auto_aspect=True)
return saved_mels_paths
def _round_up(self, x, multiple):
remainder = x % multiple
return x if remainder == 0 else x + multiple - remainder
def _prepare_inputs(self, inputs):
max_len = max([len(x) for x in inputs])
return np.stack([self._pad_input(x, max_len) for x in inputs]), max_len
def _pad_input(self, x, length):
return np.pad(x, (0, length - x.shape[0]), mode="constant", constant_values=self._pad)
def _prepare_targets(self, targets, alignment):
max_len = max([len(t) for t in targets])
data_len = self._round_up(max_len, alignment)
return np.stack([self._pad_target(t, data_len) for t in targets]), data_len
def _pad_target(self, t, length):
return np.pad(t, [(0, length - t.shape[0]), (0, 0)], mode="constant", constant_values=self._target_pad)
def _get_output_lengths(self, stop_tokens):
        # Determine each mel length from the stop token predictions (len = first occurrence of 1 in each stop_tokens row)
output_lengths = [row.index(1) for row in np.round(stop_tokens).tolist()]
return output_lengths
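# Minimal inference sketch for my_synthesize (illustrative; the checkpoint and embed
# paths are assumptions, and hparams is assumed to come from synthesizer.hparams):
#     from synthesizer.hparams import hparams
#     synth = Tacotron2("synthesizer/saved_models/logs/taco_pretrained/tacotron_model.ckpt-0", hparams)
#     embed = np.load("embed-example.npy")           # one (speaker_embedding_size,) vector
#     specs, alignments = synth.my_synthesize([embed], ["ni3 hao3 shi4 jie4"])
#     specs[0].shape                                 # -> (num_mels, n_frames)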

View File

@ -0,0 +1,393 @@
from synthesizer.utils.symbols import symbols
from synthesizer.utils.text import sequence_to_text
from synthesizer.hparams import hparams_debug_string
from synthesizer.feeder import Feeder
from synthesizer.models import create_model
from synthesizer.utils import ValueWindow, plot
from synthesizer import infolog, audio
from datetime import datetime
from tqdm import tqdm
import tensorflow as tf
import numpy as np
import traceback
import time
import os
log = infolog.log
def add_embedding_stats(summary_writer, embedding_names, paths_to_meta, checkpoint_path):
# Create tensorboard projector
config = tf.contrib.tensorboard.plugins.projector.ProjectorConfig()
config.model_checkpoint_path = checkpoint_path
for embedding_name, path_to_meta in zip(embedding_names, paths_to_meta):
# Initialize config
embedding = config.embeddings.add()
        # Specify the embedding variable and the metadata
embedding.tensor_name = embedding_name
embedding.metadata_path = path_to_meta
# Project the embeddings to space dimensions for visualization
tf.contrib.tensorboard.plugins.projector.visualize_embeddings(summary_writer, config)
def add_train_stats(model, hparams):
with tf.variable_scope("stats") as scope:
for i in range(hparams.tacotron_num_gpus):
tf.summary.histogram("mel_outputs %d" % i, model.tower_mel_outputs[i])
tf.summary.histogram("mel_targets %d" % i, model.tower_mel_targets[i])
tf.summary.scalar("before_loss", model.before_loss)
tf.summary.scalar("after_loss", model.after_loss)
if hparams.predict_linear:
tf.summary.scalar("linear_loss", model.linear_loss)
for i in range(hparams.tacotron_num_gpus):
tf.summary.histogram("mel_outputs %d" % i, model.tower_linear_outputs[i])
tf.summary.histogram("mel_targets %d" % i, model.tower_linear_targets[i])
tf.summary.scalar("regularization_loss", model.regularization_loss)
tf.summary.scalar("stop_token_loss", model.stop_token_loss)
tf.summary.scalar("loss", model.loss)
tf.summary.scalar("learning_rate", model.learning_rate) # Control learning rate decay speed
if hparams.tacotron_teacher_forcing_mode == "scheduled":
tf.summary.scalar("teacher_forcing_ratio", model.ratio) # Control teacher forcing
# ratio decay when mode = "scheduled"
gradient_norms = [tf.norm(grad) for grad in model.gradients]
tf.summary.histogram("gradient_norm", gradient_norms)
tf.summary.scalar("max_gradient_norm", tf.reduce_max(gradient_norms)) # visualize
# gradients (in case of explosion)
return tf.summary.merge_all()
def add_eval_stats(summary_writer, step, linear_loss, before_loss, after_loss, stop_token_loss,
loss):
values = [
tf.Summary.Value(tag="Tacotron_eval_model/eval_stats/eval_before_loss",
simple_value=before_loss),
tf.Summary.Value(tag="Tacotron_eval_model/eval_stats/eval_after_loss",
simple_value=after_loss),
tf.Summary.Value(tag="Tacotron_eval_model/eval_stats/stop_token_loss",
simple_value=stop_token_loss),
tf.Summary.Value(tag="Tacotron_eval_model/eval_stats/eval_loss", simple_value=loss),
]
if linear_loss is not None:
values.append(tf.Summary.Value(tag="Tacotron_eval_model/eval_stats/eval_linear_loss",
simple_value=linear_loss))
test_summary = tf.Summary(value=values)
summary_writer.add_summary(test_summary, step)
def time_string():
return datetime.now().strftime("%Y-%m-%d %H:%M")
def model_train_mode(args, feeder, hparams, global_step):
with tf.variable_scope("Tacotron_model", reuse=tf.AUTO_REUSE) as scope:
model = create_model("Tacotron", hparams)
model.initialize(feeder.inputs, feeder.input_lengths, feeder.speaker_embeddings,
feeder.mel_targets, feeder.token_targets,
targets_lengths=feeder.targets_lengths, global_step=global_step,
is_training=True, split_infos=feeder.split_infos)
model.add_loss()
model.add_optimizer(global_step)
stats = add_train_stats(model, hparams)
return model, stats
def model_test_mode(args, feeder, hparams, global_step):
with tf.variable_scope("Tacotron_model", reuse=tf.AUTO_REUSE) as scope:
model = create_model("Tacotron", hparams)
model.initialize(feeder.eval_inputs, feeder.eval_input_lengths,
feeder.eval_speaker_embeddings, feeder.eval_mel_targets,
feeder.eval_token_targets, targets_lengths=feeder.eval_targets_lengths,
global_step=global_step, is_training=False, is_evaluating=True,
split_infos=feeder.eval_split_infos)
model.add_loss()
return model
def train(log_dir, args, hparams):
save_dir = os.path.join(log_dir, "taco_pretrained")
plot_dir = os.path.join(log_dir, "plots")
wav_dir = os.path.join(log_dir, "wavs")
mel_dir = os.path.join(log_dir, "mel-spectrograms")
eval_dir = os.path.join(log_dir, "eval-dir")
eval_plot_dir = os.path.join(eval_dir, "plots")
eval_wav_dir = os.path.join(eval_dir, "wavs")
tensorboard_dir = os.path.join(log_dir, "tacotron_events")
meta_folder = os.path.join(log_dir, "metas")
os.makedirs(save_dir, exist_ok=True)
os.makedirs(plot_dir, exist_ok=True)
os.makedirs(wav_dir, exist_ok=True)
os.makedirs(mel_dir, exist_ok=True)
os.makedirs(eval_dir, exist_ok=True)
os.makedirs(eval_plot_dir, exist_ok=True)
os.makedirs(eval_wav_dir, exist_ok=True)
os.makedirs(tensorboard_dir, exist_ok=True)
os.makedirs(meta_folder, exist_ok=True)
checkpoint_fpath = os.path.join(save_dir, "tacotron_model.ckpt")
metadat_fpath = os.path.join(args.synthesizer_root, "train.txt")
log("Checkpoint path: {}".format(checkpoint_fpath))
log("Loading training data from: {}".format(metadat_fpath))
log("Using model: Tacotron")
log(hparams_debug_string())
# Start by setting a seed for repeatability
tf.set_random_seed(hparams.tacotron_random_seed)
# Set up data feeder
coord = tf.train.Coordinator()
with tf.variable_scope("datafeeder") as scope:
feeder = Feeder(coord, metadat_fpath, hparams)
# Set up model:
global_step = tf.Variable(0, name="global_step", trainable=False)
model, stats = model_train_mode(args, feeder, hparams, global_step)
eval_model = model_test_mode(args, feeder, hparams, global_step)
# Embeddings metadata
char_embedding_meta = os.path.join(meta_folder, "CharacterEmbeddings.tsv")
if not os.path.isfile(char_embedding_meta):
with open(char_embedding_meta, "w", encoding="utf-8") as f:
for symbol in symbols:
if symbol == " ":
symbol = "\\s" # For visual purposes, swap space with \s
f.write("{}\n".format(symbol))
char_embedding_meta = char_embedding_meta.replace(log_dir, "..")
# Book keeping
step = 0
time_window = ValueWindow(100)
loss_window = ValueWindow(100)
saver = tf.train.Saver(max_to_keep=5)
log("Tacotron training set to a maximum of {} steps".format(args.tacotron_train_steps))
# Memory allocation on the GPU as needed
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.allow_soft_placement = True
# Train
with tf.Session(config=config) as sess:
try:
summary_writer = tf.summary.FileWriter(tensorboard_dir, sess.graph)
sess.run(tf.global_variables_initializer())
# saved model restoring
if args.restore:
# Restore saved model if the user requested it, default = True
try:
checkpoint_state = tf.train.get_checkpoint_state(save_dir)
if checkpoint_state and checkpoint_state.model_checkpoint_path:
log("Loading checkpoint {}".format(checkpoint_state.model_checkpoint_path),
slack=True)
saver.restore(sess, checkpoint_state.model_checkpoint_path)
else:
log("No model to load at {}".format(save_dir), slack=True)
saver.save(sess, checkpoint_fpath, global_step=global_step)
except tf.errors.OutOfRangeError as e:
log("Cannot restore checkpoint: {}".format(e), slack=True)
else:
log("Starting new training!", slack=True)
saver.save(sess, checkpoint_fpath, global_step=global_step)
# initializing feeder
feeder.start_threads(sess)
# Training loop
while not coord.should_stop() and step < args.tacotron_train_steps:
start_time = time.time()
step, loss, opt = sess.run([global_step, model.loss, model.optimize])
time_window.append(time.time() - start_time)
loss_window.append(loss)
message = "Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]".format(
step, time_window.average, loss, loss_window.average)
log(message, end="\r", slack=(step % args.checkpoint_interval == 0))
print(message)
if loss > 100 or np.isnan(loss):
log("Loss exploded to {:.5f} at step {}".format(loss, step))
raise Exception("Loss exploded")
if step % args.summary_interval == 0:
log("\nWriting summary at step {}".format(step))
summary_writer.add_summary(sess.run(stats), step)
if step % args.eval_interval == 0:
# Run eval and save eval stats
log("\nRunning evaluation at step {}".format(step))
eval_losses = []
before_losses = []
after_losses = []
stop_token_losses = []
linear_losses = []
linear_loss = None
if hparams.predict_linear:
for i in tqdm(range(feeder.test_steps)):
eloss, before_loss, after_loss, stop_token_loss, linear_loss, mel_p, \
mel_t, t_len, align, lin_p, lin_t = sess.run(
[
eval_model.tower_loss[0], eval_model.tower_before_loss[0],
eval_model.tower_after_loss[0],
eval_model.tower_stop_token_loss[0],
eval_model.tower_linear_loss[0],
eval_model.tower_mel_outputs[0][0],
eval_model.tower_mel_targets[0][0],
eval_model.tower_targets_lengths[0][0],
eval_model.tower_alignments[0][0],
eval_model.tower_linear_outputs[0][0],
eval_model.tower_linear_targets[0][0],
])
eval_losses.append(eloss)
before_losses.append(before_loss)
after_losses.append(after_loss)
stop_token_losses.append(stop_token_loss)
linear_losses.append(linear_loss)
linear_loss = sum(linear_losses) / len(linear_losses)
wav = audio.inv_linear_spectrogram(lin_p.T, hparams)
audio.save_wav(wav, os.path.join(eval_wav_dir,
"step-{}-eval-wave-from-linear.wav".format(
step)), sr=hparams.sample_rate)
else:
for i in tqdm(range(feeder.test_steps)):
eloss, before_loss, after_loss, stop_token_loss, mel_p, mel_t, t_len, \
align = sess.run(
[
eval_model.tower_loss[0], eval_model.tower_before_loss[0],
eval_model.tower_after_loss[0],
eval_model.tower_stop_token_loss[0],
eval_model.tower_mel_outputs[0][0],
eval_model.tower_mel_targets[0][0],
eval_model.tower_targets_lengths[0][0],
eval_model.tower_alignments[0][0]
])
eval_losses.append(eloss)
before_losses.append(before_loss)
after_losses.append(after_loss)
stop_token_losses.append(stop_token_loss)
eval_loss = sum(eval_losses) / len(eval_losses)
before_loss = sum(before_losses) / len(before_losses)
after_loss = sum(after_losses) / len(after_losses)
stop_token_loss = sum(stop_token_losses) / len(stop_token_losses)
log("Saving eval log to {}..".format(eval_dir))
# Save some log to monitor model improvement on same unseen sequence
wav = audio.inv_mel_spectrogram(mel_p.T, hparams)
audio.save_wav(wav, os.path.join(eval_wav_dir,
"step-{}-eval-wave-from-mel.wav".format(step)),
sr=hparams.sample_rate)
plot.plot_alignment(align, os.path.join(eval_plot_dir,
"step-{}-eval-align.png".format(step)),
title="{}, {}, step={}, loss={:.5f}".format("Tacotron",
time_string(),
step,
eval_loss),
max_len=t_len // hparams.outputs_per_step)
                    plot.plot_spectrogram(mel_p,
                                          os.path.join(eval_plot_dir, "step-{}-eval-mel-spectrogram.png".format(step)),
title="{}, {}, step={}, loss={:.5f}".format("Tacotron",
time_string(),
step,
eval_loss),
target_spectrogram=mel_t,
max_len=t_len)
if hparams.predict_linear:
plot.plot_spectrogram(lin_p, os.path.join(eval_plot_dir,
"step-{}-eval-linear-spectrogram.png".format(
step)),
title="{}, {}, step={}, loss={:.5f}".format(
"Tacotron", time_string(), step, eval_loss),
target_spectrogram=lin_t,
max_len=t_len, auto_aspect=True)
log("Eval loss for global step {}: {:.3f}".format(step, eval_loss))
log("Writing eval summary!")
add_eval_stats(summary_writer, step, linear_loss, before_loss, after_loss,
stop_token_loss, eval_loss)
if step % args.checkpoint_interval == 0 or step == args.tacotron_train_steps or \
step == 300:
# Save model and current global step
saver.save(sess, checkpoint_fpath, global_step=global_step)
log("\nSaving alignment, Mel-Spectrograms and griffin-lim inverted waveform..")
input_seq, mel_prediction, alignment, target, target_length = sess.run([
model.tower_inputs[0][0],
model.tower_mel_outputs[0][0],
model.tower_alignments[0][0],
model.tower_mel_targets[0][0],
model.tower_targets_lengths[0][0],
])
# save predicted mel spectrogram to disk (debug)
mel_filename = "mel-prediction-step-{}.npy".format(step)
np.save(os.path.join(mel_dir, mel_filename), mel_prediction.T,
allow_pickle=False)
# save griffin lim inverted wav for debug (mel -> wav)
wav = audio.inv_mel_spectrogram(mel_prediction.T, hparams)
audio.save_wav(wav,
os.path.join(wav_dir, "step-{}-wave-from-mel.wav".format(step)),
sr=hparams.sample_rate)
# save alignment plot to disk (control purposes)
plot.plot_alignment(alignment,
os.path.join(plot_dir, "step-{}-align.png".format(step)),
title="{}, {}, step={}, loss={:.5f}".format("Tacotron",
time_string(),
step, loss),
max_len=target_length // hparams.outputs_per_step)
# save real and predicted mel-spectrogram plot to disk (control purposes)
plot.plot_spectrogram(mel_prediction, os.path.join(plot_dir,
"step-{}-mel-spectrogram.png".format(
step)),
title="{}, {}, step={}, loss={:.5f}".format("Tacotron",
time_string(),
step, loss),
target_spectrogram=target,
max_len=target_length)
log("Input at step {}: {}".format(step, sequence_to_text(input_seq)))
if step % args.embedding_interval == 0 or step == args.tacotron_train_steps or step == 1:
# Get current checkpoint state
checkpoint_state = tf.train.get_checkpoint_state(save_dir)
# Update Projector
log("\nSaving Model Character Embeddings visualization..")
add_embedding_stats(summary_writer, [model.embedding_table.name],
[char_embedding_meta],
checkpoint_state.model_checkpoint_path)
log("Tacotron Character embeddings have been updated on tensorboard!")
log("Tacotron training complete after {} global steps!".format(
args.tacotron_train_steps), slack=True)
return save_dir
except Exception as e:
log("Exiting due to exception: {}".format(e), slack=True)
traceback.print_exc()
coord.request_stop(e)
def tacotron_train(args, log_dir, hparams):
return train(log_dir, args, hparams)
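# Sketch of the argument namespace that train() actually reads (illustrative values;
# the real entry point is expected to build this with argparse, and hparams is the
# same object passed in by the caller):
#     import argparse
#     args = argparse.Namespace(
#         synthesizer_root="datasets/SV2TTS/synthesizer",   # folder containing train.txt
#         restore=True, tacotron_train_steps=500000,
#         summary_interval=250, eval_interval=5000,
#         checkpoint_interval=2000, embedding_interval=10000)
#     tacotron_train(args, log_dir="synthesizer/saved_models/logs", hparams=hparams)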

View File

@ -0,0 +1,45 @@
import torch
_output_ref = None
_replicas_ref = None
def data_parallel_workaround(model, *input):
global _output_ref
global _replicas_ref
device_ids = list(range(torch.cuda.device_count()))
output_device = device_ids[0]
replicas = torch.nn.parallel.replicate(model, device_ids)
# input.shape = (num_args, batch, ...)
inputs = torch.nn.parallel.scatter(input, device_ids)
# inputs.shape = (num_gpus, num_args, batch/num_gpus, ...)
replicas = replicas[:len(inputs)]
outputs = torch.nn.parallel.parallel_apply(replicas, inputs)
y_hat = torch.nn.parallel.gather(outputs, output_device)
_output_ref = outputs
_replicas_ref = replicas
return y_hat
class ValueWindow():
def __init__(self, window_size=100):
self._window_size = window_size
self._values = []
def append(self, x):
self._values = self._values[-(self._window_size - 1):] + [x]
@property
def sum(self):
return sum(self._values)
@property
def count(self):
return len(self._values)
@property
def average(self):
return self.sum / max(1, self.count)
def reset(self):
self._values = []
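# Tiny usage sketch for ValueWindow (illustrative):
#     w = ValueWindow(window_size=3)
#     for v in (1.0, 2.0, 3.0, 4.0):
#         w.append(v)
#     w.average   # -> 3.0, the mean of the last three values kept in the window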

View File

@ -0,0 +1,62 @@
import re
valid_symbols = [
"AA", "AA0", "AA1", "AA2", "AE", "AE0", "AE1", "AE2", "AH", "AH0", "AH1", "AH2",
"AO", "AO0", "AO1", "AO2", "AW", "AW0", "AW1", "AW2", "AY", "AY0", "AY1", "AY2",
"B", "CH", "D", "DH", "EH", "EH0", "EH1", "EH2", "ER", "ER0", "ER1", "ER2", "EY",
"EY0", "EY1", "EY2", "F", "G", "HH", "IH", "IH0", "IH1", "IH2", "IY", "IY0", "IY1",
"IY2", "JH", "K", "L", "M", "N", "NG", "OW", "OW0", "OW1", "OW2", "OY", "OY0",
"OY1", "OY2", "P", "R", "S", "SH", "T", "TH", "UH", "UH0", "UH1", "UH2", "UW",
"UW0", "UW1", "UW2", "V", "W", "Y", "Z", "ZH"
]
_valid_symbol_set = set(valid_symbols)
class CMUDict:
"""Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict"""
def __init__(self, file_or_path, keep_ambiguous=True):
if isinstance(file_or_path, str):
with open(file_or_path, encoding="latin-1") as f:
entries = _parse_cmudict(f)
else:
entries = _parse_cmudict(file_or_path)
if not keep_ambiguous:
entries = {word: pron for word, pron in entries.items() if len(pron) == 1}
self._entries = entries
def __len__(self):
return len(self._entries)
def lookup(self, word):
"""Returns list of ARPAbet pronunciations of the given word."""
return self._entries.get(word.upper())
_alt_re = re.compile(r"\([0-9]+\)")
def _parse_cmudict(file):
cmudict = {}
for line in file:
if len(line) and (line[0] >= "A" and line[0] <= "Z" or line[0] == "'"):
            parts = line.split("  ")  # word and pronunciation are separated by two spaces in cmudict
word = re.sub(_alt_re, "", parts[0])
pronunciation = _get_pronunciation(parts[1])
if pronunciation:
if word in cmudict:
cmudict[word].append(pronunciation)
else:
cmudict[word] = [pronunciation]
return cmudict
def _get_pronunciation(s):
parts = s.strip().split(" ")
for part in parts:
if part not in _valid_symbol_set:
return None
return " ".join(parts)

View File

@ -0,0 +1,88 @@
"""
Cleaners are transformations that run over the input text at both training and eval time.
Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
hyperparameter. Some cleaners are English-specific. You'll typically want to use:
1. "english_cleaners" for English text
2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
the Unidecode library (https://pypi.python.org/pypi/Unidecode)
3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
the symbols in symbols.py to match your data).
"""
import re
from unidecode import unidecode
from .numbers import normalize_numbers
# Regular expression matching whitespace:
_whitespace_re = re.compile(r"\s+")
# List of (regular expression, replacement) pairs for abbreviations:
_abbreviations = [(re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) for x in [
("mrs", "misess"),
("mr", "mister"),
("dr", "doctor"),
("st", "saint"),
("co", "company"),
("jr", "junior"),
("maj", "major"),
("gen", "general"),
("drs", "doctors"),
("rev", "reverend"),
("lt", "lieutenant"),
("hon", "honorable"),
("sgt", "sergeant"),
("capt", "captain"),
("esq", "esquire"),
("ltd", "limited"),
("col", "colonel"),
("ft", "fort"),
]]
def expand_abbreviations(text):
for regex, replacement in _abbreviations:
text = re.sub(regex, replacement, text)
return text
def expand_numbers(text):
return normalize_numbers(text)
def lowercase(text):
"""lowercase input tokens."""
return text.lower()
def collapse_whitespace(text):
return re.sub(_whitespace_re, " ", text)
def convert_to_ascii(text):
return unidecode(text)
def basic_cleaners(text):
"""Basic pipeline that lowercases and collapses whitespace without transliteration."""
text = lowercase(text)
text = collapse_whitespace(text)
return text
def transliteration_cleaners(text):
"""Pipeline for non-English text that transliterates to ASCII."""
text = convert_to_ascii(text)
text = lowercase(text)
text = collapse_whitespace(text)
return text
def english_cleaners(text):
"""Pipeline for English text, including number and abbreviation expansion."""
text = convert_to_ascii(text)
text = lowercase(text)
text = expand_numbers(text)
text = expand_abbreviations(text)
text = collapse_whitespace(text)
return text
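# Illustrative pass through english_cleaners (a sketch; the exact wording of the
# number expansion depends on the installed inflect version):
#     english_cleaners("Dr. Smith bought 2 books for $3.50.")
#     # -> "doctor smith bought two books for three dollars, fifty cents."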

View File

@ -0,0 +1,68 @@
import re
import inflect
_inflect = inflect.engine()
_comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
_decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)")
_pounds_re = re.compile(r"£([0-9\,]*[0-9]+)")
_dollars_re = re.compile(r"\$([0-9\.\,]*[0-9]+)")
_ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")
_number_re = re.compile(r"[0-9]+")
def _remove_commas(m):
return m.group(1).replace(",", "")
def _expand_decimal_point(m):
return m.group(1).replace(".", " point ")
def _expand_dollars(m):
match = m.group(1)
parts = match.split(".")
if len(parts) > 2:
return match + " dollars" # Unexpected format
dollars = int(parts[0]) if parts[0] else 0
cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
if dollars and cents:
dollar_unit = "dollar" if dollars == 1 else "dollars"
cent_unit = "cent" if cents == 1 else "cents"
return "%s %s, %s %s" % (dollars, dollar_unit, cents, cent_unit)
elif dollars:
dollar_unit = "dollar" if dollars == 1 else "dollars"
return "%s %s" % (dollars, dollar_unit)
elif cents:
cent_unit = "cent" if cents == 1 else "cents"
return "%s %s" % (cents, cent_unit)
else:
return "zero dollars"
def _expand_ordinal(m):
return _inflect.number_to_words(m.group(0))
def _expand_number(m):
num = int(m.group(0))
if num > 1000 and num < 3000:
if num == 2000:
return "two thousand"
elif num > 2000 and num < 2010:
return "two thousand " + _inflect.number_to_words(num % 100)
elif num % 100 == 0:
return _inflect.number_to_words(num // 100) + " hundred"
else:
return _inflect.number_to_words(num, andword="", zero="oh", group=2).replace(", ", " ")
else:
return _inflect.number_to_words(num, andword="")
def normalize_numbers(text):
text = re.sub(_comma_number_re, _remove_commas, text)
text = re.sub(_pounds_re, r"\1 pounds", text)
text = re.sub(_dollars_re, _expand_dollars, text)
text = re.sub(_decimal_number_re, _expand_decimal_point, text)
text = re.sub(_ordinal_re, _expand_ordinal, text)
text = re.sub(_number_re, _expand_number, text)
return text
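# Year-style handling in _expand_number, following the branches above (illustrative):
#     normalize_numbers("in 2000")   # -> "in two thousand"
#     normalize_numbers("in 2005")   # -> "in two thousand five"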

View File

@ -0,0 +1,76 @@
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import numpy as np
def split_title_line(title_text, max_words=5):
"""
A function that splits any string based on specific character
(returning it with the string), with maximum number of words on it
"""
seq = title_text.split()
return "\n".join([" ".join(seq[i:i + max_words]) for i in range(0, len(seq), max_words)])
def plot_alignment(alignment, path, title=None, split_title=False, max_len=None):
if max_len is not None:
alignment = alignment[:, :max_len]
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111)
im = ax.imshow(
alignment,
aspect="auto",
origin="lower",
interpolation="none")
fig.colorbar(im, ax=ax)
xlabel = "Decoder timestep"
if split_title:
title = split_title_line(title)
plt.xlabel(xlabel)
plt.title(title)
plt.ylabel("Encoder timestep")
plt.tight_layout()
plt.savefig(path, format="png")
plt.close()
def plot_spectrogram(pred_spectrogram, path, title=None, split_title=False, target_spectrogram=None, max_len=None, auto_aspect=False):
if max_len is not None:
target_spectrogram = target_spectrogram[:max_len]
pred_spectrogram = pred_spectrogram[:max_len]
if split_title:
title = split_title_line(title)
fig = plt.figure(figsize=(10, 8))
# Set common labels
fig.text(0.5, 0.18, title, horizontalalignment="center", fontsize=16)
#target spectrogram subplot
if target_spectrogram is not None:
ax1 = fig.add_subplot(311)
ax2 = fig.add_subplot(312)
if auto_aspect:
im = ax1.imshow(np.rot90(target_spectrogram), aspect="auto", interpolation="none")
else:
im = ax1.imshow(np.rot90(target_spectrogram), interpolation="none")
ax1.set_title("Target Mel-Spectrogram")
fig.colorbar(mappable=im, shrink=0.65, orientation="horizontal", ax=ax1)
ax2.set_title("Predicted Mel-Spectrogram")
else:
ax2 = fig.add_subplot(211)
if auto_aspect:
im = ax2.imshow(np.rot90(pred_spectrogram), aspect="auto", interpolation="none")
else:
im = ax2.imshow(np.rot90(pred_spectrogram), interpolation="none")
fig.colorbar(mappable=im, shrink=0.65, orientation="horizontal", ax=ax2)
plt.tight_layout()
plt.savefig(path, format="png")
plt.close()

View File

@ -0,0 +1,18 @@
"""
Defines the set of symbols used in text input to the model.
The default is a set of ASCII characters that works well for English or text that has been run
through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details.
"""
# from . import cmudict
_pad = "_"
_eos = "~"
_characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz12340!\'(),-.:;? '
#_characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz12340!\'(),-.:;? ' # use this old one if you want to train old model
# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
# _arpabet = ["@" + s for s in cmudict.valid_symbols]
# Export all symbols:
symbols = [_pad, _eos] + list(_characters) #+ _arpabet

View File

@ -0,0 +1,74 @@
from .symbols import symbols
from . import cleaners
import re
# Mappings from symbol to numeric ID and vice versa:
_symbol_to_id = {s: i for i, s in enumerate(symbols)}
_id_to_symbol = {i: s for i, s in enumerate(symbols)}
# Regular expression matching text enclosed in curly braces:
_curly_re = re.compile(r"(.*?)\{(.+?)\}(.*)")
def text_to_sequence(text, cleaner_names):
"""Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
The text can optionally have ARPAbet sequences enclosed in curly braces embedded
in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street."
Args:
text: string to convert to a sequence
cleaner_names: names of the cleaner functions to run the text through
Returns:
List of integers corresponding to the symbols in the text
"""
sequence = []
# Check for curly braces and treat their contents as ARPAbet:
while len(text):
m = _curly_re.match(text)
if not m:
sequence += _symbols_to_sequence(_clean_text(text, cleaner_names))
break
sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names))
sequence += _arpabet_to_sequence(m.group(2))
text = m.group(3)
# Append EOS token
sequence.append(_symbol_to_id["~"])
return sequence
def sequence_to_text(sequence):
"""Converts a sequence of IDs back to a string"""
result = ""
for symbol_id in sequence:
if symbol_id in _id_to_symbol:
s = _id_to_symbol[symbol_id]
# Enclose ARPAbet back in curly braces:
if len(s) > 1 and s[0] == "@":
s = "{%s}" % s[1:]
result += s
return result.replace("}{", " ")
def _clean_text(text, cleaner_names):
for name in cleaner_names:
cleaner = getattr(cleaners, name)
if not cleaner:
raise Exception("Unknown cleaner: %s" % name)
text = cleaner(text)
return text
def _symbols_to_sequence(symbols):
return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)]
def _arpabet_to_sequence(text):
return _symbols_to_sequence(["@" + s for s in text.split()])
def _should_keep_symbol(s):
return s in _symbol_to_id and s not in ("_", "~")
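# Round-trip sketch (illustrative):
#     seq = text_to_sequence("Hi there.", ["english_cleaners"])
#     sequence_to_text(seq)   # -> "hi there.~"   (the trailing "~" is the appended EOS symbol)
# Note: with the symbol set defined in symbols.py above (ARPAbet commented out), any
# {ARPAbet} span in the input is simply dropped by _should_keep_symbol.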