mirror of https://github.com/babysor/MockingBird.git
synced 2024-03-22 13:11:31 +08:00

commit de4e525a0d
parent b56ec5ee1b

    modified tacotron to tacotron2
BIN  synthesizer_tacotron2/.DS_Store  vendored  Normal file

Binary file not shown.
24  synthesizer_tacotron2/LICENSE.txt  Normal file

@@ -0,0 +1,24 @@
MIT License

Original work Copyright (c) 2018 Rayhane Mama (https://github.com/Rayhane-mamah)
Original work Copyright (c) 2019 fatchord (https://github.com/fatchord)
Modified work Copyright (c) 2019 Corentin Jemine (https://github.com/CorentinJ)
Modified work Copyright (c) 2020 blue-fish (https://github.com/blue-fish)

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
1  synthesizer_tacotron2/__init__.py  Normal file

@@ -0,0 +1 @@
#
206  synthesizer_tacotron2/audio.py  Normal file

@@ -0,0 +1,206 @@
import librosa
import librosa.filters
import numpy as np
from scipy import signal
from scipy.io import wavfile
import soundfile as sf


def load_wav(path, sr):
    return librosa.core.load(path, sr=sr)[0]

def save_wav(wav, path, sr):
    wav *= 32767 / max(0.01, np.max(np.abs(wav)))
    #proposed by @dsmiller
    wavfile.write(path, sr, wav.astype(np.int16))

def save_wavenet_wav(wav, path, sr):
    sf.write(path, wav.astype(np.float32), sr)

def preemphasis(wav, k, preemphasize=True):
    if preemphasize:
        return signal.lfilter([1, -k], [1], wav)
    return wav

def inv_preemphasis(wav, k, inv_preemphasize=True):
    if inv_preemphasize:
        return signal.lfilter([1], [1, -k], wav)
    return wav

#From https://github.com/r9y9/wavenet_vocoder/blob/master/audio.py
def start_and_end_indices(quantized, silence_threshold=2):
    for start in range(quantized.size):
        if abs(quantized[start] - 127) > silence_threshold:
            break
    for end in range(quantized.size - 1, 1, -1):
        if abs(quantized[end] - 127) > silence_threshold:
            break

    assert abs(quantized[start] - 127) > silence_threshold
    assert abs(quantized[end] - 127) > silence_threshold

    return start, end

def get_hop_size(hparams):
    hop_size = hparams.hop_size
    if hop_size is None:
        assert hparams.frame_shift_ms is not None
        hop_size = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
    return hop_size

def linearspectrogram(wav, hparams):
    D = _stft(preemphasis(wav, hparams.preemphasis, hparams.preemphasize), hparams)
    S = _amp_to_db(np.abs(D), hparams) - hparams.ref_level_db

    if hparams.signal_normalization:
        return _normalize(S, hparams)
    return S

def melspectrogram(wav, hparams):
    D = _stft(preemphasis(wav, hparams.preemphasis, hparams.preemphasize), hparams)
    S = _amp_to_db(_linear_to_mel(np.abs(D), hparams), hparams) - hparams.ref_level_db

    if hparams.signal_normalization:
        return _normalize(S, hparams)
    return S

def inv_linear_spectrogram(linear_spectrogram, hparams):
    """Converts a linear spectrogram back to a waveform using librosa"""
    if hparams.signal_normalization:
        D = _denormalize(linear_spectrogram, hparams)
    else:
        D = linear_spectrogram

    S = _db_to_amp(D + hparams.ref_level_db)  #Convert back to linear

    if hparams.use_lws:
        processor = _lws_processor(hparams)
        D = processor.run_lws(S.astype(np.float64).T ** hparams.power)
        y = processor.istft(D).astype(np.float32)
        return inv_preemphasis(y, hparams.preemphasis, hparams.preemphasize)
    else:
        return inv_preemphasis(_griffin_lim(S ** hparams.power, hparams), hparams.preemphasis, hparams.preemphasize)

def inv_mel_spectrogram(mel_spectrogram, hparams):
    """Converts a mel spectrogram back to a waveform using librosa"""
    if hparams.signal_normalization:
        D = _denormalize(mel_spectrogram, hparams)
    else:
        D = mel_spectrogram

    S = _mel_to_linear(_db_to_amp(D + hparams.ref_level_db), hparams)  # Convert back to linear

    if hparams.use_lws:
        processor = _lws_processor(hparams)
        D = processor.run_lws(S.astype(np.float64).T ** hparams.power)
        y = processor.istft(D).astype(np.float32)
        return inv_preemphasis(y, hparams.preemphasis, hparams.preemphasize)
    else:
        return inv_preemphasis(_griffin_lim(S ** hparams.power, hparams), hparams.preemphasis, hparams.preemphasize)

def _lws_processor(hparams):
    import lws
    return lws.lws(hparams.n_fft, get_hop_size(hparams), fftsize=hparams.win_size, mode="speech")

def _griffin_lim(S, hparams):
    """librosa implementation of Griffin-Lim
    Based on https://github.com/librosa/librosa/issues/434
    """
    angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
    S_complex = np.abs(S).astype(complex)  # was np.complex, which newer numpy removed
    y = _istft(S_complex * angles, hparams)
    for i in range(hparams.griffin_lim_iters):
        angles = np.exp(1j * np.angle(_stft(y, hparams)))
        y = _istft(S_complex * angles, hparams)
    return y

def _stft(y, hparams):
    if hparams.use_lws:
        return _lws_processor(hparams).stft(y).T
    else:
        return librosa.stft(y=y, n_fft=hparams.n_fft, hop_length=get_hop_size(hparams), win_length=hparams.win_size)

def _istft(y, hparams):
    return librosa.istft(y, hop_length=get_hop_size(hparams), win_length=hparams.win_size)

##########################################################
#These are only correct when using lws!!! (This was messing with WaveNet quality for a long time!)
def num_frames(length, fsize, fshift):
    """Compute the number of time frames of a spectrogram
    """
    pad = (fsize - fshift)
    if length % fshift == 0:
        M = (length + pad * 2 - fsize) // fshift + 1
    else:
        M = (length + pad * 2 - fsize) // fshift + 2
    return M


def pad_lr(x, fsize, fshift):
    """Compute left and right padding
    """
    M = num_frames(len(x), fsize, fshift)
    pad = (fsize - fshift)
    T = len(x) + 2 * pad
    r = (M - 1) * fshift + fsize - T
    return pad, pad + r
##########################################################
#Librosa-correct padding
def librosa_pad_lr(x, fsize, fshift):
    return 0, (x.shape[0] // fshift + 1) * fshift - x.shape[0]

# Conversions
_mel_basis = None
_inv_mel_basis = None

def _linear_to_mel(spectogram, hparams):
    global _mel_basis
    if _mel_basis is None:
        _mel_basis = _build_mel_basis(hparams)
    return np.dot(_mel_basis, spectogram)

def _mel_to_linear(mel_spectrogram, hparams):
    global _inv_mel_basis
    if _inv_mel_basis is None:
        _inv_mel_basis = np.linalg.pinv(_build_mel_basis(hparams))
    return np.maximum(1e-10, np.dot(_inv_mel_basis, mel_spectrogram))

def _build_mel_basis(hparams):
    assert hparams.fmax <= hparams.sample_rate // 2
    return librosa.filters.mel(sr=hparams.sample_rate, n_fft=hparams.n_fft, n_mels=hparams.num_mels,
                               fmin=hparams.fmin, fmax=hparams.fmax)

def _amp_to_db(x, hparams):
    min_level = np.exp(hparams.min_level_db / 20 * np.log(10))
    return 20 * np.log10(np.maximum(min_level, x))

def _db_to_amp(x):
    return np.power(10.0, (x) * 0.05)

def _normalize(S, hparams):
    if hparams.allow_clipping_in_normalization:
        if hparams.symmetric_mels:
            return np.clip((2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value,
                           -hparams.max_abs_value, hparams.max_abs_value)
        else:
            return np.clip(hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db)), 0, hparams.max_abs_value)

    assert S.max() <= 0 and S.min() - hparams.min_level_db >= 0
    if hparams.symmetric_mels:
        return (2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value
    else:
        return hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db))

def _denormalize(D, hparams):
    if hparams.allow_clipping_in_normalization:
        if hparams.symmetric_mels:
            return (((np.clip(D, -hparams.max_abs_value,
                              hparams.max_abs_value) + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value))
                    + hparams.min_level_db)
        else:
            return ((np.clip(D, 0, hparams.max_abs_value) * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db)

    if hparams.symmetric_mels:
        return (((D + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value)) + hparams.min_level_db)
    else:
        return ((D * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db)
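As a sanity check on the pipeline above, the helpers round-trip like this. This is a minimal sketch assuming the package layout of this commit and a hypothetical 16 kHz sample file; Griffin-Lim inversion is lossy, so the result is only an approximation of the input:

from synthesizer_tacotron2 import audio
from synthesizer_tacotron2.hparams import hparams

# Load a waveform at the sample rate the synthesizer was trained on
wav = audio.load_wav("samples/demo.wav", sr=hparams.sample_rate)  # hypothetical path

# Waveform -> normalized mel spectrogram of shape (num_mels, frames)
mel = audio.melspectrogram(wav, hparams)

# Mel spectrogram -> waveform via Griffin-Lim (hparams.use_lws is False by default)
recovered = audio.inv_mel_spectrogram(mel, hparams)
audio.save_wav(recovered, "samples/demo_gl.wav", sr=hparams.sample_rate)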
272  synthesizer_tacotron2/feeder.py  Normal file

@@ -0,0 +1,272 @@
from sklearn.model_selection import train_test_split
from synthesizer.utils.text import text_to_sequence
from synthesizer.infolog import log
import tensorflow as tf
import numpy as np
import threading
import time
import os

_batches_per_group = 64

class Feeder:
    """
    Feeds batches of data into a queue on a background thread.
    """

    def __init__(self, coordinator, metadata_filename, hparams):
        super(Feeder, self).__init__()
        self._coord = coordinator
        self._hparams = hparams
        self._cleaner_names = [x.strip() for x in hparams.cleaners.split(",")]
        self._train_offset = 0
        self._test_offset = 0

        # Load metadata
        self._mel_dir = os.path.join(os.path.dirname(metadata_filename), "mels")
        self._embed_dir = os.path.join(os.path.dirname(metadata_filename), "embeds")
        with open(metadata_filename, encoding="utf-8") as f:
            self._metadata = [line.strip().split("|") for line in f]
            frame_shift_ms = hparams.hop_size / hparams.sample_rate
            hours = sum([int(x[4]) for x in self._metadata]) * frame_shift_ms / (3600)
            log("Loaded metadata for {} examples ({:.2f} hours)".format(len(self._metadata), hours))

        #Train test split
        if hparams.tacotron_test_size is None:
            assert hparams.tacotron_test_batches is not None

        test_size = (hparams.tacotron_test_size if hparams.tacotron_test_size is not None
                     else hparams.tacotron_test_batches * hparams.tacotron_batch_size)
        indices = np.arange(len(self._metadata))
        train_indices, test_indices = train_test_split(indices,
                                                       test_size=test_size, random_state=hparams.tacotron_data_random_state)

        #Make sure the test set size is a multiple of batch_size, else round down and
        #move the extra examples to the training set
        len_test_indices = self._round_down(len(test_indices), hparams.tacotron_batch_size)
        extra_test = test_indices[len_test_indices:]
        test_indices = test_indices[:len_test_indices]
        train_indices = np.concatenate([train_indices, extra_test])

        self._train_meta = list(np.array(self._metadata)[train_indices])
        self._test_meta = list(np.array(self._metadata)[test_indices])

        self.test_steps = len(self._test_meta) // hparams.tacotron_batch_size

        if hparams.tacotron_test_size is None:
            assert hparams.tacotron_test_batches == self.test_steps

        #pad input sequences with the <pad_token> 0 ( _ )
        self._pad = 0
        #explicitly setting the padding to a value that doesn't originally exist in the spectrogram
        #to avoid any possible conflicts, without affecting the output range of the model too much
        if hparams.symmetric_mels:
            self._target_pad = -hparams.max_abs_value
        else:
            self._target_pad = 0.
        #Mark finished sequences with 1s
        self._token_pad = 1.

        with tf.device("/cpu:0"):
            # Create placeholders for inputs and targets. Don't specify batch size because we want
            # to be able to feed different batch sizes at eval time.
            self._placeholders = [
                tf.placeholder(tf.int32, shape=(None, None), name="inputs"),
                tf.placeholder(tf.int32, shape=(None, ), name="input_lengths"),
                tf.placeholder(tf.float32, shape=(None, None, hparams.num_mels),
                               name="mel_targets"),
                tf.placeholder(tf.float32, shape=(None, None), name="token_targets"),
                tf.placeholder(tf.int32, shape=(None, ), name="targets_lengths"),
                tf.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None),
                               name="split_infos"),

                # SV2TTS
                tf.placeholder(tf.float32, shape=(None, hparams.speaker_embedding_size),
                               name="speaker_embeddings")
            ]

            # Create queue for buffering data
            queue = tf.FIFOQueue(8, [tf.int32, tf.int32, tf.float32, tf.float32,
                                     tf.int32, tf.int32, tf.float32], name="input_queue")
            self._enqueue_op = queue.enqueue(self._placeholders)
            self.inputs, self.input_lengths, self.mel_targets, self.token_targets, \
                self.targets_lengths, self.split_infos, self.speaker_embeddings = queue.dequeue()

            self.inputs.set_shape(self._placeholders[0].shape)
            self.input_lengths.set_shape(self._placeholders[1].shape)
            self.mel_targets.set_shape(self._placeholders[2].shape)
            self.token_targets.set_shape(self._placeholders[3].shape)
            self.targets_lengths.set_shape(self._placeholders[4].shape)
            self.split_infos.set_shape(self._placeholders[5].shape)
            self.speaker_embeddings.set_shape(self._placeholders[6].shape)

            # Create eval queue for buffering eval data
            eval_queue = tf.FIFOQueue(1, [tf.int32, tf.int32, tf.float32, tf.float32,
                                          tf.int32, tf.int32, tf.float32], name="eval_queue")
            self._eval_enqueue_op = eval_queue.enqueue(self._placeholders)
            self.eval_inputs, self.eval_input_lengths, self.eval_mel_targets, \
                self.eval_token_targets, self.eval_targets_lengths, \
                self.eval_split_infos, self.eval_speaker_embeddings = eval_queue.dequeue()

            self.eval_inputs.set_shape(self._placeholders[0].shape)
            self.eval_input_lengths.set_shape(self._placeholders[1].shape)
            self.eval_mel_targets.set_shape(self._placeholders[2].shape)
            self.eval_token_targets.set_shape(self._placeholders[3].shape)
            self.eval_targets_lengths.set_shape(self._placeholders[4].shape)
            self.eval_split_infos.set_shape(self._placeholders[5].shape)
            self.eval_speaker_embeddings.set_shape(self._placeholders[6].shape)


    def start_threads(self, session):
        self._session = session
        thread = threading.Thread(name="background", target=self._enqueue_next_train_group)
        thread.daemon = True  #Thread will close when parent quits
        thread.start()

        thread = threading.Thread(name="background", target=self._enqueue_next_test_group)
        thread.daemon = True  #Thread will close when parent quits
        thread.start()

    def _get_test_groups(self):
        meta = self._test_meta[self._test_offset]
        self._test_offset += 1

        text = meta[5]

        input_data = np.asarray(text_to_sequence(text, self._cleaner_names), dtype=np.int32)
        mel_target = np.load(os.path.join(self._mel_dir, meta[1]))
        #Create parallel sequences containing zeros to represent a non finished sequence
        token_target = np.asarray([0.] * (len(mel_target) - 1))
        embed_target = np.load(os.path.join(self._embed_dir, meta[2]))
        return input_data, mel_target, token_target, embed_target, len(mel_target)

    def make_test_batches(self):
        start = time.time()

        # Read a group of examples
        n = self._hparams.tacotron_batch_size
        r = self._hparams.outputs_per_step

        #Test on entire test set
        examples = [self._get_test_groups() for i in range(len(self._test_meta))]

        # Bucket examples based on similar output sequence length for efficiency
        examples.sort(key=lambda x: x[-1])
        batches = [examples[i: i+n] for i in range(0, len(examples), n)]
        np.random.shuffle(batches)

        log("\nGenerated %d test batches of size %d in %.3f sec" % (len(batches), n, time.time() - start))
        return batches, r

    def _enqueue_next_train_group(self):
        while not self._coord.should_stop():
            start = time.time()

            # Read a group of examples
            n = self._hparams.tacotron_batch_size
            r = self._hparams.outputs_per_step
            examples = [self._get_next_example() for i in range(n * _batches_per_group)]

            # Bucket examples based on similar output sequence length for efficiency
            examples.sort(key=lambda x: x[-1])
            batches = [examples[i: i+n] for i in range(0, len(examples), n)]
            np.random.shuffle(batches)

            log("\nGenerated {} train batches of size {} in {:.3f} sec".format(len(batches), n, time.time() - start))
            for batch in batches:
                feed_dict = dict(zip(self._placeholders, self._prepare_batch(batch, r)))
                self._session.run(self._enqueue_op, feed_dict=feed_dict)

    def _enqueue_next_test_group(self):
        #Create test batches once and evaluate on them for all test steps
        test_batches, r = self.make_test_batches()
        while not self._coord.should_stop():
            for batch in test_batches:
                feed_dict = dict(zip(self._placeholders, self._prepare_batch(batch, r)))
                self._session.run(self._eval_enqueue_op, feed_dict=feed_dict)

    def _get_next_example(self):
        """Gets a single example (input, mel_target, token_target, embed_target, mel_length) from disk
        """
        if self._train_offset >= len(self._train_meta):
            self._train_offset = 0
            np.random.shuffle(self._train_meta)

        meta = self._train_meta[self._train_offset]
        self._train_offset += 1

        text = meta[5]

        input_data = np.asarray(text_to_sequence(text, self._cleaner_names), dtype=np.int32)
        mel_target = np.load(os.path.join(self._mel_dir, meta[1]))
        #Create parallel sequences containing zeros to represent a non finished sequence
        token_target = np.asarray([0.] * (len(mel_target) - 1))
        embed_target = np.load(os.path.join(self._embed_dir, meta[2]))
        return input_data, mel_target, token_target, embed_target, len(mel_target)

    def _prepare_batch(self, batches, outputs_per_step):
        assert 0 == len(batches) % self._hparams.tacotron_num_gpus
        size_per_device = int(len(batches) / self._hparams.tacotron_num_gpus)
        np.random.shuffle(batches)

        inputs = None
        mel_targets = None
        token_targets = None
        targets_lengths = None
        split_infos = []

        targets_lengths = np.asarray([x[-1] for x in batches], dtype=np.int32)  #Used to mask loss
        input_lengths = np.asarray([len(x[0]) for x in batches], dtype=np.int32)

        for i in range(self._hparams.tacotron_num_gpus):
            batch = batches[size_per_device*i: size_per_device*(i+1)]
            input_cur_device, input_max_len = self._prepare_inputs([x[0] for x in batch])
            inputs = np.concatenate((inputs, input_cur_device), axis=1) if inputs is not None else input_cur_device
            mel_target_cur_device, mel_target_max_len = self._prepare_targets([x[1] for x in batch], outputs_per_step)
            mel_targets = np.concatenate((mel_targets, mel_target_cur_device), axis=1) if mel_targets is not None else mel_target_cur_device

            #Pad sequences with 1 to infer that the sequence is done
            token_target_cur_device, token_target_max_len = self._prepare_token_targets([x[2] for x in batch], outputs_per_step)
            token_targets = np.concatenate((token_targets, token_target_cur_device), axis=1) if token_targets is not None else token_target_cur_device
            split_infos.append([input_max_len, mel_target_max_len, token_target_max_len])

        split_infos = np.asarray(split_infos, dtype=np.int32)

        ### SV2TTS ###

        embed_targets = np.asarray([x[3] for x in batches])

        ##############

        return inputs, input_lengths, mel_targets, token_targets, targets_lengths, \
               split_infos, embed_targets

    def _prepare_inputs(self, inputs):
        max_len = max([len(x) for x in inputs])
        return np.stack([self._pad_input(x, max_len) for x in inputs]), max_len

    def _prepare_targets(self, targets, alignment):
        max_len = max([len(t) for t in targets])
        data_len = self._round_up(max_len, alignment)
        return np.stack([self._pad_target(t, data_len) for t in targets]), data_len

    def _prepare_token_targets(self, targets, alignment):
        max_len = max([len(t) for t in targets]) + 1
        data_len = self._round_up(max_len, alignment)
        return np.stack([self._pad_token_target(t, data_len) for t in targets]), data_len

    def _pad_input(self, x, length):
        return np.pad(x, (0, length - x.shape[0]), mode="constant", constant_values=self._pad)

    def _pad_target(self, t, length):
        return np.pad(t, [(0, length - t.shape[0]), (0, 0)], mode="constant", constant_values=self._target_pad)

    def _pad_token_target(self, t, length):
        return np.pad(t, (0, length - t.shape[0]), mode="constant", constant_values=self._token_pad)

    def _round_up(self, x, multiple):
        remainder = x % multiple
        return x if remainder == 0 else x + multiple - remainder

    def _round_down(self, x, multiple):
        remainder = x % multiple
        return x if remainder == 0 else x - remainder
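The sort-then-chunk bucketing used in make_test_batches and _enqueue_next_train_group is easier to see in isolation. The standalone sketch below uses fake utterance lengths rather than repository data; sorting by mel length before chunking keeps each batch's padding waste small, and shuffling the batch order keeps training stochastic:

import numpy as np

np.random.seed(0)
n = 4  # batch size, standing in for hparams.tacotron_batch_size

# Fake examples mirroring the (..., len(mel_target)) tuples returned above:
# here just (utterance_id, mel_length)
examples = [("utt%d" % i, int(np.random.randint(100, 900))) for i in range(16)]

# Bucket: sort by output length so each batch pads to a similar size,
# then shuffle the order of the batches themselves
examples.sort(key=lambda x: x[-1])
batches = [examples[i: i + n] for i in range(0, len(examples), n)]
np.random.shuffle(batches)

for batch in batches:
    lengths = [length for _, length in batch]
    print(lengths, "frames wasted on padding:", max(lengths) * len(batch) - sum(lengths))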
272  synthesizer_tacotron2/hparams.py  Normal file

@@ -0,0 +1,272 @@
import ast
import pprint
from tensorflow.contrib.training import HParams


hparams = HParams(
    cleaners="basic_cleaners",
    tacotron_gpu_start_idx=0,  # idx of the first GPU to be used for Tacotron training.
    tacotron_num_gpus=1,  # Determines the number of gpus in use for Tacotron training.
    split_on_cpu=True,

    ### Signal Processing (used in both synthesizer and vocoder)
    sample_rate = 16000,
    n_fft = 800,
    num_mels = 80,
    hop_size = 200,                             # Tacotron uses 12.5 ms frame shift (set to sample_rate * 0.0125)
    win_size = 800,                             # Tacotron uses 50 ms frame length (set to sample_rate * 0.050)
    fmin = 55,
    min_level_db = -100,
    ref_level_db = 20,
    max_abs_value = 4.,                         # Gradient explodes if too big, premature convergence if too small.
    preemphasis = 0.97,                         # Filter coefficient to use if preemphasize is True
    preemphasize = True,
    frame_shift_ms=None,
    normalize_for_wavenet=True,
    # whether to rescale to [0, 1] for wavenet. (better audio quality)
    clip_for_wavenet=True,

    ### Tacotron Text-to-Speech (TTS)
    tts_embed_dims = 512,                       # Embedding dimension for the graphemes/phoneme inputs
    tts_encoder_dims = 256,
    tts_decoder_dims = 128,
    tts_postnet_dims = 512,
    tts_encoder_K = 5,
    tts_lstm_dims = 1024,
    tts_postnet_K = 5,
    tts_num_highways = 4,
    tts_dropout = 0.5,
    tts_cleaner_names = ["basic_cleaners"],
    tts_stop_threshold = -3.4,                  # Value below which audio generation ends.
                                                # For example, for a range of [-4, 4], this
                                                # will terminate the sequence at the first
                                                # frame that has all values < -3.4

    ### Tacotron Training
    tts_schedule = [(2,  1e-3,  20_000,  12),   # Progressive training schedule
                    (2,  5e-4,  40_000,  12),   # (r, lr, step, batch_size)
                    (2,  2e-4,  80_000,  12),   #
                    (2,  1e-4, 160_000,  12),   # r = reduction factor (# of mel frames
                    (2,  3e-5, 320_000,  12),   #     synthesized for each decoder iteration)
                    (2,  1e-5, 640_000,  12)],  # lr = learning rate

    tts_clip_grad_norm = 1.0,                   # clips the gradient norm to prevent explosion - set to None if not needed
    tts_eval_interval = 500,                    # Number of steps between model evaluation (sample generation)
                                                # Set to -1 to generate after completing epoch, or 0 to disable
    tts_eval_num_samples = 1,                   # Makes this number of samples

    ### Data Preprocessing
    max_mel_frames = 900,
    rescale = True,
    rescaling_max = 0.9,
    synthesis_batch_size = 16,                  # For vocoder preprocessing and inference.

    ### Mel Visualization and Griffin-Lim
    signal_normalization = True,
    power = 1.5,
    griffin_lim_iters = 60,

    ### Audio processing options
    fmax = 7600,                                # Should not exceed (sample_rate // 2)
    allow_clipping_in_normalization = True,     # Used when signal_normalization = True
    clip_mels_length = True,                    # If true, discards samples exceeding max_mel_frames
    use_lws = False,                            # "Fast spectrogram phase recovery using local weighted sums"
    symmetric_mels = True,                      # Sets mel range to [-max_abs_value, max_abs_value] if True,
                                                # and [0, max_abs_value] if False
    trim_silence = True,                        # Use with sample_rate of 16000 for best results
    silence_threshold=2,
    trim_fft_size=512,
    trim_hop_size=128,
    trim_top_db=23,

    ### SV2TTS
    speaker_embedding_size = 256,               # Dimension for the speaker embedding
    silence_min_duration_split = 0.4,           # Duration in seconds of a silence for an utterance to be split
    utterance_min_duration = 1.6,               # Duration in seconds below which utterances are discarded

    # Tacotron
    outputs_per_step=2,  # Was 1
    # number of frames to generate at each decoding step (increase to speed up computation and
    # allow for a higher batch size; decreases G&L audio quality)
    stop_at_any=True,
    # Determines whether the decoder should stop when predicting <stop> for any frame or for all of
    # them (True works pretty well)

    embedding_dim=512,  # dimension of embedding space (these are NOT the speaker embeddings)

    # Encoder parameters
    enc_conv_num_layers=3,  # number of encoder convolutional layers
    enc_conv_kernel_size=(5,),  # size of encoder convolution filters for each layer
    enc_conv_channels=512,  # number of encoder convolution filters for each layer
    encoder_lstm_units=256,  # number of lstm units for each direction (forward and backward)

    # Attention mechanism
    smoothing=False,  # Whether to smooth the attention normalization function
    attention_dim=128,  # dimension of attention space
    attention_filters=32,  # number of attention convolution filters
    attention_kernel=(31,),  # kernel size of attention convolution
    cumulative_weights=True,
    # Whether to cumulate (sum) all previous attention weights or simply feed previous weights
    # (Recommended: True)

    # Decoder
    prenet_layers=[256, 256],  # number of layers and number of units of prenet
    decoder_layers=2,  # number of decoder lstm layers
    decoder_lstm_units=1024,  # number of decoder lstm units on each layer
    max_iters=2000,
    # Max decoder steps during inference (just for safety from infinite loop cases)

    # Residual postnet
    postnet_num_layers=5,  # number of postnet convolutional layers
    postnet_kernel_size=(5,),  # size of postnet convolution filters for each layer
    postnet_channels=512,  # number of postnet convolution filters for each layer

    # CBHG mel->linear postnet
    cbhg_kernels=8,
    # All kernel sizes from 1 to cbhg_kernels will be used in the convolution bank of CBHG to act
    # as "K-grams"
    cbhg_conv_channels=128,  # Channels of the convolution bank
    cbhg_pool_size=2,  # pooling size of the CBHG
    cbhg_projection=256,
    # projection channels of the CBHG (1st projection, 2nd is automatically set to num_mels)
    cbhg_projection_kernel_size=3,  # kernel_size of the CBHG projections
    cbhg_highwaynet_layers=4,  # Number of HighwayNet layers
    cbhg_highway_units=128,  # Number of units used in HighwayNet fully connected layers
    cbhg_rnn_units=128,
    # Number of GRU units used in bidirectional RNN of CBHG block. CBHG output is 2x rnn_units in
    # shape

    # Loss params
    mask_encoder=True,
    # whether to mask encoder padding while computing attention. Set to True for better prosody
    # but slower convergence.
    mask_decoder=False,
    # Whether to use loss mask for padded sequences (if False, <stop_token> loss function will not
    # be weighted, else recommended pos_weight = 20)
    cross_entropy_pos_weight=20,
    # Use class weights to reduce the stop token classes imbalance (by adding more penalty on
    # False Negatives (FN)) (1 = disabled)
    predict_linear=False,
    # Whether to add a post-processing network to the Tacotron to predict linear spectrograms
    # (True mode not tested!!)
    ###########################################################################################################################################

    # Tacotron Training
    # Reproduction seeds
    tacotron_random_seed=5339,
    # Determines initial graph and operations (i.e: model) random state for reproducibility
    tacotron_data_random_state=1234,  # random state for train test split repeatability

    # performance parameters
    tacotron_swap_with_cpu=False,
    # Whether to use cpu as support to gpu for decoder computation (Not recommended: may cause
    # major slowdowns! Only use when critical!)

    # train/test split ratios, mini-batch sizes
    tacotron_batch_size=36,  # number of training samples on each training step (was 32)
    # Tacotron batch synthesis supports ~16x the training batch size (no gradients during
    # testing).
    # Training Tacotron with unmasked paddings makes it aware of them, which makes synthesis times
    # different from training. We thus recommend masking the encoder.
    tacotron_synthesis_batch_size=128,
    # DO NOT MAKE THIS BIGGER THAN 1 IF YOU DIDN'T TRAIN TACOTRON WITH "mask_encoder=True"!!
    tacotron_test_size=0.05,
    # % of data to keep as test data, if None, tacotron_test_batches must be not None. (5% is
    # enough to have a good idea about overfit)
    tacotron_test_batches=None,  # number of test batches.

    # Learning rate schedule
    tacotron_decay_learning_rate=True,
    # boolean, determines if the learning rate will follow an exponential decay
    tacotron_start_decay=50000,  # Step at which learning decay starts
    tacotron_decay_steps=50000,  # Determines the learning rate decay slope (UNDER TEST)
    tacotron_decay_rate=0.5,  # learning rate decay rate (UNDER TEST)
    tacotron_initial_learning_rate=1e-3,  # starting learning rate
    tacotron_final_learning_rate=1e-5,  # minimal learning rate

    # Optimization parameters
    tacotron_adam_beta1=0.9,  # AdamOptimizer beta1 parameter
    tacotron_adam_beta2=0.999,  # AdamOptimizer beta2 parameter
    tacotron_adam_epsilon=1e-6,  # AdamOptimizer Epsilon parameter

    # Regularization parameters
    tacotron_reg_weight=1e-7,  # regularization weight (for L2 regularization)
    tacotron_scale_regularization=False,
    # Whether to rescale regularization weight to adapt for outputs range (used when reg_weight is
    # high and biasing the model)
    tacotron_zoneout_rate=0.1,  # zoneout rate for all LSTM cells in the network
    tacotron_dropout_rate=0.5,  # dropout rate for all convolutional layers + prenet
    tacotron_clip_gradients=True,  # whether to clip gradients

    # Evaluation parameters
    natural_eval=False,
    # Whether to use 100% natural eval (to evaluate Curriculum Learning performance) or the same
    # teacher-forcing ratio as in training (just for overfit)

    # Decoder RNN learning can be done in one of two ways:
    #   Teacher Forcing: vanilla teacher forcing (usually with ratio = 1). mode="constant"
    #   Curriculum Learning Scheme: the shift from teacher forcing to sampling from previous
    #   outputs is a function of the global step. (teacher forcing ratio decay) mode="scheduled"
    # The second approach is inspired by:
    # Bengio et al. 2015: Scheduled Sampling for Sequence Prediction with Recurrent Neural Networks.
    # Can be found under: https://arxiv.org/pdf/1506.03099.pdf
    tacotron_teacher_forcing_mode="constant",
    # Can be ("constant" or "scheduled"). "scheduled" mode applies a cosine teacher forcing ratio
    # decay. (Preference: scheduled)
    tacotron_teacher_forcing_ratio=1.,
    # Value from [0., 1.], 0.=0%, 1.=100%, determines the % of times we force next decoder
    # inputs. Only relevant if mode="constant"
    tacotron_teacher_forcing_init_ratio=1.,
    # initial teacher forcing ratio. Relevant if mode="scheduled"
    tacotron_teacher_forcing_final_ratio=0.,
    # final teacher forcing ratio. Relevant if mode="scheduled"
    tacotron_teacher_forcing_start_decay=10000,
    # starting point of teacher forcing ratio decay. Relevant if mode="scheduled"
    tacotron_teacher_forcing_decay_steps=280000,
    # Determines the teacher forcing ratio decay slope. Relevant if mode="scheduled"
    tacotron_teacher_forcing_decay_alpha=0.,
    # teacher forcing ratio decay rate. Relevant if mode="scheduled"
    ###########################################################################################################################################

    # Tacotron-2 integration parameters
    train_with_GTA=False,
    # Whether to use GTA mels to train WaveNet instead of ground truth mels.
    ###########################################################################################################################################

    # Eval sentences (if no eval text file was specified during synthesis, these sentences are
    # used for eval)
    sentences=[
        # From July 8, 2017 New York Times:
        "Scientists at the CERN laboratory say they have discovered a new particle.",
        "There's a way to measure the acute emotional intelligence that has never gone out of "
        "style.",
        "President Trump met with other leaders at the Group of 20 conference.",
        "The Senate's bill to repeal and replace the Affordable Care Act is now imperiled.",
        # From Google's Tacotron example page:
        "Generative adversarial network or variational auto-encoder.",
        "Basilar membrane and otolaryngology are not auto-correlations.",
        "He has read the whole thing.",
        "He reads books.",
        "He thought it was time to present the present.",
        "Thisss isrealy awhsome.",
        "Punctuation sensitivity, is working.",
        "Punctuation sensitivity is working.",
        "Peter Piper picked a peck of pickled peppers. How many pickled peppers did Peter Piper pick?",
        "She sells sea-shells on the sea-shore. The shells she sells are sea-shells I'm sure.",
        "Tajima Airport serves Toyooka.",
        # From the web (random long utterance)
        "Sequence to sequence models have enjoyed great success in a variety of tasks such as machine translation, speech recognition, and text summarization. "
        "This project covers a sequence to sequence model trained to predict a speech representation from an input sequence of characters. We show that "
        "the adopted architecture is able to perform this task with wild success.",
        "Thank you so much for your support!",
    ],
)


def hparams_debug_string():
    values = hparams.values()
    hp = ["  %s: %s" % (name, values[name]) for name in sorted(values) if name != "sentences"]
    return "Hyperparameters:\n" + "\n".join(hp)
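Since hparams is a tf.contrib.training.HParams object, individual values can be overridden at run time with parse() instead of editing this file. A minimal sketch, assuming the TF 1.x stack this module already requires:

from synthesizer_tacotron2.hparams import hparams, hparams_debug_string

# Override a few values with command-line style "name=value" pairs
hparams.parse("tacotron_batch_size=12,outputs_per_step=1")
print(hparams_debug_string())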
165  synthesizer_tacotron2/inference.py  Normal file

@@ -0,0 +1,165 @@
from synthesizer.tacotron2 import Tacotron2
import torch
from synthesizer import audio
from synthesizer.hparams import hparams
from synthesizer.models.tacotron import Tacotron
from synthesizer.utils.symbols import symbols
from synthesizer.utils.text import text_to_sequence
from vocoder.display import simple_table
from pathlib import Path
from typing import Union, List
import numpy as np
import librosa
from utils import logmmse
from pypinyin import lazy_pinyin, Style
import os
import tensorflow as tf


class Synthesizer:
    sample_rate = hparams.sample_rate
    hparams = hparams

    def __init__(self, checkpoints_dir: Path, verbose=True, low_mem=False):
        """
        The model isn't instantiated and loaded in memory until needed or until load() is called.

        :param checkpoints_dir: path to the directory holding the trained model checkpoints
        :param verbose: if False, prints less information when using the model
        """
        self.verbose = verbose
        self._low_mem = low_mem

        # Prepare the model
        self._model = None  # type: Tacotron2
        checkpoint_state = tf.train.get_checkpoint_state(checkpoints_dir)
        if checkpoint_state is None:
            raise Exception("Could not find any synthesizer weights under %s" % checkpoints_dir)
        self.checkpoint_fpath = checkpoint_state.model_checkpoint_path
        if verbose:
            model_name = checkpoints_dir.parent.name.replace("logs-", "")
            step = int(self.checkpoint_fpath[self.checkpoint_fpath.rfind('-') + 1:])
            print("Found synthesizer \"%s\" trained to step %d" % (model_name, step))

    def is_loaded(self):
        """
        Whether the model is loaded in memory.
        """
        return self._model is not None

    def load(self):
        """
        Instantiates and loads the model given the weights file that was passed in the constructor.
        """
        if self._low_mem:
            raise Exception("Cannot load the synthesizer permanently in low mem mode")
        tf.reset_default_graph()
        self._model = Tacotron2(self.checkpoint_fpath, hparams)

    def synthesize_spectrograms(self, texts: List[str],
                                embeddings: Union[np.ndarray, List[np.ndarray]],
                                return_alignments=False):
        """
        Synthesizes mel spectrograms from texts and speaker embeddings.

        :param texts: a list of N text prompts to be synthesized
        :param embeddings: a numpy array or list of speaker embeddings of shape (N, 256)
        :param return_alignments: if True, a matrix representing the alignments between the
        characters and each decoder output step will be returned for each spectrogram
        :return: a list of N melspectrograms as numpy arrays of shape (80, Mi), where Mi is the
        sequence length of spectrogram i, and possibly the alignments.
        """
        # Load the model on the first request.
        if not self.is_loaded():
            self.load()

        print("Read " + str(texts))
        texts = [" ".join(lazy_pinyin(v, style=Style.TONE3, neutral_tone_with_five=True)) for v in texts]
        print("Synthesizing " + str(texts))
        # Preprocess text inputs
        inputs = [text_to_sequence(text, hparams.tts_cleaner_names) for text in texts]
        if not isinstance(embeddings, list):
            embeddings = [embeddings]

        # Batch inputs
        batched_inputs = [inputs[i:i+hparams.synthesis_batch_size]
                          for i in range(0, len(inputs), hparams.synthesis_batch_size)]
        batched_embeds = [embeddings[i:i+hparams.synthesis_batch_size]
                          for i in range(0, len(embeddings), hparams.synthesis_batch_size)]

        specs = []
        for i, batch in enumerate(batched_inputs, 1):
            if self.verbose:
                print(f"\n| Generating {i}/{len(batched_inputs)}")

            # Pad texts so they are all the same length
            text_lens = [len(text) for text in batch]
            max_text_len = max(text_lens)
            # chars = [pad1d(text, max_text_len) for text in batch]
            # chars = np.stack(chars)

            # Stack speaker embeddings into 2D array for batch processing
            speaker_embeds = np.stack(batched_embeds[i-1])

            # Convert to tensor (unused in the TF path):
            # chars = torch.tensor(chars).long().to(self.device)
            # speaker_embeddings = torch.tensor(speaker_embeds).float().to(self.device)

            # Inference
            specs, alignments = self._model.my_synthesize(speaker_embeds, texts)  # TODO: not yet settled whether to pass embeddings or speaker_embeds here

        if self.verbose:
            print("\n\nDone.\n")
        return (specs, alignments) if return_alignments else specs

    @staticmethod
    def load_preprocess_wav(fpath):
        """
        Loads and preprocesses an audio file under the same conditions the audio files were used to
        train the synthesizer.
        """
        wav = librosa.load(str(fpath), sr=hparams.sample_rate)[0]
        if hparams.rescale:
            wav = wav / np.abs(wav).max() * hparams.rescaling_max
        # denoise
        if len(wav) > hparams.sample_rate*(0.3+0.1):
            noise_wav = np.concatenate([wav[:int(hparams.sample_rate*0.15)],
                                        wav[-int(hparams.sample_rate*0.15):]])
            profile = logmmse.profile_noise(noise_wav, hparams.sample_rate)
            wav = logmmse.denoise(wav, profile)
        return wav

    @staticmethod
    def make_spectrogram(fpath_or_wav: Union[str, Path, np.ndarray]):
        """
        Creates a mel spectrogram from an audio file in the same manner as the mel spectrograms that
        were fed to the synthesizer when training.
        """
        if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path):
            wav = Synthesizer.load_preprocess_wav(fpath_or_wav)
        else:
            wav = fpath_or_wav

        mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
        return mel_spectrogram

    @staticmethod
    def griffin_lim(mel):
        """
        Inverts a mel spectrogram using Griffin-Lim. The mel spectrogram is expected to have been built
        with the same parameters present in hparams.py.
        """
        return audio.inv_mel_spectrogram(mel, hparams)


def pad1d(x, max_len, pad_value=0):
    return np.pad(x, (0, max_len - len(x)), mode="constant", constant_values=pad_value)
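End to end, the class above is meant to be driven roughly as follows. This is a sketch only: the checkpoint directory and input text are hypothetical, and the embedding is a random stand-in for one produced by the repository's speaker encoder:

import numpy as np
from pathlib import Path
from synthesizer_tacotron2.inference import Synthesizer

# Hypothetical checkpoint directory containing a TF checkpoint state file
synth = Synthesizer(Path("synthesizer/saved_models/logs-mandarin/taco_pretrained"))

# Stand-in for a real speaker embedding of shape (256,)
embed = np.random.rand(256).astype(np.float32)
embed /= np.linalg.norm(embed)

# Text is converted to pinyin internally before synthesis
specs = synth.synthesize_spectrograms(["欢迎使用语音克隆工具"], [embed])
wav = Synthesizer.griffin_lim(specs[0])  # quick Griffin-Lim preview; a neural vocoder sounds better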
50  synthesizer_tacotron2/infolog.py  Normal file

@@ -0,0 +1,50 @@
import atexit
import json
from datetime import datetime
from threading import Thread
from urllib.request import Request, urlopen

_format = "%Y-%m-%d %H:%M:%S.%f"
_file = None
_run_name = None
_slack_url = None


def init(filename, run_name, slack_url=None):
    global _file, _run_name, _slack_url
    _close_logfile()
    _file = open(filename, "a")
    _file.write("\n-----------------------------------------------------------------\n")
    _file.write("Starting new {} training run\n".format(run_name))
    _file.write("-----------------------------------------------------------------\n")
    _run_name = run_name
    _slack_url = slack_url


def log(msg, end="\n", slack=False):
    print(msg, end=end)
    if _file is not None:
        _file.write("[%s] %s\n" % (datetime.now().strftime(_format)[:-3], msg))
    if slack and _slack_url is not None:
        Thread(target=_send_slack, args=(msg,)).start()


def _close_logfile():
    global _file
    if _file is not None:
        _file.close()
        _file = None


def _send_slack(msg):
    req = Request(_slack_url)
    req.add_header("Content-Type", "application/json")
    urlopen(req, json.dumps({
        "username": "tacotron",
        "icon_emoji": ":taco:",
        "text": "*%s*: %s" % (_run_name, msg)
    }).encode())


atexit.register(_close_logfile)
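Typical usage from a training script; the log file path here is hypothetical and the Slack webhook is optional:

from synthesizer_tacotron2 import infolog

infolog.init("logs-tacotron2/train.log", run_name="tacotron2")  # hypothetical path
infolog.log("Step 100: loss = 0.473")          # goes to stdout and the log file
infolog.log("Training finished", slack=True)   # also posted to Slack if a webhook was passed to init()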
8  synthesizer_tacotron2/models/__init__.py  Normal file

@@ -0,0 +1,8 @@
from .tacotron import Tacotron


def create_model(name, hparams):
    if name == "Tacotron":
        return Tacotron(hparams)
    else:
        raise Exception("Unknown model: " + name)
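A training script would normally go through this factory rather than importing Tacotron directly; a minimal sketch, assuming the hparams object from synthesizer_tacotron2/hparams.py:

from synthesizer_tacotron2.hparams import hparams
from synthesizer_tacotron2.models import create_model

model = create_model("Tacotron", hparams)  # any other name raises "Unknown model"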
207  synthesizer_tacotron2/models/architecture_wrappers.py  Normal file

@@ -0,0 +1,207 @@
"""A set of wrappers useful for the tacotron 2 architecture
All notations and variable names were used in concordance with the original tensorflow implementation
"""
import collections
import tensorflow as tf
from synthesizer.models.attention import _compute_attention
from tensorflow.contrib.rnn import RNNCell
from tensorflow.python.framework import ops, tensor_shape
from tensorflow.python.ops import array_ops, check_ops, rnn_cell_impl, tensor_array_ops
from tensorflow.python.util import nest

_zero_state_tensors = rnn_cell_impl._zero_state_tensors


class TacotronEncoderCell(RNNCell):
    """Tacotron 2 Encoder Cell
    Passes inputs through a stack of convolutional layers then through a bidirectional LSTM
    layer to predict the hidden representation vector (or memory)
    """

    def __init__(self, convolutional_layers, lstm_layer):
        """Initialize encoder parameters

        Args:
            convolutional_layers: Encoder convolutional block class
            lstm_layer: encoder bidirectional lstm layer class
        """
        super(TacotronEncoderCell, self).__init__()
        #Initialize encoder layers
        self._convolutions = convolutional_layers
        self._cell = lstm_layer

    def __call__(self, inputs, input_lengths=None):
        #Pass input sequence through a stack of convolutional layers
        conv_output = self._convolutions(inputs)

        #Extract hidden representation from encoder lstm cells
        hidden_representation = self._cell(conv_output, input_lengths)

        #For shape visualization
        self.conv_output_shape = conv_output.shape
        return hidden_representation


class TacotronDecoderCellState(
    collections.namedtuple("TacotronDecoderCellState",
                           ("cell_state", "attention", "time", "alignments",
                            "alignment_history"))):
    """`namedtuple` storing the state of a `TacotronDecoderCell`.
    Contains:
      - `cell_state`: The state of the wrapped `RNNCell` at the previous time
        step.
      - `attention`: The attention emitted at the previous time step.
      - `time`: int32 scalar containing the current time step.
      - `alignments`: A single or tuple of `Tensor`(s) containing the alignments
        emitted at the previous time step for each attention mechanism.
      - `alignment_history`: a single or tuple of `TensorArray`(s)
        containing alignment matrices from all time steps for each attention
        mechanism. Call `stack()` on each to convert to a `Tensor`.
    """
    def replace(self, **kwargs):
        """Clones the current state while overwriting components provided by kwargs.
        """
        return super(TacotronDecoderCellState, self)._replace(**kwargs)


class TacotronDecoderCell(RNNCell):
    """Tacotron 2 Decoder Cell
    Decodes encoder output and previous mel frames into the next r frames

    Decoder Step i:
        1) Prenet to compress last output information
        2) Concat compressed inputs with previous context vector (input feeding) *
        3) Decoder RNN (actual decoding) to predict current state s_{i} *
        4) Compute new context vector c_{i} based on s_{i} and a cumulative sum of previous alignments *
        5) Predict new output y_{i} using s_{i} and c_{i} (concatenated)
        6) Predict <stop_token> output ys_{i} using s_{i} and c_{i} (concatenated)

    * : This is typically done by taking a vanilla LSTM, wrapping it with tensorflow's attention wrapper,
    and wrapping that with the prenet before doing the input feeding, and with the prediction layer
    that uses RNN states to project onto the output space. Actions marked with (*) could be replaced with
    tensorflow's attention wrapper call if it used cumulative alignments instead of previous alignments only.
    """

    def __init__(self, prenet, attention_mechanism, rnn_cell, frame_projection, stop_projection):
        """Initialize decoder parameters

        Args:
            prenet: A tensorflow fully connected layer acting as the decoder pre-net
            attention_mechanism: A _BaseAttentionMechanism instance, useful to
                learn encoder-decoder alignments
            rnn_cell: Instance of RNNCell, main body of the decoder
            frame_projection: tensorflow fully connected layer with r * num_mels output units
            stop_projection: tensorflow fully connected layer, expected to project to a scalar
                and through a sigmoid activation
            mask_finished: Boolean, whether to mask decoder frames after the <stop_token>
        """
        super(TacotronDecoderCell, self).__init__()
        #Initialize decoder layers
        self._prenet = prenet
        self._attention_mechanism = attention_mechanism
        self._cell = rnn_cell
        self._frame_projection = frame_projection
        self._stop_projection = stop_projection

        self._attention_layer_size = self._attention_mechanism.values.get_shape()[-1].value

    def _batch_size_checks(self, batch_size, error_message):
        return [check_ops.assert_equal(batch_size,
                                       self._attention_mechanism.batch_size,
                                       message=error_message)]

    @property
    def output_size(self):
        return self._frame_projection.shape

    @property
    def state_size(self):
        """The `state_size` property of `TacotronDecoderCell`.

        Returns:
            A `TacotronDecoderCellState` tuple containing shapes used by this object.
        """
        return TacotronDecoderCellState(
            cell_state=self._cell._cell.state_size,
            time=tensor_shape.TensorShape([]),
            attention=self._attention_layer_size,
            alignments=self._attention_mechanism.alignments_size,
            alignment_history=())

    def zero_state(self, batch_size, dtype):
        """Return an initial (zero) state tuple for this `AttentionWrapper`.

        Args:
            batch_size: `0D` integer tensor: the batch size.
            dtype: The internal state data type.
        Returns:
            A `TacotronDecoderCellState` tuple containing zeroed out tensors and,
            possibly, empty `TensorArray` objects.
        Raises:
            ValueError: (or, possibly at runtime, InvalidArgument), if
                `batch_size` does not match the output size of the encoder passed
                to the wrapper object at initialization time.
        """
        with ops.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]):
            cell_state = self._cell._cell.zero_state(batch_size, dtype)
            error_message = (
                "When calling zero_state of TacotronDecoderCell %s: " % self._base_name +
                "Non-matching batch sizes between the memory "
                "(encoder output) and the requested batch size.")
            with ops.control_dependencies(
                self._batch_size_checks(batch_size, error_message)):
                cell_state = nest.map_structure(
                    lambda s: array_ops.identity(s, name="checked_cell_state"),
                    cell_state)
            return TacotronDecoderCellState(
                cell_state=cell_state,
                time=array_ops.zeros([], dtype=tf.int32),
                attention=_zero_state_tensors(self._attention_layer_size, batch_size,
                                              dtype),
                alignments=self._attention_mechanism.initial_alignments(batch_size, dtype),
                alignment_history=tensor_array_ops.TensorArray(dtype=dtype, size=0,
                                                               dynamic_size=True))

    def __call__(self, inputs, state):
        #Information bottleneck (essential for learning attention)
        prenet_output = self._prenet(inputs)

        #Concat context vector and prenet output to form LSTM cells input (input feeding)
        LSTM_input = tf.concat([prenet_output, state.attention], axis=-1)

        #Unidirectional LSTM layers
        LSTM_output, next_cell_state = self._cell(LSTM_input, state.cell_state)

        #Compute the attention (context) vector and alignments using
        #the new decoder cell hidden state as the query vector
        #and cumulative alignments to extract location features
        #The choice of the new cell hidden state (s_{i}) of the last
        #decoder RNN Cell is based on Luong et al. (2015):
        #https://arxiv.org/pdf/1508.04025.pdf
        previous_alignments = state.alignments
        previous_alignment_history = state.alignment_history
        context_vector, alignments, cumulated_alignments = _compute_attention(self._attention_mechanism,
                                                                              LSTM_output,
                                                                              previous_alignments,
                                                                              attention_layer=None)

        #Concat LSTM outputs and context vector to form projections inputs
        projections_input = tf.concat([LSTM_output, context_vector], axis=-1)

        #Compute predicted frames and predicted <stop_token>
        cell_outputs = self._frame_projection(projections_input)
        stop_tokens = self._stop_projection(projections_input)

        #Save alignment history
        alignment_history = previous_alignment_history.write(state.time, alignments)

        #Prepare next decoder state
        next_state = TacotronDecoderCellState(
            time=state.time + 1,
            cell_state=next_cell_state,
            attention=context_vector,
            alignments=cumulated_alignments,
            alignment_history=alignment_history)

        return (cell_outputs, stop_tokens), next_state
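The shape bookkeeping of one decoder step is easy to lose in the TF wrapper code. The plain-numpy sketch below traces steps 2 through 6 of the docstring; the dimensions are assumptions taken from hparams.py, and only shapes are exercised, not the actual math:

import numpy as np

B, T_enc = 8, 60          # batch size, encoder time steps (memory length)
prenet_out = 256          # last prenet layer size (hparams.prenet_layers[-1])
enc_dim = 512             # encoder output = 2 * encoder_lstm_units
lstm_units = 1024         # decoder_lstm_units
r, num_mels = 2, 80       # outputs_per_step, num_mels

prenet_output = np.zeros((B, prenet_out))
attention_context = np.zeros((B, enc_dim))           # state.attention

# 2) input feeding: concat prenet output with the previous context vector
lstm_input = np.concatenate([prenet_output, attention_context], axis=-1)
assert lstm_input.shape == (B, prenet_out + enc_dim)

# 3) decoder LSTM produces the query s_i
lstm_output = np.zeros((B, lstm_units))

# 4) attention: alignments over the memory, context = alignments @ memory
alignments = np.full((B, T_enc), 1.0 / T_enc)        # a softmax over encoder steps
memory = np.zeros((B, T_enc, enc_dim))
context = np.einsum("bt,btd->bd", alignments, memory)

# 5) + 6) both projections consume [s_i; c_i]
projections_input = np.concatenate([lstm_output, context], axis=-1)
frames = np.zeros((B, r * num_mels))                 # frame_projection output
stop_tokens = np.zeros((B, r))                       # stop_projection output
print(projections_input.shape, frames.shape, stop_tokens.shape)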
207
synthesizer_tacotron2/models/attention.py
Normal file
207
synthesizer_tacotron2/models/attention.py
Normal file
|
@ -0,0 +1,207 @@
|
|||
"""Attention file for location based attention (compatible with tensorflow attention wrapper)"""
|
||||
|
||||
import tensorflow as tf
|
||||
from tensorflow.contrib.seq2seq.python.ops.attention_wrapper import BahdanauAttention
|
||||
from tensorflow.python.layers import core as layers_core
|
||||
from tensorflow.python.ops import array_ops, math_ops, nn_ops, variable_scope
|
||||
|
||||
|
||||
#From https://github.com/tensorflow/tensorflow/blob/r1.7/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
|
||||
def _compute_attention(attention_mechanism, cell_output, attention_state,
|
||||
attention_layer):
|
||||
"""Computes the attention and alignments for a given attention_mechanism."""
|
||||
alignments, next_attention_state = attention_mechanism(
|
||||
cell_output, state=attention_state)
|
||||
|
||||
# Reshape from [batch_size, memory_time] to [batch_size, 1, memory_time]
|
||||
expanded_alignments = array_ops.expand_dims(alignments, 1)
|
||||
# Context is the inner product of alignments and values along the
|
||||
# memory time dimension.
|
||||
# alignments shape is
|
||||
# [batch_size, 1, memory_time]
|
||||
# attention_mechanism.values shape is
|
||||
# [batch_size, memory_time, memory_size]
|
||||
# the batched matmul is over memory_time, so the output shape is
|
||||
# [batch_size, 1, memory_size].
|
||||
# we then squeeze out the singleton dim.
|
||||
context = math_ops.matmul(expanded_alignments, attention_mechanism.values)
|
||||
context = array_ops.squeeze(context, [1])
|
||||
|
||||
if attention_layer is not None:
|
||||
attention = attention_layer(array_ops.concat([cell_output, context], 1))
|
||||
else:
|
||||
attention = context
|
||||
|
||||
return attention, alignments, next_attention_state
|
||||
|
||||
|
||||
def _location_sensitive_score(W_query, W_fil, W_keys):
    """Implements the Bahdanau-style (cumulative) scoring function.
    This attention is described in:
        J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Bengio,
        "Attention-based models for speech recognition," in Advances in
        Neural Information Processing Systems, 2015, pp. 577-585.

    #############################################################################
              hybrid attention (content-based + location-based)
                               f = F * α_{i-1}
       energy = dot(v_a, tanh(W_keys(h_enc) + W_query(h_dec) + W_fil(f) + b_a))
    #############################################################################

    Args:
        W_query: Tensor, shape "[batch_size, 1, attention_dim]" to compare to location features.
        W_fil: processed previous alignments turned into location features, shape
            "[batch_size, max_time, attention_dim]"
        W_keys: Tensor, shape "[batch_size, max_time, attention_dim]", typically the encoder outputs.
    Returns:
        A "[batch_size, max_time]" attention score (energy)
    """
    # Get the number of hidden units from the trailing dimension of keys
    dtype = W_query.dtype
    num_units = W_keys.shape[-1].value or array_ops.shape(W_keys)[-1]

    v_a = tf.get_variable(
        "attention_variable_projection", shape=[num_units], dtype=dtype,
        initializer=tf.contrib.layers.xavier_initializer())
    b_a = tf.get_variable(
        "attention_bias", shape=[num_units], dtype=dtype,
        initializer=tf.zeros_initializer())

    return tf.reduce_sum(v_a * tf.tanh(W_keys + W_query + W_fil + b_a), [2])


def _smoothing_normalization(e):
    """Applies a smoothing normalization function instead of softmax
    Introduced in:
        J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Bengio,
        "Attention-based models for speech recognition," in Advances in
        Neural Information Processing Systems, 2015, pp. 577-585.

    ############################################################################
                        Smoothing normalization function
               a_{i, j} = sigmoid(e_{i, j}) / sum_j(sigmoid(e_{i, j}))
    ############################################################################

    Args:
        e: matrix [batch_size, max_time(memory_time)]: expected to be the energy (score)
            values of an attention mechanism
    Returns:
        matrix [batch_size, max_time]: [0, 1] normalized alignments with possible
            attendance to multiple memory time steps.
    """
    return tf.nn.sigmoid(e) / tf.reduce_sum(tf.nn.sigmoid(e), axis=-1, keepdims=True)
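
# Minimal numeric sketch (assumed values) contrasting this smoothing
# normalization with softmax; both rows sum to 1, but the sigmoid variant
# spreads mass more evenly across several memory steps:
#
#   import numpy as np
#   e = np.array([[2.0, 1.0, -1.0]])
#   softmax = np.exp(e) / np.exp(e).sum(axis=-1, keepdims=True)  # ~[0.71, 0.26, 0.04]
#   sig = 1 / (1 + np.exp(-e))
#   smoothed = sig / sig.sum(axis=-1, keepdims=True)             # ~[0.47, 0.39, 0.14]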


class LocationSensitiveAttention(BahdanauAttention):
    """Implements the Bahdanau-style (cumulative) scoring function.
    Usually referred to as "hybrid" attention (content-based + location-based).
    Extends the additive attention described in:
        "D. Bahdanau, K. Cho, and Y. Bengio, "Neural machine translation by jointly
        learning to align and translate," in Proceedings of ICLR, 2015."
    to use previous alignments as additional location features.

    This attention is described in:
        J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Bengio,
        "Attention-based models for speech recognition," in Advances in
        Neural Information Processing Systems, 2015, pp. 577-585.
    """

    def __init__(self,
                 num_units,
                 memory,
                 hparams,
                 mask_encoder=True,
                 memory_sequence_length=None,
                 smoothing=False,
                 cumulate_weights=True,
                 name="LocationSensitiveAttention"):
        """Construct the Attention mechanism.
        Args:
            num_units: The depth of the query mechanism.
            memory: The memory to query; usually the output of an RNN encoder. This
                tensor should be shaped `[batch_size, max_time, ...]`.
            hparams: hyperparameters object, providing attention_filters and attention_kernel.
            mask_encoder (optional): Boolean, whether to mask encoder paddings.
            memory_sequence_length (optional): Sequence lengths for the batch entries
                in memory. If provided, the memory tensor rows are masked with zeros
                for values past the respective sequence lengths. Only relevant if mask_encoder = True.
            smoothing (optional): Boolean. Determines which normalization function to use.
                The default normalization function (probability_fn) is softmax. If smoothing is
                enabled, we replace softmax with:
                        a_{i, j} = sigmoid(e_{i, j}) / sum_j(sigmoid(e_{i, j}))
                Introduced in:
                    J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Bengio,
                    "Attention-based models for speech recognition," in Advances in
                    Neural Information Processing Systems, 2015, pp. 577-585.
                This is mainly used if the model wants to attend to multiple input parts
                at the same decoding step. We probably won't be using it, since multiple sound
                frames may depend on the same character/phone, but probably not the other way
                around.
                Note:
                    We keep it implemented in case we want to test it. It was used in the
                    paper in the context of speech recognition, where one phoneme may depend on
                    multiple subsequent sound frames.
            name: Name to use when creating ops.
        """
        # Create normalization function
        # Setting it to None defaults to using softmax
        normalization_function = _smoothing_normalization if smoothing else None
        memory_length = memory_sequence_length if mask_encoder else None
        super(LocationSensitiveAttention, self).__init__(
            num_units=num_units,
            memory=memory,
            memory_sequence_length=memory_length,
            probability_fn=normalization_function,
            name=name)

        self.location_convolution = tf.layers.Conv1D(filters=hparams.attention_filters,
                                                     kernel_size=hparams.attention_kernel,
                                                     padding="same", use_bias=True,
                                                     bias_initializer=tf.zeros_initializer(),
                                                     name="location_features_convolution")
        self.location_layer = tf.layers.Dense(units=num_units, use_bias=False,
                                              dtype=tf.float32, name="location_features_layer")
        self._cumulate = cumulate_weights

    def __call__(self, query, state):
        """Score the query based on the keys and values.
        Args:
            query: Tensor of dtype matching `self.values` and shape
                `[batch_size, query_depth]`.
            state (previous alignments): Tensor of dtype matching `self.values` and shape
                `[batch_size, alignments_size]`
                (`alignments_size` is memory's `max_time`).
        Returns:
            alignments: Tensor of dtype matching `self.values` and shape
                `[batch_size, alignments_size]` (`alignments_size` is memory's
                `max_time`).
        """
        previous_alignments = state
        with variable_scope.variable_scope(None, "Location_Sensitive_Attention", [query]):

            # processed_query shape [batch_size, query_depth] -> [batch_size, attention_dim]
            processed_query = self.query_layer(query) if self.query_layer else query
            # -> [batch_size, 1, attention_dim]
            processed_query = tf.expand_dims(processed_query, 1)

            # processed_location_features shape [batch_size, max_time, attention dimension]
            # [batch_size, max_time] -> [batch_size, max_time, 1]
            expanded_alignments = tf.expand_dims(previous_alignments, axis=2)
            # location features [batch_size, max_time, filters]
            f = self.location_convolution(expanded_alignments)
            # Projected location features [batch_size, max_time, attention_dim]
            processed_location_features = self.location_layer(f)

            # energy shape [batch_size, max_time]
            energy = _location_sensitive_score(processed_query, processed_location_features,
                                               self.keys)

        # alignments shape = energy shape = [batch_size, max_time]
        alignments = self._probability_fn(energy, previous_alignments)

        # Cumulate alignments
        if self._cumulate:
            next_state = alignments + previous_alignments
        else:
            next_state = alignments

        return alignments, next_state
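
# A minimal construction sketch (illustrative; the concrete numbers are
# assumptions, and hp is assumed to carry the fields this class reads):
#
#   attention_mechanism = LocationSensitiveAttention(
#       num_units=128,                        # attention depth
#       memory=encoder_outputs,               # [N, T_in, encoder_dim]
#       hparams=hp,                           # provides hp.attention_filters / hp.attention_kernel
#       memory_sequence_length=input_lengths, # masks encoder paddings
#       smoothing=False,                      # keep the default softmax
#       cumulate_weights=True)                # feed cumulative alignments as location features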
132
synthesizer_tacotron2/models/custom_decoder.py
Normal file
@ -0,0 +1,132 @@
from __future__ import absolute_import, division, print_function

import collections

import tensorflow as tf
from synthesizer.models.helpers import TacoTestHelper, TacoTrainingHelper
from tensorflow.contrib.seq2seq.python.ops import decoder
from tensorflow.contrib.seq2seq.python.ops import helper as helper_py
from tensorflow.python.framework import ops, tensor_shape
from tensorflow.python.layers import base as layers_base
from tensorflow.python.ops import rnn_cell_impl
from tensorflow.python.util import nest


class CustomDecoderOutput(
        collections.namedtuple("CustomDecoderOutput", ("rnn_output", "token_output", "sample_id"))):
    pass


class CustomDecoder(decoder.Decoder):
    """Custom sampling decoder.

    Allows for stop token prediction at inference time
    and returns the equivalent loss at training time.

    Note:
        Only use this decoder with Tacotron 2, as it only accepts tacotron custom helpers
    """

    def __init__(self, cell, helper, initial_state, output_layer=None):
        """Initialize CustomDecoder.
        Args:
            cell: An `RNNCell` instance.
            helper: A `Helper` instance.
            initial_state: A (possibly nested tuple of...) tensors and TensorArrays.
                The initial state of the RNNCell.
            output_layer: (Optional) An instance of `tf.layers.Layer`, i.e.,
                `tf.layers.Dense`. Optional layer to apply to the RNN output prior
                to storing the result or sampling.
        Raises:
            TypeError: if `cell`, `helper` or `output_layer` have an incorrect type.
        """
        rnn_cell_impl.assert_like_rnncell(type(cell), cell)
        if not isinstance(helper, helper_py.Helper):
            raise TypeError("helper must be a Helper, received: %s" % type(helper))
        if (output_layer is not None
                and not isinstance(output_layer, layers_base.Layer)):
            raise TypeError(
                "output_layer must be a Layer, received: %s" % type(output_layer))
        self._cell = cell
        self._helper = helper
        self._initial_state = initial_state
        self._output_layer = output_layer

    @property
    def batch_size(self):
        return self._helper.batch_size

    def _rnn_output_size(self):
        size = self._cell.output_size
        if self._output_layer is None:
            return size
        else:
            # To use the layer's compute_output_shape, we need to convert the
            # RNNCell's output_size entries into shapes with an unknown
            # batch size. We then pass this through the layer's
            # compute_output_shape and read off all but the first (batch)
            # dimensions to get the output size of the rnn with the layer
            # applied to the top.
            output_shape_with_unknown_batch = nest.map_structure(
                lambda s: tensor_shape.TensorShape([None]).concatenate(s),
                size)
            layer_output_shape = self._output_layer._compute_output_shape(  # pylint: disable=protected-access
                output_shape_with_unknown_batch)
            return nest.map_structure(lambda s: s[1:], layer_output_shape)

    @property
    def output_size(self):
        # Return the cell output and the id
        return CustomDecoderOutput(
            rnn_output=self._rnn_output_size(),
            token_output=self._helper.token_output_size,
            sample_id=self._helper.sample_ids_shape)

    @property
    def output_dtype(self):
        # Assume the dtype of the cell is the output_size structure
        # containing the input_state's first component's dtype.
        # Return that structure and the sample_ids_dtype from the helper.
        dtype = nest.flatten(self._initial_state)[0].dtype
        return CustomDecoderOutput(
            nest.map_structure(lambda _: dtype, self._rnn_output_size()),
            tf.float32,
            self._helper.sample_ids_dtype)

    def initialize(self, name=None):
        """Initialize the decoder.
        Args:
            name: Name scope for any created operations.
        Returns:
            `(finished, first_inputs, initial_state)`.
        """
        return self._helper.initialize() + (self._initial_state,)

    def step(self, time, inputs, state, name=None):
        """Perform a custom decoding step.
        Enables dynamic <stop_token> prediction
        Args:
            time: scalar `int32` tensor.
            inputs: A (structure of) input tensors.
            state: A (structure of) state tensors and TensorArrays.
            name: Name scope for any created operations.
        Returns:
            `(outputs, next_state, next_inputs, finished)`.
        """
        with ops.name_scope(name, "CustomDecoderStep", (time, inputs, state)):
            # Call the output projection wrapper cell
            (cell_outputs, stop_token), cell_state = self._cell(inputs, state)

            # Apply the output_layer (if it exists)
            if self._output_layer is not None:
                cell_outputs = self._output_layer(cell_outputs)
            sample_ids = self._helper.sample(
                time=time, outputs=cell_outputs, state=cell_state)

            (finished, next_inputs, next_state) = self._helper.next_inputs(
                time=time,
                outputs=cell_outputs,
                state=cell_state,
                sample_ids=sample_ids,
                stop_token_prediction=stop_token)

        outputs = CustomDecoderOutput(cell_outputs, stop_token, sample_ids)
        return (outputs, next_state, next_inputs, finished)
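
# Sketch of how this decoder is typically driven (this mirrors the call made
# in tacotron.py later in this commit; the maximum_iterations value here is an
# assumed example, not a fixed constant):
#
#   (frames, stop_tokens, _), final_state, _ = dynamic_decode(
#       CustomDecoder(decoder_cell, helper, decoder_init_state),
#       impute_finished=False,
#       maximum_iterations=1000)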
161
synthesizer_tacotron2/models/helpers.py
Normal file
@ -0,0 +1,161 @@
import numpy as np
import tensorflow as tf
from tensorflow.contrib.seq2seq import Helper


class TacoTestHelper(Helper):
    def __init__(self, batch_size, hparams):
        with tf.name_scope("TacoTestHelper"):
            self._batch_size = batch_size
            self._output_dim = hparams.num_mels
            self._reduction_factor = hparams.outputs_per_step
            self.stop_at_any = hparams.stop_at_any

    @property
    def batch_size(self):
        return self._batch_size

    @property
    def token_output_size(self):
        return self._reduction_factor

    @property
    def sample_ids_shape(self):
        return tf.TensorShape([])

    @property
    def sample_ids_dtype(self):
        return np.int32

    def initialize(self, name=None):
        return (tf.tile([False], [self._batch_size]),
                _go_frames(self._batch_size, self._output_dim))

    def sample(self, time, outputs, state, name=None):
        return tf.tile([0], [self._batch_size])  # Return all 0; we ignore them

    def next_inputs(self, time, outputs, state, sample_ids, stop_token_prediction, name=None):
        """Stop on EOS. Otherwise, pass the last output as the next input and pass through state."""
        with tf.name_scope("TacoTestHelper"):
            # A sequence is finished when the output probability is > 0.5
            finished = tf.cast(tf.round(stop_token_prediction), tf.bool)

            # Since we are predicting r frames at each step, two modes are
            # then possible:
            #   Stop when the model outputs p > 0.5 for any frame among the r frames (Recommended)
            #   Stop when the model outputs p > 0.5 for all r frames (Safer)
            # Note:
            #   With enough training steps, the model should be able to predict when to stop
            #   correctly, and the use of stop_at_any = True would be recommended. If however the
            #   model didn't learn to stop correctly yet (stops too soon), one could choose to use
            #   the safer option to get a correct synthesis
            if self.stop_at_any:
                finished = tf.reduce_any(tf.reduce_all(finished, axis=0))  # Recommended
            else:
                finished = tf.reduce_all(tf.reduce_all(finished, axis=0))  # Safer option

            # Feed the last output frame as the next input. outputs is [N, output_dim * r]
            next_inputs = outputs[:, -self._output_dim:]
            next_state = state
            return (finished, next_inputs, next_state)


class TacoTrainingHelper(Helper):
    def __init__(self, batch_size, targets, hparams, gta, evaluating, global_step):
        # inputs is [N, T_in], targets is [N, T_out, D]
        with tf.name_scope("TacoTrainingHelper"):
            self._batch_size = batch_size
            self._output_dim = hparams.num_mels
            self._reduction_factor = hparams.outputs_per_step
            self._ratio = tf.convert_to_tensor(hparams.tacotron_teacher_forcing_ratio)
            self.gta = gta
            self.eval = evaluating
            self._hparams = hparams
            self.global_step = global_step

            r = self._reduction_factor
            # Feed every r-th target frame as input
            self._targets = targets[:, r-1::r, :]

            # Maximum sequence length
            self._lengths = tf.tile([tf.shape(self._targets)[1]], [self._batch_size])

    @property
    def batch_size(self):
        return self._batch_size

    @property
    def token_output_size(self):
        return self._reduction_factor

    @property
    def sample_ids_shape(self):
        return tf.TensorShape([])

    @property
    def sample_ids_dtype(self):
        return np.int32

    def initialize(self, name=None):
        # Compute the teacher forcing ratio for this global step.
        # In GTA mode, override the teacher forcing scheme to work with full teacher forcing
        if self.gta:
            self._ratio = tf.convert_to_tensor(1.)  # Force GTA model to always feed ground-truth
        elif self.eval and self._hparams.natural_eval:
            self._ratio = tf.convert_to_tensor(0.)  # Force eval model to always feed predictions
        else:
            if self._hparams.tacotron_teacher_forcing_mode == "scheduled":
                self._ratio = _teacher_forcing_ratio_decay(
                    self._hparams.tacotron_teacher_forcing_init_ratio,
                    self.global_step, self._hparams)

        return (tf.tile([False], [self._batch_size]),
                _go_frames(self._batch_size, self._output_dim))

    def sample(self, time, outputs, state, name=None):
        return tf.tile([0], [self._batch_size])  # Return all 0; we ignore them

    def next_inputs(self, time, outputs, state, sample_ids, stop_token_prediction, name=None):
        with tf.name_scope(name or "TacoTrainingHelper"):
            # Synthesis stop (we let the model see paddings, as we mask them when computing the
            # loss functions)
            finished = (time + 1 >= self._lengths)

            # Pick previous outputs randomly with respect to the teacher forcing ratio
            next_inputs = tf.cond(
                tf.less(tf.random_uniform([], minval=0, maxval=1, dtype=tf.float32), self._ratio),
                lambda: self._targets[:, time, :],  # Teacher-forcing: return the true frame
                lambda: outputs[:, -self._output_dim:])

            # Pass on state
            next_state = state
            return (finished, next_inputs, next_state)


def _go_frames(batch_size, output_dim):
    """Returns all-zero <GO> frames for a given batch size and output dimension"""
    return tf.tile([[0.0]], [batch_size, output_dim])


def _teacher_forcing_ratio_decay(init_tfr, global_step, hparams):
    #################################################################
    # Narrow Cosine Decay:

    # Phase 1: tfr = 1
    # We only start the teacher forcing decay after 10k steps

    # Phase 2: tfr in ]0, 1[
    # The decay reaches its minimal value at step ~280k

    # Phase 3: tfr = 0
    # Clip by the minimal teacher forcing ratio value (step >~ 280k)
    #################################################################
    # Compute natural cosine decay
    tfr = tf.train.cosine_decay(init_tfr,
                                global_step=global_step - hparams.tacotron_teacher_forcing_start_decay,  # tfr = 1 at step 10k
                                decay_steps=hparams.tacotron_teacher_forcing_decay_steps,  # tfr = 0 at step ~280k
                                alpha=hparams.tacotron_teacher_forcing_decay_alpha,  # tfr = 0% of init_tfr as final value
                                name="tfr_cosine_decay")

    # Force the teacher forcing ratio to take its initial value when global step < start decay step.
    narrow_tfr = tf.cond(
        tf.less(global_step, tf.convert_to_tensor(hparams.tacotron_teacher_forcing_start_decay)),
        lambda: tf.convert_to_tensor(init_tfr),
        lambda: tfr)

    return narrow_tfr
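
# Illustrative schedule (assuming start_decay=10k, decay_steps=270k, alpha=0.
# and init_tfr=1.0, which are plausible hparams values rather than ones fixed
# here): tfr stays at 1.0 until step 10k, follows a half cosine down to 0.0
# around step 280k, and stays clipped at 0.0 afterwards.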
528
synthesizer_tacotron2/models/modules.py
Normal file
@ -0,0 +1,528 @@
import tensorflow as tf


class HighwayNet:
    def __init__(self, units, name=None):
        self.units = units
        self.scope = "HighwayNet" if name is None else name

        self.H_layer = tf.layers.Dense(units=self.units, activation=tf.nn.relu, name="H")
        self.T_layer = tf.layers.Dense(units=self.units, activation=tf.nn.sigmoid, name="T",
                                       bias_initializer=tf.constant_initializer(-1.))

    def __call__(self, inputs):
        with tf.variable_scope(self.scope):
            H = self.H_layer(inputs)
            T = self.T_layer(inputs)
            return H * T + inputs * (1. - T)
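
    # Highway output: y = H(x) * T(x) + x * (1 - T(x)), where T is a sigmoid
    # "transform gate". Its bias is initialized to -1 so that early in training
    # T(x) is small and the layer mostly passes x through unchanged.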


class CBHG:
    def __init__(self, K, conv_channels, pool_size, projections, projection_kernel_size,
                 n_highwaynet_layers, highway_units, rnn_units, is_training, name=None):
        self.K = K
        self.conv_channels = conv_channels
        self.pool_size = pool_size

        self.projections = projections
        self.projection_kernel_size = projection_kernel_size

        self.is_training = is_training
        self.scope = "CBHG" if name is None else name

        self.highway_units = highway_units
        self.highwaynet_layers = [
            HighwayNet(highway_units, name="{}_highwaynet_{}".format(self.scope, i + 1)) for i in
            range(n_highwaynet_layers)]
        self._fw_cell = tf.nn.rnn_cell.GRUCell(rnn_units, name="{}_forward_RNN".format(self.scope))
        self._bw_cell = tf.nn.rnn_cell.GRUCell(rnn_units, name="{}_backward_RNN".format(self.scope))

    def __call__(self, inputs, input_lengths):
        with tf.variable_scope(self.scope):
            with tf.variable_scope("conv_bank"):
                # Convolution bank: concatenate on the last axis to stack channels from all
                # convolutions.
                # The convolution bank uses multiple different kernel sizes to get many views
                # of the input sequence.
                # This is one of the strengths of the CBHG block on sequences.
                conv_outputs = tf.concat(
                    [conv1d(inputs, k, self.conv_channels, tf.nn.relu, self.is_training, 0.,
                            "conv1d_{}".format(k)) for k in range(1, self.K + 1)],
                    axis=-1
                )

            # Maxpooling (dimension reduction; using max instead of average helps find "edges"
            # in mels)
            maxpool_output = tf.layers.max_pooling1d(
                conv_outputs,
                pool_size=self.pool_size,
                strides=1,
                padding="same")

            # Two projection layers
            proj1_output = conv1d(maxpool_output, self.projection_kernel_size, self.projections[0],
                                  tf.nn.relu, self.is_training, 0., "proj1")
            proj2_output = conv1d(proj1_output, self.projection_kernel_size, self.projections[1],
                                  lambda _: _, self.is_training, 0., "proj2")

            # Residual connection
            highway_input = proj2_output + inputs

            # Additional projection in case of dimension mismatch (for the HighwayNet "residual"
            # connection)
            if highway_input.shape[2] != self.highway_units:
                highway_input = tf.layers.dense(highway_input, self.highway_units)

            # 4-layer HighwayNet
            for highwaynet in self.highwaynet_layers:
                highway_input = highwaynet(highway_input)
            rnn_input = highway_input

            # Bidirectional RNN
            outputs, states = tf.nn.bidirectional_dynamic_rnn(
                self._fw_cell,
                self._bw_cell,
                rnn_input,
                sequence_length=input_lengths,
                dtype=tf.float32)
            return tf.concat(outputs, axis=2)  # Concat forward and backward outputs
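
    # Shape sketch (assumed example sizes, not values fixed by this class):
    # inputs [N, T, 80] with K=8 and conv_channels=128 gives a conv bank concat
    # of [N, T, 8*128]; after the projections and highway layers, the
    # bidirectional GRU returns [N, T, 2*rnn_units].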


class ZoneoutLSTMCell(tf.nn.rnn_cell.RNNCell):
    """Wrapper for the tf LSTM to create a Zoneout LSTM Cell

    inspired by:
    https://github.com/teganmaharaj/zoneout/blob/master/zoneout_tensorflow.py

    Published by one of the "https://arxiv.org/pdf/1606.01305.pdf" paper writers.

    Many thanks to @Ondal90 for pointing this out. You sir are a hero!
    """

    def __init__(self, num_units, is_training, zoneout_factor_cell=0., zoneout_factor_output=0.,
                 state_is_tuple=True, name=None):
        """Initializer with the possibility to set different zoneout values for cell/hidden states.
        """
        zm = min(zoneout_factor_output, zoneout_factor_cell)
        zs = max(zoneout_factor_output, zoneout_factor_cell)

        if zm < 0. or zs > 1.:
            raise ValueError("One/both provided Zoneout factors are not in [0, 1]")

        self._cell = tf.nn.rnn_cell.LSTMCell(num_units, state_is_tuple=state_is_tuple, name=name)
        self._zoneout_cell = zoneout_factor_cell
        self._zoneout_outputs = zoneout_factor_output
        self.is_training = is_training
        self.state_is_tuple = state_is_tuple

    @property
    def state_size(self):
        return self._cell.state_size

    @property
    def output_size(self):
        return self._cell.output_size

    def __call__(self, inputs, state, scope=None):
        """Runs the vanilla LSTM Cell and applies zoneout.
        """
        # Apply vanilla LSTM
        output, new_state = self._cell(inputs, state, scope)

        if self.state_is_tuple:
            (prev_c, prev_h) = state
            (new_c, new_h) = new_state
        else:
            num_proj = self._cell._num_units if self._cell._num_proj is None else \
                self._cell._num_proj
            prev_c = tf.slice(state, [0, 0], [-1, self._cell._num_units])
            prev_h = tf.slice(state, [0, self._cell._num_units], [-1, num_proj])
            new_c = tf.slice(new_state, [0, 0], [-1, self._cell._num_units])
            new_h = tf.slice(new_state, [0, self._cell._num_units], [-1, num_proj])

        # Apply zoneout
        if self.is_training:
            # nn.dropout takes keep_prob (probability to keep activations), not drop_prob
            # (probability to mask activations)!
            c = (1 - self._zoneout_cell) * tf.nn.dropout(new_c - prev_c,
                                                         (1 - self._zoneout_cell)) + prev_c
            h = (1 - self._zoneout_outputs) * tf.nn.dropout(new_h - prev_h,
                                                            (1 - self._zoneout_outputs)) + prev_h

        else:
            c = (1 - self._zoneout_cell) * new_c + self._zoneout_cell * prev_c
            h = (1 - self._zoneout_outputs) * new_h + self._zoneout_outputs * prev_h

        # (fixed from the legacy tf.concat(1, [c, h]) argument order)
        new_state = tf.nn.rnn_cell.LSTMStateTuple(c, h) if self.state_is_tuple \
            else tf.concat([c, h], axis=1)

        return output, new_state
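
    # Zoneout in brief: during training, each state unit keeps its previous
    # value with probability z (implemented above via dropout on the state
    # delta, which also rescales the kept deltas); at eval time the states
    # are mixed deterministically instead:
    #   c_t = (1 - z_c) * c_t_new + z_c * c_{t-1}
    #   h_t = (1 - z_o) * h_t_new + z_o * h_{t-1}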


class EncoderConvolutions:
    """Encoder convolutional layers used to find local dependencies in input characters.
    """

    def __init__(self, is_training, hparams, activation=tf.nn.relu, scope=None):
        """
        Args:
            is_training: Boolean, determines if the model is training or in inference, to control
                dropout
            hparams: hyperparameters object; provides the kernel size (enc_conv_kernel_size),
                number of kernels (enc_conv_channels) and number of layers (enc_conv_num_layers)
            activation: callable, activation function for each convolutional layer
            scope: EncoderConvolutions scope.
        """
        super(EncoderConvolutions, self).__init__()
        self.is_training = is_training

        self.kernel_size = hparams.enc_conv_kernel_size
        self.channels = hparams.enc_conv_channels
        self.activation = activation
        self.scope = "enc_conv_layers" if scope is None else scope
        self.drop_rate = hparams.tacotron_dropout_rate
        self.enc_conv_num_layers = hparams.enc_conv_num_layers

    def __call__(self, inputs):
        with tf.variable_scope(self.scope):
            x = inputs
            for i in range(self.enc_conv_num_layers):
                x = conv1d(x, self.kernel_size, self.channels, self.activation,
                           self.is_training, self.drop_rate,
                           "conv_layer_{}_".format(i + 1) + self.scope)
        return x


class EncoderRNN:
    """Encoder bidirectional one-layer LSTM
    """

    def __init__(self, is_training, size=256, zoneout=0.1, scope=None):
        """
        Args:
            is_training: Boolean, determines if the model is training or in inference, to control
                zoneout
            size: integer, the number of LSTM units for each direction
            zoneout: the zoneout factor
            scope: EncoderRNN scope.
        """
        super(EncoderRNN, self).__init__()
        self.is_training = is_training

        self.size = size
        self.zoneout = zoneout
        self.scope = "encoder_LSTM" if scope is None else scope

        # Create forward LSTM Cell
        self._fw_cell = ZoneoutLSTMCell(size, is_training,
                                        zoneout_factor_cell=zoneout,
                                        zoneout_factor_output=zoneout,
                                        name="encoder_fw_LSTM")

        # Create backward LSTM Cell
        self._bw_cell = ZoneoutLSTMCell(size, is_training,
                                        zoneout_factor_cell=zoneout,
                                        zoneout_factor_output=zoneout,
                                        name="encoder_bw_LSTM")

    def __call__(self, inputs, input_lengths):
        with tf.variable_scope(self.scope):
            outputs, (fw_state, bw_state) = tf.nn.bidirectional_dynamic_rnn(
                self._fw_cell,
                self._bw_cell,
                inputs,
                sequence_length=input_lengths,
                dtype=tf.float32,
                swap_memory=True)

            return tf.concat(outputs, axis=2)  # Concat and return forward + backward outputs


class Prenet:
    """Two fully connected layers used as an information bottleneck for the attention.
    """

    def __init__(self, is_training, layers_sizes=[256, 256], drop_rate=0.5, activation=tf.nn.relu,
                 scope=None):
        """
        Args:
            is_training: Boolean, kept for API consistency; prenet dropout stays active in both
                training and inference (see the note in __call__)
            layers_sizes: list of integers, the length of the list represents the number of
                pre-net layers and the list values represent each layer's number of units
            drop_rate: float, the dropout rate applied in every prenet layer
            activation: callable, activation function of the prenet layers.
            scope: Prenet scope.
        """
        super(Prenet, self).__init__()
        self.drop_rate = drop_rate

        self.layers_sizes = layers_sizes
        self.activation = activation
        self.is_training = is_training

        self.scope = "prenet" if scope is None else scope

    def __call__(self, inputs):
        x = inputs

        with tf.variable_scope(self.scope):
            for i, size in enumerate(self.layers_sizes):
                dense = tf.layers.dense(x, units=size, activation=self.activation,
                                        name="dense_{}".format(i + 1))
                # The paper discussed introducing diversity in generation at inference time
                # by using a dropout of 0.5 only in the prenet layers (in both training and
                # inference), hence training=True below.
                x = tf.layers.dropout(dense, rate=self.drop_rate, training=True,
                                      name="dropout_{}".format(i + 1) + self.scope)
        return x


class DecoderRNN:
    """Decoder: two unidirectional LSTM Cells
    """

    def __init__(self, is_training, layers=2, size=1024, zoneout=0.1, scope=None):
        """
        Args:
            is_training: Boolean, determines if the model is in training or inference, to control
                zoneout
            layers: integer, the number of LSTM layers in the decoder
            size: integer, the number of LSTM units in each layer
            zoneout: the zoneout factor
            scope: DecoderRNN scope.
        """
        super(DecoderRNN, self).__init__()
        self.is_training = is_training

        self.layers = layers
        self.size = size
        self.zoneout = zoneout
        self.scope = "decoder_rnn" if scope is None else scope

        # Create a set of LSTM layers
        self.rnn_layers = [ZoneoutLSTMCell(size, is_training,
                                           zoneout_factor_cell=zoneout,
                                           zoneout_factor_output=zoneout,
                                           name="decoder_LSTM_{}".format(i + 1)) for i in
                           range(layers)]

        self._cell = tf.contrib.rnn.MultiRNNCell(self.rnn_layers, state_is_tuple=True)

    def __call__(self, inputs, states):
        with tf.variable_scope(self.scope):
            return self._cell(inputs, states)


class FrameProjection:
    """Projection layer to r * num_mels dimensions or num_mels dimensions
    """

    def __init__(self, shape=80, activation=None, scope=None):
        """
        Args:
            shape: integer, dimensionality of the output space (r*n_mels for the decoder or
                n_mels for the postnet)
            activation: callable, activation function
            scope: FrameProjection scope.
        """
        super(FrameProjection, self).__init__()

        self.shape = shape
        self.activation = activation

        self.scope = "Linear_projection" if scope is None else scope
        self.dense = tf.layers.Dense(units=shape, activation=activation,
                                     name="projection_{}".format(self.scope))

    def __call__(self, inputs):
        with tf.variable_scope(self.scope):
            # If activation==None, this returns a simple linear projection;
            # else the projection is passed through an activation function
            # output = tf.layers.dense(inputs, units=self.shape, activation=self.activation,
            #                          name="projection_{}".format(self.scope))
            output = self.dense(inputs)

            return output


class StopProjection:
    """Projection to a scalar passed through a sigmoid activation
    """

    def __init__(self, is_training, shape=1, activation=tf.nn.sigmoid, scope=None):
        """
        Args:
            is_training: Boolean, used to skip the sigmoid at training time, since the sigmoid
                is already integrated into the sigmoid_cross_entropy loss
            shape: integer, dimensionality of the output space. Defaults to 1 (scalar)
            activation: callable, activation function; only used during inference
            scope: StopProjection scope.
        """
        super(StopProjection, self).__init__()
        self.is_training = is_training

        self.shape = shape
        self.activation = activation
        self.scope = "stop_token_projection" if scope is None else scope

    def __call__(self, inputs):
        with tf.variable_scope(self.scope):
            output = tf.layers.dense(inputs, units=self.shape,
                                     activation=None, name="projection_{}".format(self.scope))

            # During training, don't use the activation, as it is integrated into the
            # sigmoid_cross_entropy loss function
            if self.is_training:
                return output
            return self.activation(output)


class Postnet:
    """Postnet that takes the final decoder output and fine-tunes it (using vision on past and
    future frames)
    """

    def __init__(self, is_training, hparams, activation=tf.nn.tanh, scope=None):
        """
        Args:
            is_training: Boolean, determines if the model is training or in inference, to control
                dropout
            hparams: hyperparameters object; provides the kernel size (postnet_kernel_size),
                number of kernels (postnet_channels) and number of layers (postnet_num_layers)
            activation: callable, postnet activation function for each convolutional layer
            scope: Postnet scope.
        """
        super(Postnet, self).__init__()
        self.is_training = is_training

        self.kernel_size = hparams.postnet_kernel_size
        self.channels = hparams.postnet_channels
        self.activation = activation
        self.scope = "postnet_convolutions" if scope is None else scope
        self.postnet_num_layers = hparams.postnet_num_layers
        self.drop_rate = hparams.tacotron_dropout_rate

    def __call__(self, inputs):
        with tf.variable_scope(self.scope):
            x = inputs
            for i in range(self.postnet_num_layers - 1):
                x = conv1d(x, self.kernel_size, self.channels, self.activation,
                           self.is_training, self.drop_rate,
                           "conv_layer_{}_".format(i + 1) + self.scope)
            # Last layer without a non-linearity; note the hardcoded layer index 5,
            # which assumes the default postnet_num_layers of 5
            x = conv1d(x, self.kernel_size, self.channels, lambda _: _, self.is_training,
                       self.drop_rate,
                       "conv_layer_{}_".format(5) + self.scope)
        return x


def conv1d(inputs, kernel_size, channels, activation, is_training, drop_rate, scope):
    with tf.variable_scope(scope):
        conv1d_output = tf.layers.conv1d(
            inputs,
            filters=channels,
            kernel_size=kernel_size,
            activation=None,
            padding="same")
        batched = tf.layers.batch_normalization(conv1d_output, training=is_training)
        activated = activation(batched)
        return tf.layers.dropout(activated, rate=drop_rate, training=is_training,
                                 name="dropout_{}".format(scope))


def _round_up_tf(x, multiple):
    # Tf version of: remainder = x % multiple
    remainder = tf.mod(x, multiple)
    # Tf version of: return x if remainder == 0 else x + multiple - remainder
    x_round = tf.cond(tf.equal(remainder, tf.zeros(tf.shape(remainder), dtype=tf.int32)),
                      lambda: x,
                      lambda: x + multiple - remainder)

    return x_round


def sequence_mask(lengths, r, expand=True):
    """Returns a 2-D or 3-D tensorflow sequence mask depending on the argument "expand"
    """
    max_len = tf.reduce_max(lengths)
    max_len = _round_up_tf(max_len, tf.convert_to_tensor(r))
    if expand:
        return tf.expand_dims(tf.sequence_mask(lengths, maxlen=max_len, dtype=tf.float32), axis=-1)
    return tf.sequence_mask(lengths, maxlen=max_len, dtype=tf.float32)
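
# Example (assumed values): lengths=[1, 3, 2] with r=2 rounds max_len up from
# 3 to 4, so sequence_mask(lengths, 2, expand=False) evaluates to
#   [[1., 0., 0., 0.],
#    [1., 1., 1., 0.],
#    [1., 1., 0., 0.]]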


def MaskedMSE(targets, outputs, targets_lengths, hparams, mask=None):
    """Computes a masked Mean Squared Error
    """

    # [batch_size, time_dimension, 1]
    # example:
    # sequence_mask([1, 3, 2], 5) = [[[1., 0., 0., 0., 0.]],
    #                                [[1., 1., 1., 0., 0.]],
    #                                [[1., 1., 0., 0., 0.]]]
    # Note the maxlen argument that ensures the mask shape is compatible with r > 1.
    # This will by default mask the extra paddings caused by r > 1
    if mask is None:
        mask = sequence_mask(targets_lengths, hparams.outputs_per_step, True)

    # [batch_size, time_dimension, channel_dimension(mels)]
    ones = tf.ones(shape=[tf.shape(mask)[0], tf.shape(mask)[1], tf.shape(targets)[-1]],
                   dtype=tf.float32)
    mask_ = mask * ones

    with tf.control_dependencies([tf.assert_equal(tf.shape(targets), tf.shape(mask_))]):
        return tf.losses.mean_squared_error(labels=targets, predictions=outputs, weights=mask_)


def MaskedSigmoidCrossEntropy(targets, outputs, targets_lengths, hparams, mask=None):
    """Computes a masked sigmoid cross-entropy with logits
    """

    # [batch_size, time_dimension]
    # example:
    # sequence_mask([1, 3, 2], 5) = [[1., 0., 0., 0., 0.],
    #                                [1., 1., 1., 0., 0.],
    #                                [1., 1., 0., 0., 0.]]
    # Note the maxlen argument that ensures the mask shape is compatible with r > 1.
    # This will by default mask the extra paddings caused by r > 1
    if mask is None:
        mask = sequence_mask(targets_lengths, hparams.outputs_per_step, False)

    with tf.control_dependencies([tf.assert_equal(tf.shape(targets), tf.shape(mask))]):
        # Use a weighted sigmoid cross-entropy to measure the <stop_token> loss. Setting
        # hparams.cross_entropy_pos_weight to 1 has the same effect as vanilla
        # tf.nn.sigmoid_cross_entropy_with_logits.
        losses = tf.nn.weighted_cross_entropy_with_logits(targets=targets, logits=outputs,
                                                          pos_weight=hparams.cross_entropy_pos_weight)

    with tf.control_dependencies([tf.assert_equal(tf.shape(mask), tf.shape(losses))]):
        masked_loss = losses * mask

    return tf.reduce_sum(masked_loss) / tf.count_nonzero(masked_loss, dtype=tf.float32)
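
# Note on pos_weight: values > 1 up-weight the positive (<stop_token> = 1)
# frames, which are rare compared to non-final frames; e.g. with
# hparams.cross_entropy_pos_weight = 20 (an assumed example value) a missed
# stop frame costs 20x more than a false alarm of the same magnitude.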


def MaskedLinearLoss(targets, outputs, targets_lengths, hparams, mask=None):
    """Computes a masked MAE loss with priority to low frequencies
    """

    # [batch_size, time_dimension, 1]
    # example:
    # sequence_mask([1, 3, 2], 5) = [[[1., 0., 0., 0., 0.]],
    #                                [[1., 1., 1., 0., 0.]],
    #                                [[1., 1., 0., 0., 0.]]]
    # Note the maxlen argument that ensures the mask shape is compatible with r > 1.
    # This will by default mask the extra paddings caused by r > 1
    if mask is None:
        mask = sequence_mask(targets_lengths, hparams.outputs_per_step, True)

    # [batch_size, time_dimension, channel_dimension(freq)]
    ones = tf.ones(shape=[tf.shape(mask)[0], tf.shape(mask)[1], tf.shape(targets)[-1]],
                   dtype=tf.float32)
    mask_ = mask * ones

    l1 = tf.abs(targets - outputs)
    n_priority_freq = int(2000 / (hparams.sample_rate * 0.5) * hparams.num_freq)

    with tf.control_dependencies([tf.assert_equal(tf.shape(targets), tf.shape(mask_))]):
        masked_l1 = l1 * mask_
        masked_l1_low = masked_l1[:, :, 0:n_priority_freq]

    mean_l1 = tf.reduce_sum(masked_l1) / tf.reduce_sum(mask_)
    mean_l1_low = tf.reduce_sum(masked_l1_low) / tf.reduce_sum(mask_)

    return 0.5 * mean_l1 + 0.5 * mean_l1_low
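
# Worked example of the priority band (assumed hparams, not fixed here): with
# sample_rate=16000 and num_freq=1025, n_priority_freq = int(2000/8000 * 1025)
# = 256, so the first 256 frequency bins (roughly 0-2 kHz, where most speech
# energy lives) are counted twice in the final loss.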
521
synthesizer_tacotron2/models/tacotron.py
Normal file
@ -0,0 +1,521 @@
import tensorflow as tf
from synthesizer.utils.symbols import symbols
from synthesizer.infolog import log
from synthesizer.models.helpers import TacoTrainingHelper, TacoTestHelper
from synthesizer.models.modules import *
from tensorflow.contrib.seq2seq import dynamic_decode
from synthesizer.models.architecture_wrappers import TacotronEncoderCell, TacotronDecoderCell
from synthesizer.models.custom_decoder import CustomDecoder
from synthesizer.models.attention import LocationSensitiveAttention

import numpy as np


def split_func(x, split_pos):
    rst = []
    start = 0
    # x will be a numpy array with the contents of the placeholder below
    for i in range(split_pos.shape[0]):
        rst.append(x[:, start:start + split_pos[i]])
        start += split_pos[i]
    return rst
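
# Example (assumed values): x of shape [N, 7] with split_pos=[3, 4] returns
# [x[:, 0:3], x[:, 3:7]] -- one slice per GPU, since per-GPU batches are
# packed along the second axis before being fed through tf.py_func below.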
|
||||
|
||||
|
||||
class Tacotron():
|
||||
"""Tacotron-2 Feature prediction Model.
|
||||
"""
|
||||
|
||||
def __init__(self, hparams):
|
||||
self._hparams = hparams
|
||||
|
||||
def initialize(self, inputs, input_lengths, embed_targets, mel_targets=None,
|
||||
stop_token_targets=None, linear_targets=None, targets_lengths=None, gta=False,
|
||||
global_step=None, is_training=False, is_evaluating=False, split_infos=None):
|
||||
"""
|
||||
Initializes the model for inference sets "mel_outputs" and "alignments" fields.
|
||||
Args:
|
||||
- inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
|
||||
steps in the input time series, and values are character IDs
|
||||
- input_lengths: int32 Tensor with shape [N] where N is batch size and values are the
|
||||
lengths of each sequence in inputs.
|
||||
- embed_targets: float32 Tensor with shape [N, E] where E is the speaker
|
||||
embedding size.
|
||||
- mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size,
|
||||
T_out is number of steps in the output time series, M is num_mels, and values are
|
||||
entries in the mel spectrogram. Only needed for training.
|
||||
"""
|
||||
if mel_targets is None and stop_token_targets is not None:
|
||||
raise ValueError("no multi targets were provided but token_targets were given")
|
||||
if mel_targets is not None and stop_token_targets is None and not gta:
|
||||
raise ValueError("Mel targets are provided without corresponding token_targets")
|
||||
if not gta and self._hparams.predict_linear == True and linear_targets is None and \
|
||||
is_training:
|
||||
raise ValueError(
|
||||
"Model is set to use post processing to predict linear spectrograms in training "
|
||||
"but no linear targets given!")
|
||||
if gta and linear_targets is not None:
|
||||
raise ValueError("Linear spectrogram prediction is not supported in GTA mode!")
|
||||
if is_training and self._hparams.mask_decoder and targets_lengths is None:
|
||||
raise RuntimeError(
|
||||
"Model set to mask paddings but no targets lengths provided for the mask!")
|
||||
if is_training and is_evaluating:
|
||||
raise RuntimeError(
|
||||
"Model can not be in training and evaluation modes at the same time!")
|
||||
|
||||
split_device = "/cpu:0" if self._hparams.tacotron_num_gpus > 1 or \
|
||||
self._hparams.split_on_cpu else "/gpu:{}".format(
|
||||
self._hparams.tacotron_gpu_start_idx)
|
||||
with tf.device(split_device):
|
||||
hp = self._hparams
|
||||
lout_int = [tf.int32] * hp.tacotron_num_gpus
|
||||
lout_float = [tf.float32] * hp.tacotron_num_gpus
|
||||
|
||||
tower_input_lengths = tf.split(input_lengths, num_or_size_splits=hp.tacotron_num_gpus,
|
||||
axis=0)
|
||||
tower_targets_lengths = \
|
||||
tf.split(targets_lengths, num_or_size_splits=hp.tacotron_num_gpus, axis=0) if \
|
||||
targets_lengths is not None else targets_lengths
|
||||
|
||||
### SV2TTS ###
|
||||
|
||||
tower_embed_targets = tf.split(embed_targets, num_or_size_splits=hp.tacotron_num_gpus,
|
||||
axis=0)
|
||||
|
||||
##############
|
||||
|
||||
p_inputs = tf.py_func(split_func, [inputs, split_infos[:, 0]], lout_int)
|
||||
p_mel_targets = tf.py_func(split_func, [mel_targets, split_infos[:, 1]],
|
||||
lout_float) if mel_targets is not None else mel_targets
|
||||
p_stop_token_targets = tf.py_func(split_func, [stop_token_targets, split_infos[:, 2]],
|
||||
lout_float) if stop_token_targets is not None else \
|
||||
stop_token_targets
|
||||
|
||||
tower_inputs = []
|
||||
tower_mel_targets = []
|
||||
tower_stop_token_targets = []
|
||||
|
||||
batch_size = tf.shape(inputs)[0]
|
||||
mel_channels = hp.num_mels
|
||||
for i in range(hp.tacotron_num_gpus):
|
||||
tower_inputs.append(tf.reshape(p_inputs[i], [batch_size, -1]))
|
||||
if p_mel_targets is not None:
|
||||
tower_mel_targets.append(
|
||||
tf.reshape(p_mel_targets[i], [batch_size, -1, mel_channels]))
|
||||
if p_stop_token_targets is not None:
|
||||
tower_stop_token_targets.append(
|
||||
tf.reshape(p_stop_token_targets[i], [batch_size, -1]))
|
||||
|
||||
self.tower_decoder_output = []
|
||||
self.tower_alignments = []
|
||||
self.tower_stop_token_prediction = []
|
||||
self.tower_mel_outputs = []
|
||||
|
||||
tower_embedded_inputs = []
|
||||
tower_enc_conv_output_shape = []
|
||||
tower_encoder_cond_outputs = []
|
||||
tower_residual = []
|
||||
tower_projected_residual = []
|
||||
|
||||
# 1. Declare GPU Devices
|
||||
gpus = ["/gpu:{}".format(i) for i in
|
||||
range(hp.tacotron_gpu_start_idx, hp.tacotron_gpu_start_idx + hp.tacotron_num_gpus)]
|
||||
for i in range(hp.tacotron_num_gpus):
|
||||
with tf.device(tf.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0",
|
||||
worker_device=gpus[i])):
|
||||
with tf.variable_scope("inference") as scope:
|
||||
assert hp.tacotron_teacher_forcing_mode in ("constant", "scheduled")
|
||||
if hp.tacotron_teacher_forcing_mode == "scheduled" and is_training:
|
||||
assert global_step is not None
|
||||
|
||||
# GTA is only used for predicting mels to train Wavenet vocoder, so we ommit
|
||||
# post processing when doing GTA synthesis
|
||||
post_condition = hp.predict_linear and not gta
|
||||
|
||||
# Embeddings ==> [batch_size, sequence_length, embedding_dim]
|
||||
self.embedding_table = tf.get_variable(
|
||||
"inputs_embedding", [len(symbols), hp.embedding_dim], dtype=tf.float32)
|
||||
embedded_inputs = tf.nn.embedding_lookup(self.embedding_table, tower_inputs[i])
|
||||
|
||||
# Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
|
||||
encoder_cell = TacotronEncoderCell(
|
||||
EncoderConvolutions(is_training, hparams=hp, scope="encoder_convolutions"),
|
||||
EncoderRNN(is_training, size=hp.encoder_lstm_units,
|
||||
zoneout=hp.tacotron_zoneout_rate, scope="encoder_LSTM"))
|
||||
|
||||
encoder_outputs = encoder_cell(embedded_inputs, tower_input_lengths[i])
|
||||
|
||||
# For shape visualization purpose
|
||||
enc_conv_output_shape = encoder_cell.conv_output_shape
|
||||
|
||||
|
||||
### SV2TT2 ###
|
||||
|
||||
# Append the speaker embedding to the encoder output at each timestep
|
||||
tileable_shape = [-1, 1, self._hparams.speaker_embedding_size]
|
||||
tileable_embed_targets = tf.reshape(tower_embed_targets[i], tileable_shape)
|
||||
tiled_embed_targets = tf.tile(tileable_embed_targets,
|
||||
[1, tf.shape(encoder_outputs)[1], 1])
|
||||
encoder_cond_outputs = tf.concat((encoder_outputs, tiled_embed_targets), 2)
|
||||
|
||||
##############
|
||||
|
||||
|
||||
# Decoder Parts
|
||||
# Attention Decoder Prenet
|
||||
prenet = Prenet(is_training, layers_sizes=hp.prenet_layers,
|
||||
drop_rate=hp.tacotron_dropout_rate, scope="decoder_prenet")
|
||||
# Attention Mechanism
|
||||
attention_mechanism = LocationSensitiveAttention(hp.attention_dim,
|
||||
encoder_cond_outputs,
|
||||
hparams=hp,
|
||||
mask_encoder=hp.mask_encoder,
|
||||
memory_sequence_length=tf.reshape(
|
||||
tower_input_lengths[i],
|
||||
[-1]),
|
||||
smoothing=hp.smoothing,
|
||||
cumulate_weights=hp.cumulative_weights)
|
||||
# Decoder LSTM Cells
|
||||
decoder_lstm = DecoderRNN(is_training, layers=hp.decoder_layers,
|
||||
size=hp.decoder_lstm_units,
|
||||
zoneout=hp.tacotron_zoneout_rate,
|
||||
scope="decoder_LSTM")
|
||||
# Frames Projection layer
|
||||
frame_projection = FrameProjection(hp.num_mels * hp.outputs_per_step,
|
||||
scope="linear_transform_projection")
|
||||
# <stop_token> projection layer
|
||||
stop_projection = StopProjection(is_training or is_evaluating, shape=hp
|
||||
.outputs_per_step,
|
||||
scope="stop_token_projection")
|
||||
|
||||
# Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
|
||||
decoder_cell = TacotronDecoderCell(
|
||||
prenet,
|
||||
attention_mechanism,
|
||||
decoder_lstm,
|
||||
frame_projection,
|
||||
stop_projection)
|
||||
|
||||
# Define the helper for our decoder
|
||||
if is_training or is_evaluating or gta:
|
||||
self.helper = TacoTrainingHelper(batch_size, tower_mel_targets[i], hp, gta,
|
||||
is_evaluating, global_step)
|
||||
else:
|
||||
self.helper = TacoTestHelper(batch_size, hp)
|
||||
|
||||
# initial decoder state
|
||||
decoder_init_state = decoder_cell.zero_state(batch_size=batch_size,
|
||||
dtype=tf.float32)
|
||||
|
||||
# Only use max iterations at synthesis time
|
||||
max_iters = hp.max_iters if not (is_training or is_evaluating) else None
|
||||
|
||||
# Decode
|
||||
(frames_prediction, stop_token_prediction,
|
||||
_), final_decoder_state, _ = dynamic_decode(
|
||||
CustomDecoder(decoder_cell, self.helper, decoder_init_state),
|
||||
impute_finished=False,
|
||||
maximum_iterations=max_iters,
|
||||
swap_memory=hp.tacotron_swap_with_cpu)
|
||||
|
||||
# Reshape outputs to be one output per entry
|
||||
# ==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
|
||||
decoder_output = tf.reshape(frames_prediction, [batch_size, -1, hp.num_mels])
|
||||
stop_token_prediction = tf.reshape(stop_token_prediction, [batch_size, -1])
|
||||
|
||||
# Postnet
|
||||
postnet = Postnet(is_training, hparams=hp, scope="postnet_convolutions")
|
||||
|
||||
# Compute residual using post-net ==> [batch_size, decoder_steps * r,
|
||||
# postnet_channels]
|
||||
residual = postnet(decoder_output)
|
||||
|
||||
# Project residual to same dimension as mel spectrogram
|
||||
                # ==> [batch_size, decoder_steps * r, num_mels]
                residual_projection = FrameProjection(hp.num_mels, scope="postnet_projection")
                projected_residual = residual_projection(residual)

                # Compute the mel spectrogram
                mel_outputs = decoder_output + projected_residual

                if post_condition:
                    # Add post-processing CBHG. This does a great job at extracting features
                    # from mels before projection to Linear specs.
                    post_cbhg = CBHG(hp.cbhg_kernels, hp.cbhg_conv_channels, hp.cbhg_pool_size,
                                     [hp.cbhg_projection, hp.num_mels],
                                     hp.cbhg_projection_kernel_size, hp.cbhg_highwaynet_layers,
                                     hp.cbhg_highway_units, hp.cbhg_rnn_units, is_training,
                                     name="CBHG_postnet")

                    # [batch_size, decoder_steps(mel_frames), cbhg_channels]
                    post_outputs = post_cbhg(mel_outputs, None)

                    # Linear projection of extracted features to make linear spectrogram
                    linear_specs_projection = FrameProjection(hp.num_freq,
                                                              scope="cbhg_linear_specs_projection")

                    # [batch_size, decoder_steps(linear_frames), num_freq]
                    linear_outputs = linear_specs_projection(post_outputs)

                # Grab alignments from the final decoder state
                alignments = tf.transpose(final_decoder_state.alignment_history.stack(),
                                          [1, 2, 0])

                self.tower_decoder_output.append(decoder_output)
                self.tower_alignments.append(alignments)
                self.tower_stop_token_prediction.append(stop_token_prediction)
                self.tower_mel_outputs.append(mel_outputs)
                tower_embedded_inputs.append(embedded_inputs)
                tower_enc_conv_output_shape.append(enc_conv_output_shape)
                tower_encoder_cond_outputs.append(encoder_cond_outputs)
                tower_residual.append(residual)
                tower_projected_residual.append(projected_residual)

                if post_condition:
                    self.tower_linear_outputs.append(linear_outputs)
            log("initialisation done {}".format(gpus[i]))

        if is_training:
            self.ratio = self.helper._ratio
        self.tower_inputs = tower_inputs
        self.tower_input_lengths = tower_input_lengths
        self.tower_mel_targets = tower_mel_targets
        # self.tower_linear_targets = tower_linear_targets
        self.tower_targets_lengths = tower_targets_lengths
        self.tower_stop_token_targets = tower_stop_token_targets

        self.all_vars = tf.trainable_variables()

        log("Initialized Tacotron model. Dimensions (? = dynamic shape): ")
        log("  Train mode:               {}".format(is_training))
        log("  Eval mode:                {}".format(is_evaluating))
        log("  GTA mode:                 {}".format(gta))
        log("  Synthesis mode:           {}".format(not (is_training or is_evaluating)))
        log("  Input:                    {}".format(inputs.shape))
        for i in range(hp.tacotron_num_gpus + hp.tacotron_gpu_start_idx):
            log("  device:                   {}".format(i))
            log("  embedding:                {}".format(tower_embedded_inputs[i].shape))
            log("  enc conv out:             {}".format(tower_enc_conv_output_shape[i]))
            log("  encoder out (cond):       {}".format(tower_encoder_cond_outputs[i].shape))
            log("  decoder out:              {}".format(self.tower_decoder_output[i].shape))
            log("  residual out:             {}".format(tower_residual[i].shape))
            log("  projected residual out:   {}".format(tower_projected_residual[i].shape))
            log("  mel out:                  {}".format(self.tower_mel_outputs[i].shape))
            if post_condition:
                log("  linear out:               {}".format(self.tower_linear_outputs[i].shape))
            log("  <stop_token> out:         {}".format(self.tower_stop_token_prediction[i].shape))

        # 1_000_000 is causing syntax problems for some people?! Python please :)
        log("  Tacotron Parameters       {:.3f} Million.".format(
            np.sum([np.prod(v.get_shape().as_list()) for v in self.all_vars]) / 1000000))

    def add_loss(self):
        """Adds loss to the model. Sets "loss" field. initialize must have been called."""
        hp = self._hparams

        self.tower_before_loss = []
        self.tower_after_loss = []
        self.tower_stop_token_loss = []
        self.tower_regularization_loss = []
        self.tower_linear_loss = []
        self.tower_loss = []

        total_before_loss = 0
        total_after_loss = 0
        total_stop_token_loss = 0
        total_regularization_loss = 0
        total_linear_loss = 0
        total_loss = 0

        gpus = ["/gpu:{}".format(i) for i in
                range(hp.tacotron_gpu_start_idx, hp.tacotron_gpu_start_idx + hp.tacotron_num_gpus)]

        for i in range(hp.tacotron_num_gpus):
            with tf.device(tf.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0",
                                                          worker_device=gpus[i])):
                with tf.variable_scope("loss") as scope:
                    if hp.mask_decoder:
                        # Compute loss of predictions before postnet
                        before = MaskedMSE(self.tower_mel_targets[i], self.tower_decoder_output[i],
                                           self.tower_targets_lengths[i],
                                           hparams=self._hparams)
                        # Compute loss after postnet
                        after = MaskedMSE(self.tower_mel_targets[i], self.tower_mel_outputs[i],
                                          self.tower_targets_lengths[i],
                                          hparams=self._hparams)
                        # Compute <stop_token> loss (for learning dynamic generation stop)
                        stop_token_loss = MaskedSigmoidCrossEntropy(
                            self.tower_stop_token_targets[i],
                            self.tower_stop_token_prediction[i], self.tower_targets_lengths[i],
                            hparams=self._hparams)
                        # SV2TTS extra L1 loss (disabled for now)
                        # linear_loss = MaskedLinearLoss(self.tower_mel_targets[i],
                        #                                self.tower_decoder_output[i],
                        #                                self.tower_targets_lengths[i],
                        #                                hparams=self._hparams)
                        linear_loss = 0.
                    else:
                        # Compute loss of predictions before postnet
                        before = tf.losses.mean_squared_error(self.tower_mel_targets[i],
                                                              self.tower_decoder_output[i])
                        # Compute loss after postnet
                        after = tf.losses.mean_squared_error(self.tower_mel_targets[i],
                                                             self.tower_mel_outputs[i])
                        # Compute <stop_token> loss (for learning dynamic generation stop)
                        stop_token_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
                            labels=self.tower_stop_token_targets[i],
                            logits=self.tower_stop_token_prediction[i]))

                        # SV2TTS extra L1 loss
                        l1 = tf.abs(self.tower_mel_targets[i] - self.tower_decoder_output[i])
                        linear_loss = tf.reduce_mean(l1)

                        # if hp.predict_linear:
                        #     # Compute linear loss
                        #     # From https://github.com/keithito/tacotron/blob/tacotron2-work-in
                        #     # -progress/models/tacotron.py
                        #     # Prioritize loss for frequencies under 2000 Hz.
                        #     l1 = tf.abs(self.tower_linear_targets[i] - self.tower_linear_outputs[i])
                        #     n_priority_freq = int(2000 / (hp.sample_rate * 0.5) * hp.num_freq)
                        #     linear_loss = 0.5 * tf.reduce_mean(l1) + 0.5 * tf.reduce_mean(
                        #         l1[:, :, 0:n_priority_freq])
                        # else:
                        #     linear_loss = 0.

                    # Compute the regularization weight
                    if hp.tacotron_scale_regularization:
                        reg_weight_scaler = 1. / (
                                2 * hp.max_abs_value) if hp.symmetric_mels else 1. / (
                            hp.max_abs_value)
                        reg_weight = hp.tacotron_reg_weight * reg_weight_scaler
                    else:
                        reg_weight = hp.tacotron_reg_weight

                    # Regularize variables
                    # Exclude all types of bias, RNN (Bengio et al. "On the difficulty of training
                    # recurrent neural networks"), embeddings and prediction projection layers.
                    # Note that we consider the attention mechanism v_a weights as a prediction
                    # projection layer and we don't regularize it. (This gave better stability)
                    regularization = tf.add_n([tf.nn.l2_loss(v) for v in self.all_vars
                                               if not (
                                "bias" in v.name or "Bias" in v.name or "_projection" in v.name or "inputs_embedding" in v.name
                                or "RNN" in v.name or "LSTM" in v.name)]) * reg_weight

                    # Compute final loss term
                    self.tower_before_loss.append(before)
                    self.tower_after_loss.append(after)
                    self.tower_stop_token_loss.append(stop_token_loss)
                    self.tower_regularization_loss.append(regularization)
                    self.tower_linear_loss.append(linear_loss)

                    loss = before + after + stop_token_loss + regularization + linear_loss
                    self.tower_loss.append(loss)

        for i in range(hp.tacotron_num_gpus):
            total_before_loss += self.tower_before_loss[i]
            total_after_loss += self.tower_after_loss[i]
            total_stop_token_loss += self.tower_stop_token_loss[i]
            total_regularization_loss += self.tower_regularization_loss[i]
            total_linear_loss += self.tower_linear_loss[i]
            total_loss += self.tower_loss[i]

        self.before_loss = total_before_loss / hp.tacotron_num_gpus
        self.after_loss = total_after_loss / hp.tacotron_num_gpus
        self.stop_token_loss = total_stop_token_loss / hp.tacotron_num_gpus
        self.regularization_loss = total_regularization_loss / hp.tacotron_num_gpus
        self.linear_loss = total_linear_loss / hp.tacotron_num_gpus
        self.loss = total_loss / hp.tacotron_num_gpus

    def add_optimizer(self, global_step):
        """Adds optimizer. Sets "gradients" and "optimize" fields. add_loss must have been called.
        Args:
            global_step: int32 scalar Tensor representing current global step in training
        """
        hp = self._hparams
        tower_gradients = []

        # 1. Declare GPU Devices
        gpus = ["/gpu:{}".format(i) for i in
                range(hp.tacotron_gpu_start_idx, hp.tacotron_gpu_start_idx + hp.tacotron_num_gpus)]

        grad_device = "/cpu:0" if hp.tacotron_num_gpus > 1 else gpus[0]

        with tf.device(grad_device):
            with tf.variable_scope("optimizer") as scope:
                if hp.tacotron_decay_learning_rate:
                    self.decay_steps = hp.tacotron_decay_steps
                    self.decay_rate = hp.tacotron_decay_rate
                    self.learning_rate = self._learning_rate_decay(
                        hp.tacotron_initial_learning_rate, global_step)
                else:
                    self.learning_rate = tf.convert_to_tensor(hp.tacotron_initial_learning_rate)

                optimizer = tf.train.AdamOptimizer(self.learning_rate, hp.tacotron_adam_beta1,
                                                   hp.tacotron_adam_beta2, hp.tacotron_adam_epsilon)

        # 2. Compute Gradient
        for i in range(hp.tacotron_num_gpus):
            # Device placement
            with tf.device(tf.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0",
                                                          worker_device=gpus[i])):
                # agg_loss += self.tower_loss[i]
                with tf.variable_scope("optimizer") as scope:
                    gradients = optimizer.compute_gradients(self.tower_loss[i])
                    tower_gradients.append(gradients)

        # 3. Average Gradient
        with tf.device(grad_device):
            avg_grads = []
            vars = []
            for grad_and_vars in zip(*tower_gradients):
                # grads_vars = [(grad1, var), (grad2, var), ...]
                grads = []
                for g, _ in grad_and_vars:
                    expanded_g = tf.expand_dims(g, 0)
                    # Append on a "tower" dimension which we will average over below.
                    grads.append(expanded_g)
                # Average over the "tower" dimension.
                grad = tf.concat(axis=0, values=grads)
                grad = tf.reduce_mean(grad, 0)

                v = grad_and_vars[0][1]
                avg_grads.append(grad)
                vars.append(v)

            self.gradients = avg_grads
            # Just for caution
            # https://github.com/Rayhane-mamah/Tacotron-2/issues/11
            if hp.tacotron_clip_gradients:
                clipped_gradients, _ = tf.clip_by_global_norm(avg_grads, 1.)  # __mark 0.5 refer
            else:
                clipped_gradients = avg_grads

            # Add dependency on UPDATE_OPS; otherwise batchnorm won't work correctly. See:
            # https://github.com/tensorflow/tensorflow/issues/1122
            with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
                self.optimize = optimizer.apply_gradients(zip(clipped_gradients, vars),
                                                          global_step=global_step)

    def _learning_rate_decay(self, init_lr, global_step):
        #################################################################
        # Narrow Exponential Decay:

        # Phase 1: lr = 1e-3
        # We only start learning rate decay after 50k steps

        # Phase 2: lr in ]1e-5, 1e-3[
        # decay reaches its minimal value at step 310k

        # Phase 3: lr = 1e-5
        # clip by minimal learning rate value (step > 310k)
        #################################################################
        hp = self._hparams

        # Compute natural exponential decay
        lr = tf.train.exponential_decay(init_lr,
                                        global_step - hp.tacotron_start_decay,
                                        # lr = 1e-3 at step 50k
                                        self.decay_steps,
                                        self.decay_rate,  # lr = 1e-5 around step 310k
                                        name="lr_exponential_decay")

        # Clip learning rate by max and min values (initial and final values)
        return tf.minimum(tf.maximum(lr, hp.tacotron_final_learning_rate), init_lr)
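
For a quick sanity check of the schedule documented above, here is a hedged NumPy sketch. The 50k/310k step marks come from the comments; the 40k decay_steps and 0.5 decay_rate values are assumptions standing in for hp.tacotron_decay_steps and hp.tacotron_decay_rate:

import numpy as np

def lr_schedule(step, init_lr=1e-3, final_lr=1e-5, start_decay=50_000,
                decay_steps=40_000, decay_rate=0.5):
    # Same curve as tf.train.exponential_decay, then clipped to [final_lr, init_lr]
    lr = init_lr * decay_rate ** ((step - start_decay) / decay_steps)
    return float(np.clip(lr, final_lr, init_lr))

# lr_schedule(0) -> 1e-3 (phase 1); lr_schedule(310_000) -> ~1e-5 (phase 3)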
120
synthesizer_tacotron2/preprocess.py
Normal file
@@ -0,0 +1,120 @@
from multiprocessing.pool import Pool

from functools import partial
from itertools import chain
from pathlib import Path
from tqdm import tqdm
import numpy as np
from encoder import inference as encoder
from synthesizer.preprocess_speaker import preprocess_speaker_general
from synthesizer.preprocess_transcript import preprocess_transcript_aishell3, preprocess_transcript_magicdata

data_info = {
    "aidatatang_200zh": {
        "subfolders": ["corpus/train"],
        "trans_filepath": "transcript/aidatatang_200_zh_transcript.txt",
        "speak_func": preprocess_speaker_general
    },
    "magicdata": {
        "subfolders": ["train"],
        "trans_filepath": "train/TRANS.txt",
        "speak_func": preprocess_speaker_general,
        "transcript_func": preprocess_transcript_magicdata,
    },
    "aishell3": {
        "subfolders": ["train/wav"],
        "trans_filepath": "train/content.txt",
        "speak_func": preprocess_speaker_general,
        "transcript_func": preprocess_transcript_aishell3,
    },
    "data_aishell": {
        "subfolders": ["wav/train"],
        "trans_filepath": "transcript/aishell_transcript_v0.8.txt",
        "speak_func": preprocess_speaker_general
    }
}

def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
                       skip_existing: bool, hparams, no_alignments: bool,
                       dataset: str):
    dataset_info = data_info[dataset]
    # Gather the input directories
    dataset_root = datasets_root.joinpath(dataset)
    input_dirs = [dataset_root.joinpath(subfolder.strip()) for subfolder in dataset_info["subfolders"]]
    print("\n    ".join(map(str, ["Using data from:"] + input_dirs)))
    assert all(input_dir.exists() for input_dir in input_dirs)

    # Create the output directories for each output file type
    out_dir.joinpath("mels").mkdir(exist_ok=True)
    out_dir.joinpath("audio").mkdir(exist_ok=True)

    # Create a metadata file
    metadata_fpath = out_dir.joinpath("train.txt")
    metadata_file = metadata_fpath.open("a" if skip_existing else "w", encoding="utf-8")

    # Preprocess the dataset
    dict_info = {}
    transcript_dirs = dataset_root.joinpath(dataset_info["trans_filepath"])
    assert transcript_dirs.exists(), str(transcript_dirs) + " does not exist."
    with open(transcript_dirs, "r", encoding="utf-8") as dict_transcript:
        # Process with the dataset-specific transcript function if there is one
        if "transcript_func" in dataset_info:
            dataset_info["transcript_func"](dict_info, dict_transcript)
        else:
            for v in dict_transcript:
                if not v:
                    continue
                v = v.strip().replace("\n", "").replace("\t", " ").split(" ")
                dict_info[v[0]] = " ".join(v[1:])

    speaker_dirs = list(chain.from_iterable(input_dir.glob("*") for input_dir in input_dirs))
    func = partial(dataset_info["speak_func"], out_dir=out_dir, skip_existing=skip_existing,
                   hparams=hparams, dict_info=dict_info, no_alignments=no_alignments)
    job = Pool(n_processes).imap(func, speaker_dirs)
    for speaker_metadata in tqdm(job, dataset, len(speaker_dirs), unit="speakers"):
        for metadatum in speaker_metadata:
            metadata_file.write("|".join(str(x) for x in metadatum) + "\n")
    metadata_file.close()

    # Verify the contents of the metadata file
    with metadata_fpath.open("r", encoding="utf-8") as metadata_file:
        metadata = [line.split("|") for line in metadata_file]
        mel_frames = sum([int(m[4]) for m in metadata])
        timesteps = sum([int(m[3]) for m in metadata])
        sample_rate = hparams.sample_rate
        hours = (timesteps / sample_rate) / 3600
        print("The dataset consists of %d utterances, %d mel frames, %d audio timesteps (%.2f hours)." %
              (len(metadata), mel_frames, timesteps, hours))
        print("Max input length (text chars): %d" % max(len(m[5]) for m in metadata))
        print("Max mel frames length: %d" % max(int(m[4]) for m in metadata))
        print("Max audio timesteps length: %d" % max(int(m[3]) for m in metadata))


def embed_utterance(fpaths, encoder_model_fpath):
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    # Compute the speaker embedding of the utterance
    wav_fpath, embed_fpath = fpaths
    wav = np.load(wav_fpath)
    wav = encoder.preprocess_wav(wav)
    embed = encoder.embed_utterance(wav)
    np.save(embed_fpath, embed, allow_pickle=False)


def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_processes: int):
    wav_dir = synthesizer_root.joinpath("audio")
    metadata_fpath = synthesizer_root.joinpath("train.txt")
    assert wav_dir.exists() and metadata_fpath.exists()
    embed_dir = synthesizer_root.joinpath("embeds")
    embed_dir.mkdir(exist_ok=True)

    # Gather the input wave filepath and the target output embed filepath
    with metadata_fpath.open("r", encoding="utf-8") as metadata_file:
        metadata = [line.split("|") for line in metadata_file]
        fpaths = [(wav_dir.joinpath(m[0]), embed_dir.joinpath(m[2])) for m in metadata]

    # TODO: improve on the multiprocessing, it's terrible. Disk I/O is the bottleneck here.
    # Embed the utterances in separate threads
    func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)
    job = Pool(n_processes).imap(func, fpaths)
    list(tqdm(job, "Embedding", len(fpaths), unit="utterances"))
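
Each train.txt row written above is the tuple returned by _process_utterance in preprocess_speaker.py; a minimal parsing sketch, with field names inferred from that return value:

def parse_metadata_line(line: str):
    # audio fname | mel fname | embed fname | wav length (samples) | mel frames | text
    wav_fname, mel_fname, embed_fname, timesteps, mel_frames, text = line.strip().split("|")
    return wav_fname, mel_fname, embed_fname, int(timesteps), int(mel_frames), text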
99
synthesizer_tacotron2/preprocess_speaker.py
Normal file
@@ -0,0 +1,99 @@
import librosa
import numpy as np

from encoder import inference as encoder
from utils import logmmse
from synthesizer import audio
from pathlib import Path
from pypinyin import Style
from pypinyin.contrib.neutral_tone import NeutralToneWith5Mixin
from pypinyin.converter import DefaultConverter
from pypinyin.core import Pinyin

class PinyinConverter(NeutralToneWith5Mixin, DefaultConverter):
    pass

pinyin = Pinyin(PinyinConverter()).pinyin


def _process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
                       skip_existing: bool, hparams):
    ## FOR REFERENCE:
    # For you not to lose your head if you ever wish to change things here or implement your own
    # synthesizer.
    # - Both the audios and the mel spectrograms are saved as numpy arrays
    # - There is no processing done to the audios that will be saved to disk beyond volume
    #   normalization (in split_on_silences)
    # - However, pre-emphasis is applied to the audios before computing the mel spectrogram. This
    #   is why we re-apply it on the audio on the side of the vocoder.
    # - Librosa pads the waveform before computing the mel spectrogram. Here, the waveform is saved
    #   without extra padding. This means that you won't have an exact relation between the length
    #   of the wav and of the mel spectrogram. See the vocoder data loader.

    # Skip existing utterances if needed
    mel_fpath = out_dir.joinpath("mels", "mel-%s.npy" % basename)
    wav_fpath = out_dir.joinpath("audio", "audio-%s.npy" % basename)
    if skip_existing and mel_fpath.exists() and wav_fpath.exists():
        return None

    # Trim silence
    if hparams.trim_silence:
        wav = encoder.preprocess_wav(wav, normalize=False, trim_silence=True)

    # Skip utterances that are too short
    if len(wav) < hparams.utterance_min_duration * hparams.sample_rate:
        return None

    # Compute the mel spectrogram
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    # Skip utterances that are too long
    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    # Write the spectrogram, embed and audio to disk
    np.save(mel_fpath, mel_spectrogram.T, allow_pickle=False)
    np.save(wav_fpath, wav, allow_pickle=False)

    # Return a tuple describing this training example
    return wav_fpath.name, mel_fpath.name, "embed-%s.npy" % basename, len(wav), mel_frames, text


def _split_on_silences(wav_fpath, words, hparams):
    # Load the audio waveform
    wav, _ = librosa.load(wav_fpath, hparams.sample_rate)
    wav = librosa.effects.trim(wav, top_db=40, frame_length=2048, hop_length=512)[0]
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max
    # Denoise; we may not need it here.
    if len(wav) > hparams.sample_rate * (0.3 + 0.1):
        noise_wav = np.concatenate([wav[:int(hparams.sample_rate * 0.15)],
                                    wav[-int(hparams.sample_rate * 0.15):]])
        profile = logmmse.profile_noise(noise_wav, hparams.sample_rate)
        wav = logmmse.denoise(wav, profile, eta=0)

    resp = pinyin(words, style=Style.TONE3)
    res = [v[0] for v in resp if v[0].strip()]
    res = " ".join(res)

    return wav, res


def preprocess_speaker_general(speaker_dir, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool):
    metadata = []
    extensions = ["*.wav", "*.flac", "*.mp3"]
    for extension in extensions:
        wav_fpath_list = speaker_dir.glob(extension)
        # Iterate over each wav
        for wav_fpath in wav_fpath_list:
            words = dict_info.get(wav_fpath.name.split(".")[0])
            words = dict_info.get(wav_fpath.name) if not words else words  # try with the extension
            if not words:
                print("no words found for %s" % wav_fpath.name)
                continue
            sub_basename = "%s_%02d" % (wav_fpath.name, 0)
            wav, text = _split_on_silences(wav_fpath, words, hparams)
            metadata.append(_process_utterance(wav, text, out_dir, sub_basename,
                                               skip_existing, hparams))
    return [m for m in metadata if m is not None]
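
As a quick check of the conversion performed in _split_on_silences: with the NeutralToneWith5Mixin converter above, neutral tones are rendered with a trailing 5. The expected outputs are shown as comments; exact results may vary with the pypinyin version:

resp = pinyin("你好吗", style=Style.TONE3)            # e.g. [["ni3"], ["hao3"], ["ma5"]]
text = " ".join(v[0] for v in resp if v[0].strip())  # "ni3 hao3 ma5"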
18
synthesizer_tacotron2/preprocess_transcript.py
Normal file
@@ -0,0 +1,18 @@
def preprocess_transcript_aishell3(dict_info, dict_transcript):
    for v in dict_transcript:
        if not v:
            continue
        v = v.strip().replace("\n", "").replace("\t", " ").split(" ")
        transList = []
        for i in range(2, len(v), 2):
            transList.append(v[i])
        dict_info[v[0]] = " ".join(transList)


def preprocess_transcript_magicdata(dict_info, dict_transcript):
    for v in dict_transcript:
        if not v:
            continue
        v = v.strip().replace("\n", "").replace("\t", " ").split(" ")
        dict_info[v[0]] = " ".join(v[2:])
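
Illustrative input lines for the two parsers. The layouts are assumptions based on the usual AISHELL-3 content.txt and MAGICDATA TRANS.txt conventions, and the sample values are made up:

# aishell3: hanzi and pinyin alternate after the filename, so v[2], v[4], ... are pinyin:
#   "SSB00050001.wav 广 guang3 州 zhou1 女 nv3" -> dict_info["SSB00050001.wav"] = "guang3 zhou1 nv3"
# magicdata: fields are filename, speaker id, transcript (tabs replaced by spaces above):
#   "A_0001.wav A_0001 今天 天气" -> dict_info["A_0001.wav"] = "今天 天气"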
92
synthesizer_tacotron2/synthesize.py
Normal file
@@ -0,0 +1,92 @@
import torch
from torch.utils.data import DataLoader
from synthesizer.synthesizer_dataset import SynthesizerDataset, collate_synthesizer
from synthesizer.models.tacotron import Tacotron
from synthesizer.hparams import hparams_debug_string
from synthesizer.utils.text import text_to_sequence
from synthesizer.utils.symbols import symbols
import numpy as np
from pathlib import Path
from tqdm import tqdm
import sys
from synthesizer.infolog import log
import os
from synthesizer.tacotron2 import Tacotron2
import time
import tensorflow as tf


def run_eval(args, checkpoint_path, output_dir, hparams, sentences):
    eval_dir = os.path.join(output_dir, "eval")
    log_dir = os.path.join(output_dir, "logs-eval")

    # Create output path if it doesn't exist
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(os.path.join(log_dir, "wavs"), exist_ok=True)
    os.makedirs(os.path.join(log_dir, "plots"), exist_ok=True)

    log(hparams_debug_string())
    synth = Tacotron2(checkpoint_path, hparams)

    # Set inputs batch wise
    sentences = [sentences[i: i + hparams.tacotron_synthesis_batch_size] for i
                 in range(0, len(sentences), hparams.tacotron_synthesis_batch_size)]

    log("Starting Synthesis")
    with open(os.path.join(eval_dir, "map.txt"), "w") as file:
        for i, texts in enumerate(tqdm(sentences)):
            start = time.time()
            basenames = ["batch_{}_sentence_{}".format(i, j) for j in range(len(texts))]
            # NOTE: this call does not match the signature of Tacotron2.synthesize(texts,
            # basenames, out_dir, log_dir, mel_filenames, embed_filenames) defined in
            # tacotron2.py, which also returns a single list of saved mel paths.
            mel_filenames, speaker_ids = synth.synthesize(texts, basenames, eval_dir, log_dir, None)

            for elems in zip(texts, mel_filenames, speaker_ids):
                file.write("|".join([str(x) for x in elems]) + "\n")
    log("synthesized mel spectrograms at {}".format(eval_dir))
    return eval_dir


def run_synthesis(in_dir, out_dir, model_dir, hparams):
    # This generates ground truth-aligned mels for vocoder training
    synth_dir = os.path.join(out_dir, "mels_gta")
    os.makedirs(synth_dir, exist_ok=True)
    metadata_filename = os.path.join(in_dir, "train.txt")
    print(hparams_debug_string())

    # Load the model in memory
    weights_dir = os.path.join(model_dir, "taco_pretrained")
    checkpoint_fpath = tf.train.get_checkpoint_state(weights_dir).model_checkpoint_path
    synth = Tacotron2(checkpoint_fpath, hparams, gta=True)

    # Load the metadata
    with open(metadata_filename, encoding="utf-8") as f:
        metadata = [line.strip().split("|") for line in f]
        frame_shift_ms = hparams.hop_size / hparams.sample_rate
        hours = sum([int(x[4]) for x in metadata]) * frame_shift_ms / 3600
        print("Loaded metadata for {} examples ({:.2f} hours)".format(len(metadata), hours))

    # Set inputs batch wise
    metadata = [metadata[i: i + hparams.tacotron_synthesis_batch_size] for i in
                range(0, len(metadata), hparams.tacotron_synthesis_batch_size)]
    # TODO: come on big boy, fix this
    # Quick and dirty fix to make sure that all batches have the same size
    metadata = metadata[:-1]

    print("Starting Synthesis")
    mel_dir = os.path.join(in_dir, "mels")
    embed_dir = os.path.join(in_dir, "embeds")
    meta_out_fpath = os.path.join(out_dir, "synthesized.txt")
    with open(meta_out_fpath, "w") as file:
        for i, meta in enumerate(tqdm(metadata)):
            texts = [m[5] for m in meta]
            mel_filenames = [os.path.join(mel_dir, m[1]) for m in meta]
            embed_filenames = [os.path.join(embed_dir, m[2]) for m in meta]
            basenames = [os.path.basename(m).replace(".npy", "").replace("mel-", "")
                         for m in mel_filenames]
            synth.synthesize(texts, basenames, synth_dir, None, mel_filenames, embed_filenames)

            for elems in meta:
                file.write("|".join([str(x) for x in elems]) + "\n")

    print("Synthesized mel spectrograms at {}".format(synth_dir))
    return meta_out_fpath
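
The metadata = metadata[:-1] line above silently drops the final ragged batch. A hedged alternative sketch (not in this diff) that keeps every utterance by padding the last batch with repeats of its final element:

def make_uniform_batches(items, batch_size):
    batches = [items[i:i + batch_size] for i in range(0, len(items), batch_size)]
    if batches and len(batches[-1]) < batch_size:
        # Repeat the last element so every batch has the same size
        batches[-1] = batches[-1] + [batches[-1][-1]] * (batch_size - len(batches[-1]))
    return batches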
92
synthesizer_tacotron2/synthesizer_dataset.py
Normal file
@@ -0,0 +1,92 @@
import torch
from torch.utils.data import Dataset
import numpy as np
from pathlib import Path
from synthesizer.utils.text import text_to_sequence


class SynthesizerDataset(Dataset):
    def __init__(self, metadata_fpath: Path, mel_dir: Path, embed_dir: Path, hparams):
        print("Using inputs from:\n\t%s\n\t%s\n\t%s" % (metadata_fpath, mel_dir, embed_dir))

        with metadata_fpath.open("r", encoding="utf-8") as metadata_file:
            metadata = [line.split("|") for line in metadata_file]

        mel_fnames = [x[1] for x in metadata if int(x[4])]
        mel_fpaths = [mel_dir.joinpath(fname) for fname in mel_fnames]
        embed_fnames = [x[2] for x in metadata if int(x[4])]
        embed_fpaths = [embed_dir.joinpath(fname) for fname in embed_fnames]
        self.samples_fpaths = list(zip(mel_fpaths, embed_fpaths))
        self.samples_texts = [x[5].strip() for x in metadata if int(x[4])]
        self.metadata = metadata
        self.hparams = hparams

        print("Found %d samples" % len(self.samples_fpaths))

    def __getitem__(self, index):
        # Sometimes index may be a list of 2 (not sure why this happens)
        # If that is the case, return a single item corresponding to the first element in index
        if isinstance(index, list):
            index = index[0]

        mel_path, embed_path = self.samples_fpaths[index]
        mel = np.load(mel_path).T.astype(np.float32)

        # Load the embed
        embed = np.load(embed_path)

        # Get the text and clean it
        text = text_to_sequence(self.samples_texts[index], self.hparams.tts_cleaner_names)

        # Convert the list returned by text_to_sequence to a numpy array
        text = np.asarray(text).astype(np.int32)

        return text, mel.astype(np.float32), embed.astype(np.float32), index

    def __len__(self):
        return len(self.samples_fpaths)


def collate_synthesizer(batch):
    # Text
    x_lens = [len(x[0]) for x in batch]
    max_x_len = max(x_lens)

    chars = [pad1d(x[0], max_x_len) for x in batch]
    chars = np.stack(chars)

    # Mel spectrogram
    spec_lens = [x[1].shape[-1] for x in batch]
    max_spec_len = max(spec_lens) + 1
    if max_spec_len % 2 != 0:  # FIXIT: Hardcoded due to incompatibility with Windows (no lambda)
        max_spec_len += 2 - max_spec_len % 2

    # WaveRNN mel spectrograms are normalized to [0, 1] so zero padding adds silence
    # By default, SV2TTS uses symmetric mels, where -1 * max_abs_value is silence.
    # if hparams.symmetric_mels:
    #     mel_pad_value = -1 * hparams.max_abs_value
    # else:
    #     mel_pad_value = 0
    mel_pad_value = -4  # FIXIT: Hardcoded due to incompatibility with Windows (no lambda)
    mel = [pad2d(x[1], max_spec_len, pad_value=mel_pad_value) for x in batch]
    mel = np.stack(mel)

    # Speaker embedding (SV2TTS)
    embeds = [x[2] for x in batch]

    # Index (for vocoder preprocessing)
    indices = [x[3] for x in batch]

    # Convert all to tensor
    chars = torch.tensor(chars).long()
    mel = torch.tensor(mel)
    embeds = torch.tensor(embeds)

    return chars, mel, embeds, indices


def pad1d(x, max_len, pad_value=0):
    return np.pad(x, (0, max_len - len(x)), mode="constant", constant_values=pad_value)


def pad2d(x, max_len, pad_value=0):
    return np.pad(x, ((0, 0), (0, max_len - x.shape[-1])), mode="constant", constant_values=pad_value)
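
A minimal usage sketch for this dataset; the paths, batch size, and hparams object are placeholders:

from torch.utils.data import DataLoader

dataset = SynthesizerDataset(Path("train.txt"), Path("mels"), Path("embeds"), hparams)
loader = DataLoader(dataset, batch_size=16, shuffle=True, collate_fn=collate_synthesizer)
for chars, mel, embeds, indices in loader:
    # chars: (B, max_text_len) int64; mel: (B, num_mels, max_spec_len); embeds: (B, embed_dim)
    break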
238
synthesizer_tacotron2/tacotron2.py
Normal file
@@ -0,0 +1,238 @@
from synthesizer.utils.text import text_to_sequence
from synthesizer.infolog import log
from synthesizer.models import create_model
from synthesizer.utils import plot
from synthesizer import audio
import tensorflow as tf
import numpy as np
import os


class Tacotron2:
    def __init__(self, checkpoint_path, hparams, gta=False, model_name="Tacotron"):
        log("Constructing model: %s" % model_name)
        # Force the batch size to be known in order to use attention masking in batch synthesis
        inputs = tf.placeholder(tf.int32, (None, None), name="inputs")
        input_lengths = tf.placeholder(tf.int32, (None,), name="input_lengths")
        speaker_embeddings = tf.placeholder(tf.float32, (None, hparams.speaker_embedding_size),
                                            name="speaker_embeddings")
        targets = tf.placeholder(tf.float32, (None, None, hparams.num_mels), name="mel_targets")
        split_infos = tf.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None), name="split_infos")
        with tf.variable_scope("Tacotron_model") as scope:
            self.model = create_model(model_name, hparams)
            if gta:
                self.model.initialize(inputs, input_lengths, speaker_embeddings, targets, gta=gta,
                                      split_infos=split_infos)
            else:
                self.model.initialize(inputs, input_lengths, speaker_embeddings,
                                      split_infos=split_infos)

            self.mel_outputs = self.model.tower_mel_outputs
            self.linear_outputs = self.model.tower_linear_outputs if (hparams.predict_linear and not gta) else None
            self.alignments = self.model.tower_alignments
            self.stop_token_prediction = self.model.tower_stop_token_prediction
            self.targets = targets

        self.gta = gta
        self._hparams = hparams
        # Pad input sequences with the <pad_token> 0 ( _ )
        self._pad = 0
        # Explicitly setting the padding to a value that doesn't originally exist in the spectrogram
        # to avoid any possible conflicts, without affecting the output range of the model too much
        if hparams.symmetric_mels:
            self._target_pad = -hparams.max_abs_value
        else:
            self._target_pad = 0.

        self.inputs = inputs
        self.input_lengths = input_lengths
        self.speaker_embeddings = speaker_embeddings
        self.targets = targets
        self.split_infos = split_infos

        log("Loading checkpoint: %s" % checkpoint_path)
        # Memory allocation on the GPUs as needed
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True

        self.session = tf.Session(config=config)
        self.session.run(tf.global_variables_initializer())

        saver = tf.train.Saver()
        saver.restore(self.session, checkpoint_path)

    def my_synthesize(self, speaker_embeds, texts):
        """
        Lighter synthesis function that directly returns the mel spectrograms.
        """
        print(texts)
        # Prepare the input
        cleaner_names = [x.strip() for x in self._hparams.cleaners.split(",")]
        seqs = [np.asarray(text_to_sequence(text, cleaner_names)) for text in texts]
        input_lengths = [len(seq) for seq in seqs]
        input_seqs, max_seq_len = self._prepare_inputs(seqs)
        split_infos = [[max_seq_len, 0, 0, 0]]
        feed_dict = {
            self.inputs: input_seqs,
            self.input_lengths: np.asarray(input_lengths, dtype=np.int32),
            self.split_infos: np.asarray(split_infos, dtype=np.int32),
            self.speaker_embeddings: speaker_embeds
        }

        # Forward it
        mels, alignments, stop_tokens = self.session.run(
            [self.mel_outputs, self.alignments, self.stop_token_prediction],
            feed_dict=feed_dict)
        mels, alignments, stop_tokens = list(mels[0]), alignments[0], stop_tokens[0]

        # Trim the output
        for i in range(len(mels)):
            try:
                target_length = list(np.round(stop_tokens[i])).index(1)
                mels[i] = mels[i][:target_length, :]
            except ValueError:
                # If no <stop_token> is generated, we simply do not trim the output
                continue

        return [mel.T for mel in mels], alignments

    def synthesize(self, texts, basenames, out_dir, log_dir, mel_filenames, embed_filenames):
        hparams = self._hparams
        cleaner_names = [x.strip() for x in hparams.cleaners.split(",")]

        assert 0 == len(texts) % self._hparams.tacotron_num_gpus
        seqs = [np.asarray(text_to_sequence(text, cleaner_names)) for text in texts]
        input_lengths = [len(seq) for seq in seqs]

        size_per_device = len(seqs) // self._hparams.tacotron_num_gpus

        # Pad inputs according to each GPU max length
        input_seqs = None
        split_infos = []
        for i in range(self._hparams.tacotron_num_gpus):
            device_input = seqs[size_per_device * i: size_per_device * (i + 1)]
            device_input, max_seq_len = self._prepare_inputs(device_input)
            input_seqs = np.concatenate((input_seqs, device_input), axis=1) if input_seqs is not None else device_input
            split_infos.append([max_seq_len, 0, 0, 0])

        feed_dict = {
            self.inputs: input_seqs,
            self.input_lengths: np.asarray(input_lengths, dtype=np.int32),
        }

        if self.gta:
            np_targets = [np.load(mel_filename) for mel_filename in mel_filenames]
            target_lengths = [len(np_target) for np_target in np_targets]

            # Pad targets according to each GPU max length
            target_seqs = None
            for i in range(self._hparams.tacotron_num_gpus):
                device_target = np_targets[size_per_device * i: size_per_device * (i + 1)]
                device_target, max_target_len = self._prepare_targets(device_target, self._hparams.outputs_per_step)
                target_seqs = np.concatenate((target_seqs, device_target), axis=1) if target_seqs is not None else device_target
                split_infos[i][1] = max_target_len  # Not really used, but set in case of future development

            feed_dict[self.targets] = target_seqs
            assert len(np_targets) == len(texts)

        feed_dict[self.split_infos] = np.asarray(split_infos, dtype=np.int32)
        feed_dict[self.speaker_embeddings] = [np.load(f) for f in embed_filenames]

        if self.gta or not hparams.predict_linear:
            mels, alignments, stop_tokens = self.session.run(
                [self.mel_outputs, self.alignments, self.stop_token_prediction],
                feed_dict=feed_dict)
            # Linearize outputs (1D arrays)
            mels = [mel for gpu_mels in mels for mel in gpu_mels]
            alignments = [align for gpu_aligns in alignments for align in gpu_aligns]
            stop_tokens = [token for gpu_token in stop_tokens for token in gpu_token]

            if not self.gta:
                # Natural batch synthesis
                # Get Mel lengths for the entire batch from stop_tokens predictions
                target_lengths = self._get_output_lengths(stop_tokens)

            # Take off the batch wise padding
            mels = [mel[:target_length, :] for mel, target_length in zip(mels, target_lengths)]
            assert len(mels) == len(texts)

        else:
            linears, mels, alignments, stop_tokens = self.session.run(
                [self.linear_outputs, self.mel_outputs, self.alignments,
                 self.stop_token_prediction],
                feed_dict=feed_dict)
            # Linearize outputs (1D arrays)
            linears = [linear for gpu_linear in linears for linear in gpu_linear]
            mels = [mel for gpu_mels in mels for mel in gpu_mels]
            alignments = [align for gpu_aligns in alignments for align in gpu_aligns]
            stop_tokens = [token for gpu_token in stop_tokens for token in gpu_token]

            # Natural batch synthesis
            # Get Mel/Linear lengths for the entire batch from stop_tokens predictions
            # target_lengths = self._get_output_lengths(stop_tokens)
            target_lengths = [9999]

            # Take off the batch wise padding
            mels = [mel[:target_length, :] for mel, target_length in zip(mels, target_lengths)]
            linears = [linear[:target_length, :] for linear, target_length in zip(linears, target_lengths)]
            assert len(mels) == len(linears) == len(texts)

        if basenames is None:
            raise NotImplementedError()

        saved_mels_paths = []
        for i, mel in enumerate(mels):
            # Write the spectrogram to disk
            # Note: output mel-spectrogram files and target ones have the same names, just different folders
            mel_filename = os.path.join(out_dir, "mel-{}.npy".format(basenames[i]))
            np.save(mel_filename, mel, allow_pickle=False)
            saved_mels_paths.append(mel_filename)

            if log_dir is not None:
                # Save wav (mel -> wav)
                wav = audio.inv_mel_spectrogram(mel.T, hparams)
                audio.save_wav(wav, os.path.join(log_dir, "wavs/wav-{}-mel.wav".format(basenames[i])), sr=hparams.sample_rate)

                # Save alignments
                plot.plot_alignment(alignments[i], os.path.join(log_dir, "plots/alignment-{}.png".format(basenames[i])),
                                    title="{}".format(texts[i]), split_title=True, max_len=target_lengths[i])

                # Save mel spectrogram plot
                plot.plot_spectrogram(mel, os.path.join(log_dir, "plots/mel-{}.png".format(basenames[i])),
                                      title="{}".format(texts[i]), split_title=True)

                if hparams.predict_linear:
                    # Save wav (linear -> wav)
                    wav = audio.inv_linear_spectrogram(linears[i].T, hparams)
                    audio.save_wav(wav, os.path.join(log_dir, "wavs/wav-{}-linear.wav".format(basenames[i])), sr=hparams.sample_rate)

                    # Save linear spectrogram plot
                    plot.plot_spectrogram(linears[i], os.path.join(log_dir, "plots/linear-{}.png".format(basenames[i])),
                                          title="{}".format(texts[i]), split_title=True, auto_aspect=True)

        return saved_mels_paths

    def _round_up(self, x, multiple):
        remainder = x % multiple
        return x if remainder == 0 else x + multiple - remainder

    def _prepare_inputs(self, inputs):
        max_len = max([len(x) for x in inputs])
        return np.stack([self._pad_input(x, max_len) for x in inputs]), max_len

    def _pad_input(self, x, length):
        return np.pad(x, (0, length - x.shape[0]), mode="constant", constant_values=self._pad)

    def _prepare_targets(self, targets, alignment):
        max_len = max([len(t) for t in targets])
        data_len = self._round_up(max_len, alignment)
        return np.stack([self._pad_target(t, data_len) for t in targets]), data_len

    def _pad_target(self, t, length):
        return np.pad(t, [(0, length - t.shape[0]), (0, 0)], mode="constant", constant_values=self._target_pad)

    def _get_output_lengths(self, stop_tokens):
        # Determine each mel length by the stop token predictions.
        # (len = first occurrence of 1 in stop_tokens, row wise)
        output_lengths = [row.index(1) for row in np.round(stop_tokens).tolist()]
        return output_lengths
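
A hedged usage sketch for my_synthesize; the checkpoint path, embedding file, and input text are placeholders:

# synth = Tacotron2("taco_pretrained/tacotron_model.ckpt-278000", hparams)
# embed = np.load("embeds/embed-xxx.npy")                 # (speaker_embedding_size,)
# specs, alignments = synth.my_synthesize([embed], ["ni3 hao3"])
# specs[0].shape == (num_mels, T) -- my_synthesize returns each mel transposed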
393
synthesizer_tacotron2/train.py
Normal file
@@ -0,0 +1,393 @@
from synthesizer.utils.symbols import symbols
from synthesizer.utils.text import sequence_to_text
from synthesizer.hparams import hparams_debug_string
from synthesizer.feeder import Feeder
from synthesizer.models import create_model
from synthesizer.utils import ValueWindow, plot
from synthesizer import infolog, audio
from datetime import datetime
from tqdm import tqdm
import tensorflow as tf
import numpy as np
import traceback
import time
import os

log = infolog.log


def add_embedding_stats(summary_writer, embedding_names, paths_to_meta, checkpoint_path):
    # Create tensorboard projector
    config = tf.contrib.tensorboard.plugins.projector.ProjectorConfig()
    config.model_checkpoint_path = checkpoint_path

    for embedding_name, path_to_meta in zip(embedding_names, paths_to_meta):
        # Initialize config
        embedding = config.embeddings.add()
        # Specify the embedding variable and the metadata
        embedding.tensor_name = embedding_name
        embedding.metadata_path = path_to_meta

    # Project the embeddings to space dimensions for visualization
    tf.contrib.tensorboard.plugins.projector.visualize_embeddings(summary_writer, config)


def add_train_stats(model, hparams):
    with tf.variable_scope("stats") as scope:
        for i in range(hparams.tacotron_num_gpus):
            tf.summary.histogram("mel_outputs %d" % i, model.tower_mel_outputs[i])
            tf.summary.histogram("mel_targets %d" % i, model.tower_mel_targets[i])
        tf.summary.scalar("before_loss", model.before_loss)
        tf.summary.scalar("after_loss", model.after_loss)

        if hparams.predict_linear:
            tf.summary.scalar("linear_loss", model.linear_loss)
            for i in range(hparams.tacotron_num_gpus):
                tf.summary.histogram("mel_outputs %d" % i, model.tower_linear_outputs[i])
                tf.summary.histogram("mel_targets %d" % i, model.tower_linear_targets[i])

        tf.summary.scalar("regularization_loss", model.regularization_loss)
        tf.summary.scalar("stop_token_loss", model.stop_token_loss)
        tf.summary.scalar("loss", model.loss)
        tf.summary.scalar("learning_rate", model.learning_rate)  # Control learning rate decay speed
        if hparams.tacotron_teacher_forcing_mode == "scheduled":
            tf.summary.scalar("teacher_forcing_ratio", model.ratio)  # Control teacher forcing
            # ratio decay when mode = "scheduled"
        gradient_norms = [tf.norm(grad) for grad in model.gradients]
        tf.summary.histogram("gradient_norm", gradient_norms)
        tf.summary.scalar("max_gradient_norm", tf.reduce_max(gradient_norms))  # visualize
        # gradients (in case of explosion)
        return tf.summary.merge_all()


def add_eval_stats(summary_writer, step, linear_loss, before_loss, after_loss, stop_token_loss,
                   loss):
    values = [
        tf.Summary.Value(tag="Tacotron_eval_model/eval_stats/eval_before_loss",
                         simple_value=before_loss),
        tf.Summary.Value(tag="Tacotron_eval_model/eval_stats/eval_after_loss",
                         simple_value=after_loss),
        tf.Summary.Value(tag="Tacotron_eval_model/eval_stats/stop_token_loss",
                         simple_value=stop_token_loss),
        tf.Summary.Value(tag="Tacotron_eval_model/eval_stats/eval_loss", simple_value=loss),
    ]
    if linear_loss is not None:
        values.append(tf.Summary.Value(tag="Tacotron_eval_model/eval_stats/eval_linear_loss",
                                       simple_value=linear_loss))
    test_summary = tf.Summary(value=values)
    summary_writer.add_summary(test_summary, step)


def time_string():
    return datetime.now().strftime("%Y-%m-%d %H:%M")


def model_train_mode(args, feeder, hparams, global_step):
    with tf.variable_scope("Tacotron_model", reuse=tf.AUTO_REUSE) as scope:
        model = create_model("Tacotron", hparams)
        model.initialize(feeder.inputs, feeder.input_lengths, feeder.speaker_embeddings,
                         feeder.mel_targets, feeder.token_targets,
                         targets_lengths=feeder.targets_lengths, global_step=global_step,
                         is_training=True, split_infos=feeder.split_infos)
        model.add_loss()
        model.add_optimizer(global_step)
        stats = add_train_stats(model, hparams)
        return model, stats


def model_test_mode(args, feeder, hparams, global_step):
    with tf.variable_scope("Tacotron_model", reuse=tf.AUTO_REUSE) as scope:
        model = create_model("Tacotron", hparams)
        model.initialize(feeder.eval_inputs, feeder.eval_input_lengths,
                         feeder.eval_speaker_embeddings, feeder.eval_mel_targets,
                         feeder.eval_token_targets, targets_lengths=feeder.eval_targets_lengths,
                         global_step=global_step, is_training=False, is_evaluating=True,
                         split_infos=feeder.eval_split_infos)
        model.add_loss()
        return model


def train(log_dir, args, hparams):
    save_dir = os.path.join(log_dir, "taco_pretrained")
    plot_dir = os.path.join(log_dir, "plots")
    wav_dir = os.path.join(log_dir, "wavs")
    mel_dir = os.path.join(log_dir, "mel-spectrograms")
    eval_dir = os.path.join(log_dir, "eval-dir")
    eval_plot_dir = os.path.join(eval_dir, "plots")
    eval_wav_dir = os.path.join(eval_dir, "wavs")
    tensorboard_dir = os.path.join(log_dir, "tacotron_events")
    meta_folder = os.path.join(log_dir, "metas")
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(plot_dir, exist_ok=True)
    os.makedirs(wav_dir, exist_ok=True)
    os.makedirs(mel_dir, exist_ok=True)
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(eval_plot_dir, exist_ok=True)
    os.makedirs(eval_wav_dir, exist_ok=True)
    os.makedirs(tensorboard_dir, exist_ok=True)
    os.makedirs(meta_folder, exist_ok=True)

    checkpoint_fpath = os.path.join(save_dir, "tacotron_model.ckpt")
    metadat_fpath = os.path.join(args.synthesizer_root, "train.txt")

    log("Checkpoint path: {}".format(checkpoint_fpath))
    log("Loading training data from: {}".format(metadat_fpath))
    log("Using model: Tacotron")
    log(hparams_debug_string())

    # Start by setting a seed for repeatability
    tf.set_random_seed(hparams.tacotron_random_seed)

    # Set up data feeder
    coord = tf.train.Coordinator()
    with tf.variable_scope("datafeeder") as scope:
        feeder = Feeder(coord, metadat_fpath, hparams)

    # Set up model:
    global_step = tf.Variable(0, name="global_step", trainable=False)
    model, stats = model_train_mode(args, feeder, hparams, global_step)
    eval_model = model_test_mode(args, feeder, hparams, global_step)

    # Embeddings metadata
    char_embedding_meta = os.path.join(meta_folder, "CharacterEmbeddings.tsv")
    if not os.path.isfile(char_embedding_meta):
        with open(char_embedding_meta, "w", encoding="utf-8") as f:
            for symbol in symbols:
                if symbol == " ":
                    symbol = "\\s"  # For visual purposes, swap space with \s

                f.write("{}\n".format(symbol))

    char_embedding_meta = char_embedding_meta.replace(log_dir, "..")

    # Book keeping
    step = 0
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)
    saver = tf.train.Saver(max_to_keep=5)

    log("Tacotron training set to a maximum of {} steps".format(args.tacotron_train_steps))

    # Memory allocation on the GPU as needed
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True

    # Train
    with tf.Session(config=config) as sess:
        try:
            summary_writer = tf.summary.FileWriter(tensorboard_dir, sess.graph)

            sess.run(tf.global_variables_initializer())

            # saved model restoring
            if args.restore:
                # Restore saved model if the user requested it, default = True
                try:
                    checkpoint_state = tf.train.get_checkpoint_state(save_dir)

                    if checkpoint_state and checkpoint_state.model_checkpoint_path:
                        log("Loading checkpoint {}".format(checkpoint_state.model_checkpoint_path),
                            slack=True)
                        saver.restore(sess, checkpoint_state.model_checkpoint_path)

                    else:
                        log("No model to load at {}".format(save_dir), slack=True)
                        saver.save(sess, checkpoint_fpath, global_step=global_step)

                except tf.errors.OutOfRangeError as e:
                    log("Cannot restore checkpoint: {}".format(e), slack=True)
            else:
                log("Starting new training!", slack=True)
                saver.save(sess, checkpoint_fpath, global_step=global_step)

            # initializing feeder
            feeder.start_threads(sess)

            # Training loop
            while not coord.should_stop() and step < args.tacotron_train_steps:
                start_time = time.time()
                step, loss, opt = sess.run([global_step, model.loss, model.optimize])
                time_window.append(time.time() - start_time)
                loss_window.append(loss)
                message = "Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]".format(
                    step, time_window.average, loss, loss_window.average)
                log(message, end="\r", slack=(step % args.checkpoint_interval == 0))
                print(message)

                if loss > 100 or np.isnan(loss):
                    log("Loss exploded to {:.5f} at step {}".format(loss, step))
                    raise Exception("Loss exploded")

                if step % args.summary_interval == 0:
                    log("\nWriting summary at step {}".format(step))
                    summary_writer.add_summary(sess.run(stats), step)

                if step % args.eval_interval == 0:
                    # Run eval and save eval stats
                    log("\nRunning evaluation at step {}".format(step))

                    eval_losses = []
                    before_losses = []
                    after_losses = []
                    stop_token_losses = []
                    linear_losses = []
                    linear_loss = None

                    if hparams.predict_linear:
                        for i in tqdm(range(feeder.test_steps)):
                            eloss, before_loss, after_loss, stop_token_loss, linear_loss, mel_p, \
                                mel_t, t_len, align, lin_p, lin_t = sess.run(
                                [
                                    eval_model.tower_loss[0], eval_model.tower_before_loss[0],
                                    eval_model.tower_after_loss[0],
                                    eval_model.tower_stop_token_loss[0],
                                    eval_model.tower_linear_loss[0],
                                    eval_model.tower_mel_outputs[0][0],
                                    eval_model.tower_mel_targets[0][0],
                                    eval_model.tower_targets_lengths[0][0],
                                    eval_model.tower_alignments[0][0],
                                    eval_model.tower_linear_outputs[0][0],
                                    eval_model.tower_linear_targets[0][0],
                                ])
                            eval_losses.append(eloss)
                            before_losses.append(before_loss)
                            after_losses.append(after_loss)
                            stop_token_losses.append(stop_token_loss)
                            linear_losses.append(linear_loss)
                        linear_loss = sum(linear_losses) / len(linear_losses)

                        wav = audio.inv_linear_spectrogram(lin_p.T, hparams)
                        audio.save_wav(wav, os.path.join(eval_wav_dir,
                                                         "step-{}-eval-wave-from-linear.wav".format(
                                                             step)), sr=hparams.sample_rate)

                    else:
                        for i in tqdm(range(feeder.test_steps)):
                            eloss, before_loss, after_loss, stop_token_loss, mel_p, mel_t, t_len, \
                                align = sess.run(
                                [
                                    eval_model.tower_loss[0], eval_model.tower_before_loss[0],
                                    eval_model.tower_after_loss[0],
                                    eval_model.tower_stop_token_loss[0],
                                    eval_model.tower_mel_outputs[0][0],
                                    eval_model.tower_mel_targets[0][0],
                                    eval_model.tower_targets_lengths[0][0],
                                    eval_model.tower_alignments[0][0]
                                ])
                            eval_losses.append(eloss)
                            before_losses.append(before_loss)
                            after_losses.append(after_loss)
                            stop_token_losses.append(stop_token_loss)

                    eval_loss = sum(eval_losses) / len(eval_losses)
                    before_loss = sum(before_losses) / len(before_losses)
                    after_loss = sum(after_losses) / len(after_losses)
                    stop_token_loss = sum(stop_token_losses) / len(stop_token_losses)

                    log("Saving eval log to {}..".format(eval_dir))
                    # Save some log to monitor model improvement on same unseen sequence
                    wav = audio.inv_mel_spectrogram(mel_p.T, hparams)
                    audio.save_wav(wav, os.path.join(eval_wav_dir,
                                                     "step-{}-eval-wave-from-mel.wav".format(step)),
                                   sr=hparams.sample_rate)

                    plot.plot_alignment(align, os.path.join(eval_plot_dir,
                                                            "step-{}-eval-align.png".format(step)),
                                        title="{}, {}, step={}, loss={:.5f}".format("Tacotron",
                                                                                    time_string(),
                                                                                    step,
                                                                                    eval_loss),
                                        max_len=t_len // hparams.outputs_per_step)
                    plot.plot_spectrogram(mel_p, os.path.join(eval_plot_dir,
                                                              "step-{}-eval-mel-spectrogram.png".format(step)),
                                          title="{}, {}, step={}, loss={:.5f}".format("Tacotron",
                                                                                      time_string(),
                                                                                      step,
                                                                                      eval_loss),
                                          target_spectrogram=mel_t,
                                          max_len=t_len)

                    if hparams.predict_linear:
                        plot.plot_spectrogram(lin_p, os.path.join(eval_plot_dir,
                                                                  "step-{}-eval-linear-spectrogram.png".format(step)),
                                              title="{}, {}, step={}, loss={:.5f}".format(
                                                  "Tacotron", time_string(), step, eval_loss),
                                              target_spectrogram=lin_t,
                                              max_len=t_len, auto_aspect=True)

                    log("Eval loss for global step {}: {:.3f}".format(step, eval_loss))
                    log("Writing eval summary!")
                    add_eval_stats(summary_writer, step, linear_loss, before_loss, after_loss,
                                   stop_token_loss, eval_loss)

                if step % args.checkpoint_interval == 0 or step == args.tacotron_train_steps or \
                        step == 300:
                    # Save model and current global step
                    saver.save(sess, checkpoint_fpath, global_step=global_step)

                    log("\nSaving alignment, Mel-Spectrograms and griffin-lim inverted waveform..")
                    input_seq, mel_prediction, alignment, target, target_length = sess.run([
                        model.tower_inputs[0][0],
                        model.tower_mel_outputs[0][0],
                        model.tower_alignments[0][0],
                        model.tower_mel_targets[0][0],
                        model.tower_targets_lengths[0][0],
                    ])

                    # save predicted mel spectrogram to disk (debug)
                    mel_filename = "mel-prediction-step-{}.npy".format(step)
                    np.save(os.path.join(mel_dir, mel_filename), mel_prediction.T,
                            allow_pickle=False)

                    # save griffin lim inverted wav for debug (mel -> wav)
                    wav = audio.inv_mel_spectrogram(mel_prediction.T, hparams)
                    audio.save_wav(wav,
                                   os.path.join(wav_dir, "step-{}-wave-from-mel.wav".format(step)),
                                   sr=hparams.sample_rate)

                    # save alignment plot to disk (control purposes)
                    plot.plot_alignment(alignment,
                                        os.path.join(plot_dir, "step-{}-align.png".format(step)),
                                        title="{}, {}, step={}, loss={:.5f}".format("Tacotron",
                                                                                    time_string(),
                                                                                    step, loss),
                                        max_len=target_length // hparams.outputs_per_step)
                    # save real and predicted mel-spectrogram plot to disk (control purposes)
                    plot.plot_spectrogram(mel_prediction, os.path.join(plot_dir,
                                                                       "step-{}-mel-spectrogram.png".format(step)),
                                          title="{}, {}, step={}, loss={:.5f}".format("Tacotron",
                                                                                      time_string(),
                                                                                      step, loss),
                                          target_spectrogram=target,
                                          max_len=target_length)
                    log("Input at step {}: {}".format(step, sequence_to_text(input_seq)))

                if step % args.embedding_interval == 0 or step == args.tacotron_train_steps or step == 1:
                    # Get current checkpoint state
                    checkpoint_state = tf.train.get_checkpoint_state(save_dir)

                    # Update Projector
                    log("\nSaving Model Character Embeddings visualization..")
                    add_embedding_stats(summary_writer, [model.embedding_table.name],
                                        [char_embedding_meta],
                                        checkpoint_state.model_checkpoint_path)
                    log("Tacotron Character embeddings have been updated on tensorboard!")

            log("Tacotron training complete after {} global steps!".format(
                args.tacotron_train_steps), slack=True)
            return save_dir

        except Exception as e:
            log("Exiting due to exception: {}".format(e), slack=True)
            traceback.print_exc()
            coord.request_stop(e)


def tacotron_train(args, log_dir, hparams):
    return train(log_dir, args, hparams)
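
A hedged sketch of the argument object train() consumes. The field names are taken from the usages above; the values are placeholders, and the real CLI wiring lives elsewhere in the repo:

from types import SimpleNamespace

args = SimpleNamespace(synthesizer_root="datasets/SV2TTS/synthesizer", restore=True,
                       tacotron_train_steps=2_000_000, summary_interval=2500,
                       eval_interval=5000, checkpoint_interval=2000,
                       embedding_interval=10000)
# tacotron_train(args, log_dir="synthesizer/saved_models/logs-run1", hparams=hparams)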
45
synthesizer_tacotron2/utils/__init__.py
Normal file
@@ -0,0 +1,45 @@
import torch


_output_ref = None
_replicas_ref = None

def data_parallel_workaround(model, *input):
    global _output_ref
    global _replicas_ref
    device_ids = list(range(torch.cuda.device_count()))
    output_device = device_ids[0]
    replicas = torch.nn.parallel.replicate(model, device_ids)
    # input.shape = (num_args, batch, ...)
    inputs = torch.nn.parallel.scatter(input, device_ids)
    # inputs.shape = (num_gpus, num_args, batch/num_gpus, ...)
    replicas = replicas[:len(inputs)]
    outputs = torch.nn.parallel.parallel_apply(replicas, inputs)
    y_hat = torch.nn.parallel.gather(outputs, output_device)
    _output_ref = outputs
    _replicas_ref = replicas
    return y_hat


class ValueWindow():
    def __init__(self, window_size=100):
        self._window_size = window_size
        self._values = []

    def append(self, x):
        self._values = self._values[-(self._window_size - 1):] + [x]

    @property
    def sum(self):
        return sum(self._values)

    @property
    def count(self):
        return len(self._values)

    @property
    def average(self):
        return self.sum / max(1, self.count)

    def reset(self):
        self._values = []
|
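ValueWindow is the usual tool for smoothing noisy per-step metrics when logging. A minimal usage sketch (the loss values are made up):

loss_window = ValueWindow(window_size=100)
for step, loss in enumerate([0.91, 0.87, 0.84], start=1):
    loss_window.append(loss)
    print("step {} | loss {:.3f} | avg loss {:.3f}".format(step, loss, loss_window.average))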
62
synthesizer_tacotron2/utils/_cmudict.py
Normal file
@@ -0,0 +1,62 @@
import re

valid_symbols = [
    "AA", "AA0", "AA1", "AA2", "AE", "AE0", "AE1", "AE2", "AH", "AH0", "AH1", "AH2",
    "AO", "AO0", "AO1", "AO2", "AW", "AW0", "AW1", "AW2", "AY", "AY0", "AY1", "AY2",
    "B", "CH", "D", "DH", "EH", "EH0", "EH1", "EH2", "ER", "ER0", "ER1", "ER2", "EY",
    "EY0", "EY1", "EY2", "F", "G", "HH", "IH", "IH0", "IH1", "IH2", "IY", "IY0", "IY1",
    "IY2", "JH", "K", "L", "M", "N", "NG", "OW", "OW0", "OW1", "OW2", "OY", "OY0",
    "OY1", "OY2", "P", "R", "S", "SH", "T", "TH", "UH", "UH0", "UH1", "UH2", "UW",
    "UW0", "UW1", "UW2", "V", "W", "Y", "Z", "ZH"
]

_valid_symbol_set = set(valid_symbols)


class CMUDict:
    """Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict"""
    def __init__(self, file_or_path, keep_ambiguous=True):
        if isinstance(file_or_path, str):
            with open(file_or_path, encoding="latin-1") as f:
                entries = _parse_cmudict(f)
        else:
            entries = _parse_cmudict(file_or_path)
        if not keep_ambiguous:
            entries = {word: pron for word, pron in entries.items() if len(pron) == 1}
        self._entries = entries

    def __len__(self):
        return len(self._entries)

    def lookup(self, word):
        """Returns list of ARPAbet pronunciations of the given word."""
        return self._entries.get(word.upper())


_alt_re = re.compile(r"\([0-9]+\)")


def _parse_cmudict(file):
    cmudict = {}
    for line in file:
        if len(line) and (line[0] >= "A" and line[0] <= "Z" or line[0] == "'"):
            # CMUDict separates the word from its pronunciation with two spaces.
            parts = line.split("  ")
            word = re.sub(_alt_re, "", parts[0])
            pronunciation = _get_pronunciation(parts[1])
            if pronunciation:
                if word in cmudict:
                    cmudict[word].append(pronunciation)
                else:
                    cmudict[word] = [pronunciation]
    return cmudict


def _get_pronunciation(s):
    parts = s.strip().split(" ")
    for part in parts:
        if part not in _valid_symbol_set:
            return None
    return " ".join(parts)
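A usage sketch, assuming a local copy of the CMU dictionary file (the filename below is hypothetical, and the lookup result is illustrative):

cmudict = CMUDict("cmudict-0.7b.txt", keep_ambiguous=False)  # hypothetical path
print(len(cmudict))              # number of unambiguous entries
print(cmudict.lookup("hello"))   # e.g. ["HH AH0 L OW1"]; None if the word is absent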
88
synthesizer_tacotron2/utils/cleaners.py
Normal file
@@ -0,0 +1,88 @@
"""
|
||||
Cleaners are transformations that run over the input text at both training and eval time.
|
||||
|
||||
Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
|
||||
hyperparameter. Some cleaners are English-specific. You"ll typically want to use:
|
||||
1. "english_cleaners" for English text
|
||||
2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
|
||||
the Unidecode library (https://pypi.python.org/pypi/Unidecode)
|
||||
3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
|
||||
the symbols in symbols.py to match your data).
|
||||
"""
|
||||
|
||||
import re
|
||||
from unidecode import unidecode
|
||||
from .numbers import normalize_numbers
|
||||
|
||||
# Regular expression matching whitespace:
|
||||
_whitespace_re = re.compile(r"\s+")
|
||||
|
||||
# List of (regular expression, replacement) pairs for abbreviations:
|
||||
_abbreviations = [(re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) for x in [
|
||||
("mrs", "misess"),
|
||||
("mr", "mister"),
|
||||
("dr", "doctor"),
|
||||
("st", "saint"),
|
||||
("co", "company"),
|
||||
("jr", "junior"),
|
||||
("maj", "major"),
|
||||
("gen", "general"),
|
||||
("drs", "doctors"),
|
||||
("rev", "reverend"),
|
||||
("lt", "lieutenant"),
|
||||
("hon", "honorable"),
|
||||
("sgt", "sergeant"),
|
||||
("capt", "captain"),
|
||||
("esq", "esquire"),
|
||||
("ltd", "limited"),
|
||||
("col", "colonel"),
|
||||
("ft", "fort"),
|
||||
]]
|
||||
|
||||
|
||||
def expand_abbreviations(text):
|
||||
for regex, replacement in _abbreviations:
|
||||
text = re.sub(regex, replacement, text)
|
||||
return text
|
||||
|
||||
|
||||
def expand_numbers(text):
|
||||
return normalize_numbers(text)
|
||||
|
||||
|
||||
def lowercase(text):
|
||||
"""lowercase input tokens."""
|
||||
return text.lower()
|
||||
|
||||
|
||||
def collapse_whitespace(text):
|
||||
return re.sub(_whitespace_re, " ", text)
|
||||
|
||||
|
||||
def convert_to_ascii(text):
|
||||
return unidecode(text)
|
||||
|
||||
|
||||
def basic_cleaners(text):
|
||||
"""Basic pipeline that lowercases and collapses whitespace without transliteration."""
|
||||
text = lowercase(text)
|
||||
text = collapse_whitespace(text)
|
||||
return text
|
||||
|
||||
|
||||
def transliteration_cleaners(text):
|
||||
"""Pipeline for non-English text that transliterates to ASCII."""
|
||||
text = convert_to_ascii(text)
|
||||
text = lowercase(text)
|
||||
text = collapse_whitespace(text)
|
||||
return text
|
||||
|
||||
|
||||
def english_cleaners(text):
|
||||
"""Pipeline for English text, including number and abbreviation expansion."""
|
||||
text = convert_to_ascii(text)
|
||||
text = lowercase(text)
|
||||
text = expand_numbers(text)
|
||||
text = expand_abbreviations(text)
|
||||
text = collapse_whitespace(text)
|
||||
return text
|
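To illustrate the english_cleaners pipeline end to end, here is a small sketch; the expected output is reasoned from the rules above (lowercasing happens before abbreviation matching, and numbers expand before abbreviations), not captured from a run:

print(english_cleaners("Dr. Smith paid $3.50 on the 3rd."))
# -> "doctor smith paid three dollars, fifty cents on the third."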
68
synthesizer_tacotron2/utils/numbers.py
Normal file
@@ -0,0 +1,68 @@
import re
import inflect

_inflect = inflect.engine()
_comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
_decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)")
_pounds_re = re.compile(r"£([0-9\,]*[0-9]+)")
_dollars_re = re.compile(r"\$([0-9\.\,]*[0-9]+)")
_ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")
_number_re = re.compile(r"[0-9]+")


def _remove_commas(m):
    return m.group(1).replace(",", "")


def _expand_decimal_point(m):
    return m.group(1).replace(".", " point ")


def _expand_dollars(m):
    match = m.group(1)
    parts = match.split(".")
    if len(parts) > 2:
        return match + " dollars"  # Unexpected format
    dollars = int(parts[0]) if parts[0] else 0
    cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
    if dollars and cents:
        dollar_unit = "dollar" if dollars == 1 else "dollars"
        cent_unit = "cent" if cents == 1 else "cents"
        return "%s %s, %s %s" % (dollars, dollar_unit, cents, cent_unit)
    elif dollars:
        dollar_unit = "dollar" if dollars == 1 else "dollars"
        return "%s %s" % (dollars, dollar_unit)
    elif cents:
        cent_unit = "cent" if cents == 1 else "cents"
        return "%s %s" % (cents, cent_unit)
    else:
        return "zero dollars"


def _expand_ordinal(m):
    return _inflect.number_to_words(m.group(0))


def _expand_number(m):
    num = int(m.group(0))
    if num > 1000 and num < 3000:
        if num == 2000:
            return "two thousand"
        elif num > 2000 and num < 2010:
            return "two thousand " + _inflect.number_to_words(num % 100)
        elif num % 100 == 0:
            return _inflect.number_to_words(num // 100) + " hundred"
        else:
            return _inflect.number_to_words(num, andword="", zero="oh", group=2).replace(", ", " ")
    else:
        return _inflect.number_to_words(num, andword="")


def normalize_numbers(text):
    text = re.sub(_comma_number_re, _remove_commas, text)
    text = re.sub(_pounds_re, r"\1 pounds", text)
    text = re.sub(_dollars_re, _expand_dollars, text)
    text = re.sub(_decimal_number_re, _expand_decimal_point, text)
    text = re.sub(_ordinal_re, _expand_ordinal, text)
    text = re.sub(_number_re, _expand_number, text)
    return text
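A worked example of the substitution order in normalize_numbers (commas first, then currency, decimals, ordinals, and plain numbers; the expected output is reasoned from the code above, not captured from a run):

print(normalize_numbers("He spent $1,000 in 1984."))
# Commas: "$1,000" -> "$1000"; dollars: -> "1000 dollars";
# plain numbers: "1000" -> "one thousand", "1984" -> "nineteen eighty-four"
# (years between 1000 and 3000 are read in two-digit groups).
# -> "He spent one thousand dollars in nineteen eighty-four."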
76
synthesizer_tacotron2/utils/plot.py
Normal file
@@ -0,0 +1,76 @@
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import numpy as np


def split_title_line(title_text, max_words=5):
    """Splits a title string into lines of at most max_words words each."""
    seq = title_text.split()
    return "\n".join([" ".join(seq[i:i + max_words]) for i in range(0, len(seq), max_words)])


def plot_alignment(alignment, path, title=None, split_title=False, max_len=None):
    if max_len is not None:
        alignment = alignment[:, :max_len]

    fig = plt.figure(figsize=(8, 6))
    ax = fig.add_subplot(111)

    im = ax.imshow(
        alignment,
        aspect="auto",
        origin="lower",
        interpolation="none")
    fig.colorbar(im, ax=ax)
    xlabel = "Decoder timestep"

    if split_title:
        title = split_title_line(title)

    plt.xlabel(xlabel)
    plt.title(title)
    plt.ylabel("Encoder timestep")
    plt.tight_layout()
    plt.savefig(path, format="png")
    plt.close()


def plot_spectrogram(pred_spectrogram, path, title=None, split_title=False, target_spectrogram=None, max_len=None, auto_aspect=False):
    if max_len is not None:
        # Guard against slicing a missing target (None is not subscriptable).
        if target_spectrogram is not None:
            target_spectrogram = target_spectrogram[:max_len]
        pred_spectrogram = pred_spectrogram[:max_len]

    if split_title:
        title = split_title_line(title)

    fig = plt.figure(figsize=(10, 8))
    # Set common labels
    fig.text(0.5, 0.18, title, horizontalalignment="center", fontsize=16)

    # Target spectrogram subplot
    if target_spectrogram is not None:
        ax1 = fig.add_subplot(311)
        ax2 = fig.add_subplot(312)

        if auto_aspect:
            im = ax1.imshow(np.rot90(target_spectrogram), aspect="auto", interpolation="none")
        else:
            im = ax1.imshow(np.rot90(target_spectrogram), interpolation="none")
        ax1.set_title("Target Mel-Spectrogram")
        fig.colorbar(mappable=im, shrink=0.65, orientation="horizontal", ax=ax1)
        ax2.set_title("Predicted Mel-Spectrogram")
    else:
        ax2 = fig.add_subplot(211)

    if auto_aspect:
        im = ax2.imshow(np.rot90(pred_spectrogram), aspect="auto", interpolation="none")
    else:
        im = ax2.imshow(np.rot90(pred_spectrogram), interpolation="none")
    fig.colorbar(mappable=im, shrink=0.65, orientation="horizontal", ax=ax2)

    plt.tight_layout()
    plt.savefig(path, format="png")
    plt.close()
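A quick sanity-check sketch with synthetic data; shapes follow the conventions above (alignments are (encoder, decoder) matrices, spectrograms are (frames, mel bins)), and the file names are arbitrary:

align = np.random.rand(60, 180)    # fake attention matrix
plot_alignment(align, "demo-align.png", title="demo alignment")

mel = np.random.rand(400, 80)      # fake mel spectrogram
plot_spectrogram(mel, "demo-mel.png", title="demo mel", auto_aspect=True)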
18
synthesizer_tacotron2/utils/symbols.py
Normal file
@@ -0,0 +1,18 @@
"""
|
||||
Defines the set of symbols used in text input to the model.
|
||||
|
||||
The default is a set of ASCII characters that works well for English or text that has been run
|
||||
through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details.
|
||||
"""
|
||||
# from . import cmudict
|
||||
|
||||
_pad = "_"
|
||||
_eos = "~"
|
||||
_characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz12340!\'(),-.:;? '
|
||||
|
||||
#_characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz12340!\'(),-.:;? ' # use this old one if you want to train old model
|
||||
# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
|
||||
#_arpabet = ["@' + s for s in cmudict.valid_symbols]
|
||||
|
||||
# Export all symbols:
|
||||
symbols = [_pad, _eos] + list(_characters) #+ _arpabet
|
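As a sanity check, the exported inventory is just the pad and EOS markers followed by the character set; a quick sketch (the import path is assumed, and the count is reasoned from the string above):

from synthesizer_tacotron2.utils.symbols import symbols  # assumed import path

print(symbols[:2])    # ["_", "~"]
print(len(symbols))   # 2 markers + 68 characters = 70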
74
synthesizer_tacotron2/utils/text.py
Normal file
@@ -0,0 +1,74 @@
from .symbols import symbols
from . import cleaners
import re

# Mappings from symbol to numeric ID and vice versa:
_symbol_to_id = {s: i for i, s in enumerate(symbols)}
_id_to_symbol = {i: s for i, s in enumerate(symbols)}

# Regular expression matching text enclosed in curly braces:
_curly_re = re.compile(r"(.*?)\{(.+?)\}(.*)")


def text_to_sequence(text, cleaner_names):
    """Converts a string of text to a sequence of IDs corresponding to the symbols in the text.

    The text can optionally have ARPAbet sequences enclosed in curly braces embedded
    in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street."

    Args:
      text: string to convert to a sequence
      cleaner_names: names of the cleaner functions to run the text through

    Returns:
      List of integers corresponding to the symbols in the text
    """
    sequence = []

    # Check for curly braces and treat their contents as ARPAbet:
    while len(text):
        m = _curly_re.match(text)
        if not m:
            sequence += _symbols_to_sequence(_clean_text(text, cleaner_names))
            break
        sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names))
        sequence += _arpabet_to_sequence(m.group(2))
        text = m.group(3)

    # Append EOS token
    sequence.append(_symbol_to_id["~"])
    return sequence


def sequence_to_text(sequence):
    """Converts a sequence of IDs back to a string"""
    result = ""
    for symbol_id in sequence:
        if symbol_id in _id_to_symbol:
            s = _id_to_symbol[symbol_id]
            # Enclose ARPAbet back in curly braces:
            if len(s) > 1 and s[0] == "@":
                s = "{%s}" % s[1:]
            result += s
    return result.replace("}{", " ")


def _clean_text(text, cleaner_names):
    for name in cleaner_names:
        # Use a default so an unknown name raises our error, not AttributeError.
        cleaner = getattr(cleaners, name, None)
        if not cleaner:
            raise Exception("Unknown cleaner: %s" % name)
        text = cleaner(text)
    return text


def _symbols_to_sequence(symbols):
    return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)]


def _arpabet_to_sequence(text):
    return _symbols_to_sequence(["@" + s for s in text.split()])


def _should_keep_symbol(s):
    return s in _symbol_to_id and s not in ("_", "~")
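A round-trip sketch; note that with _arpabet commented out in symbols.py, ARPAbet IDs would be silently dropped by _should_keep_symbol, so this example sticks to plain text (output reasoned from the code above):

seq = text_to_sequence("Hello, World!", ["english_cleaners"])
print(sequence_to_text(seq))  # -> "hello, world!~" (cleaned text plus the EOS marker)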