import ast
import pprint

from tensorflow.contrib.training import HParams

hparams = HParams(
    cleaners="basic_cleaners",
    tacotron_gpu_start_idx=0,  # idx of the first GPU to be used for Tacotron training.
    tacotron_num_gpus=1,  # Determines the number of GPUs in use for Tacotron training.
    split_on_cpu=True,

    ### Signal Processing (used in both synthesizer and vocoder)
    sample_rate=16000,
    n_fft=800,
    num_mels=80,
    hop_size=200,  # Tacotron uses 12.5 ms frame shift (set to sample_rate * 0.0125)
    win_size=800,  # Tacotron uses 50 ms frame length (set to sample_rate * 0.050)
    fmin=55,
    min_level_db=-100,
    ref_level_db=20,
    max_abs_value=4.,  # Gradient explodes if too big, premature convergence if too small.
    preemphasis=0.97,  # Filter coefficient to use if preemphasize is True
    preemphasize=True,
    frame_shift_ms=None,
    normalize_for_wavenet=True,
    # Whether to rescale to [0, 1] for wavenet. (better audio quality)
    clip_for_wavenet=True,
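    # Note (derived arithmetic, not an original comment): with sample_rate=16000 the values above
    # give hop_size = 16000 * 0.0125 = 200 and win_size = 16000 * 0.050 = 800, and win_size should
    # not exceed n_fft (here both are 800).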

    ### Tacotron Text-to-Speech (TTS)
    tts_embed_dims=512,  # Embedding dimension for the graphemes/phoneme inputs
    tts_encoder_dims=256,
    tts_decoder_dims=128,
    tts_postnet_dims=512,
    tts_encoder_K=5,
    tts_lstm_dims=1024,
    tts_postnet_K=5,
    tts_num_highways=4,
    tts_dropout=0.5,
    tts_cleaner_names=["basic_cleaners"],
    tts_stop_threshold=-3.4,  # Value below which audio generation ends.
    # For example, for a range of [-4, 4], this will terminate the sequence
    # at the first frame that has all values < -3.4

    ### Tacotron Training
    tts_schedule=[(2, 1e-3, 20_000, 12),   # Progressive training schedule
                  (2, 5e-4, 40_000, 12),   # (r, lr, step, batch_size)
                  (2, 2e-4, 80_000, 12),   #
                  (2, 1e-4, 160_000, 12),  # r = reduction factor (# of mel frames
                  (2, 3e-5, 320_000, 12),  # synthesized for each decoder iteration)
                  (2, 1e-5, 640_000, 12)], # lr = learning rate

    tts_clip_grad_norm=1.0,  # Clips the gradient norm to prevent explosion - set to None if not needed
    tts_eval_interval=500,  # Number of steps between model evaluation (sample generation)
    # Set to -1 to generate after completing epoch, or 0 to disable
    tts_eval_num_samples=1,  # Makes this number of samples
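    # Reading of the schedule above (assumption, not an original comment): each tuple appears to
    # apply until training reaches its step count, e.g. between steps 20_000 and 40_000 the model
    # would train with r=2, lr=5e-4 and batch_size=12.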

    ### Data Preprocessing
    max_mel_frames=900,
    rescale=True,
    rescaling_max=0.9,
    synthesis_batch_size=16,  # For vocoder preprocessing and inference.

    ### Mel Visualization and Griffin-Lim
    signal_normalization=True,
    power=1.5,
    griffin_lim_iters=60,

    ### Audio processing options
    fmax=7600,  # Should not exceed (sample_rate // 2)
    allow_clipping_in_normalization=True,  # Used when signal_normalization = True
    clip_mels_length=True,  # If true, discards samples exceeding max_mel_frames
    use_lws=False,  # "Fast spectrogram phase recovery using local weighted sums"
    symmetric_mels=True,  # Sets mel range to [-max_abs_value, max_abs_value] if True,
    # and [0, max_abs_value] if False
    trim_silence=True,  # Use with sample_rate of 16000 for best results
    silence_threshold=2,
    trim_fft_size=512,
    trim_hop_size=128,
    trim_top_db=23,
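    # Note (derived arithmetic, not an original comment): with sample_rate=16000 the Nyquist
    # frequency is 16000 // 2 = 8000 Hz, so fmax=7600 respects the bound stated above.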

    ### SV2TTS
    speaker_embedding_size=256,  # Dimension for the speaker embedding
    silence_min_duration_split=0.4,  # Duration in seconds of a silence for an utterance to be split
    utterance_min_duration=1.6,  # Duration in seconds below which utterances are discarded

    # Tacotron
    outputs_per_step=2,  # Was 1
    # Number of frames to generate at each decoding step (increase to speed up computation and
    # allow for a higher batch size; decreases G&L audio quality)
    stop_at_any=True,
    # Determines whether the decoder should stop when predicting <stop> to any frame or to all of
    # them (True works pretty well)

    embedding_dim=512,  # Dimension of embedding space (these are NOT the speaker embeddings)

    # Encoder parameters
    enc_conv_num_layers=3,  # Number of encoder convolutional layers
    enc_conv_kernel_size=(5,),  # Size of encoder convolution filters for each layer
    enc_conv_channels=512,  # Number of encoder convolution filters for each layer
    encoder_lstm_units=256,  # Number of LSTM units for each direction (forward and backward)
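    # Note (derived, not an original comment): with 256 LSTM units per direction, the
    # bidirectional encoder produces 512-dimensional outputs.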

    # Attention mechanism
    smoothing=False,  # Whether to smooth the attention normalization function
    attention_dim=128,  # Dimension of attention space
    attention_filters=32,  # Number of attention convolution filters
    attention_kernel=(31,),  # Kernel size of attention convolution
    cumulative_weights=True,
    # Whether to accumulate (sum) all previous attention weights or simply feed previous weights
    # (Recommended: True)

    # Decoder
    prenet_layers=[256, 256],  # Number of layers and number of units of the prenet
    decoder_layers=2,  # Number of decoder LSTM layers
    decoder_lstm_units=1024,  # Number of decoder LSTM units on each layer
    max_iters=2000,
    # Max decoder steps during inference (just for safety from infinite loop cases)

    # Residual postnet
    postnet_num_layers=5,  # Number of postnet convolutional layers
    postnet_kernel_size=(5,),  # Size of postnet convolution filters for each layer
    postnet_channels=512,  # Number of postnet convolution filters for each layer

    # CBHG mel->linear postnet
    cbhg_kernels=8,
    # All kernel sizes from 1 to cbhg_kernels will be used in the convolution bank of CBHG to act
    # as "K-grams"
    cbhg_conv_channels=128,  # Channels of the convolution bank
    cbhg_pool_size=2,  # Pooling size of the CBHG
    cbhg_projection=256,
    # Projection channels of the CBHG (1st projection, 2nd is automatically set to num_mels)
    cbhg_projection_kernel_size=3,  # Kernel size of the CBHG projections
    cbhg_highwaynet_layers=4,  # Number of HighwayNet layers
    cbhg_highway_units=128,  # Number of units used in HighwayNet fully connected layers
    cbhg_rnn_units=128,
    # Number of GRU units used in the bidirectional RNN of the CBHG block. CBHG output is
    # 2 x rnn_units in shape

    # Loss params
    mask_encoder=True,
    # Whether to mask encoder padding while computing attention. Set to True for better prosody
    # but slower convergence.
    mask_decoder=False,
    # Whether to use a loss mask for padded sequences (if False, the <stop_token> loss function
    # will not be weighted, else recommended pos_weight = 20)
    cross_entropy_pos_weight=20,
    # Use class weights to reduce the stop token class imbalance (by adding more penalty on
    # False Negatives (FN)) (1 = disabled)
    predict_linear=False,
    # Whether to add a post-processing network to the Tacotron to predict linear spectrograms
    # (True mode not tested!)
    ###########################################################################################

    # Tacotron Training
    # Reproduction seeds
    tacotron_random_seed=5339,
    # Determines the initial graph and operations (i.e. model) random state for reproducibility
    tacotron_data_random_state=1234,  # Random state for train/test split repeatability

    # Performance parameters
    tacotron_swap_with_cpu=False,
    # Whether to use the CPU as support to the GPU for decoder computation (not recommended: may
    # cause major slowdowns! Only use when critical!)

    # Train/test split ratios, mini-batch sizes
    tacotron_batch_size=36,  # Number of training samples on each training step (was 32)
    # Tacotron batch synthesis supports ~16x the training batch size (no gradients during
    # testing).
    # Training Tacotron with unmasked paddings makes it aware of them, which makes synthesis
    # times different from training. We thus recommend masking the encoder.
    tacotron_synthesis_batch_size=128,
    # DO NOT MAKE THIS BIGGER THAN 1 IF YOU DIDN'T TRAIN TACOTRON WITH "mask_encoder=True"!!
    tacotron_test_size=0.05,
    # % of data to keep as test data; if None, tacotron_test_batches must not be None. (5% is
    # enough to have a good idea about overfit)
    tacotron_test_batches=None,  # Number of test batches.

    # Learning rate schedule
    tacotron_decay_learning_rate=True,
    # Boolean; determines whether the learning rate will follow an exponential decay
    tacotron_start_decay=50000,  # Step at which learning rate decay starts
    tacotron_decay_steps=50000,  # Determines the learning rate decay slope (UNDER TEST)
    tacotron_decay_rate=0.5,  # Learning rate decay rate (UNDER TEST)
    tacotron_initial_learning_rate=1e-3,  # Starting learning rate
    tacotron_final_learning_rate=1e-5,  # Minimal learning rate
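    # A common reading of the decay parameters above (sketch/assumption, not the original code):
    #   lr(step) = max(final_lr, initial_lr * decay_rate ** ((step - start_decay) / decay_steps))
    # once step > start_decay, i.e. the rate halves every 50k steps until it reaches 1e-5.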

    # Optimization parameters
    tacotron_adam_beta1=0.9,  # AdamOptimizer beta1 parameter
    tacotron_adam_beta2=0.999,  # AdamOptimizer beta2 parameter
    tacotron_adam_epsilon=1e-6,  # AdamOptimizer epsilon parameter

    # Regularization parameters
    tacotron_reg_weight=1e-7,  # Regularization weight (for L2 regularization)
    tacotron_scale_regularization=False,
    # Whether to rescale regularization weight to adapt to the outputs range (used when
    # reg_weight is high and biasing the model)
    tacotron_zoneout_rate=0.1,  # Zoneout rate for all LSTM cells in the network
    tacotron_dropout_rate=0.5,  # Dropout rate for all convolutional layers + prenet
    tacotron_clip_gradients=True,  # Whether to clip gradients

    # Evaluation parameters
    natural_eval=False,
    # Whether to use 100% natural eval (to evaluate Curriculum Learning performance) or the same
    # teacher forcing ratio as in training (just for overfit)

    # Decoder RNN learning can be done in one of two ways:
    #   Teacher Forcing: vanilla teacher forcing (usually with ratio = 1). mode="constant"
    #   Curriculum Learning Scheme: from teacher forcing to sampling from previous outputs as a
    #       function of the global step. (teacher forcing ratio decay) mode="scheduled"
    # The second approach is inspired by:
    # Bengio et al. 2015: Scheduled Sampling for Sequence Prediction with Recurrent Neural Networks.
    # It can be found under: https://arxiv.org/pdf/1506.03099.pdf
    tacotron_teacher_forcing_mode="constant",
    # Can be "constant" or "scheduled". "scheduled" mode applies a cosine teacher forcing ratio
    # decay. (Preference: scheduled)
    tacotron_teacher_forcing_ratio=1.,
    # Value in [0., 1.] (0.=0%, 1.=100%); determines the % of times we force next decoder
    # inputs. Only relevant if mode="constant"
    tacotron_teacher_forcing_init_ratio=1.,
    # Initial teacher forcing ratio. Relevant if mode="scheduled"
    tacotron_teacher_forcing_final_ratio=0.,
    # Final teacher forcing ratio. Relevant if mode="scheduled"
    tacotron_teacher_forcing_start_decay=10000,
    # Starting point of teacher forcing ratio decay. Relevant if mode="scheduled"
    tacotron_teacher_forcing_decay_steps=280000,
    # Determines the teacher forcing ratio decay slope. Relevant if mode="scheduled"
    tacotron_teacher_forcing_decay_alpha=0.,
    # Teacher forcing ratio decay rate. Relevant if mode="scheduled"
    ###########################################################################################
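    # Sketch of the "scheduled" cosine decay (assumption based on the standard
    # tf.train.cosine_decay formula, not the original training code):
    #   t = clip((step - start_decay) / decay_steps, 0, 1)
    #   ratio = final + (init - final) * ((1 - alpha) * 0.5 * (1 + cos(pi * t)) + alpha)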

    # Tacotron-2 integration parameters
    train_with_GTA=False,
    # Whether to use GTA (ground-truth aligned) mels to train WaveNet instead of ground truth mels.
    ###########################################################################################

    # Eval sentences (if no eval text file was specified during synthesis, these sentences are
    # used for eval)
    sentences=[
        # From July 8, 2017 New York Times:
        "Scientists at the CERN laboratory say they have discovered a new particle.",
        "There's a way to measure the acute emotional intelligence that has never gone out of "
        "style.",
        "President Trump met with other leaders at the Group of 20 conference.",
        "The Senate's bill to repeal and replace the Affordable Care Act is now imperiled.",
        # From Google's Tacotron example page:
        "Generative adversarial network or variational auto-encoder.",
        "Basilar membrane and otolaryngology are not auto-correlations.",
        "He has read the whole thing.",
        "He reads books.",
        "He thought it was time to present the present.",
        "Thisss isrealy awhsome.",
        "Punctuation sensitivity, is working.",
        "Punctuation sensitivity is working.",
        "Peter Piper picked a peck of pickled peppers. How many pickled peppers did Peter Piper pick?",
        "She sells sea-shells on the sea-shore. The shells she sells are sea-shells I'm sure.",
        "Tajima Airport serves Toyooka.",
        # From the web (random long utterance)
        "Sequence to sequence models have enjoyed great success in a variety of tasks such as "
        "machine translation, speech recognition, and text summarization. This project covers a "
        "sequence to sequence model trained to predict a speech representation from an input "
        "sequence of characters. We show that the adopted architecture is able to perform this "
        "task with wild success.",
        "Thank you so much for your support!",
    ],
)

def hparams_debug_string():
    values = hparams.values()
    hp = ["  %s: %s" % (name, values[name]) for name in sorted(values) if name != "sentences"]
    return "Hyperparameters:\n" + "\n".join(hp)
return "Hyperparameters:\n" + "\n".join(hp) |