# Mirror of https://github.com/babysor/MockingBird.git
# Synced 2024-03-22 13:11:31 +08:00
# 79 lines, 4.5 KiB, Python
from utils.hparams import HParams
|
|
|
|
# Global hyperparameter set: signal processing, Tacotron TTS model and training,
# data preprocessing, audio options and SV2TTS settings.
hparams = HParams(
    ### Signal Processing (used in both synthesizer and vocoder)
    sample_rate = 16000,
    n_fft = 1024,                               # FFT size (filter_length)
    num_mels = 80,
    hop_size = 256,                             # Frame shift in samples: 16 ms at 16 kHz.
                                                # (Tacotron's 12.5 ms shift would be sample_rate * 0.0125 = 200.)
    win_size = 1024,                            # Frame length in samples: 64 ms at 16 kHz.
                                                # (Tacotron's 50 ms length would be sample_rate * 0.050 = 800.)
    fmin = 55,
    min_level_db = -100,
    ref_level_db = 20,
    max_abs_value = 4.,                         # Gradient explodes if too big, premature convergence if too small.
    preemphasis = 0.97,                         # Filter coefficient to use if preemphasize is True
    preemphasize = True,

    ### Tacotron Text-to-Speech (TTS)
    tts_embed_dims = 512,                       # Embedding dimension for the grapheme/phoneme inputs
    tts_encoder_dims = 256,
    tts_decoder_dims = 128,
    tts_postnet_dims = 512,
    tts_encoder_K = 5,
    tts_lstm_dims = 1024,
    tts_postnet_K = 5,
    tts_num_highways = 4,
    tts_dropout = 0.5,
    tts_cleaner_names = ["basic_cleaners"],
    tts_stop_threshold = -3.4,                  # Value below which audio generation ends.
                                                # For example, for a range of [-4, 4], this
                                                # will terminate the sequence at the first
                                                # frame that has all values < -3.4

    ### Tacotron Training
    # Progressive training schedule; each entry is (r, lr, step, batch_size):
    #   r  = reduction factor (# of mel frames synthesized for each decoder iteration)
    #   lr = learning rate
    tts_schedule = [(2, 1e-3,  10_000, 12),
                    (2, 5e-4,  15_000, 12),
                    (2, 2e-4,  20_000, 12),
                    (2, 1e-4,  30_000, 12),
                    (2, 5e-5,  40_000, 12),
                    (2, 1e-5,  60_000, 12),
                    (2, 5e-6, 160_000, 12),
                    (2, 3e-6, 320_000, 12),
                    (2, 1e-6, 640_000, 12)],

    tts_clip_grad_norm = 1.0,                   # Clips the gradient norm to prevent explosion - set to None if not needed
    tts_eval_interval = 500,                    # Number of steps between model evaluation (sample generation)
                                                # Set to -1 to generate after completing epoch, or 0 to disable
    tts_eval_num_samples = 1,                   # Makes this number of samples

    ## For finetune usage: if set, only the selected layers will be trained.
    ## Available: encoder, encoder_proj, gst, decoder, postnet, post_proj
    tts_finetune_layers = [],

    ### Data Preprocessing
    max_mel_frames = 900,
    rescale = True,
    rescaling_max = 0.9,
    synthesis_batch_size = 16,                  # For vocoder preprocessing and inference.

    ### Mel Visualization and Griffin-Lim
    signal_normalization = True,
    power = 1.5,
    griffin_lim_iters = 60,

    ### Audio processing options
    fmax = 7600,                                # Should not exceed (sample_rate // 2)
    allow_clipping_in_normalization = True,     # Used when signal_normalization = True
    clip_mels_length = True,                    # If true, discards samples exceeding max_mel_frames
    use_lws = False,                            # "Fast spectrogram phase recovery using local weighted sums"
    symmetric_mels = True,                      # Sets mel range to [-max_abs_value, max_abs_value] if True,
                                                # and [0, max_abs_value] if False
    trim_silence = False,                       # Use with sample_rate of 16000 for best results

    ### SV2TTS
    speaker_embedding_size = 256,               # Dimension for the speaker embedding
    silence_min_duration_split = 0.4,           # Duration in seconds of a silence for an utterance to be split
    utterance_min_duration = 0.5,               # Duration in seconds below which utterances are discarded
    use_gst = True,                             # Whether to use global style token
    use_ser_for_gst = True,                     # Whether to use speaker embedding referenced for global style token
)
|