MockingBird/synthesizer/hparams.py

import ast
import pprint

class HParams:
    """Attribute-style container for hyperparameters.

    Every keyword argument passed to the constructor becomes an instance
    attribute. Values can also be read and written with ``obj[key]``.
    """

    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)

    def __setitem__(self, key, value):
        setattr(self, key, value)

    def __getitem__(self, key):
        # Delegates to getattr, so a missing key raises AttributeError
        # (not KeyError).
        return getattr(self, key)

    def __repr__(self):
        return pprint.pformat(self.__dict__)

    def parse(self, string: str) -> "HParams":
        """Override hparams from a comma-separated string of name=value pairs.

        Each value is evaluated with ``ast.literal_eval``, so it must be a
        Python literal (number, quoted string, ``True``/``False``/``None``).
        Only the first ``=`` of a pair separates the name from the value, so
        a quoted value may itself contain ``=``. When a name is repeated, the
        last occurrence wins. Empty segments (e.g. a trailing comma) are
        ignored.

        The string is split on every ``,``, so values that contain commas
        (lists, tuples) cannot be expressed here.

        Args:
            string: Overrides such as ``"sample_rate=22050, rescale=False"``.
                An empty string (or ``None``) leaves the object unchanged.

        Returns:
            ``self``, so the call can be chained.

        Raises:
            ValueError: A segment has no ``=`` or an empty name, or a value
                is not a valid literal.
            SyntaxError: A value is not parseable Python (raised by
                ``ast.literal_eval``).
        """
        if not string:
            return self

        # Parse everything first so a malformed pair cannot leave the object
        # half-updated; a dict also makes "last occurrence wins" explicit.
        overrides = {}
        for pair in string.split(","):
            if not pair.strip():
                continue
            name, separator, raw_value = pair.partition("=")
            name = name.strip()
            if not separator or not name:
                raise ValueError(
                    f"Invalid hparams override {pair.strip()!r}: "
                    "expected name=value"
                )
            overrides[name] = ast.literal_eval(raw_value.strip())

        self.__dict__.update(overrides)
        return self

hparams = HParams(
    # ------------------------------------------------------------------
    # Signal processing (used in both synthesizer and vocoder)
    # ------------------------------------------------------------------
    sample_rate=16000,
    n_fft=800,
    num_mels=80,
    # Tacotron uses 12.5 ms frame shift (set to sample_rate * 0.0125).
    hop_size=200,
    # Tacotron uses 50 ms frame length (set to sample_rate * 0.050).
    win_size=800,
    fmin=55,
    min_level_db=-100,
    ref_level_db=20,
    # Gradient explodes if too big, premature convergence if too small.
    max_abs_value=4.0,
    # Filter coefficient to use if preemphasize is True.
    preemphasis=0.97,
    preemphasize=True,

    # ------------------------------------------------------------------
    # Tacotron text-to-speech (TTS)
    # ------------------------------------------------------------------
    # Embedding dimension for the graphemes/phoneme inputs.
    tts_embed_dims=512,
    tts_encoder_dims=256,
    tts_decoder_dims=128,
    tts_postnet_dims=512,
    tts_encoder_K=5,
    tts_lstm_dims=1024,
    tts_postnet_K=5,
    tts_num_highways=4,
    tts_dropout=0.5,
    tts_cleaner_names=["basic_cleaners"],
    # Value below which audio generation ends. For example, for a range of
    # [-4, 4], this will terminate the sequence at the first frame that has
    # all values < -3.4.
    tts_stop_threshold=-3.4,

    # ------------------------------------------------------------------
    # Tacotron training
    # ------------------------------------------------------------------
    # Progressive training schedule; each entry is
    # (r, lr, step, batch_size), where
    #   r  = reduction factor (# of mel frames synthesized for each
    #        decoder iteration)
    #   lr = learning rate
    tts_schedule=[
        (2, 1e-3, 10_000, 12),
        (2, 5e-4, 15_000, 12),
        (2, 2e-4, 20_000, 12),
        (2, 1e-4, 30_000, 12),
        (2, 5e-5, 40_000, 12),
        (2, 1e-5, 60_000, 12),
        (2, 5e-6, 160_000, 12),
        (2, 3e-6, 320_000, 12),
        (2, 1e-6, 640_000, 12),
    ],
    # Clips the gradient norm to prevent explosion - set to None if not
    # needed.
    tts_clip_grad_norm=1.0,
    # Number of steps between model evaluation (sample generation).
    # Set to -1 to generate after completing epoch, or 0 to disable.
    tts_eval_interval=500,
    # Makes this number of samples.
    tts_eval_num_samples=1,

    # ------------------------------------------------------------------
    # Data preprocessing
    # ------------------------------------------------------------------
    max_mel_frames=900,
    rescale=True,
    rescaling_max=0.9,
    # For vocoder preprocessing and inference.
    synthesis_batch_size=16,

    # ------------------------------------------------------------------
    # Mel visualization and Griffin-Lim
    # ------------------------------------------------------------------
    signal_normalization=True,
    power=1.5,
    griffin_lim_iters=60,

    # ------------------------------------------------------------------
    # Audio processing options
    # ------------------------------------------------------------------
    # Should not exceed (sample_rate // 2).
    fmax=7600,
    # Used when signal_normalization = True.
    allow_clipping_in_normalization=True,
    # If true, discards samples exceeding max_mel_frames.
    clip_mels_length=True,
    # "Fast spectrogram phase recovery using local weighted sums".
    use_lws=False,
    # Sets mel range to [-max_abs_value, max_abs_value] if True,
    # and [0, max_abs_value] if False.
    symmetric_mels=True,
    # Use with sample_rate of 16000 for best results.
    trim_silence=True,

    # ------------------------------------------------------------------
    # SV2TTS
    # ------------------------------------------------------------------
    # Dimension for the speaker embedding.
    speaker_embedding_size=256,
    # Duration in seconds of a silence for an utterance to be split.
    silence_min_duration_split=0.4,
    # Duration in seconds below which utterances are discarded.
    utterance_min_duration=1.6,
)
Init to support Chinese Dataset. 2021-08-07 11:56:00 +08:00			`import ast`
			`import pprint`

			`class HParams(object):`
			`def __init__(self, **kwargs): self.__dict__.update(kwargs)`
			`def __setitem__(self, key, value): setattr(self, key, value)`
			`def __getitem__(self, key): return getattr(self, key)`
			`def __repr__(self): return pprint.pformat(self.__dict__)`

			`def parse(self, string):`
			`# Overrides hparams from a comma-separated string of name=value pairs`
			`if len(string) > 0:`
			`overrides = [s.split("=") for s in string.split(",")]`
			`keys, values = zip(*overrides)`
			`keys = list(map(str.strip, keys))`
			`values = list(map(str.strip, values))`
			`for k in keys:`
			`self.__dict__[k] = ast.literal_eval(values[keys.index(k)])`
			`return self`

			`hparams = HParams(`
			`### Signal Processing (used in both synthesizer and vocoder)`
			`sample_rate = 16000,`
			`n_fft = 800,`
			`num_mels = 80,`
			`hop_size = 200, # Tacotron uses 12.5 ms frame shift (set to sample_rate * 0.0125)`
			`win_size = 800, # Tacotron uses 50 ms frame length (set to sample_rate * 0.050)`
			`fmin = 55,`
			`min_level_db = -100,`
			`ref_level_db = 20,`
			`max_abs_value = 4., # Gradient explodes if too big, premature convergence if too small.`
			`preemphasis = 0.97, # Filter coefficient to use if preemphasize is True`
			`preemphasize = True,`

			`### Tacotron Text-to-Speech (TTS)`
			`tts_embed_dims = 512, # Embedding dimension for the graphemes/phoneme inputs`
			`tts_encoder_dims = 256,`
			`tts_decoder_dims = 128,`
			`tts_postnet_dims = 512,`
			`tts_encoder_K = 5,`
			`tts_lstm_dims = 1024,`
			`tts_postnet_K = 5,`
			`tts_num_highways = 4,`
Web server release v2 (#99) * Init App * init server.py (#93) * init server.py * Update requirements.txt Add requirement Co-authored-by: auau <auau@test.com> Co-authored-by: babysor00 <babysor00@gmail.com> * Run web.py！ Run web.py！ * Restruct readme and add instruction to use web server * fix training preprocess of vocoder * Init App * init server.py (#93) * init server.py * Update requirements.txt Add requirement Co-authored-by: auau <auau@test.com> Co-authored-by: babysor00 <babysor00@gmail.com> * Run web.py！ Run web.py！ * fix training preprocess of vocoder * Refactor to restful style Co-authored-by: balala <Ozgay@users.noreply.github.com> Co-authored-by: auau <auau@test.com> 2021-09-25 17:07:46 +08:00			`tts_dropout = 0.5,`
Init to support Chinese Dataset. 2021-08-07 11:56:00 +08:00			`tts_cleaner_names = ["basic_cleaners"],`
			`tts_stop_threshold = -3.4, # Value below which audio generation ends.`
			`# For example, for a range of [-4, 4], this`
			`# will terminate the sequence at the first`
			`# frame that has all values < -3.4`

			`### Tacotron Training`
Update demo and training param A 2021-10-05 10:48:54 +08:00			`tts_schedule = [(2, 1e-3, 10_000, 12), # Progressive training schedule`
			`(2, 5e-4, 15_000, 12), # (r, lr, step, batch_size)`
			`(2, 2e-4, 20_000, 12), # (r, lr, step, batch_size)`
			`(2, 1e-4, 30_000, 12), #`
			`(2, 5e-5, 40_000, 12), #`
			`(2, 1e-5, 60_000, 12), #`
			`(2, 5e-6, 160_000, 12), # r = reduction factor (# of mel frames`
			`(2, 3e-6, 320_000, 12), # synthesized for each decoder iteration)`
			`(2, 1e-6, 640_000, 12)], # lr = learning rate`
Init to support Chinese Dataset. 2021-08-07 11:56:00 +08:00
			`tts_clip_grad_norm = 1.0, # clips the gradient norm to prevent explosion - set to None if not needed`
			`tts_eval_interval = 500, # Number of steps between model evaluation (sample generation)`
			`# Set to -1 to generate after completing epoch, or 0 to disable`

			`tts_eval_num_samples = 1, # Makes this number of samples`

			`### Data Preprocessing`
			`max_mel_frames = 900,`
			`rescale = True,`
			`rescaling_max = 0.9,`
			`synthesis_batch_size = 16, # For vocoder preprocessing and inference.`

			`### Mel Visualization and Griffin-Lim`
			`signal_normalization = True,`
			`power = 1.5,`
			`griffin_lim_iters = 60,`

			`### Audio processing options`
			`fmax = 7600, # Should not exceed (sample_rate // 2)`
			`allow_clipping_in_normalization = True, # Used when signal_normalization = True`
			`clip_mels_length = True, # If true, discards samples exceeding max_mel_frames`
			`use_lws = False, # "Fast spectrogram phase recovery using local weighted sums"`
			`symmetric_mels = True, # Sets mel range to [-max_abs_value, max_abs_value] if True,`
			`# and [0, max_abs_value] if False`
			`trim_silence = True, # Use with sample_rate of 16000 for best results`

			`### SV2TTS`
			`speaker_embedding_size = 256, # Dimension for the speaker embedding`
			`silence_min_duration_split = 0.4, # Duration in seconds of a silence for an utterance to be split`
			`utterance_min_duration = 1.6, # Duration in seconds below which utterances are discarded`
			`)`