Fixed the problem that existing model checkpoints could not be loaded when resuming training of the GAN models (#549)

* The new vocoder Fre-GAN is now supported
* Improved some Fre-GAN details
* Fixed the problem that an existing model could not be loaded to continue training when training a GAN
* Updated reference papers
2024-03-22 13:11:31 +08:00 · 2022-05-13 13:41:03 +08:00 · 2022-05-13 13:41:03 +08:00 · 350b190662
commit 350b190662
parent 0caed984e3
6 changed files with 10 additions and 8 deletions
--- a/README-CN.md
+++ b/README-CN.md
@@ -141,6 +141,7 @@
 | --- | ----------- | ----- | --------------------- |
 | [1803.09017](https://arxiv.org/abs/1803.09017) | GlobalStyleToken (synthesizer)| Style Tokens: Unsupervised Style Modeling, Control and Transfer in End-to-End Speech Synthesis | 本代码库 |
 | [2010.05646](https://arxiv.org/abs/2010.05646) | HiFi-GAN (vocoder)| Generative Adversarial Networks for Efficient and High Fidelity Speech Synthesis | 本代码库 |
+| [2106.02297](https://arxiv.org/abs/2106.02297) | Fre-GAN (vocoder)| Fre-GAN: Adversarial Frequency-consistent Audio Synthesis | 本代码库 |
 |[**1806.04558**](https://arxiv.org/pdf/1806.04558.pdf) | SV2TTS | Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis | 本代码库 |
 |[1802.08435](https://arxiv.org/pdf/1802.08435.pdf) | WaveRNN (vocoder) | Efficient Neural Audio Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN) |
 |[1703.10135](https://arxiv.org/pdf/1703.10135.pdf) | Tacotron (synthesizer) | Tacotron: Towards End-to-End Speech Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN)
--- a/README.md
+++ b/README.md
@@ -97,6 +97,7 @@ You can then try the toolbox:
 | --- | ----------- | ----- | --------------------- |
 | [1803.09017](https://arxiv.org/abs/1803.09017) | GlobalStyleToken (synthesizer)| Style Tokens: Unsupervised Style Modeling, Control and Transfer in End-to-End Speech Synthesis | This repo |
 | [2010.05646](https://arxiv.org/abs/2010.05646) | HiFi-GAN (vocoder)| Generative Adversarial Networks for Efficient and High Fidelity Speech Synthesis | This repo |
+| [2106.02297](https://arxiv.org/abs/2106.02297) | Fre-GAN (vocoder)| Fre-GAN: Adversarial Frequency-consistent Audio Synthesis | This repo |
 |[**1806.04558**](https://arxiv.org/pdf/1806.04558.pdf) | **SV2TTS** | **Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis** | This repo |
 |[1802.08435](https://arxiv.org/pdf/1802.08435.pdf) | WaveRNN (vocoder) | Efficient Neural Audio Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN) |
 |[1703.10135](https://arxiv.org/pdf/1703.10135.pdf) | Tacotron (synthesizer) | Tacotron: Towards End-to-End Speech Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN)
--- a/vocoder/fregan/train.py
+++ b/vocoder/fregan/train.py
@@ -51,8 +51,8 @@ def train(rank, a, h):
        print("checkpoints directory : ", a.checkpoint_path)

    if os.path.isdir(a.checkpoint_path):
-        cp_g = scan_checkpoint(a.checkpoint_path, 'g_')
-        cp_do = scan_checkpoint(a.checkpoint_path, 'do_')
+        cp_g = scan_checkpoint(a.checkpoint_path, 'g_fregan_')
+        cp_do = scan_checkpoint(a.checkpoint_path, 'do_fregan_')

    steps = 0
    if cp_g is None or cp_do is None:
--- a/vocoder/fregan/utils.py
+++ b/vocoder/fregan/utils.py
@@ -58,7 +58,7 @@ def save_checkpoint(filepath, obj):


 def scan_checkpoint(cp_dir, prefix):
-    pattern = os.path.join(cp_dir, prefix + '????????')
+    pattern = os.path.join(cp_dir, prefix + '????????.pt')
    cp_list = glob.glob(pattern)
    if len(cp_list) == 0:
        return None
--- a/vocoder/hifigan/train.py
+++ b/vocoder/hifigan/train.py
@@ -51,8 +51,8 @@ def train(rank, a, h):
        print("checkpoints directory : ", a.checkpoint_path)

    if os.path.isdir(a.checkpoint_path):
-        cp_g = scan_checkpoint(a.checkpoint_path, 'g_')
-        cp_do = scan_checkpoint(a.checkpoint_path, 'do_')
+        cp_g = scan_checkpoint(a.checkpoint_path, 'g_hifigan_')
+        cp_do = scan_checkpoint(a.checkpoint_path, 'do_hifigan_')

    steps = 0
    if cp_g is None or cp_do is None:
@@ -181,10 +181,10 @@ def train(rank, a, h):

                # checkpointing
                if steps % a.checkpoint_interval == 0 and steps != 0:
-                    checkpoint_path = "{}/g_{:08d}.pt".format(a.checkpoint_path, steps)
+                    checkpoint_path = "{}/g_hifigan_{:08d}.pt".format(a.checkpoint_path, steps)
                    save_checkpoint(checkpoint_path,
                                    {'generator': (generator.module if h.num_gpus > 1 else generator).state_dict()})
-                    checkpoint_path = "{}/do_{:08d}.pt".format(a.checkpoint_path, steps)
+                    checkpoint_path = "{}/do_hifigan_{:08d}.pt".format(a.checkpoint_path, steps)
                    save_checkpoint(checkpoint_path,
                                    {'mpd': (mpd.module if h.num_gpus > 1 else mpd).state_dict(),
                                     'msd': (msd.module if h.num_gpus > 1 else msd).state_dict(),
--- a/vocoder/hifigan/utils.py
+++ b/vocoder/hifigan/utils.py
@@ -50,7 +50,7 @@ def save_checkpoint(filepath, obj):


 def scan_checkpoint(cp_dir, prefix):
-    pattern = os.path.join(cp_dir, prefix + 'hifigan.pt')
+    pattern = os.path.join(cp_dir, prefix + '????????.pt')
    cp_list = glob.glob(pattern)
    if len(cp_list) == 0:
        return None