Support new dataset SLR68! Try `python synthesizer_preprocess_audio.py ...\slr --dataset SLR68`

babysor00 2021-08-13 00:41:58 +08:00
parent 752f5dbefb
commit 4d6da5d49b
4 changed files with 48 additions and 38 deletions

View File

@ -7,7 +7,7 @@
### [English](README.md) | 中文
## Features
- 🌍 **Chinese** supports Mandarin and is tested with the dataset aidatatang_200zh
+ 🌍 **Chinese** supports Mandarin and is tested with multiple Chinese datasets: aidatatang_200zh, SLR68
🤩 **PyTorch** works with PyTorch, tested on version 1.9.0 (latest as of August 2021) with GPU Tesla T4 and GTX 2060
@ -33,6 +33,7 @@ https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Pretrained-models
* Download the aidatatang_200zh dataset and unzip it: make sure you can access all the .wav files in the *train* folder
* Preprocess the audio and the mel spectrograms:
`python synthesizer_preprocess_audio.py <datasets_root>`
+ The argument `--dataset {dataset}` can be passed to select the corpus; aidatatang_200zh and SLR68 are supported
* Preprocess the embeddings:
`python synthesizer_preprocess_embeds.py <datasets_root>/SV2TTS/synthesizer`
@ -48,8 +49,8 @@ https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Pretrained-models
Then you can try the toolbox:
`python demo_toolbox.py -d <datasets_root>`
- TODO
+ ## TODO
- - Add demo video
+ - [X] Add demo video
- - Add support for more datasets
+ - [X] Add support for more datasets
- - Upload pretrained model
+ - [ ] Upload pretrained model
- - 🙏 Welcome to add more
+ - [ ] 🙏 Welcome to add more

View File

@ -6,7 +6,7 @@
> English | [中文](README-CN.md)
## Features
- 🌍 **Chinese** supports Mandarin and is tested with the dataset aidatatang_200zh
+ 🌍 **Chinese** supports Mandarin and is tested with multiple datasets: aidatatang_200zh, SLR68
🤩 **PyTorch** works with PyTorch, tested on version 1.9.0 (latest as of August 2021) with GPU Tesla T4 and GTX 2060
@ -35,7 +35,7 @@ https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Pretrained-models
* Download the aidatatang_200zh dataset and unzip it: make sure you can access all the .wav files in the *train* folder
* Preprocess the audio and the mel spectrograms:
`python synthesizer_preprocess_audio.py <datasets_root>`
+ Pass the parameter `--dataset {dataset}` to select the corpus; aidatatang_200zh and SLR68 are supported
* Preprocess the embeddings:
`python synthesizer_preprocess_embeds.py <datasets_root>/SV2TTS/synthesizer`
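Editor's aside (not part of this commit's diff): a minimal argparse sketch of how a `--dataset` switch like the one referenced above can be limited to the supported corpora. Only the `--dataset` flag name comes from the commit; the parser layout and other argument names are assumptions, not the repo's actual `synthesizer_preprocess_audio.py`.

```python
# Sketch only: restrict --dataset to the corpora this commit supports.
# Everything except the --dataset flag name is an assumption.
import argparse
from pathlib import Path

SUPPORTED_DATASETS = ["aidatatang_200zh", "SLR68"]

parser = argparse.ArgumentParser(description="Preprocess audio for the synthesizer")
parser.add_argument("datasets_root", type=Path,
                    help="directory that contains the dataset folder, e.g. <datasets_root>/SLR68")
parser.add_argument("--dataset", default="aidatatang_200zh", choices=SUPPORTED_DATASETS,
                    help="which corpus layout to use when locating wavs and transcripts")
args = parser.parse_args()
print(f"Preprocessing {args.dataset} under {args.datasets_root}")
```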
@ -56,6 +56,6 @@ or
## TODO
- [x] Add demo video
- - [ ] Add support for more dataset
+ - [X] Add support for more dataset
- [ ] Upload pretrained model
- 🙏 Welcome to add more

View File

@ -6,22 +6,28 @@ from pathlib import Path
from tqdm import tqdm
import numpy as np
from encoder import inference as encoder
- from synthesizer.preprocess_speaker import preprocess_speaker_aidatatang_200zh
+ from synthesizer.preprocess_speaker import preprocess_speaker_general

data_info = {
    "aidatatang_200zh": {
        "subfolders": ["corpus/train"],
-         "speak_func": preprocess_speaker_aidatatang_200zh
-     }
-     # TODO add more
+         "trans_filepath": "transcript/aidatatang_200_zh_transcript.txt",
+         "speak_func": preprocess_speaker_general
+     },
+     "SLR68": {
+         "subfolders": ["train"],
+         "trans_filepath": "train/TRANS.txt",
+         "speak_func": preprocess_speaker_general
+     },
}

def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
                       skip_existing: bool, hparams, no_alignments: bool,
                       dataset: str):
+     dataset_info = data_info[dataset]
    # Gather the input directories
    dataset_root = datasets_root.joinpath(dataset)
-     input_dirs = [dataset_root.joinpath(subfolder.strip()) for subfolder in data_info[dataset]["subfolders"]]
+     input_dirs = [dataset_root.joinpath(subfolder.strip()) for subfolder in dataset_info["subfolders"]]
    print("\n ".join(map(str, ["Using data from:"] + input_dirs)))
    assert all(input_dir.exists() for input_dir in input_dirs)
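Editor's note (not part of the diff): the `data_info` registry above is what `--dataset` selects from, so supporting a further corpus amounts to adding another entry. A hypothetical sketch; `my_corpus` and its paths are invented placeholders:

```python
# Hypothetical: what one more data_info entry could look like.
# "my_corpus" and its folder/transcript paths are placeholders, not real datasets;
# preprocess_speaker_general is the shared routine imported at the top of this file.
from synthesizer.preprocess_speaker import preprocess_speaker_general

data_info_extra = {
    "my_corpus": {
        "subfolders": ["wav/train"],               # folders holding one subdirectory per speaker
        "trans_filepath": "doc/trans.txt",         # lines of "<utterance_id> <text>"
        "speak_func": preprocess_speaker_general,  # reuse the shared per-speaker routine
    },
}
```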
@ -34,9 +40,19 @@ def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
    metadata_file = metadata_fpath.open("a" if skip_existing else "w", encoding="utf-8")

    # Preprocess the dataset
+     dict_info = {}
+     transcript_dirs = dataset_root.joinpath(dataset_info["trans_filepath"])
+     assert transcript_dirs.exists(), str(transcript_dirs)+" not exist."
+     with open(transcript_dirs, "r", encoding="utf-8") as dict_transcript:
+         for v in dict_transcript:
+             if not v:
+                 continue
+             v = v.strip().replace("\n","").replace("\t"," ").split(" ")
+             dict_info[v[0]] = " ".join(v[1:])
    speaker_dirs = list(chain.from_iterable(input_dir.glob("*") for input_dir in input_dirs))
-     func = partial(data_info[dataset]["speak_func"], out_dir=out_dir, skip_existing=skip_existing,
-                    hparams=hparams, directory=dataset_root, no_alignments=no_alignments)
+     func = partial(dataset_info["speak_func"], out_dir=out_dir, skip_existing=skip_existing,
+                    hparams=hparams, dict_info=dict_info, no_alignments=no_alignments)
    job = Pool(n_processes).imap(func, speaker_dirs)
    for speaker_metadata in tqdm(job, dataset, len(speaker_dirs), unit="speakers"):
        for metadatum in speaker_metadata:

View File

@ -1,4 +1,3 @@
- import platform
import librosa
import numpy as np
@ -68,36 +67,30 @@ def _split_on_silences_aidatatang_200zh(wav_fpath, words, hparams):
    wav = librosa.effects.trim(wav, top_db= 40, frame_length=2048, hop_length=512)[0]
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max
+     # denoise, we may not need it here.
+     if len(wav) > hparams.sample_rate*(0.3+0.1):
+         noise_wav = np.concatenate([wav[:int(hparams.sample_rate*0.15)],
+                                     wav[-int(hparams.sample_rate*0.15):]])
+         profile = logmmse.profile_noise(noise_wav, hparams.sample_rate)
+         wav = logmmse.denoise(wav, profile, eta=0)
    resp = pinyin(words, style=Style.TONE3)
    res = [v[0] for v in resp if v[0].strip()]
    res = " ".join(res)
    return wav, res

- def preprocess_speaker_aidatatang_200zh(speaker_dir, out_dir: Path, skip_existing: bool, hparams, directory, no_alignments: bool):
-     dict_info = {}
-     transcript_dirs = directory.joinpath("transcript/aidatatang_200_zh_transcript.txt")
-     with open(transcript_dirs,"rb") as fp:
-         dict_transcript = [v.decode() for v in fp]
-     for v in dict_transcript:
-         if not v:
-             continue
-         v = v.strip().replace("\n","").split(" ")
-         dict_info[v[0]] = " ".join(v[1:])
+ def preprocess_speaker_general(speaker_dir, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool):
    metadata = []
-     if platform.system() == "Windows":
-         split = "\\"
-     else:
-         split = "/"
-     for wav_fpath in speaker_dir.glob("*.wav"):
-         name = str(wav_fpath).split(split)[-1]
-         key = name.split(".")[0]
-         words = dict_info.get(key)
+     wav_fpath_list = speaker_dir.glob("*.wav")
+     # Iterate over each wav
+     for wav_fpath in wav_fpath_list:
+         words = dict_info.get(wav_fpath.name.split(".")[0])
+         words = dict_info.get(wav_fpath.name) if not words else words # try with wav
        if not words:
-             print("no wordS")
            continue
-         sub_basename = "%s_%02d" % (name, 0)
+         sub_basename = "%s_%02d" % (wav_fpath.name, 0)
        wav, text = _split_on_silences_aidatatang_200zh(wav_fpath, words, hparams)
        metadata.append(_process_utterance(wav, text, out_dir, sub_basename,
                                           skip_existing, hparams))
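A closing editor's sketch (not part of the diff) of the per-utterance flow in `preprocess_speaker_general`: look the transcript up by the wav's stem, fall back to the full file name (as the `# try with wav` comment indicates), then convert the text to tone-numbered pinyin the way `_split_on_silences_aidatatang_200zh` does. The file name and transcript below are invented.

```python
# Sketch only: transcript lookup and pinyin conversion for one (invented) utterance.
from pathlib import Path
from pypinyin import Style, pinyin

dict_info = {"T0055G0013S0001": "欢迎 光临"}   # built from the transcript file
wav_fpath = Path("T0055G0013S0001.wav")

words = dict_info.get(wav_fpath.name.split(".")[0])            # key without the extension
words = dict_info.get(wav_fpath.name) if not words else words  # fall back to the full name
sub_basename = "%s_%02d" % (wav_fpath.name, 0)                 # "T0055G0013S0001.wav_00"

res = " ".join(v[0] for v in pinyin(words, style=Style.TONE3) if v[0].strip())
print(sub_basename, res)  # T0055G0013S0001.wav_00 huan1 ying2 guang1 lin2
```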