Support new dataset "biaobei" (BZNSYP), a high-quality single-speaker Chinese corpus

2024-03-22 13:11:31 +08:00 · 2021-09-01 23:44:40 +08:00 · 2021-09-01 23:44:40 +08:00 · bd0e47e76b
commit bd0e47e76b
parent 024d88ae96
6 changed files with 44 additions and 143 deletions
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@ -4,6 +4,16 @@
    // 欲了解更多信息，请访问: https://go.microsoft.com/fwlink/?linkid=830387
    "version": "0.2.0",
    "configurations": [
+        {
+            "name": "Python: Syn Preprocess",
+            "type": "python",
+            "request": "launch",
+            "program": "pre.py",
+            "console": "integratedTerminal",
+            "args": [
+                "D:\\ttsdata\\BZNSYP", "-d", "BZNSYP"
+            ],
+        },
        {
            "name": "Python: Vocoder Preprocess",
            "type": "python",
--- a/README-CN.md
+++ b/README-CN.md
@ -33,7 +33,8 @@
 * 下载 数据集并解压：确保您可以访问 *train* 文件夹中的所有音频文件（如.wav）
 * 进行音频和梅尔频谱图预处理：
 `python pre.py <datasets_root>`
-可以传入参数 --dataset `{dataset}` 支持 adatatang_200zh, magicdata, aishell3
+
+可以传入参数 --dataset `{dataset}` 支持 aidatatang_200zh, magicdata, aishell3, BZNSYP
 > 假如你下载的 `aidatatang_200zh`文件放在D盘，`train`文件路径为 `D:\data\aidatatang_200zh\corpus\train` , 你的`datasets_root`就是 `D:\data\`

 >假如發生 `頁面文件太小，無法完成操作`，請參考這篇[文章](https://blog.csdn.net/qq_17755303/article/details/112564030)，將虛擬內存更改為100G(102400)，例如:档案放置D槽就更改D槽的虚拟内存
--- a/README.md
+++ b/README.md
@ -33,7 +33,8 @@
 * Download aidatatang_200zh or other dataset and unzip: make sure you can access all .wav in *train* folder
 * Preprocess with the audios and the mel spectrograms:
 `python pre.py <datasets_root>`
-Allow parameter `--dataset {dataset}` to support adatatang_200zh, magicdata, aishell3
+
+Use the `--dataset {dataset}` parameter to select the dataset; supported values: aidatatang_200zh, magicdata, aishell3, BZNSYP

 >If it happens `the page file is too small to complete the operation`, please refer to this [video](https://www.youtube.com/watch?v=Oh6dga-Oy10&ab_channel=CodeProf) and change the virtual memory to 100G (102400), for example : When the file is placed in the D disk, the virtual memory of the D disk is changed.

--- a/pre.py
+++ b/pre.py
@ -12,7 +12,8 @@ import argparse
 recognized_datasets = [
    "aidatatang_200zh",
    "magicdata",
-    "aishell3"
+    "aishell3",
+    "BZNSYP"
 ]

 if __name__ == "__main__":
@ -40,8 +41,8 @@ if __name__ == "__main__":
    parser.add_argument("--no_alignments", action="store_true", help=\
        "Use this option when dataset does not include alignments\
        (these are used to split long audio files into sub-utterances.)")
-    parser.add_argument("--dataset", type=str, default="aidatatang_200zh", help=\
-        "Name of the dataset to process, allowing values: magicdata, aidatatang_200zh, aishell3.")
+    parser.add_argument("-d","--dataset", type=str, default="aidatatang_200zh", help=\
+        "Name of the dataset to process, allowing values: magicdata, aidatatang_200zh, aishell3, BZNSYP.")
    parser.add_argument("-e", "--encoder_model_fpath", type=Path, default="encoder/saved_models/pretrained.pt", help=\
        "Path your trained encoder model.")
    args = parser.parse_args()
--- a/synthesizer/preprocess.py
+++ b/synthesizer/preprocess.py
@ -6,7 +6,8 @@ from pathlib import Path
 from tqdm import tqdm
 import numpy as np
 from encoder import inference as encoder
-from synthesizer.preprocess_speaker import preprocess_speaker_general
+from synthesizer.preprocess_speaker import preprocess_speaker_general, preprocess_speaker_bznsyp
+from synthesizer.preprocess_transcript import preprocess_transcript_bznsyp

 data_info = {
    "aidatatang_200zh": {
@ -24,6 +25,12 @@ data_info = {
        "trans_filepath": "train/content.txt",
        "speak_func": preprocess_speaker_general
    },
+    "BZNSYP":{
+        "subfolders": ["Wave"],
+        "trans_filepath": "ProsodyLabeling/000001-010000.txt",
+        "speak_func": preprocess_speaker_bznsyp,
+        "transcript_func": preprocess_transcript_bznsyp,
+    },
 }

 def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
@ -49,6 +56,10 @@ def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
    transcript_dirs = dataset_root.joinpath(dataset_info["trans_filepath"])
    assert transcript_dirs.exists(), str(transcript_dirs)+" not exist."
    with open(transcript_dirs, "r", encoding="utf-8") as dict_transcript:
+        # Use the dataset-specific transcript parser when the dataset entry defines one
+        if "transcript_func" in dataset_info:
+            dataset_info["transcript_func"](dict_info, dict_transcript)
+        else:
            for v in dict_transcript:
                if not v:
                    continue
--- a/synthesizer/preprocess_speaker.py
+++ b/synthesizer/preprocess_speaker.py
@ -81,9 +81,16 @@ def _split_on_silences_aidatatang_200zh(wav_fpath, words, hparams):
    return wav, res

 def preprocess_speaker_general(speaker_dir, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool):
-    metadata = []
    wav_fpath_list = speaker_dir.glob("*.wav")
+    return preprocess_speaker_internal(wav_fpath_list, out_dir, skip_existing, hparams, dict_info, no_alignments)
+
+def preprocess_speaker_bznsyp(speaker_dir, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool):
+    wav_fpath_list = [speaker_dir]
+    return preprocess_speaker_internal(wav_fpath_list, out_dir, skip_existing, hparams, dict_info, no_alignments)
+
+def preprocess_speaker_internal(wav_fpath_list, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool):
    # Iterate over each wav
+    metadata = []
    for wav_fpath in wav_fpath_list:
        words = dict_info.get(wav_fpath.name.split(".")[0])
        words = dict_info.get(wav_fpath.name) if not words else words # try with wav 
@ -95,133 +102,3 @@ def preprocess_speaker_general(speaker_dir, out_dir: Path, skip_existing: bool,
        metadata.append(_process_utterance(wav, text, out_dir, sub_basename, 
                                              skip_existing, hparams))
    return [m for m in metadata if m is not None]
-
-def preprocess_speaker(speaker_dir, out_dir: Path, skip_existing: bool, hparams, no_alignments: bool):
-    metadata = []
-    for book_dir in speaker_dir.glob("*"):
-        if no_alignments:
-            # Gather the utterance audios and texts
-            # LibriTTS uses .wav but we will include extensions for compatibility with other datasets
-            extensions = ["*.wav", "*.flac", "*.mp3"]
-            for extension in extensions:
-                wav_fpaths = book_dir.glob(extension)
-
-                for wav_fpath in wav_fpaths:
-                    # Load the audio waveform
-                    wav, _ = librosa.load(str(wav_fpath), hparams.sample_rate)
-                    if hparams.rescale:
-                        wav = wav / np.abs(wav).max() * hparams.rescaling_max
-
-                    # Get the corresponding text
-                    # Check for .txt (for compatibility with other datasets)
-                    text_fpath = wav_fpath.with_suffix(".txt")
-                    if not text_fpath.exists():
-                        # Check for .normalized.txt (LibriTTS)
-                        text_fpath = wav_fpath.with_suffix(".normalized.txt")
-                        assert text_fpath.exists()
-                    with text_fpath.open("r") as text_file:
-                        text = "".join([line for line in text_file])
-                        text = text.replace("\"", "")
-                        text = text.strip()
-
-                    # Process the utterance
-                    metadata.append(_process_utterance(wav, text, out_dir, str(wav_fpath.with_suffix("").name),
-                                                      skip_existing, hparams))
-        else:
-            # Process alignment file (LibriSpeech support)
-            # Gather the utterance audios and texts
-            try:
-                alignments_fpath = next(book_dir.glob("*.alignment.txt"))
-                with alignments_fpath.open("r") as alignments_file:
-                    alignments = [line.rstrip().split(" ") for line in alignments_file]
-            except StopIteration:
-                # A few alignment files will be missing
-                continue
-
-            # Iterate over each entry in the alignments file
-            for wav_fname, words, end_times in alignments:
-                wav_fpath = book_dir.joinpath(wav_fname + ".flac")
-                assert wav_fpath.exists()
-                words = words.replace("\"", "").split(",")
-                end_times = list(map(float, end_times.replace("\"", "").split(",")))
-
-                # Process each sub-utterance
-                wavs, texts = _split_on_silences(wav_fpath, words, end_times, hparams)
-                for i, (wav, text) in enumerate(zip(wavs, texts)):
-                    sub_basename = "%s_%02d" % (wav_fname, i)
-                    metadata.append(_process_utterance(wav, text, out_dir, sub_basename,
-                                                      skip_existing, hparams))
-
-    return [m for m in metadata if m is not None]
-
-# TODO: use original split func
-def _split_on_silences(wav_fpath, words, end_times, hparams):
-    # Load the audio waveform
-    wav, _ = librosa.load(str(wav_fpath), hparams.sample_rate)
-    if hparams.rescale:
-        wav = wav / np.abs(wav).max() * hparams.rescaling_max
-    
-    words = np.array(words)
-    start_times = np.array([0.0] + end_times[:-1])
-    end_times = np.array(end_times)
-    assert len(words) == len(end_times) == len(start_times)
-    assert words[0] == "" and words[-1] == ""
-    
-    # Find pauses that are too long
-    mask = (words == "") & (end_times - start_times >= hparams.silence_min_duration_split)
-    mask[0] = mask[-1] = True
-    breaks = np.where(mask)[0]
-
-    # Profile the noise from the silences and perform noise reduction on the waveform
-    silence_times = [[start_times[i], end_times[i]] for i in breaks]
-    silence_times = (np.array(silence_times) * hparams.sample_rate).astype(np.int)
-    noisy_wav = np.concatenate([wav[stime[0]:stime[1]] for stime in silence_times])
-    if len(noisy_wav) > hparams.sample_rate * 0.02:
-        profile = logmmse.profile_noise(noisy_wav, hparams.sample_rate)
-        wav = logmmse.denoise(wav, profile, eta=0)
-    
-    # Re-attach segments that are too short
-    segments = list(zip(breaks[:-1], breaks[1:]))
-    segment_durations = [start_times[end] - end_times[start] for start, end in segments]
-    i = 0
-    while i < len(segments) and len(segments) > 1:
-        if segment_durations[i] < hparams.utterance_min_duration:
-            # See if the segment can be re-attached with the right or the left segment
-            left_duration = float("inf") if i == 0 else segment_durations[i - 1]
-            right_duration = float("inf") if i == len(segments) - 1 else segment_durations[i + 1]
-            joined_duration = segment_durations[i] + min(left_duration, right_duration)
-
-            # Do not re-attach if it causes the joined utterance to be too long
-            if joined_duration > hparams.hop_size * hparams.max_mel_frames / hparams.sample_rate:
-                i += 1
-                continue
-
-            # Re-attach the segment with the neighbour of shortest duration
-            j = i - 1 if left_duration <= right_duration else i
-            segments[j] = (segments[j][0], segments[j + 1][1])
-            segment_durations[j] = joined_duration
-            del segments[j + 1], segment_durations[j + 1]
-        else:
-            i += 1
-    
-    # Split the utterance
-    segment_times = [[end_times[start], start_times[end]] for start, end in segments]
-    segment_times = (np.array(segment_times) * hparams.sample_rate).astype(np.int)
-    wavs = [wav[segment_time[0]:segment_time[1]] for segment_time in segment_times]
-    texts = [" ".join(words[start + 1:end]).replace("  ", " ") for start, end in segments]
-    
-    # # DEBUG: play the audio segments (run with -n=1)
-    # import sounddevice as sd
-    # if len(wavs) > 1:
-    #     print("This sentence was split in %d segments:" % len(wavs))
-    # else:
-    #     print("There are no silences long enough for this sentence to be split:")
-    # for wav, text in zip(wavs, texts):
-    #     # Pad the waveform with 1 second of silence because sounddevice tends to cut them early
-    #     # when playing them. You shouldn't need to do that in your parsers.
-    #     wav = np.concatenate((wav, [0] * 16000))
-    #     print("\t%s" % text)
-    #     sd.play(wav, 16000, blocking=True)
-    # print("")
-    
-    return wavs, texts