From bd0e47e76b4ea92a639dab0410fb0415b26b8550 Mon Sep 17 00:00:00 2001
From: babysor00
Date: Wed, 1 Sep 2021 23:44:40 +0800
Subject: [PATCH] Support new dataset "biaobei" (BZNSYP), a high-quality
 single-speaker Chinese corpus

---
 .vscode/launch.json               |  10 +++
 README-CN.md                      |   3 +-
 README.md                         |   3 +-
 pre.py                            |   7 +-
 synthesizer/preprocess.py         |  23 +++--
 synthesizer/preprocess_speaker.py | 141 ++----------------------------
 6 files changed, 44 insertions(+), 143 deletions(-)

diff --git a/.vscode/launch.json b/.vscode/launch.json
index f821057..914eaa8 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -4,6 +4,16 @@
     // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
     "version": "0.2.0",
     "configurations": [
+        {
+            "name": "Python: Syn Preprocess",
+            "type": "python",
+            "request": "launch",
+            "program": "pre.py",
+            "console": "integratedTerminal",
+            "args": [
+                "D:\\ttsdata\\BZNSYP", "-d", "BZNSYP"
+            ],
+        },
         {
             "name": "Python: Vocoder Preprocess",
             "type": "python",
diff --git a/README-CN.md b/README-CN.md
index 0921819..043567e 100644
--- a/README-CN.md
+++ b/README-CN.md
@@ -33,7 +33,8 @@
 * Download the dataset and unzip it: make sure you can access all audio files (e.g. .wav) in the *train* folder
 * Preprocess the audio and mel spectrograms: `python pre.py <datasets_root>`
-Pass --dataset `{dataset}` to choose among the supported datasets: adatatang_200zh, magicdata, aishell3
+
+Pass --dataset `{dataset}` to choose among the supported datasets: aidatatang_200zh, magicdata, aishell3, BZNSYP
 > If the downloaded `aidatatang_200zh` files are on drive D and the `train` folder path is `D:\data\aidatatang_200zh\corpus\train`, your `datasets_root` is `D:\data\`
 
 >If `the page file is too small to complete the operation` occurs, refer to this [article](https://blog.csdn.net/qq_17755303/article/details/112564030) and increase the virtual memory to 100G (102400); for example, if the files are on drive D, change drive D's virtual memory.
diff --git a/README.md b/README.md
index bcc0104..a155bb5 100644
--- a/README.md
+++ b/README.md
@@ -33,7 +33,8 @@
 * Download aidatatang_200zh or other dataset and unzip: make sure you can access all .wav in *train* folder
 * Preprocess with the audios and the mel spectrograms: `python pre.py <datasets_root>`
-Allow parameter `--dataset {dataset}` to support adatatang_200zh, magicdata, aishell3
+
+Pass `--dataset {dataset}` to choose among the supported datasets: aidatatang_200zh, magicdata, aishell3, BZNSYP
 >If it happens `the page file is too small to complete the operation`, please refer to this [video](https://www.youtube.com/watch?v=Oh6dga-Oy10&ab_channel=CodeProf) and change the virtual memory to 100G (102400), for example : When the file is placed in the D disk, the virtual memory of the D disk is changed.
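For anyone trying the patch, the new debug configuration above translates to this command line (the D:\ttsdata\BZNSYP path comes from the launch.json entry; substitute your own dataset root):

    python pre.py D:\ttsdata\BZNSYP -d BZNSYP

The short flag -d is an alias for --dataset, added in pre.py below.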
diff --git a/pre.py b/pre.py
index 26350a9..b54d587 100644
--- a/pre.py
+++ b/pre.py
@@ -12,7 +12,8 @@ import argparse
 recognized_datasets = [
     "aidatatang_200zh",
     "magicdata",
-    "aishell3"
+    "aishell3",
+    "BZNSYP"
 ]
 
 if __name__ == "__main__":
@@ -40,8 +41,8 @@ if __name__ == "__main__":
     parser.add_argument("--no_alignments", action="store_true", help=\
         "Use this option when dataset does not include alignments\
         (these are used to split long audio files into sub-utterances.)")
-    parser.add_argument("--dataset", type=str, default="aidatatang_200zh", help=\
-        "Name of the dataset to process, allowing values: magicdata, aidatatang_200zh, aishell3.")
+    parser.add_argument("-d", "--dataset", type=str, default="aidatatang_200zh", help=\
+        "Name of the dataset to process; allowed values: magicdata, aidatatang_200zh, aishell3, BZNSYP.")
     parser.add_argument("-e", "--encoder_model_fpath", type=Path, default="encoder/saved_models/pretrained.pt", help=\
         "Path your trained encoder model.")
     args = parser.parse_args()
diff --git a/synthesizer/preprocess.py b/synthesizer/preprocess.py
index 344a74d..b460568 100644
--- a/synthesizer/preprocess.py
+++ b/synthesizer/preprocess.py
@@ -6,7 +6,8 @@ from pathlib import Path
 from tqdm import tqdm
 import numpy as np
 from encoder import inference as encoder
-from synthesizer.preprocess_speaker import preprocess_speaker_general
+from synthesizer.preprocess_speaker import preprocess_speaker_general, preprocess_speaker_bznsyp
+from synthesizer.preprocess_transcript import preprocess_transcript_bznsyp
 
 data_info = {
     "aidatatang_200zh": {
@@ -24,6 +25,12 @@ data_info = {
         "trans_filepath": "train/content.txt",
         "speak_func": preprocess_speaker_general
     },
+    "BZNSYP": {
+        "subfolders": ["Wave"],
+        "trans_filepath": "ProsodyLabeling/000001-010000.txt",
+        "speak_func": preprocess_speaker_bznsyp,
+        "transcript_func": preprocess_transcript_bznsyp,
+    },
 }
 
 def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
@@ -49,11 +56,15 @@ def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
     transcript_dirs = dataset_root.joinpath(dataset_info["trans_filepath"])
     assert transcript_dirs.exists(), str(transcript_dirs)+" not exist."
     with open(transcript_dirs, "r", encoding="utf-8") as dict_transcript:
-        for v in dict_transcript:
-            if not v:
-                continue
-            v = v.strip().replace("\n","").replace("\t"," ").split(" ")
-            dict_info[v[0]] = " ".join(v[1:])
+        # Parse the transcript with a dataset-specific function when one is provided
+        if "transcript_func" in dataset_info:
+            dataset_info["transcript_func"](dict_info, dict_transcript)
+        else:
+            for v in dict_transcript:
+                if not v:
+                    continue
+                v = v.strip().replace("\n","").replace("\t"," ").split(" ")
+                dict_info[v[0]] = " ".join(v[1:])
 
     speaker_dirs = list(chain.from_iterable(input_dir.glob("*") for input_dir in input_dirs))
     func = partial(dataset_info["speak_func"], out_dir=out_dir, skip_existing=skip_existing,
diff --git a/synthesizer/preprocess_speaker.py b/synthesizer/preprocess_speaker.py
index ed566ad..a7728af 100644
--- a/synthesizer/preprocess_speaker.py
+++ b/synthesizer/preprocess_speaker.py
@@ -81,9 +81,17 @@ def _split_on_silences_aidatatang_200zh(wav_fpath, words, hparams):
     return wav, res
 
 def preprocess_speaker_general(speaker_dir, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool):
-    metadata = []
     wav_fpath_list = speaker_dir.glob("*.wav")
+    return preprocess_speaker_internal(wav_fpath_list, out_dir, skip_existing, hparams, dict_info, no_alignments)
+
+def preprocess_speaker_bznsyp(speaker_dir, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool):
+    # BZNSYP is single-speaker, so each "speaker_dir" the caller yields is itself a wav file
+    wav_fpath_list = [speaker_dir]
+    return preprocess_speaker_internal(wav_fpath_list, out_dir, skip_existing, hparams, dict_info, no_alignments)
+
+def preprocess_speaker_internal(wav_fpath_list, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool):
     # Iterate over each wav
+    metadata = []
     for wav_fpath in wav_fpath_list:
         words = dict_info.get(wav_fpath.name.split(".")[0])
         words = dict_info.get(wav_fpath.name) if not words else words # try with wav
@@ -94,134 +101,4 @@ def preprocess_speaker_general(speaker_dir, out_dir: Path, skip_existing: bool,
             wav, text = _split_on_silences_aidatatang_200zh(wav_fpath, words, hparams)
             metadata.append(_process_utterance(wav, text, out_dir, sub_basename, skip_existing, hparams))
 
-    return [m for m in metadata if m is not None]
-
-def preprocess_speaker(speaker_dir, out_dir: Path, skip_existing: bool, hparams, no_alignments: bool):
-    metadata = []
-    for book_dir in speaker_dir.glob("*"):
-        if no_alignments:
-            # Gather the utterance audios and texts
-            # LibriTTS uses .wav but we will include extensions for compatibility with other datasets
-            extensions = ["*.wav", "*.flac", "*.mp3"]
-            for extension in extensions:
-                wav_fpaths = book_dir.glob(extension)
-
-                for wav_fpath in wav_fpaths:
-                    # Load the audio waveform
-                    wav, _ = librosa.load(str(wav_fpath), hparams.sample_rate)
-                    if hparams.rescale:
-                        wav = wav / np.abs(wav).max() * hparams.rescaling_max
-
-                    # Get the corresponding text
-                    # Check for .txt (for compatibility with other datasets)
-                    text_fpath = wav_fpath.with_suffix(".txt")
-                    if not text_fpath.exists():
-                        # Check for .normalized.txt (LibriTTS)
-                        text_fpath = wav_fpath.with_suffix(".normalized.txt")
-                        assert text_fpath.exists()
-                    with text_fpath.open("r") as text_file:
-                        text = "".join([line for line in text_file])
-                        text = text.replace("\"", "")
-                        text = text.strip()
-
-                    # Process the utterance
-                    metadata.append(_process_utterance(wav, text, out_dir, str(wav_fpath.with_suffix("").name),
-                                                       skip_existing, hparams))
-        else:
-            # Process alignment file (LibriSpeech support)
-            # Gather the utterance audios and texts
-            try:
-                alignments_fpath = next(book_dir.glob("*.alignment.txt"))
-                with alignments_fpath.open("r") as alignments_file:
-                    alignments = [line.rstrip().split(" ") for line in alignments_file]
-            except StopIteration:
-                # A few alignment files will be missing
-                continue
-
-            # Iterate over each entry in the alignments file
-            for wav_fname, words, end_times in alignments:
-                wav_fpath = book_dir.joinpath(wav_fname + ".flac")
-                assert wav_fpath.exists()
-                words = words.replace("\"", "").split(",")
-                end_times = list(map(float, end_times.replace("\"", "").split(",")))
-
-                # Process each sub-utterance
-                wavs, texts = _split_on_silences(wav_fpath, words, end_times, hparams)
-                for i, (wav, text) in enumerate(zip(wavs, texts)):
-                    sub_basename = "%s_%02d" % (wav_fname, i)
-                    metadata.append(_process_utterance(wav, text, out_dir, sub_basename,
-                                                       skip_existing, hparams))
-
-    return [m for m in metadata if m is not None]
-
-# TODO: use original split func
-def _split_on_silences(wav_fpath, words, end_times, hparams):
-    # Load the audio waveform
-    wav, _ = librosa.load(str(wav_fpath), hparams.sample_rate)
-    if hparams.rescale:
-        wav = wav / np.abs(wav).max() * hparams.rescaling_max
-
-    words = np.array(words)
-    start_times = np.array([0.0] + end_times[:-1])
-    end_times = np.array(end_times)
-    assert len(words) == len(end_times) == len(start_times)
-    assert words[0] == "" and words[-1] == ""
-
-    # Find pauses that are too long
-    mask = (words == "") & (end_times - start_times >= hparams.silence_min_duration_split)
-    mask[0] = mask[-1] = True
-    breaks = np.where(mask)[0]
-
-    # Profile the noise from the silences and perform noise reduction on the waveform
-    silence_times = [[start_times[i], end_times[i]] for i in breaks]
-    silence_times = (np.array(silence_times) * hparams.sample_rate).astype(np.int)
-    noisy_wav = np.concatenate([wav[stime[0]:stime[1]] for stime in silence_times])
-    if len(noisy_wav) > hparams.sample_rate * 0.02:
-        profile = logmmse.profile_noise(noisy_wav, hparams.sample_rate)
-        wav = logmmse.denoise(wav, profile, eta=0)
-
-    # Re-attach segments that are too short
-    segments = list(zip(breaks[:-1], breaks[1:]))
-    segment_durations = [start_times[end] - end_times[start] for start, end in segments]
-    i = 0
-    while i < len(segments) and len(segments) > 1:
-        if segment_durations[i] < hparams.utterance_min_duration:
-            # See if the segment can be re-attached with the right or the left segment
-            left_duration = float("inf") if i == 0 else segment_durations[i - 1]
-            right_duration = float("inf") if i == len(segments) - 1 else segment_durations[i + 1]
-            joined_duration = segment_durations[i] + min(left_duration, right_duration)
-
-            # Do not re-attach if it causes the joined utterance to be too long
-            if joined_duration > hparams.hop_size * hparams.max_mel_frames / hparams.sample_rate:
-                i += 1
-                continue
-
-            # Re-attach the segment with the neighbour of shortest duration
-            j = i - 1 if left_duration <= right_duration else i
-            segments[j] = (segments[j][0], segments[j + 1][1])
-            segment_durations[j] = joined_duration
-            del segments[j + 1], segment_durations[j + 1]
-        else:
-            i += 1
-
-    # Split the utterance
-    segment_times = [[end_times[start], start_times[end]] for start, end in segments]
-    segment_times = (np.array(segment_times) * hparams.sample_rate).astype(np.int)
-    wavs = [wav[segment_time[0]:segment_time[1]] for segment_time in segment_times]
-    texts = [" ".join(words[start + 1:end]).replace("  ", " ") for start, end in segments]
-
-    # # DEBUG: play the audio segments (run with -n=1)
-    # import sounddevice as sd
-    # if len(wavs) > 1:
-    #     print("This sentence was split in %d segments:" % len(wavs))
-    # else:
-    #     print("There are no silences long enough for this sentence to be split:")
-    # for wav, text in zip(wavs, texts):
-    #     # Pad the waveform with 1 second of silence because sounddevice tends to cut them early
-    #     # when playing them. You shouldn't need to do that in your parsers.
-    #     wav = np.concatenate((wav, [0] * 16000))
-    #     print("\t%s" % text)
-    #     sd.play(wav, 16000, blocking=True)
-    #     print("")
-
-    return wavs, texts
\ No newline at end of file
+    return [m for m in metadata if m is not None]
\ No newline at end of file
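Note that the diffstat lists six files, yet synthesizer/preprocess.py now imports preprocess_transcript_bznsyp from synthesizer/preprocess_transcript.py, which does not appear in this patch; that file has to ship alongside it. As a reading aid only, here is a minimal, hypothetical sketch of such a parser, assuming the stock BZNSYP ProsodyLabeling format (an unindented "id <whitespace> text" line carrying #1-#4 prosody marks, followed by an indented pinyin line per utterance); the actual file in the PR may differ:

    import re

    def preprocess_transcript_bznsyp(dict_info, dict_transcript):
        # dict_transcript is the open transcript file handle passed in by preprocess_dataset
        for line in dict_transcript:
            # Pinyin lines in ProsodyLabeling/000001-010000.txt are indented; skip them
            if not line.strip() or line[0] in (" ", "\t"):
                continue
            parts = line.strip().split(None, 1)  # split "000001" from the sentence text
            if len(parts) != 2:
                continue
            utt_id, text = parts
            # Drop the #1-#4 prosodic-break marks, keeping only the characters
            dict_info[utt_id] = re.sub(r"#\d", "", text)

A design note visible in the diff itself: because BZNSYP is single-speaker, preprocess.py's input_dir.glob("*") over Wave/ yields wav files rather than speaker folders, which is why preprocess_speaker_bznsyp wraps its argument in a one-element list and defers to preprocess_speaker_internal.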
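Continuing the sketch above, a quick check of how those keys are consumed: preprocess_speaker_internal tries the bare utterance id first and then the full filename, so the parser must key dict_info by the id alone (the sample transcript data here is hypothetical):

    dict_info = {}
    sample = ["000001\t这是#1一个#2测试#4。\n", "\tzhe4 shi4 yi2 ge4 ce4 shi4\n"]
    preprocess_transcript_bznsyp(dict_info, sample)
    assert dict_info == {"000001": "这是一个测试。"}

    # Mirrors the lookup in preprocess_speaker_internal:
    wav_name = "000001.wav"
    words = dict_info.get(wav_name.split(".")[0]) or dict_info.get(wav_name)
    assert words == "这是一个测试。"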