From 4d6da5d49bc6aab8f09c21131ed821886cc46cc5 Mon Sep 17 00:00:00 2001
From: babysor00
Date: Fri, 13 Aug 2021 00:41:58 +0800
Subject: [PATCH] Support new dataset SLR68

Try: python synthesizer_preprocess_audio.py ...\slr --dataset SLR68
---
 README-CN.md                      | 13 ++++++-----
 README.md                         |  6 ++---
 synthesizer/preprocess.py         | 30 +++++++++++++++++++------
 synthesizer/preprocess_speaker.py | 37 +++++++++++++------------------
 4 files changed, 48 insertions(+), 38 deletions(-)

diff --git a/README-CN.md b/README-CN.md
index 2afabb3..20f841e 100644
--- a/README-CN.md
+++ b/README-CN.md
@@ -7,7 +7,7 @@
 ### [English](README.md) | 中文

 ## 特性
-🌍 **中文** 支持普通话并使用数据集进行测试:adatatang_200zh
+🌍 **中文** 支持普通话并使用多种中文数据集进行测试:aidatatang_200zh, SLR68

 🤩 **PyTorch** 适用于 pytorch,已在 1.9.0 版本(最新于 2021 年 8 月)中测试,GPU Tesla T4 和 GTX 2060

@@ -33,6 +33,7 @@ https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Pretrained-models
 * 下载 adatatang_200zh 数据集并解压:确保您可以访问 *train* 文件夹中的所有 .wav
 * 使用音频和梅尔频谱图进行预处理:
 `python synthesizer_preprocess_audio.py <datasets_root>`
+可以传入参数 `--dataset {dataset}` 指定数据集,目前支持 aidatatang_200zh, SLR68
 * 预处理嵌入:
 `python synthesizer_preprocess_embeds.py <datasets_root>/SV2TTS/synthesizer`
@@ -48,8 +49,8 @@
 然后您可以尝试使用工具箱:
 `python demo_toolbox.py -d <datasets_root>`

-## TODO
-- 添加演示视频
-- 添加对更多数据集的支持
-- 上传预训练模型
-- 🙏 欢迎补充
\ No newline at end of file
+## TODO
+- [x] 添加演示视频
+- [x] 添加对更多数据集的支持
+- [ ] 上传预训练模型
+- [ ] 🙏 欢迎补充
\ No newline at end of file
diff --git a/README.md b/README.md
index 2a74b90..170b4eb 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@
 > English | [中文](README-CN.md)

 ## Features
-🌍 **Chinese** supported mandarin and tested with dataset: aidatatang_200zh
+🌍 **Chinese** supports Mandarin and is tested with multiple datasets: aidatatang_200zh, SLR68

 🤩 **PyTorch** worked for pytorch, tested in version of 1.9.0(latest in August 2021), with GPU Tesla T4 and GTX 2060

@@ -35,7 +35,7 @@ https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Pretrained-models
 * Download aidatatang_200zh dataset and unzip: make sure you can access all .wav in *train* folder
 * Preprocess with the audios and the mel spectrograms: `python synthesizer_preprocess_audio.py <datasets_root>`
-
+Pass `--dataset {dataset}` to select the dataset; aidatatang_200zh and SLR68 are supported
 * Preprocess the embeddings: `python synthesizer_preprocess_embeds.py <datasets_root>/SV2TTS/synthesizer`
@@ -56,6 +56,6 @@ or

 ## TODO
 - [x] Add demo video
-- [ ] Add support for more dataset
+- [x] Add support for more datasets
 - [ ] Upload pretrained model
 - 🙏 Welcome to add more
diff --git a/synthesizer/preprocess.py b/synthesizer/preprocess.py
index b40ef7f..17a1123 100644
--- a/synthesizer/preprocess.py
+++ b/synthesizer/preprocess.py
@@ -6,22 +6,28 @@
 from pathlib import Path
 from tqdm import tqdm
 import numpy as np
 from encoder import inference as encoder
-from synthesizer.preprocess_speaker import preprocess_speaker_aidatatang_200zh
+from synthesizer.preprocess_speaker import preprocess_speaker_general

 data_info = {
     "aidatatang_200zh": {
         "subfolders": ["corpus/train"],
-        "speak_func": preprocess_speaker_aidatatang_200zh
-    }
-    # TODO add more
+        "trans_filepath": "transcript/aidatatang_200_zh_transcript.txt",
+        "speak_func": preprocess_speaker_general
+    },
+    "SLR68": {
+        "subfolders": ["train"],
+        "trans_filepath": "train/TRANS.txt",
+        "speak_func": preprocess_speaker_general
+    },
 }


 def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
                        skip_existing: bool, hparams, no_alignments: bool, dataset: str):
+    dataset_info = data_info[dataset]
     # Gather the input directories
     dataset_root = datasets_root.joinpath(dataset)
-    input_dirs = [dataset_root.joinpath(subfolder.strip()) for subfolder in data_info[dataset]["subfolders"]]
+    input_dirs = [dataset_root.joinpath(subfolder.strip()) for subfolder in dataset_info["subfolders"]]
     print("\n    ".join(map(str, ["Using data from:"] + input_dirs)))
     assert all(input_dir.exists() for input_dir in input_dirs)
@@ -34,9 +40,19 @@ def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
     metadata_file = metadata_fpath.open("a" if skip_existing else "w", encoding="utf-8")

     # Preprocess the dataset
+    dict_info = {}
+    transcript_dirs = dataset_root.joinpath(dataset_info["trans_filepath"])
+    assert transcript_dirs.exists(), str(transcript_dirs) + " does not exist."
+    with open(transcript_dirs, "r", encoding="utf-8") as dict_transcript:
+        for v in dict_transcript:
+            if not v:
+                continue
+            v = v.strip().replace("\n", "").replace("\t", " ").split(" ")
+            dict_info[v[0]] = " ".join(v[1:])
+
     speaker_dirs = list(chain.from_iterable(input_dir.glob("*") for input_dir in input_dirs))
-    func = partial(data_info[dataset]["speak_func"], out_dir=out_dir, skip_existing=skip_existing,
-                   hparams=hparams, directory=dataset_root, no_alignments=no_alignments)
+    func = partial(dataset_info["speak_func"], out_dir=out_dir, skip_existing=skip_existing,
+                   hparams=hparams, dict_info=dict_info, no_alignments=no_alignments)
     job = Pool(n_processes).imap(func, speaker_dirs)
     for speaker_metadata in tqdm(job, dataset, len(speaker_dirs), unit="speakers"):
         for metadatum in speaker_metadata:
diff --git a/synthesizer/preprocess_speaker.py b/synthesizer/preprocess_speaker.py
index 291b554..ed566ad 100644
--- a/synthesizer/preprocess_speaker.py
+++ b/synthesizer/preprocess_speaker.py
@@ -1,4 +1,3 @@
-import platform
 import librosa
 import numpy as np

@@ -68,36 +67,30 @@ def _split_on_silences_aidatatang_200zh(wav_fpath, words, hparams):
     wav = librosa.effects.trim(wav, top_db= 40, frame_length=2048, hop_length=512)[0]
     if hparams.rescale:
         wav = wav / np.abs(wav).max() * hparams.rescaling_max
+    # Denoise; we may not need this step here.
+    if len(wav) > hparams.sample_rate*(0.3+0.1):
+        noise_wav = np.concatenate([wav[:int(hparams.sample_rate*0.15)],
+                                    wav[-int(hparams.sample_rate*0.15):]])
+        profile = logmmse.profile_noise(noise_wav, hparams.sample_rate)
+        wav = logmmse.denoise(wav, profile, eta=0)
+
     resp = pinyin(words, style=Style.TONE3)
     res = [v[0] for v in resp if v[0].strip()]
     res = " ".join(res)
     return wav, res

-
-def preprocess_speaker_aidatatang_200zh(speaker_dir, out_dir: Path, skip_existing: bool, hparams, directory, no_alignments: bool):
-    dict_info = {}
-    transcript_dirs = directory.joinpath("transcript/aidatatang_200_zh_transcript.txt")
-    with open(transcript_dirs,"rb") as fp:
-        dict_transcript = [v.decode() for v in fp]
-    for v in dict_transcript:
-        if not v:
-            continue
-        v = v.strip().replace("\n","").split(" ")
-        dict_info[v[0]] = " ".join(v[1:])
-
+def preprocess_speaker_general(speaker_dir, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool):
     metadata = []
-    if platform.system() == "Windows":
-        split = "\\"
-    else:
-        split = "/"
-    for wav_fpath in speaker_dir.glob("*.wav"):
-        name = str(wav_fpath).split(split)[-1]
-        key = name.split(".")[0]
-        words = dict_info.get(key)
+    wav_fpath_list = speaker_dir.glob("*.wav")
+    # Iterate over each wav file of this speaker
+    for wav_fpath in wav_fpath_list:
+        words = dict_info.get(wav_fpath.name.split(".")[0])
+        words = dict_info.get(wav_fpath.name) if not words else words  # fall back to the full file name
         if not words:
+            print("No transcript found for %s, skipping it." % wav_fpath.name)
             continue
-        sub_basename = "%s_%02d" % (name, 0)
+        sub_basename = "%s_%02d" % (wav_fpath.name, 0)
         wav, text = _split_on_silences_aidatatang_200zh(wav_fpath, words, hparams)
         metadata.append(_process_utterance(wav, text, out_dir, sub_basename, skip_existing, hparams))
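
Note, not part of the patch: a minimal sketch of the transcript lookup that preprocess_dataset now builds once and hands to the workers. load_transcript_dict and the command-line wrapper below are hypothetical names; the line format ("<key> <transcript...>", whitespace separated, keyed either by the bare utterance id or by the full .wav file name) is simply what the parsing loop above and the double lookup in preprocess_speaker_general assume.

    import sys
    from pathlib import Path

    def load_transcript_dict(trans_fpath: Path) -> dict:
        # Mirrors the loop added in preprocess_dataset: every non-empty line is
        # "<key> <transcript...>"; tabs are treated like spaces.
        dict_info = {}
        with open(trans_fpath, "r", encoding="utf-8") as f:
            for line in f:
                parts = line.strip().replace("\t", " ").split(" ")
                if len(parts) < 2:
                    continue
                dict_info[parts[0]] = " ".join(parts[1:])
        return dict_info

    if __name__ == "__main__":
        # Usage: python check_transcript.py <transcript-file> <utterance-key>
        table = load_transcript_dict(Path(sys.argv[1]))
        print(table.get(sys.argv[2], "<no transcript found>"))

Building dict_info once in preprocess_dataset and passing it to every worker through functools.partial also removes the old behaviour of preprocess_speaker_aidatatang_200zh, which re-read and re-parsed the whole transcript file for each speaker in the pool. Under the same assumptions, wiring up another corpus should only require a new data_info entry, for example (hypothetical name and paths):

    data_info["my_corpus"] = {
        "subfolders": ["wavs"],
        "trans_filepath": "transcript.txt",
        "speak_func": preprocess_speaker_general,
    }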