Merge pull request #7 from babysor/newdataset

Supporting new dataset SLR68 ! try python synthesizer_preprocess_audi…
Nemo 2021-08-13 14:06:33 +08:00 committed by GitHub
commit 96e9d74966
4 changed files with 48 additions and 38 deletions

README-CN.md

@@ -7,7 +7,7 @@
 ### [English](README.md) | 中文
 ## Features
-🌍 **Chinese**: supports Mandarin, tested with the aidatatang_200zh dataset
+🌍 **Chinese**: supports Mandarin, tested with multiple Chinese datasets: aidatatang_200zh, SLR68
 🤩 **PyTorch**: works with PyTorch, tested on version 1.9.0 (latest as of August 2021), with GPU Tesla T4 and GTX 2060
@@ -33,6 +33,7 @@ https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Pretrained-models
 * Download the aidatatang_200zh dataset and unzip it: make sure you can access all the .wav files in the *train* folder
 * Preprocess the audio and mel spectrograms:
   `python synthesizer_preprocess_audio.py <datasets_root>`
+  You can pass the `--dataset {dataset}` argument; aidatatang_200zh and SLR68 are supported
 * Preprocess the embeddings:
   `python synthesizer_preprocess_embeds.py <datasets_root>/SV2TTS/synthesizer`
@@ -48,8 +49,8 @@ https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Pretrained-models
 Then you can try the toolbox:
 `python demo_toolbox.py -d <datasets_root>`
-TODO
-- Add demo video
-- Add support for more datasets
-- Upload pretrained model
-- 🙏 Contributions welcome
+## TODO
+- [X] Add demo video
+- [X] Add support for more datasets
+- [ ] Upload pretrained model
+- [ ] 🙏 Contributions welcome

README.md

@@ -6,7 +6,7 @@
 > English | [中文](README-CN.md)
 ## Features
-🌍 **Chinese** supported mandarin and tested with dataset: aidatatang_200zh
+🌍 **Chinese** supported mandarin and tested with multiple datasets: aidatatang_200zh, SLR68
 🤩 **PyTorch** worked for pytorch, tested in version of 1.9.0(latest in August 2021), with GPU Tesla T4 and GTX 2060
@@ -35,7 +35,7 @@ https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Pretrained-models
 * Download aidatatang_200zh dataset and unzip: make sure you can access all .wav in *train* folder
 * Preprocess with the audios and the mel spectrograms:
   `python synthesizer_preprocess_audio.py <datasets_root>`
+  Allow parameter `--dataset {dataset}` to support aidatatang_200zh, SLR68
 * Preprocess the embeddings:
   `python synthesizer_preprocess_embeds.py <datasets_root>/SV2TTS/synthesizer`
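With the new flag, a typical invocation for the added dataset would look like `python synthesizer_preprocess_audio.py <datasets_root> --dataset SLR68`. The argument order shown here is illustrative; what matters is that the value passed to `--dataset` matches a key registered in `data_info` (see the preprocess.py changes below).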
@@ -56,6 +56,6 @@ or
 ## TODO
 - [x] Add demo video
-- [ ] Add support for more dataset
+- [X] Add support for more dataset
 - [ ] Upload pretrained model
 - 🙏 Welcome to add more

synthesizer/preprocess.py

@@ -6,22 +6,28 @@ from pathlib import Path
 from tqdm import tqdm
 import numpy as np
 from encoder import inference as encoder
-from synthesizer.preprocess_speaker import preprocess_speaker_aidatatang_200zh
+from synthesizer.preprocess_speaker import preprocess_speaker_general

 data_info = {
     "aidatatang_200zh": {
         "subfolders": ["corpus/train"],
-        "speak_func": preprocess_speaker_aidatatang_200zh
-    }
-    # TODO add more
+        "trans_filepath": "transcript/aidatatang_200_zh_transcript.txt",
+        "speak_func": preprocess_speaker_general
+    },
+    "SLR68": {
+        "subfolders": ["train"],
+        "trans_filepath": "train/TRANS.txt",
+        "speak_func": preprocess_speaker_general
+    },
 }

 def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
                        skip_existing: bool, hparams, no_alignments: bool,
                        dataset: str):
+    dataset_info = data_info[dataset]
     # Gather the input directories
     dataset_root = datasets_root.joinpath(dataset)
-    input_dirs = [dataset_root.joinpath(subfolder.strip()) for subfolder in data_info[dataset]["subfolders"]]
+    input_dirs = [dataset_root.joinpath(subfolder.strip()) for subfolder in dataset_info["subfolders"]]
     print("\n ".join(map(str, ["Using data from:"] + input_dirs)))
     assert all(input_dir.exists() for input_dir in input_dirs)
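The `data_info` dictionary acts as a small registry: each dataset maps its speaker subfolders, its transcript file, and the per-speaker preprocessing function. A minimal sketch of how a further dataset could be plugged in, assuming a hypothetical corpus named `my_corpus` with a `train/` folder and a `train/TRANS.txt` transcript (both names are illustrative, not part of this PR):

```python
# Hypothetical registry entry -- "my_corpus" and its paths are examples only.
data_info["my_corpus"] = {
    "subfolders": ["train"],                   # folders containing one subfolder per speaker
    "trans_filepath": "train/TRANS.txt",       # transcript path, resolved against <datasets_root>/my_corpus
    "speak_func": preprocess_speaker_general,  # per-speaker preprocessing entry point
}
```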
@@ -34,9 +40,19 @@ def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
     metadata_file = metadata_fpath.open("a" if skip_existing else "w", encoding="utf-8")

     # Preprocess the dataset
+    dict_info = {}
+    transcript_dirs = dataset_root.joinpath(dataset_info["trans_filepath"])
+    assert transcript_dirs.exists(), str(transcript_dirs) + " not exist."
+    with open(transcript_dirs, "r", encoding="utf-8") as dict_transcript:
+        for v in dict_transcript:
+            if not v:
+                continue
+            v = v.strip().replace("\n", "").replace("\t", " ").split(" ")
+            dict_info[v[0]] = " ".join(v[1:])
     speaker_dirs = list(chain.from_iterable(input_dir.glob("*") for input_dir in input_dirs))
-    func = partial(data_info[dataset]["speak_func"], out_dir=out_dir, skip_existing=skip_existing,
-                   hparams=hparams, directory=dataset_root, no_alignments=no_alignments)
+    func = partial(dataset_info["speak_func"], out_dir=out_dir, skip_existing=skip_existing,
+                   hparams=hparams, dict_info=dict_info, no_alignments=no_alignments)
     job = Pool(n_processes).imap(func, speaker_dirs)
     for speaker_metadata in tqdm(job, dataset, len(speaker_dirs), unit="speakers"):
         for metadatum in speaker_metadata:
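The transcript parsing added above assumes each line of the transcript file is `<utterance id> <text ...>`, separated by spaces or tabs, and it is now done once per dataset rather than once per speaker. A self-contained sketch of that lookup-building step (the function name and the sample path are hypothetical):

```python
from pathlib import Path

def load_transcript(trans_fpath: Path) -> dict:
    """Build {utterance id: text} from a whitespace-separated transcript file."""
    dict_info = {}
    with open(trans_fpath, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.strip().replace("\t", " ").split(" ")
            if len(parts) < 2:  # skip empty or malformed lines
                continue
            dict_info[parts[0]] = " ".join(parts[1:])
    return dict_info

# Hypothetical usage (path and id format are illustrative):
# lookup = load_transcript(Path("<datasets_root>/SLR68/train/TRANS.txt"))
```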

synthesizer/preprocess_speaker.py

@@ -1,4 +1,3 @@
-import platform
 import librosa
 import numpy as np
@@ -68,36 +67,30 @@ def _split_on_silences_aidatatang_200zh(wav_fpath, words, hparams)
     wav = librosa.effects.trim(wav, top_db= 40, frame_length=2048, hop_length=512)[0]
     if hparams.rescale:
         wav = wav / np.abs(wav).max() * hparams.rescaling_max
+    # denoise, we may not need it here.
+    if len(wav) > hparams.sample_rate*(0.3+0.1):
+        noise_wav = np.concatenate([wav[:int(hparams.sample_rate*0.15)],
+                                    wav[-int(hparams.sample_rate*0.15):]])
+        profile = logmmse.profile_noise(noise_wav, hparams.sample_rate)
+        wav = logmmse.denoise(wav, profile, eta=0)
     resp = pinyin(words, style=Style.TONE3)
     res = [v[0] for v in resp if v[0].strip()]
     res = " ".join(res)
     return wav, res

-def preprocess_speaker_aidatatang_200zh(speaker_dir, out_dir: Path, skip_existing: bool, hparams, directory, no_alignments: bool):
-    dict_info = {}
-    transcript_dirs = directory.joinpath("transcript/aidatatang_200_zh_transcript.txt")
-    with open(transcript_dirs,"rb") as fp:
-        dict_transcript = [v.decode() for v in fp]
-    for v in dict_transcript:
-        if not v:
-            continue
-        v = v.strip().replace("\n","").split(" ")
-        dict_info[v[0]] = " ".join(v[1:])
+def preprocess_speaker_general(speaker_dir, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool):
     metadata = []
-    if platform.system() == "Windows":
-        split = "\\"
-    else:
-        split = "/"
-    for wav_fpath in speaker_dir.glob("*.wav"):
-        name = str(wav_fpath).split(split)[-1]
-        key = name.split(".")[0]
-        words = dict_info.get(key)
+    wav_fpath_list = speaker_dir.glob("*.wav")
+    # Iterate over each wav
+    for wav_fpath in wav_fpath_list:
+        words = dict_info.get(wav_fpath.name.split(".")[0])
+        words = dict_info.get(wav_fpath.name) if not words else words  # try with wav
         if not words:
-            print("no wordS")
             continue
-        sub_basename = "%s_%02d" % (name, 0)
+        sub_basename = "%s_%02d" % (wav_fpath.name, 0)
         wav, text = _split_on_silences_aidatatang_200zh(wav_fpath, words, hparams)
         metadata.append(_process_utterance(wav, text, out_dir, sub_basename,
                                            skip_existing, hparams))
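`_split_on_silences_aidatatang_200zh` converts the Mandarin transcript to numbered-tone pinyin via pypinyin's `Style.TONE3` before the text is written to the metadata. A minimal, standalone illustration of that conversion step (the sample sentence is arbitrary and not from the datasets):

```python
from pypinyin import Style, pinyin

text = "欢迎使用新数据集"                      # arbitrary sample sentence
resp = pinyin(text, style=Style.TONE3)         # numbered-tone pinyin, one sublist per character
tokens = [v[0] for v in resp if v[0].strip()]  # drop empty results, keep the first reading
print(" ".join(tokens))                        # roughly: huan1 ying2 shi3 yong4 xin1 shu4 ju4 ji2
```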