Merge pull request #7 from babysor/newdataset

Supporting new dataset SLR68 ! try python synthesizer_preprocess_audi…
This commit is contained in:
Nemo 2021-08-13 14:06:33 +08:00 committed by GitHub
commit 96e9d74966
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 48 additions and 38 deletions

View File

@ -7,7 +7,7 @@
### [English](README.md) | 中文
## 特性
🌍 **中文** 支持普通话并使用数据集进行测试adatatang_200zh
🌍 **中文** 支持普通话并使用多种中文数据集进行测试：aidatatang_200zh, SLR68
🤩 **PyTorch** 适用于 pytorch，已在 1.9.0 版本（最新于 2021 年 8 月）中测试，GPU Tesla T4 和 GTX 2060
@ -33,6 +33,7 @@ https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Pretrained-models
* 下载 aidatatang_200zh 数据集并解压：确保您可以访问 *train* 文件夹中的所有 .wav
* 使用音频和梅尔频谱图进行预处理:
`python synthesizer_preprocess_audio.py <datasets_root>`
可以传入参数 `--dataset {dataset}`，支持 aidatatang_200zh, SLR68
* 预处理嵌入:
`python synthesizer_preprocess_embeds.py <datasets_root>/SV2TTS/synthesizer`
@ -48,8 +49,8 @@ https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Pretrained-models
然后您可以尝试使用工具箱:
`python demo_toolbox.py -d <datasets_root>`
TODO
- 添加演示视频
- 添加对更多数据集的支持
- 上传预训练模型
- 🙏 欢迎补充
## TODO
- [X] 添加演示视频
- [X] 添加对更多数据集的支持
- [ ] 上传预训练模型
- [ ] 🙏 欢迎补充

View File

@ -6,7 +6,7 @@
> English | [中文](README-CN.md)
## Features
🌍 **Chinese** supported mandarin and tested with dataset: aidatatang_200zh
🌍 **Chinese** Supports Mandarin; tested with multiple datasets: aidatatang_200zh, SLR68
🤩 **PyTorch** Works with PyTorch; tested on version 1.9.0 (the latest as of August 2021) with Tesla T4 and GTX 2060 GPUs
@ -35,7 +35,7 @@ https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Pretrained-models
* Download aidatatang_200zh dataset and unzip: make sure you can access all .wav in *train* folder
* Preprocess with the audios and the mel spectrograms:
`python synthesizer_preprocess_audio.py <datasets_root>`
Pass `--dataset {dataset}` to select the dataset; supported values: aidatatang_200zh, SLR68
* Preprocess the embeddings:
`python synthesizer_preprocess_embeds.py <datasets_root>/SV2TTS/synthesizer`
@ -56,6 +56,6 @@ or
## TODO
- [x] Add demo video
- [ ] Add support for more dataset
- [X] Add support for more datasets
- [ ] Upload pretrained model
- 🙏 Welcome to add more

View File

@ -6,22 +6,28 @@ from pathlib import Path
from tqdm import tqdm
import numpy as np
from encoder import inference as encoder
from synthesizer.preprocess_speaker import preprocess_speaker_aidatatang_200zh
from synthesizer.preprocess_speaker import preprocess_speaker_general
data_info = {
"aidatatang_200zh": {
"subfolders": ["corpus/train"],
"speak_func": preprocess_speaker_aidatatang_200zh
}
# TODO add more
"trans_filepath": "transcript/aidatatang_200_zh_transcript.txt",
"speak_func": preprocess_speaker_general
},
"SLR68": {
"subfolders": ["train"],
"trans_filepath": "train/TRANS.txt",
"speak_func": preprocess_speaker_general
},
}
def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
skip_existing: bool, hparams, no_alignments: bool,
dataset: str):
dataset_info = data_info[dataset]
# Gather the input directories
dataset_root = datasets_root.joinpath(dataset)
input_dirs = [dataset_root.joinpath(subfolder.strip()) for subfolder in data_info[dataset]["subfolders"]]
input_dirs = [dataset_root.joinpath(subfolder.strip()) for subfolder in dataset_info["subfolders"]]
print("\n ".join(map(str, ["Using data from:"] + input_dirs)))
assert all(input_dir.exists() for input_dir in input_dirs)
@ -34,9 +40,19 @@ def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
metadata_file = metadata_fpath.open("a" if skip_existing else "w", encoding="utf-8")
# Preprocess the dataset
dict_info = {}
transcript_dirs = dataset_root.joinpath(dataset_info["trans_filepath"])
assert transcript_dirs.exists(), str(transcript_dirs)+" not exist."
with open(transcript_dirs, "r", encoding="utf-8") as dict_transcript:
for v in dict_transcript:
if not v:
continue
v = v.strip().replace("\n","").replace("\t"," ").split(" ")
dict_info[v[0]] = " ".join(v[1:])
speaker_dirs = list(chain.from_iterable(input_dir.glob("*") for input_dir in input_dirs))
func = partial(data_info[dataset]["speak_func"], out_dir=out_dir, skip_existing=skip_existing,
hparams=hparams, directory=dataset_root, no_alignments=no_alignments)
func = partial(dataset_info["speak_func"], out_dir=out_dir, skip_existing=skip_existing,
hparams=hparams, dict_info=dict_info, no_alignments=no_alignments)
job = Pool(n_processes).imap(func, speaker_dirs)
for speaker_metadata in tqdm(job, dataset, len(speaker_dirs), unit="speakers"):
for metadatum in speaker_metadata:

View File

@ -1,4 +1,3 @@
import platform
import librosa
import numpy as np
@ -68,36 +67,30 @@ def _split_on_silences_aidatatang_200zh(wav_fpath, words, hparams):
wav = librosa.effects.trim(wav, top_db= 40, frame_length=2048, hop_length=512)[0]
if hparams.rescale:
wav = wav / np.abs(wav).max() * hparams.rescaling_max
# denoise, we may not need it here.
if len(wav) > hparams.sample_rate*(0.3+0.1):
noise_wav = np.concatenate([wav[:int(hparams.sample_rate*0.15)],
wav[-int(hparams.sample_rate*0.15):]])
profile = logmmse.profile_noise(noise_wav, hparams.sample_rate)
wav = logmmse.denoise(wav, profile, eta=0)
resp = pinyin(words, style=Style.TONE3)
res = [v[0] for v in resp if v[0].strip()]
res = " ".join(res)
return wav, res
def preprocess_speaker_aidatatang_200zh(speaker_dir, out_dir: Path, skip_existing: bool, hparams, directory, no_alignments: bool):
dict_info = {}
transcript_dirs = directory.joinpath("transcript/aidatatang_200_zh_transcript.txt")
with open(transcript_dirs,"rb") as fp:
dict_transcript = [v.decode() for v in fp]
for v in dict_transcript:
if not v:
continue
v = v.strip().replace("\n","").split(" ")
dict_info[v[0]] = " ".join(v[1:])
def preprocess_speaker_general(speaker_dir, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool):
metadata = []
if platform.system() == "Windows":
split = "\\"
else:
split = "/"
for wav_fpath in speaker_dir.glob("*.wav"):
name = str(wav_fpath).split(split)[-1]
key = name.split(".")[0]
words = dict_info.get(key)
wav_fpath_list = speaker_dir.glob("*.wav")
# Iterate over each wav
for wav_fpath in wav_fpath_list:
words = dict_info.get(wav_fpath.name.split(".")[0])
words = dict_info.get(wav_fpath.name) if not words else words # try with wav
if not words:
print("no wordS")
continue
sub_basename = "%s_%02d" % (name, 0)
sub_basename = "%s_%02d" % (wav_fpath.name, 0)
wav, text = _split_on_silences_aidatatang_200zh(wav_fpath, words, hparams)
metadata.append(_process_utterance(wav, text, out_dir, sub_basename,
skip_existing, hparams))