mirror of
https://github.com/babysor/MockingBird.git
synced 2024-03-22 13:11:31 +08:00
Merge pull request #7 from babysor/newdataset
Supporting new dataset SLR68 ! try python synthesizer_preprocess_audi…
This commit is contained in:
commit
96e9d74966
13
README-CN.md
13
README-CN.md
|
@ -7,7 +7,7 @@
|
|||
### [English](README.md) | 中文
|
||||
|
||||
## 特性
|
||||
🌍 **中文** 支持普通话并使用数据集进行测试:adatatang_200zh
|
||||
🌍 **中文** 支持普通话并使用多种中文数据集进行测试:aidatatang_200zh, SLR68
|
||||
|
||||
🤩 **PyTorch** 适用于 pytorch,已在 1.9.0 版本(最新于 2021 年 8 月)中测试,GPU Tesla T4 和 GTX 2060
|
||||
|
||||
|
@ -33,6 +33,7 @@ https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Pretrained-models
|
|||
* 下载 aidatatang_200zh 数据集并解压:确保您可以访问 *train* 文件夹中的所有 .wav
|
||||
* 使用音频和梅尔频谱图进行预处理:
|
||||
`python synthesizer_preprocess_audio.py <datasets_root>`
|
||||
可以传入参数 `--dataset {dataset}`,支持 aidatatang_200zh, SLR68
|
||||
|
||||
* 预处理嵌入:
|
||||
`python synthesizer_preprocess_embeds.py <datasets_root>/SV2TTS/synthesizer`
|
||||
|
@ -48,8 +49,8 @@ https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Pretrained-models
|
|||
然后您可以尝试使用工具箱:
|
||||
`python demo_toolbox.py -d <datasets_root>`
|
||||
|
||||
## TODO
|
||||
- 添加演示视频
|
||||
- 添加对更多数据集的支持
|
||||
- 上传预训练模型
|
||||
- 🙏 欢迎补充
|
||||
## TODO
|
||||
- [X] 添加演示视频
|
||||
- [X] 添加对更多数据集的支持
|
||||
- [ ] 上传预训练模型
|
||||
- [ ] 🙏 欢迎补充
|
|
@ -6,7 +6,7 @@
|
|||
> English | [中文](README-CN.md)
|
||||
|
||||
## Features
|
||||
🌍 **Chinese** supported mandarin and tested with dataset: aidatatang_200zh
|
||||
🌍 **Chinese** supports Mandarin and is tested with multiple datasets: aidatatang_200zh, SLR68
|
||||
|
||||
🤩 **PyTorch** works with PyTorch, tested on version 1.9.0 (the latest as of August 2021), with Tesla T4 and GTX 2060 GPUs
|
||||
|
||||
|
@ -35,7 +35,7 @@ https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Pretrained-models
|
|||
* Download the aidatatang_200zh dataset and unzip it: make sure you can access all .wav files in the *train* folder
|
||||
* Preprocess with the audios and the mel spectrograms:
|
||||
`python synthesizer_preprocess_audio.py <datasets_root>`
|
||||
|
||||
Pass the optional parameter `--dataset {dataset}` to select the dataset; supported values: aidatatang_200zh, SLR68
|
||||
* Preprocess the embeddings:
|
||||
`python synthesizer_preprocess_embeds.py <datasets_root>/SV2TTS/synthesizer`
|
||||
|
||||
|
@ -56,6 +56,6 @@ or
|
|||
|
||||
## TODO
|
||||
- [x] Add demo video
|
||||
- [ ] Add support for more dataset
|
||||
- [X] Add support for more datasets
|
||||
- [ ] Upload pretrained model
|
||||
- 🙏 Welcome to add more
|
||||
|
|
|
@ -6,22 +6,28 @@ from pathlib import Path
|
|||
from tqdm import tqdm
|
||||
import numpy as np
|
||||
from encoder import inference as encoder
|
||||
from synthesizer.preprocess_speaker import preprocess_speaker_aidatatang_200zh
|
||||
from synthesizer.preprocess_speaker import preprocess_speaker_general
|
||||
|
||||
data_info = {
|
||||
"aidatatang_200zh": {
|
||||
"subfolders": ["corpus/train"],
|
||||
"speak_func": preprocess_speaker_aidatatang_200zh
|
||||
}
|
||||
# TODO add more
|
||||
"trans_filepath": "transcript/aidatatang_200_zh_transcript.txt",
|
||||
"speak_func": preprocess_speaker_general
|
||||
},
|
||||
"SLR68": {
|
||||
"subfolders": ["train"],
|
||||
"trans_filepath": "train/TRANS.txt",
|
||||
"speak_func": preprocess_speaker_general
|
||||
},
|
||||
}
|
||||
|
||||
def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
|
||||
skip_existing: bool, hparams, no_alignments: bool,
|
||||
dataset: str):
|
||||
dataset_info = data_info[dataset]
|
||||
# Gather the input directories
|
||||
dataset_root = datasets_root.joinpath(dataset)
|
||||
input_dirs = [dataset_root.joinpath(subfolder.strip()) for subfolder in data_info[dataset]["subfolders"]]
|
||||
input_dirs = [dataset_root.joinpath(subfolder.strip()) for subfolder in dataset_info["subfolders"]]
|
||||
print("\n ".join(map(str, ["Using data from:"] + input_dirs)))
|
||||
assert all(input_dir.exists() for input_dir in input_dirs)
|
||||
|
||||
|
@ -34,9 +40,19 @@ def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
|
|||
metadata_file = metadata_fpath.open("a" if skip_existing else "w", encoding="utf-8")
|
||||
|
||||
# Preprocess the dataset
|
||||
dict_info = {}
|
||||
transcript_dirs = dataset_root.joinpath(dataset_info["trans_filepath"])
|
||||
assert transcript_dirs.exists(), str(transcript_dirs)+" not exist."
|
||||
with open(transcript_dirs, "r", encoding="utf-8") as dict_transcript:
|
||||
for v in dict_transcript:
|
||||
if not v:
|
||||
continue
|
||||
v = v.strip().replace("\n","").replace("\t"," ").split(" ")
|
||||
dict_info[v[0]] = " ".join(v[1:])
|
||||
|
||||
speaker_dirs = list(chain.from_iterable(input_dir.glob("*") for input_dir in input_dirs))
|
||||
func = partial(data_info[dataset]["speak_func"], out_dir=out_dir, skip_existing=skip_existing,
|
||||
hparams=hparams, directory=dataset_root, no_alignments=no_alignments)
|
||||
func = partial(dataset_info["speak_func"], out_dir=out_dir, skip_existing=skip_existing,
|
||||
hparams=hparams, dict_info=dict_info, no_alignments=no_alignments)
|
||||
job = Pool(n_processes).imap(func, speaker_dirs)
|
||||
for speaker_metadata in tqdm(job, dataset, len(speaker_dirs), unit="speakers"):
|
||||
for metadatum in speaker_metadata:
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
import platform
|
||||
import librosa
|
||||
import numpy as np
|
||||
|
||||
|
@ -68,36 +67,30 @@ def _split_on_silences_aidatatang_200zh(wav_fpath, words, hparams):
|
|||
wav = librosa.effects.trim(wav, top_db= 40, frame_length=2048, hop_length=512)[0]
|
||||
if hparams.rescale:
|
||||
wav = wav / np.abs(wav).max() * hparams.rescaling_max
|
||||
# denoise, we may not need it here.
|
||||
if len(wav) > hparams.sample_rate*(0.3+0.1):
|
||||
noise_wav = np.concatenate([wav[:int(hparams.sample_rate*0.15)],
|
||||
wav[-int(hparams.sample_rate*0.15):]])
|
||||
profile = logmmse.profile_noise(noise_wav, hparams.sample_rate)
|
||||
wav = logmmse.denoise(wav, profile, eta=0)
|
||||
|
||||
resp = pinyin(words, style=Style.TONE3)
|
||||
res = [v[0] for v in resp if v[0].strip()]
|
||||
res = " ".join(res)
|
||||
|
||||
return wav, res
|
||||
|
||||
|
||||
def preprocess_speaker_aidatatang_200zh(speaker_dir, out_dir: Path, skip_existing: bool, hparams, directory, no_alignments: bool):
|
||||
dict_info = {}
|
||||
transcript_dirs = directory.joinpath("transcript/aidatatang_200_zh_transcript.txt")
|
||||
with open(transcript_dirs,"rb") as fp:
|
||||
dict_transcript = [v.decode() for v in fp]
|
||||
for v in dict_transcript:
|
||||
if not v:
|
||||
continue
|
||||
v = v.strip().replace("\n","").split(" ")
|
||||
dict_info[v[0]] = " ".join(v[1:])
|
||||
|
||||
def preprocess_speaker_general(speaker_dir, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool):
|
||||
metadata = []
|
||||
if platform.system() == "Windows":
|
||||
split = "\\"
|
||||
else:
|
||||
split = "/"
|
||||
for wav_fpath in speaker_dir.glob("*.wav"):
|
||||
name = str(wav_fpath).split(split)[-1]
|
||||
key = name.split(".")[0]
|
||||
words = dict_info.get(key)
|
||||
wav_fpath_list = speaker_dir.glob("*.wav")
|
||||
# Iterate over each wav
|
||||
for wav_fpath in wav_fpath_list:
|
||||
words = dict_info.get(wav_fpath.name.split(".")[0])
|
||||
words = dict_info.get(wav_fpath.name) if not words else words # try with wav
|
||||
if not words:
|
||||
print("no wordS")
|
||||
continue
|
||||
sub_basename = "%s_%02d" % (name, 0)
|
||||
sub_basename = "%s_%02d" % (wav_fpath.name, 0)
|
||||
wav, text = _split_on_silences_aidatatang_200zh(wav_fpath, words, hparams)
|
||||
metadata.append(_process_utterance(wav, text, out_dir, sub_basename,
|
||||
skip_existing, hparams))
|
||||
|
|
Loading…
Reference in New Issue
Block a user