Mirror of https://github.com/babysor/MockingBird.git, synced 2024-03-22 13:11:31 +08:00

Merge pull request #7 from babysor/newdataset

Supporting new dataset SLR68! Try python synthesizer_preprocess_audi…

Commit 96e9d74966
README-CN.md
@@ -7,7 +7,7 @@
 ### [English](README.md) | 中文
 
 ## 特性
-🌍 **中文** 支持普通话并使用数据集进行测试:adatatang_200zh
+🌍 **中文** 支持普通话并使用多种中文数据集进行测试:adatatang_200zh, SLR68
 
 🤩 **PyTorch** 适用于 pytorch,已在 1.9.0 版本(最新于 2021 年 8 月)中测试,GPU Tesla T4 和 GTX 2060
 
@@ -33,6 +33,7 @@ https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Pretrained-models
 * 下载 adatatang_200zh 数据集并解压:确保您可以访问 *train* 文件夹中的所有 .wav
 * 使用音频和梅尔频谱图进行预处理:
 `python synthesizer_preprocess_audio.py <datasets_root>`
+可以传入参数 --dataset `{dataset}` 支持 adatatang_200zh, SLR68
 
 * 预处理嵌入:
 `python synthesizer_preprocess_embeds.py <datasets_root>/SV2TTS/synthesizer`
@@ -48,8 +49,8 @@ https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Pretrained-models
 然后您可以尝试使用工具箱:
 `python demo_toolbox.py -d <datasets_root>`
 
 ## TODO
-- 添加演示视频
-- 添加对更多数据集的支持
-- 上传预训练模型
-- 🙏 欢迎补充
+- [X] 添加演示视频
+- [X] 添加对更多数据集的支持
+- [ ] 上传预训练模型
+- [ ] 🙏 欢迎补充
README.md
@@ -6,7 +6,7 @@
 > English | [中文](README-CN.md)
 
 ## Features
-🌍 **Chinese** supported mandarin and tested with dataset: aidatatang_200zh
+🌍 **Chinese** supported mandarin and tested with multiple datasets: aidatatang_200zh, SLR68
 
 🤩 **PyTorch** worked for pytorch, tested in version of 1.9.0(latest in August 2021), with GPU Tesla T4 and GTX 2060
 
@@ -35,7 +35,7 @@ https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Pretrained-models
 * Download aidatatang_200zh dataset and unzip: make sure you can access all .wav in *train* folder
 * Preprocess with the audios and the mel spectrograms:
 `python synthesizer_preprocess_audio.py <datasets_root>`
-
+Allow parameter `--dataset {dataset}` to support adatatang_200zh, SLR68
 * Preprocess the embeddings:
 `python synthesizer_preprocess_embeds.py <datasets_root>/SV2TTS/synthesizer`
 
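Combined with the new `SLR68` entry in `data_info` further down, a typical run of the updated script would look something like the line below (a usage sketch, assuming the corpus has been unpacked to `<datasets_root>/SLR68` with its `train` folder intact):

`python synthesizer_preprocess_audio.py <datasets_root> --dataset SLR68`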
@@ -56,6 +56,6 @@ or
 
 ## TODO
 - [x] Add demo video
-- [ ] Add support for more dataset
+- [X] Add support for more dataset
 - [ ] Upload pretrained model
 - 🙏 Welcome to add more
synthesizer/preprocess.py
@@ -6,22 +6,28 @@ from pathlib import Path
 from tqdm import tqdm
 import numpy as np
 from encoder import inference as encoder
-from synthesizer.preprocess_speaker import preprocess_speaker_aidatatang_200zh
+from synthesizer.preprocess_speaker import preprocess_speaker_general
 
 data_info = {
     "aidatatang_200zh": {
         "subfolders": ["corpus/train"],
-        "speak_func": preprocess_speaker_aidatatang_200zh
-    }
-    # TODO add more
+        "trans_filepath": "transcript/aidatatang_200_zh_transcript.txt",
+        "speak_func": preprocess_speaker_general
+    },
+    "SLR68": {
+        "subfolders": ["train"],
+        "trans_filepath": "train/TRANS.txt",
+        "speak_func": preprocess_speaker_general
+    },
 }
 
 def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
                        skip_existing: bool, hparams, no_alignments: bool,
                        dataset: str):
+    dataset_info = data_info[dataset]
     # Gather the input directories
     dataset_root = datasets_root.joinpath(dataset)
-    input_dirs = [dataset_root.joinpath(subfolder.strip()) for subfolder in data_info[dataset]["subfolders"]]
+    input_dirs = [dataset_root.joinpath(subfolder.strip()) for subfolder in dataset_info["subfolders"]]
     print("\n ".join(map(str, ["Using data from:"] + input_dirs)))
     assert all(input_dir.exists() for input_dir in input_dirs)
 
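With this change `data_info` acts as a small dataset registry: each entry names the audio subfolders, the transcript file, and the speaker-level preprocessing function, which is what replaces the old `# TODO add more` comment. A rough sketch of how a further corpus could be registered under the same scheme; the dataset name and paths here are hypothetical, not part of this commit:

```python
# Hypothetical entry for illustration only; "my_corpus" and its paths are invented.
data_info["my_corpus"] = {
    "subfolders": ["wav/train"],              # folders whose speaker subdirectories hold the .wav files
    "trans_filepath": "doc/trans/train.txt",  # lines of "<utterance_id> <text>"
    "speak_func": preprocess_speaker_general, # reusable, since it only needs dict_info
}
```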
@@ -34,9 +40,19 @@ def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
     metadata_file = metadata_fpath.open("a" if skip_existing else "w", encoding="utf-8")
 
     # Preprocess the dataset
+    dict_info = {}
+    transcript_dirs = dataset_root.joinpath(dataset_info["trans_filepath"])
+    assert transcript_dirs.exists(), str(transcript_dirs)+" not exist."
+    with open(transcript_dirs, "r", encoding="utf-8") as dict_transcript:
+        for v in dict_transcript:
+            if not v:
+                continue
+            v = v.strip().replace("\n","").replace("\t"," ").split(" ")
+            dict_info[v[0]] = " ".join(v[1:])
+
     speaker_dirs = list(chain.from_iterable(input_dir.glob("*") for input_dir in input_dirs))
-    func = partial(data_info[dataset]["speak_func"], out_dir=out_dir, skip_existing=skip_existing,
-                   hparams=hparams, directory=dataset_root, no_alignments=no_alignments)
+    func = partial(dataset_info["speak_func"], out_dir=out_dir, skip_existing=skip_existing,
+                   hparams=hparams, dict_info=dict_info, no_alignments=no_alignments)
     job = Pool(n_processes).imap(func, speaker_dirs)
     for speaker_metadata in tqdm(job, dataset, len(speaker_dirs), unit="speakers"):
         for metadatum in speaker_metadata:
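The transcript is now parsed once per dataset inside `preprocess_dataset` and handed to every worker through `dict_info`, rather than being re-read for each speaker. A minimal standalone sketch of that parsing step, kept identical in logic to the added lines; the sample line in the comment is invented:

```python
def load_transcript(path):
    # Build {utterance_id: text} the same way the code added to preprocess_dataset does.
    dict_info = {}
    with open(path, "r", encoding="utf-8") as f:
        for v in f:
            if not v:
                continue
            v = v.strip().replace("\n", "").replace("\t", " ").split(" ")
            dict_info[v[0]] = " ".join(v[1:])
    return dict_info

# e.g. a made-up line "T0055G0013S0001 今天天气不错" yields
# dict_info["T0055G0013S0001"] == "今天天气不错"
```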
synthesizer/preprocess_speaker.py
@@ -1,4 +1,3 @@
-import platform
 import librosa
 import numpy as np
 
@@ -68,36 +67,30 @@ def _split_on_silences_aidatatang_200zh(wav_fpath, words, hparams):
     wav = librosa.effects.trim(wav, top_db= 40, frame_length=2048, hop_length=512)[0]
     if hparams.rescale:
         wav = wav / np.abs(wav).max() * hparams.rescaling_max
+    # denoise, we may not need it here.
+    if len(wav) > hparams.sample_rate*(0.3+0.1):
+        noise_wav = np.concatenate([wav[:int(hparams.sample_rate*0.15)],
+                                    wav[-int(hparams.sample_rate*0.15):]])
+        profile = logmmse.profile_noise(noise_wav, hparams.sample_rate)
+        wav = logmmse.denoise(wav, profile, eta=0)
 
     resp = pinyin(words, style=Style.TONE3)
     res = [v[0] for v in resp if v[0].strip()]
     res = " ".join(res)
 
     return wav, res
 
-def preprocess_speaker_aidatatang_200zh(speaker_dir, out_dir: Path, skip_existing: bool, hparams, directory, no_alignments: bool):
-    dict_info = {}
-    transcript_dirs = directory.joinpath("transcript/aidatatang_200_zh_transcript.txt")
-    with open(transcript_dirs,"rb") as fp:
-        dict_transcript = [v.decode() for v in fp]
-    for v in dict_transcript:
-        if not v:
-            continue
-        v = v.strip().replace("\n","").split(" ")
-        dict_info[v[0]] = " ".join(v[1:])
-
+def preprocess_speaker_general(speaker_dir, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool):
     metadata = []
-    if platform.system() == "Windows":
-        split = "\\"
-    else:
-        split = "/"
-    for wav_fpath in speaker_dir.glob("*.wav"):
-        name = str(wav_fpath).split(split)[-1]
-        key = name.split(".")[0]
-        words = dict_info.get(key)
+    wav_fpath_list = speaker_dir.glob("*.wav")
+    # Iterate over each wav
+    for wav_fpath in wav_fpath_list:
+        words = dict_info.get(wav_fpath.name.split(".")[0])
+        words = dict_info.get(wav_fpath.name) if not words else words # try with wav
         if not words:
+            print("no wordS")
             continue
-        sub_basename = "%s_%02d" % (name, 0)
+        sub_basename = "%s_%02d" % (wav_fpath.name, 0)
         wav, text = _split_on_silences_aidatatang_200zh(wav_fpath, words, hparams)
         metadata.append(_process_utterance(wav, text, out_dir, sub_basename,
                                            skip_existing, hparams))
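`preprocess_speaker_general` drops the per-speaker transcript parsing and the Windows/Unix path splitting; each wav is resolved against the shared `dict_info` with a two-step lookup, first by file name without extension (the aidatatang_200zh transcript style), then by the full `.wav` name, which appears to be how SLR68's `TRANS.txt` keys its entries. A small self-contained sketch of that lookup; the IDs and texts below are made up:

```python
from pathlib import Path

def lookup_words(wav_fpath: Path, dict_info: dict):
    # Mirrors preprocess_speaker_general: try the extension-less key first,
    # then fall back to the full file name.
    words = dict_info.get(wav_fpath.name.split(".")[0])
    return dict_info.get(wav_fpath.name) if not words else words

# Made-up transcript entries in both styles:
dict_info = {
    "T0055G0013S0001": "今天天气不错",          # keyed without extension
    "38_5716_20170902124502.wav": "早上好",     # keyed by the full .wav name
}
print(lookup_words(Path("T0055G0013S0001.wav"), dict_info))         # -> 今天天气不错
print(lookup_words(Path("38_5716_20170902124502.wav"), dict_info))  # -> 早上好
```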