Mirror of https://github.com/babysor/MockingBird.git (synced 2024-03-22 13:11:31 +08:00)
Support new dataset "biaobei" (BZNSYP): a high-quality single-speaker corpus for Chinese
This commit is contained in:
parent 024d88ae96
commit bd0e47e76b
.vscode/launch.json (vendored): 10 changed lines
@@ -4,6 +4,16 @@
     // 欲了解更多信息,请访问: https://go.microsoft.com/fwlink/?linkid=830387
     "version": "0.2.0",
     "configurations": [
+        {
+            "name": "Python: Syn Preprocess",
+            "type": "python",
+            "request": "launch",
+            "program": "pre.py",
+            "console": "integratedTerminal",
+            "args": [
+                "D:\\ttsdata\\BZNSYP", "-d", "BZNSYP"
+            ],
+        },
         {
             "name": "Python: Vocoder Preprocess",
             "type": "python",
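For reference, this launch configuration is equivalent to invoking the preprocessing script directly from the repository root, assuming the corpus sits at the path used in the "args" above:

    python pre.py D:\ttsdata\BZNSYP -d BZNSYP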
README (Chinese version):

@@ -33,7 +33,8 @@
 * 下载 数据集并解压:确保您可以访问 *train* 文件夹中的所有音频文件(如.wav)
 * 进行音频和梅尔频谱图预处理:
 `python pre.py <datasets_root>`
-可以传入参数 --dataset `{dataset}` 支持 adatatang_200zh, magicdata, aishell3
+可以传入参数 --dataset `{dataset}` 支持 adatatang_200zh, magicdata, aishell3, BZNSYP
+
 > 假如你下载的 `aidatatang_200zh`文件放在D盘,`train`文件路径为 `D:\data\aidatatang_200zh\corpus\train` , 你的`datasets_root`就是 `D:\data\`
 >假如發生 `頁面文件太小,無法完成操作`,請參考這篇[文章](https://blog.csdn.net/qq_17755303/article/details/112564030),將虛擬內存更改為100G(102400),例如:档案放置D槽就更改D槽的虚拟内存
README (English version):

@@ -33,7 +33,8 @@
 * Download aidatatang_200zh or other dataset and unzip: make sure you can access all .wav in *train* folder
 * Preprocess with the audios and the mel spectrograms:
 `python pre.py <datasets_root>`
-Allow parameter `--dataset {dataset}` to support adatatang_200zh, magicdata, aishell3
+Allowing parameter `--dataset {dataset}` to support adatatang_200zh, magicdata, aishell3, BZNSYP
+
 >If it happens `the page file is too small to complete the operation`, please refer to this [video](https://www.youtube.com/watch?v=Oh6dga-Oy10&ab_channel=CodeProf) and change the virtual memory to 100G (102400), for example : When the file is placed in the D disk, the virtual memory of the D disk is changed.
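To make the new option concrete, a hypothetical BZNSYP layout that follows the same `datasets_root` convention: if the corpus is unpacked so the audio sits in `D:\data\BZNSYP\Wave` and the transcript in `D:\data\BZNSYP\ProsodyLabeling\000001-010000.txt` (subfolder names taken from the data_info entry added later in this commit), then `datasets_root` is `D:\data` and preprocessing is started with:

    python pre.py D:\data --dataset BZNSYP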
pre.py: 7 changed lines
@@ -12,7 +12,8 @@ import argparse
 recognized_datasets = [
     "aidatatang_200zh",
     "magicdata",
-    "aishell3"
+    "aishell3",
+    "BZNSYP"
 ]

 if __name__ == "__main__":
@@ -40,8 +41,8 @@ if __name__ == "__main__":
     parser.add_argument("--no_alignments", action="store_true", help=\
     "Use this option when dataset does not include alignments\
     (these are used to split long audio files into sub-utterances.)")
-    parser.add_argument("--dataset", type=str, default="aidatatang_200zh", help=\
-    "Name of the dataset to process, allowing values: magicdata, aidatatang_200zh, aishell3.")
+    parser.add_argument("-d", "--dataset", type=str, default="aidatatang_200zh", help=\
+    "Name of the dataset to process, allowing values: magicdata, aidatatang_200zh, aishell3, BZNSYP.")
     parser.add_argument("-e", "--encoder_model_fpath", type=Path, default="encoder/saved_models/pretrained.pt", help=\
     "Path your trained encoder model.")
     args = parser.parse_args()
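A minimal, runnable sketch (not part of pre.py) showing the effect of the new short flag: both `-d` and the long `--dataset` spelling populate the same `args.dataset`, and the default is unchanged.

    # Minimal sketch of the new "-d" alias; hypothetical standalone script.
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("-d", "--dataset", type=str, default="aidatatang_200zh")

    print(parser.parse_args(["-d", "BZNSYP"]).dataset)         # BZNSYP
    print(parser.parse_args(["--dataset", "BZNSYP"]).dataset)  # BZNSYP
    print(parser.parse_args([]).dataset)                       # aidatatang_200zh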
synthesizer/preprocess.py (file name inferred from the imports below):

@@ -6,7 +6,8 @@ from pathlib import Path
 from tqdm import tqdm
 import numpy as np
 from encoder import inference as encoder
-from synthesizer.preprocess_speaker import preprocess_speaker_general
+from synthesizer.preprocess_speaker import preprocess_speaker_general, preprocess_speaker_bznsyp
+from synthesizer.preprocess_transcript import preprocess_transcript_bznsyp

 data_info = {
     "aidatatang_200zh": {
@@ -24,6 +25,12 @@ data_info = {
         "trans_filepath": "train/content.txt",
         "speak_func": preprocess_speaker_general
     },
+    "BZNSYP":{
+        "subfolders": ["Wave"],
+        "trans_filepath": "ProsodyLabeling/000001-010000.txt",
+        "speak_func": preprocess_speaker_bznsyp,
+        "transcript_func": preprocess_transcript_bznsyp,
+    },
 }

 def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
@@ -49,11 +56,15 @@ def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
     transcript_dirs = dataset_root.joinpath(dataset_info["trans_filepath"])
     assert transcript_dirs.exists(), str(transcript_dirs)+" not exist."
     with open(transcript_dirs, "r", encoding="utf-8") as dict_transcript:
-        for v in dict_transcript:
-            if not v:
-                continue
-            v = v.strip().replace("\n","").replace("\t"," ").split(" ")
-            dict_info[v[0]] = " ".join(v[1:])
+        # process with specific function for your dataset
+        if "transcript_func" in dataset_info:
+            dataset_info["transcript_func"](dict_info, dict_transcript)
+        else:
+            for v in dict_transcript:
+                if not v:
+                    continue
+                v = v.strip().replace("\n","").replace("\t"," ").split(" ")
+                dict_info[v[0]] = " ".join(v[1:])

     speaker_dirs = list(chain.from_iterable(input_dir.glob("*") for input_dir in input_dirs))
     func = partial(dataset_info["speak_func"], out_dir=out_dir, skip_existing=skip_existing,
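The body of preprocess_transcript_bznsyp (imported from synthesizer/preprocess_transcript.py) is not shown in this commit; only its call signature is visible above, receiving dict_info and the open transcript file. As a rough, hypothetical sketch of what such a parser could look like for BZNSYP's ProsodyLabeling/000001-010000.txt, where each utterance takes an id-plus-text line followed by an indented pinyin line (the real function may instead keep the pinyin or strip the #1..#4 prosody marks):

    # Hypothetical sketch only; the actual preprocess_transcript_bznsyp is not shown in this diff.
    def preprocess_transcript_bznsyp_sketch(dict_info, dict_transcript):
        for line in dict_transcript:
            line = line.strip()
            if not line:
                continue
            # Text lines begin with a numeric utterance id such as "000001";
            # the pinyin line that follows does not, so it is skipped here.
            parts = line.split(None, 1)
            if len(parts) == 2 and parts[0].isdigit():
                dict_info[parts[0]] = parts[1]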
synthesizer/preprocess_speaker.py (file name inferred from the import above):

@@ -81,9 +81,16 @@ def _split_on_silences_aidatatang_200zh(wav_fpath, words, hparams):
     return wav, res

 def preprocess_speaker_general(speaker_dir, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool):
-    metadata = []
     wav_fpath_list = speaker_dir.glob("*.wav")
+    return preprocess_speaker_internal(wav_fpath_list, out_dir, skip_existing, hparams, dict_info, no_alignments)
+
+def preprocess_speaker_bznsyp(speaker_dir, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool):
+    wav_fpath_list = [speaker_dir]
+    return preprocess_speaker_internal(wav_fpath_list, out_dir, skip_existing, hparams, dict_info, no_alignments)
+
+def preprocess_speaker_internal(wav_fpath_list, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool):
     # Iterate over each wav
+    metadata = []
     for wav_fpath in wav_fpath_list:
         words = dict_info.get(wav_fpath.name.split(".")[0])
         words = dict_info.get(wav_fpath.name) if not words else words # try with wav
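One way to read this refactor (an interpretation, not stated in the commit): for the multi-speaker corpora each item handed to speak_func is a speaker folder, so preprocess_speaker_general globs its *.wav files, whereas BZNSYP's only subfolder is Wave, so each item is already a single .wav path and preprocess_speaker_bznsyp just wraps it in a one-element list; both wrappers then delegate to the shared preprocess_speaker_internal. A small illustration with hypothetical paths:

    # Hypothetical paths for illustration only.
    from pathlib import Path

    # aidatatang_200zh-style item: a speaker folder holding many wavs
    speaker_dir = Path("D:/data/aidatatang_200zh/corpus/train/G0002")
    wav_fpath_list = speaker_dir.glob("*.wav")   # iterator over many files

    # BZNSYP item: a single wav file directly under Wave/
    wav_fpath = Path("D:/data/BZNSYP/Wave/000001.wav")
    wav_fpath_list = [wav_fpath]                 # one-element list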
@@ -95,133 +102,3 @@ def preprocess_speaker_general(speaker_dir, out_dir: Path, skip_existing: bool,
         metadata.append(_process_utterance(wav, text, out_dir, sub_basename,
                                            skip_existing, hparams))
     return [m for m in metadata if m is not None]
-
-def preprocess_speaker(speaker_dir, out_dir: Path, skip_existing: bool, hparams, no_alignments: bool):
-    metadata = []
-    for book_dir in speaker_dir.glob("*"):
-        if no_alignments:
-            # Gather the utterance audios and texts
-            # LibriTTS uses .wav but we will include extensions for compatibility with other datasets
-            extensions = ["*.wav", "*.flac", "*.mp3"]
-            for extension in extensions:
-                wav_fpaths = book_dir.glob(extension)
-
-                for wav_fpath in wav_fpaths:
-                    # Load the audio waveform
-                    wav, _ = librosa.load(str(wav_fpath), hparams.sample_rate)
-                    if hparams.rescale:
-                        wav = wav / np.abs(wav).max() * hparams.rescaling_max
-
-                    # Get the corresponding text
-                    # Check for .txt (for compatibility with other datasets)
-                    text_fpath = wav_fpath.with_suffix(".txt")
-                    if not text_fpath.exists():
-                        # Check for .normalized.txt (LibriTTS)
-                        text_fpath = wav_fpath.with_suffix(".normalized.txt")
-                        assert text_fpath.exists()
-                    with text_fpath.open("r") as text_file:
-                        text = "".join([line for line in text_file])
-                        text = text.replace("\"", "")
-                        text = text.strip()
-
-                    # Process the utterance
-                    metadata.append(_process_utterance(wav, text, out_dir, str(wav_fpath.with_suffix("").name),
-                                                       skip_existing, hparams))
-        else:
-            # Process alignment file (LibriSpeech support)
-            # Gather the utterance audios and texts
-            try:
-                alignments_fpath = next(book_dir.glob("*.alignment.txt"))
-                with alignments_fpath.open("r") as alignments_file:
-                    alignments = [line.rstrip().split(" ") for line in alignments_file]
-            except StopIteration:
-                # A few alignment files will be missing
-                continue
-
-            # Iterate over each entry in the alignments file
-            for wav_fname, words, end_times in alignments:
-                wav_fpath = book_dir.joinpath(wav_fname + ".flac")
-                assert wav_fpath.exists()
-                words = words.replace("\"", "").split(",")
-                end_times = list(map(float, end_times.replace("\"", "").split(",")))
-
-                # Process each sub-utterance
-                wavs, texts = _split_on_silences(wav_fpath, words, end_times, hparams)
-                for i, (wav, text) in enumerate(zip(wavs, texts)):
-                    sub_basename = "%s_%02d" % (wav_fname, i)
-                    metadata.append(_process_utterance(wav, text, out_dir, sub_basename,
-                                                       skip_existing, hparams))
-
-    return [m for m in metadata if m is not None]
-
-# TODO: use original split func
-def _split_on_silences(wav_fpath, words, end_times, hparams):
-    # Load the audio waveform
-    wav, _ = librosa.load(str(wav_fpath), hparams.sample_rate)
-    if hparams.rescale:
-        wav = wav / np.abs(wav).max() * hparams.rescaling_max
-
-    words = np.array(words)
-    start_times = np.array([0.0] + end_times[:-1])
-    end_times = np.array(end_times)
-    assert len(words) == len(end_times) == len(start_times)
-    assert words[0] == "" and words[-1] == ""
-
-    # Find pauses that are too long
-    mask = (words == "") & (end_times - start_times >= hparams.silence_min_duration_split)
-    mask[0] = mask[-1] = True
-    breaks = np.where(mask)[0]
-
-    # Profile the noise from the silences and perform noise reduction on the waveform
-    silence_times = [[start_times[i], end_times[i]] for i in breaks]
-    silence_times = (np.array(silence_times) * hparams.sample_rate).astype(np.int)
-    noisy_wav = np.concatenate([wav[stime[0]:stime[1]] for stime in silence_times])
-    if len(noisy_wav) > hparams.sample_rate * 0.02:
-        profile = logmmse.profile_noise(noisy_wav, hparams.sample_rate)
-        wav = logmmse.denoise(wav, profile, eta=0)
-
-    # Re-attach segments that are too short
-    segments = list(zip(breaks[:-1], breaks[1:]))
-    segment_durations = [start_times[end] - end_times[start] for start, end in segments]
-    i = 0
-    while i < len(segments) and len(segments) > 1:
-        if segment_durations[i] < hparams.utterance_min_duration:
-            # See if the segment can be re-attached with the right or the left segment
-            left_duration = float("inf") if i == 0 else segment_durations[i - 1]
-            right_duration = float("inf") if i == len(segments) - 1 else segment_durations[i + 1]
-            joined_duration = segment_durations[i] + min(left_duration, right_duration)
-
-            # Do not re-attach if it causes the joined utterance to be too long
-            if joined_duration > hparams.hop_size * hparams.max_mel_frames / hparams.sample_rate:
-                i += 1
-                continue
-
-            # Re-attach the segment with the neighbour of shortest duration
-            j = i - 1 if left_duration <= right_duration else i
-            segments[j] = (segments[j][0], segments[j + 1][1])
-            segment_durations[j] = joined_duration
-            del segments[j + 1], segment_durations[j + 1]
-        else:
-            i += 1
-
-    # Split the utterance
-    segment_times = [[end_times[start], start_times[end]] for start, end in segments]
-    segment_times = (np.array(segment_times) * hparams.sample_rate).astype(np.int)
-    wavs = [wav[segment_time[0]:segment_time[1]] for segment_time in segment_times]
-    texts = [" ".join(words[start + 1:end]).replace("  ", " ") for start, end in segments]
-
-    # # DEBUG: play the audio segments (run with -n=1)
-    # import sounddevice as sd
-    # if len(wavs) > 1:
-    #     print("This sentence was split in %d segments:" % len(wavs))
-    # else:
-    #     print("There are no silences long enough for this sentence to be split:")
-    # for wav, text in zip(wavs, texts):
-    #     # Pad the waveform with 1 second of silence because sounddevice tends to cut them early
-    #     # when playing them. You shouldn't need to do that in your parsers.
-    #     wav = np.concatenate((wav, [0] * 16000))
-    #     print("\t%s" % text)
-    #     sd.play(wav, 16000, blocking=True)
-    # print("")
-
-    return wavs, texts