[FIX] Fix preprocessing bug for aishell3

babysor00 2021-09-19 00:09:16 +08:00
parent 3fbe03f2ff
commit 4178416385
4 changed files with 42 additions and 153 deletions

pre.py

@@ -28,8 +28,7 @@ if __name__ == "__main__":
         "Path to the output directory that will contain the mel spectrograms, the audios and the "
         "embeds. Defaults to <datasets_root>/SV2TTS/synthesizer/")
     parser.add_argument("-n", "--n_processes", type=int, default=1, help=\
-        "Number of processes in parallel.An encoder is created for each, so you may need to lower "
-        "this value on GPUs with low memory. Set it to 1 if CUDA is unhappy")
+        "Number of processes in parallel.")
     parser.add_argument("-s", "--skip_existing", action="store_true", help=\
         "Whether to overwrite existing files with the same name. Useful if the preprocessing was "
         "interrupted. ")
@@ -40,10 +39,13 @@ if __name__ == "__main__":
     parser.add_argument("--no_alignments", action="store_true", help=\
         "Use this option when dataset does not include alignments\
         (these are used to split long audio files into sub-utterances.)")
-    parser.add_argument("--dataset", type=str, default="aidatatang_200zh", help=\
+    parser.add_argument("-d", "--dataset", type=str, default="aidatatang_200zh", help=\
         "Name of the dataset to process, allowing values: magicdata, aidatatang_200zh, aishell3.")
     parser.add_argument("-e", "--encoder_model_fpath", type=Path, default="encoder/saved_models/pretrained.pt", help=\
         "Path your trained encoder model.")
+    parser.add_argument("-ne", "--n_processes_embed", type=int, default=1, help=\
+        "Number of processes in parallel.An encoder is created for each, so you may need to lower "
+        "this value on GPUs with low memory. Set it to 1 if CUDA is unhappy")
     args = parser.parse_args()
 
     # Process the arguments
@@ -69,4 +71,4 @@ if __name__ == "__main__":
     preprocess_dataset(**vars(args))
-    create_embeddings(synthesizer_root=args.out_dir, n_processes=args.n_processes, encoder_model_fpath=encoder_model_fpath)
+    create_embeddings(synthesizer_root=args.out_dir, n_processes=args.n_processes_embed, encoder_model_fpath=encoder_model_fpath)
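Taken together, the pre.py changes separate the two parallelism knobs: -n/--n_processes now only sets the number of preprocessing workers, the new -ne/--n_processes_embed sets how many encoder instances are created for the embedding step, and -d is added as a short alias for --dataset. As an illustration only (the worker counts and the <datasets_root> placeholder are not recommendations from the commit), an aishell3 run could then look like:

    python pre.py <datasets_root> -d aishell3 -n 8 -ne 1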

synthesizer/preprocess.py

@@ -7,6 +7,7 @@ from tqdm import tqdm
 import numpy as np
 from encoder import inference as encoder
 from synthesizer.preprocess_speaker import preprocess_speaker_general
+from synthesizer.preprocess_transcript import preprocess_transcript_aishell3
 
 data_info = {
     "aidatatang_200zh": {
@@ -22,8 +23,9 @@ data_info = {
     "aishell3":{
         "subfolders": ["train/wav"],
         "trans_filepath": "train/content.txt",
-        "speak_func": preprocess_speaker_general
-    },
+        "speak_func": preprocess_speaker_general,
+        "transcript_func": preprocess_transcript_aishell3,
+    }
 }
 
 def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
@@ -49,11 +51,15 @@ def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
     transcript_dirs = dataset_root.joinpath(dataset_info["trans_filepath"])
     assert transcript_dirs.exists(), str(transcript_dirs)+" not exist."
     with open(transcript_dirs, "r", encoding="utf-8") as dict_transcript:
-        for v in dict_transcript:
-            if not v:
-                continue
-            v = v.strip().replace("\n","").replace("\t"," ").split(" ")
-            dict_info[v[0]] = " ".join(v[1:])
+        # process with specific function for your dataset
+        if "transcript_func" in dataset_info:
+            dataset_info["transcript_func"](dict_info, dict_transcript)
+        else:
+            for v in dict_transcript:
+                if not v:
+                    continue
+                v = v.strip().replace("\n","").replace("\t"," ").split(" ")
+                dict_info[v[0]] = " ".join(v[1:])
 
     speaker_dirs = list(chain.from_iterable(input_dir.glob("*") for input_dir in input_dirs))
     func = partial(dataset_info["speak_func"], out_dir=out_dir, skip_existing=skip_existing,
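For readers of the hunk above, the new transcript handling boils down to the pattern below. This is a minimal standalone sketch; the helper name load_transcript_dict is invented for illustration and is not part of preprocess.py:

    def load_transcript_dict(dataset_info, transcript_path):
        dict_info = {}
        with open(transcript_path, "r", encoding="utf-8") as dict_transcript:
            if "transcript_func" in dataset_info:
                # Dataset-specific parser, e.g. preprocess_transcript_aishell3 for aishell3.
                dataset_info["transcript_func"](dict_info, dict_transcript)
            else:
                # Generic "<utterance id> <text ...>" lines (aidatatang_200zh, magicdata).
                for v in dict_transcript:
                    if not v:
                        continue
                    v = v.strip().replace("\n", "").replace("\t", " ").split(" ")
                    dict_info[v[0]] = " ".join(v[1:])
        return dict_info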

synthesizer/preprocess_speaker.py

@@ -61,7 +61,7 @@ def _process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
     return wav_fpath.name, mel_fpath.name, "embed-%s.npy" % basename, len(wav), mel_frames, text
 
 
-def _split_on_silences_aidatatang_200zh(wav_fpath, words, hparams):
+def _split_on_silences(wav_fpath, words, hparams):
     # Load the audio waveform
     wav, _ = librosa.load(wav_fpath, hparams.sample_rate)
     wav = librosa.effects.trim(wav, top_db= 40, frame_length=2048, hop_length=512)[0]
@@ -82,146 +82,18 @@ def _split_on_silences_aidatatang_200zh(wav_fpath, words, hparams):
 def preprocess_speaker_general(speaker_dir, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool):
     metadata = []
-    wav_fpath_list = speaker_dir.glob("*.wav")
-    # Iterate over each wav
-    for wav_fpath in wav_fpath_list:
-        words = dict_info.get(wav_fpath.name.split(".")[0])
-        words = dict_info.get(wav_fpath.name) if not words else words # try with wav
-        if not words:
-            print("no wordS")
-            continue
-        sub_basename = "%s_%02d" % (wav_fpath.name, 0)
-        wav, text = _split_on_silences_aidatatang_200zh(wav_fpath, words, hparams)
-        metadata.append(_process_utterance(wav, text, out_dir, sub_basename,
-                                           skip_existing, hparams))
-    return [m for m in metadata if m is not None]
-
-def preprocess_speaker(speaker_dir, out_dir: Path, skip_existing: bool, hparams, no_alignments: bool):
-    metadata = []
-    for book_dir in speaker_dir.glob("*"):
-        if no_alignments:
-            # Gather the utterance audios and texts
-            # LibriTTS uses .wav but we will include extensions for compatibility with other datasets
-            extensions = ["*.wav", "*.flac", "*.mp3"]
-            for extension in extensions:
-                wav_fpaths = book_dir.glob(extension)
-
-                for wav_fpath in wav_fpaths:
-                    # Load the audio waveform
-                    wav, _ = librosa.load(str(wav_fpath), hparams.sample_rate)
-                    if hparams.rescale:
-                        wav = wav / np.abs(wav).max() * hparams.rescaling_max
-
-                    # Get the corresponding text
-                    # Check for .txt (for compatibility with other datasets)
-                    text_fpath = wav_fpath.with_suffix(".txt")
-                    if not text_fpath.exists():
-                        # Check for .normalized.txt (LibriTTS)
-                        text_fpath = wav_fpath.with_suffix(".normalized.txt")
-                        assert text_fpath.exists()
-                    with text_fpath.open("r") as text_file:
-                        text = "".join([line for line in text_file])
-                        text = text.replace("\"", "")
-                        text = text.strip()
-
-                    # Process the utterance
-                    metadata.append(_process_utterance(wav, text, out_dir, str(wav_fpath.with_suffix("").name),
-                                                       skip_existing, hparams))
-        else:
-            # Process alignment file (LibriSpeech support)
-            # Gather the utterance audios and texts
-            try:
-                alignments_fpath = next(book_dir.glob("*.alignment.txt"))
-                with alignments_fpath.open("r") as alignments_file:
-                    alignments = [line.rstrip().split(" ") for line in alignments_file]
-            except StopIteration:
-                # A few alignment files will be missing
-                continue
-
-            # Iterate over each entry in the alignments file
-            for wav_fname, words, end_times in alignments:
-                wav_fpath = book_dir.joinpath(wav_fname + ".flac")
-                assert wav_fpath.exists()
-                words = words.replace("\"", "").split(",")
-                end_times = list(map(float, end_times.replace("\"", "").split(",")))
-
-                # Process each sub-utterance
-                wavs, texts = _split_on_silences(wav_fpath, words, end_times, hparams)
-                for i, (wav, text) in enumerate(zip(wavs, texts)):
-                    sub_basename = "%s_%02d" % (wav_fname, i)
-                    metadata.append(_process_utterance(wav, text, out_dir, sub_basename,
-                                                       skip_existing, hparams))
-
+    extensions = ["*.wav", "*.flac", "*.mp3"]
+    for extension in extensions:
+        wav_fpath_list = speaker_dir.glob(extension)
+        # Iterate over each wav
+        for wav_fpath in wav_fpath_list:
+            words = dict_info.get(wav_fpath.name.split(".")[0])
+            words = dict_info.get(wav_fpath.name) if not words else words # try with wav
+            if not words:
+                print("no wordS")
+                continue
+            sub_basename = "%s_%02d" % (wav_fpath.name, 0)
+            wav, text = _split_on_silences(wav_fpath, words, hparams)
+            metadata.append(_process_utterance(wav, text, out_dir, sub_basename,
+                                               skip_existing, hparams))
     return [m for m in metadata if m is not None]
-
-# TODO: use original split func
-def _split_on_silences(wav_fpath, words, end_times, hparams):
-    # Load the audio waveform
-    wav, _ = librosa.load(str(wav_fpath), hparams.sample_rate)
-    if hparams.rescale:
-        wav = wav / np.abs(wav).max() * hparams.rescaling_max
-
-    words = np.array(words)
-    start_times = np.array([0.0] + end_times[:-1])
-    end_times = np.array(end_times)
-    assert len(words) == len(end_times) == len(start_times)
-    assert words[0] == "" and words[-1] == ""
-
-    # Find pauses that are too long
-    mask = (words == "") & (end_times - start_times >= hparams.silence_min_duration_split)
-    mask[0] = mask[-1] = True
-    breaks = np.where(mask)[0]
-
-    # Profile the noise from the silences and perform noise reduction on the waveform
-    silence_times = [[start_times[i], end_times[i]] for i in breaks]
-    silence_times = (np.array(silence_times) * hparams.sample_rate).astype(np.int)
-    noisy_wav = np.concatenate([wav[stime[0]:stime[1]] for stime in silence_times])
-    if len(noisy_wav) > hparams.sample_rate * 0.02:
-        profile = logmmse.profile_noise(noisy_wav, hparams.sample_rate)
-        wav = logmmse.denoise(wav, profile, eta=0)
-
-    # Re-attach segments that are too short
-    segments = list(zip(breaks[:-1], breaks[1:]))
-    segment_durations = [start_times[end] - end_times[start] for start, end in segments]
-    i = 0
-    while i < len(segments) and len(segments) > 1:
-        if segment_durations[i] < hparams.utterance_min_duration:
-            # See if the segment can be re-attached with the right or the left segment
-            left_duration = float("inf") if i == 0 else segment_durations[i - 1]
-            right_duration = float("inf") if i == len(segments) - 1 else segment_durations[i + 1]
-            joined_duration = segment_durations[i] + min(left_duration, right_duration)
-
-            # Do not re-attach if it causes the joined utterance to be too long
-            if joined_duration > hparams.hop_size * hparams.max_mel_frames / hparams.sample_rate:
-                i += 1
-                continue
-
-            # Re-attach the segment with the neighbour of shortest duration
-            j = i - 1 if left_duration <= right_duration else i
-            segments[j] = (segments[j][0], segments[j + 1][1])
-            segment_durations[j] = joined_duration
-            del segments[j + 1], segment_durations[j + 1]
-        else:
-            i += 1
-
-    # Split the utterance
-    segment_times = [[end_times[start], start_times[end]] for start, end in segments]
-    segment_times = (np.array(segment_times) * hparams.sample_rate).astype(np.int)
-    wavs = [wav[segment_time[0]:segment_time[1]] for segment_time in segment_times]
-    texts = [" ".join(words[start + 1:end]).replace("  ", " ") for start, end in segments]
-
-    # # DEBUG: play the audio segments (run with -n=1)
-    # import sounddevice as sd
-    # if len(wavs) > 1:
-    #     print("This sentence was split in %d segments:" % len(wavs))
-    # else:
-    #     print("There are no silences long enough for this sentence to be split:")
-    # for wav, text in zip(wavs, texts):
-    #     # Pad the waveform with 1 second of silence because sounddevice tends to cut them early
-    #     # when playing them. You shouldn't need to do that in your parsers.
-    #     wav = np.concatenate((wav, [0] * 16000))
-    #     print("\t%s" % text)
-    #     sd.play(wav, 16000, blocking=True)
-    #     print("")
-
-    return wavs, texts

synthesizer/preprocess_transcript.py

@@ -0,0 +1,9 @@
+def preprocess_transcript_aishell3(dict_info, dict_transcript):
+    for v in dict_transcript:
+        if not v:
+            continue
+        v = v.strip().replace("\n","").replace("\t"," ").split(" ")
+        transList = []
+        for i in range(2, len(v), 2):
+            transList.append(v[i])
+        dict_info[v[0]] = " ".join(transList)
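The aishell3 train/content.txt lists each utterance as a filename followed by alternating hanzi and pinyin tokens, so the generic fallback parser would keep the hanzi as well; preprocess_transcript_aishell3 keeps only the pinyin. A quick illustration (the sample line is made up in that style, not copied from the dataset):

    # Made-up line in the aishell3 content.txt style: filename, then hanzi/pinyin pairs.
    line = "SSB00010001.wav 中 zhong1 文 wen2"
    v = line.strip().replace("\n","").replace("\t"," ").split(" ")

    # Generic fallback parser: everything after the filename, hanzi included.
    print(" ".join(v[1:]))                              # 中 zhong1 文 wen2

    # preprocess_transcript_aishell3: only the tokens at indices 2, 4, ... (the pinyin).
    print(" ".join(v[i] for i in range(2, len(v), 2)))  # zhong1 wen2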