mirror of
https://github.com/babysor/MockingBird.git
synced 2024-03-22 13:11:31 +08:00
Refactor preprocessor of synthesizer to prepare to support more datasets
This commit is contained in:
parent
c21d2c11dd
commit
856793c9bd
|
@ -1,31 +1,27 @@
|
||||||
from multiprocessing.pool import Pool
|
from multiprocessing.pool import Pool
|
||||||
from synthesizer import audio
|
|
||||||
from functools import partial
|
from functools import partial
|
||||||
from itertools import chain
|
from itertools import chain
|
||||||
from encoder import inference as encoder
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from utils import logmmse
|
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import librosa
|
from encoder import inference as encoder
|
||||||
import platform
|
from synthesizer.preprocess_speaker import preprocess_speaker_aidatatang_200zh
|
||||||
from pypinyin import Style
|
|
||||||
from pypinyin.contrib.neutral_tone import NeutralToneWith5Mixin
|
|
||||||
from pypinyin.converter import DefaultConverter
|
|
||||||
from pypinyin.core import Pinyin
|
|
||||||
|
|
||||||
class PinyinConverter(NeutralToneWith5Mixin, DefaultConverter):
|
|
||||||
pass
|
|
||||||
|
|
||||||
pinyin = Pinyin(PinyinConverter()).pinyin
|
|
||||||
|
|
||||||
|
data_info = {
|
||||||
|
"aidatatang_200zh": {
|
||||||
|
"subfolders": ["corpus/train"],
|
||||||
|
"speak_func": preprocess_speaker_aidatatang_200zh
|
||||||
|
}
|
||||||
|
# TODO add more
|
||||||
|
}
|
||||||
|
|
||||||
def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
|
def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
|
||||||
skip_existing: bool, hparams, no_alignments: bool,
|
skip_existing: bool, hparams, no_alignments: bool,
|
||||||
datasets_name: str, subfolders: str):
|
dataset: str):
|
||||||
# Gather the input directories
|
# Gather the input directories
|
||||||
dataset_root = datasets_root.joinpath(datasets_name)
|
dataset_root = datasets_root.joinpath(dataset)
|
||||||
input_dirs = [dataset_root.joinpath(subfolder.strip()) for subfolder in subfolders.split(",")]
|
input_dirs = [dataset_root.joinpath(subfolder.strip()) for subfolder in data_info[dataset]["subfolders"]]
|
||||||
print("\n ".join(map(str, ["Using data from:"] + input_dirs)))
|
print("\n ".join(map(str, ["Using data from:"] + input_dirs)))
|
||||||
assert all(input_dir.exists() for input_dir in input_dirs)
|
assert all(input_dir.exists() for input_dir in input_dirs)
|
||||||
|
|
||||||
|
@ -39,10 +35,10 @@ def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
|
||||||
|
|
||||||
# Preprocess the dataset
|
# Preprocess the dataset
|
||||||
speaker_dirs = list(chain.from_iterable(input_dir.glob("*") for input_dir in input_dirs))
|
speaker_dirs = list(chain.from_iterable(input_dir.glob("*") for input_dir in input_dirs))
|
||||||
func = partial(preprocess_speaker, out_dir=out_dir, skip_existing=skip_existing,
|
func = partial(data_info[dataset]["speak_func"], out_dir=out_dir, skip_existing=skip_existing,
|
||||||
hparams=hparams, no_alignments=no_alignments)
|
hparams=hparams, directory=dataset_root, no_alignments=no_alignments)
|
||||||
job = Pool(n_processes).imap(func, speaker_dirs)
|
job = Pool(n_processes).imap(func, speaker_dirs)
|
||||||
for speaker_metadata in tqdm(job, datasets_name, len(speaker_dirs), unit="speakers"):
|
for speaker_metadata in tqdm(job, dataset, len(speaker_dirs), unit="speakers"):
|
||||||
for metadatum in speaker_metadata:
|
for metadatum in speaker_metadata:
|
||||||
metadata_file.write("|".join(str(x) for x in metadatum) + "\n")
|
metadata_file.write("|".join(str(x) for x in metadatum) + "\n")
|
||||||
metadata_file.close()
|
metadata_file.close()
|
||||||
|
@ -60,183 +56,6 @@ def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
|
||||||
print("Max mel frames length: %d" % max(int(m[4]) for m in metadata))
|
print("Max mel frames length: %d" % max(int(m[4]) for m in metadata))
|
||||||
print("Max audio timesteps length: %d" % max(int(m[3]) for m in metadata))
|
print("Max audio timesteps length: %d" % max(int(m[3]) for m in metadata))
|
||||||
|
|
||||||
|
|
||||||
def preprocess_speaker(speaker_dir, out_dir: Path, skip_existing: bool, hparams, no_alignments: bool):
|
|
||||||
metadata = []
|
|
||||||
for book_dir in speaker_dir.glob("*"):
|
|
||||||
if no_alignments:
|
|
||||||
# Gather the utterance audios and texts
|
|
||||||
# LibriTTS uses .wav but we will include extensions for compatibility with other datasets
|
|
||||||
extensions = ["*.wav", "*.flac", "*.mp3"]
|
|
||||||
for extension in extensions:
|
|
||||||
wav_fpaths = book_dir.glob(extension)
|
|
||||||
|
|
||||||
for wav_fpath in wav_fpaths:
|
|
||||||
# Load the audio waveform
|
|
||||||
wav, _ = librosa.load(str(wav_fpath), hparams.sample_rate)
|
|
||||||
if hparams.rescale:
|
|
||||||
wav = wav / np.abs(wav).max() * hparams.rescaling_max
|
|
||||||
|
|
||||||
# Get the corresponding text
|
|
||||||
# Check for .txt (for compatibility with other datasets)
|
|
||||||
text_fpath = wav_fpath.with_suffix(".txt")
|
|
||||||
if not text_fpath.exists():
|
|
||||||
# Check for .normalized.txt (LibriTTS)
|
|
||||||
text_fpath = wav_fpath.with_suffix(".normalized.txt")
|
|
||||||
assert text_fpath.exists()
|
|
||||||
with text_fpath.open("r") as text_file:
|
|
||||||
text = "".join([line for line in text_file])
|
|
||||||
text = text.replace("\"", "")
|
|
||||||
text = text.strip()
|
|
||||||
|
|
||||||
# Process the utterance
|
|
||||||
metadata.append(process_utterance(wav, text, out_dir, str(wav_fpath.with_suffix("").name),
|
|
||||||
skip_existing, hparams))
|
|
||||||
else:
|
|
||||||
# Process alignment file (LibriSpeech support)
|
|
||||||
# Gather the utterance audios and texts
|
|
||||||
try:
|
|
||||||
alignments_fpath = next(book_dir.glob("*.alignment.txt"))
|
|
||||||
with alignments_fpath.open("r") as alignments_file:
|
|
||||||
alignments = [line.rstrip().split(" ") for line in alignments_file]
|
|
||||||
except StopIteration:
|
|
||||||
# A few alignment files will be missing
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Iterate over each entry in the alignments file
|
|
||||||
for wav_fname, words, end_times in alignments:
|
|
||||||
wav_fpath = book_dir.joinpath(wav_fname + ".flac")
|
|
||||||
assert wav_fpath.exists()
|
|
||||||
words = words.replace("\"", "").split(",")
|
|
||||||
end_times = list(map(float, end_times.replace("\"", "").split(",")))
|
|
||||||
|
|
||||||
# Process each sub-utterance
|
|
||||||
wavs, texts = split_on_silences(wav_fpath, words, end_times, hparams)
|
|
||||||
for i, (wav, text) in enumerate(zip(wavs, texts)):
|
|
||||||
sub_basename = "%s_%02d" % (wav_fname, i)
|
|
||||||
metadata.append(process_utterance(wav, text, out_dir, sub_basename,
|
|
||||||
skip_existing, hparams))
|
|
||||||
|
|
||||||
return [m for m in metadata if m is not None]
|
|
||||||
|
|
||||||
|
|
||||||
def split_on_silences(wav_fpath, words, end_times, hparams):
|
|
||||||
# Load the audio waveform
|
|
||||||
wav, _ = librosa.load(str(wav_fpath), hparams.sample_rate)
|
|
||||||
if hparams.rescale:
|
|
||||||
wav = wav / np.abs(wav).max() * hparams.rescaling_max
|
|
||||||
|
|
||||||
words = np.array(words)
|
|
||||||
start_times = np.array([0.0] + end_times[:-1])
|
|
||||||
end_times = np.array(end_times)
|
|
||||||
assert len(words) == len(end_times) == len(start_times)
|
|
||||||
assert words[0] == "" and words[-1] == ""
|
|
||||||
|
|
||||||
# Find pauses that are too long
|
|
||||||
mask = (words == "") & (end_times - start_times >= hparams.silence_min_duration_split)
|
|
||||||
mask[0] = mask[-1] = True
|
|
||||||
breaks = np.where(mask)[0]
|
|
||||||
|
|
||||||
# Profile the noise from the silences and perform noise reduction on the waveform
|
|
||||||
silence_times = [[start_times[i], end_times[i]] for i in breaks]
|
|
||||||
silence_times = (np.array(silence_times) * hparams.sample_rate).astype(np.int)
|
|
||||||
noisy_wav = np.concatenate([wav[stime[0]:stime[1]] for stime in silence_times])
|
|
||||||
if len(noisy_wav) > hparams.sample_rate * 0.02:
|
|
||||||
profile = logmmse.profile_noise(noisy_wav, hparams.sample_rate)
|
|
||||||
wav = logmmse.denoise(wav, profile, eta=0)
|
|
||||||
|
|
||||||
# Re-attach segments that are too short
|
|
||||||
segments = list(zip(breaks[:-1], breaks[1:]))
|
|
||||||
segment_durations = [start_times[end] - end_times[start] for start, end in segments]
|
|
||||||
i = 0
|
|
||||||
while i < len(segments) and len(segments) > 1:
|
|
||||||
if segment_durations[i] < hparams.utterance_min_duration:
|
|
||||||
# See if the segment can be re-attached with the right or the left segment
|
|
||||||
left_duration = float("inf") if i == 0 else segment_durations[i - 1]
|
|
||||||
right_duration = float("inf") if i == len(segments) - 1 else segment_durations[i + 1]
|
|
||||||
joined_duration = segment_durations[i] + min(left_duration, right_duration)
|
|
||||||
|
|
||||||
# Do not re-attach if it causes the joined utterance to be too long
|
|
||||||
if joined_duration > hparams.hop_size * hparams.max_mel_frames / hparams.sample_rate:
|
|
||||||
i += 1
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Re-attach the segment with the neighbour of shortest duration
|
|
||||||
j = i - 1 if left_duration <= right_duration else i
|
|
||||||
segments[j] = (segments[j][0], segments[j + 1][1])
|
|
||||||
segment_durations[j] = joined_duration
|
|
||||||
del segments[j + 1], segment_durations[j + 1]
|
|
||||||
else:
|
|
||||||
i += 1
|
|
||||||
|
|
||||||
# Split the utterance
|
|
||||||
segment_times = [[end_times[start], start_times[end]] for start, end in segments]
|
|
||||||
segment_times = (np.array(segment_times) * hparams.sample_rate).astype(np.int)
|
|
||||||
wavs = [wav[segment_time[0]:segment_time[1]] for segment_time in segment_times]
|
|
||||||
texts = [" ".join(words[start + 1:end]).replace(" ", " ") for start, end in segments]
|
|
||||||
|
|
||||||
# # DEBUG: play the audio segments (run with -n=1)
|
|
||||||
# import sounddevice as sd
|
|
||||||
# if len(wavs) > 1:
|
|
||||||
# print("This sentence was split in %d segments:" % len(wavs))
|
|
||||||
# else:
|
|
||||||
# print("There are no silences long enough for this sentence to be split:")
|
|
||||||
# for wav, text in zip(wavs, texts):
|
|
||||||
# # Pad the waveform with 1 second of silence because sounddevice tends to cut them early
|
|
||||||
# # when playing them. You shouldn't need to do that in your parsers.
|
|
||||||
# wav = np.concatenate((wav, [0] * 16000))
|
|
||||||
# print("\t%s" % text)
|
|
||||||
# sd.play(wav, 16000, blocking=True)
|
|
||||||
# print("")
|
|
||||||
|
|
||||||
return wavs, texts
|
|
||||||
|
|
||||||
|
|
||||||
def process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
|
|
||||||
skip_existing: bool, hparams):
|
|
||||||
## FOR REFERENCE:
|
|
||||||
# For you not to lose your head if you ever wish to change things here or implement your own
|
|
||||||
# synthesizer.
|
|
||||||
# - Both the audios and the mel spectrograms are saved as numpy arrays
|
|
||||||
# - There is no processing done to the audios that will be saved to disk beyond volume
|
|
||||||
# normalization (in split_on_silences)
|
|
||||||
# - However, pre-emphasis is applied to the audios before computing the mel spectrogram. This
|
|
||||||
# is why we re-apply it on the audio on the side of the vocoder.
|
|
||||||
# - Librosa pads the waveform before computing the mel spectrogram. Here, the waveform is saved
|
|
||||||
# without extra padding. This means that you won't have an exact relation between the length
|
|
||||||
# of the wav and of the mel spectrogram. See the vocoder data loader.
|
|
||||||
|
|
||||||
|
|
||||||
# Skip existing utterances if needed
|
|
||||||
mel_fpath = out_dir.joinpath("mels", "mel-%s.npy" % basename)
|
|
||||||
wav_fpath = out_dir.joinpath("audio", "audio-%s.npy" % basename)
|
|
||||||
if skip_existing and mel_fpath.exists() and wav_fpath.exists():
|
|
||||||
return None
|
|
||||||
|
|
||||||
# Trim silence
|
|
||||||
if hparams.trim_silence:
|
|
||||||
wav = encoder.preprocess_wav(wav, normalize=False, trim_silence=True)
|
|
||||||
|
|
||||||
# Skip utterances that are too short
|
|
||||||
if len(wav) < hparams.utterance_min_duration * hparams.sample_rate:
|
|
||||||
return None
|
|
||||||
|
|
||||||
# Compute the mel spectrogram
|
|
||||||
mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
|
|
||||||
mel_frames = mel_spectrogram.shape[1]
|
|
||||||
|
|
||||||
# Skip utterances that are too long
|
|
||||||
if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
|
|
||||||
return None
|
|
||||||
|
|
||||||
# Write the spectrogram, embed and audio to disk
|
|
||||||
np.save(mel_fpath, mel_spectrogram.T, allow_pickle=False)
|
|
||||||
np.save(wav_fpath, wav, allow_pickle=False)
|
|
||||||
|
|
||||||
# Return a tuple describing this training example
|
|
||||||
return wav_fpath.name, mel_fpath.name, "embed-%s.npy" % basename, len(wav), mel_frames, text
|
|
||||||
|
|
||||||
|
|
||||||
def embed_utterance(fpaths, encoder_model_fpath):
|
def embed_utterance(fpaths, encoder_model_fpath):
|
||||||
if not encoder.is_loaded():
|
if not encoder.is_loaded():
|
||||||
encoder.load_model(encoder_model_fpath)
|
encoder.load_model(encoder_model_fpath)
|
||||||
|
@ -266,93 +85,3 @@ def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_proce
|
||||||
func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)
|
func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)
|
||||||
job = Pool(n_processes).imap(func, fpaths)
|
job = Pool(n_processes).imap(func, fpaths)
|
||||||
list(tqdm(job, "Embedding", len(fpaths), unit="utterances"))
|
list(tqdm(job, "Embedding", len(fpaths), unit="utterances"))
|
||||||
|
|
||||||
# aidatatang_200zh
|
|
||||||
def preprocess_aidatatang_200zh(datasets_root: Path, out_dir: Path, n_processes: int,
|
|
||||||
skip_existing: bool, hparams, no_alignments: bool, datasets_name=None, subfolders=None):
|
|
||||||
# Gather the input directories
|
|
||||||
dataset_root = datasets_root.joinpath("aidatatang_200zh")
|
|
||||||
|
|
||||||
dict_info = {}
|
|
||||||
transcript_dirs = dataset_root.joinpath("transcript/aidatatang_200_zh_transcript.txt")
|
|
||||||
with open(transcript_dirs,"rb") as fp:
|
|
||||||
dict_transcript = [v.decode() for v in fp]
|
|
||||||
|
|
||||||
for v in dict_transcript:
|
|
||||||
if not v:
|
|
||||||
continue
|
|
||||||
v = v.strip().replace("\n","").split(" ")
|
|
||||||
dict_info[v[0]] = " ".join(v[1:])
|
|
||||||
|
|
||||||
input_dirs = [dataset_root.joinpath("corpus/train")]
|
|
||||||
print("\n ".join(map(str, ["Using data from:"] + input_dirs)))
|
|
||||||
assert all(input_dir.exists() for input_dir in input_dirs)
|
|
||||||
|
|
||||||
# Create the output directories for each output file type
|
|
||||||
out_dir.joinpath("mels").mkdir(exist_ok=True)
|
|
||||||
out_dir.joinpath("audio").mkdir(exist_ok=True)
|
|
||||||
|
|
||||||
# Create a metadata file
|
|
||||||
metadata_fpath = out_dir.joinpath("train.txt")
|
|
||||||
metadata_file = metadata_fpath.open("a" if skip_existing else "w", encoding="utf-8")
|
|
||||||
|
|
||||||
# Preprocess the dataset
|
|
||||||
speaker_dirs = list(chain.from_iterable(input_dir.glob("*") for input_dir in input_dirs))
|
|
||||||
func = partial(preprocess_speaker_aidatatang_200zh, out_dir=out_dir, skip_existing=skip_existing,
|
|
||||||
hparams=hparams, dict_info=dict_info, no_alignments=no_alignments)
|
|
||||||
job = Pool(n_processes).imap(func, speaker_dirs)
|
|
||||||
for speaker_metadata in tqdm(job, "aidatatang_200zh", len(speaker_dirs), unit="speakers"):
|
|
||||||
for metadatum in speaker_metadata:
|
|
||||||
metadata_file.write("|".join(str(x) for x in metadatum) + "\n")
|
|
||||||
metadata_file.close()
|
|
||||||
|
|
||||||
# Verify the contents of the metadata file
|
|
||||||
with metadata_fpath.open("r", encoding="utf-8") as metadata_file:
|
|
||||||
metadata = [line.split("|") for line in metadata_file]
|
|
||||||
mel_frames = sum([int(m[4]) for m in metadata])
|
|
||||||
timesteps = sum([int(m[3]) for m in metadata])
|
|
||||||
sample_rate = hparams.sample_rate
|
|
||||||
hours = (timesteps / sample_rate) / 3600
|
|
||||||
print("The dataset consists of %d utterances, %d mel frames, %d audio timesteps (%.2f hours)." %
|
|
||||||
(len(metadata), mel_frames, timesteps, hours))
|
|
||||||
print("Max input length (text chars): %d" % max(len(m[5]) for m in metadata))
|
|
||||||
print("Max mel frames length: %d" % max(int(m[4]) for m in metadata))
|
|
||||||
print("Max audio timesteps length: %d" % max(int(m[3]) for m in metadata))
|
|
||||||
|
|
||||||
def preprocess_speaker_aidatatang_200zh(speaker_dir, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool):
|
|
||||||
metadata = []
|
|
||||||
if platform.system() == "Windows":
|
|
||||||
split = "\\"
|
|
||||||
else:
|
|
||||||
split = "/"
|
|
||||||
# for book_dir in speaker_dir.glob("*"):
|
|
||||||
# Gather the utterance audios and texts
|
|
||||||
|
|
||||||
for wav_fpath in speaker_dir.glob("*.wav"):
|
|
||||||
# D:\dataset\data_aishell\wav\train\S0002\BAC009S0002W0122.wav
|
|
||||||
|
|
||||||
# Process each sub-utterance
|
|
||||||
|
|
||||||
name = str(wav_fpath).split(split)[-1]
|
|
||||||
key = name.split(".")[0]
|
|
||||||
words = dict_info.get(key)
|
|
||||||
if not words:
|
|
||||||
continue
|
|
||||||
sub_basename = "%s_%02d" % (name, 0)
|
|
||||||
wav, text = split_on_silences_aidatatang_200zh(wav_fpath, words, hparams)
|
|
||||||
metadata.append(process_utterance(wav, text, out_dir, sub_basename,
|
|
||||||
skip_existing, hparams))
|
|
||||||
|
|
||||||
return [m for m in metadata if m is not None]
|
|
||||||
|
|
||||||
def split_on_silences_aidatatang_200zh(wav_fpath, words, hparams):
|
|
||||||
# Load the audio waveform
|
|
||||||
wav, _ = librosa.load(wav_fpath, hparams.sample_rate)
|
|
||||||
wav = librosa.effects.trim(wav, top_db= 40, frame_length=2048, hop_length=512)[0]
|
|
||||||
if hparams.rescale:
|
|
||||||
wav = wav / np.abs(wav).max() * hparams.rescaling_max
|
|
||||||
resp = pinyin(words, style=Style.TONE3)
|
|
||||||
res = [v[0] for v in resp if v[0].strip()]
|
|
||||||
res = " ".join(res)
|
|
||||||
|
|
||||||
return wav, res
|
|
234
synthesizer/preprocess_speaker.py
Normal file
234
synthesizer/preprocess_speaker.py
Normal file
|
@ -0,0 +1,234 @@
|
||||||
|
import platform
|
||||||
|
import librosa
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from encoder import inference as encoder
|
||||||
|
from utils import logmmse
|
||||||
|
from synthesizer import audio
|
||||||
|
from pathlib import Path
|
||||||
|
from pypinyin import Style
|
||||||
|
from pypinyin.contrib.neutral_tone import NeutralToneWith5Mixin
|
||||||
|
from pypinyin.converter import DefaultConverter
|
||||||
|
from pypinyin.core import Pinyin
|
||||||
|
|
||||||
|
class PinyinConverter(NeutralToneWith5Mixin, DefaultConverter):
    """pypinyin converter built from DefaultConverter plus NeutralToneWith5Mixin.

    Adds no behaviour of its own; it only combines the two base classes.
    NOTE(review): per the mixin's name, neutral-tone syllables are presumably
    written with an explicit tone number 5 - confirm against the pypinyin docs.
    """
|
||||||
|
|
||||||
|
pinyin = Pinyin(PinyinConverter()).pinyin
|
||||||
|
|
||||||
|
|
||||||
|
def _process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
                       skip_existing: bool, hparams):
    """Save one utterance as a waveform/mel-spectrogram pair and describe it.

    Notes kept from the original implementation, for anyone changing this code
    or writing their own synthesizer:
      - Both the audio and the mel spectrogram are stored as numpy arrays.
      - The audio written to disk is meant to carry no processing beyond
        volume normalization (done by the callers' loading helpers).
      - Pre-emphasis is applied before the mel spectrogram is computed, which
        is why it is re-applied to the audio on the vocoder side.
      - Librosa pads the waveform before computing the mel spectrogram while
        the waveform is saved unpadded, so wav length and mel length are not
        exactly related. See the vocoder data loader.

    Args:
        wav: waveform samples of the utterance.
        text: transcript stored alongside the utterance in the metadata.
        out_dir: output root; files go to its "mels" and "audio" subfolders.
        basename: identifier used to build the output file names.
        skip_existing: when true, do nothing if both output files exist.
        hparams: object providing trim_silence, utterance_min_duration,
            sample_rate, max_mel_frames and clip_mels_length.

    Returns:
        None when the utterance is skipped (already processed, too short, or
        too long while clip_mels_length is set); otherwise the tuple
        (audio file name, mel file name, embed file name, number of samples,
        number of mel frames, text).
    """
    mel_target = out_dir.joinpath("mels", "mel-%s.npy" % basename)
    audio_target = out_dir.joinpath("audio", "audio-%s.npy" % basename)

    # Nothing to do when both artefacts are already on disk.
    if skip_existing:
        if mel_target.exists() and audio_target.exists():
            return None

    if hparams.trim_silence:
        wav = encoder.preprocess_wav(wav, normalize=False, trim_silence=True)

    # Drop utterances shorter than the configured minimum duration.
    min_samples = hparams.utterance_min_duration * hparams.sample_rate
    if len(wav) < min_samples:
        return None

    mel = audio.melspectrogram(wav, hparams).astype(np.float32)
    n_frames = mel.shape[1]

    # Drop utterances that are too long, but only when clipping is enabled.
    if n_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    # The spectrogram is stored transposed relative to what audio returns.
    np.save(mel_target, mel.T, allow_pickle=False)
    np.save(audio_target, wav, allow_pickle=False)

    return (audio_target.name, mel_target.name, "embed-%s.npy" % basename,
            len(wav), n_frames, text)
|
||||||
|
|
||||||
|
|
||||||
|
def _split_on_silences_aidatatang_200zh(wav_fpath, words, hparams):
    """Load one aidatatang_200zh utterance and convert its transcript to pinyin.

    Despite the name, no splitting is performed: the waveform is loaded,
    trimmed with librosa.effects.trim, optionally rescaled, and returned whole.

    Args:
        wav_fpath: path of the audio file to load.
        words: transcript text of the utterance.
        hparams: object providing sample_rate, rescale and rescaling_max.

    Returns:
        (wav, text) where wav is the processed waveform and text is the
        transcript as space-separated pinyin in Style.TONE3 form.
    """
    # The sample rate is passed by keyword: librosa >= 0.10 made every
    # argument after the path keyword-only, so the positional form fails there.
    wav, _ = librosa.load(str(wav_fpath), sr=hparams.sample_rate)
    wav = librosa.effects.trim(wav, top_db=40, frame_length=2048, hop_length=512)[0]

    if hparams.rescale:
        # An empty waveform makes .max() raise and an all-zero one yields NaN;
        # in both cases leave the samples untouched instead of failing.
        peak = np.abs(wav).max() if wav.size else 0.0
        if peak > 0:
            wav = wav / peak * hparams.rescaling_max

    # pinyin() yields one list per syllable; keep the first reading and drop
    # entries that are only whitespace.
    syllables = pinyin(words, style=Style.TONE3)
    text = " ".join(s[0] for s in syllables if s[0].strip())

    return wav, text
|
||||||
|
|
||||||
|
|
||||||
|
# Per-process cache: transcript file path -> {utterance id: transcript text}.
# This function is called once per speaker directory; without the cache the
# whole transcript file would be re-read and re-parsed on every call.
_TRANSCRIPT_CACHE = {}


def _load_transcript_aidatatang_200zh(directory):
    """Return the {utterance id: text} mapping parsed from the dataset transcript."""
    transcript_fpath = directory.joinpath("transcript/aidatatang_200_zh_transcript.txt")
    cache_key = str(transcript_fpath)
    if cache_key not in _TRANSCRIPT_CACHE:
        utterances = {}
        with open(transcript_fpath, "rb") as fp:
            for raw_line in fp:
                line = raw_line.decode().strip()
                if not line:
                    continue
                # First field is the utterance id, the rest is its text.
                fields = line.split(" ")
                utterances[fields[0]] = " ".join(fields[1:])
        _TRANSCRIPT_CACHE[cache_key] = utterances
    return _TRANSCRIPT_CACHE[cache_key]


def preprocess_speaker_aidatatang_200zh(speaker_dir, out_dir: Path, skip_existing: bool, hparams, directory, no_alignments: bool):
    """Preprocess every .wav file of one aidatatang_200zh speaker directory.

    Args:
        speaker_dir: directory holding the speaker's .wav files.
        out_dir: output root passed on to _process_utterance.
        skip_existing: passed on to _process_utterance.
        hparams: hyperparameters passed on to the loading/processing helpers.
        directory: dataset root containing
            "transcript/aidatatang_200_zh_transcript.txt".
        no_alignments: unused here; accepted so all speaker functions share
            one call signature.

    Returns:
        The list of metadata tuples returned by _process_utterance, without
        the utterances it skipped. Audio files with no transcript entry are
        ignored.
    """
    dict_info = _load_transcript_aidatatang_200zh(directory)

    metadata = []
    for wav_fpath in speaker_dir.glob("*.wav"):
        # Path.name replaces the manual split on a platform-specific separator.
        name = wav_fpath.name
        key = name.split(".")[0]
        words = dict_info.get(key)
        if not words:
            continue
        # The basename keeps the file extension (e.g. "X.wav_00"), exactly as
        # before, so output file names stay compatible with earlier runs.
        sub_basename = "%s_%02d" % (name, 0)
        wav, text = _split_on_silences_aidatatang_200zh(wav_fpath, words, hparams)
        metadata.append(_process_utterance(wav, text, out_dir, sub_basename,
                                           skip_existing, hparams))
    return [m for m in metadata if m is not None]
|
||||||
|
|
||||||
|
def preprocess_speaker(speaker_dir, out_dir: Path, skip_existing: bool, hparams, no_alignments: bool):
    """Preprocess every utterance found under one speaker directory.

    Each sub-directory of speaker_dir is handled in one of two ways:
      - no_alignments is true: every *.wav / *.flac / *.mp3 file is loaded
        whole and paired with the text of a sibling ".txt" file or, failing
        that, a ".normalized.txt" file (LibriTTS).
      - otherwise: the "*.alignment.txt" file of the sub-directory
        (LibriSpeech) is read and each listed ".flac" file is cut into
        sub-utterances by _split_on_silences. Sub-directories without an
        alignment file are skipped.

    Args:
        speaker_dir: directory of one speaker.
        out_dir: output root passed on to _process_utterance.
        skip_existing: passed on to _process_utterance.
        hparams: object providing sample_rate, rescale, rescaling_max and
            whatever the helpers read.
        no_alignments: selects between the two modes above.

    Returns:
        The list of metadata tuples returned by _process_utterance, without
        the utterances it skipped.

    Raises:
        AssertionError: a transcript file or an audio file named in an
            alignment file is missing.
    """
    metadata = []
    for book_dir in speaker_dir.glob("*"):
        if no_alignments:
            # LibriTTS uses .wav; the other extensions are accepted for
            # compatibility with other datasets.
            extensions = ["*.wav", "*.flac", "*.mp3"]
            for extension in extensions:
                for wav_fpath in book_dir.glob(extension):
                    # The sample rate is passed by keyword: librosa >= 0.10
                    # made every argument after the path keyword-only.
                    wav, _ = librosa.load(str(wav_fpath), sr=hparams.sample_rate)
                    if hparams.rescale:
                        wav = wav / np.abs(wav).max() * hparams.rescaling_max

                    # Look for .txt first, then .normalized.txt (LibriTTS).
                    text_fpath = wav_fpath.with_suffix(".txt")
                    if not text_fpath.exists():
                        text_fpath = wav_fpath.with_suffix(".normalized.txt")
                        assert text_fpath.exists()
                    with text_fpath.open("r") as text_file:
                        text = "".join([line for line in text_file])
                        text = text.replace("\"", "")
                        text = text.strip()

                    metadata.append(_process_utterance(wav, text, out_dir, str(wav_fpath.with_suffix("").name),
                                                       skip_existing, hparams))
        else:
            # Alignment-file mode (LibriSpeech support).
            try:
                alignments_fpath = next(book_dir.glob("*.alignment.txt"))
                with alignments_fpath.open("r") as alignments_file:
                    alignments = [line.rstrip().split(" ") for line in alignments_file]
            except StopIteration:
                # A few alignment files will be missing.
                continue

            # Each entry is: file name, quoted comma-separated words,
            # quoted comma-separated end times.
            for wav_fname, words, end_times in alignments:
                wav_fpath = book_dir.joinpath(wav_fname + ".flac")
                assert wav_fpath.exists()
                words = words.replace("\"", "").split(",")
                end_times = list(map(float, end_times.replace("\"", "").split(",")))

                # Process each sub-utterance.
                wavs, texts = _split_on_silences(wav_fpath, words, end_times, hparams)
                for i, (wav, text) in enumerate(zip(wavs, texts)):
                    sub_basename = "%s_%02d" % (wav_fname, i)
                    metadata.append(_process_utterance(wav, text, out_dir, sub_basename,
                                                       skip_existing, hparams))

    return [m for m in metadata if m is not None]
|
||||||
|
|
||||||
|
# TODO: use original split func
|
||||||
|
def _split_on_silences(wav_fpath, words, end_times, hparams):
    """Cut an aligned utterance into sub-utterances at long silences.

    Args:
        wav_fpath: path of the audio file to load.
        words: aligned words; "" marks a silence. The first and last entries
            must both be "".
        end_times: end time of each entry of words, as a list of floats
            (multiplied by the sample rate to obtain sample indices).
        hparams: object providing sample_rate, rescale, rescaling_max,
            silence_min_duration_split, utterance_min_duration, hop_size and
            max_mel_frames.

    Returns:
        (wavs, texts): the waveform segments and, for each, the text made of
        the words between the two silences that delimit it.

    Raises:
        AssertionError: words and end_times differ in length, or words does
            not start and end with a silence.
    """
    # The sample rate is passed by keyword: librosa >= 0.10 made every
    # argument after the path keyword-only.
    wav, _ = librosa.load(str(wav_fpath), sr=hparams.sample_rate)
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    words = np.array(words)
    # Each entry starts where the previous one ended.
    start_times = np.array([0.0] + end_times[:-1])
    end_times = np.array(end_times)
    assert len(words) == len(end_times) == len(start_times)
    assert words[0] == "" and words[-1] == ""

    # Find pauses that are too long; the leading and trailing silences are
    # always treated as breaks.
    mask = (words == "") & (end_times - start_times >= hparams.silence_min_duration_split)
    mask[0] = mask[-1] = True
    breaks = np.where(mask)[0]

    # Profile the noise from the silences and perform noise reduction on the
    # waveform. The builtin int replaces np.int, which NumPy 1.24 removed.
    silence_times = [[start_times[i], end_times[i]] for i in breaks]
    silence_times = (np.array(silence_times) * hparams.sample_rate).astype(int)
    noisy_wav = np.concatenate([wav[stime[0]:stime[1]] for stime in silence_times])
    if len(noisy_wav) > hparams.sample_rate * 0.02:
        profile = logmmse.profile_noise(noisy_wav, hparams.sample_rate)
        wav = logmmse.denoise(wav, profile, eta=0)

    # Re-attach segments that are too short.
    segments = list(zip(breaks[:-1], breaks[1:]))
    segment_durations = [start_times[end] - end_times[start] for start, end in segments]
    i = 0
    while i < len(segments) and len(segments) > 1:
        if segment_durations[i] < hparams.utterance_min_duration:
            # See if the segment can be re-attached with the right or the left segment.
            left_duration = float("inf") if i == 0 else segment_durations[i - 1]
            right_duration = float("inf") if i == len(segments) - 1 else segment_durations[i + 1]
            joined_duration = segment_durations[i] + min(left_duration, right_duration)

            # Do not re-attach if it causes the joined utterance to be too long.
            if joined_duration > hparams.hop_size * hparams.max_mel_frames / hparams.sample_rate:
                i += 1
                continue

            # Re-attach the segment with the neighbour of shortest duration.
            # i is deliberately not advanced: the merged segment is re-examined.
            j = i - 1 if left_duration <= right_duration else i
            segments[j] = (segments[j][0], segments[j + 1][1])
            segment_durations[j] = joined_duration
            del segments[j + 1], segment_durations[j + 1]
        else:
            i += 1

    # Split the utterance: each segment runs from the end of its opening
    # silence to the start of its closing silence.
    segment_times = [[end_times[start], start_times[end]] for start, end in segments]
    segment_times = (np.array(segment_times) * hparams.sample_rate).astype(int)
    wavs = [wav[segment_time[0]:segment_time[1]] for segment_time in segment_times]
    # Silences kept inside a segment are empty strings, so joining leaves
    # doubled spaces; collapse them (replacing " " by " " would do nothing).
    texts = [" ".join(words[start + 1:end]).replace("  ", " ") for start, end in segments]

    return wavs, texts
|
|
@ -1,10 +1,15 @@
|
||||||
from synthesizer.preprocess import preprocess_aidatatang_200zh
|
from synthesizer.preprocess import preprocess_dataset
|
||||||
from synthesizer.hparams import hparams
|
from synthesizer.hparams import hparams
|
||||||
from utils.argutils import print_args
|
from utils.argutils import print_args
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import argparse
|
import argparse
|
||||||
|
|
||||||
|
|
||||||
|
# Dataset names accepted by the --dataset command-line argument.
# NOTE(review): "SLR68" is accepted here, but the data_info table in
# synthesizer/preprocess.py only defines "aidatatang_200zh" (with a
# "TODO add more"), so data_info[dataset] would raise KeyError for "SLR68"
# - confirm an entry is added before this name is advertised.
recognized_datasets = [
|
||||||
|
"aidatatang_200zh",
|
||||||
|
"SLR68",
|
||||||
|
]
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
description="Preprocesses audio files from datasets, encodes them as mel spectrograms "
|
description="Preprocesses audio files from datasets, encodes them as mel spectrograms "
|
||||||
|
@ -29,16 +34,14 @@ if __name__ == "__main__":
|
||||||
parser.add_argument("--no_alignments", action="store_true", help=\
|
parser.add_argument("--no_alignments", action="store_true", help=\
|
||||||
"Use this option when dataset does not include alignments\
|
"Use this option when dataset does not include alignments\
|
||||||
(these are used to split long audio files into sub-utterances.)")
|
(these are used to split long audio files into sub-utterances.)")
|
||||||
parser.add_argument("--datasets_name", type=str, default="LibriSpeech", help=\
|
parser.add_argument("--dataset", type=str, default="aidatatang_200zh", help=\
|
||||||
"Name of the dataset directory to process.")
|
"Name of the dataset to process.")
|
||||||
parser.add_argument("--subfolders", type=str, default="train-clean-100, train-clean-360", help=\
|
|
||||||
"Comma-separated list of subfolders to process inside your dataset directory")
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
# Process the arguments
|
# Process the arguments
|
||||||
if not hasattr(args, "out_dir"):
|
if not hasattr(args, "out_dir"):
|
||||||
args.out_dir = args.datasets_root.joinpath("SV2TTS", "synthesizer")
|
args.out_dir = args.datasets_root.joinpath("SV2TTS", "synthesizer")
|
||||||
|
assert args.dataset in recognized_datasets, 'not surpport such dataset'
|
||||||
# Create directories
|
# Create directories
|
||||||
assert args.datasets_root.exists()
|
assert args.datasets_root.exists()
|
||||||
args.out_dir.mkdir(exist_ok=True, parents=True)
|
args.out_dir.mkdir(exist_ok=True, parents=True)
|
||||||
|
@ -56,5 +59,5 @@ if __name__ == "__main__":
|
||||||
# Preprocess the dataset
|
# Preprocess the dataset
|
||||||
print_args(args, parser)
|
print_args(args, parser)
|
||||||
args.hparams = hparams.parse(args.hparams)
|
args.hparams = hparams.parse(args.hparams)
|
||||||
# preprocess_dataset(**vars(args))
|
|
||||||
preprocess_aidatatang_200zh(**vars(args))
|
preprocess_dataset(**vars(args))
|
Loading…
Reference in New Issue
Block a user