2022-02-09 00:44:43 +08:00
|
|
|
|
|
|
|
import os
|
|
|
|
import torch
|
|
|
|
import numpy as np
|
|
|
|
from tqdm import tqdm
|
|
|
|
from pathlib import Path
|
2022-02-20 11:56:58 +08:00
|
|
|
import soundfile
|
|
|
|
import resampy
|
2022-02-09 00:44:43 +08:00
|
|
|
|
|
|
|
from ppg_extractor import load_model
|
|
|
|
import encoder.inference as Encoder
|
|
|
|
from encoder.audio import preprocess_wav
|
2022-02-13 11:28:41 +08:00
|
|
|
from encoder import audio
|
2022-02-09 00:44:43 +08:00
|
|
|
from utils.f0_utils import compute_f0
|
|
|
|
|
|
|
|
from torch.multiprocessing import Pool, cpu_count
|
|
|
|
from functools import partial
|
|
|
|
|
|
|
|
SAMPLE_RATE=16000
|
|
|
|
|
|
|
|
def _compute_bnf(
|
|
|
|
wav: any,
|
|
|
|
output_fpath: str,
|
|
|
|
device: torch.device,
|
|
|
|
ppg_model_local: any,
|
|
|
|
):
|
|
|
|
"""
|
|
|
|
Compute CTC-Attention Seq2seq ASR encoder bottle-neck features (BNF).
|
|
|
|
"""
|
|
|
|
ppg_model_local.to(device)
|
|
|
|
wav_tensor = torch.from_numpy(wav).float().to(device).unsqueeze(0)
|
|
|
|
wav_length = torch.LongTensor([wav.shape[0]]).to(device)
|
|
|
|
with torch.no_grad():
|
|
|
|
bnf = ppg_model_local(wav_tensor, wav_length)
|
|
|
|
bnf_npy = bnf.squeeze(0).cpu().numpy()
|
|
|
|
np.save(output_fpath, bnf_npy, allow_pickle=False)
|
2022-02-20 11:56:58 +08:00
|
|
|
return bnf_npy, len(bnf_npy)
|
2022-02-09 00:44:43 +08:00
|
|
|
|
|
|
|
def _compute_f0_from_wav(wav, output_fpath):
|
|
|
|
"""Compute merged f0 values."""
|
|
|
|
f0 = compute_f0(wav, SAMPLE_RATE)
|
|
|
|
np.save(output_fpath, f0, allow_pickle=False)
|
2022-02-20 11:56:58 +08:00
|
|
|
return f0, len(f0)
|
2022-02-09 00:44:43 +08:00
|
|
|
|
2022-02-13 11:28:41 +08:00
|
|
|
def _compute_spkEmbed(wav, output_fpath, encoder_model_local, device):
|
|
|
|
Encoder.set_model(encoder_model_local)
|
|
|
|
# Compute where to split the utterance into partials and pad if necessary
|
|
|
|
wave_slices, mel_slices = Encoder.compute_partial_slices(len(wav), rate=1.3, min_pad_coverage=0.75)
|
|
|
|
max_wave_length = wave_slices[-1].stop
|
|
|
|
if max_wave_length >= len(wav):
|
|
|
|
wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant")
|
|
|
|
|
|
|
|
# Split the utterance into partials
|
|
|
|
frames = audio.wav_to_mel_spectrogram(wav)
|
|
|
|
frames_batch = np.array([frames[s] for s in mel_slices])
|
|
|
|
partial_embeds = Encoder.embed_frames_batch(frames_batch)
|
|
|
|
|
|
|
|
# Compute the utterance embedding from the partial embeddings
|
|
|
|
raw_embed = np.mean(partial_embeds, axis=0)
|
|
|
|
embed = raw_embed / np.linalg.norm(raw_embed, 2)
|
|
|
|
|
2022-02-09 00:44:43 +08:00
|
|
|
np.save(output_fpath, embed, allow_pickle=False)
|
2022-02-20 11:56:58 +08:00
|
|
|
return embed, len(embed)
|
2022-02-09 00:44:43 +08:00
|
|
|
|
2022-02-13 11:28:41 +08:00
|
|
|
def preprocess_one(wav_path, out_dir, device, ppg_model_local, encoder_model_local):
|
2022-02-20 11:56:58 +08:00
|
|
|
# wav = preprocess_wav(wav_path)
|
|
|
|
# try:
|
|
|
|
wav, sr = soundfile.read(wav_path)
|
|
|
|
if len(wav) < sr:
|
|
|
|
return None, sr, len(wav)
|
|
|
|
if sr != SAMPLE_RATE:
|
|
|
|
wav = resampy.resample(wav, sr, SAMPLE_RATE)
|
|
|
|
sr = SAMPLE_RATE
|
2022-02-09 00:44:43 +08:00
|
|
|
utt_id = os.path.basename(wav_path).rstrip(".wav")
|
|
|
|
|
2022-02-20 11:56:58 +08:00
|
|
|
_, length_bnf = _compute_bnf(output_fpath=f"{out_dir}/bnf/{utt_id}.ling_feat.npy", wav=wav, device=device, ppg_model_local=ppg_model_local)
|
|
|
|
_, length_f0 = _compute_f0_from_wav(output_fpath=f"{out_dir}/f0/{utt_id}.f0.npy", wav=wav)
|
|
|
|
_, length_embed = _compute_spkEmbed(output_fpath=f"{out_dir}/embed/{utt_id}.npy", device=device, encoder_model_local=encoder_model_local, wav=wav)
|
2022-02-09 00:44:43 +08:00
|
|
|
|
|
|
|
def preprocess_dataset(datasets_root, dataset, out_dir, n_processes, ppg_encoder_model_fpath, speaker_encoder_model):
|
|
|
|
# Glob wav files
|
|
|
|
wav_file_list = sorted(Path(f"{datasets_root}/{dataset}").glob("**/*.wav"))
|
|
|
|
print(f"Globbed {len(wav_file_list)} wav files.")
|
|
|
|
|
|
|
|
out_dir.joinpath("bnf").mkdir(exist_ok=True, parents=True)
|
|
|
|
out_dir.joinpath("f0").mkdir(exist_ok=True, parents=True)
|
|
|
|
out_dir.joinpath("embed").mkdir(exist_ok=True, parents=True)
|
2022-02-20 11:56:58 +08:00
|
|
|
ppg_model_local = load_model(ppg_encoder_model_fpath, "cpu")
|
2022-02-13 11:28:41 +08:00
|
|
|
encoder_model_local = Encoder.load_model(speaker_encoder_model, "cpu")
|
2022-02-09 00:44:43 +08:00
|
|
|
if n_processes is None:
|
|
|
|
n_processes = cpu_count()
|
|
|
|
|
|
|
|
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
2022-02-13 11:28:41 +08:00
|
|
|
func = partial(preprocess_one, out_dir=out_dir, ppg_model_local=ppg_model_local, encoder_model_local=encoder_model_local, device=device)
|
2022-02-09 00:44:43 +08:00
|
|
|
job = Pool(n_processes).imap(func, wav_file_list)
|
|
|
|
list(tqdm(job, "Preprocessing", len(wav_file_list), unit="wav"))
|
|
|
|
|
2022-02-20 11:56:58 +08:00
|
|
|
# finish processing and mark
|
2022-02-13 11:28:41 +08:00
|
|
|
t_fid_file = out_dir.joinpath("train_fidlist.txt").open("w", encoding="utf-8")
|
|
|
|
d_fid_file = out_dir.joinpath("dev_fidlist.txt").open("w", encoding="utf-8")
|
|
|
|
e_fid_file = out_dir.joinpath("eval_fidlist.txt").open("w", encoding="utf-8")
|
2022-02-20 11:56:58 +08:00
|
|
|
for file in sorted(out_dir.joinpath("f0").glob("*.npy")):
|
2022-02-27 13:25:58 +08:00
|
|
|
id = os.path.basename(file).split(".f0.npy")[0]
|
2022-02-20 11:56:58 +08:00
|
|
|
if id.endswith("01"):
|
2022-02-13 11:28:41 +08:00
|
|
|
d_fid_file.write(id + "\n")
|
2022-02-20 11:56:58 +08:00
|
|
|
elif id.endswith("09"):
|
2022-02-13 11:28:41 +08:00
|
|
|
e_fid_file.write(id + "\n")
|
|
|
|
else:
|
|
|
|
t_fid_file.write(id + "\n")
|
|
|
|
t_fid_file.close()
|
|
|
|
d_fid_file.close()
|
|
|
|
e_fid_file.close()
|