MockingBird/models/synthesizer/preprocess_audio.py

138 lines
5.4 KiB
Python

import librosa
import numpy as np
from models.encoder import inference as encoder
from utils import logmmse
from models.synthesizer import audio
from pathlib import Path
from pypinyin import Style
from pypinyin.contrib.neutral_tone import NeutralToneWith5Mixin
from pypinyin.converter import DefaultConverter
from pypinyin.core import Pinyin
import torch
from transformers import Wav2Vec2Processor
from .models.wav2emo import EmotionExtractorModel
SAMPLE_RATE = 16000
# load model from hub
device = 'cuda' if torch.cuda.is_available() else "cpu"
model_name = 'audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim'
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = EmotionExtractorModel.from_pretrained(model_name).to(device)
embs = []
wavnames = []
def extract_emo(
x: np.ndarray,
sampling_rate: int,
embeddings: bool = False,
) -> np.ndarray:
r"""Predict emotions or extract embeddings from raw audio signal."""
y = processor(x, sampling_rate=sampling_rate)
y = y['input_values'][0]
y = torch.from_numpy(y).to(device)
# run through model
with torch.no_grad():
y = model(y)[0 if embeddings else 1]
# convert to numpy
y = y.detach().cpu().numpy()
return y
class PinyinConverter(NeutralToneWith5Mixin, DefaultConverter):
pass
pinyin = Pinyin(PinyinConverter()).pinyin
def _process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
skip_existing: bool, hparams, emotion_extract: bool):
## FOR REFERENCE:
# For you not to lose your head if you ever wish to change things here or implement your own
# synthesizer.
# - Both the audios and the mel spectrograms are saved as numpy arrays
# - There is no processing done to the audios that will be saved to disk beyond volume
# normalization (in split_on_silences)
# - However, pre-emphasis is applied to the audios before computing the mel spectrogram. This
# is why we re-apply it on the audio on the side of the vocoder.
# - Librosa pads the waveform before computing the mel spectrogram. Here, the waveform is saved
# without extra padding. This means that you won't have an exact relation between the length
# of the wav and of the mel spectrogram. See the vocoder data loader.
# Skip existing utterances if needed
mel_fpath = out_dir.joinpath("mels", "mel-%s.npy" % basename)
wav_fpath = out_dir.joinpath("audio", "audio-%s.npy" % basename)
emo_fpath = out_dir.joinpath("emo", "emo-%s.npy" % basename)
skip_emo_extract = not emotion_extract or (skip_existing and emo_fpath.exists())
if skip_existing and mel_fpath.exists() and wav_fpath.exists() and skip_emo_extract:
return None
# Trim silence
if hparams.trim_silence:
wav = encoder.preprocess_wav(wav, normalize=False, trim_silence=True)
# Skip utterances that are too short
if len(wav) < hparams.utterance_min_duration * hparams.sample_rate:
return None
# Compute the mel spectrogram
mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
mel_frames = mel_spectrogram.shape[1]
# Skip utterances that are too long
if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
return None
# Write the spectrogram, embed and audio to disk
np.save(mel_fpath, mel_spectrogram.T, allow_pickle=False)
np.save(wav_fpath, wav, allow_pickle=False)
if not skip_emo_extract:
emo = extract_emo(np.expand_dims(wav, 0), hparams.sample_rate, True)
np.save(emo_fpath, emo, allow_pickle=False)
# Return a tuple describing this training example
return wav_fpath.name, mel_fpath.name, "embed-%s.npy" % basename, len(wav), mel_frames, text
def _split_on_silences(wav_fpath, words, hparams):
# Load the audio waveform
wav, _ = librosa.load(wav_fpath, sr= hparams.sample_rate)
wav = librosa.effects.trim(wav, top_db= 40, frame_length=2048, hop_length=512)[0]
if hparams.rescale:
wav = wav / np.abs(wav).max() * hparams.rescaling_max
# denoise, we may not need it here.
if len(wav) > hparams.sample_rate*(0.3+0.1):
noise_wav = np.concatenate([wav[:int(hparams.sample_rate*0.15)],
wav[-int(hparams.sample_rate*0.15):]])
profile = logmmse.profile_noise(noise_wav, hparams.sample_rate)
wav = logmmse.denoise(wav, profile, eta=0)
resp = pinyin(words, style=Style.TONE3)
res = [v[0] for v in resp if v[0].strip()]
res = " ".join(res)
return wav, res
def preprocess_general(speaker_dir, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool, emotion_extract: bool):
metadata = []
extensions = ["*.wav", "*.flac", "*.mp3"]
for extension in extensions:
wav_fpath_list = speaker_dir.glob(extension)
# Iterate over each wav
for wav_fpath in wav_fpath_list:
words = dict_info.get(wav_fpath.name.split(".")[0])
words = dict_info.get(wav_fpath.name) if not words else words # try with extension
if not words:
print("no wordS")
continue
sub_basename = "%s_%02d" % (wav_fpath.name, 0)
wav, text = _split_on_silences(wav_fpath, words, hparams)
metadata.append(_process_utterance(wav, text, out_dir, sub_basename,
skip_existing, hparams, emotion_extract))
return [m for m in metadata if m is not None]