mirror of
https://github.com/babysor/MockingBird.git
synced 2024-03-22 13:11:31 +08:00
138 lines
5.7 KiB
Python
138 lines
5.7 KiB
Python
import librosa
|
|
import numpy as np
|
|
|
|
from models.encoder import inference as encoder
|
|
from utils import logmmse
|
|
from models.synthesizer import audio
|
|
from pathlib import Path
|
|
from pypinyin import Style
|
|
from pypinyin.contrib.neutral_tone import NeutralToneWith5Mixin
|
|
from pypinyin.converter import DefaultConverter
|
|
from pypinyin.core import Pinyin
|
|
import torch
|
|
from transformers import Wav2Vec2Processor
|
|
from .models.wav2emo import EmotionExtractorModel
|
|
|
|
# Sampling-rate constant declared next to the emotion model configuration.
SAMPLE_RATE = 16000

# Run on the GPU whenever torch reports one; otherwise stay on the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# Load the pretrained processor and emotion model from the hub by name, then
# move the model onto the selected device.
model_name = "audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = EmotionExtractorModel.from_pretrained(model_name)
model = model.to(device)
|
|
|
|
def extract_emo(
    x: np.ndarray,
    sampling_rate: int,
    embeddings: bool = False,
) -> np.ndarray:
    r"""Predict emotions or extract embeddings from a raw audio signal.

    Args:
        x: Audio samples handed to the module-level ``processor``.
        sampling_rate: Sampling rate forwarded to the ``processor``.
        embeddings: When true, element 0 of the model output is returned;
            otherwise element 1 is returned.

    Returns:
        The selected model output converted to a NumPy array.
    """
    # Take the first entry of the processor's ``input_values`` and move it to
    # the same device as the model.
    features = processor(x, sampling_rate=sampling_rate)["input_values"][0]
    inputs = torch.from_numpy(features).to(device)

    # Inference only, so gradients are not tracked.
    output_index = 0 if embeddings else 1
    with torch.no_grad():
        selected = model(inputs)[output_index]

    # Bring the result back to host memory as a NumPy array.
    return selected.detach().cpu().numpy()
|
|
|
|
class PinyinConverter(NeutralToneWith5Mixin, DefaultConverter):
    """pypinyin ``DefaultConverter`` combined with ``NeutralToneWith5Mixin``."""


# Module-level conversion callable: the ``pinyin`` method of a ``Pinyin``
# instance built on the converter class above.
pinyin = Pinyin(PinyinConverter()).pinyin
|
|
|
|
|
|
def _process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
                       skip_existing: bool, hparams, emotion_extract: bool):
    """Compute and store the mel spectrogram and waveform of one utterance.

    Args:
        wav: Waveform samples of the utterance.
        text: Transcript, passed through unchanged into the result.
        out_dir: Output root; files go to its ``mels`` and ``audio`` folders.
        basename: Identifier embedded in the output file names.
        skip_existing: When true, do nothing if both output files exist.
        hparams: Hyper-parameters; reads ``trim_silence``,
            ``utterance_min_duration``, ``sample_rate``, ``max_mel_frames``
            and ``clip_mels_length`` (and is forwarded to the mel computation).
        emotion_extract: Accepted for the caller's convenience; not used here.

    Returns:
        ``None`` when the utterance is skipped (already processed, too short,
        or too long). Otherwise the tuple ``(audio file name, mel file name,
        embed file name, wav, mel_frames, text)``.
    """
    ## FOR REFERENCE:
    # Notes for anyone changing this function or writing their own synthesizer.
    # - Both the audios and the mel spectrograms are saved as numpy arrays.
    # - No processing is done to the audios saved to disk beyond volume
    #   normalization (in split_on_silences).
    # - However, pre-emphasis is applied to the audios before computing the mel
    #   spectrogram. This is why it is re-applied on the audio on the vocoder side.
    # - Librosa pads the waveform before computing the mel spectrogram. Here, the
    #   waveform is saved without extra padding, so there is no exact relation
    #   between the length of the wav and of the mel spectrogram. See the vocoder
    #   data loader.

    mel_path = out_dir / "mels" / f"mel-{basename}.npy"
    audio_path = out_dir / "audio" / f"audio-{basename}.npy"

    # Nothing to do when both outputs are already on disk and skipping is on.
    if skip_existing and mel_path.exists() and audio_path.exists():
        return None

    # Optional silence trimming through the encoder's preprocessing.
    if hparams.trim_silence:
        wav = encoder.preprocess_wav(wav, normalize=False, trim_silence=True)

    # Reject utterances shorter than the configured minimum duration.
    min_samples = hparams.utterance_min_duration * hparams.sample_rate
    if len(wav) < min_samples:
        return None

    mel = audio.melspectrogram(wav, hparams).astype(np.float32)
    frame_count = mel.shape[1]

    # Reject utterances whose mel is too long, if length clipping is enabled.
    if frame_count > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    # Persist the transposed spectrogram and the waveform.
    np.save(mel_path, mel.T, allow_pickle=False)
    np.save(audio_path, wav, allow_pickle=False)

    # Describe this training example; the embed file name is only computed
    # here, the file itself is not written by this function.
    embed_name = f"embed-{basename}.npy"
    return audio_path.name, mel_path.name, embed_name, wav, frame_count, text
|
|
|
|
|
|
def _split_on_silences(wav_fpath, words, hparams):
    """Load one audio file, clean it up and convert its transcript to pinyin.

    Despite its name, this function does not split the audio: it loads the
    file, trims its edges, optionally rescales and denoises it, and converts
    the transcript to pinyin.

    Args:
        wav_fpath: Path of the audio file to load.
        words: Transcript text to convert to pinyin.
        hparams: Hyper-parameters; reads ``sample_rate``, ``rescale`` and
            ``rescaling_max``.

    Returns:
        A ``(wav, text)`` pair: the processed waveform and the transcript as
        space-separated ``Style.TONE3`` pinyin syllables (entries that are
        empty after stripping whitespace are dropped).
    """
    # Load at the configured sample rate and trim the leading/trailing edges.
    wav, _ = librosa.load(wav_fpath, sr=hparams.sample_rate)
    wav = librosa.effects.trim(wav, top_db=40, frame_length=2048, hop_length=1024)[0]

    if hparams.rescale:
        # Peak-normalize. An empty waveform has no maximum (NumPy raises) and
        # an all-zero one would divide 0 by 0 and fill the signal with NaNs,
        # so such waveforms are left untouched instead of being rescaled.
        peak = np.abs(wav).max() if wav.size else 0.0
        if peak > 0:
            wav = wav / peak * hparams.rescaling_max

    # denoise, we may not need it here.
    # Only done when the clip is longer than 0.4 s; the noise profile is
    # estimated from the first and last 0.15 s of the clip.
    if len(wav) > hparams.sample_rate * (0.3 + 0.1):
        edge = int(hparams.sample_rate * 0.15)
        noise_wav = np.concatenate([wav[:edge], wav[-edge:]])
        profile = logmmse.profile_noise(noise_wav, hparams.sample_rate)
        wav = logmmse.denoise(wav, profile, eta=0)

    # Convert the transcript to pinyin and keep the first candidate of each
    # entry, skipping entries that are blank.
    syllables = [
        candidates[0]
        for candidates in pinyin(words, style=Style.TONE3)
        if candidates[0].strip()
    ]

    return wav, " ".join(syllables)
|
|
|
|
def preprocess_general(speaker_dir, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool, emotion_extract: bool):
    """Preprocess every ``.wav``, ``.flac`` and ``.mp3`` file of one directory.

    Args:
        speaker_dir: Directory globbed (non-recursively) for audio files.
        out_dir: Output root handed to the per-utterance processing; emotion
            arrays are saved under its ``emo`` folder.
        skip_existing: Forwarded to the per-utterance processing; also skips
            emotion extraction when the emotion file already exists.
        hparams: Hyper-parameters forwarded to the helpers; ``sample_rate`` is
            read here for emotion extraction.
        dict_info: Mapping from file identifier to transcript.
        no_alignments: Accepted but not used by this function.
        emotion_extract: When true, emotion data is extracted and saved for
            each processed utterance.

    Returns:
        A list with one ``[audio file name, mel file name, embed file name,
        number of samples, mel frames, text]`` entry per processed utterance.
    """
    rows = []
    for pattern in ("*.wav", "*.flac", "*.mp3"):
        for audio_path in speaker_dir.glob(pattern):
            file_name = audio_path.name

            # Look the transcript up by the part of the name before the first
            # dot, then fall back to the full file name (with extension).
            transcript = dict_info.get(file_name.split(".")[0]) or dict_info.get(file_name)
            if not transcript:
                print("no wordS")
                continue

            utterance_id = f"{file_name}_{0:02d}"
            wav, text = _split_on_silences(audio_path, transcript, hparams)
            outcome = _process_utterance(wav, text, out_dir, utterance_id,
                                         skip_existing, hparams, emotion_extract)
            if outcome is None:
                continue
            audio_name, mel_name, embed_name, wav, mel_frames, text = outcome

            # Extract emotion data only on request, and not again when
            # skipping is enabled and the file is already present.
            emo_path = out_dir.joinpath("emo", "emo-%s.npy" % utterance_id)
            wants_emo = emotion_extract and not (skip_existing and emo_path.exists())
            if wants_emo and wav is not None:
                emo = extract_emo(np.expand_dims(wav, 0), hparams.sample_rate, True)
                np.save(emo_path, emo.squeeze(0), allow_pickle=False)

            rows.append([audio_name, mel_name, embed_name, len(wav), mel_frames, text])

    return [row for row in rows if row is not None]
|