# MockingBird/models/synthesizer/preprocess_audio.py

import librosa
import numpy as np

from models.encoder import inference as encoder
from utils import logmmse
from models.synthesizer import audio
from pathlib import Path
from pypinyin import Style
from pypinyin.contrib.neutral_tone import NeutralToneWith5Mixin
from pypinyin.converter import DefaultConverter
from pypinyin.core import Pinyin
import torch
from transformers import Wav2Vec2Processor
from .models.wav2emo import EmotionExtractorModel

# NOTE(review): not referenced anywhere in the visible code; presumably the
# sampling rate (Hz) expected by the emotion model - confirm before relying on it.
SAMPLE_RATE = 16000

# Emotion model setup, executed once when this module is imported.
# The processor and the model are both loaded from the same pretrained
# identifier and the model is moved to the GPU when one is available.
# NOTE(review): `from_pretrained` presumably downloads/caches the weights on
# first use, which makes importing this module expensive - confirm.
device = 'cuda' if torch.cuda.is_available() else "cpu"
model_name = 'audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim'
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = EmotionExtractorModel.from_pretrained(model_name).to(device)

def extract_emo(
    x: np.ndarray,
    sampling_rate: int,
    embeddings: bool = False,
) -> np.ndarray:
    r"""Predict emotions or extract embeddings from a raw audio signal.

    Args:
        x: Raw waveform samples, handed to the module-level ``processor``.
        sampling_rate: Sampling rate of ``x``, forwarded to the processor.
        embeddings: When ``True`` element 0 of the model output is returned,
            otherwise element 1. NOTE(review): presumably element 0 holds the
            embeddings and element 1 the emotion predictions - confirm
            against ``EmotionExtractorModel``.

    Returns:
        The selected model output converted to a NumPy array on the CPU.
    """
    features = processor(x, sampling_rate=sampling_rate)

    # The processor always returns a batch; only its first entry is used.
    # That entry may be a bare 1-D signal or an already batched 2-D array
    # (this depends on how the processor interprets the shape of ``x``), while
    # the model needs a (batch, samples) tensor. Normalise both cases here.
    signal = np.atleast_2d(features['input_values'][0])
    signal = torch.from_numpy(signal).to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(signal)
    selected = outputs[0 if embeddings else 1]

    # Move the result back to host memory as a NumPy array.
    return selected.detach().cpu().numpy()

class PinyinConverter(NeutralToneWith5Mixin, DefaultConverter):
    """pypinyin's default converter extended with ``NeutralToneWith5Mixin``.

    No behaviour is added here; the class only combines the two bases.
    NOTE(review): as the mixin's name suggests, it presumably marks
    neutral-tone syllables with the digit 5 - confirm in the pypinyin docs.
    """

# Module-level conversion function: the bound `pinyin` method of a `Pinyin`
# instance that uses the custom PinyinConverter defined above.
pinyin = Pinyin(PinyinConverter()).pinyin


def _process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
                      skip_existing: bool, hparams, emotion_extract: bool):
    """Compute the mel spectrogram of one utterance and store it with its audio.

    Notes for anyone changing this function or writing their own synthesizer
    (carried over from the original author):
    - Audio and mel spectrogram are both saved as numpy arrays.
    - The audio written to disk receives no processing beyond the volume
      normalization done in split_on_silences.
    - Pre-emphasis is applied to the audio before the mel spectrogram is
      computed, which is why it is re-applied on the vocoder side.
    - Librosa pads the waveform before computing the mel spectrogram, while the
      waveform is saved without that padding, so wav length and mel length are
      not exactly related. See the vocoder data loader.

    Args:
        wav: Waveform samples of the utterance.
        text: Transcript, returned unchanged.
        out_dir: Dataset root; files go to its "mels" and "audio" sub-folders.
        basename: Identifier used to build the output file names.
        skip_existing: Return early when both output files already exist.
        hparams: Object providing trim_silence, utterance_min_duration,
            sample_rate, max_mel_frames and clip_mels_length.
        emotion_extract: Accepted but not used by this function.

    Returns:
        None when the utterance is skipped (already processed, too short or
        too long); otherwise the tuple (audio file name, mel file name,
        embed file name, wav, mel frame count, text).
    """
    mel_fpath = out_dir / "mels" / ("mel-%s.npy" % basename)
    wav_fpath = out_dir / "audio" / ("audio-%s.npy" % basename)

    # Nothing to do when both outputs are already on disk.
    if skip_existing:
        if mel_fpath.exists() and wav_fpath.exists():
            return None

    # Optional silence trimming through the encoder's preprocessing.
    if hparams.trim_silence:
        wav = encoder.preprocess_wav(wav, normalize=False, trim_silence=True)

    # Reject utterances shorter than the configured minimum duration.
    min_samples = hparams.utterance_min_duration * hparams.sample_rate
    if len(wav) < min_samples:
        return None

    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    # Reject utterances with too many frames when clipping is enabled.
    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    # Persist the transposed spectrogram and the waveform.
    np.save(mel_fpath, mel_spectrogram.T, allow_pickle=False)
    np.save(wav_fpath, wav, allow_pickle=False)

    embed_name = "embed-%s.npy" % basename
    return wav_fpath.name, mel_fpath.name, embed_name, wav, mel_frames, text


def _split_on_silences(wav_fpath, words, hparams):
    """Load one audio file, clean it up and convert its transcript to pinyin.

    Despite its name, this function does not split the audio: it trims
    leading/trailing silence, optionally rescales the peak amplitude and
    denoises clips that are long enough.

    Args:
        wav_fpath: Path of the audio file to load.
        words: Transcript of the utterance, converted with ``pinyin``.
        hparams: Object providing sample_rate, rescale and rescaling_max.

    Returns:
        A tuple ``(wav, text)``: the processed waveform and the
        space-separated pinyin syllables (TONE3 style).
    """
    # Load the audio waveform at the configured sampling rate
    wav, _ = librosa.load(wav_fpath, sr=hparams.sample_rate)
    wav = librosa.effects.trim(wav, top_db=40, frame_length=2048, hop_length=1024)[0]

    if hparams.rescale:
        # Guard the peak normalization: an empty array has no maximum and an
        # all-zero signal would be turned into NaNs by the division.
        peak = np.abs(wav).max() if wav.size else 0.0
        if peak > 0:
            wav = wav / peak * hparams.rescaling_max

    # denoise, we may not need it here.
    # Only clips longer than 0.4 s are denoised; the noise profile is taken
    # from the first and last 0.15 s of the clip.
    if len(wav) > hparams.sample_rate * (0.3 + 0.1):
        edge = int(hparams.sample_rate * 0.15)
        noise_wav = np.concatenate([wav[:edge], wav[-edge:]])
        profile = logmmse.profile_noise(noise_wav, hparams.sample_rate)
        wav = logmmse.denoise(wav, profile, eta=0)

    # Keep the first reading of every item and drop whitespace-only ones.
    resp = pinyin(words, style=Style.TONE3)
    res = " ".join(v[0] for v in resp if v[0].strip())

    return wav, res

def preprocess_general(speaker_dir, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool, emotion_extract: bool):
    """Preprocess every audio file found directly inside a speaker directory.

    Args:
        speaker_dir: Directory searched for "*.wav", "*.flac" and "*.mp3" files.
        out_dir: Dataset root passed to _process_utterance; emotion arrays are
            written to its "emo" sub-folder.
        skip_existing: Skip outputs that are already on disk.
        hparams: Hyper-parameter object forwarded to the helpers; its
            sample_rate is also used for emotion extraction.
        dict_info: Mapping from file name (without or with extension) to the
            transcript of that file.
        no_alignments: Accepted but not used by this function.
        emotion_extract: Whether to compute and save emotion arrays.

    Returns:
        A list with one entry per processed utterance:
        [audio file name, mel file name, embed file name, number of samples,
        number of mel frames, text].
    """
    metadata = []
    audio_files = (
        fpath
        for pattern in ("*.wav", "*.flac", "*.mp3")
        for fpath in speaker_dir.glob(pattern)
    )
    for wav_fpath in audio_files:
        file_name = wav_fpath.name
        # Transcript lookup: first by the name up to the first dot, then by
        # the full file name including its extension.
        words = dict_info.get(file_name.split(".")[0]) or dict_info.get(file_name)
        if not words:
            print("no wordS")
            continue

        sub_basename = "%s_%02d" % (file_name, 0)
        wav, text = _split_on_silences(wav_fpath, words, hparams)
        processed = _process_utterance(wav, text, out_dir, sub_basename,
                                       skip_existing, hparams, emotion_extract)
        if processed is None:
            continue
        wav_fpath_name, mel_fpath_name, embed_fpath_name, wav, mel_frames, text = processed

        # Emotion arrays are only produced on request and, when skipping is
        # enabled, only if they are not on disk yet.
        emo_fpath = out_dir / "emo" / ("emo-%s.npy" % sub_basename)
        wants_emo = emotion_extract and not (skip_existing and emo_fpath.exists())
        if wants_emo and wav is not None:
            emo = extract_emo(np.expand_dims(wav, 0), hparams.sample_rate, True)
            np.save(emo_fpath, emo.squeeze(0), allow_pickle=False)

        metadata.append([wav_fpath_name, mel_fpath_name, embed_fpath_name,
                         len(wav), mel_frames, text])
    return [entry for entry in metadata if entry is not None]