MockingBird/models/ppg_extractor/log_mel.py

import librosa
import numpy as np
import torch
from typing import Tuple

from .nets_utils import make_pad_mask


class LogMel(torch.nn.Module):
    """Project STFT-domain features onto a mel filterbank and take the log.

    The constructor arguments are forwarded to ``librosa.filters.mel``
    (``fs`` is passed as ``sr``); the resulting filterbank and its
    pseudo-inverse are stored as the buffers ``melmat`` and ``inv_melmat``.

    Args:
        fs: sampling rate of the incoming signal (number > 0).
        n_fft: number of FFT components (int > 0).
        n_mels: number of mel bands to generate (int > 0).
        fmin: lowest frequency in Hz (float >= 0). ``None`` is replaced by 0.
        fmax: highest frequency in Hz (float >= 0). ``None`` is replaced by
            ``fs / 2``.
        htk: use the HTK formula instead of Slaney.
        norm: normalization option handed to ``librosa.filters.mel`` as-is.
            NOTE(review): how a numeric ``norm=1`` is interpreted may depend
            on the installed librosa version -- confirm before upgrading.

    """

    def __init__(
        self,
        fs: int = 16000,
        n_fft: int = 512,
        n_mels: int = 80,
        fmin: float = None,
        fmax: float = None,
        htk: bool = False,
        norm=1,
    ):
        super().__init__()

        # Resolve the "use the default" sentinels before building the options.
        if fmin is None:
            fmin = 0
        if fmax is None:
            fmax = fs / 2

        # Kept on the instance so extra_repr() can report the configuration.
        self.mel_options = {
            "sr": fs,
            "n_fft": n_fft,
            "n_mels": n_mels,
            "fmin": fmin,
            "fmax": fmax,
            "htk": htk,
            "norm": norm,
        }

        # Note(kamo): The mel matrix of librosa is different from kaldi.
        filterbank = librosa.filters.mel(**self.mel_options)
        pseudo_inverse = np.linalg.pinv(filterbank)

        # Both matrices are stored transposed, (D2, D1) -> (D1, D2) for the
        # filterbank, so that forward() can right-multiply the features.
        self.register_buffer("melmat", torch.from_numpy(filterbank.T).float())
        self.register_buffer(
            "inv_melmat", torch.from_numpy(pseudo_inverse.T).float()
        )

    def extra_repr(self):
        """Return the mel options as a ``key=value, ...`` string."""
        described = [f"{k}={v}" for k, v in self.mel_options.items()]
        return ", ".join(described)

    def forward(
        self, feat: torch.Tensor, ilens: torch.Tensor = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Compute log-mel features from ``feat``.

        Args:
            feat: input features, expected as (B, T, D1) so that they can be
                multiplied by ``melmat`` of shape (D1, D2).
            ilens: optional per-sequence lengths. When given, the positions
                selected by ``make_pad_mask(ilens, logmel_feat, 1)`` are set
                to 0.0 in the output.

        Returns:
            A tuple ``(logmel_feat, ilens)``. ``logmel_feat`` is (B, T, D2).
            ``ilens`` is the argument itself when provided; otherwise a long
            tensor of size ``feat.size(0)`` filled with ``feat.size(1)``.
        """
        # (B, T, D1) @ (D1, D2) -> (B, T, D2); the small offset keeps log()
        # finite when a mel bin is exactly zero.
        log_mel = torch.log(feat @ self.melmat + 1e-20)

        if ilens is None:
            # No lengths supplied: report every sequence as full length.
            full_lengths = feat.new_full(
                (feat.size(0),), feat.size(1), dtype=torch.long
            )
            return log_mel, full_lengths

        # Zero padding
        pad_mask = make_pad_mask(ilens, log_mel, 1)
        return log_mel.masked_fill(pad_mask, 0.0), ilens
Init ppg extractor and ppg2mel (#375) * Init ppg extractor and ppg2mel * add preprocess and training * FIx known issues * Update __init__.py Allow to gen audio * Fix length issue * Fix bug of preparing fid * Fix sample issues * Add UI usage of PPG-vc 2022-03-03 23:38:12 +08:00			`import librosa`
			`import numpy as np`
			`import torch`
			`from typing import Tuple`

			`from .nets_utils import make_pad_mask`


			`class LogMel(torch.nn.Module):`
			`"""Convert STFT to fbank feats`

			`The arguments is same as librosa.filters.mel`

			`Args:`
			`fs: number > 0 [scalar] sampling rate of the incoming signal`
			`n_fft: int > 0 [scalar] number of FFT components`
			`n_mels: int > 0 [scalar] number of Mel bands to generate`
			`fmin: float >= 0 [scalar] lowest frequency (in Hz)`
			`fmax: float >= 0 [scalar] highest frequency (in Hz).`
			If `None`, use `fmax = fs / 2.0`
			`htk: use HTK formula instead of Slaney`
			`norm: {None, 1, np.inf} [scalar]`
			`if 1, divide the triangular mel weights by the width of the mel band`
			`(area normalization). Otherwise, leave all the triangles aiming for`
			`a peak value of 1.0`

			`"""`

			`def __init__(`
			`self,`
			`fs: int = 16000,`
			`n_fft: int = 512,`
			`n_mels: int = 80,`
			`fmin: float = None,`
			`fmax: float = None,`
			`htk: bool = False,`
			`norm=1,`
			`):`
			`super().__init__()`

			`fmin = 0 if fmin is None else fmin`
			`fmax = fs / 2 if fmax is None else fmax`
			`_mel_options = dict(`
			`sr=fs, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax, htk=htk, norm=norm`
			`)`
			`self.mel_options = _mel_options`

			`# Note(kamo): The mel matrix of librosa is different from kaldi.`
			`melmat = librosa.filters.mel(**_mel_options)`
			`# melmat: (D2, D1) -> (D1, D2)`
			`self.register_buffer("melmat", torch.from_numpy(melmat.T).float())`
			`inv_mel = np.linalg.pinv(melmat)`
			`self.register_buffer("inv_melmat", torch.from_numpy(inv_mel.T).float())`

			`def extra_repr(self):`
			`return ", ".join(f"{k}={v}" for k, v in self.mel_options.items())`

			`def forward(`
			`self, feat: torch.Tensor, ilens: torch.Tensor = None,`
			`) -> Tuple[torch.Tensor, torch.Tensor]:`
			`# feat: (B, T, D1) x melmat: (D1, D2) -> mel_feat: (B, T, D2)`
			`mel_feat = torch.matmul(feat, self.melmat)`

			`logmel_feat = (mel_feat + 1e-20).log()`
			`# Zero padding`
			`if ilens is not None:`
			`logmel_feat = logmel_feat.masked_fill(`
			`make_pad_mask(ilens, logmel_feat, 1), 0.0`
			`)`
			`else:`
			`ilens = feat.new_full(`
			`[feat.size(0)], fill_value=feat.size(1), dtype=torch.long`
			`)`
			`return logmel_feat, ilens`