2022-03-03 23:38:12 +08:00
|
|
|
import librosa
|
|
|
|
import numpy as np
|
|
|
|
import torch
|
|
|
|
from typing import Tuple
|
|
|
|
|
|
|
|
from .nets_utils import make_pad_mask
|
|
|
|
|
|
|
|
|
|
|
|
class LogMel(torch.nn.Module):
    """Convert STFT to fbank feats.

    The arguments are the same as those of librosa.filters.mel.

    Args:
        fs: number > 0 [scalar] sampling rate of the incoming signal
        n_fft: int > 0 [scalar] number of FFT components
        n_mels: int > 0 [scalar] number of Mel bands to generate
        fmin: float >= 0 [scalar] lowest frequency (in Hz)
        fmax: float >= 0 [scalar] highest frequency (in Hz).
            If `None`, use `fmax = fs / 2.0`
        htk: use HTK formula instead of Slaney
        norm: {None, 1, np.inf} [scalar]
            if 1, divide the triangular mel weights by the width of the mel band
            (area normalization). Otherwise, leave all the triangles aiming for
            a peak value of 1.0

    Buffers:
        melmat: transposed mel filterbank, float32, used by `forward`.
        inv_melmat: transposed pseudo-inverse of the mel filterbank, float32.
            It is registered here but not used by any method of this class.
    """

    def __init__(
        self,
        fs: int = 16000,
        n_fft: int = 512,
        n_mels: int = 80,
        fmin: float = 0,
        fmax: float = None,
        htk: bool = False,
        norm=1,
    ):
        super().__init__()

        fmax = fs / 2 if fmax is None else fmax
        _mel_options = dict(
            sr=fs, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax, htk=htk, norm=norm
        )
        # Kept on the instance so that extra_repr() can show the configuration.
        self.mel_options = _mel_options

        # Note(kamo): The mel matrix of librosa is different from kaldi.
        melmat = librosa.filters.mel(**_mel_options)
        # melmat: (D2, D1) -> (D1, D2)
        self.register_buffer("melmat", torch.from_numpy(melmat.T).float())
        # pinv(melmat): (D1, D2) -> (D2, D1)
        inv_mel = np.linalg.pinv(melmat)
        self.register_buffer("inv_melmat", torch.from_numpy(inv_mel.T).float())

    def extra_repr(self):
        """Return the mel options as a ``key=value, ...`` string for repr()."""
        return ", ".join(f"{k}={v}" for k, v in self.mel_options.items())

    def forward(
        self, feat: torch.Tensor, ilens: torch.Tensor = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Project spectral features onto the mel filterbank and take the log.

        Args:
            feat: input features, (B, T, D1).
            ilens: lengths of the sequences in the batch. If `None`, every
                sequence is treated as having the full length `T` and no
                masking is applied.

        Returns:
            Tuple of the log-mel features, (B, T, D2), and the lengths. The
            lengths are `ilens` unchanged when it was given, otherwise a new
            long tensor of size (B,) filled with `T`.
        """
        # feat: (B, T, D1) x melmat: (D1, D2) -> mel_feat: (B, T, D2)
        mel_feat = torch.matmul(feat, self.melmat)

        # Floor added before the log so that empty bins do not give -inf.
        # 1e-20 is kept for float32/float64/bfloat16, but it underflows to 0
        # in float16, so the floor is raised to the smallest positive normal
        # number of the working dtype when that is larger.
        eps = max(1e-20, torch.finfo(mel_feat.dtype).tiny)
        logmel_feat = (mel_feat + eps).log()

        # Zero padding
        if ilens is not None:
            # NOTE(review): make_pad_mask is a project helper; presumably it
            # marks the frames beyond each length along dim 1 - confirm.
            logmel_feat = logmel_feat.masked_fill(
                make_pad_mask(ilens, logmel_feat, 1), 0.0
            )
        else:
            ilens = feat.new_full(
                [feat.size(0)], fill_value=feat.size(1), dtype=torch.long
            )
        return logmel_feat, ilens
|