# Mirror of https://github.com/babysor/MockingBird.git
# Synced 2024-03-22 13:11:31 +08:00
# 75 lines, 2.4 KiB, Python
from typing import Optional
from typing import Tuple

import librosa
import numpy as np
import torch

from .nets_utils import make_pad_mask


class LogMel(torch.nn.Module):
    """Convert STFT to fbank feats.

    The arguments are the same as those of ``librosa.filters.mel``.

    Args:
        fs: number > 0 [scalar] sampling rate of the incoming signal
        n_fft: int > 0 [scalar] number of FFT components
        n_mels: int > 0 [scalar] number of Mel bands to generate
        fmin: float >= 0 [scalar] lowest frequency (in Hz).
            If `None`, use `fmin = 0`
        fmax: float >= 0 [scalar] highest frequency (in Hz).
            If `None`, use `fmax = fs / 2.0`
        htk: use HTK formula instead of Slaney
        norm: {None, 1, np.inf} [scalar]
            if 1, divide the triangular mel weights by the width of the mel band
            (area normalization).  Otherwise, leave all the triangles aiming for
            a peak value of 1.0

    Buffers registered on the module (moved with ``.to()`` / saved in
    ``state_dict``):
        melmat: transposed mel filterbank, used by ``forward``.
        inv_melmat: transposed pseudo-inverse of the mel filterbank.
    """

    def __init__(
        self,
        fs: int = 16000,
        n_fft: int = 512,
        n_mels: int = 80,
        fmin: Optional[float] = None,
        fmax: Optional[float] = None,
        htk: bool = False,
        norm=1,
    ):
        super().__init__()

        # Resolve the "use the default" sentinels before they reach librosa.
        fmin = 0 if fmin is None else fmin
        fmax = fs / 2 if fmax is None else fmax
        _mel_options = dict(
            sr=fs, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax, htk=htk, norm=norm
        )
        # Kept on the instance so that extra_repr() can report the configuration.
        self.mel_options = _mel_options

        # Note(kamo): The mel matrix of librosa is different from kaldi.
        melmat = librosa.filters.mel(**_mel_options)
        # melmat: (D2, D1) -> (D1, D2), so that forward() can right-multiply.
        self.register_buffer("melmat", torch.from_numpy(melmat.T).float())
        # Pseudo-inverse of the filterbank, stored transposed like melmat.
        inv_mel = np.linalg.pinv(melmat)
        self.register_buffer("inv_melmat", torch.from_numpy(inv_mel.T).float())

    def extra_repr(self):
        """Return the mel options as ``key=value`` pairs for ``repr(module)``."""
        return ", ".join(f"{k}={v}" for k, v in self.mel_options.items())

    def forward(
        self,
        feat: torch.Tensor,
        ilens: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Project features onto the mel filterbank and take the logarithm.

        Args:
            feat: input features, (B, T, D1).
            ilens: per-utterance lengths used to zero out padded positions
                (presumably measured along the T axis -- confirm against
                ``make_pad_mask``).  If `None`, nothing is masked and every
                utterance is reported with length ``feat.size(1)``.

        Returns:
            Tuple of the log-mel features, (B, T, D2), and the lengths.
        """
        # feat: (B, T, D1) x melmat: (D1, D2) -> mel_feat: (B, T, D2)
        mel_feat = torch.matmul(feat, self.melmat)

        # Small offset keeps log() finite when a mel bin is exactly zero.
        logmel_feat = (mel_feat + 1e-20).log()
        # Zero padding
        if ilens is not None:
            logmel_feat = logmel_feat.masked_fill(
                make_pad_mask(ilens, logmel_feat, 1), 0.0
            )
        else:
            # No lengths given: treat every sequence as full length.
            ilens = feat.new_full(
                [feat.size(0)], fill_value=feat.size(1), dtype=torch.long
            )
        return logmel_feat, ilens