"""Wav2Vec2-based speech emotion model with a regression head."""

import torch
import torch.nn as nn
from transformers.models.wav2vec2.modeling_wav2vec2 import (
    Wav2Vec2Model,
    Wav2Vec2PreTrainedModel,
)

class RegressionHead(nn.Module):
    r"""Regression head projecting pooled hidden states to output values.

    Follows the standard transformers head layout:
    dropout -> dense -> tanh -> dropout -> output projection.
    (The original docstring said "Classification head", which misdescribed
    this regression head.)
    """

    def __init__(self, config):
        """Build the head from a Wav2Vec2-style config.

        Args:
            config: object exposing ``hidden_size``, ``final_dropout``
                and ``num_labels`` attributes.
        """
        super().__init__()

        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.final_dropout)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, **kwargs):
        """Map ``features`` of shape (batch, hidden_size) to (batch, num_labels).

        ``**kwargs`` is accepted for call-site compatibility and ignored.
        """
        x = self.dropout(features)
        x = torch.tanh(self.dense(x))
        x = self.dropout(x)
        return self.out_proj(x)
class EmotionExtractorModel(Wav2Vec2PreTrainedModel):
    r"""Speech emotion classifier.

    Wraps a Wav2Vec2 backbone and feeds the time-averaged hidden states
    through a small regression head.
    """

    def __init__(self, config):
        super().__init__(config)

        self.config = config
        self.wav2vec2 = Wav2Vec2Model(config)
        self.classifier = RegressionHead(config)
        self.init_weights()

    def forward(self, input_values):
        """Return ``(pooled_states, logits)`` for a batch of raw waveforms.

        The backbone's last hidden states are mean-pooled over the time
        axis before being passed to the head.
        """
        encoder_out = self.wav2vec2(input_values)
        pooled = encoder_out[0].mean(dim=1)
        predictions = self.classifier(pooled)
        return pooled, predictions