mirror of
https://github.com/babysor/MockingBird.git
synced 2024-03-22 13:11:31 +08:00
Add web gui of training and reconstruct taco model methods
This commit is contained in:
parent
a39b6d3117
commit
6f023e313d
|
@ -815,6 +815,9 @@ def getOpyrator(mode: str) -> Opyrator:
|
|||
if mode == None or mode.startswith('模型训练'):
|
||||
from mkgui.train import train
|
||||
return Opyrator(train)
|
||||
if mode == None or mode.startswith('模型训练(VC)'):
|
||||
from mkgui.train_vc import train_vc
|
||||
return Opyrator(train_vc)
|
||||
from mkgui.app import synthesize
|
||||
return Opyrator(synthesize)
|
||||
|
||||
|
@ -829,7 +832,7 @@ def render_streamlit_ui() -> None:
|
|||
with st.spinner("Loading MockingBird GUI. Please wait..."):
|
||||
session_state.mode = st.sidebar.selectbox(
|
||||
'模式选择',
|
||||
( "AI拟音", "VC拟音", "预处理", "模型训练")
|
||||
( "AI拟音", "VC拟音", "预处理", "模型训练", "模型训练(VC)")
|
||||
)
|
||||
if "mode" in session_state:
|
||||
mode = session_state.mode
|
||||
|
|
148
mkgui/train.py
148
mkgui/train.py
|
@ -2,66 +2,55 @@ from pydantic import BaseModel, Field
|
|||
import os
|
||||
from pathlib import Path
|
||||
from enum import Enum
|
||||
from typing import Any, Tuple
|
||||
import numpy as np
|
||||
from utils.load_yaml import HpsYaml
|
||||
from utils.util import AttrDict
|
||||
import torch
|
||||
from typing import Any
|
||||
from synthesizer.hparams import hparams
|
||||
from synthesizer.train import train as synt_train
|
||||
|
||||
# TODO: seperator for *unix systems
|
||||
# Constants
|
||||
EXT_MODELS_DIRT = f"ppg_extractor{os.sep}saved_models"
|
||||
CONV_MODELS_DIRT = f"ppg2mel{os.sep}saved_models"
|
||||
SYN_MODELS_DIRT = f"synthesizer{os.sep}saved_models"
|
||||
ENC_MODELS_DIRT = f"encoder{os.sep}saved_models"
|
||||
|
||||
|
||||
if os.path.isdir(EXT_MODELS_DIRT):
|
||||
extractors = Enum('extractors', list((file.name, file) for file in Path(EXT_MODELS_DIRT).glob("**/*.pt")))
|
||||
print("Loaded extractor models: " + str(len(extractors)))
|
||||
else:
|
||||
raise Exception(f"Model folder {EXT_MODELS_DIRT} doesn't exist.")
|
||||
# EXT_MODELS_DIRT = f"ppg_extractor{os.sep}saved_models"
|
||||
# CONV_MODELS_DIRT = f"ppg2mel{os.sep}saved_models"
|
||||
# ENC_MODELS_DIRT = f"encoder{os.sep}saved_models"
|
||||
|
||||
if os.path.isdir(CONV_MODELS_DIRT):
|
||||
convertors = Enum('convertors', list((file.name, file) for file in Path(CONV_MODELS_DIRT).glob("**/*.pth")))
|
||||
print("Loaded convertor models: " + str(len(convertors)))
|
||||
# Pre-Load models
|
||||
if os.path.isdir(SYN_MODELS_DIRT):
|
||||
synthesizers = Enum('synthesizers', list((file.name, file) for file in Path(SYN_MODELS_DIRT).glob("**/*.pt")))
|
||||
print("Loaded synthesizer models: " + str(len(synthesizers)))
|
||||
else:
|
||||
raise Exception(f"Model folder {CONV_MODELS_DIRT} doesn't exist.")
|
||||
raise Exception(f"Model folder {SYN_MODELS_DIRT} doesn't exist.")
|
||||
|
||||
if os.path.isdir(ENC_MODELS_DIRT):
|
||||
encoders = Enum('encoders', list((file.name, file) for file in Path(ENC_MODELS_DIRT).glob("**/*.pt")))
|
||||
encoders = Enum('encoders', list((file.name, file) for file in Path(ENC_MODELS_DIRT).glob("**/*.pt")))
|
||||
print("Loaded encoders models: " + str(len(encoders)))
|
||||
else:
|
||||
raise Exception(f"Model folder {ENC_MODELS_DIRT} doesn't exist.")
|
||||
|
||||
class Model(str, Enum):
|
||||
VC_PPG2MEL = "ppg2mel"
|
||||
|
||||
class Dataset(str, Enum):
|
||||
AIDATATANG_200ZH = "aidatatang_200zh"
|
||||
AIDATATANG_200ZH_S = "aidatatang_200zh_s"
|
||||
DEFAULT = "default"
|
||||
|
||||
class Input(BaseModel):
|
||||
# def render_input_ui(st, input) -> Dict:
|
||||
# input["selected_dataset"] = st.selectbox(
|
||||
# '选择数据集',
|
||||
# ("aidatatang_200zh", "aidatatang_200zh_s")
|
||||
# )
|
||||
# return input
|
||||
model: Model = Field(
|
||||
Model.VC_PPG2MEL, title="模型类型",
|
||||
Model.DEFAULT, title="模型类型",
|
||||
)
|
||||
# datasets_root: str = Field(
|
||||
# ..., alias="预处理数据根目录", description="输入目录(相对/绝对),不适用于ppg2mel模型",
|
||||
# format=True,
|
||||
# example="..\\trainning_data\\"
|
||||
# )
|
||||
output_root: str = Field(
|
||||
..., alias="输出目录(可选)", description="建议不填,保持默认",
|
||||
input_root: str = Field(
|
||||
..., alias="输入目录", description="预处理数据根目录",
|
||||
format=True,
|
||||
example=""
|
||||
example=f"..{os.sep}audiodata{os.sep}SV2TTS{os.sep}synthesizer"
|
||||
)
|
||||
continue_mode: bool = Field(
|
||||
True, alias="继续训练模式", description="选择“是”,则从下面选择的模型中继续训练",
|
||||
run_id: str = Field(
|
||||
"", alias="新模型名/运行ID", description="使用新ID进行重新训练,否则选择下面的模型进行继续训练",
|
||||
)
|
||||
synthesizer: synthesizers = Field(
|
||||
..., alias="已有合成模型",
|
||||
description="选择语音合成模型文件."
|
||||
)
|
||||
gpu: bool = Field(
|
||||
True, alias="GPU训练", description="选择“是”,则使用GPU训练",
|
||||
|
@ -69,32 +58,18 @@ class Input(BaseModel):
|
|||
verbose: bool = Field(
|
||||
True, alias="打印详情", description="选择“是”,输出更多详情",
|
||||
)
|
||||
# TODO: Move to hiden fields by default
|
||||
convertor: convertors = Field(
|
||||
..., alias="转换模型",
|
||||
description="选择语音转换模型文件."
|
||||
)
|
||||
extractor: extractors = Field(
|
||||
..., alias="特征提取模型",
|
||||
description="选择PPG特征提取模型文件."
|
||||
)
|
||||
encoder: encoders = Field(
|
||||
..., alias="语音编码模型",
|
||||
description="选择语音编码模型文件."
|
||||
)
|
||||
njobs: int = Field(
|
||||
8, alias="进程数", description="适用于ppg2mel",
|
||||
save_every: int = Field(
|
||||
1000, alias="更新间隔", description="每隔n步则更新一次模型",
|
||||
)
|
||||
seed: int = Field(
|
||||
default=0, alias="初始随机数", description="适用于ppg2mel",
|
||||
backup_every: int = Field(
|
||||
10000, alias="保存间隔", description="每隔n步则保存一次模型",
|
||||
)
|
||||
model_name: str = Field(
|
||||
..., alias="新模型名", description="仅在重新训练时生效,选中继续训练时无效",
|
||||
example="test"
|
||||
)
|
||||
model_config: str = Field(
|
||||
..., alias="新模型配置", description="仅在重新训练时生效,选中继续训练时无效",
|
||||
example=".\\ppg2mel\\saved_models\\seq2seq_mol_ppg2mel_vctk_libri_oneshotvc_r4_normMel_v2"
|
||||
log_every: int = Field(
|
||||
500, alias="打印间隔", description="每隔n步则打印一次训练统计",
|
||||
)
|
||||
|
||||
class AudioEntity(BaseModel):
|
||||
|
@ -102,55 +77,30 @@ class AudioEntity(BaseModel):
|
|||
mel: Any
|
||||
|
||||
class Output(BaseModel):
|
||||
__root__: Tuple[str, int]
|
||||
__root__: int
|
||||
|
||||
def render_output_ui(self, streamlit_app, input) -> None: # type: ignore
|
||||
def render_output_ui(self, streamlit_app) -> None: # type: ignore
|
||||
"""Custom output UI.
|
||||
If this method is implmeneted, it will be used instead of the default Output UI renderer.
|
||||
"""
|
||||
sr, count = self.__root__
|
||||
streamlit_app.subheader(f"Dataset {sr} done processed total of {count}")
|
||||
streamlit_app.subheader(f"Training started with code: {self.__root__}")
|
||||
|
||||
def train(input: Input) -> Output:
|
||||
"""Train(训练)"""
|
||||
|
||||
print(">>> OneShot VC training ...")
|
||||
params = AttrDict()
|
||||
params.update({
|
||||
"gpu": input.gpu,
|
||||
"cpu": not input.gpu,
|
||||
"njobs": input.njobs,
|
||||
"seed": input.seed,
|
||||
"verbose": input.verbose,
|
||||
"load": input.convertor.value,
|
||||
"warm_start": False,
|
||||
})
|
||||
if input.continue_mode:
|
||||
# trace old model and config
|
||||
p = Path(input.convertor.value)
|
||||
params.name = p.parent.name
|
||||
# search a config file
|
||||
model_config_fpaths = list(p.parent.rglob("*.yaml"))
|
||||
if len(model_config_fpaths) == 0:
|
||||
raise "No model yaml config found for convertor"
|
||||
config = HpsYaml(model_config_fpaths[0])
|
||||
params.ckpdir = p.parent.parent
|
||||
params.config = model_config_fpaths[0]
|
||||
params.logdir = os.path.join(p.parent, "log")
|
||||
else:
|
||||
# Make the config dict dot visitable
|
||||
config = HpsYaml(input.config)
|
||||
np.random.seed(input.seed)
|
||||
torch.manual_seed(input.seed)
|
||||
if torch.cuda.is_available():
|
||||
torch.cuda.manual_seed_all(input.seed)
|
||||
mode = "train"
|
||||
from ppg2mel.train.train_linglf02mel_seq2seq_oneshotvc import Solver
|
||||
solver = Solver(config, params, mode)
|
||||
solver.load_data()
|
||||
solver.set_model()
|
||||
solver.exec()
|
||||
print(">>> Oneshot VC train finished!")
|
||||
|
||||
# TODO: pass useful return code
|
||||
return Output(__root__=(input.dataset, 0))
|
||||
print(">>> Start training ...")
|
||||
force_restart = len(input.run_id) > 0
|
||||
if not force_restart:
|
||||
input.run_id = Path(input.synthesizer.value).name.split('.')[0]
|
||||
|
||||
synt_train(
|
||||
input.run_id,
|
||||
input.input_root,
|
||||
f"synthesizer{os.sep}saved_models",
|
||||
input.save_every,
|
||||
input.backup_every,
|
||||
input.log_every,
|
||||
force_restart,
|
||||
hparams
|
||||
)
|
||||
return Output(__root__=0)
|
155
mkgui/train_vc.py
Normal file
155
mkgui/train_vc.py
Normal file
|
@ -0,0 +1,155 @@
|
|||
from pydantic import BaseModel, Field
|
||||
import os
|
||||
from pathlib import Path
|
||||
from enum import Enum
|
||||
from typing import Any, Tuple
|
||||
import numpy as np
|
||||
from utils.load_yaml import HpsYaml
|
||||
from utils.util import AttrDict
|
||||
import torch
|
||||
|
||||
# Constants
|
||||
EXT_MODELS_DIRT = f"ppg_extractor{os.sep}saved_models"
|
||||
CONV_MODELS_DIRT = f"ppg2mel{os.sep}saved_models"
|
||||
ENC_MODELS_DIRT = f"encoder{os.sep}saved_models"
|
||||
|
||||
|
||||
if os.path.isdir(EXT_MODELS_DIRT):
|
||||
extractors = Enum('extractors', list((file.name, file) for file in Path(EXT_MODELS_DIRT).glob("**/*.pt")))
|
||||
print("Loaded extractor models: " + str(len(extractors)))
|
||||
else:
|
||||
raise Exception(f"Model folder {EXT_MODELS_DIRT} doesn't exist.")
|
||||
|
||||
if os.path.isdir(CONV_MODELS_DIRT):
|
||||
convertors = Enum('convertors', list((file.name, file) for file in Path(CONV_MODELS_DIRT).glob("**/*.pth")))
|
||||
print("Loaded convertor models: " + str(len(convertors)))
|
||||
else:
|
||||
raise Exception(f"Model folder {CONV_MODELS_DIRT} doesn't exist.")
|
||||
|
||||
if os.path.isdir(ENC_MODELS_DIRT):
|
||||
encoders = Enum('encoders', list((file.name, file) for file in Path(ENC_MODELS_DIRT).glob("**/*.pt")))
|
||||
print("Loaded encoders models: " + str(len(encoders)))
|
||||
else:
|
||||
raise Exception(f"Model folder {ENC_MODELS_DIRT} doesn't exist.")
|
||||
|
||||
class Model(str, Enum):
|
||||
VC_PPG2MEL = "ppg2mel"
|
||||
|
||||
class Dataset(str, Enum):
|
||||
AIDATATANG_200ZH = "aidatatang_200zh"
|
||||
AIDATATANG_200ZH_S = "aidatatang_200zh_s"
|
||||
|
||||
class Input(BaseModel):
|
||||
# def render_input_ui(st, input) -> Dict:
|
||||
# input["selected_dataset"] = st.selectbox(
|
||||
# '选择数据集',
|
||||
# ("aidatatang_200zh", "aidatatang_200zh_s")
|
||||
# )
|
||||
# return input
|
||||
model: Model = Field(
|
||||
Model.VC_PPG2MEL, title="模型类型",
|
||||
)
|
||||
# datasets_root: str = Field(
|
||||
# ..., alias="预处理数据根目录", description="输入目录(相对/绝对),不适用于ppg2mel模型",
|
||||
# format=True,
|
||||
# example="..\\trainning_data\\"
|
||||
# )
|
||||
output_root: str = Field(
|
||||
..., alias="输出目录(可选)", description="建议不填,保持默认",
|
||||
format=True,
|
||||
example=""
|
||||
)
|
||||
continue_mode: bool = Field(
|
||||
True, alias="继续训练模式", description="选择“是”,则从下面选择的模型中继续训练",
|
||||
)
|
||||
gpu: bool = Field(
|
||||
True, alias="GPU训练", description="选择“是”,则使用GPU训练",
|
||||
)
|
||||
verbose: bool = Field(
|
||||
True, alias="打印详情", description="选择“是”,输出更多详情",
|
||||
)
|
||||
# TODO: Move to hiden fields by default
|
||||
convertor: convertors = Field(
|
||||
..., alias="转换模型",
|
||||
description="选择语音转换模型文件."
|
||||
)
|
||||
extractor: extractors = Field(
|
||||
..., alias="特征提取模型",
|
||||
description="选择PPG特征提取模型文件."
|
||||
)
|
||||
encoder: encoders = Field(
|
||||
..., alias="语音编码模型",
|
||||
description="选择语音编码模型文件."
|
||||
)
|
||||
njobs: int = Field(
|
||||
8, alias="进程数", description="适用于ppg2mel",
|
||||
)
|
||||
seed: int = Field(
|
||||
default=0, alias="初始随机数", description="适用于ppg2mel",
|
||||
)
|
||||
model_name: str = Field(
|
||||
..., alias="新模型名", description="仅在重新训练时生效,选中继续训练时无效",
|
||||
example="test"
|
||||
)
|
||||
model_config: str = Field(
|
||||
..., alias="新模型配置", description="仅在重新训练时生效,选中继续训练时无效",
|
||||
example=".\\ppg2mel\\saved_models\\seq2seq_mol_ppg2mel_vctk_libri_oneshotvc_r4_normMel_v2"
|
||||
)
|
||||
|
||||
class AudioEntity(BaseModel):
|
||||
content: bytes
|
||||
mel: Any
|
||||
|
||||
class Output(BaseModel):
|
||||
__root__: Tuple[str, int]
|
||||
|
||||
def render_output_ui(self, streamlit_app, input) -> None: # type: ignore
|
||||
"""Custom output UI.
|
||||
If this method is implmeneted, it will be used instead of the default Output UI renderer.
|
||||
"""
|
||||
sr, count = self.__root__
|
||||
streamlit_app.subheader(f"Dataset {sr} done processed total of {count}")
|
||||
|
||||
def train_vc(input: Input) -> Output:
|
||||
"""Train VC(训练 VC)"""
|
||||
|
||||
print(">>> OneShot VC training ...")
|
||||
params = AttrDict()
|
||||
params.update({
|
||||
"gpu": input.gpu,
|
||||
"cpu": not input.gpu,
|
||||
"njobs": input.njobs,
|
||||
"seed": input.seed,
|
||||
"verbose": input.verbose,
|
||||
"load": input.convertor.value,
|
||||
"warm_start": False,
|
||||
})
|
||||
if input.continue_mode:
|
||||
# trace old model and config
|
||||
p = Path(input.convertor.value)
|
||||
params.name = p.parent.name
|
||||
# search a config file
|
||||
model_config_fpaths = list(p.parent.rglob("*.yaml"))
|
||||
if len(model_config_fpaths) == 0:
|
||||
raise "No model yaml config found for convertor"
|
||||
config = HpsYaml(model_config_fpaths[0])
|
||||
params.ckpdir = p.parent.parent
|
||||
params.config = model_config_fpaths[0]
|
||||
params.logdir = os.path.join(p.parent, "log")
|
||||
else:
|
||||
# Make the config dict dot visitable
|
||||
config = HpsYaml(input.config)
|
||||
np.random.seed(input.seed)
|
||||
torch.manual_seed(input.seed)
|
||||
if torch.cuda.is_available():
|
||||
torch.cuda.manual_seed_all(input.seed)
|
||||
mode = "train"
|
||||
from ppg2mel.train.train_linglf02mel_seq2seq_oneshotvc import Solver
|
||||
solver = Solver(config, params, mode)
|
||||
solver.load_data()
|
||||
solver.set_model()
|
||||
solver.exec()
|
||||
print(">>> Oneshot VC train finished!")
|
||||
|
||||
# TODO: pass useful return code
|
||||
return Output(__root__=(input.dataset, 0))
|
|
@ -1,4 +1,5 @@
|
|||
import os
|
||||
from matplotlib.pyplot import step
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
@ -297,7 +298,7 @@ class Decoder(nn.Module):
|
|||
x = torch.cat([context_vec, attn_hidden], dim=1)
|
||||
x = self.rnn_input(x)
|
||||
|
||||
# Compute first Residual RNN
|
||||
# Compute first Residual RNN, training with fixed zoneout rate 0.1
|
||||
rnn1_hidden_next, rnn1_cell = self.res_rnn1(x, (rnn1_hidden, rnn1_cell))
|
||||
if self.training:
|
||||
rnn1_hidden = self.zoneout(rnn1_hidden, rnn1_hidden_next,device=device)
|
||||
|
@ -372,11 +373,15 @@ class Tacotron(nn.Module):
|
|||
outputs = torch.cat([outputs, speaker_embeddings_], dim=-1)
|
||||
return outputs
|
||||
|
||||
def forward(self, texts, mels, speaker_embedding):
|
||||
def forward(self, texts, mels, speaker_embedding=None, steps=2000, style_idx=0, min_stop_token=5):
|
||||
|
||||
device = texts.device # use same device as parameters
|
||||
|
||||
self.step += 1
|
||||
batch_size, _, steps = mels.size()
|
||||
if self.training:
|
||||
self.step += 1
|
||||
batch_size, _, steps = mels.size()
|
||||
else:
|
||||
batch_size, _ = texts.size()
|
||||
|
||||
# Initialise all hidden states and pack into tuple
|
||||
attn_hidden = torch.zeros(batch_size, self.decoder_dims, device=device)
|
||||
|
@ -401,11 +406,22 @@ class Tacotron(nn.Module):
|
|||
# SV2TTS: Run the encoder with the speaker embedding
|
||||
# The projection avoids unnecessary matmuls in the decoder loop
|
||||
encoder_seq = self.encoder(texts, speaker_embedding)
|
||||
# put after encoder
|
||||
|
||||
if hparams.use_gst and self.gst is not None:
|
||||
style_embed = self.gst(speaker_embedding, speaker_embedding) # for training, speaker embedding can represent both style inputs and referenced
|
||||
# style_embed = style_embed.expand_as(encoder_seq)
|
||||
# encoder_seq = torch.cat((encoder_seq, style_embed), 2)
|
||||
if self.training:
|
||||
style_embed = self.gst(speaker_embedding, speaker_embedding) # for training, speaker embedding can represent both style inputs and referenced
|
||||
# style_embed = style_embed.expand_as(encoder_seq)
|
||||
# encoder_seq = torch.cat((encoder_seq, style_embed), 2)
|
||||
elif style_idx >= 0 and style_idx < 10:
|
||||
query = torch.zeros(1, 1, self.gst.stl.attention.num_units)
|
||||
if device.type == 'cuda':
|
||||
query = query.cuda()
|
||||
gst_embed = torch.tanh(self.gst.stl.embed)
|
||||
key = gst_embed[style_idx].unsqueeze(0).expand(1, -1, -1)
|
||||
style_embed = self.gst.stl.attention(query, key)
|
||||
else:
|
||||
speaker_embedding_style = torch.zeros(speaker_embedding.size()[0], 1, self.speaker_embedding_size).to(device)
|
||||
style_embed = self.gst(speaker_embedding_style, speaker_embedding)
|
||||
encoder_seq = self._concat_speaker_embedding(encoder_seq, style_embed)
|
||||
encoder_seq_proj = self.encoder_proj(encoder_seq)
|
||||
|
||||
|
@ -414,13 +430,17 @@ class Tacotron(nn.Module):
|
|||
|
||||
# Run the decoder loop
|
||||
for t in range(0, steps, self.r):
|
||||
prenet_in = mels[:, :, t - 1] if t > 0 else go_frame
|
||||
if self.training:
|
||||
prenet_in = mels[:, :, t -1] if t > 0 else go_frame
|
||||
else:
|
||||
prenet_in = mel_outputs[-1][:, :, -1] if t > 0 else go_frame
|
||||
mel_frames, scores, hidden_states, cell_states, context_vec, stop_tokens = \
|
||||
self.decoder(encoder_seq, encoder_seq_proj, prenet_in,
|
||||
hidden_states, cell_states, context_vec, t, texts)
|
||||
mel_outputs.append(mel_frames)
|
||||
attn_scores.append(scores)
|
||||
stop_outputs.extend([stop_tokens] * self.r)
|
||||
if not self.training and (stop_tokens * 10 > min_stop_token).all() and t > 10: break
|
||||
|
||||
# Concat the mel outputs into sequence
|
||||
mel_outputs = torch.cat(mel_outputs, dim=2)
|
||||
|
@ -435,87 +455,93 @@ class Tacotron(nn.Module):
|
|||
# attn_scores = attn_scores.cpu().data.numpy()
|
||||
stop_outputs = torch.cat(stop_outputs, 1)
|
||||
|
||||
|
||||
if self.training:
|
||||
self.train()
|
||||
|
||||
return mel_outputs, linear, attn_scores, stop_outputs
|
||||
|
||||
def generate(self, x, speaker_embedding=None, steps=2000, style_idx=0, min_stop_token=5):
|
||||
self.eval()
|
||||
device = x.device # use same device as parameters
|
||||
|
||||
batch_size, _ = x.size()
|
||||
|
||||
# Need to initialise all hidden states and pack into tuple for tidyness
|
||||
attn_hidden = torch.zeros(batch_size, self.decoder_dims, device=device)
|
||||
rnn1_hidden = torch.zeros(batch_size, self.lstm_dims, device=device)
|
||||
rnn2_hidden = torch.zeros(batch_size, self.lstm_dims, device=device)
|
||||
hidden_states = (attn_hidden, rnn1_hidden, rnn2_hidden)
|
||||
|
||||
# Need to initialise all lstm cell states and pack into tuple for tidyness
|
||||
rnn1_cell = torch.zeros(batch_size, self.lstm_dims, device=device)
|
||||
rnn2_cell = torch.zeros(batch_size, self.lstm_dims, device=device)
|
||||
cell_states = (rnn1_cell, rnn2_cell)
|
||||
|
||||
# Need a <GO> Frame for start of decoder loop
|
||||
go_frame = torch.zeros(batch_size, self.n_mels, device=device)
|
||||
|
||||
# Need an initial context vector
|
||||
size = self.encoder_dims + self.speaker_embedding_size
|
||||
if hparams.use_gst:
|
||||
size += gst_hp.E
|
||||
context_vec = torch.zeros(batch_size, size, device=device)
|
||||
|
||||
# SV2TTS: Run the encoder with the speaker embedding
|
||||
# The projection avoids unnecessary matmuls in the decoder loop
|
||||
encoder_seq = self.encoder(x, speaker_embedding)
|
||||
|
||||
# put after encoder
|
||||
if hparams.use_gst and self.gst is not None:
|
||||
if style_idx >= 0 and style_idx < 10:
|
||||
query = torch.zeros(1, 1, self.gst.stl.attention.num_units)
|
||||
if device.type == 'cuda':
|
||||
query = query.cuda()
|
||||
gst_embed = torch.tanh(self.gst.stl.embed)
|
||||
key = gst_embed[style_idx].unsqueeze(0).expand(1, -1, -1)
|
||||
style_embed = self.gst.stl.attention(query, key)
|
||||
else:
|
||||
speaker_embedding_style = torch.zeros(speaker_embedding.size()[0], 1, self.speaker_embedding_size).to(device)
|
||||
style_embed = self.gst(speaker_embedding_style, speaker_embedding)
|
||||
encoder_seq = self._concat_speaker_embedding(encoder_seq, style_embed)
|
||||
# style_embed = style_embed.expand_as(encoder_seq)
|
||||
# encoder_seq = torch.cat((encoder_seq, style_embed), 2)
|
||||
encoder_seq_proj = self.encoder_proj(encoder_seq)
|
||||
|
||||
# Need a couple of lists for outputs
|
||||
mel_outputs, attn_scores, stop_outputs = [], [], []
|
||||
|
||||
# Run the decoder loop
|
||||
for t in range(0, steps, self.r):
|
||||
prenet_in = mel_outputs[-1][:, :, -1] if t > 0 else go_frame
|
||||
mel_frames, scores, hidden_states, cell_states, context_vec, stop_tokens = \
|
||||
self.decoder(encoder_seq, encoder_seq_proj, prenet_in,
|
||||
hidden_states, cell_states, context_vec, t, x)
|
||||
mel_outputs.append(mel_frames)
|
||||
attn_scores.append(scores)
|
||||
stop_outputs.extend([stop_tokens] * self.r)
|
||||
# Stop the loop when all stop tokens in batch exceed threshold
|
||||
if (stop_tokens * 10 > min_stop_token).all() and t > 10: break
|
||||
|
||||
# Concat the mel outputs into sequence
|
||||
mel_outputs = torch.cat(mel_outputs, dim=2)
|
||||
|
||||
# Post-Process for Linear Spectrograms
|
||||
postnet_out = self.postnet(mel_outputs)
|
||||
linear = self.post_proj(postnet_out)
|
||||
|
||||
|
||||
linear = linear.transpose(1, 2)
|
||||
|
||||
# For easy visualisation
|
||||
attn_scores = torch.cat(attn_scores, 1)
|
||||
stop_outputs = torch.cat(stop_outputs, 1)
|
||||
|
||||
self.train()
|
||||
|
||||
mel_outputs, linear, attn_scores, _ = self.forward(x, None, speaker_embedding, steps, style_idx, min_stop_token)
|
||||
return mel_outputs, linear, attn_scores
|
||||
# device = x.device # use same device as parameters
|
||||
|
||||
# batch_size, _ = x.size()
|
||||
|
||||
# # Need to initialise all hidden states and pack into tuple for tidyness
|
||||
# attn_hidden = torch.zeros(batch_size, self.decoder_dims, device=device)
|
||||
# rnn1_hidden = torch.zeros(batch_size, self.lstm_dims, device=device)
|
||||
# rnn2_hidden = torch.zeros(batch_size, self.lstm_dims, device=device)
|
||||
# hidden_states = (attn_hidden, rnn1_hidden, rnn2_hidden)
|
||||
|
||||
# # Need to initialise all lstm cell states and pack into tuple for tidyness
|
||||
# rnn1_cell = torch.zeros(batch_size, self.lstm_dims, device=device)
|
||||
# rnn2_cell = torch.zeros(batch_size, self.lstm_dims, device=device)
|
||||
# cell_states = (rnn1_cell, rnn2_cell)
|
||||
|
||||
# # Need a <GO> Frame for start of decoder loop
|
||||
# go_frame = torch.zeros(batch_size, self.n_mels, device=device)
|
||||
|
||||
# # Need an initial context vector
|
||||
# size = self.encoder_dims + self.speaker_embedding_size
|
||||
# if hparams.use_gst:
|
||||
# size += gst_hp.E
|
||||
# context_vec = torch.zeros(batch_size, size, device=device)
|
||||
|
||||
# # SV2TTS: Run the encoder with the speaker embedding
|
||||
# # The projection avoids unnecessary matmuls in the decoder loop
|
||||
# encoder_seq = self.encoder(x, speaker_embedding)
|
||||
|
||||
# # put after encoder
|
||||
# if hparams.use_gst and self.gst is not None:
|
||||
# if style_idx >= 0 and style_idx < 10:
|
||||
# query = torch.zeros(1, 1, self.gst.stl.attention.num_units)
|
||||
# if device.type == 'cuda':
|
||||
# query = query.cuda()
|
||||
# gst_embed = torch.tanh(self.gst.stl.embed)
|
||||
# key = gst_embed[style_idx].unsqueeze(0).expand(1, -1, -1)
|
||||
# style_embed = self.gst.stl.attention(query, key)
|
||||
# else:
|
||||
# speaker_embedding_style = torch.zeros(speaker_embedding.size()[0], 1, self.speaker_embedding_size).to(device)
|
||||
# style_embed = self.gst(speaker_embedding_style, speaker_embedding)
|
||||
# encoder_seq = self._concat_speaker_embedding(encoder_seq, style_embed)
|
||||
# # style_embed = style_embed.expand_as(encoder_seq)
|
||||
# # encoder_seq = torch.cat((encoder_seq, style_embed), 2)
|
||||
# encoder_seq_proj = self.encoder_proj(encoder_seq)
|
||||
|
||||
# # Need a couple of lists for outputs
|
||||
# mel_outputs, attn_scores, stop_outputs = [], [], []
|
||||
|
||||
# # Run the decoder loop
|
||||
# for t in range(0, steps, self.r):
|
||||
# prenet_in = mel_outputs[-1][:, :, -1] if t > 0 else go_frame
|
||||
# mel_frames, scores, hidden_states, cell_states, context_vec, stop_tokens = \
|
||||
# self.decoder(encoder_seq, encoder_seq_proj, prenet_in,
|
||||
# hidden_states, cell_states, context_vec, t, x)
|
||||
# mel_outputs.append(mel_frames)
|
||||
# attn_scores.append(scores)
|
||||
# stop_outputs.extend([stop_tokens] * self.r)
|
||||
# # Stop the loop when all stop tokens in batch exceed threshold
|
||||
# if (stop_tokens * 10 > min_stop_token).all() and t > 10: break
|
||||
|
||||
# # Concat the mel outputs into sequence
|
||||
# mel_outputs = torch.cat(mel_outputs, dim=2)
|
||||
|
||||
# # Post-Process for Linear Spectrograms
|
||||
# postnet_out = self.postnet(mel_outputs)
|
||||
# linear = self.post_proj(postnet_out)
|
||||
|
||||
|
||||
# linear = linear.transpose(1, 2)
|
||||
|
||||
# # For easy visualisation
|
||||
# attn_scores = torch.cat(attn_scores, 1)
|
||||
# stop_outputs = torch.cat(stop_outputs, 1)
|
||||
|
||||
# self.train()
|
||||
|
||||
# return mel_outputs, linear, attn_scores
|
||||
|
||||
def init_model(self):
|
||||
for p in self.parameters():
|
||||
|
|
|
@ -15,9 +15,8 @@ from datetime import datetime
|
|||
import json
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
import sys
|
||||
import time
|
||||
|
||||
import os
|
||||
|
||||
def np_now(x: torch.Tensor): return x.detach().cpu().numpy()
|
||||
|
||||
|
@ -265,7 +264,19 @@ def train(run_id: str, syn_dir: str, models_dir: str, save_every: int,
|
|||
loss=loss,
|
||||
hparams=hparams,
|
||||
sw=sw)
|
||||
|
||||
MAX_SAVED_COUNT = 20
|
||||
if (step / hparams.tts_eval_interval) % MAX_SAVED_COUNT:
|
||||
# clean up and save last MAX_SAVED_COUNT;
|
||||
plots = next(os.walk(plot_dir), (None, None, []))[2]
|
||||
for plot in plots[-MAX_SAVED_COUNT:]:
|
||||
os.remove(plot_dir.joinpath(plot))
|
||||
mel_files = next(os.walk(mel_output_dir), (None, None, []))[2]
|
||||
for mel_file in mel_files[-MAX_SAVED_COUNT:]:
|
||||
os.remove(mel_output_dir.joinpath(mel_file))
|
||||
wavs = next(os.walk(wav_dir), (None, None, []))[2]
|
||||
for w in wavs[-MAX_SAVED_COUNT:]:
|
||||
os.remove(wav_dir.joinpath(w))
|
||||
|
||||
# Break out of loop to update training schedule
|
||||
if step >= max_step:
|
||||
break
|
||||
|
|
Loading…
Reference in New Issue
Block a user