diff --git a/.vscode/launch.json b/.vscode/launch.json
index b2ab7f8..a766d67 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -64,6 +64,14 @@
"args": ["-c", ".\\ppg2mel\\saved_models\\seq2seq_mol_ppg2mel_vctk_libri_oneshotvc_r4_normMel_v2.yaml",
"-m", ".\\ppg2mel\\saved_models\\best_loss_step_304000.pth", "--wav_dir", ".\\wavs\\input", "--ref_wav_path", ".\\wavs\\pkq.mp3", "-o", ".\\wavs\\output\\"
]
- }
+ },
+ {
+ "name": "Python: Vits Train",
+ "type": "python",
+ "request": "launch",
+ "program": "train.py",
+ "console": "integratedTerminal",
+ "args": ["--type", "vits"]
+ }
]
}
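For reference, the new launch entry is equivalent to running `python train.py --type vits` from the repository root. `train.py` itself is not part of this diff, so the snippet below is only a hedged sketch of how such a `--type` flag might dispatch to the VITS trainer added further down (`new_train` in `models/synthesizer/train_vits.py`); the exact structure of the real script may differ.

```python
# Sketch only: train.py is not shown in this diff, so this dispatch structure is an assumption.
import argparse

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--type", default="synth",
                        help="which trainer to launch, e.g. 'vits'")
    args, _ = parser.parse_known_args()
    if args.type == "vits":
        # new_train() is defined in models/synthesizer/train_vits.py (see the hunks below)
        from models.synthesizer.train_vits import new_train
        new_train()

if __name__ == "__main__":
    main()
```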
diff --git a/models/synthesizer/hparams.py b/models/synthesizer/hparams.py
index ca3e635..b0d46f7 100644
--- a/models/synthesizer/hparams.py
+++ b/models/synthesizer/hparams.py
@@ -3,10 +3,10 @@ from utils.hparams import HParams
hparams = HParams(
### Signal Processing (used in both synthesizer and vocoder)
sample_rate = 16000,
- n_fft = 800,
+ n_fft = 1024, # filter_length
num_mels = 80,
- hop_size = 200, # Tacotron uses 12.5 ms frame shift (set to sample_rate * 0.0125)
- win_size = 800, # Tacotron uses 50 ms frame length (set to sample_rate * 0.050)
+ hop_size = 256, # 16 ms frame shift at 16 kHz (256 / 16000); was 200 (12.5 ms) for Tacotron
+ win_size = 1024, # 64 ms frame length at 16 kHz (1024 / 16000); was 800 (50 ms) for Tacotron
fmin = 55,
min_level_db = -100,
ref_level_db = 20,
@@ -67,7 +67,7 @@ hparams = HParams(
use_lws = False, # "Fast spectrogram phase recovery using local weighted sums"
symmetric_mels = True, # Sets mel range to [-max_abs_value, max_abs_value] if True,
# and [0, max_abs_value] if False
- trim_silence = True, # Use with sample_rate of 16000 for best results
+ trim_silence = False, # Disabled here; silence is already trimmed in _split_on_silences during preprocessing
### SV2TTS
speaker_embedding_size = 256, # Dimension for the speaker embedding
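The comments on `hop_size` and `win_size` above were updated because the old 12.5 ms / 50 ms figures no longer hold at the new values. A quick standalone arithmetic check of the new frame timing (no project code involved):

```python
# Frame timing implied by the new STFT settings at 16 kHz.
sample_rate, n_fft, hop_size, win_size = 16000, 1024, 256, 1024

print(hop_size / sample_rate * 1000)   # 16.0 ms frame shift (was 12.5 ms with hop 200)
print(win_size / sample_rate * 1000)   # 64.0 ms frame window (was 50 ms with win 800)

# With the (n_fft - hop_size) / 2 padding used by utils.audio_utils.spectrogram and
# center=False, a clip of N samples yields roughly N // hop_size spectrogram frames.
n_samples = 2 * sample_rate            # 2 seconds of audio
print(n_samples // hop_size)           # 125 frames
```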
diff --git a/models/synthesizer/models/vits.py b/models/synthesizer/models/vits.py
index db4a917..0041a88 100644
--- a/models/synthesizer/models/vits.py
+++ b/models/synthesizer/models/vits.py
@@ -2,12 +2,12 @@ import math
import torch
from torch import nn
from torch.nn import functional as F
+from loguru import logger
from .sublayer.vits_modules import *
import monotonic_align
-from .base import Base
-from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
+from torch.nn import Conv1d, ConvTranspose1d, Conv2d
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
from utils.util import init_weights, get_padding, sequence_mask, rand_slice_segments, generate_path
@@ -386,7 +386,7 @@ class MultiPeriodDiscriminator(torch.nn.Module):
return y_d_rs, y_d_gs, fmap_rs, fmap_gs
-class Vits(Base):
+class Vits(nn.Module):
"""
Synthesizer of Vits
"""
@@ -408,13 +408,12 @@ class Vits(Base):
upsample_rates,
upsample_initial_channel,
upsample_kernel_sizes,
- stop_threshold,
n_speakers=0,
gin_channels=0,
use_sdp=True,
**kwargs):
- super().__init__(stop_threshold)
+ super().__init__()
self.n_vocab = n_vocab
self.spec_channels = spec_channels
self.inter_channels = inter_channels
@@ -457,7 +456,7 @@ class Vits(Base):
self.emb_g = nn.Embedding(n_speakers, gin_channels)
def forward(self, x, x_lengths, y, y_lengths, sid=None, emo=None):
-
+ # logger.info(f'====> Forward: 1.1.0')
x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths, emo)
if self.n_speakers > 0:
g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1]
@@ -466,7 +465,7 @@ class Vits(Base):
z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
z_p = self.flow(z, y_mask, g=g)
-
+ # logger.info(f'====> Forward: 1.1.1')
with torch.no_grad():
# negative cross-entropy
s_p_sq_r = torch.exp(-2 * logs_p) # [b, d, t]
@@ -475,10 +474,11 @@ class Vits(Base):
neg_cent3 = torch.matmul(z_p.transpose(1, 2), (m_p * s_p_sq_r)) # [b, t_t, d] x [b, d, t_s] = [b, t_t, t_s]
neg_cent4 = torch.sum(-0.5 * (m_p ** 2) * s_p_sq_r, [1], keepdim=True) # [b, 1, t_s]
neg_cent = neg_cent1 + neg_cent2 + neg_cent3 + neg_cent4
-
+ #logger.info(f'====> Forward: 1.1.1.1')
attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
attn = monotonic_align.maximum_path(neg_cent, attn_mask.squeeze(1)).unsqueeze(1).detach()
+ # logger.info(f'====> Forward: 1.1.2')
w = attn.sum(2)
if self.use_sdp:
l_length = self.dp(x, x_mask, w, g=g)
@@ -487,7 +487,6 @@ class Vits(Base):
logw_ = torch.log(w + 1e-6) * x_mask
logw = self.dp(x, x_mask, g=g)
l_length = torch.sum((logw - logw_)**2, [1,2]) / torch.sum(x_mask) # for averaging
-
# expand prior
m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2)
logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2)
@@ -497,7 +496,9 @@ class Vits(Base):
return o, l_length, attn, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
def infer(self, x, x_lengths, sid=None, emo=None, noise_scale=1, length_scale=1, noise_scale_w=1., max_len=None):
+ # logger.info(f'====> Infer: 1.1.0')
x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths,emo)
+ # logger.info(f'====> Infer: 1.1.1')
if self.n_speakers > 0:
g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1]
else:
@@ -514,11 +515,14 @@ class Vits(Base):
attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
attn = generate_path(w_ceil, attn_mask)
+ # logger.info(f'====> Infer: 1.1.2')
m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t']
logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t']
z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
z = self.flow(z_p, y_mask, g=g, reverse=True)
o = self.dec((z * y_mask)[:,:,:max_len], g=g)
+
+ # logger.info(f'====> Infer: 1.1.3')
return o, attn, y_mask, (z, z_p, m_p, logs_p)
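With `Vits` now subclassing `nn.Module` directly and the `stop_threshold` argument removed, construction and checkpoint loading follow the pattern used in the updated notebook cell later in this diff. A condensed sketch of that pattern (paths, config keys, and the `strict=False` load are taken from that cell; the module paths are assumed from the file layout):

```python
# Sketch mirroring the updated vits.ipynb cell; the config.json and G_120000.pth paths come from it.
import torch
from models.synthesizer.models.vits import Vits
from models.synthesizer.utils.symbols import symbols
from utils.hparams import load_hparams_json

hps = load_hparams_json("data/ckpt/synthesizer/vits2/config.json")
model = Vits(
    len(symbols),
    hps["data"]["filter_length"] // 2 + 1,                      # spec_channels
    hps["train"]["segment_size"] // hps["data"]["hop_length"],  # segment size in frames
    n_speakers=hps["data"]["n_speakers"],
    **hps["model"])                                             # note: no stop_threshold

checkpoint = torch.load("data/ckpt/synthesizer/vits2/G_120000.pth", map_location="cpu")
state = checkpoint.get("model_state", checkpoint.get("model"))
model.load_state_dict(state, strict=False)                      # replaces the old model.load()
model.eval()
```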
diff --git a/models/synthesizer/preprocess_audio.py b/models/synthesizer/preprocess_audio.py
index c8f7904..a2e08f8 100644
--- a/models/synthesizer/preprocess_audio.py
+++ b/models/synthesizer/preprocess_audio.py
@@ -20,8 +20,6 @@ device = 'cuda' if torch.cuda.is_available() else "cpu"
model_name = 'audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim'
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = EmotionExtractorModel.from_pretrained(model_name).to(device)
-embs = []
-wavnames = []
def extract_emo(
x: np.ndarray,
@@ -48,8 +46,6 @@ class PinyinConverter(NeutralToneWith5Mixin, DefaultConverter):
pinyin = Pinyin(PinyinConverter()).pinyin
-
-
def _process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
skip_existing: bool, hparams, emotion_extract: bool):
## FOR REFERENCE:
@@ -67,9 +63,8 @@ def _process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
# Skip existing utterances if needed
mel_fpath = out_dir.joinpath("mels", "mel-%s.npy" % basename)
wav_fpath = out_dir.joinpath("audio", "audio-%s.npy" % basename)
- emo_fpath = out_dir.joinpath("emo", "emo-%s.npy" % basename)
- skip_emo_extract = not emotion_extract or (skip_existing and emo_fpath.exists())
- if skip_existing and mel_fpath.exists() and wav_fpath.exists() and skip_emo_extract:
+
+ if skip_existing and mel_fpath.exists() and wav_fpath.exists():
return None
# Trim silence
@@ -91,18 +86,14 @@ def _process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
np.save(mel_fpath, mel_spectrogram.T, allow_pickle=False)
np.save(wav_fpath, wav, allow_pickle=False)
- if not skip_emo_extract:
- emo = extract_emo(np.expand_dims(wav, 0), hparams.sample_rate, True)
- np.save(emo_fpath, emo, allow_pickle=False)
-
# Return a tuple describing this training example
- return wav_fpath.name, mel_fpath.name, "embed-%s.npy" % basename, len(wav), mel_frames, text
+ return wav_fpath.name, mel_fpath.name, "embed-%s.npy" % basename, wav, mel_frames, text
def _split_on_silences(wav_fpath, words, hparams):
# Load the audio waveform
wav, _ = librosa.load(wav_fpath, sr= hparams.sample_rate)
- wav = librosa.effects.trim(wav, top_db= 40, frame_length=2048, hop_length=512)[0]
+ wav = librosa.effects.trim(wav, top_db= 40, frame_length=2048, hop_length=1024)[0]
if hparams.rescale:
wav = wav / np.abs(wav).max() * hparams.rescaling_max
# denoise, we may not need it here.
@@ -132,6 +123,15 @@ def preprocess_general(speaker_dir, out_dir: Path, skip_existing: bool, hparams,
continue
sub_basename = "%s_%02d" % (wav_fpath.name, 0)
wav, text = _split_on_silences(wav_fpath, words, hparams)
- metadata.append(_process_utterance(wav, text, out_dir, sub_basename,
- skip_existing, hparams, emotion_extract))
+ result = _process_utterance(wav, text, out_dir, sub_basename,
+ skip_existing, hparams, emotion_extract)
+ if result is None:
+ continue
+ wav_fpath_name, mel_fpath_name, embed_fpath_name, wav, mel_frames, text = result
+ emo_fpath = out_dir.joinpath("emo", "emo-%s.npy" % sub_basename)
+ skip_emo_extract = not emotion_extract or (skip_existing and emo_fpath.exists())
+ if not skip_emo_extract and wav is not None:
+ emo = extract_emo(np.expand_dims(wav, 0), hparams.sample_rate, True)
+ np.save(emo_fpath, emo.squeeze(0), allow_pickle=False)
+ metadata.append([wav_fpath_name, mel_fpath_name, embed_fpath_name, len(wav), mel_frames, text])
return [m for m in metadata if m is not None]
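Emotion extraction now happens in `preprocess_general` instead of `_process_utterance`, and the array is saved with the batch axis squeezed off. A minimal sketch of the resulting save/load round trip (the basename and embedding width below are placeholders, not values taken from the model):

```python
# Illustrative only: basename and embedding width are placeholders.
import numpy as np
import torch

basename = "sample.wav_00"                           # hypothetical sub_basename
emo = np.random.randn(1, 1024).astype(np.float32)    # stand-in for extract_emo() output
np.save("emo-%s.npy" % basename, emo.squeeze(0), allow_pickle=False)

# VitsDataset (see vits_dataset.py below) later reads the file back like this:
loaded = torch.FloatTensor(np.load("emo-%s.npy" % basename))
print(loaded.shape)                                  # torch.Size([1024]) for this placeholder
```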
diff --git a/models/synthesizer/train_vits.py b/models/synthesizer/train_vits.py
index d8324d9..6eba6ad 100644
--- a/models/synthesizer/train_vits.py
+++ b/models/synthesizer/train_vits.py
@@ -39,7 +39,7 @@ def new_train():
parser.add_argument("--syn_dir", type=str, default="../audiodata/SV2TTS/synthesizer", help= \
"Path to the synthesizer directory that contains the ground truth mel spectrograms, "
"the wavs, the emos and the embeds.")
- parser.add_argument("-m", "--model_dir", type=str, default="data/ckpt/synthesizer/vits", help=\
+ parser.add_argument("-m", "--model_dir", type=str, default="data/ckpt/synthesizer/vits2", help=\
"Path to the output directory that will contain the saved model weights and the logs.")
parser.add_argument('--ckptG', type=str, required=False,
help='original VITS G checkpoint path')
@@ -65,7 +65,7 @@ def new_train():
run(0, 1, hparams)
-def load_checkpoint(checkpoint_path, model, optimizer=None, is_old=False):
+def load_checkpoint(checkpoint_path, model, optimizer=None, is_old=False, epochs=10000):
assert os.path.isfile(checkpoint_path)
checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
iteration = checkpoint_dict['iteration']
@@ -89,8 +89,12 @@ def load_checkpoint(checkpoint_path, model, optimizer=None, is_old=False):
try:
new_state_dict[k] = saved_state_dict[k]
except:
- logger.info("%s is not in the checkpoint" % k)
- new_state_dict[k] = v
+ if k == 'step':
+ new_state_dict[k] = iteration * epochs
+ else:
+ logger.info("%s is not in the checkpoint" % k)
+ new_state_dict[k] = v
+
if hasattr(model, 'module'):
model.module.load_state_dict(new_state_dict, strict=False)
else:
@@ -173,13 +177,13 @@ def run(rank, n_gpus, hps):
print("加载原版VITS模型G记录点成功")
else:
_, _, _, epoch_str = load_checkpoint(latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g,
- optim_g)
+ optim_g, epochs=hps.train.epochs)
if ckptD is not None:
_, _, _, epoch_str = load_checkpoint(ckptG, net_g, optim_g, is_old=True)
print("加载原版VITS模型D记录点成功")
else:
_, _, _, epoch_str = load_checkpoint(latest_checkpoint_path(hps.model_dir, "D_*.pth"), net_d,
- optim_d)
+ optim_d, epochs=hps.train.epochs)
global_step = (epoch_str - 1) * len(train_loader)
except:
epoch_str = 1
@@ -216,17 +220,17 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade
net_g.train()
net_d.train()
for batch_idx, (x, x_lengths, spec, spec_lengths, y, y_lengths, speakers, emo) in enumerate(train_loader):
- logger.info(f'====> Step: 1 {batch_idx}')
- x, x_lengths = x.cuda(rank, non_blocking=True), x_lengths.cuda(rank, non_blocking=True)
- spec, spec_lengths = spec.cuda(rank, non_blocking=True), spec_lengths.cuda(rank, non_blocking=True)
- y, y_lengths = y.cuda(rank, non_blocking=True), y_lengths.cuda(rank, non_blocking=True)
- speakers = speakers.cuda(rank, non_blocking=True)
- emo = emo.cuda(rank, non_blocking=True)
-
+ # logger.info(f'====> Step: 1 {batch_idx}')
+ x, x_lengths = x.cuda(rank), x_lengths.cuda(rank)
+ spec, spec_lengths = spec.cuda(rank), spec_lengths.cuda(rank)
+ y, y_lengths = y.cuda(rank), y_lengths.cuda(rank)
+ speakers = speakers.cuda(rank)
+ emo = emo.cuda(rank)
+ # logger.info(f'====> Step: 1.0 {batch_idx}')
with autocast(enabled=hps.train.fp16_run):
y_hat, l_length, attn, ids_slice, x_mask, z_mask, \
(z, z_p, m_p, logs_p, m_q, logs_q) = net_g(x, x_lengths, spec, spec_lengths, speakers, emo)
-
+ # logger.info(f'====> Step: 1.1 {batch_idx}')
mel = spec_to_mel(
spec,
hps.data.filter_length,
@@ -247,7 +251,7 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade
)
y = slice_segments(y, ids_slice * hps.data.hop_length, hps.train.segment_size) # slice
-
+ # logger.info(f'====> Step: 1.3 {batch_idx}')
# Discriminator
y_d_hat_r, y_d_hat_g, _, _ = net_d(y, y_hat.detach())
with autocast(enabled=False):
@@ -258,7 +262,6 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade
scaler.unscale_(optim_d)
grad_norm_d = clip_grad_value_(net_d.parameters(), None)
scaler.step(optim_d)
- logger.info(f'====> Step: 2 {batch_idx}')
with autocast(enabled=hps.train.fp16_run):
# Generator
@@ -277,7 +280,6 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade
grad_norm_g = clip_grad_value_(net_g.parameters(), None)
scaler.step(optim_g)
scaler.update()
- # logger.info(f'====> Step: 3 {batch_idx}')
if rank == 0:
if global_step % hps.train.log_interval == 0:
lr = optim_g.param_groups[0]['lr']
@@ -339,6 +341,8 @@ def evaluate(hps, generator, eval_loader, writer_eval):
emo = emo[:1]
break
y_hat, attn, mask, *_ = generator.module.infer(x, x_lengths, speakers, emo, max_len=1000)
+ # y_hat, attn, mask, *_ = generator.infer(x, x_lengths, speakers, emo, max_len=1000) # when the generator is not wrapped in DistributedDataParallel
+
y_hat_lengths = mask.sum([1, 2]).long() * hps.data.hop_length
mel = spec_to_mel(
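The commented-out `generator.infer(...)` line added above hints at evaluating a generator that is not wrapped in `DistributedDataParallel`. One hedged way to cover both cases with a single call is to unwrap the module first; this is a drop-in fragment for `evaluate()`, with `generator`, `x`, `x_lengths`, `speakers`, and `emo` coming from the surrounding code:

```python
# Sketch: unwrap DistributedDataParallel if present, then call infer() as evaluate() does.
net = generator.module if hasattr(generator, "module") else generator
y_hat, attn, mask, *_ = net.infer(x, x_lengths, speakers, emo, max_len=1000)
```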
diff --git a/models/synthesizer/vits_dataset.py b/models/synthesizer/vits_dataset.py
index 32702d1..6acbfe8 100644
--- a/models/synthesizer/vits_dataset.py
+++ b/models/synthesizer/vits_dataset.py
@@ -4,7 +4,7 @@ import numpy as np
import torch
import torch.utils.data
-from utils.audio_utils import spectrogram, load_wav
+from utils.audio_utils import spectrogram1, load_wav_to_torch, spectrogram
from utils.util import intersperse
from models.synthesizer.utils.text import text_to_sequence
@@ -57,6 +57,8 @@ class VitsDataset(torch.utils.data.Dataset):
if self.min_text_len <= len(text) and len(text) <= self.max_text_len:
# TODO: for magic data only
speaker_name = wav_fpath.split("_")[1]
+ # # TODO: for ai data only
+ # speaker_name = wav_fpath.split("-")[1][6:9]
if speaker_name not in spk_to_sid:
sid += 1
spk_to_sid[speaker_name] = sid
@@ -71,36 +73,45 @@ class VitsDataset(torch.utils.data.Dataset):
# separate filename, speaker_id and text
wav_fpath, text, sid = audio_metadata[0], audio_metadata[5], audio_metadata[6]
text = self.get_text(text)
-
- spec, wav = self.get_audio(f'{self.datasets_root}{os.sep}audio{os.sep}{wav_fpath}')
+
+ # TODO: add original audio data root for loading
+ file_name = wav_fpath.split("_00")[0].split('-')[1]
+ spec, wav = self.get_audio(f'{self.datasets_root}{os.sep}..{os.sep}..{os.sep}magicdata{os.sep}train{os.sep}{"_".join(file_name.split("_")[:2])}{os.sep}{file_name}')
+
+ # spec, wav = self.get_audio(f'{self.datasets_root}{os.sep}audio{os.sep}{wav_fpath}')
sid = self.get_sid(sid)
emo = torch.FloatTensor(np.load(f'{self.datasets_root}{os.sep}emo{os.sep}{wav_fpath.replace("audio", "emo")}'))
return (text, spec, wav, sid, emo)
def get_audio(self, filename):
- # audio, sampling_rate = load_wav(filename)
-
- # if sampling_rate != self.sampling_rate:
- # raise ValueError("{} {} SR doesn't match target {} SR".format(
- # sampling_rate, self.sampling_rate))
- # audio = torch.load(filename)
- audio = torch.FloatTensor(np.load(filename).astype(np.float32))
- audio = audio.unsqueeze(0)
- # audio_norm = audio / self.max_wav_value
- # audio_norm = audio_norm.unsqueeze(0)
- # spec_filename = filename.replace(".wav", ".spec.pt")
- # if os.path.exists(spec_filename):
- # spec = torch.load(spec_filename)
- # else:
- # spec = spectrogram(audio, self.filter_length,
- # self.sampling_rate, self.hop_length, self.win_length,
- # center=False)
- # spec = torch.squeeze(spec, 0)
- # torch.save(spec, spec_filename)
- spec = spectrogram(audio, self.filter_length, self.hop_length, self.win_length,
+ audio, sampling_rate = load_wav_to_torch(filename)
+ if sampling_rate != self.sampling_rate:
+ raise ValueError("{} {} SR doesn't match target {} SR".format(
+ sampling_rate, self.sampling_rate))
+ audio_norm = audio / self.max_wav_value
+ audio_norm = audio_norm.unsqueeze(0)
+ spec = spectrogram(audio_norm, self.filter_length, self.hop_length, self.win_length,
center=False)
spec = torch.squeeze(spec, 0)
- return spec, audio
+ return spec, audio_norm
+
+ # print("Loading", filename)
+ # # audio = torch.FloatTensor(np.load(filename).astype(np.float32))
+ # audio = audio.unsqueeze(0)
+ # audio_norm = audio / self.max_wav_value
+ # audio_norm = audio_norm.unsqueeze(0)
+ # # spec_filename = filename.replace(".wav", ".spec.pt")
+ # # if os.path.exists(spec_filename):
+ # # spec = torch.load(spec_filename)
+ # # else:
+ # # spec = spectrogram(audio, self.filter_length,self.hop_length, self.win_length,
+ # # center=False)
+ # # spec = torch.squeeze(spec, 0)
+ # # torch.save(spec, spec_filename)
+ # spec = spectrogram(audio, self.filter_length, self.hop_length, self.win_length,
+ # center=False)
+ # spec = torch.squeeze(spec, 0)
+ # return spec, audio
def get_text(self, text):
if self.cleaned_text:
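The hard-coded magicdata path reconstruction in `__getitem__` above is easy to misread, so here is a hedged walk-through of the string surgery with a hypothetical metadata filename (the real magicdata naming may differ; this only illustrates the splitting logic):

```python
# Hypothetical metadata entry of the form produced by _process_utterance ("audio-<wav name>_00.npy").
import os

wav_fpath = "audio-38_5716_20170902123456.wav_00.npy"   # assumed example, not from the dataset
file_name = wav_fpath.split("_00")[0].split('-')[1]      # -> "38_5716_20170902123456.wav"
speaker_dir = "_".join(file_name.split("_")[:2])         # -> "38_5716"
print(os.sep.join(["..", "..", "magicdata", "train", speaker_dir, file_name]))
# Fragile by design: this breaks if "_00" ever appears earlier in the basename.
```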
diff --git a/utils/audio_utils.py b/utils/audio_utils.py
index dee34d1..93effe0 100644
--- a/utils/audio_utils.py
+++ b/utils/audio_utils.py
@@ -17,8 +17,7 @@ def load_wav_to_torch(full_path):
sampling_rate, data = read(full_path)
return torch.FloatTensor(data.astype(np.float32)), sampling_rate
-
-def spectrogram(y, n_fft, hop_size, win_size, center=False):
+def spectrogram1(y, n_fft, sampling_rate, hop_size, win_size, center=False):
if torch.min(y) < -1.:
print('min value is ', torch.min(y))
if torch.max(y) > 1.:
@@ -34,7 +33,29 @@ def spectrogram(y, n_fft, hop_size, win_size, center=False):
y = y.squeeze(1)
spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
- center=center, pad_mode='reflect', normalized=False, onesided=True)
+ center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=True)
+
+ spec = torch.sqrt(spec.real.pow(2) + spec.imag.pow(2) + 1e-6)  # magnitude of the complex STFT
+ return spec
+
+
+def spectrogram(y, n_fft, hop_size, win_size, center=False):
+ if torch.min(y) < -1.:
+ print('min value is ', torch.min(y))
+ if torch.max(y) > 1.:
+ print('max value is ', torch.max(y))
+
+ global hann_window
+ dtype_device = str(y.dtype) + '_' + str(y.device)
+ wnsize_dtype_device = str(win_size) + '_' + dtype_device
+ if wnsize_dtype_device not in hann_window:
+ hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)
+
+ y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
+ y = y.squeeze(1)
+
+ spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
+ center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False)
spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
return spec
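`spectrogram1` now receives a complex tensor from `torch.stft(..., return_complex=True)`, so its magnitude has to come from the real and imaginary parts rather than from summing a trailing size-2 axis (fixed above). A standalone check that the complex form matches the stacked real/imag form:

```python
# Self-contained check; does not import project code.
import torch

y = torch.randn(1, 16000)
n_fft, hop, win = 1024, 256, 1024
window = torch.hann_window(win)
pad = (n_fft - hop) // 2
y = torch.nn.functional.pad(y.unsqueeze(1), (pad, pad), mode='reflect').squeeze(1)

spec_c = torch.stft(y, n_fft, hop_length=hop, win_length=win, window=window,
                    center=False, pad_mode='reflect', normalized=False,
                    onesided=True, return_complex=True)
mag_complex = torch.sqrt(spec_c.real.pow(2) + spec_c.imag.pow(2) + 1e-6)
mag_stacked = torch.sqrt(torch.view_as_real(spec_c).pow(2).sum(-1) + 1e-6)
print(torch.allclose(mag_complex, mag_stacked))  # True
```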
diff --git a/vits.ipynb b/vits.ipynb
index cd01684..10242da 100644
--- a/vits.ipynb
+++ b/vits.ipynb
@@ -2,18 +2,9 @@
"cells": [
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "{'log_interval': 2000, 'eval_interval': 4000, 'seed': 1234, 'epochs': 10000, 'learning_rate': 0.0001, 'betas': [0.8, 0.99], 'eps': 1e-09, 'batch_size': 16, 'fp16_run': True, 'lr_decay': 0.5, 'segment_size': 8192, 'init_lr_ratio': 1, 'warmup_epochs': 0, 'c_mel': 45, 'c_kl': 1.0}\n",
- "Trainable Parameters: 0.000M\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"from utils.hparams import load_hparams_json\n",
"from utils.util import intersperse\n",
@@ -22,124 +13,36 @@
"import torch\n",
"import numpy as np\n",
"import IPython.display as ipd\n",
+ "from models.synthesizer.utils.symbols import symbols\n",
+ "from models.synthesizer.utils.text import text_to_sequence\n",
"\n",
- "# chinese_cleaners\n",
- "_pad = '_'\n",
- "_punctuation = ',。!?—…'\n",
- "_letters = 'ㄅㄆㄇㄈㄉㄊㄋㄌㄍㄎㄏㄐㄑㄒㄓㄔㄕㄖㄗㄘㄙㄚㄛㄜㄝㄞㄟㄠㄡㄢㄣㄤㄥㄦㄧㄨㄩˉˊˇˋ˙ '\n",
- "# Export all symbols:\n",
- "symbols = [_pad] + list(_punctuation) + list(_letters)\n",
"\n",
- "hps = load_hparams_json(\"data/ckpt/synthesizer/vits/config.json\")\n",
+ "hps = load_hparams_json(\"data/ckpt/synthesizer/vits2/config.json\")\n",
"print(hps.train)\n",
"model = Vits(\n",
" len(symbols),\n",
" hps[\"data\"][\"filter_length\"] // 2 + 1,\n",
" hps[\"train\"][\"segment_size\"] // hps[\"data\"][\"hop_length\"],\n",
" n_speakers=hps[\"data\"][\"n_speakers\"],\n",
- " stop_threshold=0.5,\n",
" **hps[\"model\"])\n",
"_ = model.eval()\n",
"device = torch.device(\"cpu\")\n",
- "model.load(\"data/ckpt/synthesizer/vits/G_208000.pth\", device)\n",
+ "checkpoint = torch.load(str(\"data/ckpt/synthesizer/vits2/G_120000.pth\"), map_location=device)\n",
+ "if \"model_state\" in checkpoint:\n",
+ " state = checkpoint[\"model_state\"]\n",
+ "else:\n",
+ " state = checkpoint[\"model\"]\n",
+ "model.load_state_dict(state, strict=False)\n",
"\n",
"# 随机抽取情感参考音频的根目录\n",
"random_emotion_root = \"D:\\\\audiodata\\\\aidatatang_200zh\\\\corpus\\\\train\\\\G0017\"\n",
"import random, re\n",
- "# import cn2an # remove dependency before production\n",
- "from pypinyin import lazy_pinyin, BOPOMOFO\n",
- "\n",
- "_symbol_to_id = {s: i for i, s in enumerate(symbols)}\n",
- "\n",
- "# def number_to_chinese(text):\n",
- "# numbers = re.findall(r'\\d+(?:\\.?\\d+)?', text)\n",
- "# for number in numbers:\n",
- "# text = text.replace(number, cn2an.an2cn(number), 1)\n",
- "# return text\n",
- "\n",
- "def chinese_to_bopomofo(text, taiwanese=False):\n",
- " text = text.replace('、', ',').replace(';', ',').replace(':', ',')\n",
- " for word in list(text):\n",
- " bopomofos = lazy_pinyin(word, BOPOMOFO)\n",
- " if not re.search('[\\u4e00-\\u9fff]', word):\n",
- " text += word\n",
- " continue\n",
- " for i in range(len(bopomofos)):\n",
- " bopomofos[i] = re.sub(r'([\\u3105-\\u3129])$', r'\\1ˉ', bopomofos[i])\n",
- " if text != '':\n",
- " text += ' '\n",
- " if taiwanese:\n",
- " text += '#'+'#'.join(bopomofos)\n",
- " else:\n",
- " text += ''.join(bopomofos)\n",
- " return text\n",
- "\n",
- "_latin_to_bopomofo = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [\n",
- " ('a', 'ㄟˉ'),\n",
- " ('b', 'ㄅㄧˋ'),\n",
- " ('c', 'ㄙㄧˉ'),\n",
- " ('d', 'ㄉㄧˋ'),\n",
- " ('e', 'ㄧˋ'),\n",
- " ('f', 'ㄝˊㄈㄨˋ'),\n",
- " ('g', 'ㄐㄧˋ'),\n",
- " ('h', 'ㄝˇㄑㄩˋ'),\n",
- " ('i', 'ㄞˋ'),\n",
- " ('j', 'ㄐㄟˋ'),\n",
- " ('k', 'ㄎㄟˋ'),\n",
- " ('l', 'ㄝˊㄛˋ'),\n",
- " ('m', 'ㄝˊㄇㄨˋ'),\n",
- " ('n', 'ㄣˉ'),\n",
- " ('o', 'ㄡˉ'),\n",
- " ('p', 'ㄆㄧˉ'),\n",
- " ('q', 'ㄎㄧㄡˉ'),\n",
- " ('r', 'ㄚˋ'),\n",
- " ('s', 'ㄝˊㄙˋ'),\n",
- " ('t', 'ㄊㄧˋ'),\n",
- " ('u', 'ㄧㄡˉ'),\n",
- " ('v', 'ㄨㄧˉ'),\n",
- " ('w', 'ㄉㄚˋㄅㄨˋㄌㄧㄡˋ'),\n",
- " ('x', 'ㄝˉㄎㄨˋㄙˋ'),\n",
- " ('y', 'ㄨㄞˋ'),\n",
- " ('z', 'ㄗㄟˋ')\n",
- "]]\n",
- "\n",
- "def latin_to_bopomofo(text):\n",
- " for regex, replacement in _latin_to_bopomofo:\n",
- " text = re.sub(regex, replacement, text)\n",
- " return text\n",
- "\n",
- "#TODO: add cleaner to support multilang\n",
- "def chinese_cleaners(text, cleaner_names):\n",
- " '''Pipeline for Chinese text'''\n",
- " # text = number_to_chinese(text)\n",
- " text = chinese_to_bopomofo(text)\n",
- " text = latin_to_bopomofo(text)\n",
- " if re.match('[ˉˊˇˋ˙]', text[-1]):\n",
- " text += '。'\n",
- " return text\n",
- "\n",
- "\n",
- "def text_to_sequence(text, cleaner_names):\n",
- " '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.\n",
- " Args:\n",
- " text: string to convert to a sequence\n",
- " cleaner_names: names of the cleaner functions to run the text through\n",
- " Returns:\n",
- " List of integers corresponding to the symbols in the text\n",
- " '''\n",
- " sequence = []\n",
- "\n",
- " clean_text = chinese_cleaners(text, cleaner_names)\n",
- " for symbol in clean_text:\n",
- " if symbol not in _symbol_to_id.keys():\n",
- " continue\n",
- " symbol_id = _symbol_to_id[symbol]\n",
- " sequence += [symbol_id]\n",
- " return sequence\n",
+ "from pypinyin import lazy_pinyin, Style\n",
"\n",
"import os\n",
"\n",
"def tts(txt, emotion, sid=0):\n",
+ " txt = \" \".join(lazy_pinyin(txt, style=Style.TONE3, neutral_tone_with_five=True))\n",
" text_norm = text_to_sequence(txt, hps[\"data\"][\"text_cleaners\"])\n",
" if hps[\"data\"][\"add_blank\"]:\n",
" text_norm = intersperse(text_norm, 0)\n",
@@ -173,37 +76,12 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- " \n",
- " "
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
+ "outputs": [],
"source": [
"txt = \"随机抽取的音频文件路径可以用于使用该情感合成其他句子\"\n",
- "tts(txt, emotion='C:\\\\Users\\\\babys\\\\Desktop\\\\voicecollection\\\\secondround\\\\美玉.wav', sid=0)"
+ "tts(txt, emotion='C:\\\\Users\\\\babys\\\\Desktop\\\\voicecollection\\\\secondround\\\\美玉.wav', sid=2)"
]
},
{
@@ -216,43 +94,31 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Using data from:\n",
- " ..\\audiodata\\magicdata\\train\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "magicdata: 0%| | 0/1018 [00:00, ?speakers/s]"
- ]
- },
- {
- "ename": "",
- "evalue": "",
- "output_type": "error",
- "traceback": [
- "\u001b[1;31mThe Kernel crashed while executing code in the the current cell or a previous cell. Please review the code in the cell(s) to identify a possible cause of the failure. Click here for more info. View Jupyter log for further details."
- ]
- }
- ],
+ "outputs": [],
"source": [
"from models.synthesizer.preprocess import preprocess_dataset\n",
"from pathlib import Path\n",
"from utils.hparams import HParams\n",
"datasets_root = Path(\"../audiodata/\")\n",
- "hparams=HParams(\n",
+ "hparams = HParams(\n",
+ " n_fft = 1024, # filter_length\n",
+ " num_mels = 80,\n",
+ " hop_size = 256, # Tacotron uses 12.5 ms frame shift (set to sample_rate * 0.0125)\n",
+ " win_size = 1024, # Tacotron uses 50 ms frame length (set to sample_rate * 0.050)\n",
+ " fmin = 55,\n",
+ " min_level_db = -100,\n",
+ " ref_level_db = 20,\n",
+ " max_abs_value = 4., # Gradient explodes if too big, premature convergence if too small.\n",
" sample_rate = 16000,\n",
" rescale = True,\n",
" max_mel_frames = 900,\n",
- " rescaling_max = 0.9,\n",
+ " rescaling_max = 0.9, \n",
+ " preemphasis = 0.97, # Filter coefficient to use if preemphasize is True\n",
+ " preemphasize = True,\n",
+ " ### Mel Visualization and Griffin-Lim\n",
+ " signal_normalization = True,\n",
"\n",
" utterance_min_duration = 1.6, # Duration in seconds below which utterances are discarded\n",
" ### Audio processing options\n",
@@ -285,40 +151,9 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\\Loading the json with %s\n",
- " data\\ckpt\\synthesizer\\vits\\config.json\n"
- ]
- },
- {
- "ename": "ProcessRaisedException",
- "evalue": "\n\n-- Process 0 terminated with the following error:\nTraceback (most recent call last):\n File \"d:\\Users\\babys\\Anaconda3\\envs\\mo\\lib\\site-packages\\torch\\multiprocessing\\spawn.py\", line 59, in _wrap\n fn(i, *args)\n File \"d:\\Real-Time-Voice-Cloning-Chinese\\models\\synthesizer\\train_vits.py\", line 123, in run\n net_g = Vits(\nTypeError: __init__() missing 1 required positional argument: 'stop_threshold'\n",
- "output_type": "error",
- "traceback": [
- "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
- "\u001b[1;31mProcessRaisedException\u001b[0m Traceback (most recent call last)",
- "\u001b[1;32md:\\Real-Time-Voice-Cloning-Chinese\\vits.ipynb Cell 7\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 18\u001b[0m os\u001b[39m.\u001b[39menviron[\u001b[39m'\u001b[39m\u001b[39mMASTER_ADDR\u001b[39m\u001b[39m'\u001b[39m] \u001b[39m=\u001b[39m \u001b[39m'\u001b[39m\u001b[39mlocalhost\u001b[39m\u001b[39m'\u001b[39m\n\u001b[0;32m 19\u001b[0m os\u001b[39m.\u001b[39menviron[\u001b[39m'\u001b[39m\u001b[39mMASTER_PORT\u001b[39m\u001b[39m'\u001b[39m] \u001b[39m=\u001b[39m \u001b[39m'\u001b[39m\u001b[39m8899\u001b[39m\u001b[39m'\u001b[39m\n\u001b[1;32m---> 20\u001b[0m mp\u001b[39m.\u001b[39;49mspawn(run, nprocs\u001b[39m=\u001b[39;49mn_gpus, args\u001b[39m=\u001b[39;49m(n_gpus, hparams))\n",
- "File \u001b[1;32md:\\Users\\babys\\Anaconda3\\envs\\mo\\lib\\site-packages\\torch\\multiprocessing\\spawn.py:230\u001b[0m, in \u001b[0;36mspawn\u001b[1;34m(fn, args, nprocs, join, daemon, start_method)\u001b[0m\n\u001b[0;32m 226\u001b[0m msg \u001b[39m=\u001b[39m (\u001b[39m'\u001b[39m\u001b[39mThis method only supports start_method=spawn (got: \u001b[39m\u001b[39m%s\u001b[39;00m\u001b[39m).\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39m'\u001b[39m\n\u001b[0;32m 227\u001b[0m \u001b[39m'\u001b[39m\u001b[39mTo use a different start_method use:\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39m\\t\u001b[39;00m\u001b[39m\\t\u001b[39;00m\u001b[39m'\u001b[39m\n\u001b[0;32m 228\u001b[0m \u001b[39m'\u001b[39m\u001b[39m torch.multiprocessing.start_processes(...)\u001b[39m\u001b[39m'\u001b[39m \u001b[39m%\u001b[39m start_method)\n\u001b[0;32m 229\u001b[0m warnings\u001b[39m.\u001b[39mwarn(msg)\n\u001b[1;32m--> 230\u001b[0m \u001b[39mreturn\u001b[39;00m start_processes(fn, args, nprocs, join, daemon, start_method\u001b[39m=\u001b[39;49m\u001b[39m'\u001b[39;49m\u001b[39mspawn\u001b[39;49m\u001b[39m'\u001b[39;49m)\n",
- "File \u001b[1;32md:\\Users\\babys\\Anaconda3\\envs\\mo\\lib\\site-packages\\torch\\multiprocessing\\spawn.py:188\u001b[0m, in \u001b[0;36mstart_processes\u001b[1;34m(fn, args, nprocs, join, daemon, start_method)\u001b[0m\n\u001b[0;32m 185\u001b[0m \u001b[39mreturn\u001b[39;00m context\n\u001b[0;32m 187\u001b[0m \u001b[39m# Loop on join until it returns True or raises an exception.\u001b[39;00m\n\u001b[1;32m--> 188\u001b[0m \u001b[39mwhile\u001b[39;00m \u001b[39mnot\u001b[39;00m context\u001b[39m.\u001b[39;49mjoin():\n\u001b[0;32m 189\u001b[0m \u001b[39mpass\u001b[39;00m\n",
- "File \u001b[1;32md:\\Users\\babys\\Anaconda3\\envs\\mo\\lib\\site-packages\\torch\\multiprocessing\\spawn.py:150\u001b[0m, in \u001b[0;36mProcessContext.join\u001b[1;34m(self, timeout)\u001b[0m\n\u001b[0;32m 148\u001b[0m msg \u001b[39m=\u001b[39m \u001b[39m\"\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39m\\n\u001b[39;00m\u001b[39m-- Process \u001b[39m\u001b[39m%d\u001b[39;00m\u001b[39m terminated with the following error:\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39m\"\u001b[39m \u001b[39m%\u001b[39m error_index\n\u001b[0;32m 149\u001b[0m msg \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m original_trace\n\u001b[1;32m--> 150\u001b[0m \u001b[39mraise\u001b[39;00m ProcessRaisedException(msg, error_index, failed_process\u001b[39m.\u001b[39mpid)\n",
- "\u001b[1;31mProcessRaisedException\u001b[0m: \n\n-- Process 0 terminated with the following error:\nTraceback (most recent call last):\n File \"d:\\Users\\babys\\Anaconda3\\envs\\mo\\lib\\site-packages\\torch\\multiprocessing\\spawn.py\", line 59, in _wrap\n fn(i, *args)\n File \"d:\\Real-Time-Voice-Cloning-Chinese\\models\\synthesizer\\train_vits.py\", line 123, in run\n net_g = Vits(\nTypeError: __init__() missing 1 required positional argument: 'stop_threshold'\n"
- ]
- },
- {
- "ename": "",
- "evalue": "",
- "output_type": "error",
- "traceback": [
- "\u001b[1;31mThe Kernel crashed while executing code in the the current cell or a previous cell. Please review the code in the cell(s) to identify a possible cause of the failure. Click here for more info. View Jupyter log for further details."
- ]
- }
- ],
+ "outputs": [],
"source": [
"from models.synthesizer.train_vits import run\n",
"from pathlib import Path\n",
@@ -352,7 +187,7 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -380,7 +215,7 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
|