diff --git a/.vscode/launch.json b/.vscode/launch.json
index b2ab7f8..a766d67 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -64,6 +64,14 @@
             "args": ["-c", ".\\ppg2mel\\saved_models\\seq2seq_mol_ppg2mel_vctk_libri_oneshotvc_r4_normMel_v2.yaml", "-m", ".\\ppg2mel\\saved_models\\best_loss_step_304000.pth", "--wav_dir", ".\\wavs\\input", "--ref_wav_path", ".\\wavs\\pkq.mp3", "-o", ".\\wavs\\output\\" ]
-        }
+        },
+        {
+            "name": "Python: Vits Train",
+            "type": "python",
+            "request": "launch",
+            "program": "train.py",
+            "console": "integratedTerminal",
+            "args": ["--type", "vits"]
+        }
     ]
 }
diff --git a/models/synthesizer/hparams.py b/models/synthesizer/hparams.py
index ca3e635..b0d46f7 100644
--- a/models/synthesizer/hparams.py
+++ b/models/synthesizer/hparams.py
@@ -3,10 +3,10 @@ from utils.hparams import HParams
 hparams = HParams(
     ### Signal Processing (used in both synthesizer and vocoder)
     sample_rate = 16000,
-    n_fft = 800,
+    n_fft = 1024,    # filter_length
     num_mels = 80,
-    hop_size = 200,  # Tacotron uses 12.5 ms frame shift (set to sample_rate * 0.0125)
-    win_size = 800,  # Tacotron uses 50 ms frame length (set to sample_rate * 0.050)
+    hop_size = 256,  # 16 ms frame shift at 16 kHz (matches the VITS hop_length)
+    win_size = 1024, # 64 ms frame length at 16 kHz (matches the VITS win_length / filter_length)
     fmin = 55,
     min_level_db = -100,
     ref_level_db = 20,
@@ -67,7 +67,7 @@ hparams = HParams(
     use_lws = False,            # "Fast spectrogram phase recovery using local weighted sums"
     symmetric_mels = True,      # Sets mel range to [-max_abs_value, max_abs_value] if True,
                                 #               and [0, max_abs_value] if False
-    trim_silence = True,        # Use with sample_rate of 16000 for best results
+    trim_silence = False,       # Disabled here; silence is trimmed in _split_on_silences during preprocessing instead
 
     ### SV2TTS
     speaker_embedding_size = 256,               # Dimension for the speaker embedding
diff --git a/models/synthesizer/models/vits.py b/models/synthesizer/models/vits.py
index db4a917..0041a88 100644
--- a/models/synthesizer/models/vits.py
+++ b/models/synthesizer/models/vits.py
@@ -2,12 +2,12 @@ import math
 import torch
 from torch import nn
 from torch.nn import functional as F
+from loguru import logger
 
 from .sublayer.vits_modules import *
 import monotonic_align
 
-from .base import Base
-from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
+from torch.nn import Conv1d, ConvTranspose1d, Conv2d
 from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
 from utils.util import init_weights, get_padding, sequence_mask, rand_slice_segments, generate_path
@@ -386,7 +386,7 @@ class MultiPeriodDiscriminator(torch.nn.Module):
         return y_d_rs, y_d_gs, fmap_rs, fmap_gs
 
 
-class Vits(Base):
+class Vits(nn.Module):
     """
     Synthesizer of Vits
     """
@@ -408,13 +408,12 @@ class Vits(Base):
         upsample_rates,
         upsample_initial_channel,
         upsample_kernel_sizes,
-        stop_threshold,
         n_speakers=0,
         gin_channels=0,
         use_sdp=True,
         **kwargs):
 
-        super().__init__(stop_threshold)
+        super().__init__()
         self.n_vocab = n_vocab
         self.spec_channels = spec_channels
         self.inter_channels = inter_channels
@@ -457,7 +456,7 @@ class Vits(Base):
             self.emb_g = nn.Embedding(n_speakers, gin_channels)
 
     def forward(self, x, x_lengths, y, y_lengths, sid=None, emo=None):
-
+        # logger.info(f'====> Forward: 1.1.0')
         x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths, emo)
         if self.n_speakers > 0:
             g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1]
@@ -466,7 +465,7 @@ class Vits(Base):
         z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
         z_p = self.flow(z, y_mask, g=g)
-
+        # logger.info(f'====> Forward: 1.1.1')
         with torch.no_grad():
             # negative cross-entropy
             s_p_sq_r = torch.exp(-2 * logs_p) # [b, d, t]
@@ -475,10 +474,11 @@ class Vits(Base):
             neg_cent3 = torch.matmul(z_p.transpose(1, 2), (m_p * s_p_sq_r)) # [b, t_t, d] x [b, d, t_s] = [b, t_t, t_s]
             neg_cent4 = torch.sum(-0.5 * (m_p ** 2) * s_p_sq_r, [1], keepdim=True) # [b, 1, t_s]
             neg_cent = neg_cent1 + neg_cent2 + neg_cent3 + neg_cent4
-
+            #logger.info(f'====> Forward: 1.1.1.1')
             attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
             attn = monotonic_align.maximum_path(neg_cent, attn_mask.squeeze(1)).unsqueeze(1).detach()
+            # logger.info(f'====> Forward: 1.1.2')
 
         w = attn.sum(2)
         if self.use_sdp:
             l_length = self.dp(x, x_mask, w, g=g)
@@ -487,7 +487,6 @@ class Vits(Base):
             logw_ = torch.log(w + 1e-6) * x_mask
             logw = self.dp(x, x_mask, g=g)
             l_length = torch.sum((logw - logw_)**2, [1,2]) / torch.sum(x_mask) # for averaging
-
         # expand prior
         m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2)
         logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2)
@@ -497,7 +496,9 @@ class Vits(Base):
         return o, l_length, attn, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
 
     def infer(self, x, x_lengths, sid=None, emo=None, noise_scale=1, length_scale=1, noise_scale_w=1., max_len=None):
+        # logger.info(f'====> Infer: 1.1.0')
         x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths,emo)
+        # logger.info(f'====> Infer: 1.1.1')
         if self.n_speakers > 0:
             g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1]
         else:
@@ -514,11 +515,14 @@ class Vits(Base):
         attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
         attn = generate_path(w_ceil, attn_mask)
+        # logger.info(f'====> Infer: 1.1.2')
 
         m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t']
         logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t']
 
         z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
         z = self.flow(z_p, y_mask, g=g, reverse=True)
         o = self.dec((z * y_mask)[:,:,:max_len], g=g)
+
+        # logger.info(f'====> Infer: 1.1.3')
         return o, attn, y_mask, (z, z_p, m_p, logs_p)
 
diff --git a/models/synthesizer/preprocess_audio.py b/models/synthesizer/preprocess_audio.py
index c8f7904..a2e08f8 100644
--- a/models/synthesizer/preprocess_audio.py
+++ b/models/synthesizer/preprocess_audio.py
@@ -20,8 +20,6 @@ device = 'cuda' if torch.cuda.is_available() else "cpu"
 model_name = 'audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim'
 processor = Wav2Vec2Processor.from_pretrained(model_name)
 model = EmotionExtractorModel.from_pretrained(model_name).to(device)
-embs = []
-wavnames = []
 
 def extract_emo(
     x: np.ndarray,
@@ -48,8 +46,6 @@ class PinyinConverter(NeutralToneWith5Mixin, DefaultConverter):
 
 pinyin = Pinyin(PinyinConverter()).pinyin
 
-
-
 def _process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
                        skip_existing: bool, hparams, emotion_extract: bool):
     ## FOR REFERENCE:
@@ -67,9 +63,8 @@ def _process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
     # Skip existing utterances if needed
     mel_fpath = out_dir.joinpath("mels", "mel-%s.npy" % basename)
     wav_fpath = out_dir.joinpath("audio", "audio-%s.npy" % basename)
-    emo_fpath = out_dir.joinpath("emo", "emo-%s.npy" % basename)
-    skip_emo_extract = not emotion_extract or (skip_existing and emo_fpath.exists())
-    if skip_existing and mel_fpath.exists() and wav_fpath.exists() and skip_emo_extract:
+
+    if skip_existing and mel_fpath.exists() and wav_fpath.exists():
         return None
 
     # Trim silence
@@ -91,18 +86,14 @@ def _process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
     np.save(mel_fpath, mel_spectrogram.T, allow_pickle=False)
     np.save(wav_fpath, wav, allow_pickle=False)
 
-    if not skip_emo_extract:
-        emo = extract_emo(np.expand_dims(wav, 0), hparams.sample_rate, True)
-        np.save(emo_fpath, emo, allow_pickle=False)
-
     # Return a tuple describing this training example
-    return wav_fpath.name, mel_fpath.name, "embed-%s.npy" % basename, len(wav), mel_frames, text
+    return wav_fpath.name, mel_fpath.name, "embed-%s.npy" % basename, wav, mel_frames, text
 
 
 def _split_on_silences(wav_fpath, words, hparams):
     # Load the audio waveform
     wav, _ = librosa.load(wav_fpath, sr= hparams.sample_rate)
-    wav = librosa.effects.trim(wav, top_db= 40, frame_length=2048, hop_length=512)[0]
+    wav = librosa.effects.trim(wav, top_db= 40, frame_length=2048, hop_length=1024)[0]
     if hparams.rescale:
         wav = wav / np.abs(wav).max() * hparams.rescaling_max
     # denoise, we may not need it here.
@@ -132,6 +123,15 @@ def preprocess_general(speaker_dir, out_dir: Path, skip_existing: bool, hparams,
                 continue
             sub_basename = "%s_%02d" % (wav_fpath.name, 0)
             wav, text = _split_on_silences(wav_fpath, words, hparams)
-            metadata.append(_process_utterance(wav, text, out_dir, sub_basename,
-                                               skip_existing, hparams, emotion_extract))
+            result = _process_utterance(wav, text, out_dir, sub_basename,
+                                        skip_existing, hparams, emotion_extract)
+            if result is None:
+                continue
+            wav_fpath_name, mel_fpath_name, embed_fpath_name, wav, mel_frames, text = result
+            emo_fpath = out_dir.joinpath("emo", "emo-%s.npy" % sub_basename)
+            skip_emo_extract = not emotion_extract or (skip_existing and emo_fpath.exists())
+            if not skip_emo_extract and wav is not None:
+                emo = extract_emo(np.expand_dims(wav, 0), hparams.sample_rate, True)
+                np.save(emo_fpath, emo.squeeze(0), allow_pickle=False)
+            metadata.append([wav_fpath_name, mel_fpath_name, embed_fpath_name, len(wav), mel_frames, text])
     return [m for m in metadata if m is not None]
diff --git a/models/synthesizer/train_vits.py b/models/synthesizer/train_vits.py
index d8324d9..6eba6ad 100644
--- a/models/synthesizer/train_vits.py
+++ b/models/synthesizer/train_vits.py
@@ -39,7 +39,7 @@ def new_train():
     parser.add_argument("--syn_dir", type=str, default="../audiodata/SV2TTS/synthesizer", help= \
         "Path to the synthesizer directory that contains the ground truth mel spectrograms, "
         "the wavs, the emos and the embeds.")
-    parser.add_argument("-m", "--model_dir", type=str, default="data/ckpt/synthesizer/vits", help=\
+    parser.add_argument("-m", "--model_dir", type=str, default="data/ckpt/synthesizer/vits2", help=\
         "Path to the output directory that will contain the saved model weights and the logs.")
     parser.add_argument('--ckptG', type=str, required=False, help='original VITS G checkpoint path')
@@ -65,7 +65,7 @@ def new_train():
 
     run(0, 1, hparams)
 
-def load_checkpoint(checkpoint_path, model, optimizer=None, is_old=False):
+def load_checkpoint(checkpoint_path, model, optimizer=None, is_old=False, epochs=10000):
     assert os.path.isfile(checkpoint_path)
     checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
     iteration = checkpoint_dict['iteration']
@@ -89,8 +89,12 @@ def load_checkpoint(checkpoint_path, model, optimizer=None, is_old=False):
         try:
             new_state_dict[k] = saved_state_dict[k]
         except:
-            logger.info("%s is not in the checkpoint" % k)
-            new_state_dict[k] = v
+            if k == 'step':
+                new_state_dict[k] = iteration * epochs
+            else:
+                logger.info("%s is not in the checkpoint" % k)
checkpoint" % k) + new_state_dict[k] = v + if hasattr(model, 'module'): model.module.load_state_dict(new_state_dict, strict=False) else: @@ -173,13 +177,13 @@ def run(rank, n_gpus, hps): print("加载原版VITS模型G记录点成功") else: _, _, _, epoch_str = load_checkpoint(latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, - optim_g) + optim_g, epochs=hps.train.epochs) if ckptD is not None: _, _, _, epoch_str = load_checkpoint(ckptG, net_g, optim_g, is_old=True) print("加载原版VITS模型D记录点成功") else: _, _, _, epoch_str = load_checkpoint(latest_checkpoint_path(hps.model_dir, "D_*.pth"), net_d, - optim_d) + optim_d, epochs=hps.train.epochs) global_step = (epoch_str - 1) * len(train_loader) except: epoch_str = 1 @@ -216,17 +220,17 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade net_g.train() net_d.train() for batch_idx, (x, x_lengths, spec, spec_lengths, y, y_lengths, speakers, emo) in enumerate(train_loader): - logger.info(f'====> Step: 1 {batch_idx}') - x, x_lengths = x.cuda(rank, non_blocking=True), x_lengths.cuda(rank, non_blocking=True) - spec, spec_lengths = spec.cuda(rank, non_blocking=True), spec_lengths.cuda(rank, non_blocking=True) - y, y_lengths = y.cuda(rank, non_blocking=True), y_lengths.cuda(rank, non_blocking=True) - speakers = speakers.cuda(rank, non_blocking=True) - emo = emo.cuda(rank, non_blocking=True) - + # logger.info(f'====> Step: 1 {batch_idx}') + x, x_lengths = x.cuda(rank), x_lengths.cuda(rank) + spec, spec_lengths = spec.cuda(rank), spec_lengths.cuda(rank) + y, y_lengths = y.cuda(rank), y_lengths.cuda(rank) + speakers = speakers.cuda(rank) + emo = emo.cuda(rank) + # logger.info(f'====> Step: 1.0 {batch_idx}') with autocast(enabled=hps.train.fp16_run): y_hat, l_length, attn, ids_slice, x_mask, z_mask, \ (z, z_p, m_p, logs_p, m_q, logs_q) = net_g(x, x_lengths, spec, spec_lengths, speakers, emo) - + # logger.info(f'====> Step: 1.1 {batch_idx}') mel = spec_to_mel( spec, hps.data.filter_length, @@ -247,7 +251,7 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade ) y = slice_segments(y, ids_slice * hps.data.hop_length, hps.train.segment_size) # slice - + # logger.info(f'====> Step: 1.3 {batch_idx}') # Discriminator y_d_hat_r, y_d_hat_g, _, _ = net_d(y, y_hat.detach()) with autocast(enabled=False): @@ -258,7 +262,6 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade scaler.unscale_(optim_d) grad_norm_d = clip_grad_value_(net_d.parameters(), None) scaler.step(optim_d) - logger.info(f'====> Step: 2 {batch_idx}') with autocast(enabled=hps.train.fp16_run): # Generator @@ -277,7 +280,6 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade grad_norm_g = clip_grad_value_(net_g.parameters(), None) scaler.step(optim_g) scaler.update() - # logger.info(f'====> Step: 3 {batch_idx}') if rank == 0: if global_step % hps.train.log_interval == 0: lr = optim_g.param_groups[0]['lr'] @@ -339,6 +341,8 @@ def evaluate(hps, generator, eval_loader, writer_eval): emo = emo[:1] break y_hat, attn, mask, *_ = generator.module.infer(x, x_lengths, speakers, emo, max_len=1000) + # y_hat, attn, mask, *_ = generator.infer(x, x_lengths, speakers, emo, max_len=1000) # for non DistributedDataParallel object + y_hat_lengths = mask.sum([1, 2]).long() * hps.data.hop_length mel = spec_to_mel( diff --git a/models/synthesizer/vits_dataset.py b/models/synthesizer/vits_dataset.py index 32702d1..6acbfe8 100644 --- a/models/synthesizer/vits_dataset.py +++ b/models/synthesizer/vits_dataset.py @@ -4,7 +4,7 @@ 
 import numpy as np
 import torch
 import torch.utils.data
-from utils.audio_utils import spectrogram, load_wav
+from utils.audio_utils import spectrogram1, load_wav_to_torch, spectrogram
 from utils.util import intersperse
 from models.synthesizer.utils.text import text_to_sequence
@@ -57,6 +57,8 @@ class VitsDataset(torch.utils.data.Dataset):
             if self.min_text_len <= len(text) and len(text) <= self.max_text_len:
                 # TODO: for magic data only
                 speaker_name = wav_fpath.split("_")[1]
+                # # TODO: for ai data only
+                # speaker_name = wav_fpath.split("-")[1][6:9]
                 if speaker_name not in spk_to_sid:
                     sid += 1
                     spk_to_sid[speaker_name] = sid
@@ -71,36 +73,45 @@ class VitsDataset(torch.utils.data.Dataset):
         # separate filename, speaker_id and text
         wav_fpath, text, sid = audio_metadata[0], audio_metadata[5], audio_metadata[6]
         text = self.get_text(text)
-
-        spec, wav = self.get_audio(f'{self.datasets_root}{os.sep}audio{os.sep}{wav_fpath}')
+
+        # TODO: add original audio data root for loading
+        file_name = wav_fpath.split("_00")[0].split('-')[1]
+        spec, wav = self.get_audio(f'{self.datasets_root}{os.sep}..{os.sep}..{os.sep}magicdata{os.sep}train{os.sep}{"_".join(file_name.split("_")[:2])}{os.sep}{file_name}')
+
+        # spec, wav = self.get_audio(f'{self.datasets_root}{os.sep}audio{os.sep}{wav_fpath}')
         sid = self.get_sid(sid)
         emo = torch.FloatTensor(np.load(f'{self.datasets_root}{os.sep}emo{os.sep}{wav_fpath.replace("audio", "emo")}'))
         return (text, spec, wav, sid, emo)
 
     def get_audio(self, filename):
-        # audio, sampling_rate = load_wav(filename)
-
-        # if sampling_rate != self.sampling_rate:
-        #     raise ValueError("{} {} SR doesn't match target {} SR".format(
-        #         sampling_rate, self.sampling_rate))
-        # audio = torch.load(filename)
-        audio = torch.FloatTensor(np.load(filename).astype(np.float32))
-        audio = audio.unsqueeze(0)
-        # audio_norm = audio / self.max_wav_value
-        # audio_norm = audio_norm.unsqueeze(0)
-        # spec_filename = filename.replace(".wav", ".spec.pt")
-        # if os.path.exists(spec_filename):
-        #     spec = torch.load(spec_filename)
-        # else:
-        #     spec = spectrogram(audio, self.filter_length,
-        #         self.sampling_rate, self.hop_length, self.win_length,
-        #         center=False)
-        #     spec = torch.squeeze(spec, 0)
-        #     torch.save(spec, spec_filename)
-        spec = spectrogram(audio, self.filter_length, self.hop_length, self.win_length,
+        audio, sampling_rate = load_wav_to_torch(filename)
+        if sampling_rate != self.sampling_rate:
+            raise ValueError("{} SR doesn't match target {} SR".format(
+                sampling_rate, self.sampling_rate))
+        audio_norm = audio / self.max_wav_value
+        audio_norm = audio_norm.unsqueeze(0)
+        spec = spectrogram(audio_norm, self.filter_length, self.hop_length, self.win_length,
             center=False)
         spec = torch.squeeze(spec, 0)
-        return spec, audio
+        return spec, audio_norm
+
+        # print("Loading", filename)
+        # # audio = torch.FloatTensor(np.load(filename).astype(np.float32))
+        # audio = audio.unsqueeze(0)
+        # audio_norm = audio / self.max_wav_value
+        # audio_norm = audio_norm.unsqueeze(0)
+        # # spec_filename = filename.replace(".wav", ".spec.pt")
+        # # if os.path.exists(spec_filename):
+        # #     spec = torch.load(spec_filename)
+        # # else:
+        # #     spec = spectrogram(audio, self.filter_length,self.hop_length, self.win_length,
+        # #         center=False)
+        # #     spec = torch.squeeze(spec, 0)
+        # #     torch.save(spec, spec_filename)
+        # spec = spectrogram(audio, self.filter_length, self.hop_length, self.win_length,
+        #     center=False)
+        # spec = torch.squeeze(spec, 0)
+        # return spec, audio
 
     def get_text(self, text):
         if self.cleaned_text:
diff --git a/utils/audio_utils.py b/utils/audio_utils.py
index dee34d1..93effe0 100644
--- a/utils/audio_utils.py
+++ b/utils/audio_utils.py
@@ -17,8 +17,7 @@ def load_wav_to_torch(full_path):
     sampling_rate, data = read(full_path)
     return torch.FloatTensor(data.astype(np.float32)), sampling_rate
 
-
-def spectrogram(y, n_fft, hop_size, win_size, center=False):
+def spectrogram1(y, n_fft, sampling_rate, hop_size, win_size, center=False):
     if torch.min(y) < -1.:
         print('min value is ', torch.min(y))
     if torch.max(y) > 1.:
@@ -34,7 +33,30 @@ def spectrogram(y, n_fft, hop_size, win_size, center=False):
     y = y.squeeze(1)
 
     spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
-                      center=center, pad_mode='reflect', normalized=False, onesided=True)
+                      center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=True)
+
+    spec = torch.view_as_real(spec)  # complex STFT -> [..., 2] so the magnitude below is computed correctly
+    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
+    return spec
+
+
+def spectrogram(y, n_fft, hop_size, win_size, center=False):
+    if torch.min(y) < -1.:
+        print('min value is ', torch.min(y))
+    if torch.max(y) > 1.:
+        print('max value is ', torch.max(y))
+
+    global hann_window
+    dtype_device = str(y.dtype) + '_' + str(y.device)
+    wnsize_dtype_device = str(win_size) + '_' + dtype_device
+    if wnsize_dtype_device not in hann_window:
+        hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)
+
+    y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
+    y = y.squeeze(1)
+
+    spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
+                      center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False)
 
     spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
     return spec
diff --git a/vits.ipynb b/vits.ipynb
index cd01684..10242da 100644
--- a/vits.ipynb
+++ b/vits.ipynb
@@ -2,18 +2,9 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "{'log_interval': 2000, 'eval_interval': 4000, 'seed': 1234, 'epochs': 10000, 'learning_rate': 0.0001, 'betas': [0.8, 0.99], 'eps': 1e-09, 'batch_size': 16, 'fp16_run': True, 'lr_decay': 0.5, 'segment_size': 8192, 'init_lr_ratio': 1, 'warmup_epochs': 0, 'c_mel': 45, 'c_kl': 1.0}\n",
-      "Trainable Parameters: 0.000M\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "from utils.hparams import load_hparams_json\n",
     "from utils.util import intersperse\n",
@@ -22,124 +13,36 @@
     "import torch\n",
     "import numpy as np\n",
     "import IPython.display as ipd\n",
+    "from models.synthesizer.utils.symbols import symbols\n",
+    "from models.synthesizer.utils.text import text_to_sequence\n",
     "\n",
-    "# chinese_cleaners\n",
-    "_pad = '_'\n",
-    "_punctuation = ',。!?—…'\n",
-    "_letters = 'ㄅㄆㄇㄈㄉㄊㄋㄌㄍㄎㄏㄐㄑㄒㄓㄔㄕㄖㄗㄘㄙㄚㄛㄜㄝㄞㄟㄠㄡㄢㄣㄤㄥㄦㄧㄨㄩˉˊˇˋ˙ '\n",
-    "# Export all symbols:\n",
-    "symbols = [_pad] + list(_punctuation) + list(_letters)\n",
     "\n",
-    "hps = load_hparams_json(\"data/ckpt/synthesizer/vits/config.json\")\n",
+    "hps = load_hparams_json(\"data/ckpt/synthesizer/vits2/config.json\")\n",
     "print(hps.train)\n",
     "model = Vits(\n",
    "    len(symbols),\n",
     "    hps[\"data\"][\"filter_length\"] // 2 + 1,\n",
     "    hps[\"train\"][\"segment_size\"] // hps[\"data\"][\"hop_length\"],\n",
     "    n_speakers=hps[\"data\"][\"n_speakers\"],\n",
-    "    stop_threshold=0.5,\n",
     "    **hps[\"model\"])\n",
     "_ = model.eval()\n",
     "device = torch.device(\"cpu\")\n",
-    "model.load(\"data/ckpt/synthesizer/vits/G_208000.pth\", device)\n",
"model.load(\"data/ckpt/synthesizer/vits/G_208000.pth\", device)\n", + "checkpoint = torch.load(str(\"data/ckpt/synthesizer/vits2/G_120000.pth\"), map_location=device)\n", + "if \"model_state\" in checkpoint:\n", + " state = checkpoint[\"model_state\"]\n", + "else:\n", + " state = checkpoint[\"model\"]\n", + "model.load_state_dict(state, strict=False)\n", "\n", "# 随机抽取情感参考音频的根目录\n", "random_emotion_root = \"D:\\\\audiodata\\\\aidatatang_200zh\\\\corpus\\\\train\\\\G0017\"\n", "import random, re\n", - "# import cn2an # remove dependency before production\n", - "from pypinyin import lazy_pinyin, BOPOMOFO\n", - "\n", - "_symbol_to_id = {s: i for i, s in enumerate(symbols)}\n", - "\n", - "# def number_to_chinese(text):\n", - "# numbers = re.findall(r'\\d+(?:\\.?\\d+)?', text)\n", - "# for number in numbers:\n", - "# text = text.replace(number, cn2an.an2cn(number), 1)\n", - "# return text\n", - "\n", - "def chinese_to_bopomofo(text, taiwanese=False):\n", - " text = text.replace('、', ',').replace(';', ',').replace(':', ',')\n", - " for word in list(text):\n", - " bopomofos = lazy_pinyin(word, BOPOMOFO)\n", - " if not re.search('[\\u4e00-\\u9fff]', word):\n", - " text += word\n", - " continue\n", - " for i in range(len(bopomofos)):\n", - " bopomofos[i] = re.sub(r'([\\u3105-\\u3129])$', r'\\1ˉ', bopomofos[i])\n", - " if text != '':\n", - " text += ' '\n", - " if taiwanese:\n", - " text += '#'+'#'.join(bopomofos)\n", - " else:\n", - " text += ''.join(bopomofos)\n", - " return text\n", - "\n", - "_latin_to_bopomofo = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [\n", - " ('a', 'ㄟˉ'),\n", - " ('b', 'ㄅㄧˋ'),\n", - " ('c', 'ㄙㄧˉ'),\n", - " ('d', 'ㄉㄧˋ'),\n", - " ('e', 'ㄧˋ'),\n", - " ('f', 'ㄝˊㄈㄨˋ'),\n", - " ('g', 'ㄐㄧˋ'),\n", - " ('h', 'ㄝˇㄑㄩˋ'),\n", - " ('i', 'ㄞˋ'),\n", - " ('j', 'ㄐㄟˋ'),\n", - " ('k', 'ㄎㄟˋ'),\n", - " ('l', 'ㄝˊㄛˋ'),\n", - " ('m', 'ㄝˊㄇㄨˋ'),\n", - " ('n', 'ㄣˉ'),\n", - " ('o', 'ㄡˉ'),\n", - " ('p', 'ㄆㄧˉ'),\n", - " ('q', 'ㄎㄧㄡˉ'),\n", - " ('r', 'ㄚˋ'),\n", - " ('s', 'ㄝˊㄙˋ'),\n", - " ('t', 'ㄊㄧˋ'),\n", - " ('u', 'ㄧㄡˉ'),\n", - " ('v', 'ㄨㄧˉ'),\n", - " ('w', 'ㄉㄚˋㄅㄨˋㄌㄧㄡˋ'),\n", - " ('x', 'ㄝˉㄎㄨˋㄙˋ'),\n", - " ('y', 'ㄨㄞˋ'),\n", - " ('z', 'ㄗㄟˋ')\n", - "]]\n", - "\n", - "def latin_to_bopomofo(text):\n", - " for regex, replacement in _latin_to_bopomofo:\n", - " text = re.sub(regex, replacement, text)\n", - " return text\n", - "\n", - "#TODO: add cleaner to support multilang\n", - "def chinese_cleaners(text, cleaner_names):\n", - " '''Pipeline for Chinese text'''\n", - " # text = number_to_chinese(text)\n", - " text = chinese_to_bopomofo(text)\n", - " text = latin_to_bopomofo(text)\n", - " if re.match('[ˉˊˇˋ˙]', text[-1]):\n", - " text += '。'\n", - " return text\n", - "\n", - "\n", - "def text_to_sequence(text, cleaner_names):\n", - " '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.\n", - " Args:\n", - " text: string to convert to a sequence\n", - " cleaner_names: names of the cleaner functions to run the text through\n", - " Returns:\n", - " List of integers corresponding to the symbols in the text\n", - " '''\n", - " sequence = []\n", - "\n", - " clean_text = chinese_cleaners(text, cleaner_names)\n", - " for symbol in clean_text:\n", - " if symbol not in _symbol_to_id.keys():\n", - " continue\n", - " symbol_id = _symbol_to_id[symbol]\n", - " sequence += [symbol_id]\n", - " return sequence\n", + "from pypinyin import lazy_pinyin, Style\n", "\n", "import os\n", "\n", "def tts(txt, emotion, sid=0):\n", + " txt = \" \".join(lazy_pinyin(txt, style=Style.TONE3, 
     "    text_norm = text_to_sequence(txt, hps[\"data\"][\"text_cleaners\"])\n",
     "    if hps[\"data\"][\"add_blank\"]:\n",
     "        text_norm = intersperse(text_norm, 0)\n",
@@ -173,37 +76,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "\n",
-       " \n",
-       " "
-      ],
-      "text/plain": [
-       ""
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
+   "outputs": [],
    "source": [
     "txt = \"随机抽取的音频文件路径可以用于使用该情感合成其他句子\"\n",
-    "tts(txt, emotion='C:\\\\Users\\\\babys\\\\Desktop\\\\voicecollection\\\\secondround\\\\美玉.wav', sid=0)"
+    "tts(txt, emotion='C:\\\\Users\\\\babys\\\\Desktop\\\\voicecollection\\\\secondround\\\\美玉.wav', sid=2)"
    ]
  },
  {
@@ -216,43 +94,31 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Using data from:\n",
-      " ..\\audiodata\\magicdata\\train\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "magicdata: 0%| | 0/1018 [00:00here for more info. View Jupyter log for further details."
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "from models.synthesizer.preprocess import preprocess_dataset\n",
     "from pathlib import Path\n",
     "from utils.hparams import HParams\n",
     "datasets_root = Path(\"../audiodata/\")\n",
-    "hparams=HParams(\n",
+    "hparams = HParams(\n",
+    "    n_fft = 1024, # filter_length\n",
+    "    num_mels = 80,\n",
+    "    hop_size = 256, # 16 ms frame shift at 16 kHz (matches the VITS hop_length)\n",
+    "    win_size = 1024, # 64 ms frame length at 16 kHz (matches the VITS win_length)\n",
+    "    fmin = 55,\n",
+    "    min_level_db = -100,\n",
+    "    ref_level_db = 20,\n",
+    "    max_abs_value = 4., # Gradient explodes if too big, premature convergence if too small.\n",
     "    sample_rate = 16000,\n",
     "    rescale = True,\n",
     "    max_mel_frames = 900,\n",
-    "    rescaling_max = 0.9,\n",
+    "    rescaling_max = 0.9, \n",
+    "    preemphasis = 0.97, # Filter coefficient to use if preemphasize is True\n",
+    "    preemphasize = True,\n",
+    "    ### Mel Visualization and Griffin-Lim\n",
+    "    signal_normalization = True,\n",
     "\n",
     "    utterance_min_duration = 1.6, # Duration in seconds below which utterances are discarded\n",
     "    ### Audio processing options\n",
@@ -285,40 +151,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\\Loading the json with %s\n",
-      " data\\ckpt\\synthesizer\\vits\\config.json\n"
-     ]
-    },
-    {
-     "ename": "ProcessRaisedException",
-     "evalue": "\n\n-- Process 0 terminated with the following error:\nTraceback (most recent call last):\n File \"d:\\Users\\babys\\Anaconda3\\envs\\mo\\lib\\site-packages\\torch\\multiprocessing\\spawn.py\", line 59, in _wrap\n fn(i, *args)\n File \"d:\\Real-Time-Voice-Cloning-Chinese\\models\\synthesizer\\train_vits.py\", line 123, in run\n net_g = Vits(\nTypeError: __init__() missing 1 required positional argument: 'stop_threshold'\n",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[1;31mProcessRaisedException\u001b[0m Traceback (most recent call last)",
"\u001b[1;32md:\\Real-Time-Voice-Cloning-Chinese\\vits.ipynb Cell 7\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 18\u001b[0m os\u001b[39m.\u001b[39menviron[\u001b[39m'\u001b[39m\u001b[39mMASTER_ADDR\u001b[39m\u001b[39m'\u001b[39m] \u001b[39m=\u001b[39m \u001b[39m'\u001b[39m\u001b[39mlocalhost\u001b[39m\u001b[39m'\u001b[39m\n\u001b[0;32m 19\u001b[0m os\u001b[39m.\u001b[39menviron[\u001b[39m'\u001b[39m\u001b[39mMASTER_PORT\u001b[39m\u001b[39m'\u001b[39m] \u001b[39m=\u001b[39m \u001b[39m'\u001b[39m\u001b[39m8899\u001b[39m\u001b[39m'\u001b[39m\n\u001b[1;32m---> 20\u001b[0m mp\u001b[39m.\u001b[39;49mspawn(run, nprocs\u001b[39m=\u001b[39;49mn_gpus, args\u001b[39m=\u001b[39;49m(n_gpus, hparams))\n", - "File \u001b[1;32md:\\Users\\babys\\Anaconda3\\envs\\mo\\lib\\site-packages\\torch\\multiprocessing\\spawn.py:230\u001b[0m, in \u001b[0;36mspawn\u001b[1;34m(fn, args, nprocs, join, daemon, start_method)\u001b[0m\n\u001b[0;32m 226\u001b[0m msg \u001b[39m=\u001b[39m (\u001b[39m'\u001b[39m\u001b[39mThis method only supports start_method=spawn (got: \u001b[39m\u001b[39m%s\u001b[39;00m\u001b[39m).\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39m'\u001b[39m\n\u001b[0;32m 227\u001b[0m \u001b[39m'\u001b[39m\u001b[39mTo use a different start_method use:\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39m\\t\u001b[39;00m\u001b[39m\\t\u001b[39;00m\u001b[39m'\u001b[39m\n\u001b[0;32m 228\u001b[0m \u001b[39m'\u001b[39m\u001b[39m torch.multiprocessing.start_processes(...)\u001b[39m\u001b[39m'\u001b[39m \u001b[39m%\u001b[39m start_method)\n\u001b[0;32m 229\u001b[0m warnings\u001b[39m.\u001b[39mwarn(msg)\n\u001b[1;32m--> 230\u001b[0m \u001b[39mreturn\u001b[39;00m start_processes(fn, args, nprocs, join, daemon, start_method\u001b[39m=\u001b[39;49m\u001b[39m'\u001b[39;49m\u001b[39mspawn\u001b[39;49m\u001b[39m'\u001b[39;49m)\n", - "File \u001b[1;32md:\\Users\\babys\\Anaconda3\\envs\\mo\\lib\\site-packages\\torch\\multiprocessing\\spawn.py:188\u001b[0m, in \u001b[0;36mstart_processes\u001b[1;34m(fn, args, nprocs, join, daemon, start_method)\u001b[0m\n\u001b[0;32m 185\u001b[0m \u001b[39mreturn\u001b[39;00m context\n\u001b[0;32m 187\u001b[0m \u001b[39m# Loop on join until it returns True or raises an exception.\u001b[39;00m\n\u001b[1;32m--> 188\u001b[0m \u001b[39mwhile\u001b[39;00m \u001b[39mnot\u001b[39;00m context\u001b[39m.\u001b[39;49mjoin():\n\u001b[0;32m 189\u001b[0m \u001b[39mpass\u001b[39;00m\n", - "File \u001b[1;32md:\\Users\\babys\\Anaconda3\\envs\\mo\\lib\\site-packages\\torch\\multiprocessing\\spawn.py:150\u001b[0m, in \u001b[0;36mProcessContext.join\u001b[1;34m(self, timeout)\u001b[0m\n\u001b[0;32m 148\u001b[0m msg \u001b[39m=\u001b[39m \u001b[39m\"\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39m\\n\u001b[39;00m\u001b[39m-- Process \u001b[39m\u001b[39m%d\u001b[39;00m\u001b[39m terminated with the following error:\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39m\"\u001b[39m \u001b[39m%\u001b[39m error_index\n\u001b[0;32m 149\u001b[0m msg \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m original_trace\n\u001b[1;32m--> 150\u001b[0m \u001b[39mraise\u001b[39;00m ProcessRaisedException(msg, error_index, failed_process\u001b[39m.\u001b[39mpid)\n", - "\u001b[1;31mProcessRaisedException\u001b[0m: \n\n-- Process 0 terminated with the following error:\nTraceback (most recent call last):\n File \"d:\\Users\\babys\\Anaconda3\\envs\\mo\\lib\\site-packages\\torch\\multiprocessing\\spawn.py\", line 59, in _wrap\n fn(i, *args)\n File \"d:\\Real-Time-Voice-Cloning-Chinese\\models\\synthesizer\\train_vits.py\", line 123, in run\n 
-      "\u001b[1;31mProcessRaisedException\u001b[0m: \n\n-- Process 0 terminated with the following error:\nTraceback (most recent call last):\n File \"d:\\Users\\babys\\Anaconda3\\envs\\mo\\lib\\site-packages\\torch\\multiprocessing\\spawn.py\", line 59, in _wrap\n fn(i, *args)\n File \"d:\\Real-Time-Voice-Cloning-Chinese\\models\\synthesizer\\train_vits.py\", line 123, in run\n net_g = Vits(\nTypeError: __init__() missing 1 required positional argument: 'stop_threshold'\n"
-     ]
-    },
-    {
-     "ename": "",
-     "evalue": "",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[1;31mThe Kernel crashed while executing code in the the current cell or a previous cell. Please review the code in the cell(s) to identify a possible cause of the failure. Click here for more info. View Jupyter log for further details."
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "from models.synthesizer.train_vits import run\n",
     "from pathlib import Path\n",
@@ -352,7 +187,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -380,7 +215,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [