add pretrained

pull/892/head
babysor00 2023-02-18 09:31:05 +08:00
parent 3ce874ab46
commit 5c17fc8bb0
16 changed files with 21908 additions and 123 deletions

8
.gitignore vendored

@@ -14,11 +14,13 @@
 *.bcf
 *.toc
 *.sh
-data/ckpt
+data/ckpt/*/*
+!data/ckpt/vocoder/pretrained/**
 !data/ckpt/encoder/pretrained.pt
-!data/ckpt/vocoder/pretrained/
 wavs
 log
 !/docker-entrypoint.sh
 !/datasets_download/*.sh
 /datasets
+monotonic_align/build
+monotonic_align/monotonic_align

2
.vscode/launch.json vendored

@@ -53,7 +53,7 @@
 "request": "launch",
 "program": "train.py",
 "console": "integratedTerminal",
-"args": ["--type", "synth", "..\\audiodata\\SV2TTS\\synthesizer"]
+"args": ["--type", "vits"]
 },
 {
 "name": "Python: PPG Convert",

Binary file not shown.


@@ -0,0 +1,31 @@
{
    "resblock": "1",
    "num_gpus": 0,
    "batch_size": 16,
    "learning_rate": 0.0002,
    "adam_b1": 0.8,
    "adam_b2": 0.99,
    "lr_decay": 0.999,
    "seed": 1234,
    "upsample_rates": [5,5,4,2],
    "upsample_kernel_sizes": [10,10,8,4],
    "upsample_initial_channel": 512,
    "resblock_kernel_sizes": [3,7,11],
    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
    "segment_size": 6400,
    "num_mels": 80,
    "num_freq": 1025,
    "n_fft": 1024,
    "hop_size": 200,
    "win_size": 800,
    "sampling_rate": 16000,
    "fmin": 0,
    "fmax": 7600,
    "fmax_for_loss": null,
    "num_workers": 4
}
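This is a HiFi-GAN-style vocoder configuration: the upsample rates 5 x 5 x 4 x 2 multiply out to 200, matching hop_size at a 16 kHz sampling rate. A minimal sketch of loading such a JSON config into an attribute-style namespace (the repo's own loader may differ, and the path below is illustrative):

```python
import json
from types import SimpleNamespace

def load_config(path):
    # Parse the JSON and expose keys as attributes (cfg.hop_size, cfg.sampling_rate, ...)
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f, object_hook=lambda d: SimpleNamespace(**d))

cfg = load_config("data/ckpt/vocoder/pretrained/config.json")  # hypothetical location
print(cfg.sampling_rate, cfg.hop_size)     # 16000 200
print(cfg.segment_size // cfg.hop_size)    # 32 mel frames per training segment
```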

Binary file not shown.

Binary file not shown.


@@ -6,7 +6,7 @@ from pathlib import Path
 from tqdm import tqdm
 import numpy as np
 from models.encoder import inference as encoder
-from models.synthesizer.preprocess_audio import preprocess_general
+from models.synthesizer.preprocess_audio import preprocess_general, extract_emo
 from models.synthesizer.preprocess_transcript import preprocess_transcript_aishell3, preprocess_transcript_magicdata
 
 data_info = {
@@ -41,7 +41,7 @@ data_info = {
 def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
                        skip_existing: bool, hparams, no_alignments: bool,
-                       dataset: str, emotion_extract = False):
+                       dataset: str, emotion_extract = False, encoder_model_fpath=None):
     dataset_info = data_info[dataset]
     # Gather the input directories
     dataset_root = datasets_root.joinpath(dataset)
@@ -77,7 +77,7 @@ def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
     speaker_dirs = list(chain.from_iterable(input_dir.glob("*") for input_dir in input_dirs))
     func = partial(dataset_info["speak_func"], out_dir=out_dir, skip_existing=skip_existing,
-                   hparams=hparams, dict_info=dict_info, no_alignments=no_alignments, emotion_extract=emotion_extract)
+                   hparams=hparams, dict_info=dict_info, no_alignments=no_alignments, encoder_model_fpath=encoder_model_fpath)
     job = Pool(n_processes).imap(func, speaker_dirs)
     for speaker_metadata in tqdm(job, dataset, len(speaker_dirs), unit="speakers"):
@@ -110,6 +110,13 @@ def embed_utterance(fpaths, encoder_model_fpath):
     embed = encoder.embed_utterance(wav)
     np.save(embed_fpath, embed, allow_pickle=False)
 
+def _emo_extract_from_utterance(fpaths, hparams, skip_existing=False):
+    wav_fpath, emo_fpath = fpaths
+    if skip_existing and emo_fpath.exists():
+        return
+    wav = np.load(wav_fpath)
+    emo = extract_emo(np.expand_dims(wav, 0), hparams.sample_rate, True)
+    np.save(emo_fpath, emo.squeeze(0), allow_pickle=False)
 
 def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_processes: int):
     wav_dir = synthesizer_root.joinpath("audio")
@@ -128,3 +135,21 @@ def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_processes: int):
     func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)
     job = Pool(n_processes).imap(func, fpaths)
     list(tqdm(job, "Embedding", len(fpaths), unit="utterances"))
+
+def create_emo(synthesizer_root: Path, n_processes: int, skip_existing: bool, hparams):
+    wav_dir = synthesizer_root.joinpath("audio")
+    metadata_fpath = synthesizer_root.joinpath("train.txt")
+    assert wav_dir.exists() and metadata_fpath.exists()
+    emo_dir = synthesizer_root.joinpath("emo")
+    emo_dir.mkdir(exist_ok=True)
+
+    # Gather the input wav filepaths and the target output emotion-embedding filepaths
+    with metadata_fpath.open("r", encoding="utf-8") as metadata_file:
+        metadata = [line.split("|") for line in metadata_file]
+        fpaths = [(wav_dir.joinpath(m[0]), emo_dir.joinpath(m[0].replace("audio-", "emo-"))) for m in metadata]
+
+    # TODO: improve on the multiprocessing, it's terrible. Disk I/O is the bottleneck here.
+    # Extract the emotion embeddings in separate processes
+    func = partial(_emo_extract_from_utterance, hparams=hparams, skip_existing=skip_existing)
+    job = Pool(n_processes).imap(func, fpaths)
+    list(tqdm(job, "Emo", len(fpaths), unit="utterances"))
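As a usage sketch (the directory paths are assumptions for illustration), the emotion pass now runs as a separate step over the same SV2TTS synthesizer folder, after preprocessing and speaker-embedding extraction:

```python
from pathlib import Path
from models.synthesizer.preprocess import preprocess_dataset, create_embeddings, create_emo
from models.synthesizer.hparams import hparams

datasets_root = Path("../audiodata")                   # hypothetical dataset root
out_dir = Path("../audiodata/SV2TTS/synthesizer")      # hypothetical output root
encoder_fpath = Path("data/ckpt/encoder/pretrained.pt")

preprocess_dataset(datasets_root=datasets_root, out_dir=out_dir, n_processes=8,
                   skip_existing=True, hparams=hparams, no_alignments=False,
                   dataset="aidatatang_200zh", encoder_model_fpath=encoder_fpath)
create_embeddings(synthesizer_root=out_dir, n_processes=4, encoder_model_fpath=encoder_fpath)
# Emotion embeddings are written to out_dir/emo as emo-<utterance>.npy, one per train.txt row
create_emo(synthesizer_root=out_dir, n_processes=4, skip_existing=True, hparams=hparams)
```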


@@ -13,7 +13,11 @@ import torch
 from transformers import Wav2Vec2Processor
 from .models.wav2emo import EmotionExtractorModel
 
-SAMPLE_RATE = 16000
+class PinyinConverter(NeutralToneWith5Mixin, DefaultConverter):
+    pass
+
+pinyin = Pinyin(PinyinConverter()).pinyin
 
 # load model from hub
 device = 'cuda' if torch.cuda.is_available() else "cpu"
@@ -40,14 +44,8 @@ def extract_emo(
     return y
 
-class PinyinConverter(NeutralToneWith5Mixin, DefaultConverter):
-    pass
-
-pinyin = Pinyin(PinyinConverter()).pinyin
-
 def _process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
-                       skip_existing: bool, hparams, emotion_extract: bool):
+                       skip_existing: bool, hparams, encoder_model_fpath):
     ## FOR REFERENCE:
     # For you not to lose your head if you ever wish to change things here or implement your own
     # synthesizer.
@@ -69,6 +67,8 @@ def _process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
 
     # Trim silence
     if hparams.trim_silence:
+        if not encoder.is_loaded():
+            encoder.load_model(encoder_model_fpath)
         wav = encoder.preprocess_wav(wav, normalize=False, trim_silence=True)
 
     # Skip utterances that are too short
@@ -109,7 +109,7 @@ def _split_on_silences(wav_fpath, words, hparams):
     return wav, res
 
-def preprocess_general(speaker_dir, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool, emotion_extract: bool):
+def preprocess_general(speaker_dir, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool, encoder_model_fpath: Path):
     metadata = []
     extensions = ["*.wav", "*.flac", "*.mp3"]
     for extension in extensions:
@@ -124,14 +124,9 @@ def preprocess_general(speaker_dir, out_dir: Path, skip_existing: bool, hparams,
             sub_basename = "%s_%02d" % (wav_fpath.name, 0)
             wav, text = _split_on_silences(wav_fpath, words, hparams)
             result = _process_utterance(wav, text, out_dir, sub_basename,
-                                        skip_existing, hparams, emotion_extract)
+                                        skip_existing, hparams, encoder_model_fpath)
             if result is None:
                 continue
             wav_fpath_name, mel_fpath_name, embed_fpath_name, wav, mel_frames, text = result
-            emo_fpath = out_dir.joinpath("emo", "emo-%s.npy" % sub_basename)
-            skip_emo_extract = not emotion_extract or (skip_existing and emo_fpath.exists())
-            if not skip_emo_extract and wav is not None:
-                emo = extract_emo(np.expand_dims(wav, 0), hparams.sample_rate, True)
-                np.save(emo_fpath, emo.squeeze(0), allow_pickle=False)
             metadata.append([wav_fpath_name, mel_fpath_name, embed_fpath_name, len(wav), mel_frames, text])
     return [m for m in metadata if m is not None]
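For context, the module-level converter moved to the top of this file follows pypinyin's documented NeutralToneWith5Mixin pattern, which gives neutral-tone syllables an explicit "5" so every syllable carries a tone digit. A small sketch of what it produces (the expected output is shown as a comment):

```python
from pypinyin import Style
from pypinyin.contrib.neutral_tone import NeutralToneWith5Mixin
from pypinyin.converter import DefaultConverter
from pypinyin.core import Pinyin

class PinyinConverter(NeutralToneWith5Mixin, DefaultConverter):
    pass

pinyin = Pinyin(PinyinConverter()).pinyin
print(pinyin("你好吗", style=Style.TONE3))  # [['ni3'], ['hao3'], ['ma5']] - the neutral-tone "吗" gets an explicit 5
```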


@@ -1,10 +1,11 @@
 import os
 import random
 import numpy as np
+import torch.nn.functional as F
 import torch
 import torch.utils.data
 
-from utils.audio_utils import spectrogram1, load_wav_to_torch, spectrogram
+from utils.audio_utils import load_wav_to_torch, spectrogram
 from utils.util import intersperse
 from models.synthesizer.utils.text import text_to_sequence
@@ -51,21 +52,10 @@ class VitsDataset(torch.utils.data.Dataset):
         lengths = []
         # for audiopath, sid, text in self.audio_metadata:
-        sid = 0
-        spk_to_sid = {}
-        for wav_fpath, mel_fpath, embed_path, wav_length, mel_frames, text in self.audio_metadata:
+        for wav_fpath, mel_fpath, embed_path, wav_length, mel_frames, text, spkid in self.audio_metadata:
             if self.min_text_len <= len(text) and len(text) <= self.max_text_len:
-                # TODO: for magic data only
-                speaker_name = wav_fpath.split("_")[1]
-                # # TODO: for ai data only
-                # speaker_name = wav_fpath.split("-")[1][6:9]
-                if speaker_name not in spk_to_sid:
-                    sid += 1
-                    spk_to_sid[speaker_name] = sid
-                audio_metadata_new.append([wav_fpath, mel_fpath, embed_path, wav_length, mel_frames, text, spk_to_sid[speaker_name]])
+                audio_metadata_new.append([wav_fpath, mel_fpath, embed_path, wav_length, mel_frames, text, spkid])
                 lengths.append(os.path.getsize(f'{self.datasets_root}{os.sep}audio{os.sep}{wav_fpath}') // (2 * self.hop_length))
-        print("found sid:%d", sid)
 
         self.audio_metadata = audio_metadata_new
         self.lengths = lengths
@@ -74,50 +64,31 @@ class VitsDataset(torch.utils.data.Dataset):
         wav_fpath, text, sid = audio_metadata[0], audio_metadata[5], audio_metadata[6]
         text = self.get_text(text)
-        # TODO: add original audio data root for loading
-        file_name = wav_fpath.split("_00")[0].split('-')[1]
-        spec, wav = self.get_audio(f'{self.datasets_root}{os.sep}..{os.sep}..{os.sep}magicdata{os.sep}train{os.sep}{"_".join(file_name.split("_")[:2])}{os.sep}{file_name}')
-        # spec, wav = self.get_audio(f'{self.datasets_root}{os.sep}audio{os.sep}{wav_fpath}')
+        spec, wav = self.get_audio(f'{self.datasets_root}{os.sep}audio{os.sep}{wav_fpath}')
         sid = self.get_sid(sid)
         emo = torch.FloatTensor(np.load(f'{self.datasets_root}{os.sep}emo{os.sep}{wav_fpath.replace("audio", "emo")}'))
         return (text, spec, wav, sid, emo)
 
     def get_audio(self, filename):
-        audio, sampling_rate = load_wav_to_torch(filename)
-        if sampling_rate != self.sampling_rate:
-            raise ValueError("{} {} SR doesn't match target {} SR".format(
-                sampling_rate, self.sampling_rate))
-        audio_norm = audio / self.max_wav_value
-        audio_norm = audio_norm.unsqueeze(0)
-        spec = spectrogram(audio_norm, self.filter_length, self.hop_length, self.win_length,
-                           center=False)
+        # Load the preprocessed wav npy instead of reading from the wav file
+        audio = torch.FloatTensor(np.load(filename))
+        audio_norm = audio.unsqueeze(0)
+        spec_filename = filename.replace(".wav", ".spec")
+        if os.path.exists(spec_filename):
+            spec = torch.load(spec_filename)
+        else:
+            spec = spectrogram(audio_norm, self.filter_length, self.hop_length, self.win_length,
+                               center=False)
+            torch.save(spec, spec_filename)
         spec = torch.squeeze(spec, 0)
         return spec, audio_norm
-        # print("Loading", filename)
-        # # audio = torch.FloatTensor(np.load(filename).astype(np.float32))
-        # audio = audio.unsqueeze(0)
-        # audio_norm = audio / self.max_wav_value
-        # audio_norm = audio_norm.unsqueeze(0)
-        # # spec_filename = filename.replace(".wav", ".spec.pt")
-        # # if os.path.exists(spec_filename):
-        # #     spec = torch.load(spec_filename)
-        # # else:
-        # #     spec = spectrogram(audio, self.filter_length, self.hop_length, self.win_length,
-        # #         center=False)
-        # #     spec = torch.squeeze(spec, 0)
-        # #     torch.save(spec, spec_filename)
-        # spec = spectrogram(audio, self.filter_length, self.hop_length, self.win_length,
-        #     center=False)
-        # spec = torch.squeeze(spec, 0)
-        # return spec, audio
 
     def get_text(self, text):
         if self.cleaned_text:
             text_norm = text_to_sequence(text, self.text_cleaners)
             if self.add_blank:
-                text_norm = intersperse(text_norm, 0)
+                text_norm = intersperse(text_norm, 0)  # pad a 0 before and after every element of the text sequence - not suitable for Chinese
             text_norm = torch.LongTensor(text_norm)
         return text_norm
@@ -188,7 +159,7 @@ class VitsDatasetCollate():
             emo[i, :] = row[4]
 
         if self.return_ids:
-            return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, sid, ids_sorted_decreasing
+            return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, sid, ids_sorted_decreasing, emo
         return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, sid, emo
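For orientation, here is a hypothetical train.txt row in the seven-field format the reworked dataset now unpacks; the trailing speaker id is appended offline (for example by the renumbering cell in vits.ipynb below), and the numeric values here are purely illustrative:

```python
# wav | mel | embed | wav length | mel frames | text | speaker id
line = "audio-T0055G1858S0342.wav_00.npy|mel-T0055G1858S0342.wav_00.npy|embed-T0055G1858S0342.wav_00.npy|54400|272|ni3 hao3 ma5|17\n"
wav_fpath, mel_fpath, embed_path, wav_length, mel_frames, text, spkid = line.strip().split("|")
print(spkid)  # '17'
```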


@@ -0,0 +1,19 @@
import numpy as np
import torch
from .monotonic_align.core import maximum_path_c


def maximum_path(neg_cent, mask):
    """Cython-optimized version.
    neg_cent: [b, t_t, t_s]
    mask: [b, t_t, t_s]
    """
    device = neg_cent.device
    dtype = neg_cent.dtype
    neg_cent = neg_cent.data.cpu().numpy().astype(np.float32)
    path = np.zeros(neg_cent.shape, dtype=np.int32)

    t_t_max = mask.sum(1)[:, 0].data.cpu().numpy().astype(np.int32)
    t_s_max = mask.sum(2)[:, 0].data.cpu().numpy().astype(np.int32)
    maximum_path_c(path, neg_cent, t_t_max, t_s_max)
    return torch.from_numpy(path).to(device=device, dtype=dtype)
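A minimal sketch of how this wrapper is typically driven in VITS-style training; the tensors below are random stand-ins to show shapes only, and the import assumes the package is importable as monotonic_align once the Cython extension has been built (see setup.py below):

```python
import torch
from monotonic_align import maximum_path  # assumes the compiled extension is available

b, t_t, t_s = 2, 37, 11                   # batch, spectrogram frames, text tokens
neg_cent = torch.randn(b, t_t, t_s)       # per-(frame, token) log-likelihood / negative cost
mask = torch.ones(b, t_t, t_s)            # 1 inside the valid frame/token lengths

path = maximum_path(neg_cent, mask)       # hard monotonic alignment, 0/1 entries, shape [b, t_t, t_s]
print(path.shape, path.sum(dim=(1, 2)))   # each frame is assigned exactly one token, so the sums equal t_t
```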

21446
monotonic_align/core.c Normal file

File diff suppressed because it is too large

42
monotonic_align/core.pyx Normal file

@@ -0,0 +1,42 @@
cimport cython
from cython.parallel import prange


@cython.boundscheck(False)
@cython.wraparound(False)
cdef void maximum_path_each(int[:,::1] path, float[:,::1] value, int t_y, int t_x, float max_neg_val=-1e9) nogil:
    cdef int x
    cdef int y
    cdef float v_prev
    cdef float v_cur
    cdef float tmp
    cdef int index = t_x - 1

    for y in range(t_y):
        for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)):
            if x == y:
                v_cur = max_neg_val
            else:
                v_cur = value[y-1, x]
            if x == 0:
                if y == 0:
                    v_prev = 0.
                else:
                    v_prev = max_neg_val
            else:
                v_prev = value[y-1, x-1]
            value[y, x] += max(v_prev, v_cur)

    for y in range(t_y - 1, -1, -1):
        path[y, index] = 1
        if index != 0 and (index == y or value[y-1, index] < value[y-1, index-1]):
            index = index - 1


@cython.boundscheck(False)
@cython.wraparound(False)
cpdef void maximum_path_c(int[:,:,::1] paths, float[:,:,::1] values, int[::1] t_ys, int[::1] t_xs) nogil:
    cdef int b = paths.shape[0]
    cdef int i
    for i in prange(b, nogil=True):
        maximum_path_each(paths[i], values[i], t_ys[i], t_xs[i])
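For readers who do not want to parse the Cython, here is an equivalent (much slower) pure-NumPy sketch of the same dynamic program for a single [t_y, t_x] cost matrix, written from the code above for readability and not part of the commit:

```python
import numpy as np

def maximum_path_numpy(value, t_y, t_x, max_neg_val=-1e9):
    """Monotonic-alignment DP on one [t_y, t_x] log-likelihood matrix (value is modified in place)."""
    path = np.zeros((t_y, t_x), dtype=np.int32)
    for y in range(t_y):
        for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)):
            v_cur = max_neg_val if x == y else value[y - 1, x]
            v_prev = (0.0 if y == 0 else max_neg_val) if x == 0 else value[y - 1, x - 1]
            value[y, x] += max(v_prev, v_cur)
    index = t_x - 1
    for y in range(t_y - 1, -1, -1):          # backtrack: exactly one text index per frame
        path[y, index] = 1
        if index != 0 and (index == y or value[y - 1, index] < value[y - 1, index - 1]):
            index -= 1
    return path
```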

9
monotonic_align/setup.py Normal file

@@ -0,0 +1,9 @@
from distutils.core import setup
from Cython.Build import cythonize
import numpy

setup(
    name = 'monotonic_align',
    ext_modules = cythonize("core.pyx"),
    include_dirs=[numpy.get_include()]
)
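The extension has to be compiled before training; for a Cython setup.py like this, the usual invocation is `python setup.py build_ext --inplace` run from the monotonic_align directory, which is consistent with the new monotonic_align/build and monotonic_align/monotonic_align entries added to .gitignore above. The compiled core module is then importable by the maximum_path wrapper.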

6
pre.py

@@ -1,4 +1,4 @@
-from models.synthesizer.preprocess import create_embeddings, preprocess_dataset
+from models.synthesizer.preprocess import create_embeddings, preprocess_dataset, create_emo
 from models.synthesizer.hparams import hparams
 from pathlib import Path
 import argparse
@@ -64,7 +64,7 @@ if __name__ == "__main__":
               "noise removal and is recommended. Please install and try again. If installation fails, "
               "use --no_trim to disable this error message.")
     encoder_model_fpath = args.encoder_model_fpath
-    del args.no_trim, args.encoder_model_fpath
+    del args.no_trim
 
     args.hparams = hparams.parse(args.hparams)
     n_processes_embed = args.n_processes_embed
@@ -73,3 +73,5 @@ if __name__ == "__main__":
     create_embeddings(synthesizer_root=args.out_dir, n_processes=n_processes_embed, encoder_model_fpath=encoder_model_fpath)
+
+    if args.emotion_extract:
+        create_emo(synthesizer_root=args.out_dir, n_processes=n_processes_embed, skip_existing=args.skip_existing, hparams=args.hparams)


@@ -17,28 +17,6 @@ def load_wav_to_torch(full_path):
     sampling_rate, data = read(full_path)
     return torch.FloatTensor(data.astype(np.float32)), sampling_rate
 
-def spectrogram1(y, n_fft, sampling_rate, hop_size, win_size, center=False):
-    if torch.min(y) < -1.:
-        print('min value is ', torch.min(y))
-    if torch.max(y) > 1.:
-        print('max value is ', torch.max(y))
-
-    global hann_window
-    dtype_device = str(y.dtype) + '_' + str(y.device)
-    wnsize_dtype_device = str(win_size) + '_' + dtype_device
-    if wnsize_dtype_device not in hann_window:
-        hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)
-
-    y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
-    y = y.squeeze(1)
-
-    spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
-                      center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=True)
-
-    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
-    return spec
-
 def spectrogram(y, n_fft, hop_size, win_size, center=False):
     if torch.min(y) < -1.:
         print('min value is ', torch.min(y))
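The surviving `spectrogram` helper is what VitsDataset.get_audio and the visualization notebook below now call on an already-normalized, preprocessed waveform. A minimal sketch (the npy path is an assumed example of a preprocessed utterance):

```python
import numpy as np
import torch
from utils.audio_utils import spectrogram

# hypothetical preprocessed utterance produced by pre.py
audio = torch.from_numpy(
    np.load("../audiodata/SV2TTS/synthesizer/audio/audio-T0055G1858S0342.wav_00.npy").astype(np.float32))
audio_norm = audio.unsqueeze(0)                                 # [1, T], values already in [-1, 1]
spec = spectrogram(audio_norm, 1024, 256, 1024, center=False)   # n_fft, hop, win as used in the notebook below
print(spec.shape)                                               # [1, 513, num_frames]
```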

325
vits.ipynb vendored

@@ -17,7 +17,7 @@
 "from models.synthesizer.utils.text import text_to_sequence\n",
 "\n",
 "\n",
-"hps = load_hparams_json(\"data/ckpt/synthesizer/vits2/config.json\")\n",
+"hps = load_hparams_json(\"data/ckpt/synthesizer/vits5/config.json\")\n",
 "print(hps.train)\n",
 "model = Vits(\n",
 "    len(symbols),\n",
@@ -27,7 +27,7 @@
 "    **hps[\"model\"])\n",
 "_ = model.eval()\n",
 "device = torch.device(\"cpu\")\n",
-"checkpoint = torch.load(str(\"data/ckpt/synthesizer/vits2/G_120000.pth\"), map_location=device)\n",
+"checkpoint = torch.load(str(\"data/ckpt/synthesizer/vits5/G_56000.pth\"), map_location=device)\n",
 "if \"model_state\" in checkpoint:\n",
 "    state = checkpoint[\"model_state\"]\n",
 "else:\n",
@@ -35,17 +35,17 @@
 "model.load_state_dict(state, strict=False)\n",
 "\n",
 "# Root directory from which emotion reference audio is randomly sampled\n",
-"random_emotion_root = \"D:\\\\audiodata\\\\aidatatang_200zh\\\\corpus\\\\train\\\\G0017\"\n",
+"random_emotion_root = \"D:\\\\audiodata\\\\SV2TTS\\\\synthesizer\\\\emo\\\\\"\n",
 "import random, re\n",
 "from pypinyin import lazy_pinyin, Style\n",
 "\n",
 "import os\n",
 "\n",
 "def tts(txt, emotion, sid=0):\n",
-"    txt = \" \".join(lazy_pinyin(txt, style=Style.TONE3, neutral_tone_with_five=True))\n",
+"    txt = \" \".join(lazy_pinyin(txt, style=Style.TONE3, neutral_tone_with_five=False))\n",
 "    text_norm = text_to_sequence(txt, hps[\"data\"][\"text_cleaners\"])\n",
-"    if hps[\"data\"][\"add_blank\"]:\n",
-"        text_norm = intersperse(text_norm, 0)\n",
+"    # if hps[\"data\"][\"add_blank\"]:\n",
+"    #     text_norm = intersperse(text_norm, 0)\n",
 "    stn_tst = torch.LongTensor(text_norm)\n",
 "\n",
 "    with torch.no_grad(): #inference mode\n",
@@ -57,6 +57,13 @@
 "        import librosa\n",
 "        wav, sr = librosa.load(emotion, 16000)\n",
 "        emo = torch.FloatTensor(extract_emo(np.expand_dims(wav, 0), sr, embeddings=True))\n",
+"    elif emotion == \"random_sample\":\n",
+"        rand_emo = random.sample(os.listdir(random_emotion_root), 1)[0]\n",
+"        print(rand_emo)\n",
+"        emo = torch.FloatTensor(np.load(f\"{random_emotion_root}\\\\{rand_emo}\")).unsqueeze(0)\n",
+"    elif emotion.endswith(\"npy\"):\n",
+"        print(emotion)\n",
+"        emo = torch.FloatTensor(np.load(f\"{random_emotion_root}\\\\{emotion}\")).unsqueeze(0)\n",
 "    else:\n",
 "        print(\"emotion参数不正确\")\n",
 "\n",
@@ -80,8 +87,55 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"txt = \"随机抽取的音频文件路径可以用于使用该情感合成其他句子\"\n",
-"tts(txt, emotion='C:\\\\Users\\\\babys\\\\Desktop\\\\voicecollection\\\\secondround\\\\美玉.wav', sid=2)"
+"txt = \"我们将其拓展到文本驱动数字人形象领域\"\n",
+"# neutral:\n",
+"tts(txt, emotion='emo-T0055G4906S0052.wav_00.npy', sid=100)\n",
+"# fast: emo-T0055G2323S0179.wav_00.npy\n",
+"\n",
+"# sad:\n",
+"tts(txt, emotion='emo-15_4581_20170825202626.wav_00.npy', sid=100)\n",
+"\n",
+"# happy: T0055G2412S0498.wav\n",
+"tts(txt, emotion='emo-T0055G2412S0498.wav_00.npy', sid=100)\n",
+"\n",
+"# angry: T0055G1371S0363.wav T0055G1344S0160.wav\n",
+"tts(txt, emotion='emo-T0055G1344S0160.wav_00.npy', sid=100)\n",
+"\n",
+"# tired:\n",
+"tts(txt, emotion='emo-T0055G2294S0476.wav_00.npy', sid=100)\n",
+"\n",
+"# anxious:\n",
+"tts(txt, emotion='emo-T0055G1671S0170.wav_00.npy', sid=100)\n",
+"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"txt = \"我们将其拓展到文本驱动数字人形象领域\"\n",
"tts(txt, emotion='random_sample', sid=100)\n",
"tts(txt, emotion='random_sample', sid=100)\n",
"tts(txt, emotion='random_sample', sid=100)\n",
"tts(txt, emotion='random_sample', sid=100)\n",
"tts(txt, emotion='random_sample', sid=100)\n",
"tts(txt, emotion='random_sample', sid=100)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"txt = \"我们将其拓展到文本驱动数字人形象领域\"\n",
"types = [\"平淡\", \"激动\", \"疲惫\", \"兴奋\", \"沮丧\", \"开心\"]\n",
"for t in types:\n",
" print(t)\n",
" tts(txt, emotion=f'C:\\\\Users\\\\babys\\\\Music\\\\{t}.wav', sid=100)\n",
"# tts(txt, emotion='D:\\\\audiodata\\\\aidatatang_200zh\\\\corpus\\\\train\\\\G1858\\\\T0055G1858S0342.wav', sid=5)"
]
},
{
@@ -128,7 +182,7 @@
 "    use_lws = False, # \"Fast spectrogram phase recovery using local weighted sums\"\n",
 "    symmetric_mels = True, # Sets mel range to [-max_abs_value, max_abs_value] if True,\n",
 "    #                        and [0, max_abs_value] if False\n",
-"    trim_silence = True, # Use with sample_rate of 16000 for best results\n",
+"    trim_silence = False, # Use with sample_rate of 16000 for best results\n",
 "\n",
 ")\n",
 "preprocess_dataset(datasets_root=datasets_root, \n",
@@ -137,7 +191,7 @@
 "    skip_existing=True, \n",
 "    hparams=hparams, \n",
 "    no_alignments=False, \n",
-"    dataset=\"magicdata\", \n",
+"    dataset=\"aidatatang_200zh\", \n",
 "    emotion_extract=True)"
 ]
 },
@@ -213,6 +267,14 @@
 "metadata_file.close()"
 ]
 },
+{
+"attachments": {},
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"Take 10% of the training set as the evaluation set"
+]
+},
 {
 "cell_type": "code",
 "execution_count": null,
@@ -220,36 +282,239 @@
 "outputs": [],
 "source": [
 "from pathlib import Path\n",
-"import os\n",
-"import shutil\n",
-"emo_root = Path('../audiodata/SV2TTS/synthesizer').joinpath('emo')\n",
-"# raw_root = Path('../audiodata/aidatatang_200zh/corpus/train')\n",
-"# emo_file_list = emo_root.glob(\"**/*.npy\")\n",
-"# for emo_file in emo_file_list:\n",
-"#     if emo_file.name.endswith('wav__00.npy'):\n",
-"#         folder = emo_file.parent\n",
-"#         os.rename(emo_file, folder.joinpath(emo_file.name.replace(\"__00\", \"_00\")))\n",
-"#         # shutil.move(emo_file, emo_root.joinpath(emo_file.name))\n",
-"\n",
 "root = Path('../audiodata/SV2TTS/synthesizer')\n",
-"dict_info = []\n",
+"dict_info1 = []\n",
+"dict_info2 = []\n",
+"count = 1\n",
 "with open(root.joinpath(\"train.txt\"), \"r\", encoding=\"utf-8\") as dict_meta:\n",
 "    for raw in dict_meta:\n",
 "        if not raw:\n",
 "            continue\n",
-"        v = raw.split(\"|\")[0].replace(\"audio\",\"emo\")\n",
-"        emo_fpath = root.joinpath(\"emo\").joinpath(v)\n",
-"        if emo_fpath.exists():\n",
-"            dict_info.append(raw)\n",
-"        # else:\n",
-"        #     print(emo_fpath)\n",
+"        if count % 10 == 0:\n",
+"            dict_info2.append(raw)\n",
+"        else:\n",
+"            dict_info1.append(raw)\n",
+"        count += 1\n",
 "# Iterate over each wav\n",
-"meta2 = Path('../audiodata/SV2TTS/synthesizer/train2.txt')\n",
+"meta1 = Path('../audiodata/SV2TTS/synthesizer/train1.txt')\n",
+"metadata_file = meta1.open(\"w\", encoding=\"utf-8\")\n",
+"for new_info in dict_info1:\n",
+"    metadata_file.write(new_info)\n",
+"metadata_file.close()\n",
+"\n",
+"meta2 = Path('../audiodata/SV2TTS/synthesizer/eval.txt')\n",
 "metadata_file = meta2.open(\"w\", encoding=\"utf-8\")\n",
-"for new_info in dict_info:\n",
+"for new_info in dict_info2:\n",
 "    metadata_file.write(new_info)\n",
 "metadata_file.close()"
 ]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"evaluation"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"root = Path('../audiodata/SV2TTS/synthesizer')\n",
"spks = []\n",
"spk_id = {}\n",
"rows = []\n",
"with open(root.joinpath(\"eval.txt\"), \"r\", encoding=\"utf-8\") as dict_meta:\n",
" for raw in dict_meta:\n",
" speaker_name = raw.split(\"-\")[1][6:10]\n",
" if speaker_name not in spk_id:\n",
" spks.append(speaker_name)\n",
" spk_id[speaker_name] = 1\n",
" rows.append(raw)\n",
"i = 0\n",
"spks.sort()\n",
"\n",
"for sp in spks:\n",
" spk_id[sp] = str(i)\n",
" i = i + 1\n",
"print(len(spks))\n",
"meta2 = Path('../audiodata/SV2TTS/synthesizer/eval2.txt')\n",
"metadata_file = meta2.open(\"w\", encoding=\"utf-8\")\n",
"for row in rows:\n",
" speaker_n = row.split(\"-\")[1][6:10]\n",
" metadata_file.write(row.strip()+\"|\"+spk_id[speaker_n]+\"\\n\")\n",
"metadata_file.close()\n"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"[Not recommended]\n",
"Attempt to map the transcript to a more detailed, phoneme-level format:\n",
"ni3 hao3 -> n i3 <pad> h ao3\n",
"\n",
"After a couple of tests, I don't think this method improves the quality of the results, and it may cause monotonic alignment to crash."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"from pathlib import Path\n",
"datasets_root = Path(\"../audiodata/SV2TTS/synthesizer/\")\n",
"\n",
"dictionary_fp = Path(\"../audiodata/ProDiff/processed/mandarin_pinyin.dict\")\n",
"dict_map = {}\n",
"for l in open(dictionary_fp, encoding='utf-8').readlines():\n",
" item = l.split(\"\\t\")\n",
" dict_map[item[0]] = item[1].replace(\"\\n\",\"\")\n",
"\n",
"with datasets_root.joinpath('train2.txt').open(\"w+\", encoding='utf-8') as f:\n",
" for l in open(datasets_root.joinpath('train.txt'), encoding='utf-8').readlines():\n",
" items = l.strip().replace(\"\\n\",\"\").replace(\"\\t\",\" \").split(\"|\")\n",
" phs_str = \"\"\n",
" for word in items[5].split(\" \"):\n",
" if word in dict_map:\n",
" phs_str += dict_map[word] \n",
" else:\n",
" phs_str += word\n",
" phs_str += \" _ \"\n",
" items[5] = phs_str\n",
" # if not os.path.exists(mfa_input_root.joinpath('train.txt')):\n",
" # with open(mfa_input_root.joinpath(fileName + 'lab'), 'w+', encoding=\"utf-8\") as f:\n",
" f.write(\"|\".join(items) + \"\\n\")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Visualize the preprocessed data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"import librosa.display\n",
"import librosa, torch\n",
"import numpy as np\n",
"from utils.audio_utils import spectrogram, mel_spectrogram, load_wav_to_torch, spec_to_mel\n",
"\n",
"# x, sr = librosa.load(\"D:\\audiodata\\SV2TTS\\synthesizer\\audio\\audio-T0055G2333S0196.wav_00.npy\")\n",
"x = np.load(\"D:\\\\audiodata\\\\SV2TTS\\\\synthesizer\\\\audio\\\\audio-T0055G1858S0342.wav_00.npy\")\n",
"\n",
"plt.figure(figsize=(14, 5))\n",
"librosa.display.waveplot(x)\n",
"\n",
"X = librosa.stft(x)\n",
"Xdb = librosa.amplitude_to_db(abs(X))\n",
"plt.figure(figsize=(14, 5))\n",
"librosa.display.specshow(Xdb, x_axis='time', y_axis='hz')\n",
"\n",
"# spectrogram = np.load(\"D:\\\\audiodata\\\\SV2TTS\\\\synthesizer\\\\mels\\\\mel-T0055G1858S0342.wav_00.npy\")\n",
"audio = torch.from_numpy(x.astype(np.float32))\n",
"\n",
"# audio, sampling_rate = load_wav_to_torch(\"D:\\\\audiodata\\\\aidatatang_200zh\\\\corpus\\\\train\\\\G1858\\\\T0055G1858S0342.wav\")\n",
"# audio_norm = audio / 32768.0\n",
"audio_norm = audio.unsqueeze(0)\n",
"spec = spectrogram(audio_norm, 1024, 256, 1024, center=False)\n",
"# spec = spec_to_mel()\n",
"spec = torch.squeeze(spec, 0)\n",
"mel = spec_to_mel(spec, 1024, 80, 16000, 0, None)\n",
"\n",
"fig = plt.figure(figsize=(10, 8))\n",
"ax2 = fig.add_subplot(211)\n",
"im = ax2.imshow(mel, interpolation=\"none\")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Emotion clustering"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"# from sklearn import metrics\n",
"# from sklearn.mixture import GaussianMixture # 高斯混合模型\n",
"import os\n",
"import numpy as np\n",
"import librosa\n",
"import IPython.display as ipd\n",
"from random import sample\n",
"\n",
"embs = []\n",
"wavnames = []\n",
"emo_root_path = \"D:\\\\audiodata\\\\SV2TTS\\\\synthesizer\\\\emo\\\\\"\n",
"wav_root_path = \"D:\\\\audiodata\\\\aidatatang_200zh\\\\corpus\\\\train\\\\\"\n",
"for idx, emo_fpath in enumerate(sample(os.listdir(emo_root_path), 10000)):\n",
" if emo_fpath.endswith(\".npy\") and emo_fpath.startswith(\"emo-T\"):\n",
" embs.append(np.expand_dims(np.load(emo_root_path + emo_fpath), axis=0))\n",
" wav_fpath = wav_root_path + emo_fpath[9:14] + \"\\\\\" + emo_fpath.split(\"_00\")[0][4:]\n",
" wavnames.append(wav_fpath)\n",
"print(len(embs))\n",
"\n",
"\n",
"x = np.concatenate(embs, axis=0)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# number of clusters for the clustering algorithm\n",
"n_clusters = 20\n",
"from sklearn.cluster import *\n",
"# model = KMeans(n_clusters=n_clusters, random_state=10)\n",
"# model = DBSCAN(eps=0.002, min_samples=2)\n",
"# feel free to try different clustering algorithms\n",
"# model = Birch(n_clusters= n_clusters, threshold= 0.2)\n",
"# model = SpectralClustering(n_clusters=n_clusters)\n",
"model = AgglomerativeClustering(n_clusters= n_clusters)\n",
"import random\n",
"\n",
"y_predict = model.fit_predict(x)\n",
"\n",
"def disp(wavname):\n",
"    wav, sr = librosa.load(wavname, 16000)\n",
"    display(ipd.Audio(wav, rate=sr))\n",
"\n",
"classes=[[] for i in range(y_predict.max()+1)]\n",
"\n",
"for idx, wavname in enumerate(wavnames):\n",
"    classes[y_predict[idx]].append(wavname)\n",
"\n",
"for i in range(y_predict.max()+1):\n",
"    print(\"class:\", i, \"number of samples in this class:\", len(classes[i]))\n",
"    # preview only two audio clips per class\n",
"    for j in range(2):\n",
"        idx = random.randint(0, len(classes[i]) - 1)\n",
"        print(classes[i][idx])\n",
"        disp(classes[i][idx])"
]
}
],
"metadata": {