add pretrained
parent
3ce874ab46
commit
5c17fc8bb0
|
@ -14,11 +14,13 @@
|
||||||
*.bcf
|
*.bcf
|
||||||
*.toc
|
*.toc
|
||||||
*.sh
|
*.sh
|
||||||
data/ckpt
|
data/ckpt/*/*
|
||||||
!data/ckpt/vocoder/pretrained/**
|
|
||||||
!data/ckpt/encoder/pretrained.pt
|
!data/ckpt/encoder/pretrained.pt
|
||||||
|
!data/ckpt/vocoder/pretrained/
|
||||||
wavs
|
wavs
|
||||||
log
|
log
|
||||||
!/docker-entrypoint.sh
|
!/docker-entrypoint.sh
|
||||||
!/datasets_download/*.sh
|
!/datasets_download/*.sh
|
||||||
/datasets
|
/datasets
|
||||||
|
monotonic_align/build
|
||||||
|
monotonic_align/monotonic_align
|
|
@ -53,7 +53,7 @@
|
||||||
"request": "launch",
|
"request": "launch",
|
||||||
"program": "train.py",
|
"program": "train.py",
|
||||||
"console": "integratedTerminal",
|
"console": "integratedTerminal",
|
||||||
"args": ["--type", "synth", "..\\audiodata\\SV2TTS\\synthesizer"]
|
"args": ["--type", "vits"]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "Python: PPG Convert",
|
"name": "Python: PPG Convert",
|
||||||
|
|
Binary file not shown.
|
@ -0,0 +1,31 @@
|
||||||
|
{
|
||||||
|
"resblock": "1",
|
||||||
|
"num_gpus": 0,
|
||||||
|
"batch_size": 16,
|
||||||
|
"learning_rate": 0.0002,
|
||||||
|
"adam_b1": 0.8,
|
||||||
|
"adam_b2": 0.99,
|
||||||
|
"lr_decay": 0.999,
|
||||||
|
"seed": 1234,
|
||||||
|
|
||||||
|
"upsample_rates": [5,5,4,2],
|
||||||
|
"upsample_kernel_sizes": [10,10,8,4],
|
||||||
|
"upsample_initial_channel": 512,
|
||||||
|
"resblock_kernel_sizes": [3,7,11],
|
||||||
|
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
|
||||||
|
|
||||||
|
"segment_size": 6400,
|
||||||
|
"num_mels": 80,
|
||||||
|
"num_freq": 1025,
|
||||||
|
"n_fft": 1024,
|
||||||
|
"hop_size": 200,
|
||||||
|
"win_size": 800,
|
||||||
|
|
||||||
|
"sampling_rate": 16000,
|
||||||
|
|
||||||
|
"fmin": 0,
|
||||||
|
"fmax": 7600,
|
||||||
|
"fmax_for_loss": null,
|
||||||
|
|
||||||
|
"num_workers": 4
|
||||||
|
}
|
Binary file not shown.
Binary file not shown.
|
@ -6,7 +6,7 @@ from pathlib import Path
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from models.encoder import inference as encoder
|
from models.encoder import inference as encoder
|
||||||
from models.synthesizer.preprocess_audio import preprocess_general
|
from models.synthesizer.preprocess_audio import preprocess_general, extract_emo
|
||||||
from models.synthesizer.preprocess_transcript import preprocess_transcript_aishell3, preprocess_transcript_magicdata
|
from models.synthesizer.preprocess_transcript import preprocess_transcript_aishell3, preprocess_transcript_magicdata
|
||||||
|
|
||||||
data_info = {
|
data_info = {
|
||||||
|
@ -41,7 +41,7 @@ data_info = {
|
||||||
|
|
||||||
def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
|
def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
|
||||||
skip_existing: bool, hparams, no_alignments: bool,
|
skip_existing: bool, hparams, no_alignments: bool,
|
||||||
dataset: str, emotion_extract = False):
|
dataset: str, emotion_extract = False, encoder_model_fpath=None):
|
||||||
dataset_info = data_info[dataset]
|
dataset_info = data_info[dataset]
|
||||||
# Gather the input directories
|
# Gather the input directories
|
||||||
dataset_root = datasets_root.joinpath(dataset)
|
dataset_root = datasets_root.joinpath(dataset)
|
||||||
|
@ -77,7 +77,7 @@ def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
|
||||||
speaker_dirs = list(chain.from_iterable(input_dir.glob("*") for input_dir in input_dirs))
|
speaker_dirs = list(chain.from_iterable(input_dir.glob("*") for input_dir in input_dirs))
|
||||||
|
|
||||||
func = partial(dataset_info["speak_func"], out_dir=out_dir, skip_existing=skip_existing,
|
func = partial(dataset_info["speak_func"], out_dir=out_dir, skip_existing=skip_existing,
|
||||||
hparams=hparams, dict_info=dict_info, no_alignments=no_alignments, emotion_extract=emotion_extract)
|
hparams=hparams, dict_info=dict_info, no_alignments=no_alignments, encoder_model_fpath=encoder_model_fpath)
|
||||||
job = Pool(n_processes).imap(func, speaker_dirs)
|
job = Pool(n_processes).imap(func, speaker_dirs)
|
||||||
|
|
||||||
for speaker_metadata in tqdm(job, dataset, len(speaker_dirs), unit="speakers"):
|
for speaker_metadata in tqdm(job, dataset, len(speaker_dirs), unit="speakers"):
|
||||||
|
@ -110,6 +110,13 @@ def embed_utterance(fpaths, encoder_model_fpath):
|
||||||
embed = encoder.embed_utterance(wav)
|
embed = encoder.embed_utterance(wav)
|
||||||
np.save(embed_fpath, embed, allow_pickle=False)
|
np.save(embed_fpath, embed, allow_pickle=False)
|
||||||
|
|
||||||
|
def _emo_extract_from_utterance(fpaths, hparams, skip_existing=False):
    """Extract the emotion features of one preprocessed utterance and save them.

    Args:
        fpaths: ``(wav_fpath, emo_fpath)`` pair. ``wav_fpath`` is the ``.npy``
            file holding the preprocessed waveform; ``emo_fpath`` is the
            ``.npy`` file the extracted features are written to.
        hparams: Hyper-parameter object; only ``sample_rate`` is read here.
        skip_existing: When true, return immediately if ``emo_fpath`` already
            exists.

    Returns:
        None. The result is written to ``emo_fpath``.
    """
    wav_fpath, emo_fpath = fpaths

    # The existence check must target the output file. ``fpaths`` itself is a
    # tuple and has no ``exists()``, so testing it raised AttributeError as soon
    # as skip_existing was enabled.
    if skip_existing and Path(emo_fpath).exists():
        return

    wav = np.load(wav_fpath)
    # extract_emo is given a batch of one waveform; the leading batch axis is
    # dropped again before saving. The positional ``True`` is presumably the
    # ``embeddings`` flag -- confirm against extract_emo's signature.
    emo = extract_emo(np.expand_dims(wav, 0), hparams.sample_rate, True)
    np.save(emo_fpath, emo.squeeze(0), allow_pickle=False)
|
||||||
|
|
||||||
def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_processes: int):
|
def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_processes: int):
|
||||||
wav_dir = synthesizer_root.joinpath("audio")
|
wav_dir = synthesizer_root.joinpath("audio")
|
||||||
|
@ -128,3 +135,21 @@ def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_proce
|
||||||
func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)
|
func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)
|
||||||
job = Pool(n_processes).imap(func, fpaths)
|
job = Pool(n_processes).imap(func, fpaths)
|
||||||
list(tqdm(job, "Embedding", len(fpaths), unit="utterances"))
|
list(tqdm(job, "Embedding", len(fpaths), unit="utterances"))
|
||||||
|
|
||||||
|
def create_emo(synthesizer_root: Path, n_processes: int, skip_existing: bool, hparams):
    """Compute and store emotion features for every utterance in ``train.txt``.

    Reads ``<synthesizer_root>/train.txt``, takes the first ``|``-separated
    field of each line as the waveform file name under
    ``<synthesizer_root>/audio`` and writes the matching features to
    ``<synthesizer_root>/emo`` (``audio-`` in the name replaced by ``emo-``).

    Args:
        synthesizer_root: Directory containing ``audio/`` and ``train.txt``.
        n_processes: Number of worker processes in the pool.
        skip_existing: Forwarded to the per-utterance worker so that already
            extracted files are not recomputed.
        hparams: Hyper-parameters forwarded to the per-utterance worker.
    """
    wav_dir = synthesizer_root.joinpath("audio")
    metadata_fpath = synthesizer_root.joinpath("train.txt")
    assert wav_dir.exists() and metadata_fpath.exists()
    emo_dir = synthesizer_root.joinpath("emo")
    emo_dir.mkdir(exist_ok=True)

    # Gather the input wave filepath and the target output emo filepath.
    # Blank lines are skipped: they would otherwise turn into a bogus path.
    with metadata_fpath.open("r", encoding="utf-8") as metadata_file:
        wav_names = [line.split("|")[0].strip() for line in metadata_file if line.strip()]
    fpaths = [(wav_dir.joinpath(name), emo_dir.joinpath(name.replace("audio-", "emo-")))
              for name in wav_names]

    # TODO: improve on the multiprocessing, it's terrible. Disk I/O is the bottleneck here.
    # Extract the features in separate worker processes. The pool is used as a
    # context manager so the workers are torn down once every result has been
    # consumed, instead of being leaked.
    func = partial(_emo_extract_from_utterance, hparams=hparams, skip_existing=skip_existing)
    with Pool(n_processes) as pool:
        job = pool.imap(func, fpaths)
        list(tqdm(job, "Emo", len(fpaths), unit="utterances"))
|
||||||
|
|
|
@ -13,7 +13,11 @@ import torch
|
||||||
from transformers import Wav2Vec2Processor
|
from transformers import Wav2Vec2Processor
|
||||||
from .models.wav2emo import EmotionExtractorModel
|
from .models.wav2emo import EmotionExtractorModel
|
||||||
|
|
||||||
SAMPLE_RATE = 16000
|
class PinyinConverter(NeutralToneWith5Mixin, DefaultConverter):
|
||||||
|
pass
|
||||||
|
|
||||||
|
pinyin = Pinyin(PinyinConverter()).pinyin
|
||||||
|
|
||||||
|
|
||||||
# load model from hub
|
# load model from hub
|
||||||
device = 'cuda' if torch.cuda.is_available() else "cpu"
|
device = 'cuda' if torch.cuda.is_available() else "cpu"
|
||||||
|
@ -40,14 +44,8 @@ def extract_emo(
|
||||||
|
|
||||||
return y
|
return y
|
||||||
|
|
||||||
class PinyinConverter(NeutralToneWith5Mixin, DefaultConverter):
|
|
||||||
pass
|
|
||||||
|
|
||||||
pinyin = Pinyin(PinyinConverter()).pinyin
|
|
||||||
|
|
||||||
|
|
||||||
def _process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
|
def _process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
|
||||||
skip_existing: bool, hparams, emotion_extract: bool):
|
skip_existing: bool, hparams, encoder_model_fpath):
|
||||||
## FOR REFERENCE:
|
## FOR REFERENCE:
|
||||||
# For you not to lose your head if you ever wish to change things here or implement your own
|
# For you not to lose your head if you ever wish to change things here or implement your own
|
||||||
# synthesizer.
|
# synthesizer.
|
||||||
|
@ -69,6 +67,8 @@ def _process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
|
||||||
|
|
||||||
# Trim silence
|
# Trim silence
|
||||||
if hparams.trim_silence:
|
if hparams.trim_silence:
|
||||||
|
if not encoder.is_loaded():
|
||||||
|
encoder.load_model(encoder_model_fpath)
|
||||||
wav = encoder.preprocess_wav(wav, normalize=False, trim_silence=True)
|
wav = encoder.preprocess_wav(wav, normalize=False, trim_silence=True)
|
||||||
|
|
||||||
# Skip utterances that are too short
|
# Skip utterances that are too short
|
||||||
|
@ -109,7 +109,7 @@ def _split_on_silences(wav_fpath, words, hparams):
|
||||||
|
|
||||||
return wav, res
|
return wav, res
|
||||||
|
|
||||||
def preprocess_general(speaker_dir, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool, emotion_extract: bool):
|
def preprocess_general(speaker_dir, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool, encoder_model_fpath: Path):
|
||||||
metadata = []
|
metadata = []
|
||||||
extensions = ["*.wav", "*.flac", "*.mp3"]
|
extensions = ["*.wav", "*.flac", "*.mp3"]
|
||||||
for extension in extensions:
|
for extension in extensions:
|
||||||
|
@ -124,14 +124,9 @@ def preprocess_general(speaker_dir, out_dir: Path, skip_existing: bool, hparams,
|
||||||
sub_basename = "%s_%02d" % (wav_fpath.name, 0)
|
sub_basename = "%s_%02d" % (wav_fpath.name, 0)
|
||||||
wav, text = _split_on_silences(wav_fpath, words, hparams)
|
wav, text = _split_on_silences(wav_fpath, words, hparams)
|
||||||
result = _process_utterance(wav, text, out_dir, sub_basename,
|
result = _process_utterance(wav, text, out_dir, sub_basename,
|
||||||
skip_existing, hparams, emotion_extract)
|
skip_existing, hparams, encoder_model_fpath)
|
||||||
if result is None:
|
if result is None:
|
||||||
continue
|
continue
|
||||||
wav_fpath_name, mel_fpath_name, embed_fpath_name, wav, mel_frames, text = result
|
wav_fpath_name, mel_fpath_name, embed_fpath_name, wav, mel_frames, text = result
|
||||||
emo_fpath = out_dir.joinpath("emo", "emo-%s.npy" % sub_basename)
|
|
||||||
skip_emo_extract = not emotion_extract or (skip_existing and emo_fpath.exists())
|
|
||||||
if not skip_emo_extract and wav is not None:
|
|
||||||
emo = extract_emo(np.expand_dims(wav, 0), hparams.sample_rate, True)
|
|
||||||
np.save(emo_fpath, emo.squeeze(0), allow_pickle=False)
|
|
||||||
metadata.append([wav_fpath_name, mel_fpath_name, embed_fpath_name, len(wav), mel_frames, text])
|
metadata.append([wav_fpath_name, mel_fpath_name, embed_fpath_name, len(wav), mel_frames, text])
|
||||||
return [m for m in metadata if m is not None]
|
return [m for m in metadata if m is not None]
|
||||||
|
|
|
@ -1,10 +1,11 @@
|
||||||
import os
|
import os
|
||||||
import random
|
import random
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
import torch.nn.functional as F
|
||||||
import torch
|
import torch
|
||||||
import torch.utils.data
|
import torch.utils.data
|
||||||
|
|
||||||
from utils.audio_utils import spectrogram1, load_wav_to_torch, spectrogram
|
from utils.audio_utils import load_wav_to_torch, spectrogram
|
||||||
from utils.util import intersperse
|
from utils.util import intersperse
|
||||||
from models.synthesizer.utils.text import text_to_sequence
|
from models.synthesizer.utils.text import text_to_sequence
|
||||||
|
|
||||||
|
@ -51,21 +52,10 @@ class VitsDataset(torch.utils.data.Dataset):
|
||||||
lengths = []
|
lengths = []
|
||||||
|
|
||||||
# for audiopath, sid, text in self.audio_metadata:
|
# for audiopath, sid, text in self.audio_metadata:
|
||||||
sid = 0
|
for wav_fpath, mel_fpath, embed_path, wav_length, mel_frames, text, spkid in self.audio_metadata:
|
||||||
spk_to_sid = {}
|
|
||||||
for wav_fpath, mel_fpath, embed_path, wav_length, mel_frames, text in self.audio_metadata:
|
|
||||||
if self.min_text_len <= len(text) and len(text) <= self.max_text_len:
|
if self.min_text_len <= len(text) and len(text) <= self.max_text_len:
|
||||||
# TODO: for magic data only
|
audio_metadata_new.append([wav_fpath, mel_fpath, embed_path, wav_length, mel_frames, text, spkid])
|
||||||
speaker_name = wav_fpath.split("_")[1]
|
|
||||||
# # TODO: for ai data only
|
|
||||||
# speaker_name = wav_fpath.split("-")[1][6:9]
|
|
||||||
if speaker_name not in spk_to_sid:
|
|
||||||
sid += 1
|
|
||||||
spk_to_sid[speaker_name] = sid
|
|
||||||
|
|
||||||
audio_metadata_new.append([wav_fpath, mel_fpath, embed_path, wav_length, mel_frames, text, spk_to_sid[speaker_name]])
|
|
||||||
lengths.append(os.path.getsize(f'{self.datasets_root}{os.sep}audio{os.sep}{wav_fpath}') // (2 * self.hop_length))
|
lengths.append(os.path.getsize(f'{self.datasets_root}{os.sep}audio{os.sep}{wav_fpath}') // (2 * self.hop_length))
|
||||||
print("found sid:%d", sid)
|
|
||||||
self.audio_metadata = audio_metadata_new
|
self.audio_metadata = audio_metadata_new
|
||||||
self.lengths = lengths
|
self.lengths = lengths
|
||||||
|
|
||||||
|
@ -74,50 +64,31 @@ class VitsDataset(torch.utils.data.Dataset):
|
||||||
wav_fpath, text, sid = audio_metadata[0], audio_metadata[5], audio_metadata[6]
|
wav_fpath, text, sid = audio_metadata[0], audio_metadata[5], audio_metadata[6]
|
||||||
text = self.get_text(text)
|
text = self.get_text(text)
|
||||||
|
|
||||||
# TODO: add original audio data root for loading
|
spec, wav = self.get_audio(f'{self.datasets_root}{os.sep}audio{os.sep}{wav_fpath}')
|
||||||
file_name = wav_fpath.split("_00")[0].split('-')[1]
|
|
||||||
spec, wav = self.get_audio(f'{self.datasets_root}{os.sep}..{os.sep}..{os.sep}magicdata{os.sep}train{os.sep}{"_".join(file_name.split("_")[:2])}{os.sep}{file_name}')
|
|
||||||
|
|
||||||
# spec, wav = self.get_audio(f'{self.datasets_root}{os.sep}audio{os.sep}{wav_fpath}')
|
|
||||||
sid = self.get_sid(sid)
|
sid = self.get_sid(sid)
|
||||||
emo = torch.FloatTensor(np.load(f'{self.datasets_root}{os.sep}emo{os.sep}{wav_fpath.replace("audio", "emo")}'))
|
emo = torch.FloatTensor(np.load(f'{self.datasets_root}{os.sep}emo{os.sep}{wav_fpath.replace("audio", "emo")}'))
|
||||||
return (text, spec, wav, sid, emo)
|
return (text, spec, wav, sid, emo)
|
||||||
|
|
||||||
def get_audio(self, filename):
|
def get_audio(self, filename):
|
||||||
audio, sampling_rate = load_wav_to_torch(filename)
|
# Load preprocessed wav npy instead of reading from wav file
|
||||||
if sampling_rate != self.sampling_rate:
|
audio = torch.FloatTensor(np.load(filename))
|
||||||
raise ValueError("{} {} SR doesn't match target {} SR".format(
|
audio_norm = audio.unsqueeze(0)
|
||||||
sampling_rate, self.sampling_rate))
|
|
||||||
audio_norm = audio / self.max_wav_value
|
spec_filename = filename.replace(".wav", ".spec")
|
||||||
audio_norm = audio_norm.unsqueeze(0)
|
if os.path.exists(spec_filename):
|
||||||
spec = spectrogram(audio_norm, self.filter_length, self.hop_length, self.win_length,
|
spec = torch.load(spec_filename)
|
||||||
center=False)
|
else:
|
||||||
|
spec = spectrogram(audio_norm, self.filter_length,self.hop_length, self.win_length,
|
||||||
|
center=False)
|
||||||
|
torch.save(spec, spec_filename)
|
||||||
spec = torch.squeeze(spec, 0)
|
spec = torch.squeeze(spec, 0)
|
||||||
return spec, audio_norm
|
return spec, audio_norm
|
||||||
|
|
||||||
# print("Loading", filename)
|
|
||||||
# # audio = torch.FloatTensor(np.load(filename).astype(np.float32))
|
|
||||||
# audio = audio.unsqueeze(0)
|
|
||||||
# audio_norm = audio / self.max_wav_value
|
|
||||||
# audio_norm = audio_norm.unsqueeze(0)
|
|
||||||
# # spec_filename = filename.replace(".wav", ".spec.pt")
|
|
||||||
# # if os.path.exists(spec_filename):
|
|
||||||
# # spec = torch.load(spec_filename)
|
|
||||||
# # else:
|
|
||||||
# # spec = spectrogram(audio, self.filter_length,self.hop_length, self.win_length,
|
|
||||||
# # center=False)
|
|
||||||
# # spec = torch.squeeze(spec, 0)
|
|
||||||
# # torch.save(spec, spec_filename)
|
|
||||||
# spec = spectrogram(audio, self.filter_length, self.hop_length, self.win_length,
|
|
||||||
# center=False)
|
|
||||||
# spec = torch.squeeze(spec, 0)
|
|
||||||
# return spec, audio
|
|
||||||
|
|
||||||
def get_text(self, text):
|
def get_text(self, text):
|
||||||
if self.cleaned_text:
|
if self.cleaned_text:
|
||||||
text_norm = text_to_sequence(text, self.text_cleaners)
|
text_norm = text_to_sequence(text, self.text_cleaners)
|
||||||
if self.add_blank:
|
if self.add_blank:
|
||||||
text_norm = intersperse(text_norm, 0)
|
text_norm = intersperse(text_norm, 0) # 在所有文本数值序列中的元素前后都补充一个0 - 不适用于中文
|
||||||
text_norm = torch.LongTensor(text_norm)
|
text_norm = torch.LongTensor(text_norm)
|
||||||
return text_norm
|
return text_norm
|
||||||
|
|
||||||
|
@ -188,7 +159,7 @@ class VitsDatasetCollate():
|
||||||
emo[i, :] = row[4]
|
emo[i, :] = row[4]
|
||||||
|
|
||||||
if self.return_ids:
|
if self.return_ids:
|
||||||
return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, sid, ids_sorted_decreasing
|
return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, sid, ids_sorted_decreasing, emo
|
||||||
return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, sid, emo
|
return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, sid, emo
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,19 @@
|
||||||
|
import numpy as np
|
||||||
|
import torch
|
||||||
|
from .monotonic_align.core import maximum_path_c
|
||||||
|
|
||||||
|
|
||||||
|
def maximum_path(neg_cent, mask):
    """Batch wrapper around the Cython ``maximum_path_c`` kernel.

    neg_cent: [b, t_t, t_s]
    mask: [b, t_t, t_s]

    Returns a tensor with the shape of ``neg_cent``, placed on its device and
    cast to its dtype, holding the path array filled in by the kernel.
    """
    out_device, out_dtype = neg_cent.device, neg_cent.dtype

    # The kernel operates on host-side float32 / int32 arrays.
    scores = neg_cent.data.cpu().numpy().astype(np.float32)
    alignment = np.zeros(scores.shape, dtype=np.int32)

    # Per-item extents: the mask summed over axis 1 / axis 2, first column.
    extent_t = mask.sum(1)[:, 0].data.cpu().numpy().astype(np.int32)
    extent_s = mask.sum(2)[:, 0].data.cpu().numpy().astype(np.int32)

    # Fills ``alignment`` in place.
    maximum_path_c(alignment, scores, extent_t, extent_s)
    return torch.from_numpy(alignment).to(device=out_device, dtype=out_dtype)
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,42 @@
|
||||||
|
cimport cython
|
||||||
|
from cython.parallel import prange
|
||||||
|
|
||||||
|
|
||||||
|
@cython.boundscheck(False)
@cython.wraparound(False)
cdef void maximum_path_each(int[:,::1] path, float[:,::1] value, int t_y, int t_x, float max_neg_val=-1e9) nogil:
    # Dynamic-programming search for a single item.
    #
    # Forward pass: ``value[y, x]`` is accumulated in place with the better of
    # its two predecessors in row ``y-1`` (same column, or column ``x-1``).
    # Backward pass: starting from column ``t_x - 1`` in the last row, walk up
    # the rows and mark one cell per row in ``path`` with 1.
    #
    # ``value`` is modified in place; ``path`` is expected to be zeroed by the
    # caller. ``max_neg_val`` stands in for "no predecessor".
    cdef int x
    cdef int y
    cdef float v_prev
    cdef float v_cur
    cdef int index = t_x - 1

    # With wraparound disabled a negative column index is an out-of-bounds
    # write, so an empty x-extent must not reach the backtracking loop.
    if t_x <= 0:
        return

    for y in range(t_y):
        for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)):
            # Predecessor in the same column (not available on the diagonal).
            if x == y:
                v_cur = max_neg_val
            else:
                v_cur = value[y-1, x]
            # Predecessor in the previous column; only the origin cell starts at 0.
            if x == 0:
                if y == 0:
                    v_prev = 0.
                else:
                    v_prev = max_neg_val
            else:
                v_prev = value[y-1, x-1]
            value[y, x] += max(v_prev, v_cur)

    for y in range(t_y - 1, -1, -1):
        path[y, index] = 1
        # ``y != 0`` keeps ``value[y-1, ...]`` from being read at row -1 (an
        # out-of-bounds read with wraparound disabled) when index is still
        # non-zero in the first row; the index is not used after that row, so
        # skipping the update there changes no output.
        if y != 0 and index != 0 and (index == y or value[y-1, index] < value[y-1, index-1]):
            index = index - 1
|
||||||
|
|
||||||
|
|
||||||
|
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef void maximum_path_c(int[:,:,::1] paths, float[:,:,::1] values, int[::1] t_ys, int[::1] t_xs) nogil:
    # Batch entry point: runs maximum_path_each on every item along the first
    # axis of ``paths`` / ``values``, using the matching entries of ``t_ys`` and
    # ``t_xs`` as that item's extents. Items are dispatched through ``prange``.
    cdef int n_items = paths.shape[0]
    cdef int k
    for k in prange(n_items, nogil=True):
        maximum_path_each(paths[k], values[k], t_ys[k], t_xs[k])
|
|
@ -0,0 +1,9 @@
|
||||||
|
from distutils.core import setup
|
||||||
|
from Cython.Build import cythonize
|
||||||
|
import numpy
|
||||||
|
|
||||||
|
# Build the ``monotonic_align`` extension from core.pyx.
# NOTE(review): the ``distutils`` import this script relies on was removed from
# the standard library in Python 3.12 (PEP 632) -- confirm the supported
# interpreter versions before building.
extensions = cythonize("core.pyx")
numpy_headers = numpy.get_include()

setup(
    name='monotonic_align',
    ext_modules=extensions,
    include_dirs=[numpy_headers],
)
|
6
pre.py
6
pre.py
|
@ -1,4 +1,4 @@
|
||||||
from models.synthesizer.preprocess import create_embeddings, preprocess_dataset
|
from models.synthesizer.preprocess import create_embeddings, preprocess_dataset, create_emo
|
||||||
from models.synthesizer.hparams import hparams
|
from models.synthesizer.hparams import hparams
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import argparse
|
import argparse
|
||||||
|
@ -64,7 +64,7 @@ if __name__ == "__main__":
|
||||||
"noise removal and is recommended. Please install and try again. If installation fails, "
|
"noise removal and is recommended. Please install and try again. If installation fails, "
|
||||||
"use --no_trim to disable this error message.")
|
"use --no_trim to disable this error message.")
|
||||||
encoder_model_fpath = args.encoder_model_fpath
|
encoder_model_fpath = args.encoder_model_fpath
|
||||||
del args.no_trim, args.encoder_model_fpath
|
del args.no_trim
|
||||||
|
|
||||||
args.hparams = hparams.parse(args.hparams)
|
args.hparams = hparams.parse(args.hparams)
|
||||||
n_processes_embed = args.n_processes_embed
|
n_processes_embed = args.n_processes_embed
|
||||||
|
@ -73,3 +73,5 @@ if __name__ == "__main__":
|
||||||
|
|
||||||
create_embeddings(synthesizer_root=args.out_dir, n_processes=n_processes_embed, encoder_model_fpath=encoder_model_fpath)
|
create_embeddings(synthesizer_root=args.out_dir, n_processes=n_processes_embed, encoder_model_fpath=encoder_model_fpath)
|
||||||
|
|
||||||
|
if args.emotion_extract:
|
||||||
|
create_emo(synthesizer_root=args.out_dir, n_processes=n_processes_embed, skip_existing=args.skip_existing, hparams=args.hparams)
|
||||||
|
|
|
@ -17,28 +17,6 @@ def load_wav_to_torch(full_path):
|
||||||
sampling_rate, data = read(full_path)
|
sampling_rate, data = read(full_path)
|
||||||
return torch.FloatTensor(data.astype(np.float32)), sampling_rate
|
return torch.FloatTensor(data.astype(np.float32)), sampling_rate
|
||||||
|
|
||||||
def spectrogram1(y, n_fft, sampling_rate, hop_size, win_size, center=False):
|
|
||||||
if torch.min(y) < -1.:
|
|
||||||
print('min value is ', torch.min(y))
|
|
||||||
if torch.max(y) > 1.:
|
|
||||||
print('max value is ', torch.max(y))
|
|
||||||
|
|
||||||
global hann_window
|
|
||||||
dtype_device = str(y.dtype) + '_' + str(y.device)
|
|
||||||
wnsize_dtype_device = str(win_size) + '_' + dtype_device
|
|
||||||
if wnsize_dtype_device not in hann_window:
|
|
||||||
hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)
|
|
||||||
|
|
||||||
y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
|
|
||||||
y = y.squeeze(1)
|
|
||||||
|
|
||||||
spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
|
|
||||||
center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=True)
|
|
||||||
|
|
||||||
spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
|
|
||||||
return spec
|
|
||||||
|
|
||||||
|
|
||||||
def spectrogram(y, n_fft, hop_size, win_size, center=False):
|
def spectrogram(y, n_fft, hop_size, win_size, center=False):
|
||||||
if torch.min(y) < -1.:
|
if torch.min(y) < -1.:
|
||||||
print('min value is ', torch.min(y))
|
print('min value is ', torch.min(y))
|
||||||
|
|
|
@ -17,7 +17,7 @@
|
||||||
"from models.synthesizer.utils.text import text_to_sequence\n",
|
"from models.synthesizer.utils.text import text_to_sequence\n",
|
||||||
"\n",
|
"\n",
|
||||||
"\n",
|
"\n",
|
||||||
"hps = load_hparams_json(\"data/ckpt/synthesizer/vits2/config.json\")\n",
|
"hps = load_hparams_json(\"data/ckpt/synthesizer/vits5/config.json\")\n",
|
||||||
"print(hps.train)\n",
|
"print(hps.train)\n",
|
||||||
"model = Vits(\n",
|
"model = Vits(\n",
|
||||||
" len(symbols),\n",
|
" len(symbols),\n",
|
||||||
|
@ -27,7 +27,7 @@
|
||||||
" **hps[\"model\"])\n",
|
" **hps[\"model\"])\n",
|
||||||
"_ = model.eval()\n",
|
"_ = model.eval()\n",
|
||||||
"device = torch.device(\"cpu\")\n",
|
"device = torch.device(\"cpu\")\n",
|
||||||
"checkpoint = torch.load(str(\"data/ckpt/synthesizer/vits2/G_120000.pth\"), map_location=device)\n",
|
"checkpoint = torch.load(str(\"data/ckpt/synthesizer/vits5/G_56000.pth\"), map_location=device)\n",
|
||||||
"if \"model_state\" in checkpoint:\n",
|
"if \"model_state\" in checkpoint:\n",
|
||||||
" state = checkpoint[\"model_state\"]\n",
|
" state = checkpoint[\"model_state\"]\n",
|
||||||
"else:\n",
|
"else:\n",
|
||||||
|
@ -35,17 +35,17 @@
|
||||||
"model.load_state_dict(state, strict=False)\n",
|
"model.load_state_dict(state, strict=False)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# 随机抽取情感参考音频的根目录\n",
|
"# 随机抽取情感参考音频的根目录\n",
|
||||||
"random_emotion_root = \"D:\\\\audiodata\\\\aidatatang_200zh\\\\corpus\\\\train\\\\G0017\"\n",
|
"random_emotion_root = \"D:\\\\audiodata\\\\SV2TTS\\\\synthesizer\\\\emo\\\\\"\n",
|
||||||
"import random, re\n",
|
"import random, re\n",
|
||||||
"from pypinyin import lazy_pinyin, Style\n",
|
"from pypinyin import lazy_pinyin, Style\n",
|
||||||
"\n",
|
"\n",
|
||||||
"import os\n",
|
"import os\n",
|
||||||
"\n",
|
"\n",
|
||||||
"def tts(txt, emotion, sid=0):\n",
|
"def tts(txt, emotion, sid=0):\n",
|
||||||
" txt = \" \".join(lazy_pinyin(txt, style=Style.TONE3, neutral_tone_with_five=True))\n",
|
" txt = \" \".join(lazy_pinyin(txt, style=Style.TONE3, neutral_tone_with_five=False))\n",
|
||||||
" text_norm = text_to_sequence(txt, hps[\"data\"][\"text_cleaners\"])\n",
|
" text_norm = text_to_sequence(txt, hps[\"data\"][\"text_cleaners\"])\n",
|
||||||
" if hps[\"data\"][\"add_blank\"]:\n",
|
" # if hps[\"data\"][\"add_blank\"]:\n",
|
||||||
" text_norm = intersperse(text_norm, 0)\n",
|
" # text_norm = intersperse(text_norm, 0)\n",
|
||||||
" stn_tst = torch.LongTensor(text_norm)\n",
|
" stn_tst = torch.LongTensor(text_norm)\n",
|
||||||
"\n",
|
"\n",
|
||||||
" with torch.no_grad(): #inference mode\n",
|
" with torch.no_grad(): #inference mode\n",
|
||||||
|
@ -57,6 +57,13 @@
|
||||||
" import librosa\n",
|
" import librosa\n",
|
||||||
" wav, sr = librosa.load(emotion, 16000)\n",
|
" wav, sr = librosa.load(emotion, 16000)\n",
|
||||||
" emo = torch.FloatTensor(extract_emo(np.expand_dims(wav, 0), sr, embeddings=True))\n",
|
" emo = torch.FloatTensor(extract_emo(np.expand_dims(wav, 0), sr, embeddings=True))\n",
|
||||||
|
" elif emotion == \"random_sample\":\n",
|
||||||
|
" rand_emo = random.sample(os.listdir(random_emotion_root), 1)[0]\n",
|
||||||
|
" print(rand_emo)\n",
|
||||||
|
" emo = torch.FloatTensor(np.load(f\"{random_emotion_root}\\\\{rand_emo}\")).unsqueeze(0)\n",
|
||||||
|
" elif emotion.endswith(\"npy\"):\n",
|
||||||
|
" print(emotion)\n",
|
||||||
|
" emo = torch.FloatTensor(np.load(f\"{random_emotion_root}\\\\{emotion}\")).unsqueeze(0)\n",
|
||||||
" else:\n",
|
" else:\n",
|
||||||
" print(\"emotion参数不正确\")\n",
|
" print(\"emotion参数不正确\")\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
@ -80,8 +87,55 @@
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"txt = \"随机抽取的音频文件路径可以用于使用该情感合成其他句子\"\n",
|
"txt = \"我们将其拓展到文本驱动数字人形象领域\"\n",
|
||||||
"tts(txt, emotion='C:\\\\Users\\\\babys\\\\Desktop\\\\voicecollection\\\\secondround\\\\美玉.wav', sid=2)"
|
"#正常: \n",
|
||||||
|
"tts(txt, emotion='emo-T0055G4906S0052.wav_00.npy', sid=100)\n",
|
||||||
|
"#快速:emo-T0055G2323S0179.wav_00.npy\n",
|
||||||
|
"\n",
|
||||||
|
"#难过:\n",
|
||||||
|
"tts(txt, emotion='emo-15_4581_20170825202626.wav_00.npy', sid=100)\n",
|
||||||
|
"\n",
|
||||||
|
"#开心:T0055G2412S0498.wav\n",
|
||||||
|
"tts(txt, emotion='emo-T0055G2412S0498.wav_00.npy', sid=100)\n",
|
||||||
|
"\n",
|
||||||
|
"#愤怒 T0055G1371S0363.wav T0055G1344S0160.wav\n",
|
||||||
|
"tts(txt, emotion='emo-T0055G1344S0160.wav_00.npy', sid=100)\n",
|
||||||
|
"\n",
|
||||||
|
"#疲惫\n",
|
||||||
|
"tts(txt, emotion='emo-T0055G2294S0476.wav_00.npy', sid=100)\n",
|
||||||
|
"\n",
|
||||||
|
"#着急\n",
|
||||||
|
"tts(txt, emotion='emo-T0055G1671S0170.wav_00.npy', sid=100)\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"txt = \"我们将其拓展到文本驱动数字人形象领域\"\n",
|
||||||
|
"tts(txt, emotion='random_sample', sid=100)\n",
|
||||||
|
"tts(txt, emotion='random_sample', sid=100)\n",
|
||||||
|
"tts(txt, emotion='random_sample', sid=100)\n",
|
||||||
|
"tts(txt, emotion='random_sample', sid=100)\n",
|
||||||
|
"tts(txt, emotion='random_sample', sid=100)\n",
|
||||||
|
"tts(txt, emotion='random_sample', sid=100)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"txt = \"我们将其拓展到文本驱动数字人形象领域\"\n",
|
||||||
|
"types = [\"平淡\", \"激动\", \"疲惫\", \"兴奋\", \"沮丧\", \"开心\"]\n",
|
||||||
|
"for t in types:\n",
|
||||||
|
" print(t)\n",
|
||||||
|
" tts(txt, emotion=f'C:\\\\Users\\\\babys\\\\Music\\\\{t}.wav', sid=100)\n",
|
||||||
|
"# tts(txt, emotion='D:\\\\audiodata\\\\aidatatang_200zh\\\\corpus\\\\train\\\\G1858\\\\T0055G1858S0342.wav', sid=5)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -128,7 +182,7 @@
|
||||||
" use_lws = False, # \"Fast spectrogram phase recovery using local weighted sums\"\n",
|
" use_lws = False, # \"Fast spectrogram phase recovery using local weighted sums\"\n",
|
||||||
" symmetric_mels = True, # Sets mel range to [-max_abs_value, max_abs_value] if True,\n",
|
" symmetric_mels = True, # Sets mel range to [-max_abs_value, max_abs_value] if True,\n",
|
||||||
" # and [0, max_abs_value] if False\n",
|
" # and [0, max_abs_value] if False\n",
|
||||||
" trim_silence = True, # Use with sample_rate of 16000 for best results\n",
|
" trim_silence = False, # Use with sample_rate of 16000 for best results\n",
|
||||||
"\n",
|
"\n",
|
||||||
")\n",
|
")\n",
|
||||||
"preprocess_dataset(datasets_root=datasets_root, \n",
|
"preprocess_dataset(datasets_root=datasets_root, \n",
|
||||||
|
@ -137,7 +191,7 @@
|
||||||
" skip_existing=True, \n",
|
" skip_existing=True, \n",
|
||||||
" hparams=hparams, \n",
|
" hparams=hparams, \n",
|
||||||
" no_alignments=False, \n",
|
" no_alignments=False, \n",
|
||||||
" dataset=\"magicdata\", \n",
|
" dataset=\"aidatatang_200zh\", \n",
|
||||||
" emotion_extract=True)"
|
" emotion_extract=True)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
@ -213,6 +267,14 @@
|
||||||
"metadata_file.close()"
|
"metadata_file.close()"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"attachments": {},
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"从训练集中抽取10%作为测试集"
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
|
@ -220,36 +282,239 @@
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"from pathlib import Path\n",
|
"from pathlib import Path\n",
|
||||||
"import os\n",
|
|
||||||
"import shutil\n",
|
|
||||||
"emo_root = Path('../audiodata/SV2TTS/synthesizer').joinpath('emo')\n",
|
|
||||||
"# raw_root = Path('../audiodata/aidatatang_200zh/corpus/train')\n",
|
|
||||||
"# emo_file_list = emo_root.glob(\"**/*.npy\")\n",
|
|
||||||
"# for emo_file in emo_file_list:\n",
|
|
||||||
"# if emo_file.name.endswith('wav__00.npy'):\n",
|
|
||||||
"# folder = emo_file.parent\n",
|
|
||||||
"# os.rename(emo_file, folder.joinpath(emo_file.name.replace(\"__00\", \"_00\")))\n",
|
|
||||||
" # shutil.move(emo_file, emo_root.joinpath(emo_file.name))\n",
|
|
||||||
"\n",
|
|
||||||
"root = Path('../audiodata/SV2TTS/synthesizer')\n",
|
"root = Path('../audiodata/SV2TTS/synthesizer')\n",
|
||||||
"dict_info = []\n",
|
"dict_info1 = []\n",
|
||||||
|
"dict_info2 = []\n",
|
||||||
|
"count = 1\n",
|
||||||
"with open(root.joinpath(\"train.txt\"), \"r\", encoding=\"utf-8\") as dict_meta:\n",
|
"with open(root.joinpath(\"train.txt\"), \"r\", encoding=\"utf-8\") as dict_meta:\n",
|
||||||
" for raw in dict_meta:\n",
|
" for raw in dict_meta:\n",
|
||||||
" if not raw:\n",
|
" if not raw:\n",
|
||||||
" continue\n",
|
" continue\n",
|
||||||
" v = raw.split(\"|\")[0].replace(\"audio\",\"emo\")\n",
|
" if count % 10 == 0:\n",
|
||||||
" emo_fpath = root.joinpath(\"emo\").joinpath(v)\n",
|
" dict_info2.append(raw)\n",
|
||||||
" if emo_fpath.exists():\n",
|
" else:\n",
|
||||||
" dict_info.append(raw)\n",
|
" dict_info1.append(raw)\n",
|
||||||
" # else:\n",
|
" count += 1\n",
|
||||||
" # print(emo_fpath)\n",
|
|
||||||
"# Iterate over each wav\n",
|
"# Iterate over each wav\n",
|
||||||
"meta2 = Path('../audiodata/SV2TTS/synthesizer/train2.txt')\n",
|
"meta1 = Path('../audiodata/SV2TTS/synthesizer/train1.txt')\n",
|
||||||
|
"metadata_file = meta1.open(\"w\", encoding=\"utf-8\")\n",
|
||||||
|
"for new_info in dict_info1:\n",
|
||||||
|
" metadata_file.write(new_info)\n",
|
||||||
|
"metadata_file.close()\n",
|
||||||
|
"\n",
|
||||||
|
"meta2 = Path('../audiodata/SV2TTS/synthesizer/eval.txt')\n",
|
||||||
"metadata_file = meta2.open(\"w\", encoding=\"utf-8\")\n",
|
"metadata_file = meta2.open(\"w\", encoding=\"utf-8\")\n",
|
||||||
"for new_info in dict_info:\n",
|
"for new_info in dict_info2:\n",
|
||||||
" metadata_file.write(new_info)\n",
|
" metadata_file.write(new_info)\n",
|
||||||
"metadata_file.close()"
|
"metadata_file.close()"
|
||||||
]
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"attachments": {},
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"evaluation"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from pathlib import Path\n",
|
||||||
|
"root = Path('../audiodata/SV2TTS/synthesizer')\n",
|
||||||
|
"spks = []\n",
|
||||||
|
"spk_id = {}\n",
|
||||||
|
"rows = []\n",
|
||||||
|
"with open(root.joinpath(\"eval.txt\"), \"r\", encoding=\"utf-8\") as dict_meta:\n",
|
||||||
|
" for raw in dict_meta:\n",
|
||||||
|
" speaker_name = raw.split(\"-\")[1][6:10]\n",
|
||||||
|
" if speaker_name not in spk_id:\n",
|
||||||
|
" spks.append(speaker_name)\n",
|
||||||
|
" spk_id[speaker_name] = 1\n",
|
||||||
|
" rows.append(raw)\n",
|
||||||
|
"i = 0\n",
|
||||||
|
"spks.sort()\n",
|
||||||
|
"\n",
|
||||||
|
"for sp in spks:\n",
|
||||||
|
" spk_id[sp] = str(i)\n",
|
||||||
|
" i = i + 1\n",
|
||||||
|
"print(len(spks))\n",
|
||||||
|
"meta2 = Path('../audiodata/SV2TTS/synthesizer/eval2.txt')\n",
|
||||||
|
"metadata_file = meta2.open(\"w\", encoding=\"utf-8\")\n",
|
||||||
|
"for row in rows:\n",
|
||||||
|
" speaker_n = row.split(\"-\")[1][6:10]\n",
|
||||||
|
" metadata_file.write(row.strip()+\"|\"+spk_id[speaker_n]+\"\\n\")\n",
|
||||||
|
"metadata_file.close()\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"attachments": {},
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"[Not Recommended]\n",
|
||||||
|
"Try to transcript map to detailed format:\n",
|
||||||
|
"ni3 hao3 -> n i3 <pad> h ao3\n",
|
||||||
|
"\n",
|
||||||
|
"After couple of tests, I think this method will not improve the quality of result and may cause the crash of monotonic alignment."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"\n",
|
||||||
|
"from pathlib import Path\n",
|
||||||
|
"datasets_root = Path(\"../audiodata/SV2TTS/synthesizer/\")\n",
|
||||||
|
"\n",
|
||||||
|
"dictionary_fp = Path(\"../audiodata/ProDiff/processed/mandarin_pinyin.dict\")\n",
|
||||||
|
"dict_map = {}\n",
|
||||||
|
"for l in open(dictionary_fp, encoding='utf-8').readlines():\n",
|
||||||
|
" item = l.split(\"\\t\")\n",
|
||||||
|
" dict_map[item[0]] = item[1].replace(\"\\n\",\"\")\n",
|
||||||
|
"\n",
|
||||||
|
"with datasets_root.joinpath('train2.txt').open(\"w+\", encoding='utf-8') as f:\n",
|
||||||
|
" for l in open(datasets_root.joinpath('train.txt'), encoding='utf-8').readlines():\n",
|
||||||
|
" items = l.strip().replace(\"\\n\",\"\").replace(\"\\t\",\" \").split(\"|\")\n",
|
||||||
|
" phs_str = \"\"\n",
|
||||||
|
" for word in items[5].split(\" \"):\n",
|
||||||
|
" if word in dict_map:\n",
|
||||||
|
" phs_str += dict_map[word] \n",
|
||||||
|
" else:\n",
|
||||||
|
" phs_str += word\n",
|
||||||
|
" phs_str += \" _ \"\n",
|
||||||
|
" items[5] = phs_str\n",
|
||||||
|
" # if not os.path.exists(mfa_input_root.joinpath('train.txt')):\n",
|
||||||
|
" # with open(mfa_input_root.joinpath(fileName + 'lab'), 'w+', encoding=\"utf-8\") as f:\n",
|
||||||
|
" f.write(\"|\".join(items) + \"\\n\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"attachments": {},
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"预处理后的数据可视化"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import matplotlib.pyplot as plt\n",
|
||||||
|
"import librosa.display\n",
|
||||||
|
"import librosa, torch\n",
|
||||||
|
"import numpy as np\n",
|
||||||
|
"from utils.audio_utils import spectrogram, mel_spectrogram, load_wav_to_torch, spec_to_mel\n",
|
||||||
|
"\n",
|
||||||
|
"# x, sr = librosa.load(\"D:\\audiodata\\SV2TTS\\synthesizer\\audio\\audio-T0055G2333S0196.wav_00.npy\")\n",
|
||||||
|
"x = np.load(\"D:\\\\audiodata\\\\SV2TTS\\\\synthesizer\\\\audio\\\\audio-T0055G1858S0342.wav_00.npy\")\n",
|
||||||
|
"\n",
|
||||||
|
"plt.figure(figsize=(14, 5))\n",
|
||||||
|
"librosa.display.waveplot(x)\n",
|
||||||
|
"\n",
|
||||||
|
"X = librosa.stft(x)\n",
|
||||||
|
"Xdb = librosa.amplitude_to_db(abs(X))\n",
|
||||||
|
"plt.figure(figsize=(14, 5))\n",
|
||||||
|
"librosa.display.specshow(Xdb, x_axis='time', y_axis='hz')\n",
|
||||||
|
"\n",
|
||||||
|
"# spectrogram = np.load(\"D:\\\\audiodata\\\\SV2TTS\\\\synthesizer\\\\mels\\\\mel-T0055G1858S0342.wav_00.npy\")\n",
|
||||||
|
"audio = torch.from_numpy(x.astype(np.float32))\n",
|
||||||
|
"\n",
|
||||||
|
"# audio, sampling_rate = load_wav_to_torch(\"D:\\\\audiodata\\\\aidatatang_200zh\\\\corpus\\\\train\\\\G1858\\\\T0055G1858S0342.wav\")\n",
|
||||||
|
"# audio_norm = audio / 32768.0\n",
|
||||||
|
"audio_norm = audio.unsqueeze(0)\n",
|
||||||
|
"spec = spectrogram(audio_norm, 1024, 256, 1024, center=False)\n",
|
||||||
|
"# spec = spec_to_mel()\n",
|
||||||
|
"spec = torch.squeeze(spec, 0)\n",
|
||||||
|
"mel = spec_to_mel(spec, 1024, 80, 16000, 0, None)\n",
|
||||||
|
"\n",
|
||||||
|
"fig = plt.figure(figsize=(10, 8))\n",
|
||||||
|
"ax2 = fig.add_subplot(211)\n",
|
||||||
|
"im = ax2.imshow(mel, interpolation=\"none\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"attachments": {},
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"情感聚类"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"\n",
|
||||||
|
"# from sklearn import metrics\n",
|
||||||
|
"# from sklearn.mixture import GaussianMixture # 高斯混合模型\n",
|
||||||
|
"import os\n",
|
||||||
|
"import numpy as np\n",
|
||||||
|
"import librosa\n",
|
||||||
|
"import IPython.display as ipd\n",
|
||||||
|
"from random import sample\n",
|
||||||
|
"\n",
|
||||||
|
"embs = []\n",
|
||||||
|
"wavnames = []\n",
|
||||||
|
"emo_root_path = \"D:\\\\audiodata\\\\SV2TTS\\\\synthesizer\\\\emo\\\\\"\n",
|
||||||
|
"wav_root_path = \"D:\\\\audiodata\\\\aidatatang_200zh\\\\corpus\\\\train\\\\\"\n",
|
||||||
|
"for idx, emo_fpath in enumerate(sample(os.listdir(emo_root_path), 10000)):\n",
|
||||||
|
" if emo_fpath.endswith(\".npy\") and emo_fpath.startswith(\"emo-T\"):\n",
|
||||||
|
" embs.append(np.expand_dims(np.load(emo_root_path + emo_fpath), axis=0))\n",
|
||||||
|
" wav_fpath = wav_root_path + emo_fpath[9:14] + \"\\\\\" + emo_fpath.split(\"_00\")[0][4:]\n",
|
||||||
|
" wavnames.append(wav_fpath)\n",
|
||||||
|
"print(len(embs))\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"x = np.concatenate(embs, axis=0)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# 聚类算法类的数量\n",
|
||||||
|
"n_clusters = 20\n",
|
||||||
|
"from sklearn.cluster import *\n",
|
||||||
|
"# model = KMeans(n_clusters=n_clusters, random_state=10)\n",
|
||||||
|
"# model = DBSCAN(eps=0.002, min_samples=2)\n",
|
||||||
|
"# 可以自行尝试各种不同的聚类算法\n",
|
||||||
|
"# model = Birch(n_clusters= n_clusters, threshold= 0.2)\n",
|
||||||
|
"# model = SpectralClustering(n_clusters=n_clusters)\n",
|
||||||
|
"model = AgglomerativeClustering(n_clusters= n_clusters)\n",
|
||||||
|
"import random\n",
|
||||||
|
"\n",
|
||||||
|
"y_predict = model.fit_predict(x)\n",
|
||||||
|
"\n",
|
||||||
|
"def disp(wavname):\n",
|
||||||
|
" wav, sr =librosa.load(wavname, 16000)\n",
|
||||||
|
" display(ipd.Audio(wav, rate=sr))\n",
|
||||||
|
"\n",
|
||||||
|
"classes=[[] for i in range(y_predict.max()+1)]\n",
|
||||||
|
"\n",
|
||||||
|
"for idx, wavname in enumerate(wavnames):\n",
|
||||||
|
" classes[y_predict[idx]].append(wavname)\n",
|
||||||
|
"\n",
|
||||||
|
"for i in range(y_predict.max()+1):\n",
|
||||||
|
" print(\"类别:\", i, \"本类中样本数量:\", len(classes[i]))\n",
|
||||||
|
" \"\"\"每一个类只预览2条音频\"\"\"\n",
|
||||||
|
" for j in range(2):\n",
|
||||||
|
" idx = random.randint(0, len(classes[i]) - 1)\n",
|
||||||
|
" print(classes[i][idx])\n",
|
||||||
|
" disp(classes[i][idx])"
|
||||||
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
|
|
Loading…
Reference in New Issue