In [1]:
from utils.hparams import load_hparams_json
from utils.util import intersperse
import json
from models.synthesizer.models.vits import Vits
import torch
import numpy as np
import IPython.display as ipd

# Symbol inventory used by the chinese_cleaners pipeline.
_pad = '_'
_punctuation = ',。!?—…'
_letters = 'ㄅㄆㄇㄈㄉㄊㄋㄌㄍㄎㄏㄐㄑㄒㄓㄔㄕㄖㄗㄘㄙㄚㄛㄜㄝㄞㄟㄠㄡㄢㄣㄤㄥㄦㄧㄨㄩˉˊˇˋ˙ '
# Export all symbols: the padding token first, then every punctuation mark,
# then every letter, tone mark and the space, one list entry per character.
symbols = [_pad, *_punctuation, *_letters]

# Load the hyper-parameters stored next to the VITS checkpoint and show the
# training section for reference.
hps = load_hparams_json("data/ckpt/synthesizer/vits/config.json")
print(hps.train)
# Build the synthesizer from the symbol-table size and values derived from the
# config. The second and third positional arguments are presumably the number
# of spectrogram bins and the training segment length in frames — TODO confirm
# against the Vits constructor signature.
model = Vits(
    len(symbols),
    hps["data"]["filter_length"] // 2 + 1,
    hps["train"]["segment_size"] // hps["data"]["hop_length"],
    n_speakers=hps["data"]["n_speakers"],
    stop_threshold=0.5,
    **hps["model"])
# Presumably switches the network to evaluation mode (torch convention);
# the return value is bound to `_` and otherwise ignored.
_ = model.eval()
# Load the checkpoint file, passing the CPU device to the loader.
device = torch.device("cpu")
model.load("data/ckpt/synthesizer/vits/G_208000.pth", device)

# Root directory from which emotion reference audio is randomly sampled.
# NOTE(review): machine-specific absolute Windows path; it is not referenced
# by any other code visible in this notebook — confirm it is still needed.
random_emotion_root = "D:\\audiodata\\aidatatang_200zh\\corpus\\train\\G0017"
import random, re
# import cn2an # remove dependency before production
from pypinyin import lazy_pinyin, BOPOMOFO

# Lookup table from each symbol to its integer ID (its position in `symbols`).
_symbol_to_id = dict(zip(symbols, range(len(symbols))))

# def number_to_chinese(text):
# numbers = re.findall(r'\d+(?:\.?\d+)?', text)
# for number in numbers:
# text = text.replace(number, cn2an.an2cn(number), 1)
# return text

def chinese_to_bopomofo(text, taiwanese=False):
    '''Convert the Chinese characters of a string to bopomofo, one character at a time.

    Args:
      text: input string. The separators '、', ';' and ':' are replaced by ','
        before conversion.
      taiwanese: when True, every syllable is prefixed with '#'.
    Returns:
      A new string in which each character in the range U+4E00-U+9FFF has been
      replaced by its bopomofo reading (separated from the preceding output by
      a space) and every other character is copied through unchanged.
    '''
    text = text.replace('、', ',').replace(';', ',').replace(':', ',')
    # Accumulate into a separate buffer. Appending to `text` itself left the
    # whole unconverted input at the front of the result, so its punctuation
    # and spaces were emitted twice once the text was mapped to symbol IDs.
    converted = ''
    for word in text:
        if not re.search('[\u4e00-\u9fff]', word):
            # Not a Chinese character: keep it verbatim, no pinyin lookup needed.
            converted += word
            continue
        # A syllable that ends in a bopomofo letter carries no tone mark;
        # append 'ˉ' so every syllable ends with an explicit tone symbol.
        bopomofos = [
            re.sub(r'([\u3105-\u3129])$', r'\1ˉ', syllable)
            for syllable in lazy_pinyin(word, BOPOMOFO)
        ]
        if converted != '':
            converted += ' '
        if taiwanese:
            converted += '#' + '#'.join(bopomofos)
        else:
            converted += ''.join(bopomofos)
    return converted

# Spelling table for Latin letters: each entry pairs a case-insensitive
# pattern for one letter with the bopomofo text that replaces it.
_latin_to_bopomofo = [
    (re.compile(letter, re.IGNORECASE), reading)
    for letter, reading in (
        ('a', 'ㄟˉ'),
        ('b', 'ㄅㄧˋ'),
        ('c', 'ㄙㄧˉ'),
        ('d', 'ㄉㄧˋ'),
        ('e', 'ㄧˋ'),
        ('f', 'ㄝˊㄈㄨˋ'),
        ('g', 'ㄐㄧˋ'),
        ('h', 'ㄝˇㄑㄩˋ'),
        ('i', 'ㄞˋ'),
        ('j', 'ㄐㄟˋ'),
        ('k', 'ㄎㄟˋ'),
        ('l', 'ㄝˊㄛˋ'),
        ('m', 'ㄝˊㄇㄨˋ'),
        ('n', 'ㄣˉ'),
        ('o', 'ㄡˉ'),
        ('p', 'ㄆㄧˉ'),
        ('q', 'ㄎㄧㄡˉ'),
        ('r', 'ㄚˋ'),
        ('s', 'ㄝˊㄙˋ'),
        ('t', 'ㄊㄧˋ'),
        ('u', 'ㄧㄡˉ'),
        ('v', 'ㄨㄧˉ'),
        ('w', 'ㄉㄚˋㄅㄨˋㄌㄧㄡˋ'),
        ('x', 'ㄝˉㄎㄨˋㄙˋ'),
        ('y', 'ㄨㄞˋ'),
        ('z', 'ㄗㄟˋ'),
    )
]


def latin_to_bopomofo(text):
    '''Replace every Latin letter in `text` (either case) with its bopomofo spelling.

    The table is applied letter by letter, in order; characters that are not
    Latin letters are left untouched.
    '''
    for pattern, reading in _latin_to_bopomofo:
        text = pattern.sub(reading, text)
    return text

#TODO: add cleaner to support multilang
def chinese_cleaners(text, cleaner_names):
    '''Pipeline for Chinese text.

    Converts Chinese characters and then Latin letters to bopomofo, and
    appends a full stop ('。') when the result ends with a tone mark.

    Args:
      text: raw input text.
      cleaner_names: accepted for interface compatibility; not used here.
    Returns:
      The cleaned string. An input that cleans to the empty string is
      returned as '' instead of raising.
    '''
    # text = number_to_chinese(text)
    text = chinese_to_bopomofo(text)
    text = latin_to_bopomofo(text)
    # Check for emptiness first: text[-1] on '' raises IndexError.
    if text and re.match('[ˉˊˇˋ˙]', text[-1]):
        text += '。'
    return text


def text_to_sequence(text, cleaner_names):
    '''Map a string of text to the list of IDs of the symbols it contains.

    Args:
      text: string to convert to a sequence
      cleaner_names: names of the cleaner functions, forwarded to chinese_cleaners
    Returns:
      List of integers, one per cleaned character that has an entry in the
      symbol table
    '''
    cleaned = chinese_cleaners(text, cleaner_names)
    # Characters without an entry in the symbol table are skipped silently.
    return [_symbol_to_id[char] for char in cleaned if char in _symbol_to_id]

import os

def tts(txt, emotion, sid=0):
    '''Synthesize `txt` with the loaded model and play the audio inline.

    Args:
      txt: text to synthesize.
      emotion: path of a wav file used as the emotion reference; the extension
        check is case-insensitive.
      sid: speaker ID handed to the model (default 0).
    Raises:
      ValueError: if `emotion` does not end with "wav".
    '''
    # Validate up front. Previously an unsupported value only printed a
    # message and then crashed at the model call because `emo` was never set.
    if not str(emotion).lower().endswith("wav"):
        raise ValueError("emotion参数不正确")

    from models.synthesizer.preprocess_audio import extract_emo
    import librosa

    text_norm = text_to_sequence(txt, hps["data"]["text_cleaners"])
    if hps["data"]["add_blank"]:
        text_norm = intersperse(text_norm, 0)
    stn_tst = torch.LongTensor(text_norm)

    with torch.no_grad():  # inference mode
        x_tst = stn_tst.unsqueeze(0)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
        speaker = torch.LongTensor([sid])
        # Pass the sample rate by keyword: newer librosa releases no longer
        # accept it positionally.
        wav, sr = librosa.load(emotion, sr=16000)
        emo = torch.FloatTensor(extract_emo(np.expand_dims(wav, 0), sr, embeddings=True))

        audio = model.infer(
            x_tst,
            x_tst_lengths,
            sid=speaker,
            noise_scale=0.667,
            noise_scale_w=0.8,
            length_scale=1,
            emo=emo,
        )[0][0, 0].data.float().numpy()
    ipd.display(ipd.Audio(audio, rate=hps["data"]["sampling_rate"], normalize=False))




{'log_interval': 2000, 'eval_interval': 4000, 'seed': 1234, 'epochs': 10000, 'learning_rate': 0.0001, 'betas': [0.8, 0.99], 'eps': 1e-09, 'batch_size': 16, 'fp16_run': True, 'lr_decay': 0.5, 'segment_size': 8192, 'init_lr_ratio': 1, 'warmup_epochs': 0, 'c_mel': 45, 'c_kl': 1.0}
Trainable Parameters: 0.000M


推理:

In [2]:
# Sentence to synthesize.
txt = "随机抽取的音频文件路径可以用于使用该情感合成其他句子"
# NOTE(review): the emotion reference is a machine-specific absolute path;
# point it at a wav file that exists locally before running this cell.
tts(txt, emotion='C:\\Users\\babys\\Desktop\\voicecollection\\secondround\\美玉.wav', sid=0)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


预处理:

In [2]:
from models.synthesizer.preprocess import preprocess_dataset
from pathlib import Path
from utils.hparams import HParams
# Root folder of the raw datasets; the preprocessed output is written beneath it.
datasets_root = Path("../audiodata/")
# Audio and feature settings handed to preprocess_dataset below.
hparams=HParams(
    sample_rate = 16000,
    rescale = True,
    max_mel_frames = 900,
    rescaling_max = 0.9,

    utterance_min_duration = 1.6, # Duration in seconds below which utterances are discarded
    ### Audio processing options
    fmax = 7600, # Should not exceed (sample_rate // 2)
    allow_clipping_in_normalization = True, # Used when signal_normalization = True
    clip_mels_length = True, # If true, discards samples exceeding max_mel_frames
    use_lws = False, # "Fast spectrogram phase recovery using local weighted sums"
    symmetric_mels = True, # Sets mel range to [-max_abs_value, max_abs_value] if True,
    # and [0, max_abs_value] if False
    trim_silence = True, # Use with sample_rate of 16000 for best results

)
# Run preprocessing for the "magicdata" dataset with 8 processes, writing to
# <datasets_root>/SV2TTS/synthesizer. skip_existing=True presumably leaves
# already-processed items untouched, and emotion_extract=True presumably also
# produces the emotion features — TODO confirm in preprocess_dataset.
preprocess_dataset(datasets_root=datasets_root,
    out_dir=datasets_root.joinpath("SV2TTS", "synthesizer"),
    n_processes=8,
    skip_existing=True,
    hparams=hparams,
    no_alignments=False,
    dataset="magicdata",
    emotion_extract=True)

Using data from:
 ..\audiodata\magicdata\train


magicdata: 0%| | 0/1018 [00:00here for more info. View Jupyter log for further details.

训练:

In [6]:
from models.synthesizer.train_vits import run
from pathlib import Path
from utils.hparams import HParams
import torch, os
import torch.multiprocessing as mp

# Folder holding the preprocessed synthesizer data (metadata list and features).
datasets_root = Path("../audiodata/SV2TTS/synthesizer")
hparams = HParams(
    model_dir="data/ckpt/synthesizer/vits",
)
# Merge the checkpoint's config.json into the hyper-parameters, then point the
# data section at the local dataset.
hparams.loadJson(Path(hparams.model_dir).joinpath("config.json"))
hparams.data["training_files"] = str(datasets_root.joinpath("train.txt"))
# NOTE(review): validation uses the same list as training — confirm this is intended.
hparams.data["validation_files"] = str(datasets_root.joinpath("train.txt"))
hparams.data["datasets_root"] = str(datasets_root)

n_gpus = torch.cuda.device_count()
# One worker is spawned per GPU. With n_gpus == 0, mp.spawn would start no
# worker at all and the cell would finish without training anything, so fail
# loudly instead.
if n_gpus < 1:
    raise RuntimeError(
        "No CUDA device found: at least one GPU is required to spawn training workers."
    )
# for spawn
os.environ['MASTER_ADDR'] = 'localhost'
os.environ['MASTER_PORT'] = '8899'
mp.spawn(run, nprocs=n_gpus, args=(n_gpus, hparams))

\Loading the json with %s
 data\ckpt\synthesizer\vits\config.json


ProcessRaisedException: 

-- Process 0 terminated with the following error:
Traceback (most recent call last):
 File "d:\Users\babys\Anaconda3\envs\mo\lib\site-packages\torch\multiprocessing\spawn.py", line 59, in _wrap
 fn(i, *args)
 File "d:\Real-Time-Voice-Cloning-Chinese\models\synthesizer\train_vits.py", line 123, in run
 net_g = Vits(
TypeError: __init__() missing 1 required positional argument: 'stop_threshold'


: 

只保留有对应 emo 文件的 meta 数据

In [8]:
from pathlib import Path
import os
# Keep only the metadata lines whose emotion-feature file exists, and write
# them to train2.txt next to the original train.txt.
root = Path('../audiodata/SV2TTS/synthesizer')
dict_info = []
with open(root.joinpath("train.txt"), "r", encoding="utf-8") as dict_meta:
    for raw in dict_meta:
        # Lines read from a file keep their trailing newline, so a blank line
        # is "\n", not "": strip before testing for emptiness.
        if not raw.strip():
            continue
        # The first '|'-separated field is the audio file name; the emo file
        # uses the same name with "audio" replaced by "emo".
        v = raw.split("|")[0].replace("audio", "emo")
        emo_fpath = root.joinpath("emo").joinpath(v)
        if emo_fpath.exists():
            dict_info.append(raw)

# Write the retained lines unchanged; the context manager closes the file
# even if a write fails.
meta2 = root.joinpath("train2.txt")
with meta2.open("w", encoding="utf-8") as metadata_file:
    metadata_file.writelines(dict_info)