diff --git a/README-CN.md b/README-CN.md
index 096ee5a..7871fd1 100644
--- a/README-CN.md
+++ b/README-CN.md
@@ -98,33 +98,33 @@
 d48ea37adf3660e657cfb047c10edbc
 
-## File Structure (intended for developers)
-```
-├─archived_untest_files  deprecated files
-├─encoder                encoder model
-│  ├─data_objects
-│  └─saved_models        pretrained models
-├─samples                sample audio
-├─synthesizer            synthesizer model
-│  ├─models
-│  ├─saved_models        pretrained models
-│  └─utils               utility libraries
-├─toolbox                graphical toolbox
-├─utils                  utility libraries
-├─vocoder                vocoder models (currently hifi-gan and wavernn)
-│  ├─hifigan
-│  ├─saved_models        pretrained models
-│  └─wavernn
-└─web
-   ├─api                 web API
-   ├─config              web configuration files
-   ├─static              front-end static scripts
-   │  └─js
-   ├─templates           front-end templates
-   └─__init__.py         web entry point
-```
+### 4. Bonus: Voice Conversion (PPG-based)
+Ever imagined Conan speaking in Kogoro Mouri's voice through his voice changer? Building on PPG-VC, this project now introduces two extra modules (PPG extractor + PPG2Mel) to enable voice conversion. (The documentation, especially the training part, is incomplete and is being filled in.)
+#### 4.0 Prepare the Environment
+* Make sure the environment described above is fully set up, then run `pip install -r requirements.txt` to install the remaining required packages.
+* Download the following models:
+  * a vocoder dedicated to the 24 kHz sample rate (hifigan), into *vocoder\saved_models\xxx*
+  * a pretrained PPG feature encoder (ppg_extractor), into *ppg_extractor\saved_models\xxx*
+  * a pretrained PPG2Mel model, into *ppg2mel\saved_models\xxx*
+
+#### 4.1 Train Your Own PPG2Mel Model on a Dataset (Optional)
+
+* Download the aidatatang_200zh dataset and extract it; make sure you can access all audio files (e.g. .wav) under the *train* folder.
+* Preprocess the audio and mel spectrograms:
+`python pre4ppg.py -d {dataset} -n {number}`
+Supported arguments:
+* `-d {dataset}` selects the dataset; only aidatatang_200zh is supported, and it is also the default.
+* `-n {number}` sets the number of parallel workers; with 8 workers on an Intel i7-11700K, preprocessing takes 12 to 18 hours! (optimization pending)
+> If the downloaded `aidatatang_200zh` is on drive D and the `train` folder is at `D:\data\aidatatang_200zh\corpus\train`, then your `datasets_root` is `D:\data\`.
+
+* Train the synthesizer; make sure `ppg2mel.yaml` was downloaded in the previous step, and edit the paths in it to point to the folders holding the pretrained models:
+`python ppg2mel_train.py --config .\ppg2mel\saved_models\ppg2mel.yaml --oneshotvc`
+* To resume a previous training run, pass `--load .\ppg2mel\saved_models\` to specify a pretrained model file.
+
+#### 4.2 Launch the Toolbox in VC Mode
+You can try the following command:
+`python demo_toolbox.py vc -d `
+> Specify a valid dataset path; supported datasets are loaded automatically for debugging, and the same path also serves as the storage directory for manually recorded audio.
 
 ## Citations and Papers
 > This repository was originally forked from the English-only [Real-Time-Voice-Cloning](https://github.com/CorentinJ/Real-Time-Voice-Cloning); thanks to its author.
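Taken together, the steps added above reduce to a short command sequence. The sketch below assumes the `D:\data\` dataset root from the note in 4.1 and 8 preprocessing workers; passing the dataset root as a positional argument to `pre4ppg.py` is an assumption here (check `python pre4ppg.py -h` first), while the other commands are quoted from the section itself:

```
# Preprocess audio and mel spectrograms (positional datasets_root is an assumption)
python pre4ppg.py D:\data -d aidatatang_200zh -n 8

# Train PPG2Mel after editing ppg2mel.yaml to point at the pretrained model folders
python ppg2mel_train.py --config .\ppg2mel\saved_models\ppg2mel.yaml --oneshotvc

# Launch the toolbox in VC mode against the same dataset root
python demo_toolbox.py vc -d D:\data
```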
diff --git a/train.py b/ppg2mel_train.py
similarity index 100%
rename from train.py
rename to ppg2mel_train.py
diff --git a/run.py b/run.py
deleted file mode 100644
index 170f9db..0000000
--- a/run.py
+++ /dev/null
@@ -1,142 +0,0 @@
-import time
-import os
-import argparse
-import torch
-import numpy as np
-import glob
-from pathlib import Path
-from tqdm import tqdm
-from ppg_extractor import load_model
-import librosa
-import soundfile as sf
-from utils.load_yaml import HpsYaml
-
-from encoder.audio import preprocess_wav
-from encoder import inference as speacker_encoder
-from vocoder.hifigan import inference as vocoder
-from ppg2mel import MelDecoderMOLv2
-from utils.f0_utils import compute_f0, f02lf0, compute_mean_std, get_converted_lf0uv
-
-
-def _build_ppg2mel_model(model_config, model_file, device):
-    ppg2mel_model = MelDecoderMOLv2(
-        **model_config["model"]
-    ).to(device)
-    ckpt = torch.load(model_file, map_location=device)
-    ppg2mel_model.load_state_dict(ckpt["model"])
-    ppg2mel_model.eval()
-    return ppg2mel_model
-
-
-@torch.no_grad()
-def convert(args):
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    output_dir = args.output_dir
-    os.makedirs(output_dir, exist_ok=True)
-
-    step = os.path.basename(args.ppg2mel_model_file)[:-4].split("_")[-1]
-
-    # Build models
-    print("Load PPG-model, PPG2Mel-model, Vocoder-model...")
-    ppg_model = load_model(
-        Path('./ppg_extractor/saved_models/24epoch.pt'),
-        device,
-    )
-    ppg2mel_model = _build_ppg2mel_model(HpsYaml(args.ppg2mel_model_train_config), args.ppg2mel_model_file, device)
-    # vocoder.load_model('./vocoder/saved_models/pretrained/g_hifigan.pt', "./vocoder/hifigan/config_16k_.json")
-    vocoder.load_model('./vocoder/saved_models/24k/g_02830000.pt')
-    # Data related
-    ref_wav_path = args.ref_wav_path
-    ref_wav = preprocess_wav(ref_wav_path)
-    ref_fid = os.path.basename(ref_wav_path)[:-4]
-
-    # TODO: specify encoder
-    speacker_encoder.load_model(Path("encoder/saved_models/pretrained_bak_5805000.pt"))
-    ref_spk_dvec = speacker_encoder.embed_utterance(ref_wav)
-    ref_spk_dvec = torch.from_numpy(ref_spk_dvec).unsqueeze(0).to(device)
-    ref_lf0_mean, ref_lf0_std = compute_mean_std(f02lf0(compute_f0(ref_wav)))
-
-    source_file_list = sorted(glob.glob(f"{args.wav_dir}/*.wav"))
-    print(f"Number of source utterances: {len(source_file_list)}.")
-
-    total_rtf = 0.0
-    cnt = 0
-    for src_wav_path in tqdm(source_file_list):
-        # Load the audio to a numpy array:
-        src_wav, _ = librosa.load(src_wav_path, sr=16000)
-        src_wav_tensor = torch.from_numpy(src_wav).unsqueeze(0).float().to(device)
-        src_wav_lengths = torch.LongTensor([len(src_wav)]).to(device)
-        ppg = ppg_model(src_wav_tensor, src_wav_lengths)
-
-        lf0_uv = get_converted_lf0uv(src_wav, ref_lf0_mean, ref_lf0_std, convert=True)
-        min_len = min(ppg.shape[1], len(lf0_uv))
-
-        ppg = ppg[:, :min_len]
-        lf0_uv = lf0_uv[:min_len]
-
-        start = time.time()
-        _, mel_pred, att_ws = ppg2mel_model.inference(
-            ppg,
-            logf0_uv=torch.from_numpy(lf0_uv).unsqueeze(0).float().to(device),
-            spembs=ref_spk_dvec,
-        )
-        src_fid = os.path.basename(src_wav_path)[:-4]
-        wav_fname = f"{output_dir}/vc_{src_fid}_ref_{ref_fid}_step{step}.wav"
-        mel_len = mel_pred.shape[0]
-        rtf = (time.time() - start) / (0.01 * mel_len)
-        total_rtf += rtf
-        cnt += 1
-        # continue
-        mel_pred = mel_pred.transpose(0, 1)
-        y, output_sample_rate = vocoder.infer_waveform(mel_pred.cpu())
-        sf.write(wav_fname, y.squeeze(), output_sample_rate, "PCM_16")
-
-    print("RTF:")
-    print(total_rtf / cnt)
-
-
-def get_parser():
-    parser = argparse.ArgumentParser(description="Conversion from wave input")
-    parser.add_argument(
-        "--wav_dir",
-        type=str,
-        default=None,
-        required=True,
-        help="Source wave directory.",
-    )
-    parser.add_argument(
-        "--ref_wav_path",
-        type=str,
-        required=True,
-        help="Reference wave file path.",
-    )
-    parser.add_argument(
-        "--ppg2mel_model_train_config", "-c",
-        type=str,
-        default=None,
-        required=True,
-        help="Training config file (yaml file)",
-    )
-    parser.add_argument(
-        "--ppg2mel_model_file", "-m",
-        type=str,
-        default=None,
-        required=True,
-        help="ppg2mel model checkpoint file path"
-    )
-    parser.add_argument(
-        "--output_dir", "-o",
-        type=str,
-        default="vc_gens_vctk_oneshot",
-        help="Output folder to save the converted wave."
-    )
-
-    return parser
-
-def main():
-    parser = get_parser()
-    args = parser.parse_args()
-    convert(args)
-
-if __name__ == "__main__":
-    main()
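For reference, the deleted `run.py` was a standalone conversion CLI. Based on its argparse definitions above, an invocation would have looked roughly like the sketch below; all paths and the checkpoint file name are illustrative (the script derived its `step` tag from the text after the last underscore of the checkpoint name):

```
python run.py --wav_dir ./source_wavs \
              --ref_wav_path ./ref_speaker.wav \
              -c ./ppg2mel/saved_models/ppg2mel.yaml \
              -m ./ppg2mel/saved_models/ppg2mel_step_300000.pth \
              -o vc_gens_vctk_oneshot
```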