{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# VITS 情感语音合成 (VITS emotional text-to-speech)\n",
    "\n",
    "Four independent stages: inference, preprocessing, training, and filtering the training metadata.\n",
    "Run the cells of the stage you need from top to bottom on a fresh kernel.\n",
    "\n",
    "- Relative paths (`data/ckpt/...`, `../audiodata/...`) are resolved against the kernel's working directory.\n",
    "- Two absolute Windows paths from the original author's machine remain in the notebook and are marked where they occur; replace them before running."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "import random\n",
    "import re\n",
    "\n",
    "import IPython.display as ipd\n",
    "import numpy as np\n",
    "import torch\n",
    "from pypinyin import lazy_pinyin, BOPOMOFO\n",
    "\n",
    "from models.synthesizer.models.vits import Vits\n",
    "from utils.hparams import load_hparams_json\n",
    "from utils.util import intersperse"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Inference configuration. Paths are resolved against the kernel's working directory.\n",
    "VITS_CONFIG_PATH = 'data/ckpt/synthesizer/vits/config.json'\n",
    "VITS_CHECKPOINT_PATH = 'data/ckpt/synthesizer/vits/G_208000.pth'\n",
    "\n",
    "# Sample rate requested from librosa when loading the emotion reference audio.\n",
    "EMOTION_SAMPLE_RATE = 16000\n",
    "\n",
    "# Root directory for randomly picking an emotion reference audio file.\n",
    "# Machine-specific absolute path; no cell in this notebook reads it.\n",
    "random_emotion_root = r'D:\\audiodata\\aidatatang_200zh\\corpus\\train\\G0017'"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 文本前端 (Text frontend)\n",
    "\n",
    "Chinese text is converted to bopomofo symbols and then to the integer IDs the model consumes.\n",
    "Characters that are not in `symbols` are dropped by `text_to_sequence`.\n",
    "Digits are among them, because the `cn2an`-based number conversion is disabled to avoid that dependency."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# chinese_cleaners\n",
    "_pad = '_'\n",
    "_punctuation = ',。!?—…'\n",
    "_letters = 'ㄅㄆㄇㄈㄉㄊㄋㄌㄍㄎㄏㄐㄑㄒㄓㄔㄕㄖㄗㄘㄙㄚㄛㄜㄝㄞㄟㄠㄡㄢㄣㄤㄥㄦㄧㄨㄩˉˊˇˋ˙ '\n",
    "# Export all symbols:\n",
    "symbols = [_pad] + list(_punctuation) + list(_letters)\n",
    "\n",
    "# Position of each symbol in `symbols`; this index is the ID fed to the model.\n",
    "_symbol_to_id = {s: i for i, s in enumerate(symbols)}\n",
    "\n",
    "# Matches a character in the U+4E00..U+9FFF range (the characters sent to pypinyin).\n",
    "_CJK_CHAR = re.compile('[\\u4e00-\\u9fff]')\n",
    "# Matches a syllable whose last character is a bopomofo letter, i.e. one without a tone mark.\n",
    "_TRAILING_BOPOMOFO_LETTER = re.compile(r'([\\u3105-\\u3129])$')\n",
    "\n",
    "\n",
    "def chinese_to_bopomofo(text, taiwanese=False):\n",
    "    '''Convert the Chinese characters of ``text`` to bopomofo.\n",
    "\n",
    "    The separators '、', ';' and ':' are first replaced by ','. The text is then\n",
    "    processed one character at a time: characters in U+4E00..U+9FFF are\n",
    "    transcribed with ``lazy_pinyin(..., BOPOMOFO)``, and every other character\n",
    "    is copied unchanged. A syllable that ends in a bopomofo letter gets 'ˉ'\n",
    "    appended. A space is inserted before each transcribed character unless it\n",
    "    is the first thing in the output.\n",
    "\n",
    "    Args:\n",
    "        text: input string.\n",
    "        taiwanese: when true, each syllable is prefixed with '#'.\n",
    "    Returns:\n",
    "        The converted string.\n",
    "    '''\n",
    "    text = text.replace('、', ',').replace(';', ',').replace(':', ',')\n",
    "    # Build the result in its own buffer. The previous version appended to\n",
    "    # `text` itself, so the untranslated input came back in front of the\n",
    "    # bopomofo: punctuation was emitted twice and Latin letters converted twice.\n",
    "    pieces = []\n",
    "    for char in text:\n",
    "        if not _CJK_CHAR.search(char):\n",
    "            pieces.append(char)\n",
    "            continue\n",
    "        syllables = [\n",
    "            _TRAILING_BOPOMOFO_LETTER.sub(r'\\1ˉ', syllable)\n",
    "            for syllable in lazy_pinyin(char, BOPOMOFO)\n",
    "        ]\n",
    "        if pieces:\n",
    "            pieces.append(' ')\n",
    "        if taiwanese:\n",
    "            pieces.append('#' + '#'.join(syllables))\n",
    "        else:\n",
    "            pieces.append(''.join(syllables))\n",
    "    return ''.join(pieces)\n",
    "\n",
    "\n",
    "# (case-insensitive pattern, bopomofo spelling) for each Latin letter.\n",
    "_latin_to_bopomofo = [\n",
    "    (re.compile(letter, re.IGNORECASE), spelling)\n",
    "    for letter, spelling in [\n",
    "        ('a', 'ㄟˉ'),\n",
    "        ('b', 'ㄅㄧˋ'),\n",
    "        ('c', 'ㄙㄧˉ'),\n",
    "        ('d', 'ㄉㄧˋ'),\n",
    "        ('e', 'ㄧˋ'),\n",
    "        ('f', 'ㄝˊㄈㄨˋ'),\n",
    "        ('g', 'ㄐㄧˋ'),\n",
    "        ('h', 'ㄝˇㄑㄩˋ'),\n",
    "        ('i', 'ㄞˋ'),\n",
    "        ('j', 'ㄐㄟˋ'),\n",
    "        ('k', 'ㄎㄟˋ'),\n",
    "        ('l', 'ㄝˊㄛˋ'),\n",
    "        ('m', 'ㄝˊㄇㄨˋ'),\n",
    "        ('n', 'ㄣˉ'),\n",
    "        ('o', 'ㄡˉ'),\n",
    "        ('p', 'ㄆㄧˉ'),\n",
    "        ('q', 'ㄎㄧㄡˉ'),\n",
    "        ('r', 'ㄚˋ'),\n",
    "        ('s', 'ㄝˊㄙˋ'),\n",
    "        ('t', 'ㄊㄧˋ'),\n",
    "        ('u', 'ㄧㄡˉ'),\n",
    "        ('v', 'ㄨㄧˉ'),\n",
    "        ('w', 'ㄉㄚˋㄅㄨˋㄌㄧㄡˋ'),\n",
    "        ('x', 'ㄝˉㄎㄨˋㄙˋ'),\n",
    "        ('y', 'ㄨㄞˋ'),\n",
    "        ('z', 'ㄗㄟˋ'),\n",
    "    ]\n",
    "]\n",
    "\n",
    "\n",
    "def latin_to_bopomofo(text):\n",
    "    '''Replace every Latin letter in ``text`` (either case) by its bopomofo spelling.'''\n",
    "    for pattern, spelling in _latin_to_bopomofo:\n",
    "        text = pattern.sub(spelling, text)\n",
    "    return text\n",
    "\n",
    "\n",
    "#TODO: add cleaner to support multilang\n",
    "def chinese_cleaners(text, cleaner_names):\n",
    "    '''Pipeline for Chinese text.\n",
    "\n",
    "    Runs ``chinese_to_bopomofo`` and ``latin_to_bopomofo``, then appends '。' when\n",
    "    the result ends with a tone mark. ``cleaner_names`` is accepted for\n",
    "    signature compatibility and is not used.\n",
    "    '''\n",
    "    text = chinese_to_bopomofo(text)\n",
    "    text = latin_to_bopomofo(text)\n",
    "    # `text` may be empty here; indexing text[-1] unguarded raised IndexError.\n",
    "    if text and re.match('[ˉˊˇˋ˙]', text[-1]):\n",
    "        text += '。'\n",
    "    return text\n",
    "\n",
    "\n",
    "def text_to_sequence(text, cleaner_names):\n",
    "    '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.\n",
    "\n",
    "    Args:\n",
    "        text: string to convert to a sequence\n",
    "        cleaner_names: forwarded to ``chinese_cleaners``\n",
    "    Returns:\n",
    "        List of integers corresponding to the symbols in the text; symbols\n",
    "        that are not in ``symbols`` are skipped.\n",
    "    '''\n",
    "    clean_text = chinese_cleaners(text, cleaner_names)\n",
    "    return [_symbol_to_id[symbol] for symbol in clean_text if symbol in _symbol_to_id]"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 加载模型 (Load the model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "hps = load_hparams_json(VITS_CONFIG_PATH)\n",
    "print(hps.train)\n",
    "\n",
    "model = Vits(\n",
    "    len(symbols),\n",
    "    hps['data']['filter_length'] // 2 + 1,\n",
    "    hps['train']['segment_size'] // hps['data']['hop_length'],\n",
    "    n_speakers=hps['data']['n_speakers'],\n",
    "    stop_threshold=0.5,\n",
    "    **hps['model'])\n",
    "_ = model.eval()\n",
    "\n",
    "device = torch.device('cpu')\n",
    "model.load(VITS_CHECKPOINT_PATH, device)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def tts(txt, emotion, sid=0):\n",
    "    '''Synthesize ``txt`` with the loaded model and display a playable audio widget.\n",
    "\n",
    "    Args:\n",
    "        txt: text to synthesize.\n",
    "        emotion: path of the emotion reference recording; must end in 'wav'.\n",
    "        sid: speaker id passed to ``model.infer``.\n",
    "    Raises:\n",
    "        ValueError: if ``emotion`` does not end in 'wav'. The previous version\n",
    "            only printed a message and then failed with UnboundLocalError.\n",
    "    '''\n",
    "    if not emotion.endswith('wav'):\n",
    "        raise ValueError(f'emotion参数不正确 (expected a .wav file path, got {emotion!r})')\n",
    "\n",
    "    # Kept as function-local imports, as in the original cell.\n",
    "    import librosa\n",
    "    from models.synthesizer.preprocess_audio import extract_emo\n",
    "\n",
    "    text_norm = text_to_sequence(txt, hps['data']['text_cleaners'])\n",
    "    if hps['data']['add_blank']:\n",
    "        text_norm = intersperse(text_norm, 0)\n",
    "    stn_tst = torch.LongTensor(text_norm)\n",
    "\n",
    "    with torch.no_grad():  # inference mode\n",
    "        x_tst = stn_tst.unsqueeze(0)\n",
    "        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])\n",
    "        # Separate name so the `sid` argument is not shadowed by the tensor.\n",
    "        speaker_id = torch.LongTensor([sid])\n",
    "        # `sr` is passed by keyword: newer librosa releases reject it positionally.\n",
    "        wav, sr = librosa.load(emotion, sr=EMOTION_SAMPLE_RATE)\n",
    "        emo = torch.FloatTensor(extract_emo(np.expand_dims(wav, 0), sr, embeddings=True))\n",
    "        audio = model.infer(\n",
    "            x_tst, x_tst_lengths, sid=speaker_id,\n",
    "            noise_scale=0.667, noise_scale_w=0.8, length_scale=1, emo=emo,\n",
    "        )[0][0, 0].data.float().numpy()\n",
    "\n",
    "    ipd.display(ipd.Audio(audio, rate=hps['data']['sampling_rate'], normalize=False))"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 推理 (Inference)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "txt = '随机抽取的音频文件路径可以用于使用该情感合成其他句子'\n",
    "# Absolute path on the original author's machine; point it at your own reference .wav.\n",
    "emotion_reference_wav = r'C:\\Users\\babys\\Desktop\\voicecollection\\secondround\\美玉.wav'\n",
    "tts(txt, emotion=emotion_reference_wav, sid=0)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 预处理 (Preprocessing)\n",
    "\n",
    "**NOTE(review):** the last recorded run of this cell appears to have ended with a kernel crash right after starting on `magicdata` (0/1018). The cause is not visible from the notebook; check the Jupyter log."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pathlib import Path\n",
    "\n",
    "from models.synthesizer.preprocess import preprocess_dataset\n",
    "from utils.hparams import HParams\n",
    "\n",
    "datasets_root = Path('../audiodata/')\n",
    "hparams = HParams(\n",
    "    sample_rate = 16000,\n",
    "    rescale = True,\n",
    "    max_mel_frames = 900,\n",
    "    rescaling_max = 0.9,\n",
    "\n",
    "    utterance_min_duration = 1.6,  # Duration in seconds below which utterances are discarded\n",
    "    ### Audio processing options\n",
    "    fmax = 7600,  # Should not exceed (sample_rate // 2)\n",
    "    allow_clipping_in_normalization = True,  # Used when signal_normalization = True\n",
    "    clip_mels_length = True,  # If true, discards samples exceeding max_mel_frames\n",
    "    use_lws = False,  # \"Fast spectrogram phase recovery using local weighted sums\"\n",
    "    symmetric_mels = True,  # Sets mel range to [-max_abs_value, max_abs_value] if True,\n",
    "                            # and [0, max_abs_value] if False\n",
    "    trim_silence = True,  # Use with sample_rate of 16000 for best results\n",
    ")\n",
    "\n",
    "preprocess_dataset(\n",
    "    datasets_root=datasets_root,\n",
    "    out_dir=datasets_root.joinpath('SV2TTS', 'synthesizer'),\n",
    "    n_processes=8,\n",
    "    skip_existing=True,\n",
    "    hparams=hparams,\n",
    "    no_alignments=False,\n",
    "    dataset='magicdata',\n",
    "    emotion_extract=True,\n",
    ")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 训练 (Training)\n",
    "\n",
    "**Known issue:** the last recorded run of this cell failed inside `models/synthesizer/train_vits.py` (line 123, `net_g = Vits(`) with `TypeError: __init__() missing 1 required positional argument: 'stop_threshold'`. The inference section of this notebook passes `stop_threshold=0.5` when it builds `Vits`. The fix belongs in `train_vits.py`; this notebook cannot work around it.\n",
    "\n",
    "The cell reads `train.txt`. If you want to train on the filtered list produced by the last section, run that section first and switch the file name to `train2.txt`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from pathlib import Path\n",
    "\n",
    "import torch\n",
    "import torch.multiprocessing as mp\n",
    "\n",
    "from models.synthesizer.train_vits import run\n",
    "from utils.hparams import HParams\n",
    "\n",
    "datasets_root = Path('../audiodata/SV2TTS/synthesizer')\n",
    "hparams = HParams(\n",
    "    model_dir = 'data/ckpt/synthesizer/vits',\n",
    ")\n",
    "hparams.loadJson(Path(hparams.model_dir).joinpath('config.json'))\n",
    "hparams.data['training_files'] = str(datasets_root.joinpath('train.txt'))\n",
    "# NOTE(review): validation uses the same list as training, as in the original cell; confirm this is intended.\n",
    "hparams.data['validation_files'] = str(datasets_root.joinpath('train.txt'))\n",
    "hparams.data['datasets_root'] = str(datasets_root)\n",
    "\n",
    "n_gpus = torch.cuda.device_count()\n",
    "if n_gpus < 1:\n",
    "    # With nprocs=0, mp.spawn starts no worker, so the cell would finish without training anything.\n",
    "    raise RuntimeError('No CUDA device is visible; this cell starts one training process per GPU.')\n",
    "\n",
    "# for spawn\n",
    "os.environ['MASTER_ADDR'] = 'localhost'\n",
    "os.environ['MASTER_PORT'] = '8899'\n",
    "mp.spawn(run, nprocs=n_gpus, args=(n_gpus, hparams))"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 挑选只有对应emo文件的meta数据 (Keep only the metadata rows that have a matching emo file)\n",
    "\n",
    "Reads `train.txt` and writes the rows whose emo file exists under `emo/` to `train2.txt`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pathlib import Path\n",
    "import os\n",
    "\n",
    "root = Path('../audiodata/SV2TTS/synthesizer')\n",
    "\n",
    "dict_info = []\n",
    "with open(root.joinpath('train.txt'), 'r', encoding='utf-8') as dict_meta:\n",
    "    for raw in dict_meta:\n",
    "        # Lines read from a file keep their newline, so a blank line is never\n",
    "        # falsy; test the stripped text instead.\n",
    "        if not raw.strip():\n",
    "            continue\n",
    "        # The first '|' field with 'audio' replaced by 'emo' is the emo file name.\n",
    "        v = raw.split('|')[0].replace('audio', 'emo')\n",
    "        emo_fpath = root.joinpath('emo').joinpath(v)\n",
    "        if emo_fpath.exists():\n",
    "            dict_info.append(raw)\n",
    "\n",
    "# Rows are written back unchanged (each still carries its own newline).\n",
    "meta2 = Path('../audiodata/SV2TTS/synthesizer/train2.txt')\n",
    "with meta2.open('w', encoding='utf-8') as metadata_file:\n",
    "    metadata_file.writelines(dict_info)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "mo",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  },
  "vscode": {
   "interpreter": {
    "hash": "788ab866da3baa6c99886d56abb59fe71b6a552bf52c65473ecf96c784704db8"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}