commit
9d67b757f0
|
@ -14,9 +14,9 @@
|
|||
*.bcf
|
||||
*.toc
|
||||
*.sh
|
||||
*/saved_models
|
||||
!vocoder/saved_models/pretrained/**
|
||||
!encoder/saved_models/pretrained.pt
|
||||
data/ckpt
|
||||
!data/ckpt/vocoder/pretrained/**
|
||||
!data/ckpt/encoder/pretrained.pt
|
||||
wavs
|
||||
log
|
||||
!/docker-entrypoint.sh
|
||||
|
|
|
@ -15,7 +15,8 @@
|
|||
"name": "Python: Vocoder Preprocess",
|
||||
"type": "python",
|
||||
"request": "launch",
|
||||
"program": "vocoder_preprocess.py",
|
||||
"program": "control\\cli\\vocoder_preprocess.py",
|
||||
"cwd": "${workspaceFolder}",
|
||||
"console": "integratedTerminal",
|
||||
"args": ["..\\audiodata"]
|
||||
},
|
||||
|
@ -23,7 +24,8 @@
|
|||
"name": "Python: Vocoder Train",
|
||||
"type": "python",
|
||||
"request": "launch",
|
||||
"program": "vocoder_train.py",
|
||||
"program": "control\\cli\\vocoder_train.py",
|
||||
"cwd": "${workspaceFolder}",
|
||||
"console": "integratedTerminal",
|
||||
"args": ["dev", "..\\audiodata"]
|
||||
},
|
||||
|
@ -32,6 +34,7 @@
|
|||
"type": "python",
|
||||
"request": "launch",
|
||||
"program": "demo_toolbox.py",
|
||||
"cwd": "${workspaceFolder}",
|
||||
"console": "integratedTerminal",
|
||||
"args": ["-d","..\\audiodata"]
|
||||
},
|
||||
|
@ -40,6 +43,7 @@
|
|||
"type": "python",
|
||||
"request": "launch",
|
||||
"program": "demo_toolbox.py",
|
||||
"cwd": "${workspaceFolder}",
|
||||
"console": "integratedTerminal",
|
||||
"args": ["-d","..\\audiodata","-vc"]
|
||||
},
|
||||
|
@ -47,9 +51,9 @@
|
|||
"name": "Python: Synth Train",
|
||||
"type": "python",
|
||||
"request": "launch",
|
||||
"program": "synthesizer_train.py",
|
||||
"program": "train.py",
|
||||
"console": "integratedTerminal",
|
||||
"args": ["my_run", "..\\"]
|
||||
"args": ["--type", "synth", "..\\audiodata\\SV2TTS\\synthesizer"]
|
||||
},
|
||||
{
|
||||
"name": "Python: PPG Convert",
|
||||
|
|
|
@ -1,9 +1,9 @@
|
|||
from encoder.params_model import model_embedding_size as speaker_embedding_size
|
||||
from models.encoder.params_model import model_embedding_size as speaker_embedding_size
|
||||
from utils.argutils import print_args
|
||||
from utils.modelutils import check_model_paths
|
||||
from synthesizer.inference import Synthesizer
|
||||
from encoder import inference as encoder
|
||||
from vocoder import inference as vocoder
|
||||
from models.synthesizer.inference import Synthesizer
|
||||
from models.encoder import inference as encoder
|
||||
from models.vocoder import inference as vocoder
|
||||
from pathlib import Path
|
||||
import numpy as np
|
||||
import soundfile as sf
|
||||
|
|
|
@ -1,7 +1,10 @@
|
|||
from encoder.preprocess import preprocess_librispeech, preprocess_voxceleb1, preprocess_voxceleb2, preprocess_aidatatang_200zh
|
||||
from utils.argutils import print_args
|
||||
from pathlib import Path
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
from models.encoder.preprocess import (preprocess_aidatatang_200zh,
|
||||
preprocess_librispeech, preprocess_voxceleb1,
|
||||
preprocess_voxceleb2)
|
||||
from utils.argutils import print_args
|
||||
|
||||
if __name__ == "__main__":
|
||||
class MyFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter):
|
|
@ -1,5 +1,5 @@
|
|||
from utils.argutils import print_args
|
||||
from encoder.train import train
|
||||
from models.encoder.train import train
|
||||
from pathlib import Path
|
||||
import argparse
|
||||
|
|
@ -2,8 +2,8 @@ import sys
|
|||
import torch
|
||||
import argparse
|
||||
import numpy as np
|
||||
from utils.load_yaml import HpsYaml
|
||||
from ppg2mel.train.train_linglf02mel_seq2seq_oneshotvc import Solver
|
||||
from utils.hparams import HpsYaml
|
||||
from models.ppg2mel.train.train_linglf02mel_seq2seq_oneshotvc import Solver
|
||||
|
||||
# For reproducibility, comment these may speed up training
|
||||
torch.backends.cudnn.deterministic = True
|
|
@ -1,7 +1,7 @@
|
|||
from pathlib import Path
|
||||
import argparse
|
||||
|
||||
from ppg2mel.preprocess import preprocess_dataset
|
||||
from models.ppg2mel.preprocess import preprocess_dataset
|
||||
from pathlib import Path
|
||||
import argparse
|
||||
|
|
@ -1,10 +1,9 @@
|
|||
from synthesizer.hparams import hparams
|
||||
from synthesizer.train import train
|
||||
from models.synthesizer.hparams import hparams
|
||||
from models.synthesizer.train import train
|
||||
from utils.argutils import print_args
|
||||
import argparse
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
def new_train():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("run_id", type=str, help= \
|
||||
"Name for this model instance. If a model state from the same run ID was previously "
|
||||
|
@ -13,7 +12,7 @@ if __name__ == "__main__":
|
|||
parser.add_argument("syn_dir", type=str, default=argparse.SUPPRESS, help= \
|
||||
"Path to the synthesizer directory that contains the ground truth mel spectrograms, "
|
||||
"the wavs and the embeds.")
|
||||
parser.add_argument("-m", "--models_dir", type=str, default="synthesizer/saved_models/", help=\
|
||||
parser.add_argument("-m", "--models_dir", type=str, default=f"data/ckpt/synthesizer/", help=\
|
||||
"Path to the output directory that will contain the saved model weights and the logs.")
|
||||
parser.add_argument("-s", "--save_every", type=int, default=1000, help= \
|
||||
"Number of steps between updates of the model on the disk. Set to 0 to never save the "
|
||||
|
@ -28,10 +27,14 @@ if __name__ == "__main__":
|
|||
parser.add_argument("--hparams", default="",
|
||||
help="Hyperparameter overrides as a comma-separated list of name=value "
|
||||
"pairs")
|
||||
args = parser.parse_args()
|
||||
args, _ = parser.parse_known_args()
|
||||
print_args(args, parser)
|
||||
|
||||
args.hparams = hparams.parse(args.hparams)
|
||||
|
||||
# Run the training
|
||||
train(**vars(args))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
new_train()
|
|
@ -0,0 +1,66 @@
|
|||
import sys
|
||||
import torch
|
||||
import argparse
|
||||
import numpy as np
|
||||
from utils.hparams import HpsYaml
|
||||
from models.ppg2mel.train.train_linglf02mel_seq2seq_oneshotvc import Solver
|
||||
|
||||
# For reproducibility, comment these may speed up training
|
||||
torch.backends.cudnn.deterministic = True
|
||||
torch.backends.cudnn.benchmark = False
|
||||
|
||||
def main():
    """Parse the command line and run one-shot VC (PPG2Mel) training.

    The parsed namespace is extended with three derived attributes
    (``gpu``, ``pin_memory``, ``verbose``) that are the negations of the
    ``--cpu``, ``--no-pin`` and ``--no-msg`` switches. The YAML file named
    by ``--config`` is loaded through ``HpsYaml``, NumPy and torch are
    seeded with ``--seed``, and a ``Solver`` is driven through
    ``load_data`` -> ``set_model`` -> ``exec``. The process then exits
    with status 0.
    """
    cli = argparse.ArgumentParser(description='Training PPG2Mel VC model.')

    # Options that carry a value.
    cli.add_argument('--config', type=str,
                     help='Path to experiment config, e.g., config/vc.yaml')
    cli.add_argument('--name', type=str, default=None, help='Name for logging.')
    cli.add_argument('--logdir', type=str, default='log/', help='Logging path.')
    cli.add_argument('--ckpdir', type=str, default='ppg2mel/saved_models/',
                     help='Checkpoint path.')
    cli.add_argument('--outdir', type=str, default='result/',
                     help='Decode output path.')
    cli.add_argument('--load', type=str, default=None,
                     help='Load pre-trained model (for training only)')
    cli.add_argument('--warm_start', action='store_true',
                     help='Load model weights only, ignore specified layers.')
    cli.add_argument('--seed', type=int, default=0,
                     help='Random seed for reproducable results.')
    cli.add_argument('--njobs', type=int, default=8,
                     help='Number of threads for dataloader/decoding.')

    # Plain on/off switches, registered in the order they appear in --help.
    switches = (
        ('--cpu', 'Disable GPU training.'),
        ('--no-pin', 'Disable pin-memory for dataloader'),
        ('--test', 'Test the model.'),
        ('--no-msg', 'Hide all messages.'),
        ('--finetune', 'Finetune model'),
        ('--oneshotvc', 'Oneshot VC model'),
        ('--bilstm', 'BiLSTM VC model'),
        ('--lsa', 'Use location-sensitive attention (LSA)'),
    )
    for flag, description in switches:
        cli.add_argument(flag, action='store_true', help=description)

    options = cli.parse_args()

    # Positive-sense mirrors of the negative command-line switches.
    options.gpu = not options.cpu
    options.pin_memory = not options.no_pin
    options.verbose = not options.no_msg

    # HpsYaml makes the loaded config dict dot-accessible.
    config = HpsYaml(options.config)

    # Seed every RNG in use so runs are repeatable.
    seed = options.seed
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    print(">>> OneShot VC training ...")
    solver = Solver(config, options, "train")
    solver.load_data()
    solver.set_model()
    solver.exec()
    print(">>> Oneshot VC train finished!")
    sys.exit(0)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
|
@ -1,5 +1,5 @@
|
|||
from synthesizer.synthesize import run_synthesis
|
||||
from synthesizer.hparams import hparams
|
||||
from models.synthesizer.synthesize import run_synthesis
|
||||
from models.synthesizer.hparams import hparams
|
||||
from utils.argutils import print_args
|
||||
import argparse
|
||||
import os
|
|
@ -1,7 +1,7 @@
|
|||
from utils.argutils import print_args
|
||||
from vocoder.wavernn.train import train
|
||||
from vocoder.hifigan.train import train as train_hifigan
|
||||
from vocoder.fregan.train import train as train_fregan
|
||||
from models.vocoder.wavernn.train import train
|
||||
from models.vocoder.hifigan.train import train as train_hifigan
|
||||
from models.vocoder.fregan.train import train as train_fregan
|
||||
from utils.util import AttrDict
|
||||
from pathlib import Path
|
||||
import argparse
|
|
@ -2,22 +2,22 @@ from pydantic import BaseModel, Field
|
|||
import os
|
||||
from pathlib import Path
|
||||
from enum import Enum
|
||||
from encoder import inference as encoder
|
||||
from models.encoder import inference as encoder
|
||||
import librosa
|
||||
from scipy.io.wavfile import write
|
||||
import re
|
||||
import numpy as np
|
||||
from mkgui.base.components.types import FileContent
|
||||
from vocoder.hifigan import inference as gan_vocoder
|
||||
from synthesizer.inference import Synthesizer
|
||||
from control.mkgui.base.components.types import FileContent
|
||||
from models.vocoder.hifigan import inference as gan_vocoder
|
||||
from models.synthesizer.inference import Synthesizer
|
||||
from typing import Any, Tuple
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
# Constants
|
||||
AUDIO_SAMPLES_DIR = f"samples{os.sep}"
|
||||
SYN_MODELS_DIRT = f"synthesizer{os.sep}saved_models"
|
||||
ENC_MODELS_DIRT = f"encoder{os.sep}saved_models"
|
||||
VOC_MODELS_DIRT = f"vocoder{os.sep}saved_models"
|
||||
AUDIO_SAMPLES_DIR = f"data{os.sep}samples{os.sep}"
|
||||
SYN_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}synthesizer"
|
||||
ENC_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}encoder"
|
||||
VOC_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}vocoder"
|
||||
TEMP_SOURCE_AUDIO = f"wavs{os.sep}temp_source.wav"
|
||||
TEMP_RESULT_AUDIO = f"wavs{os.sep}temp_result.wav"
|
||||
if not os.path.isdir("wavs"):
|
||||
|
@ -31,7 +31,7 @@ if os.path.isdir(SYN_MODELS_DIRT):
|
|||
synthesizers = Enum('synthesizers', list((file.name, file) for file in Path(SYN_MODELS_DIRT).glob("**/*.pt")))
|
||||
print("Loaded synthesizer models: " + str(len(synthesizers)))
|
||||
else:
|
||||
raise Exception(f"Model folder {SYN_MODELS_DIRT} doesn't exist.")
|
||||
raise Exception(f"Model folder {SYN_MODELS_DIRT} doesn't exist. 请将模型文件位置移动到上述位置中进行重试!")
|
||||
|
||||
if os.path.isdir(ENC_MODELS_DIRT):
|
||||
encoders = Enum('encoders', list((file.name, file) for file in Path(ENC_MODELS_DIRT).glob("**/*.pt")))
|
||||
|
@ -46,15 +46,16 @@ else:
|
|||
raise Exception(f"Model folder {VOC_MODELS_DIRT} doesn't exist.")
|
||||
|
||||
|
||||
|
||||
class Input(BaseModel):
|
||||
message: str = Field(
|
||||
..., example="欢迎使用工具箱, 现已支持中文输入!", alias="文本内容"
|
||||
)
|
||||
local_audio_file: audio_input_selection = Field(
|
||||
..., alias="输入语音(本地wav)",
|
||||
..., alias="选择语音(本地wav)",
|
||||
description="选择本地语音文件."
|
||||
)
|
||||
record_audio_file: FileContent = Field(default=None, alias="录制语音",
|
||||
description="录音.", is_recorder=True, mime_type="audio/wav")
|
||||
upload_audio_file: FileContent = Field(default=None, alias="或上传语音",
|
||||
description="拖拽或点击上传.", mime_type="audio/wav")
|
||||
encoder: encoders = Field(
|
||||
|
@ -104,7 +105,12 @@ def synthesize(input: Input) -> Output:
|
|||
gan_vocoder.load_model(Path(input.vocoder.value))
|
||||
|
||||
# load file
|
||||
if input.upload_audio_file != None:
|
||||
if input.record_audio_file != None:
|
||||
with open(TEMP_SOURCE_AUDIO, "w+b") as f:
|
||||
f.write(input.record_audio_file.as_bytes())
|
||||
f.seek(0)
|
||||
wav, sample_rate = librosa.load(TEMP_SOURCE_AUDIO)
|
||||
elif input.upload_audio_file != None:
|
||||
with open(TEMP_SOURCE_AUDIO, "w+b") as f:
|
||||
f.write(input.upload_audio_file.as_bytes())
|
||||
f.seek(0)
|
|
@ -1,27 +1,26 @@
|
|||
from synthesizer.inference import Synthesizer
|
||||
from pydantic import BaseModel, Field
|
||||
from encoder import inference as speacker_encoder
|
||||
import torch
|
||||
import os
|
||||
from pathlib import Path
|
||||
from enum import Enum
|
||||
import ppg_extractor as Extractor
|
||||
import ppg2mel as Convertor
|
||||
import librosa
|
||||
from scipy.io.wavfile import write
|
||||
import re
|
||||
import numpy as np
|
||||
from mkgui.base.components.types import FileContent
|
||||
from vocoder.hifigan import inference as gan_vocoder
|
||||
from pathlib import Path
|
||||
from typing import Any, Tuple
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
import librosa
|
||||
import matplotlib.pyplot as plt
|
||||
import torch
|
||||
from pydantic import BaseModel, Field
|
||||
from scipy.io.wavfile import write
|
||||
|
||||
import models.ppg2mel as Convertor
|
||||
import models.ppg_extractor as Extractor
|
||||
from control.mkgui.base.components.types import FileContent
|
||||
from models.encoder import inference as speacker_encoder
|
||||
from models.synthesizer.inference import Synthesizer
|
||||
from models.vocoder.hifigan import inference as gan_vocoder
|
||||
|
||||
# Constants
|
||||
AUDIO_SAMPLES_DIR = f'samples{os.sep}'
|
||||
EXT_MODELS_DIRT = f'ppg_extractor{os.sep}saved_models'
|
||||
CONV_MODELS_DIRT = f'ppg2mel{os.sep}saved_models'
|
||||
VOC_MODELS_DIRT = f'vocoder{os.sep}saved_models'
|
||||
AUDIO_SAMPLES_DIR = f'data{os.sep}samples{os.sep}'
|
||||
EXT_MODELS_DIRT = f'data{os.sep}ckpt{os.sep}ppg_extractor'
|
||||
CONV_MODELS_DIRT = f'data{os.sep}ckpt{os.sep}ppg2mel'
|
||||
VOC_MODELS_DIRT = f'data{os.sep}ckpt{os.sep}vocoder'
|
||||
TEMP_SOURCE_AUDIO = f'wavs{os.sep}temp_source.wav'
|
||||
TEMP_TARGET_AUDIO = f'wavs{os.sep}temp_target.wav'
|
||||
TEMP_RESULT_AUDIO = f'wavs{os.sep}temp_result.wav'
|
||||
|
@ -132,9 +131,10 @@ def convert(input: Input) -> Output:
|
|||
|
||||
ppg = extractor.extract_from_wav(src_wav)
|
||||
# Import necessary dependency of Voice Conversion
|
||||
from utils.f0_utils import compute_f0, f02lf0, compute_mean_std, get_converted_lf0uv
|
||||
from utils.f0_utils import (compute_f0, compute_mean_std, f02lf0,
|
||||
get_converted_lf0uv)
|
||||
ref_lf0_mean, ref_lf0_std = compute_mean_std(f02lf0(compute_f0(ref_wav)))
|
||||
speacker_encoder.load_model(Path(f"encoder{os.sep}saved_models{os.sep}pretrained_bak_5805000.pt"))
|
||||
speacker_encoder.load_model(Path(f"data{os.sep}ckpt{os.sep}encoder{os.sep}pretrained_bak_5805000.pt"))
|
||||
embed = speacker_encoder.embed_utterance(ref_wav)
|
||||
lf0_uv = get_converted_lf0uv(src_wav, ref_lf0_mean, ref_lf0_std, convert=True)
|
||||
min_len = min(ppg.shape[1], len(lf0_uv))
|
|
@ -37,6 +37,12 @@ def is_single_file_property(property: Dict) -> bool:
|
|||
# TODO: binary?
|
||||
return property.get("format") == "byte"
|
||||
|
||||
def is_single_autio_property(property: Dict) -> bool:
    """Return True when a schema property describes a single audio payload.

    A property qualifies when its ``"type"`` is ``"string"`` and its
    ``"format"`` is ``"bytes"``. Missing keys count as a mismatch.

    NOTE(review): the neighbouring ``is_single_file_property`` compares the
    format against ``"byte"``, not ``"bytes"`` - confirm which spelling the
    schema producer emits. (Original marker: "TODO: binary?")
    """
    is_string_typed = property.get("type") == "string"
    return is_string_typed and property.get("format") == "bytes"
|
||||
|
||||
|
||||
def is_single_directory_property(property: Dict) -> bool:
|
||||
if property.get("type") != "string":
|
|
@ -14,14 +14,13 @@ from fastapi.encoders import jsonable_encoder
|
|||
from loguru import logger
|
||||
from pydantic import BaseModel, ValidationError, parse_obj_as
|
||||
|
||||
from mkgui.base import Opyrator
|
||||
from mkgui.base.core import name_to_title
|
||||
from mkgui.base.ui import schema_utils
|
||||
from mkgui.base.ui.streamlit_utils import CUSTOM_STREAMLIT_CSS
|
||||
from control.mkgui.base import Opyrator
|
||||
from control.mkgui.base.core import name_to_title
|
||||
from . import schema_utils
|
||||
from .streamlit_utils import CUSTOM_STREAMLIT_CSS
|
||||
|
||||
STREAMLIT_RUNNER_SNIPPET = """
|
||||
from mkgui.base.ui import render_streamlit_ui
|
||||
from mkgui.base import Opyrator
|
||||
from control.mkgui.base.ui import render_streamlit_ui
|
||||
|
||||
import streamlit as st
|
||||
|
||||
|
@ -243,7 +242,14 @@ class InputUI:
|
|||
file_extension = None
|
||||
if "mime_type" in property:
|
||||
file_extension = mimetypes.guess_extension(property["mime_type"])
|
||||
|
||||
|
||||
if "is_recorder" in property:
|
||||
from audio_recorder_streamlit import audio_recorder
|
||||
audio_bytes = audio_recorder()
|
||||
if audio_bytes:
|
||||
streamlit_app.audio(audio_bytes, format="audio/wav")
|
||||
return audio_bytes
|
||||
|
||||
uploaded_file = streamlit_app.file_uploader(
|
||||
**streamlit_kwargs, accept_multiple_files=False, type=file_extension
|
||||
)
|
||||
|
@ -263,6 +269,39 @@ class InputUI:
|
|||
streamlit_app.video(bytes, format=property.get("mime_type"))
|
||||
return bytes
|
||||
|
||||
def _render_single_audio_input(
    self, streamlit_app: st, key: str, property: Dict
) -> Any:
    """Show a microphone recorder and return what it captured.

    ``key`` and ``property`` are accepted so the signature matches the other
    ``_render_single_*`` renderers, but this method does not read them.

    Returns:
        The value produced by ``audio_recorder()``. When that value is
        truthy it is also played back through ``streamlit_app.audio`` as
        ``audio/wav``; a falsy value is returned untouched.
        (presumably raw WAV bytes or None - confirm against the
        audio_recorder_streamlit package)
    """
    # Imported at call time, so the recorder package is only needed when
    # this renderer actually runs.
    from audio_recorder_streamlit import audio_recorder

    recording = audio_recorder()
    if not recording:
        return recording

    streamlit_app.audio(recording, format="audio/wav")
    return recording
|
||||
|
||||
def _render_single_string_input(
|
||||
self, streamlit_app: st, key: str, property: Dict
|
||||
) -> Any:
|
||||
|
@ -807,21 +846,20 @@ class OutputUI:
|
|||
|
||||
def getOpyrator(mode: str) -> Opyrator:
    """Return the ``Opyrator`` wrapping the entry point selected by ``mode``.

    Dispatch is by prefix of ``mode``:

    * ``None`` or ``'VC...'``      -> ``control.mkgui.app_vc.convert``
    * ``'预处理...'``               -> ``control.mkgui.preprocess.preprocess``
    * ``'模型训练(VC)...'``         -> ``control.mkgui.train_vc.train_vc``
    * ``'模型训练...'``             -> ``control.mkgui.train.train``
    * anything else                -> ``control.mkgui.app.synthesize``

    Each target module is imported only when its branch is taken.

    Args:
        mode: Label of the selected UI mode. ``None`` is tolerated and
            resolves to voice conversion, as it always has.

    Returns:
        An ``Opyrator`` built from the chosen function.
    """
    # A missing mode has always been answered by the first branch; the
    # later ``mode == None`` checks in the old chain could never fire.
    if mode is None or mode.startswith('VC'):
        from control.mkgui.app_vc import convert
        return Opyrator(convert)

    if mode.startswith('预处理'):
        from control.mkgui.preprocess import preprocess
        return Opyrator(preprocess)

    # This must be tested BEFORE the plain '模型训练' prefix: every
    # '模型训练(VC)' label also starts with '模型训练', so with the old order
    # the VC-training branch was unreachable and train_vc was never used.
    if mode.startswith('模型训练(VC)'):
        from control.mkgui.train_vc import train_vc
        return Opyrator(train_vc)

    if mode.startswith('模型训练'):
        from control.mkgui.train import train
        return Opyrator(train)

    # Default: text-to-speech synthesis.
    from control.mkgui.app import synthesize
    return Opyrator(synthesize)
|
||||
|
||||
|
||||
def render_streamlit_ui() -> None:
|
||||
# init
|
||||
session_state = st.session_state
|
||||
|
@ -845,7 +883,7 @@ def render_streamlit_ui() -> None:
|
|||
col2.title(title)
|
||||
col2.markdown("欢迎使用MockingBird Web 2")
|
||||
|
||||
image = Image.open(path.join('mkgui', 'static', 'mb.png'))
|
||||
image = Image.open(path.join('control','mkgui', 'static', 'mb.png'))
|
||||
col1.image(image)
|
||||
|
||||
st.markdown("---")
|
||||
|
@ -853,6 +891,13 @@ def render_streamlit_ui() -> None:
|
|||
|
||||
with left:
|
||||
st.header("Control 控制")
|
||||
# if session_state.mode in ["AI拟音", "VC拟音"] :
|
||||
# from audiorecorder import audiorecorder
|
||||
# audio = audiorecorder("Click to record", "Recording...")
|
||||
# if len(audio) > 0:
|
||||
# # To play audio in frontend:
|
||||
# st.audio(audio.tobytes())
|
||||
|
||||
InputUI(session_state=session_state, input_class=opyrator.input_type).render_ui(st)
|
||||
execute_selected = st.button(opyrator.action)
|
||||
if execute_selected:
|
|
@ -6,8 +6,8 @@ from typing import Any, Tuple
|
|||
|
||||
|
||||
# Constants
|
||||
EXT_MODELS_DIRT = f"ppg_extractor{os.sep}saved_models"
|
||||
ENC_MODELS_DIRT = f"encoder{os.sep}saved_models"
|
||||
EXT_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}ppg_extractor"
|
||||
ENC_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}encoder"
|
||||
|
||||
|
||||
if os.path.isdir(EXT_MODELS_DIRT):
|
||||
|
@ -83,7 +83,7 @@ def preprocess(input: Input) -> Output:
|
|||
"""Preprocess(预处理)"""
|
||||
finished = 0
|
||||
if input.model == Model.VC_PPG2MEL:
|
||||
from ppg2mel.preprocess import preprocess_dataset
|
||||
from models.ppg2mel.preprocess import preprocess_dataset
|
||||
finished = preprocess_dataset(
|
||||
datasets_root=Path(input.datasets_root),
|
||||
dataset=input.dataset,
|
Before Width: | Height: | Size: 5.6 KiB After Width: | Height: | Size: 5.6 KiB |
|
@ -3,17 +3,17 @@ import os
|
|||
from pathlib import Path
|
||||
from enum import Enum
|
||||
from typing import Any
|
||||
from synthesizer.hparams import hparams
|
||||
from synthesizer.train import train as synt_train
|
||||
from models.synthesizer.hparams import hparams
|
||||
from models.synthesizer.train import train as synt_train
|
||||
|
||||
# Constants
|
||||
SYN_MODELS_DIRT = f"synthesizer{os.sep}saved_models"
|
||||
ENC_MODELS_DIRT = f"encoder{os.sep}saved_models"
|
||||
SYN_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}synthesizer"
|
||||
ENC_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}encoder"
|
||||
|
||||
|
||||
# EXT_MODELS_DIRT = f"ppg_extractor{os.sep}saved_models"
|
||||
# CONV_MODELS_DIRT = f"ppg2mel{os.sep}saved_models"
|
||||
# ENC_MODELS_DIRT = f"encoder{os.sep}saved_models"
|
||||
# EXT_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}ppg_extractor"
|
||||
# CONV_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}ppg2mel"
|
||||
# ENC_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}encoder"
|
||||
|
||||
# Pre-Load models
|
||||
if os.path.isdir(SYN_MODELS_DIRT):
|
||||
|
@ -96,7 +96,7 @@ def train(input: Input) -> Output:
|
|||
synt_train(
|
||||
input.run_id,
|
||||
input.input_root,
|
||||
f"synthesizer{os.sep}saved_models",
|
||||
f"data{os.sep}ckpt{os.sep}synthesizer",
|
||||
input.save_every,
|
||||
input.backup_every,
|
||||
input.log_every,
|
|
@ -4,14 +4,14 @@ from pathlib import Path
|
|||
from enum import Enum
|
||||
from typing import Any, Tuple
|
||||
import numpy as np
|
||||
from utils.load_yaml import HpsYaml
|
||||
from utils.hparams import HpsYaml
|
||||
from utils.util import AttrDict
|
||||
import torch
|
||||
|
||||
# Constants
|
||||
EXT_MODELS_DIRT = f"ppg_extractor{os.sep}saved_models"
|
||||
CONV_MODELS_DIRT = f"ppg2mel{os.sep}saved_models"
|
||||
ENC_MODELS_DIRT = f"encoder{os.sep}saved_models"
|
||||
EXT_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}ppg_extractor"
|
||||
CONV_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}ppg2mel"
|
||||
ENC_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}encoder"
|
||||
|
||||
|
||||
if os.path.isdir(EXT_MODELS_DIRT):
|
||||
|
@ -144,7 +144,7 @@ def train_vc(input: Input) -> Output:
|
|||
if torch.cuda.is_available():
|
||||
torch.cuda.manual_seed_all(input.seed)
|
||||
mode = "train"
|
||||
from ppg2mel.train.train_linglf02mel_seq2seq_oneshotvc import Solver
|
||||
from models.ppg2mel.train.train_linglf02mel_seq2seq_oneshotvc import Solver
|
||||
solver = Solver(config, params, mode)
|
||||
solver.load_data()
|
||||
solver.set_model()
|
|
@ -1,12 +1,12 @@
|
|||
from toolbox.ui import UI
|
||||
from encoder import inference as encoder
|
||||
from synthesizer.inference import Synthesizer
|
||||
from vocoder.wavernn import inference as rnn_vocoder
|
||||
from vocoder.hifigan import inference as gan_vocoder
|
||||
from vocoder.fregan import inference as fgan_vocoder
|
||||
from control.toolbox.ui import UI
|
||||
from models.encoder import inference as encoder
|
||||
from models.synthesizer.inference import Synthesizer
|
||||
from models.vocoder.wavernn import inference as rnn_vocoder
|
||||
from models.vocoder.hifigan import inference as gan_vocoder
|
||||
from models.vocoder.fregan import inference as fgan_vocoder
|
||||
from pathlib import Path
|
||||
from time import perf_counter as timer
|
||||
from toolbox.utterance import Utterance
|
||||
from control.toolbox.utterance import Utterance
|
||||
import numpy as np
|
||||
import traceback
|
||||
import sys
|
||||
|
@ -38,7 +38,8 @@ recognized_datasets = [
|
|||
"VoxCeleb2/dev/aac",
|
||||
"VoxCeleb2/test/aac",
|
||||
"VCTK-Corpus/wav48",
|
||||
"aidatatang_200zh/corpus",
|
||||
"aidatatang_200zh/corpus/test",
|
||||
"aidatatang_200zh/corpus/train",
|
||||
"aishell3/test/wav",
|
||||
"magicdata/train",
|
||||
]
|
||||
|
@ -396,7 +397,7 @@ class Toolbox:
|
|||
self.ui.log("Loading the extractor %s... " % model_fpath)
|
||||
self.ui.set_loading(1)
|
||||
start = timer()
|
||||
import ppg_extractor as extractor
|
||||
import models.ppg_extractor as extractor
|
||||
self.extractor = extractor.load_model(model_fpath)
|
||||
self.ui.log("Done (%dms)." % int(1000 * (timer() - start)), "append")
|
||||
self.ui.set_loading(0)
|
||||
|
@ -408,7 +409,7 @@ class Toolbox:
|
|||
self.ui.log("Loading the convertor %s... " % model_fpath)
|
||||
self.ui.set_loading(1)
|
||||
start = timer()
|
||||
import ppg2mel as convertor
|
||||
import models.ppg2mel as convertor
|
||||
self.convertor = convertor.load_model( model_fpath)
|
||||
self.ui.log("Done (%dms)." % int(1000 * (timer() - start)), "append")
|
||||
self.ui.set_loading(0)
|
Before Width: | Height: | Size: 5.6 KiB After Width: | Height: | Size: 5.6 KiB |
|
@ -3,9 +3,8 @@ from PyQt5 import QtGui
|
|||
from PyQt5.QtWidgets import *
|
||||
import matplotlib.pyplot as plt
|
||||
from matplotlib.backends.backend_qt5agg import FigureCanvasQTAgg as FigureCanvas
|
||||
from matplotlib.figure import Figure
|
||||
from encoder.inference import plot_embedding_as_heatmap
|
||||
from toolbox.utterance import Utterance
|
||||
from models.encoder.inference import plot_embedding_as_heatmap
|
||||
from control.toolbox.utterance import Utterance
|
||||
from pathlib import Path
|
||||
from typing import List, Set
|
||||
import sounddevice as sd
|
|
@ -1,5 +1,5 @@
|
|||
from pathlib import Path
|
||||
from toolbox import Toolbox
|
||||
from control.toolbox import Toolbox
|
||||
from utils.argutils import print_args
|
||||
from utils.modelutils import check_model_paths
|
||||
import argparse
|
||||
|
@ -17,15 +17,15 @@ if __name__ == '__main__':
|
|||
"supported datasets.", default=None)
|
||||
parser.add_argument("-vc", "--vc_mode", action="store_true",
|
||||
help="Voice Conversion Mode(PPG based)")
|
||||
parser.add_argument("-e", "--enc_models_dir", type=Path, default="encoder/saved_models",
|
||||
parser.add_argument("-e", "--enc_models_dir", type=Path, default=f"data{os.sep}ckpt{os.sep}encoder",
|
||||
help="Directory containing saved encoder models")
|
||||
parser.add_argument("-s", "--syn_models_dir", type=Path, default="synthesizer/saved_models",
|
||||
parser.add_argument("-s", "--syn_models_dir", type=Path, default=f"data{os.sep}ckpt{os.sep}synthesizer",
|
||||
help="Directory containing saved synthesizer models")
|
||||
parser.add_argument("-v", "--voc_models_dir", type=Path, default="vocoder/saved_models",
|
||||
parser.add_argument("-v", "--voc_models_dir", type=Path, default=f"data{os.sep}ckpt{os.sep}vocoder",
|
||||
help="Directory containing saved vocoder models")
|
||||
parser.add_argument("-ex", "--extractor_models_dir", type=Path, default="ppg_extractor/saved_models",
|
||||
parser.add_argument("-ex", "--extractor_models_dir", type=Path, default=f"data{os.sep}ckpt{os.sep}ppg_extractor",
|
||||
help="Directory containing saved extrator models")
|
||||
parser.add_argument("-cv", "--convertor_models_dir", type=Path, default="ppg2mel/saved_models",
|
||||
parser.add_argument("-cv", "--convertor_models_dir", type=Path, default=f"data{os.sep}ckpt{os.sep}ppg2mel",
|
||||
help="Directory containing saved convert models")
|
||||
parser.add_argument("--cpu", action="store_true", help=\
|
||||
"If True, processing is done on CPU, even when a GPU is available.")
|
||||
|
|
|
@ -1,2 +0,0 @@
|
|||
from encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset
|
||||
from encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataLoader
|
Binary file not shown.
14
gen_voice.py
14
gen_voice.py
|
@ -1,23 +1,15 @@
|
|||
from encoder.params_model import model_embedding_size as speaker_embedding_size
|
||||
from utils.argutils import print_args
|
||||
from utils.modelutils import check_model_paths
|
||||
from synthesizer.inference import Synthesizer
|
||||
from encoder import inference as encoder
|
||||
from vocoder.wavernn import inference as rnn_vocoder
|
||||
from vocoder.hifigan import inference as gan_vocoder
|
||||
from models.synthesizer.inference import Synthesizer
|
||||
from models.encoder import inference as encoder
|
||||
from models.vocoder.hifigan import inference as gan_vocoder
|
||||
from pathlib import Path
|
||||
import numpy as np
|
||||
import soundfile as sf
|
||||
import librosa
|
||||
import argparse
|
||||
import torch
|
||||
import sys
|
||||
import os
|
||||
import re
|
||||
import cn2an
|
||||
import glob
|
||||
|
||||
from audioread.exceptions import NoBackendError
|
||||
vocoder = gan_vocoder
|
||||
|
||||
def gen_one_wav(synthesizer, in_fpath, embed, texts, file_name, seq):
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
from scipy.ndimage.morphology import binary_dilation
|
||||
from encoder.params_data import *
|
||||
from models.encoder.params_data import *
|
||||
from pathlib import Path
|
||||
from typing import Optional, Union
|
||||
from warnings import warn
|
|
@ -0,0 +1,2 @@
|
|||
from models.encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset
|
||||
from models.encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataLoader
|
|
@ -1,5 +1,5 @@
|
|||
from encoder.data_objects.random_cycler import RandomCycler
|
||||
from encoder.data_objects.utterance import Utterance
|
||||
from models.encoder.data_objects.random_cycler import RandomCycler
|
||||
from models.encoder.data_objects.utterance import Utterance
|
||||
from pathlib import Path
|
||||
|
||||
# Contains the set of utterances of a single speaker
|
|
@ -1,6 +1,6 @@
|
|||
import numpy as np
|
||||
from typing import List
|
||||
from encoder.data_objects.speaker import Speaker
|
||||
from models.encoder.data_objects.speaker import Speaker
|
||||
|
||||
class SpeakerBatch:
|
||||
def __init__(self, speakers: List[Speaker], utterances_per_speaker: int, n_frames: int):
|
|
@ -1,7 +1,7 @@
|
|||
from encoder.data_objects.random_cycler import RandomCycler
|
||||
from encoder.data_objects.speaker_batch import SpeakerBatch
|
||||
from encoder.data_objects.speaker import Speaker
|
||||
from encoder.params_data import partials_n_frames
|
||||
from models.encoder.data_objects.random_cycler import RandomCycler
|
||||
from models.encoder.data_objects.speaker_batch import SpeakerBatch
|
||||
from models.encoder.data_objects.speaker import Speaker
|
||||
from models.encoder.params_data import partials_n_frames
|
||||
from torch.utils.data import Dataset, DataLoader
|
||||
from pathlib import Path
|
||||
|
|
@ -1,8 +1,8 @@
|
|||
from encoder.params_data import *
|
||||
from encoder.model import SpeakerEncoder
|
||||
from encoder.audio import preprocess_wav # We want to expose this function from here
|
||||
from models.encoder.params_data import *
|
||||
from models.encoder.model import SpeakerEncoder
|
||||
from models.encoder.audio import preprocess_wav # We want to expose this function from here
|
||||
from matplotlib import cm
|
||||
from encoder import audio
|
||||
from models.encoder import audio
|
||||
from pathlib import Path
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
|
@ -1,5 +1,5 @@
|
|||
from encoder.params_model import *
|
||||
from encoder.params_data import *
|
||||
from models.encoder.params_model import *
|
||||
from models.encoder.params_data import *
|
||||
from scipy.interpolate import interp1d
|
||||
from sklearn.metrics import roc_curve
|
||||
from torch.nn.utils import clip_grad_norm_
|
|
@ -1,8 +1,8 @@
|
|||
from multiprocess.pool import ThreadPool
|
||||
from encoder.params_data import *
|
||||
from encoder.config import librispeech_datasets, anglophone_nationalites
|
||||
from models.encoder.params_data import *
|
||||
from models.encoder.config import librispeech_datasets, anglophone_nationalites
|
||||
from datetime import datetime
|
||||
from encoder import audio
|
||||
from models.encoder import audio
|
||||
from pathlib import Path
|
||||
from tqdm import tqdm
|
||||
import numpy as np
|
||||
|
@ -22,7 +22,7 @@ class DatasetLog:
|
|||
self._log_params()
|
||||
|
||||
def _log_params(self):
|
||||
from encoder import params_data
|
||||
from models.encoder import params_data
|
||||
self.write_line("Parameter values:")
|
||||
for param_name in (p for p in dir(params_data) if not p.startswith("__")):
|
||||
value = getattr(params_data, param_name)
|
|
@ -1,7 +1,7 @@
|
|||
from encoder.visualizations import Visualizations
|
||||
from encoder.data_objects import SpeakerVerificationDataLoader, SpeakerVerificationDataset
|
||||
from encoder.params_model import *
|
||||
from encoder.model import SpeakerEncoder
|
||||
from models.encoder.visualizations import Visualizations
|
||||
from models.encoder.data_objects import SpeakerVerificationDataLoader, SpeakerVerificationDataset
|
||||
from models.encoder.params_model import *
|
||||
from models.encoder.model import SpeakerEncoder
|
||||
from utils.profiler import Profiler
|
||||
from pathlib import Path
|
||||
import torch
|
|
@ -1,4 +1,4 @@
|
|||
from encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset
|
||||
from models.encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset
|
||||
from datetime import datetime
|
||||
from time import perf_counter as timer
|
||||
import matplotlib.pyplot as plt
|
||||
|
@ -65,8 +65,8 @@ class Visualizations:
|
|||
def log_params(self):
|
||||
if self.disabled:
|
||||
return
|
||||
from encoder import params_data
|
||||
from encoder import params_model
|
||||
from models.encoder import params_data
|
||||
from models.encoder import params_model
|
||||
param_string = "<b>Model parameters</b>:<br>"
|
||||
for param_name in (p for p in dir(params_model) if not p.startswith("__")):
|
||||
value = getattr(params_model, param_name)
|
|
@ -15,7 +15,7 @@ from .rnn_decoder_mol import Decoder
|
|||
from .utils.cnn_postnet import Postnet
|
||||
from .utils.vc_utils import get_mask_from_lengths
|
||||
|
||||
from utils.load_yaml import HpsYaml
|
||||
from utils.hparams import HpsYaml
|
||||
|
||||
class MelDecoderMOLv2(AbsMelDecoder):
|
||||
"""Use an encoder to preprocess ppg."""
|
|
@ -7,10 +7,10 @@ from pathlib import Path
|
|||
import soundfile
|
||||
import resampy
|
||||
|
||||
from ppg_extractor import load_model
|
||||
from models.ppg_extractor import load_model
|
||||
import encoder.inference as Encoder
|
||||
from encoder.audio import preprocess_wav
|
||||
from encoder import audio
|
||||
from models.encoder.audio import preprocess_wav
|
||||
from models.encoder import audio
|
||||
from utils.f0_utils import compute_f0
|
||||
|
||||
from torch.multiprocessing import Pool, cpu_count
|
|
@ -2,8 +2,8 @@ import sys
|
|||
import torch
|
||||
import argparse
|
||||
import numpy as np
|
||||
from utils.load_yaml import HpsYaml
|
||||
from ppg2mel.train.train_linglf02mel_seq2seq_oneshotvc import Solver
|
||||
from utils.hparams import HpsYaml
|
||||
from models.ppg2mel.train.train_linglf02mel_seq2seq_oneshotvc import Solver
|
||||
|
||||
# For reproducibility; commenting these out may speed up training
|
||||
torch.backends.cudnn.deterministic = True
|
|
@ -8,7 +8,6 @@ from torch.utils.tensorboard import SummaryWriter
|
|||
|
||||
from .option import default_hparas
|
||||
from utils.util import human_format, Timer
|
||||
from utils.load_yaml import HpsYaml
|
||||
|
||||
|
||||
class BaseSolver():
|
|
@ -14,7 +14,7 @@ from utils.data_load import OneshotVcDataset, MultiSpkVcCollate
|
|||
from .loss import MaskedMSELoss
|
||||
from .optim import Optimizer
|
||||
from utils.util import human_format
|
||||
from ppg2mel import MelDecoderMOLv2
|
||||
from models.ppg2mel import MelDecoderMOLv2
|
||||
|
||||
|
||||
class Solver(BaseSolver):
|
|
@ -1,36 +1,4 @@
|
|||
import ast
|
||||
import pprint
|
||||
import json
|
||||
|
||||
class HParams(object):
    """Attribute-style container for hyperparameters.

    Every keyword argument passed to the constructor becomes an instance
    attribute. Values can be read and written either as attributes
    (``hp.name``) or with subscript syntax (``hp["name"]``).
    """

    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)

    def __setitem__(self, key, value):
        setattr(self, key, value)

    def __getitem__(self, key):
        return getattr(self, key)

    def __repr__(self):
        return pprint.pformat(self.__dict__)

    def parse(self, string):
        """Override hparams from a comma-separated string of name=value pairs.

        Each value is evaluated with ``ast.literal_eval``, so it must be a
        Python literal (number, quoted string, ``True``/``False``, ...).
        Pairs are separated on every comma, so literals that themselves
        contain a comma (lists, tuples, dicts) cannot be expressed here.

        Args:
            string: Text such as ``"a=1, b='x'"``. An empty string is a no-op.

        Returns:
            ``self``, to allow chaining.

        Raises:
            ValueError: If a pair has no ``=``, or a value is not a literal.
            SyntaxError: If a value is not syntactically valid Python.
        """
        if len(string) > 0:
            updates = []
            for pair in string.split(","):
                # Split on the first "=" only, so values may contain "=".
                name, sep, raw_value = pair.partition("=")
                if not sep:
                    raise ValueError(
                        "Expected name=value in hparams override, got %r" % pair
                    )
                updates.append((name.strip(), ast.literal_eval(raw_value.strip())))
            # Apply only after every pair parsed successfully; pairing each
            # key with its own value means a repeated key takes the last value.
            for name, value in updates:
                self.__dict__[name] = value
        return self

    # NOTE: the parameter name shadows the builtin ``dict``; it is kept
    # unchanged so existing keyword callers keep working.
    def loadJson(self, dict):
        """Copy entries of a mapping onto this object.

        The keys ``"tts_schedule"`` and ``"tts_finetune_layers"`` are skipped.

        Args:
            dict: Mapping of hyperparameter names to values.

        Returns:
            ``self``, to allow chaining.
        """
        print("\nLoading the json with %s\n" % (dict,))
        skipped = ("tts_schedule", "tts_finetune_layers")
        for k in dict.keys():
            if k not in skipped:
                self.__dict__[k] = dict[k]
        return self

    def dumpJson(self, fp):
        """Serialize all attributes as JSON into ``fp``.

        Args:
            fp: Object exposing ``open(mode, encoding=...)``, e.g. a
                ``pathlib.Path``. The file is written as UTF-8.

        Returns:
            ``self``, to allow chaining.
        """
        print("\nSaving the json with %s\n" % (fp,))
        with fp.open("w", encoding="utf-8") as f:
            json.dump(self.__dict__, f)
        return self
|
||||
from utils.hparams import HParams
|
||||
|
||||
hparams = HParams(
|
||||
### Signal Processing (used in both synthesizer and vocoder)
|
||||
|
@ -104,7 +72,7 @@ hparams = HParams(
|
|||
### SV2TTS
|
||||
speaker_embedding_size = 256, # Dimension for the speaker embedding
|
||||
silence_min_duration_split = 0.4, # Duration in seconds of a silence for an utterance to be split
|
||||
utterance_min_duration = 1.6, # Duration in seconds below which utterances are discarded
|
||||
utterance_min_duration = 0.5, # Duration in seconds below which utterances are discarded
|
||||
use_gst = True, # Whether to use global style token
|
||||
use_ser_for_gst = True, # Whether to use speaker embedding referenced for global style token
|
||||
)
|
|
@ -1,16 +1,15 @@
|
|||
import torch
|
||||
from synthesizer import audio
|
||||
from synthesizer.hparams import hparams
|
||||
from synthesizer.models.tacotron import Tacotron
|
||||
from synthesizer.utils.symbols import symbols
|
||||
from synthesizer.utils.text import text_to_sequence
|
||||
from vocoder.display import simple_table
|
||||
from models.synthesizer import audio
|
||||
from models.synthesizer.hparams import hparams
|
||||
from models.synthesizer.models.tacotron import Tacotron
|
||||
from models.synthesizer.utils.symbols import symbols
|
||||
from models.synthesizer.utils.text import text_to_sequence
|
||||
from models.vocoder.display import simple_table
|
||||
from pathlib import Path
|
||||
from typing import Union, List
|
||||
import numpy as np
|
||||
import librosa
|
||||
from utils import logmmse
|
||||
import json
|
||||
from pypinyin import lazy_pinyin, Style
|
||||
|
||||
class Synthesizer:
|
||||
|
@ -48,8 +47,7 @@ class Synthesizer:
|
|||
# Try to scan config file
|
||||
model_config_fpaths = list(self.model_fpath.parent.rglob("*.json"))
|
||||
if len(model_config_fpaths)>0 and model_config_fpaths[0].exists():
|
||||
with model_config_fpaths[0].open("r", encoding="utf-8") as f:
|
||||
hparams.loadJson(json.load(f))
|
||||
hparams.loadJson(model_config_fpaths[0])
|
||||
"""
|
||||
Instantiates and loads the model given the weights file that was passed in the constructor.
|
||||
"""
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue