Merge pull request #822 from babysor/restruct-project

Restruct project
pull/892/head
Vega 2023-02-04 14:37:48 +08:00 committed by GitHub
commit 9d67b757f0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
194 changed files with 3307 additions and 28389 deletions

6
.gitignore vendored
View File

@ -14,9 +14,9 @@
*.bcf
*.toc
*.sh
*/saved_models
!vocoder/saved_models/pretrained/**
!encoder/saved_models/pretrained.pt
data/ckpt
!data/ckpt/vocoder/pretrained/**
!data/ckpt/encoder/pretrained.pt
wavs
log
!/docker-entrypoint.sh

12
.vscode/launch.json vendored
View File

@ -15,7 +15,8 @@
"name": "Python: Vocoder Preprocess",
"type": "python",
"request": "launch",
"program": "vocoder_preprocess.py",
"program": "control\\cli\\vocoder_preprocess.py",
"cwd": "${workspaceFolder}",
"console": "integratedTerminal",
"args": ["..\\audiodata"]
},
@ -23,7 +24,8 @@
"name": "Python: Vocoder Train",
"type": "python",
"request": "launch",
"program": "vocoder_train.py",
"program": "control\\cli\\vocoder_train.py",
"cwd": "${workspaceFolder}",
"console": "integratedTerminal",
"args": ["dev", "..\\audiodata"]
},
@ -32,6 +34,7 @@
"type": "python",
"request": "launch",
"program": "demo_toolbox.py",
"cwd": "${workspaceFolder}",
"console": "integratedTerminal",
"args": ["-d","..\\audiodata"]
},
@ -40,6 +43,7 @@
"type": "python",
"request": "launch",
"program": "demo_toolbox.py",
"cwd": "${workspaceFolder}",
"console": "integratedTerminal",
"args": ["-d","..\\audiodata","-vc"]
},
@ -47,9 +51,9 @@
"name": "Python: Synth Train",
"type": "python",
"request": "launch",
"program": "synthesizer_train.py",
"program": "train.py",
"console": "integratedTerminal",
"args": ["my_run", "..\\"]
"args": ["--type", "synth", "..\\audiodata\\SV2TTS\\synthesizer"]
},
{
"name": "Python: PPG Convert",

View File

@ -1,9 +1,9 @@
from encoder.params_model import model_embedding_size as speaker_embedding_size
from models.encoder.params_model import model_embedding_size as speaker_embedding_size
from utils.argutils import print_args
from utils.modelutils import check_model_paths
from synthesizer.inference import Synthesizer
from encoder import inference as encoder
from vocoder import inference as vocoder
from models.synthesizer.inference import Synthesizer
from models.encoder import inference as encoder
from models.vocoder import inference as vocoder
from pathlib import Path
import numpy as np
import soundfile as sf

View File

@ -1,7 +1,10 @@
from encoder.preprocess import preprocess_librispeech, preprocess_voxceleb1, preprocess_voxceleb2, preprocess_aidatatang_200zh
from utils.argutils import print_args
from pathlib import Path
import argparse
from pathlib import Path
from models.encoder.preprocess import (preprocess_aidatatang_200zh,
preprocess_librispeech, preprocess_voxceleb1,
preprocess_voxceleb2)
from utils.argutils import print_args
if __name__ == "__main__":
class MyFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter):

View File

@ -1,5 +1,5 @@
from utils.argutils import print_args
from encoder.train import train
from models.encoder.train import train
from pathlib import Path
import argparse

View File

@ -2,8 +2,8 @@ import sys
import torch
import argparse
import numpy as np
from utils.load_yaml import HpsYaml
from ppg2mel.train.train_linglf02mel_seq2seq_oneshotvc import Solver
from utils.hparams import HpsYaml
from models.ppg2mel.train.train_linglf02mel_seq2seq_oneshotvc import Solver
# For reproducibility; commenting these out may speed up training
torch.backends.cudnn.deterministic = True

View File

@ -1,7 +1,7 @@
from pathlib import Path
import argparse
from ppg2mel.preprocess import preprocess_dataset
from models.ppg2mel.preprocess import preprocess_dataset
from pathlib import Path
import argparse

View File

@ -1,10 +1,9 @@
from synthesizer.hparams import hparams
from synthesizer.train import train
from models.synthesizer.hparams import hparams
from models.synthesizer.train import train
from utils.argutils import print_args
import argparse
if __name__ == "__main__":
def new_train():
parser = argparse.ArgumentParser()
parser.add_argument("run_id", type=str, help= \
"Name for this model instance. If a model state from the same run ID was previously "
@ -13,7 +12,7 @@ if __name__ == "__main__":
parser.add_argument("syn_dir", type=str, default=argparse.SUPPRESS, help= \
"Path to the synthesizer directory that contains the ground truth mel spectrograms, "
"the wavs and the embeds.")
parser.add_argument("-m", "--models_dir", type=str, default="synthesizer/saved_models/", help=\
parser.add_argument("-m", "--models_dir", type=str, default=f"data/ckpt/synthesizer/", help=\
"Path to the output directory that will contain the saved model weights and the logs.")
parser.add_argument("-s", "--save_every", type=int, default=1000, help= \
"Number of steps between updates of the model on the disk. Set to 0 to never save the "
@ -28,10 +27,14 @@ if __name__ == "__main__":
parser.add_argument("--hparams", default="",
help="Hyperparameter overrides as a comma-separated list of name=value "
"pairs")
args = parser.parse_args()
args, _ = parser.parse_known_args()
print_args(args, parser)
args.hparams = hparams.parse(args.hparams)
# Run the training
train(**vars(args))
if __name__ == "__main__":
new_train()

View File

@ -0,0 +1,66 @@
import sys
import torch
import argparse
import numpy as np
from utils.hparams import HpsYaml
from models.ppg2mel.train.train_linglf02mel_seq2seq_oneshotvc import Solver
# For reproducibility; commenting these out may speed up training
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
def main():
    """Parse command-line options and run one-shot PPG2Mel VC training.

    Builds the argument parser, derives the ``gpu``, ``pin_memory`` and
    ``verbose`` attributes from their negated flags, loads the experiment
    config through ``HpsYaml``, seeds NumPy and PyTorch, then drives a
    ``Solver`` in ``"train"`` mode (``load_data`` -> ``set_model`` ->
    ``exec``). The process exits with status 0 when training returns.
    """
    # Arguments
    parser = argparse.ArgumentParser(
        description='Training PPG2Mel VC model.')
    # The config path is handed straight to HpsYaml below, so it cannot be
    # omitted; let argparse report that instead of failing later on None.
    parser.add_argument('--config', type=str, required=True,
                        help='Path to experiment config, e.g., config/vc.yaml')
    parser.add_argument('--name', default=None, type=str, help='Name for logging.')
    parser.add_argument('--logdir', default='log/', type=str,
                        help='Logging path.', required=False)
    # Checkpoints live under data/ckpt/<model> in the restructured layout.
    parser.add_argument('--ckpdir', default='data/ckpt/ppg2mel/', type=str,
                        help='Checkpoint path.', required=False)
    parser.add_argument('--outdir', default='result/', type=str,
                        help='Decode output path.', required=False)
    parser.add_argument('--load', default=None, type=str,
                        help='Load pre-trained model (for training only)', required=False)
    parser.add_argument('--warm_start', action='store_true',
                        help='Load model weights only, ignore specified layers.')
    parser.add_argument('--seed', default=0, type=int,
                        help='Random seed for reproducible results.', required=False)
    parser.add_argument('--njobs', default=8, type=int,
                        help='Number of threads for dataloader/decoding.', required=False)
    parser.add_argument('--cpu', action='store_true', help='Disable GPU training.')
    parser.add_argument('--no-pin', action='store_true',
                        help='Disable pin-memory for dataloader')
    parser.add_argument('--test', action='store_true', help='Test the model.')
    parser.add_argument('--no-msg', action='store_true', help='Hide all messages.')
    parser.add_argument('--finetune', action='store_true', help='Finetune model')
    parser.add_argument('--oneshotvc', action='store_true', help='Oneshot VC model')
    parser.add_argument('--bilstm', action='store_true', help='BiLSTM VC model')
    parser.add_argument('--lsa', action='store_true', help='Use location-sensitive attention (LSA)')

    paras = parser.parse_args()
    # Expose the positive form of each "disable" flag on the namespace.
    paras.gpu = not paras.cpu
    paras.pin_memory = not paras.no_pin
    paras.verbose = not paras.no_msg

    # Make the config dict dot visitable
    config = HpsYaml(paras.config)

    # Seed every random number generator used during training.
    np.random.seed(paras.seed)
    torch.manual_seed(paras.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(paras.seed)

    print(">>> OneShot VC training ...")
    mode = "train"
    solver = Solver(config, paras, mode)
    solver.load_data()
    solver.set_model()
    solver.exec()
    print(">>> Oneshot VC train finished!")
    sys.exit(0)
if __name__ == "__main__":
main()

View File

@ -1,5 +1,5 @@
from synthesizer.synthesize import run_synthesis
from synthesizer.hparams import hparams
from models.synthesizer.synthesize import run_synthesis
from models.synthesizer.hparams import hparams
from utils.argutils import print_args
import argparse
import os

View File

@ -1,7 +1,7 @@
from utils.argutils import print_args
from vocoder.wavernn.train import train
from vocoder.hifigan.train import train as train_hifigan
from vocoder.fregan.train import train as train_fregan
from models.vocoder.wavernn.train import train
from models.vocoder.hifigan.train import train as train_hifigan
from models.vocoder.fregan.train import train as train_fregan
from utils.util import AttrDict
from pathlib import Path
import argparse

View File

@ -2,22 +2,22 @@ from pydantic import BaseModel, Field
import os
from pathlib import Path
from enum import Enum
from encoder import inference as encoder
from models.encoder import inference as encoder
import librosa
from scipy.io.wavfile import write
import re
import numpy as np
from mkgui.base.components.types import FileContent
from vocoder.hifigan import inference as gan_vocoder
from synthesizer.inference import Synthesizer
from control.mkgui.base.components.types import FileContent
from models.vocoder.hifigan import inference as gan_vocoder
from models.synthesizer.inference import Synthesizer
from typing import Any, Tuple
import matplotlib.pyplot as plt
# Constants
AUDIO_SAMPLES_DIR = f"samples{os.sep}"
SYN_MODELS_DIRT = f"synthesizer{os.sep}saved_models"
ENC_MODELS_DIRT = f"encoder{os.sep}saved_models"
VOC_MODELS_DIRT = f"vocoder{os.sep}saved_models"
AUDIO_SAMPLES_DIR = f"data{os.sep}samples{os.sep}"
SYN_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}synthesizer"
ENC_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}encoder"
VOC_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}vocoder"
TEMP_SOURCE_AUDIO = f"wavs{os.sep}temp_source.wav"
TEMP_RESULT_AUDIO = f"wavs{os.sep}temp_result.wav"
if not os.path.isdir("wavs"):
@ -31,7 +31,7 @@ if os.path.isdir(SYN_MODELS_DIRT):
synthesizers = Enum('synthesizers', list((file.name, file) for file in Path(SYN_MODELS_DIRT).glob("**/*.pt")))
print("Loaded synthesizer models: " + str(len(synthesizers)))
else:
raise Exception(f"Model folder {SYN_MODELS_DIRT} doesn't exist.")
raise Exception(f"Model folder {SYN_MODELS_DIRT} doesn't exist. 请将模型文件位置移动到上述位置中进行重试!")
if os.path.isdir(ENC_MODELS_DIRT):
encoders = Enum('encoders', list((file.name, file) for file in Path(ENC_MODELS_DIRT).glob("**/*.pt")))
@ -46,15 +46,16 @@ else:
raise Exception(f"Model folder {VOC_MODELS_DIRT} doesn't exist.")
class Input(BaseModel):
message: str = Field(
..., example="欢迎使用工具箱, 现已支持中文输入!", alias="文本内容"
)
local_audio_file: audio_input_selection = Field(
..., alias="输入语音本地wav",
..., alias="选择语音本地wav",
description="选择本地语音文件."
)
record_audio_file: FileContent = Field(default=None, alias="录制语音",
description="录音.", is_recorder=True, mime_type="audio/wav")
upload_audio_file: FileContent = Field(default=None, alias="或上传语音",
description="拖拽或点击上传.", mime_type="audio/wav")
encoder: encoders = Field(
@ -104,7 +105,12 @@ def synthesize(input: Input) -> Output:
gan_vocoder.load_model(Path(input.vocoder.value))
# load file
if input.upload_audio_file != None:
if input.record_audio_file != None:
with open(TEMP_SOURCE_AUDIO, "w+b") as f:
f.write(input.record_audio_file.as_bytes())
f.seek(0)
wav, sample_rate = librosa.load(TEMP_SOURCE_AUDIO)
elif input.upload_audio_file != None:
with open(TEMP_SOURCE_AUDIO, "w+b") as f:
f.write(input.upload_audio_file.as_bytes())
f.seek(0)

View File

@ -1,27 +1,26 @@
from synthesizer.inference import Synthesizer
from pydantic import BaseModel, Field
from encoder import inference as speacker_encoder
import torch
import os
from pathlib import Path
from enum import Enum
import ppg_extractor as Extractor
import ppg2mel as Convertor
import librosa
from scipy.io.wavfile import write
import re
import numpy as np
from mkgui.base.components.types import FileContent
from vocoder.hifigan import inference as gan_vocoder
from pathlib import Path
from typing import Any, Tuple
import matplotlib.pyplot as plt
import librosa
import matplotlib.pyplot as plt
import torch
from pydantic import BaseModel, Field
from scipy.io.wavfile import write
import models.ppg2mel as Convertor
import models.ppg_extractor as Extractor
from control.mkgui.base.components.types import FileContent
from models.encoder import inference as speacker_encoder
from models.synthesizer.inference import Synthesizer
from models.vocoder.hifigan import inference as gan_vocoder
# Constants
AUDIO_SAMPLES_DIR = f'samples{os.sep}'
EXT_MODELS_DIRT = f'ppg_extractor{os.sep}saved_models'
CONV_MODELS_DIRT = f'ppg2mel{os.sep}saved_models'
VOC_MODELS_DIRT = f'vocoder{os.sep}saved_models'
AUDIO_SAMPLES_DIR = f'data{os.sep}samples{os.sep}'
EXT_MODELS_DIRT = f'data{os.sep}ckpt{os.sep}ppg_extractor'
CONV_MODELS_DIRT = f'data{os.sep}ckpt{os.sep}ppg2mel'
VOC_MODELS_DIRT = f'data{os.sep}ckpt{os.sep}vocoder'
TEMP_SOURCE_AUDIO = f'wavs{os.sep}temp_source.wav'
TEMP_TARGET_AUDIO = f'wavs{os.sep}temp_target.wav'
TEMP_RESULT_AUDIO = f'wavs{os.sep}temp_result.wav'
@ -132,9 +131,10 @@ def convert(input: Input) -> Output:
ppg = extractor.extract_from_wav(src_wav)
# Import necessary dependency of Voice Conversion
from utils.f0_utils import compute_f0, f02lf0, compute_mean_std, get_converted_lf0uv
from utils.f0_utils import (compute_f0, compute_mean_std, f02lf0,
get_converted_lf0uv)
ref_lf0_mean, ref_lf0_std = compute_mean_std(f02lf0(compute_f0(ref_wav)))
speacker_encoder.load_model(Path(f"encoder{os.sep}saved_models{os.sep}pretrained_bak_5805000.pt"))
speacker_encoder.load_model(Path(f"data{os.sep}ckpt{os.sep}encoder{os.sep}pretrained_bak_5805000.pt"))
embed = speacker_encoder.embed_utterance(ref_wav)
lf0_uv = get_converted_lf0uv(src_wav, ref_lf0_mean, ref_lf0_std, convert=True)
min_len = min(ppg.shape[1], len(lf0_uv))

View File

@ -37,6 +37,12 @@ def is_single_file_property(property: Dict) -> bool:
# TODO: binary?
return property.get("format") == "byte"
def is_single_autio_property(property: Dict) -> bool:
    """Tell whether a schema property is a string with format ``"bytes"``.

    Returns False when the property's ``type`` is not ``"string"``;
    otherwise returns whether its ``format`` equals ``"bytes"``.
    """
    # TODO: binary?
    is_string = property.get("type") == "string"
    return is_string and property.get("format") == "bytes"
def is_single_directory_property(property: Dict) -> bool:
if property.get("type") != "string":

View File

@ -14,14 +14,13 @@ from fastapi.encoders import jsonable_encoder
from loguru import logger
from pydantic import BaseModel, ValidationError, parse_obj_as
from mkgui.base import Opyrator
from mkgui.base.core import name_to_title
from mkgui.base.ui import schema_utils
from mkgui.base.ui.streamlit_utils import CUSTOM_STREAMLIT_CSS
from control.mkgui.base import Opyrator
from control.mkgui.base.core import name_to_title
from . import schema_utils
from .streamlit_utils import CUSTOM_STREAMLIT_CSS
STREAMLIT_RUNNER_SNIPPET = """
from mkgui.base.ui import render_streamlit_ui
from mkgui.base import Opyrator
from control.mkgui.base.ui import render_streamlit_ui
import streamlit as st
@ -243,7 +242,14 @@ class InputUI:
file_extension = None
if "mime_type" in property:
file_extension = mimetypes.guess_extension(property["mime_type"])
if "is_recorder" in property:
from audio_recorder_streamlit import audio_recorder
audio_bytes = audio_recorder()
if audio_bytes:
streamlit_app.audio(audio_bytes, format="audio/wav")
return audio_bytes
uploaded_file = streamlit_app.file_uploader(
**streamlit_kwargs, accept_multiple_files=False, type=file_extension
)
@ -263,6 +269,39 @@ class InputUI:
streamlit_app.video(bytes, format=property.get("mime_type"))
return bytes
def _render_single_audio_input(
    self, streamlit_app: st, key: str, property: Dict
) -> Any:
    """Show a recorder widget and hand back what it captured.

    Calls ``audio_recorder()`` from ``audio_recorder_streamlit``; when the
    result is truthy it is also played back through ``streamlit_app.audio``
    as ``audio/wav``. The recorder's return value is returned unchanged.
    ``key`` and ``property`` are accepted for signature parity with the
    other ``_render_*`` methods and are not used here.
    """
    # Imported lazily so the dependency is only needed when a recorder is rendered.
    from audio_recorder_streamlit import audio_recorder

    recording = audio_recorder()
    if not recording:
        return recording
    streamlit_app.audio(recording, format="audio/wav")
    return recording
def _render_single_string_input(
self, streamlit_app: st, key: str, property: Dict
) -> Any:
@ -807,21 +846,20 @@ class OutputUI:
def getOpyrator(mode: str) -> Opyrator:
    """Return the ``Opyrator`` wrapping the handler for the selected UI mode.

    The mode is matched by prefix. ``None`` or a mode starting with ``'VC'``
    selects voice conversion; any mode that matches no prefix falls back to
    speech synthesis. Handlers are imported lazily so only the selected
    mode's module is loaded.
    """
    # An unset mode resolves to voice conversion, as the first branch always did.
    if mode is None or mode.startswith('VC'):
        from control.mkgui.app_vc import convert
        return Opyrator(convert)
    if mode.startswith('预处理'):
        from control.mkgui.preprocess import preprocess
        return Opyrator(preprocess)
    # Test the longer prefix first: '模型训练(VC)' also starts with '模型训练',
    # so checking the shorter one first made this branch unreachable.
    if mode.startswith('模型训练(VC)'):
        from control.mkgui.train_vc import train_vc
        return Opyrator(train_vc)
    if mode.startswith('模型训练'):
        from control.mkgui.train import train
        return Opyrator(train)
    from control.mkgui.app import synthesize
    return Opyrator(synthesize)
def render_streamlit_ui() -> None:
# init
session_state = st.session_state
@ -845,7 +883,7 @@ def render_streamlit_ui() -> None:
col2.title(title)
col2.markdown("欢迎使用MockingBird Web 2")
image = Image.open(path.join('mkgui', 'static', 'mb.png'))
image = Image.open(path.join('control','mkgui', 'static', 'mb.png'))
col1.image(image)
st.markdown("---")
@ -853,6 +891,13 @@ def render_streamlit_ui() -> None:
with left:
st.header("Control 控制")
# if session_state.mode in ["AI拟音", "VC拟音"] :
# from audiorecorder import audiorecorder
# audio = audiorecorder("Click to record", "Recording...")
# if len(audio) > 0:
# # To play audio in frontend:
# st.audio(audio.tobytes())
InputUI(session_state=session_state, input_class=opyrator.input_type).render_ui(st)
execute_selected = st.button(opyrator.action)
if execute_selected:

View File

@ -6,8 +6,8 @@ from typing import Any, Tuple
# Constants
EXT_MODELS_DIRT = f"ppg_extractor{os.sep}saved_models"
ENC_MODELS_DIRT = f"encoder{os.sep}saved_models"
EXT_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}ppg_extractor"
ENC_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}encoder"
if os.path.isdir(EXT_MODELS_DIRT):
@ -83,7 +83,7 @@ def preprocess(input: Input) -> Output:
"""Preprocess(预处理)"""
finished = 0
if input.model == Model.VC_PPG2MEL:
from ppg2mel.preprocess import preprocess_dataset
from models.ppg2mel.preprocess import preprocess_dataset
finished = preprocess_dataset(
datasets_root=Path(input.datasets_root),
dataset=input.dataset,

View File

Before

Width:  |  Height:  |  Size: 5.6 KiB

After

Width:  |  Height:  |  Size: 5.6 KiB

View File

@ -3,17 +3,17 @@ import os
from pathlib import Path
from enum import Enum
from typing import Any
from synthesizer.hparams import hparams
from synthesizer.train import train as synt_train
from models.synthesizer.hparams import hparams
from models.synthesizer.train import train as synt_train
# Constants
SYN_MODELS_DIRT = f"synthesizer{os.sep}saved_models"
ENC_MODELS_DIRT = f"encoder{os.sep}saved_models"
SYN_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}synthesizer"
ENC_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}encoder"
# EXT_MODELS_DIRT = f"ppg_extractor{os.sep}saved_models"
# CONV_MODELS_DIRT = f"ppg2mel{os.sep}saved_models"
# ENC_MODELS_DIRT = f"encoder{os.sep}saved_models"
# EXT_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}ppg_extractor"
# CONV_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}ppg2mel"
# ENC_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}encoder"
# Pre-Load models
if os.path.isdir(SYN_MODELS_DIRT):
@ -96,7 +96,7 @@ def train(input: Input) -> Output:
synt_train(
input.run_id,
input.input_root,
f"synthesizer{os.sep}saved_models",
f"data{os.sep}ckpt{os.sep}synthesizer",
input.save_every,
input.backup_every,
input.log_every,

View File

@ -4,14 +4,14 @@ from pathlib import Path
from enum import Enum
from typing import Any, Tuple
import numpy as np
from utils.load_yaml import HpsYaml
from utils.hparams import HpsYaml
from utils.util import AttrDict
import torch
# Constants
EXT_MODELS_DIRT = f"ppg_extractor{os.sep}saved_models"
CONV_MODELS_DIRT = f"ppg2mel{os.sep}saved_models"
ENC_MODELS_DIRT = f"encoder{os.sep}saved_models"
EXT_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}ppg_extractor"
CONV_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}ppg2mel"
ENC_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}encoder"
if os.path.isdir(EXT_MODELS_DIRT):
@ -144,7 +144,7 @@ def train_vc(input: Input) -> Output:
if torch.cuda.is_available():
torch.cuda.manual_seed_all(input.seed)
mode = "train"
from ppg2mel.train.train_linglf02mel_seq2seq_oneshotvc import Solver
from models.ppg2mel.train.train_linglf02mel_seq2seq_oneshotvc import Solver
solver = Solver(config, params, mode)
solver.load_data()
solver.set_model()

View File

@ -1,12 +1,12 @@
from toolbox.ui import UI
from encoder import inference as encoder
from synthesizer.inference import Synthesizer
from vocoder.wavernn import inference as rnn_vocoder
from vocoder.hifigan import inference as gan_vocoder
from vocoder.fregan import inference as fgan_vocoder
from control.toolbox.ui import UI
from models.encoder import inference as encoder
from models.synthesizer.inference import Synthesizer
from models.vocoder.wavernn import inference as rnn_vocoder
from models.vocoder.hifigan import inference as gan_vocoder
from models.vocoder.fregan import inference as fgan_vocoder
from pathlib import Path
from time import perf_counter as timer
from toolbox.utterance import Utterance
from control.toolbox.utterance import Utterance
import numpy as np
import traceback
import sys
@ -38,7 +38,8 @@ recognized_datasets = [
"VoxCeleb2/dev/aac",
"VoxCeleb2/test/aac",
"VCTK-Corpus/wav48",
"aidatatang_200zh/corpus",
"aidatatang_200zh/corpus/test",
"aidatatang_200zh/corpus/train",
"aishell3/test/wav",
"magicdata/train",
]
@ -396,7 +397,7 @@ class Toolbox:
self.ui.log("Loading the extractor %s... " % model_fpath)
self.ui.set_loading(1)
start = timer()
import ppg_extractor as extractor
import models.ppg_extractor as extractor
self.extractor = extractor.load_model(model_fpath)
self.ui.log("Done (%dms)." % int(1000 * (timer() - start)), "append")
self.ui.set_loading(0)
@ -408,7 +409,7 @@ class Toolbox:
self.ui.log("Loading the convertor %s... " % model_fpath)
self.ui.set_loading(1)
start = timer()
import ppg2mel as convertor
import models.ppg2mel as convertor
self.convertor = convertor.load_model( model_fpath)
self.ui.log("Done (%dms)." % int(1000 * (timer() - start)), "append")
self.ui.set_loading(0)

View File

Before

Width:  |  Height:  |  Size: 5.6 KiB

After

Width:  |  Height:  |  Size: 5.6 KiB

View File

@ -3,9 +3,8 @@ from PyQt5 import QtGui
from PyQt5.QtWidgets import *
import matplotlib.pyplot as plt
from matplotlib.backends.backend_qt5agg import FigureCanvasQTAgg as FigureCanvas
from matplotlib.figure import Figure
from encoder.inference import plot_embedding_as_heatmap
from toolbox.utterance import Utterance
from models.encoder.inference import plot_embedding_as_heatmap
from control.toolbox.utterance import Utterance
from pathlib import Path
from typing import List, Set
import sounddevice as sd

View File

@ -1,5 +1,5 @@
from pathlib import Path
from toolbox import Toolbox
from control.toolbox import Toolbox
from utils.argutils import print_args
from utils.modelutils import check_model_paths
import argparse
@ -17,15 +17,15 @@ if __name__ == '__main__':
"supported datasets.", default=None)
parser.add_argument("-vc", "--vc_mode", action="store_true",
help="Voice Conversion Mode(PPG based)")
parser.add_argument("-e", "--enc_models_dir", type=Path, default="encoder/saved_models",
parser.add_argument("-e", "--enc_models_dir", type=Path, default=f"data{os.sep}ckpt{os.sep}encoder",
help="Directory containing saved encoder models")
parser.add_argument("-s", "--syn_models_dir", type=Path, default="synthesizer/saved_models",
parser.add_argument("-s", "--syn_models_dir", type=Path, default=f"data{os.sep}ckpt{os.sep}synthesizer",
help="Directory containing saved synthesizer models")
parser.add_argument("-v", "--voc_models_dir", type=Path, default="vocoder/saved_models",
parser.add_argument("-v", "--voc_models_dir", type=Path, default=f"data{os.sep}ckpt{os.sep}vocoder",
help="Directory containing saved vocoder models")
parser.add_argument("-ex", "--extractor_models_dir", type=Path, default="ppg_extractor/saved_models",
parser.add_argument("-ex", "--extractor_models_dir", type=Path, default=f"data{os.sep}ckpt{os.sep}ppg_extractor",
help="Directory containing saved extrator models")
parser.add_argument("-cv", "--convertor_models_dir", type=Path, default="ppg2mel/saved_models",
parser.add_argument("-cv", "--convertor_models_dir", type=Path, default=f"data{os.sep}ckpt{os.sep}ppg2mel",
help="Directory containing saved convert models")
parser.add_argument("--cpu", action="store_true", help=\
"If True, processing is done on CPU, even when a GPU is available.")

View File

@ -1,2 +0,0 @@
from encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset
from encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataLoader

Binary file not shown.

View File

@ -1,23 +1,15 @@
from encoder.params_model import model_embedding_size as speaker_embedding_size
from utils.argutils import print_args
from utils.modelutils import check_model_paths
from synthesizer.inference import Synthesizer
from encoder import inference as encoder
from vocoder.wavernn import inference as rnn_vocoder
from vocoder.hifigan import inference as gan_vocoder
from models.synthesizer.inference import Synthesizer
from models.encoder import inference as encoder
from models.vocoder.hifigan import inference as gan_vocoder
from pathlib import Path
import numpy as np
import soundfile as sf
import librosa
import argparse
import torch
import sys
import os
import re
import cn2an
import glob
from audioread.exceptions import NoBackendError
vocoder = gan_vocoder
def gen_one_wav(synthesizer, in_fpath, embed, texts, file_name, seq):

View File

@ -1,5 +1,5 @@
from scipy.ndimage.morphology import binary_dilation
from encoder.params_data import *
from models.encoder.params_data import *
from pathlib import Path
from typing import Optional, Union
from warnings import warn

View File

@ -0,0 +1,2 @@
from models.encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset
from models.encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataLoader

View File

@ -1,5 +1,5 @@
from encoder.data_objects.random_cycler import RandomCycler
from encoder.data_objects.utterance import Utterance
from models.encoder.data_objects.random_cycler import RandomCycler
from models.encoder.data_objects.utterance import Utterance
from pathlib import Path
# Contains the set of utterances of a single speaker

View File

@ -1,6 +1,6 @@
import numpy as np
from typing import List
from encoder.data_objects.speaker import Speaker
from models.encoder.data_objects.speaker import Speaker
class SpeakerBatch:
def __init__(self, speakers: List[Speaker], utterances_per_speaker: int, n_frames: int):

View File

@ -1,7 +1,7 @@
from encoder.data_objects.random_cycler import RandomCycler
from encoder.data_objects.speaker_batch import SpeakerBatch
from encoder.data_objects.speaker import Speaker
from encoder.params_data import partials_n_frames
from models.encoder.data_objects.random_cycler import RandomCycler
from models.encoder.data_objects.speaker_batch import SpeakerBatch
from models.encoder.data_objects.speaker import Speaker
from models.encoder.params_data import partials_n_frames
from torch.utils.data import Dataset, DataLoader
from pathlib import Path

View File

@ -1,8 +1,8 @@
from encoder.params_data import *
from encoder.model import SpeakerEncoder
from encoder.audio import preprocess_wav # We want to expose this function from here
from models.encoder.params_data import *
from models.encoder.model import SpeakerEncoder
from models.encoder.audio import preprocess_wav # We want to expose this function from here
from matplotlib import cm
from encoder import audio
from models.encoder import audio
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np

View File

@ -1,5 +1,5 @@
from encoder.params_model import *
from encoder.params_data import *
from models.encoder.params_model import *
from models.encoder.params_data import *
from scipy.interpolate import interp1d
from sklearn.metrics import roc_curve
from torch.nn.utils import clip_grad_norm_

View File

@ -1,8 +1,8 @@
from multiprocess.pool import ThreadPool
from encoder.params_data import *
from encoder.config import librispeech_datasets, anglophone_nationalites
from models.encoder.params_data import *
from models.encoder.config import librispeech_datasets, anglophone_nationalites
from datetime import datetime
from encoder import audio
from models.encoder import audio
from pathlib import Path
from tqdm import tqdm
import numpy as np
@ -22,7 +22,7 @@ class DatasetLog:
self._log_params()
def _log_params(self):
from encoder import params_data
from models.encoder import params_data
self.write_line("Parameter values:")
for param_name in (p for p in dir(params_data) if not p.startswith("__")):
value = getattr(params_data, param_name)

View File

@ -1,7 +1,7 @@
from encoder.visualizations import Visualizations
from encoder.data_objects import SpeakerVerificationDataLoader, SpeakerVerificationDataset
from encoder.params_model import *
from encoder.model import SpeakerEncoder
from models.encoder.visualizations import Visualizations
from models.encoder.data_objects import SpeakerVerificationDataLoader, SpeakerVerificationDataset
from models.encoder.params_model import *
from models.encoder.model import SpeakerEncoder
from utils.profiler import Profiler
from pathlib import Path
import torch

View File

@ -1,4 +1,4 @@
from encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset
from models.encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset
from datetime import datetime
from time import perf_counter as timer
import matplotlib.pyplot as plt
@ -65,8 +65,8 @@ class Visualizations:
def log_params(self):
if self.disabled:
return
from encoder import params_data
from encoder import params_model
from models.encoder import params_data
from models.encoder import params_model
param_string = "<b>Model parameters</b>:<br>"
for param_name in (p for p in dir(params_model) if not p.startswith("__")):
value = getattr(params_model, param_name)

View File

@ -15,7 +15,7 @@ from .rnn_decoder_mol import Decoder
from .utils.cnn_postnet import Postnet
from .utils.vc_utils import get_mask_from_lengths
from utils.load_yaml import HpsYaml
from utils.hparams import HpsYaml
class MelDecoderMOLv2(AbsMelDecoder):
"""Use an encoder to preprocess ppg."""

View File

@ -7,10 +7,10 @@ from pathlib import Path
import soundfile
import resampy
from ppg_extractor import load_model
from models.ppg_extractor import load_model
import encoder.inference as Encoder
from encoder.audio import preprocess_wav
from encoder import audio
from models.encoder.audio import preprocess_wav
from models.encoder import audio
from utils.f0_utils import compute_f0
from torch.multiprocessing import Pool, cpu_count

View File

@ -2,8 +2,8 @@ import sys
import torch
import argparse
import numpy as np
from utils.load_yaml import HpsYaml
from ppg2mel.train.train_linglf02mel_seq2seq_oneshotvc import Solver
from utils.hparams import HpsYaml
from models.ppg2mel.train.train_linglf02mel_seq2seq_oneshotvc import Solver
# For reproducibility; commenting these out may speed up training
torch.backends.cudnn.deterministic = True

View File

@ -8,7 +8,6 @@ from torch.utils.tensorboard import SummaryWriter
from .option import default_hparas
from utils.util import human_format, Timer
from utils.load_yaml import HpsYaml
class BaseSolver():

View File

@ -14,7 +14,7 @@ from utils.data_load import OneshotVcDataset, MultiSpkVcCollate
from .loss import MaskedMSELoss
from .optim import Optimizer
from utils.util import human_format
from ppg2mel import MelDecoderMOLv2
from models.ppg2mel import MelDecoderMOLv2
class Solver(BaseSolver):

View File

View File

View File

@ -1,36 +1,4 @@
import ast
import pprint
import json
class HParams(object):
    """Attribute bag of hyperparameters with item access and text overrides.

    Keyword arguments become instance attributes. Values can be read and
    written either as attributes or with ``hp[key]``.
    """

    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)

    def __setitem__(self, key, value):
        setattr(self, key, value)

    def __getitem__(self, key):
        return getattr(self, key)

    def __repr__(self):
        return pprint.pformat(self.__dict__)

    def parse(self, string):
        """Override hparams from a comma-separated string of name=value pairs.

        Each value is evaluated with ``ast.literal_eval``. Only the first
        ``=`` of an item separates name from value, so values may contain
        ``=``. When a name is repeated, the last value wins. Nothing is
        modified if any item is malformed.

        Returns:
            ``self``, to allow chaining.

        Raises:
            ValueError: if an item has no ``=``, or (from ``literal_eval``)
                if a value is not a valid Python literal.
        """
        if not string:
            return self
        overrides = {}
        for item in string.split(","):
            name, sep, value = item.partition("=")
            if not sep:
                raise ValueError(f"Expected name=value, got {item!r}")
            overrides[name.strip()] = ast.literal_eval(value.strip())
        # Apply only after every item parsed, so a bad item leaves self untouched.
        self.__dict__.update(overrides)
        return self

    def loadJson(self, dict):
        """Copy entries of an already-loaded JSON mapping onto this object.

        The keys ``tts_schedule`` and ``tts_finetune_layers`` are skipped.
        The parameter keeps its historical name ``dict`` (which shadows the
        builtin inside this method) so keyword callers keep working.

        Returns:
            ``self``, to allow chaining.
        """
        print(f"\nLoading the json with {dict}\n")
        skipped = ("tts_schedule", "tts_finetune_layers")
        for k, v in dict.items():
            if k not in skipped:
                self.__dict__[k] = v
        return self

    def dumpJson(self, fp):
        """Write all attributes as JSON to ``fp``.

        ``fp`` must provide ``open("w", encoding=...)``, e.g. a
        ``pathlib.Path``.

        Returns:
            ``self``, to allow chaining.
        """
        print(f"\nSaving the json with {fp}\n")
        with fp.open("w", encoding="utf-8") as f:
            json.dump(self.__dict__, f)
        return self
from utils.hparams import HParams
hparams = HParams(
### Signal Processing (used in both synthesizer and vocoder)
@ -104,7 +72,7 @@ hparams = HParams(
### SV2TTS
speaker_embedding_size = 256, # Dimension for the speaker embedding
silence_min_duration_split = 0.4, # Duration in seconds of a silence for an utterance to be split
utterance_min_duration = 1.6, # Duration in seconds below which utterances are discarded
utterance_min_duration = 0.5, # Duration in seconds below which utterances are discarded
use_gst = True, # Whether to use global style token
use_ser_for_gst = True, # Whether to use speaker embedding referenced for global style token
)

View File

@ -1,16 +1,15 @@
import torch
from synthesizer import audio
from synthesizer.hparams import hparams
from synthesizer.models.tacotron import Tacotron
from synthesizer.utils.symbols import symbols
from synthesizer.utils.text import text_to_sequence
from vocoder.display import simple_table
from models.synthesizer import audio
from models.synthesizer.hparams import hparams
from models.synthesizer.models.tacotron import Tacotron
from models.synthesizer.utils.symbols import symbols
from models.synthesizer.utils.text import text_to_sequence
from models.vocoder.display import simple_table
from pathlib import Path
from typing import Union, List
import numpy as np
import librosa
from utils import logmmse
import json
from pypinyin import lazy_pinyin, Style
class Synthesizer:
@ -48,8 +47,7 @@ class Synthesizer:
# Try to scan config file
model_config_fpaths = list(self.model_fpath.parent.rglob("*.json"))
if len(model_config_fpaths)>0 and model_config_fpaths[0].exists():
with model_config_fpaths[0].open("r", encoding="utf-8") as f:
hparams.loadJson(json.load(f))
hparams.loadJson(model_config_fpaths[0])
"""
Instantiates and loads the model given the weights file that was passed in the constructor.
"""

Some files were not shown because too many files have changed in this diff Show More