mirror of
https://github.com/babysor/MockingBird.git
synced 2024-03-22 13:11:31 +08:00
Add UI usage of PPG-vc
This commit is contained in:
parent
6befb700e9
commit
d786e78121
8
.vscode/launch.json
vendored
8
.vscode/launch.json
vendored
@ -35,6 +35,14 @@
|
||||
"console": "integratedTerminal",
|
||||
"args": ["-d","..\\audiodata"]
|
||||
},
|
||||
{
|
||||
"name": "Python: Demo Box VC",
|
||||
"type": "python",
|
||||
"request": "launch",
|
||||
"program": "demo_toolbox.py",
|
||||
"console": "integratedTerminal",
|
||||
"args": ["-d","..\\audiodata","-vc"]
|
||||
},
|
||||
{
|
||||
"name": "Python: Synth Train",
|
||||
"type": "python",
|
||||
|
@ -15,6 +15,8 @@ if __name__ == '__main__':
|
||||
parser.add_argument("-d", "--datasets_root", type=Path, help= \
|
||||
"Path to the directory containing your datasets. See toolbox/__init__.py for a list of "
|
||||
"supported datasets.", default=None)
|
||||
parser.add_argument("-vc", "--vc_mode", action="store_true",
|
||||
help="Voice Conversion Mode(PPG based)")
|
||||
parser.add_argument("-e", "--enc_models_dir", type=Path, default="encoder/saved_models",
|
||||
help="Directory containing saved encoder models")
|
||||
parser.add_argument("-s", "--syn_models_dir", type=Path, default="synthesizer/saved_models",
|
||||
|
@ -50,8 +50,9 @@ recognized_datasets = [
|
||||
MAX_WAVES = 15
|
||||
|
||||
class Toolbox:
|
||||
def __init__(self, datasets_root, enc_models_dir, syn_models_dir, voc_models_dir, extractor_models_dir, convertor_models_dir, seed, no_mp3_support):
|
||||
def __init__(self, datasets_root, enc_models_dir, syn_models_dir, voc_models_dir, extractor_models_dir, convertor_models_dir, seed, no_mp3_support, vc_mode):
|
||||
self.no_mp3_support = no_mp3_support
|
||||
self.vc_mode = vc_mode
|
||||
sys.excepthook = self.excepthook
|
||||
self.datasets_root = datasets_root
|
||||
self.utterances = set()
|
||||
@ -76,7 +77,7 @@ class Toolbox:
|
||||
self.trim_silences = False
|
||||
|
||||
# Initialize the events and the interface
|
||||
self.ui = UI()
|
||||
self.ui = UI(vc_mode)
|
||||
self.style_idx = 0
|
||||
self.reset_ui(enc_models_dir, syn_models_dir, voc_models_dir, extractor_models_dir, convertor_models_dir, seed)
|
||||
self.setup_events()
|
||||
@ -102,9 +103,12 @@ class Toolbox:
|
||||
self.ui.encoder_box.currentIndexChanged.connect(self.init_encoder)
|
||||
def func():
|
||||
self.synthesizer = None
|
||||
self.ui.synthesizer_box.currentIndexChanged.connect(func)
|
||||
if self.vc_mode:
|
||||
self.ui.extractor_box.currentIndexChanged.connect(self.init_extractor)
|
||||
else:
|
||||
self.ui.synthesizer_box.currentIndexChanged.connect(func)
|
||||
|
||||
self.ui.vocoder_box.currentIndexChanged.connect(self.init_vocoder)
|
||||
self.ui.extractor_box.currentIndexChanged.connect(self.init_extractor)
|
||||
|
||||
# Utterance selection
|
||||
func = lambda: self.load_from_browser(self.ui.browse_file())
|
||||
@ -117,8 +121,9 @@ class Toolbox:
|
||||
self.ui.record_button.clicked.connect(self.record)
|
||||
|
||||
# Source Utterance selection
|
||||
func = lambda: self.load_soruce_button(self.ui.selected_utterance)
|
||||
self.ui.load_soruce_button.clicked.connect(func)
|
||||
if self.vc_mode:
|
||||
func = lambda: self.load_soruce_button(self.ui.selected_utterance)
|
||||
self.ui.load_soruce_button.clicked.connect(func)
|
||||
|
||||
#Audio
|
||||
self.ui.setup_audio_devices(Synthesizer.sample_rate)
|
||||
@ -131,14 +136,16 @@ class Toolbox:
|
||||
self.ui.waves_cb.currentIndexChanged.connect(self.set_current_wav)
|
||||
|
||||
# Generation
|
||||
func = lambda: self.synthesize() or self.vocode()
|
||||
self.ui.generate_button.clicked.connect(func)
|
||||
self.ui.synthesize_button.clicked.connect(self.synthesize)
|
||||
self.ui.vocode_button.clicked.connect(self.vocode)
|
||||
self.ui.random_seed_checkbox.clicked.connect(self.update_seed_textbox)
|
||||
|
||||
func = lambda: self.convert() or self.vocode()
|
||||
self.ui.convert_button.clicked.connect(func)
|
||||
if self.vc_mode:
|
||||
func = lambda: self.convert() or self.vocode()
|
||||
self.ui.convert_button.clicked.connect(func)
|
||||
else:
|
||||
func = lambda: self.synthesize() or self.vocode()
|
||||
self.ui.generate_button.clicked.connect(func)
|
||||
self.ui.synthesize_button.clicked.connect(self.synthesize)
|
||||
|
||||
# UMAP legend
|
||||
self.ui.clear_button.clicked.connect(self.clear_utterances)
|
||||
@ -154,7 +161,7 @@ class Toolbox:
|
||||
|
||||
def reset_ui(self, encoder_models_dir, synthesizer_models_dir, vocoder_models_dir, extractor_models_dir, convertor_models_dir, seed):
|
||||
self.ui.populate_browser(self.datasets_root, recognized_datasets, 0, True)
|
||||
self.ui.populate_models(encoder_models_dir, synthesizer_models_dir, vocoder_models_dir, extractor_models_dir, convertor_models_dir)
|
||||
self.ui.populate_models(encoder_models_dir, synthesizer_models_dir, vocoder_models_dir, extractor_models_dir, convertor_models_dir, self.vc_mode)
|
||||
self.ui.populate_gen_options(seed, self.trim_silences)
|
||||
|
||||
def load_from_browser(self, fpath=None):
|
||||
@ -213,7 +220,7 @@ class Toolbox:
|
||||
# Add the utterance
|
||||
utterance = Utterance(name, speaker_name, wav, spec, embed, partial_embeds, False)
|
||||
self.utterances.add(utterance)
|
||||
self.ui.register_utterance(utterance)
|
||||
self.ui.register_utterance(utterance, self.vc_mode)
|
||||
|
||||
# Plot it
|
||||
self.ui.draw_embed(embed, name, "current")
|
||||
|
107
toolbox/ui.py
107
toolbox/ui.py
@ -335,41 +335,42 @@ class UI(QDialog):
|
||||
return self.convertor_box.itemData(self.convertor_box.currentIndex())
|
||||
|
||||
def populate_models(self, encoder_models_dir: Path, synthesizer_models_dir: Path,
|
||||
vocoder_models_dir: Path, extractor_models_dir: Path, convertor_models_dir: Path):
|
||||
vocoder_models_dir: Path, extractor_models_dir: Path, convertor_models_dir: Path, vc_mode: bool):
|
||||
# Encoder
|
||||
encoder_fpaths = list(encoder_models_dir.glob("*.pt"))
|
||||
if len(encoder_fpaths) == 0:
|
||||
raise Exception("No encoder models found in %s" % encoder_models_dir)
|
||||
self.repopulate_box(self.encoder_box, [(f.stem, f) for f in encoder_fpaths])
|
||||
|
||||
# Synthesizer
|
||||
synthesizer_fpaths = list(synthesizer_models_dir.glob("**/*.pt"))
|
||||
if len(synthesizer_fpaths) == 0:
|
||||
raise Exception("No synthesizer models found in %s" % synthesizer_models_dir)
|
||||
self.repopulate_box(self.synthesizer_box, [(f.stem, f) for f in synthesizer_fpaths])
|
||||
if vc_mode:
|
||||
# Extractor
|
||||
extractor_fpaths = list(extractor_models_dir.glob("*.pt"))
|
||||
if len(extractor_fpaths) == 0:
|
||||
self.log("No extractor models found in %s" % extractor_fpaths)
|
||||
self.repopulate_box(self.extractor_box, [(f.stem, f) for f in extractor_fpaths])
|
||||
|
||||
# Convertor
|
||||
convertor_fpaths = list(convertor_models_dir.glob("*.pth"))
|
||||
if len(convertor_fpaths) == 0:
|
||||
self.log("No convertor models found in %s" % convertor_fpaths)
|
||||
self.repopulate_box(self.convertor_box, [(f.stem, f) for f in convertor_fpaths])
|
||||
else:
|
||||
# Synthesizer
|
||||
synthesizer_fpaths = list(synthesizer_models_dir.glob("**/*.pt"))
|
||||
if len(synthesizer_fpaths) == 0:
|
||||
raise Exception("No synthesizer models found in %s" % synthesizer_models_dir)
|
||||
self.repopulate_box(self.synthesizer_box, [(f.stem, f) for f in synthesizer_fpaths])
|
||||
|
||||
# Vocoder
|
||||
vocoder_fpaths = list(vocoder_models_dir.glob("**/*.pt"))
|
||||
vocoder_items = [(f.stem, f) for f in vocoder_fpaths] + [("Griffin-Lim", None)]
|
||||
self.repopulate_box(self.vocoder_box, vocoder_items)
|
||||
|
||||
# Extractor
|
||||
extractor_fpaths = list(extractor_models_dir.glob("*.pt"))
|
||||
if len(extractor_fpaths) == 0:
|
||||
self.log("No extractor models found in %s" % extractor_fpaths)
|
||||
self.repopulate_box(self.extractor_box, [(f.stem, f) for f in extractor_fpaths])
|
||||
|
||||
# Convertor
|
||||
convertor_fpaths = list(convertor_models_dir.glob("*.pth"))
|
||||
if len(convertor_fpaths) == 0:
|
||||
self.log("No convertor models found in %s" % convertor_fpaths)
|
||||
self.repopulate_box(self.convertor_box, [(f.stem, f) for f in convertor_fpaths])
|
||||
|
||||
@property
|
||||
def selected_utterance(self):
|
||||
return self.utterance_history.itemData(self.utterance_history.currentIndex())
|
||||
|
||||
def register_utterance(self, utterance: Utterance):
|
||||
def register_utterance(self, utterance: Utterance, vc_mode):
|
||||
self.utterance_history.blockSignals(True)
|
||||
self.utterance_history.insertItem(0, utterance.name, utterance)
|
||||
self.utterance_history.setCurrentIndex(0)
|
||||
@ -379,8 +380,11 @@ class UI(QDialog):
|
||||
self.utterance_history.removeItem(self.max_saved_utterances)
|
||||
|
||||
self.play_button.setDisabled(False)
|
||||
self.generate_button.setDisabled(False)
|
||||
self.synthesize_button.setDisabled(False)
|
||||
if vc_mode:
|
||||
self.convert_button.setDisabled(False)
|
||||
else:
|
||||
self.generate_button.setDisabled(False)
|
||||
self.synthesize_button.setDisabled(False)
|
||||
|
||||
def log(self, line, mode="newline"):
|
||||
if mode == "newline":
|
||||
@ -422,7 +426,7 @@ class UI(QDialog):
|
||||
else:
|
||||
self.seed_textbox.setEnabled(False)
|
||||
|
||||
def reset_interface(self):
|
||||
def reset_interface(self, vc_mode):
|
||||
self.draw_embed(None, None, "current")
|
||||
self.draw_embed(None, None, "generated")
|
||||
self.draw_spec(None, "current")
|
||||
@ -430,14 +434,17 @@ class UI(QDialog):
|
||||
self.draw_umap_projections(set())
|
||||
self.set_loading(0)
|
||||
self.play_button.setDisabled(True)
|
||||
self.generate_button.setDisabled(True)
|
||||
self.synthesize_button.setDisabled(True)
|
||||
if vc_mode:
|
||||
self.convert_button.setDisabled(True)
|
||||
else:
|
||||
self.generate_button.setDisabled(True)
|
||||
self.synthesize_button.setDisabled(True)
|
||||
self.vocode_button.setDisabled(True)
|
||||
self.replay_wav_button.setDisabled(True)
|
||||
self.export_wav_button.setDisabled(True)
|
||||
[self.log("") for _ in range(self.max_log_lines)]
|
||||
|
||||
def __init__(self):
|
||||
def __init__(self, vc_mode):
|
||||
## Initialize the application
|
||||
self.app = QApplication(sys.argv)
|
||||
super().__init__(None)
|
||||
@ -530,8 +537,9 @@ class UI(QDialog):
|
||||
browser_layout.addWidget(self.play_button, i, 2)
|
||||
self.stop_button = QPushButton("Stop(暂停)")
|
||||
browser_layout.addWidget(self.stop_button, i, 3)
|
||||
self.load_soruce_button = QPushButton("Select(选择为被转换的语音输入)")
|
||||
browser_layout.addWidget(self.load_soruce_button, i, 4)
|
||||
if vc_mode:
|
||||
self.load_soruce_button = QPushButton("Select(选择为被转换的语音输入)")
|
||||
browser_layout.addWidget(self.load_soruce_button, i, 4)
|
||||
|
||||
i += 1
|
||||
model_groupbox = QGroupBox('Models(模型选择)')
|
||||
@ -544,19 +552,20 @@ class UI(QDialog):
|
||||
model_layout.addWidget(QLabel("Encoder:"))
|
||||
model_layout.addWidget(self.encoder_box)
|
||||
self.synthesizer_box = QComboBox()
|
||||
model_layout.addWidget(QLabel("Synthesizer:"))
|
||||
model_layout.addWidget(self.synthesizer_box)
|
||||
if vc_mode:
|
||||
self.extractor_box = QComboBox()
|
||||
model_layout.addWidget(QLabel("Extractor:"))
|
||||
model_layout.addWidget(self.extractor_box)
|
||||
self.convertor_box = QComboBox()
|
||||
model_layout.addWidget(QLabel("Convertor:"))
|
||||
model_layout.addWidget(self.convertor_box)
|
||||
else:
|
||||
model_layout.addWidget(QLabel("Synthesizer:"))
|
||||
model_layout.addWidget(self.synthesizer_box)
|
||||
self.vocoder_box = QComboBox()
|
||||
model_layout.addWidget(QLabel("Vocoder:"))
|
||||
model_layout.addWidget(self.vocoder_box)
|
||||
|
||||
self.extractor_box = QComboBox()
|
||||
model_layout.addWidget(QLabel("Extractor:"))
|
||||
model_layout.addWidget(self.extractor_box)
|
||||
self.convertor_box = QComboBox()
|
||||
model_layout.addWidget(QLabel("Convertor:"))
|
||||
model_layout.addWidget(self.convertor_box)
|
||||
|
||||
#Replay & Save Audio
|
||||
i = 0
|
||||
output_layout.addWidget(QLabel("<b>Toolbox Output:</b>"), i, 0)
|
||||
@ -578,7 +587,7 @@ class UI(QDialog):
|
||||
|
||||
## Embed & spectrograms
|
||||
vis_layout.addStretch()
|
||||
|
||||
# TODO: add spectrograms for source
|
||||
gridspec_kw = {"width_ratios": [1, 4]}
|
||||
fig, self.current_ax = plt.subplots(1, 2, figsize=(10, 2.25), facecolor="#F0F0F0",
|
||||
gridspec_kw=gridspec_kw)
|
||||
@ -599,20 +608,22 @@ class UI(QDialog):
|
||||
self.text_prompt = QPlainTextEdit(default_text)
|
||||
gen_layout.addWidget(self.text_prompt, stretch=1)
|
||||
|
||||
self.generate_button = QPushButton("Synthesize and vocode")
|
||||
gen_layout.addWidget(self.generate_button)
|
||||
|
||||
layout = QHBoxLayout()
|
||||
self.synthesize_button = QPushButton("Synthesize only")
|
||||
layout.addWidget(self.synthesize_button)
|
||||
if vc_mode:
|
||||
layout = QHBoxLayout()
|
||||
self.convert_button = QPushButton("Extract and Convert")
|
||||
layout.addWidget(self.convert_button)
|
||||
gen_layout.addLayout(layout)
|
||||
else:
|
||||
self.generate_button = QPushButton("Synthesize and vocode")
|
||||
gen_layout.addWidget(self.generate_button)
|
||||
layout = QHBoxLayout()
|
||||
self.synthesize_button = QPushButton("Synthesize only")
|
||||
layout.addWidget(self.synthesize_button)
|
||||
|
||||
self.vocode_button = QPushButton("Vocode only")
|
||||
layout.addWidget(self.vocode_button)
|
||||
gen_layout.addLayout(layout)
|
||||
|
||||
layout = QHBoxLayout()
|
||||
self.convert_button = QPushButton("Extract and Convert")
|
||||
layout.addWidget(self.convert_button)
|
||||
gen_layout.addLayout(layout)
|
||||
|
||||
layout_seed = QGridLayout()
|
||||
self.random_seed_checkbox = QCheckBox("Random seed:")
|
||||
@ -681,7 +692,7 @@ class UI(QDialog):
|
||||
self.resize(max_size)
|
||||
|
||||
## Finalize the display
|
||||
self.reset_interface()
|
||||
self.reset_interface(vc_mode)
|
||||
self.show()
|
||||
|
||||
def start(self):
|
||||
|
Loading…
x
Reference in New Issue
Block a user