import os
import re
from enum import Enum
from pathlib import Path
from typing import Any

import librosa
import matplotlib.pyplot as plt
import numpy as np
from pydantic import BaseModel, Field
from scipy.io.wavfile import write

from encoder import inference as encoder
from mkgui.base.components.types import FileContent
from synthesizer.inference import Synthesizer
from vocoder.hifigan import inference as gan_vocoder

# Constants (forward slashes work on both Windows and POSIX)
AUDIO_SAMPLES_DIR = "samples/"
SYN_MODELS_DIRT = "synthesizer/saved_models"
ENC_MODELS_DIRT = "encoder/saved_models"
VOC_MODELS_DIRT = "vocoder/saved_models"
TEMP_SOURCE_AUDIO = "wavs/temp_source.wav"
TEMP_RESULT_AUDIO = "wavs/temp_result.wav"

# Load local sample audio as options TODO: load dataset
if os.path.isdir(AUDIO_SAMPLES_DIR):
    audio_input_selection = Enum('samples', list((file.name, file) for file in Path(AUDIO_SAMPLES_DIR).glob("*.wav")))
else:
    raise Exception(f"Sample folder {AUDIO_SAMPLES_DIR} doesn't exist.")

# Pre-load models
if os.path.isdir(SYN_MODELS_DIRT):
    synthesizers = Enum('synthesizers', list((file.name, file) for file in Path(SYN_MODELS_DIRT).glob("**/*.pt")))
    print("Loaded synthesizer models: " + str(len(synthesizers)))
else:
    raise Exception(f"Model folder {SYN_MODELS_DIRT} doesn't exist.")

if os.path.isdir(ENC_MODELS_DIRT):
    encoders = Enum('encoders', list((file.name, file) for file in Path(ENC_MODELS_DIRT).glob("**/*.pt")))
    print("Loaded encoder models: " + str(len(encoders)))
else:
    raise Exception(f"Model folder {ENC_MODELS_DIRT} doesn't exist.")

if os.path.isdir(VOC_MODELS_DIRT):
    vocoders = Enum('vocoders', list((file.name, file) for file in Path(VOC_MODELS_DIRT).glob("**/*gan*.pt")))
    print("Loaded vocoder models: " + str(len(vocoders)))
else:
    raise Exception(f"Model folder {VOC_MODELS_DIRT} doesn't exist.")

class Input(BaseModel):
    message: str = Field(
        ..., example="欢迎使用工具箱, 现已支持中文输入!", alias="文本内容"
    )  # alias: "text content"; example: "Welcome to the toolbox, Chinese input is now supported!"
    local_audio_file: audio_input_selection = Field(
        ..., alias="输入语音(本地wav)",
        description="选择本地语音文件."
    )  # alias: "input audio (local wav)"; description: "Select a local audio file."
    upload_audio_file: FileContent = Field(
        default=None, alias="或上传语音",
        description="拖拽或点击上传.", mime_type="audio/wav"
    )  # alias: "or upload audio"; description: "Drag and drop or click to upload."
    encoder: encoders = Field(
        ..., alias="编码模型",
        description="选择语音编码模型文件."
    )  # alias: "encoder model"; description: "Select a speaker encoder model file."
    synthesizer: synthesizers = Field(
        ..., alias="合成模型",
        description="选择语音合成模型文件."
    )  # alias: "synthesizer model"; description: "Select a synthesizer model file."
    vocoder: vocoders = Field(
        ..., alias="语音解码模型",
        description="选择语音解码模型文件(目前只支持HifiGan类型)."
    )  # alias: "vocoder model"; description: "Select a vocoder model file (currently only HifiGan is supported)."


class AudioEntity(BaseModel):
    content: bytes
    mel: Any


class Output(BaseModel):
    __root__: tuple[AudioEntity, AudioEntity]

    def render_output_ui(self, streamlit_app, input) -> None:  # type: ignore
        """Custom output UI.
        If this method is implemented, it will be used instead of the default Output UI renderer.
        """
        src, result = self.__root__

        streamlit_app.subheader("Synthesized Audio")
        streamlit_app.audio(result.content, format="audio/wav")

        # Plot the mel spectrograms of the source and result audio
        fig, ax = plt.subplots()
        ax.imshow(src.mel, aspect="equal", interpolation="none")
        ax.set_title("mel spectrogram (Source Audio)")
        streamlit_app.pyplot(fig)
        fig, ax = plt.subplots()
        ax.imshow(result.mel, aspect="equal", interpolation="none")
        ax.set_title("mel spectrogram (Result Audio)")
        streamlit_app.pyplot(fig)


def synthesize(input: Input) -> Output:
    """synthesize(合成)"""
    # load models
    encoder.load_model(Path(input.encoder.value))
    current_synt = Synthesizer(Path(input.synthesizer.value))
    gan_vocoder.load_model(Path(input.vocoder.value))
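    # Note: all three models are reloaded from disk on each call. A minimal
    # caching sketch (assumption: the selected paths uniquely identify the
    # models; the names below are hypothetical) could look like:
    #
    #     _synt_cache = {}
    #     def _get_synt(path: Path) -> Synthesizer:
    #         if path not in _synt_cache:
    #             _synt_cache[path] = Synthesizer(path)
    #         return _synt_cache[path]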

    # load file
    if input.upload_audio_file is not None:
        with open(TEMP_SOURCE_AUDIO, "w+b") as f:
            f.write(input.upload_audio_file.as_bytes())
            f.seek(0)
        wav, sample_rate = librosa.load(TEMP_SOURCE_AUDIO)
    else:
        wav, sample_rate = librosa.load(input.local_audio_file.value)
        write(TEMP_SOURCE_AUDIO, sample_rate, wav)  # Make sure we get the correct wav
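    # librosa.load resamples to librosa's default sr (22050 Hz) unless sr=None
    # is passed, so sample_rate may differ from the file's native rate.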

    source_spec = Synthesizer.make_spectrogram(wav)

    # preprocess
    encoder_wav = encoder.preprocess_wav(wav, sample_rate)
    embed, _, _ = encoder.embed_utterance(encoder_wav, return_partials=True)  # speaker embedding, reused for each text line

    # Load input text: split on newlines, then on CJK/ASCII punctuation
    texts = filter(None, input.message.split("\n"))
    punctuation = '!,。、,'  # characters used to split and clean the text
    processed_texts = []
    for text in texts:
        for processed_text in re.sub(r'[{}]+'.format(punctuation), '\n', text).split('\n'):
            if processed_text:
                processed_texts.append(processed_text.strip())
    texts = processed_texts
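    # e.g. "你好。世界!" becomes ["你好", "世界"]; each piece is synthesized
    # as its own spectrogram below.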

    # synthesize and vocode
    embeds = [embed] * len(texts)
    specs = current_synt.synthesize_spectrograms(texts, embeds)
    spec = np.concatenate(specs, axis=1)  # join the per-piece mels along the time axis
    wav, sample_rate = gan_vocoder.infer_waveform(spec)

    # write and output
    write(TEMP_RESULT_AUDIO, sample_rate, wav)  # Make sure we get the correct wav

    with open(TEMP_SOURCE_AUDIO, "rb") as f:
        source_file = f.read()
    with open(TEMP_RESULT_AUDIO, "rb") as f:
        result_file = f.read()

    return Output(__root__=(AudioEntity(content=source_file, mel=source_spec), AudioEntity(content=result_file, mel=spec)))
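

# Minimal usage sketch (assumption: this module is normally driven by the mkgui
# front end, which builds Input from the form; running it directly is only for
# illustration and requires the model, sample, and wavs/ folders above to
# exist; the output file name is hypothetical). Fields are passed by their
# pydantic aliases.
if __name__ == "__main__":
    demo_input = Input(**{
        "文本内容": "欢迎使用工具箱, 现已支持中文输入!",
        "输入语音(本地wav)": list(audio_input_selection)[0],
        "编码模型": list(encoders)[0],
        "合成模型": list(synthesizers)[0],
        "语音解码模型": list(vocoders)[0],
    })
    source, result = synthesize(demo_input).__root__
    with open("demo_result.wav", "wb") as f:
        f.write(result.content)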