2022-04-02 18:11:49 +08:00
from pydantic import BaseModel, Field
import os
from pathlib import Path
from enum import Enum
2022-12-03 16:54:06 +08:00
from models.encoder import inference as encoder
2022-04-02 18:11:49 +08:00
import librosa
from scipy.io.wavfile import write
import re
import numpy as np
2022-12-03 16:54:06 +08:00
from control.mkgui.base.components.types import FileContent
from models.vocoder.hifigan import inference as gan_vocoder
from models.synthesizer.inference import Synthesizer
2022-06-18 23:46:44 +08:00
from typing import Any, Tuple
2022-05-09 18:44:02 +08:00
import matplotlib.pyplot as plt
2022-04-02 18:11:49 +08:00
# Constants
AUDIO_SAMPLES_DIR = f"data{os.sep}samples{os.sep}"
SYN_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}synthesizer"
ENC_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}encoder"
VOC_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}vocoder"
TEMP_SOURCE_AUDIO = f"wavs{os.sep}temp_source.wav"
TEMP_RESULT_AUDIO = f"wavs{os.sep}temp_result.wav"

# Both temp files above live under "wavs", so make sure the folder exists
# before anything tries to write into it.
if not os.path.isdir("wavs"):
    os.makedirs("wavs", exist_ok=True)

# Load local sample audio as options TODO: load dataset
# NOTE(review): when the samples folder is missing, audio_input_selection is
# never bound and any later reference to it fails with NameError - confirm
# whether a clearer error (as for the model folders below) is wanted.
if os.path.isdir(AUDIO_SAMPLES_DIR):
    audio_input_selection = Enum(
        'samples',
        [(file.name, file) for file in Path(AUDIO_SAMPLES_DIR).glob("*.wav")],
    )

# Pre-Load models: every checkpoint found on disk becomes one Enum member
# (name = file name, value = path). A missing folder aborts the import.
if os.path.isdir(SYN_MODELS_DIRT):
    synthesizers = Enum(
        'synthesizers',
        [(file.name, file) for file in Path(SYN_MODELS_DIRT).glob("**/*.pt")],
    )
    print("Loaded synthesizer models: " + str(len(synthesizers)))
else:
    raise Exception(f"Model folder {SYN_MODELS_DIRT} doesn't exist. 请将模型文件位置移动到上述位置中进行重试!")

if os.path.isdir(ENC_MODELS_DIRT):
    encoders = Enum(
        'encoders',
        [(file.name, file) for file in Path(ENC_MODELS_DIRT).glob("**/*.pt")],
    )
    print("Loaded encoders models: " + str(len(encoders)))
else:
    raise Exception(f"Model folder {ENC_MODELS_DIRT} doesn't exist.")

if os.path.isdir(VOC_MODELS_DIRT):
    # Only checkpoints whose file name contains "gan" are offered.
    vocoders = Enum(
        'vocoders',
        [(file.name, file) for file in Path(VOC_MODELS_DIRT).glob("**/*gan*.pt")],
    )
    # Report the vocoder count itself, not the synthesizer count.
    print("Loaded vocoders models: " + str(len(vocoders)))
else:
    raise Exception(f"Model folder {VOC_MODELS_DIRT} doesn't exist.")
2022-04-02 18:11:49 +08:00
# Request schema for synthesize(): the text to speak, the reference voice
# (a bundled sample, a recording or an upload) and the three model choices.
# The aliases are the (Chinese) labels attached to each field.
class Input(BaseModel):
    # Text to synthesize (alias: "text content").
    message: str = Field(
        ..., example="欢迎使用工具箱, 现已支持中文输入!", alias="文本内容"
    )
    # One of the local sample wavs discovered at import time
    # (alias: "choose voice (local wav)").
    local_audio_file: audio_input_selection = Field(
        ..., alias="选择语音(本地wav)",
    )
    # Optional recorded clip (alias: "record voice").
    # NOTE(review): is_recorder is presumably read by the UI layer to show a
    # recorder widget - confirm against the form renderer.
    record_audio_file: FileContent = Field(default=None, alias="录制语音",
        description="录音.", is_recorder=True, mime_type="audio/wav")
    # Optional uploaded wav (alias: "or upload voice").
    upload_audio_file: FileContent = Field(default=None, alias="或上传语音",
        description="拖拽或点击上传.", mime_type="audio/wav")
    # Encoder checkpoint (alias: "encoder model").
    encoder: encoders = Field(
        ..., alias="编码模型",
    )
    # Synthesizer checkpoint (alias: "synthesis model").
    synthesizer: synthesizers = Field(
        ..., alias="合成模型",
    )
    # Vocoder checkpoint (alias: "speech decoding model").
    vocoder: vocoders = Field(
        ..., alias="语音解码模型",
    )
2022-05-09 18:44:02 +08:00
# One audio clip together with the spectrogram that belongs to it.
class AudioEntity(BaseModel):
    # Raw bytes of the audio file.
    content: bytes
    # Spectrogram data for the clip; deliberately left untyped.
    mel: Any
2022-04-02 18:11:49 +08:00
# Result schema of synthesize(): a (source audio, synthesized audio) pair.
class Output(BaseModel):
    __root__: Tuple[AudioEntity, AudioEntity]

    def render_output_ui(self, streamlit_app, input) -> None:  # type: ignore
        """Custom output UI.

        If this method is implemented, it will be used instead of the default
        Output UI renderer.

        Args:
            streamlit_app: Object the widgets are drawn on; must provide
                ``subheader``, ``audio`` and ``pyplot``.
            input: The request that produced this output (not used here).
        """
        src, result = self.__root__

        streamlit_app.subheader("Synthesized Audio")
        streamlit_app.audio(result.content, format="audio/wav")

        spectrograms = (
            (src.mel, "mel spectrogram(Source Audio)"),
            (result.mel, "mel spectrogram(Result Audio)"),
        )
        for mel, title in spectrograms:
            fig, ax = plt.subplots()
            ax.imshow(mel, aspect="equal", interpolation="none")
            ax.set_title(title)
            # Hand the figure to the UI; building it alone displays nothing.
            streamlit_app.pyplot(fig)
            # pyplot keeps every figure alive until it is closed, so release
            # it once rendered to avoid accumulating figures across requests.
            plt.close(fig)
def synthesize(input: Input) -> Output:
    """Synthesize speech for ``input.message`` from a reference recording.

    The reference audio is taken from the recorded clip, else the uploaded
    file, else the selected local sample. An embedding is computed from it,
    one spectrogram is synthesized per text fragment, and the concatenated
    spectrogram is turned into a waveform by the vocoder.

    Args:
        input: The request: text, reference audio and model selections.

    Returns:
        An ``Output`` holding (source audio, synthesized audio), each with
        its wav bytes and spectrogram.

    Raises:
        ValueError: If the message contains no text after splitting.
    """
    # load models
    # NOTE(review): assumes encoder.load_model / gan_vocoder.load_model accept
    # a checkpoint path, mirroring the Synthesizer constructor - TODO confirm.
    encoder.load_model(Path(input.encoder.value))
    current_synt = Synthesizer(Path(input.synthesizer.value))
    gan_vocoder.load_model(Path(input.vocoder.value))

    # load file: a recording takes precedence over an upload, which takes
    # precedence over the selected local sample.
    if input.record_audio_file is not None:
        provided_audio = input.record_audio_file
    else:
        provided_audio = input.upload_audio_file

    if provided_audio is not None:
        with open(TEMP_SOURCE_AUDIO, "wb") as f:
            # NOTE(review): assumes FileContent exposes as_bytes() returning
            # the decoded file payload - TODO confirm.
            f.write(provided_audio.as_bytes())
        wav, sample_rate = librosa.load(TEMP_SOURCE_AUDIO)
    else:
        wav, sample_rate = librosa.load(input.local_audio_file.value)
        write(TEMP_SOURCE_AUDIO, sample_rate, wav)  # Make sure we get the correct wav

    source_spec = Synthesizer.make_spectrogram(wav)

    # preprocess
    encoder_wav = encoder.preprocess_wav(wav, sample_rate)
    embed, _, _ = encoder.embed_utterance(encoder_wav, return_partials=True)

    # Load input text: split on line breaks, then on runs of punctuation,
    # and keep only the non-blank fragments.
    punctuation = '!,。、,'  # punctuate and split/clean text
    splitter = re.compile(r'[{}]+'.format(punctuation))
    texts = []
    for line in input.message.split("\n"):
        for fragment in splitter.split(line):
            fragment = fragment.strip()
            if fragment:
                texts.append(fragment)
    if not texts:
        # Fail with a clear message instead of an opaque concatenate error.
        raise ValueError("Nothing to synthesize: the message contains no text.")

    # synthesize and vocode: every fragment uses the same speaker embedding.
    embeds = [embed] * len(texts)
    specs = current_synt.synthesize_spectrograms(texts, embeds)
    spec = np.concatenate(specs, axis=1)
    wav, sample_rate = gan_vocoder.infer_waveform(spec)

    # write and output
    write(TEMP_RESULT_AUDIO, sample_rate, wav)  # Make sure we get the correct wav
    with open(TEMP_SOURCE_AUDIO, "rb") as f:
        source_file = f.read()
    with open(TEMP_RESULT_AUDIO, "rb") as f:
        result_file = f.read()

    return Output(__root__=(
        AudioEntity(content=source_file, mel=source_spec),
        AudioEntity(content=result_file, mel=spec),
    ))