"""
Voice-conversion page of the mkgui web UI: extract a PPG from a source
utterance, convert it toward a target speaker with a ppg2mel model, and
vocode the predicted mel spectrogram with HiFi-GAN.
"""
import os
from enum import Enum
from pathlib import Path
from typing import Any, Tuple

import librosa
import matplotlib.pyplot as plt
import torch
from pydantic import BaseModel, Field
from scipy.io.wavfile import write

import models.ppg2mel as Convertor
import models.ppg_extractor as Extractor
from control.mkgui.base.components.types import FileContent
from models.encoder import inference as speaker_encoder
from models.synthesizer.inference import Synthesizer
from models.vocoder.hifigan import inference as gan_vocoder

# Constants
AUDIO_SAMPLES_DIR = f'data{os.sep}samples{os.sep}'
EXT_MODELS_DIR = f'data{os.sep}ckpt{os.sep}ppg_extractor'
CONV_MODELS_DIR = f'data{os.sep}ckpt{os.sep}ppg2mel'
VOC_MODELS_DIR = f'data{os.sep}ckpt{os.sep}vocoder'
TEMP_SOURCE_AUDIO = f'wavs{os.sep}temp_source.wav'
TEMP_TARGET_AUDIO = f'wavs{os.sep}temp_target.wav'
TEMP_RESULT_AUDIO = f'wavs{os.sep}temp_result.wav'
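
# Note: the paths above are relative, joined with os.sep for portability, and
# resolve against the current working directory (presumably the repository
# root; nothing here enforces that).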

# Load local sample audio files as selectable options. TODO: load dataset
if os.path.isdir(AUDIO_SAMPLES_DIR):
    audio_input_selection = Enum('samples', list((file.name, file) for file in Path(AUDIO_SAMPLES_DIR).glob("*.wav")))
else:
    # Without this folder the Input model below would fail with a NameError.
    raise Exception(f"Sample folder {AUDIO_SAMPLES_DIR} doesn't exist.")

# Pre-load models
if os.path.isdir(EXT_MODELS_DIR):
    extractors = Enum('extractors', list((file.name, file) for file in Path(EXT_MODELS_DIR).glob("**/*.pt")))
    print(f"Loaded {len(extractors)} extractor models")
else:
    raise Exception(f"Model folder {EXT_MODELS_DIR} doesn't exist.")

if os.path.isdir(CONV_MODELS_DIR):
    convertors = Enum('convertors', list((file.name, file) for file in Path(CONV_MODELS_DIR).glob("**/*.pth")))
    print(f"Loaded {len(convertors)} convertor models")
else:
    raise Exception(f"Model folder {CONV_MODELS_DIR} doesn't exist.")

if os.path.isdir(VOC_MODELS_DIR):
    vocoders = Enum('vocoders', list((file.name, file) for file in Path(VOC_MODELS_DIR).glob("**/*gan*.pt")))
    print(f"Loaded {len(vocoders)} vocoder models")
else:
    raise Exception(f"Model folder {VOC_MODELS_DIR} doesn't exist.")
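
# The Enum functional API doubles as the dropdown mechanism: each member's name
# is the checkpoint file name shown in the UI and its value is the file's Path,
# read back later via e.g. input.extractor.value. For instance (hypothetical
# file name), a folder containing "24k.pt" yields extractors['24k.pt'] with
# value Path('data/ckpt/ppg_extractor/24k.pt').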


class Input(BaseModel):
    local_audio_file: audio_input_selection = Field(
        ..., alias="Input audio (local wav)",
        description="Select a local audio file."
    )
    upload_audio_file: FileContent = Field(default=None, alias="Or upload audio",
                                           description="Drag and drop, or click to upload.", mime_type="audio/wav")
    local_audio_file_target: audio_input_selection = Field(
        ..., alias="Target audio (local wav)",
        description="Select a local audio file."
    )
    upload_audio_file_target: FileContent = Field(default=None, alias="Or upload target audio",
                                                  description="Drag and drop, or click to upload.", mime_type="audio/wav")
    extractor: extractors = Field(
        ..., alias="Encoder model",
        description="Select a speech encoder model file."
    )
    convertor: convertors = Field(
        ..., alias="Conversion model",
        description="Select a voice conversion model file."
    )
    vocoder: vocoders = Field(
        ..., alias="Vocoder model",
        description="Select a vocoder model file (currently only HiFi-GAN is supported)."
    )
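
# The aliases above are the labels the mkgui form renderer displays for each
# widget; Enum-typed fields become dropdowns over the checkpoints discovered at
# import time, and FileContent fields become upload boxes.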


class AudioEntity(BaseModel):
    content: bytes
    mel: Any


class Output(BaseModel):
    __root__: Tuple[AudioEntity, AudioEntity, AudioEntity]

    def render_output_ui(self, streamlit_app, input) -> None:  # type: ignore
        """Custom output UI.

        If this method is implemented, it will be used instead of the default
        Output UI renderer.
        """
        src, target, result = self.__root__

        streamlit_app.subheader("Synthesized Audio")
        streamlit_app.audio(result.content, format="audio/wav")

        fig, ax = plt.subplots()
        ax.imshow(src.mel, aspect="equal", interpolation="none")
        ax.set_title("Mel spectrogram (source audio)")
        streamlit_app.pyplot(fig)
        fig, ax = plt.subplots()
        ax.imshow(target.mel, aspect="equal", interpolation="none")
        ax.set_title("Mel spectrogram (target audio)")
        streamlit_app.pyplot(fig)
        fig, ax = plt.subplots()
        ax.imshow(result.mel, aspect="equal", interpolation="none")
        ax.set_title("Mel spectrogram (result audio)")
        streamlit_app.pyplot(fig)


def convert(input: Input) -> Output:
    """convert (voice conversion)"""
    # Load models
    extractor = Extractor.load_model(Path(input.extractor.value))
    convertor = Convertor.load_model(Path(input.convertor.value))
    # current_synt = Synthesizer(Path(input.synthesizer.value))
    gan_vocoder.load_model(Path(input.vocoder.value))
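
    # Note: all three models are reloaded on every conversion request, so each
    # call pays the checkpoint-loading cost up front.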

    # Load audio: prefer an uploaded file, fall back to the selected local sample
    if input.upload_audio_file is not None:
        with open(TEMP_SOURCE_AUDIO, "w+b") as f:
            f.write(input.upload_audio_file.as_bytes())
            f.seek(0)
        src_wav, sample_rate = librosa.load(TEMP_SOURCE_AUDIO)
    else:
        src_wav, sample_rate = librosa.load(input.local_audio_file.value)
        write(TEMP_SOURCE_AUDIO, sample_rate, src_wav)  # re-save so the UI reads back the exact source wav

    if input.upload_audio_file_target is not None:
        with open(TEMP_TARGET_AUDIO, "w+b") as f:
            f.write(input.upload_audio_file_target.as_bytes())
            f.seek(0)
        ref_wav, _ = librosa.load(TEMP_TARGET_AUDIO)
    else:
        ref_wav, _ = librosa.load(input.local_audio_file_target.value)
        write(TEMP_TARGET_AUDIO, sample_rate, ref_wav)  # re-save so the UI reads back the exact target wav
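
    # Conversion recipe: PPG carries the phonetic content of the source, the
    # converted lf0/uv track carries its pitch contour shifted toward the
    # target's lf0 statistics, and the speaker embedding carries the target's
    # timbre; the convertor fuses all three into a mel spectrogram.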
    ppg = extractor.extract_from_wav(src_wav)
    # Deferred import of the voice-conversion F0 utilities
    from utils.f0_utils import (compute_f0, compute_mean_std, f02lf0,
                                get_converted_lf0uv)
    ref_lf0_mean, ref_lf0_std = compute_mean_std(f02lf0(compute_f0(ref_wav)))
    speaker_encoder.load_model(Path(f"data{os.sep}ckpt{os.sep}encoder{os.sep}pretrained_bak_5805000.pt"))
    embed = speaker_encoder.embed_utterance(ref_wav)
    lf0_uv = get_converted_lf0uv(src_wav, ref_lf0_mean, ref_lf0_std, convert=True)
    # PPG and F0 frame counts can differ by a frame or two; trim both to match
    min_len = min(ppg.shape[1], len(lf0_uv))
    ppg = ppg[:, :min_len]
    lf0_uv = lf0_uv[:min_len]
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    _, mel_pred, att_ws = convertor.inference(
        ppg,
        logf0_uv=torch.from_numpy(lf0_uv).unsqueeze(0).float().to(device),
        spembs=torch.from_numpy(embed).unsqueeze(0).to(device),
    )
    mel_pred = mel_pred.transpose(0, 1)
    breaks = [mel_pred.shape[1]]  # (currently unused)
    mel_pred = mel_pred.detach().cpu().numpy()

    # Vocode the predicted mel back to a waveform
    wav, sample_rate = gan_vocoder.infer_waveform(mel_pred)

    # Write the result and read all three wavs back for the UI
    write(TEMP_RESULT_AUDIO, sample_rate, wav)  # make sure we get the correct wav
    with open(TEMP_SOURCE_AUDIO, "rb") as f:
        source_file = f.read()
    with open(TEMP_TARGET_AUDIO, "rb") as f:
        target_file = f.read()
    with open(TEMP_RESULT_AUDIO, "rb") as f:
        result_file = f.read()

    return Output(__root__=(
        AudioEntity(content=source_file, mel=Synthesizer.make_spectrogram(src_wav)),
        AudioEntity(content=target_file, mel=Synthesizer.make_spectrogram(ref_wav)),
        AudioEntity(content=result_file, mel=Synthesizer.make_spectrogram(wav)),
    ))
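
# A minimal sketch of driving convert() without the GUI, assuming the sample
# and checkpoint folders above are populated. Fields are passed by alias, and
# taking the first member of each enum is only illustrative:
#
#   inp = Input(**{
#       "Input audio (local wav)": list(audio_input_selection)[0],
#       "Target audio (local wav)": list(audio_input_selection)[0],
#       "Encoder model": list(extractors)[0],
#       "Conversion model": list(convertors)[0],
#       "Vocoder model": list(vocoders)[0],
#   })
#   output = convert(inp)
#   with open("converted.wav", "wb") as f:
#       f.write(output.__root__[2].content)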