MockingBird/models/synthesizer/preprocess.py

from multiprocessing.pool import Pool 

from functools import partial
from itertools import chain
from pathlib import Path
from tqdm import tqdm
import numpy as np
from models.encoder import inference as encoder
from models.synthesizer.preprocess_audio import preprocess_general, extract_emo
from models.synthesizer.preprocess_transcript import preprocess_transcript_aishell3, preprocess_transcript_magicdata

# Registry of the supported datasets, keyed by the dataset folder name that is
# looked up under ``datasets_root`` in preprocess_dataset(). Each entry holds:
#   "subfolders":      folders (relative to the dataset root) whose children
#                      are handed one by one to "speak_func"
#   "trans_filepath":  transcript file, relative to the dataset root
#   "speak_func":      callable applied to every entry found in the subfolders
#   "transcript_func": optional callable ``(dict_info, transcript_file)`` that
#                      fills the transcript lookup; when it is absent,
#                      preprocess_dataset() falls back to its generic
#                      "<key> <text...>" line parser
data_info = {
    "aidatatang_200zh": {
        "subfolders": ["corpus/train"],
        "trans_filepath": "transcript/aidatatang_200_zh_transcript.txt",
        "speak_func": preprocess_general,
    },
    "aidatatang_200zh_s": {
        "subfolders": ["corpus/train"],
        "trans_filepath": "transcript/aidatatang_200_zh_transcript.txt",
        "speak_func": preprocess_general,
    },
    "magicdata": {
        "subfolders": ["train"],
        "trans_filepath": "train/TRANS.txt",
        "speak_func": preprocess_general,
        "transcript_func": preprocess_transcript_magicdata,
    },
    "aishell3": {
        "subfolders": ["train/wav"],
        "trans_filepath": "train/content.txt",
        "speak_func": preprocess_general,
        "transcript_func": preprocess_transcript_aishell3,
    },
    "data_aishell": {
        "subfolders": ["wav/train"],
        "trans_filepath": "transcript/aishell_transcript_v0.8.txt",
        "speak_func": preprocess_general,
    },
}

def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
                       skip_existing: bool, hparams, no_alignments: bool,
                       dataset: str, emotion_extract = False, encoder_model_fpath=None):
    """Preprocess one dataset and write its ``train.txt`` metadata file.

    Every entry found in the dataset's configured subfolders is handed to the
    dataset's ``speak_func`` in a process pool. Each metadata tuple returned
    by the workers is written to ``out_dir/train.txt`` as one ``|``-separated
    line, and summary statistics of that file are printed at the end.

    :param datasets_root: directory that contains the folder named ``dataset``
    :param out_dir: output directory; the "mels" and "audio" subfolders (and
        "emo" when ``emotion_extract`` is true) are created inside it
    :param n_processes: number of worker processes in the pool
    :param skip_existing: when true, ``train.txt`` is appended to instead of
        being overwritten; the flag is also forwarded to ``speak_func``
    :param hparams: forwarded to ``speak_func``; ``hparams.sample_rate`` is
        used to convert audio timesteps to hours in the summary
    :param no_alignments: forwarded to ``speak_func``
    :param dataset: key of the dataset in ``data_info``
    :param emotion_extract: in this function it only controls whether the
        "emo" output folder is created
    :param encoder_model_fpath: forwarded to ``speak_func``
    :raises KeyError: if ``dataset`` is not a key of ``data_info``
    :raises AssertionError: if an input folder or the transcript file is missing
    """
    dataset_info = data_info[dataset]

    # Gather the input directories
    dataset_root = datasets_root.joinpath(dataset)
    input_dirs = [dataset_root.joinpath(subfolder.strip()) for subfolder in dataset_info["subfolders"]]
    print("\n    ".join(map(str, ["Using data from:"] + input_dirs)))
    assert all(input_dir.exists() for input_dir in input_dirs)

    # Create the output directories for each output file type
    output_subdirs = ["mels", "audio"]
    if emotion_extract:
        output_subdirs.append("emo")
    for subdir_name in output_subdirs:
        out_dir.joinpath(subdir_name).mkdir(parents=True, exist_ok=True)

    # Build the transcript lookup before touching train.txt, so that a missing
    # transcript cannot truncate an existing metadata file.
    dict_info = {}
    transcript_fpath = dataset_root.joinpath(dataset_info["trans_filepath"])
    assert transcript_fpath.exists(), str(transcript_fpath)+" not exist."
    with open(transcript_fpath, "r", encoding="utf-8") as transcript_file:
        if "transcript_func" in dataset_info:
            # Process with the function specific to this dataset
            dataset_info["transcript_func"](dict_info, transcript_file)
        else:
            # Generic format: "<key> <text...>", separated by spaces or tabs
            for line in transcript_file:
                fields = line.strip().replace("\t", " ").split(" ")
                if not fields[0]:
                    # Blank line: lines read from a file still carry their
                    # newline, so emptiness must be tested after stripping.
                    continue
                dict_info[fields[0]] = " ".join(fields[1:])

    speaker_dirs = list(chain.from_iterable(input_dir.glob("*") for input_dir in input_dirs))

    func = partial(dataset_info["speak_func"], out_dir=out_dir, skip_existing=skip_existing,
                   hparams=hparams, dict_info=dict_info, no_alignments=no_alignments,
                   encoder_model_fpath=encoder_model_fpath)

    # Context managers guarantee that the metadata file is closed and the
    # worker processes are released even when a worker raises.
    metadata_fpath = out_dir.joinpath("train.txt")
    with metadata_fpath.open("a" if skip_existing else "w", encoding="utf-8") as metadata_file:
        with Pool(n_processes) as pool:
            job = pool.imap_unordered(func, speaker_dirs)
            for speaker_metadata in tqdm(job, dataset, len(speaker_dirs), unit="speakers"):
                if speaker_metadata is not None:
                    for metadatum in speaker_metadata:
                        metadata_file.write("|".join(map(str, metadatum)) + "\n")

    # Verify the contents of the metadata file. The line terminator is removed
    # so it is not counted as a text character; blank lines are ignored.
    with metadata_fpath.open("r", encoding="utf-8") as metadata_file:
        metadata = [line.rstrip("\n").split("|") for line in metadata_file if line.strip()]
    mel_frames = sum(int(m[4]) for m in metadata)
    timesteps = sum(int(m[3]) for m in metadata)
    sample_rate = hparams.sample_rate
    hours = (timesteps / sample_rate) / 3600
    print("The dataset consists of %d utterances, %d mel frames, %d audio timesteps (%.2f hours)." %
          (len(metadata), mel_frames, timesteps, hours))
    # default=0 keeps the summary from crashing when no utterance was produced
    print("Max input length (text chars): %d" % max((len(m[5]) for m in metadata), default=0))
    print("Max mel frames length: %d" % max((int(m[4]) for m in metadata), default=0))
    print("Max audio timesteps length: %d" % max((int(m[3]) for m in metadata), default=0))

def embed_utterance(fpaths, encoder_model_fpath):
    """Compute the speaker embedding of one utterance and save it to disk.

    :param fpaths: pair ``(wav_fpath, embed_fpath)``: the array file read with
        ``np.load`` and the destination the embedding is written to
    :param encoder_model_fpath: passed to ``encoder.load_model`` when the
        encoder reports that it is not loaded yet
    """
    # Load the encoder model on first use only
    model_ready = encoder.is_loaded()
    if not model_ready:
        encoder.load_model(encoder_model_fpath)

    source_fpath, target_fpath = fpaths
    waveform = encoder.preprocess_wav(np.load(source_fpath))
    np.save(target_fpath, encoder.embed_utterance(waveform), allow_pickle=False)
    
def _emo_extract_from_utterance(fpaths, hparams, skip_existing=False):
    """Extract the emotion features of one utterance and save them to disk.

    :param fpaths: pair ``(wav_fpath, emo_fpath)``: the array file read with
        ``np.load`` and the destination the features are written to
    :param hparams: only ``hparams.sample_rate`` is read; it is passed to
        ``extract_emo``
    :param skip_existing: when true, nothing is done if ``emo_fpath`` already
        exists
    """
    # Unpack first: ``fpaths`` is a tuple, so the existence check has to be
    # made on the output path rather than on the pair itself.
    wav_fpath, emo_fpath = fpaths
    if skip_existing and Path(emo_fpath).exists():
        return
    wav = np.load(wav_fpath)
    # A leading axis is added for the extractor and removed again before saving
    emo = extract_emo(np.expand_dims(wav, 0), hparams.sample_rate, True)
    np.save(emo_fpath, emo.squeeze(0), allow_pickle=False)
 
def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_processes: int):
    """Compute a speaker embedding for every utterance listed in ``train.txt``.

    :param synthesizer_root: directory holding the "audio" folder and the
        ``train.txt`` metadata file; embeddings are written to its "embeds"
        subfolder, which is created if needed
    :param encoder_model_fpath: forwarded to ``embed_utterance``
    :param n_processes: number of worker processes in the pool
    :raises AssertionError: if the "audio" folder or ``train.txt`` is missing
    """
    wav_dir = synthesizer_root.joinpath("audio")
    metadata_fpath = synthesizer_root.joinpath("train.txt")
    assert wav_dir.exists() and metadata_fpath.exists()
    embed_dir = synthesizer_root.joinpath("embeds")
    embed_dir.mkdir(exist_ok=True)

    # Gather the input wave filepath and the target output embed filepath.
    # Field 0 of a metadata line names the audio file, field 2 the embed file.
    with metadata_fpath.open("r", encoding="utf-8") as metadata_file:
        metadata = [line.split("|") for line in metadata_file]
    fpaths = [(wav_dir.joinpath(m[0]), embed_dir.joinpath(m[2])) for m in metadata]

    # TODO: improve on the multiprocessing, it's terrible. Disk I/O is the bottleneck here.
    # Embed the utterances in separate processes. The pool is used as a
    # context manager so the workers are always released.
    func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)
    with Pool(n_processes) as pool:
        job = pool.imap(func, fpaths)
        # Drain the iterator to drive the progress bar; the results are all None
        for _ in tqdm(job, "Embedding", len(fpaths), unit="utterances"):
            pass

def create_emo(synthesizer_root: Path, n_processes: int, skip_existing: bool, hparams):
    """Extract emotion features for every utterance listed in ``train.txt``.

    :param synthesizer_root: directory holding the "audio" folder and the
        ``train.txt`` metadata file; features are written to its "emo"
        subfolder, which is created if needed
    :param n_processes: number of worker processes in the pool
    :param skip_existing: forwarded to ``_emo_extract_from_utterance``
    :param hparams: forwarded to ``_emo_extract_from_utterance``
    :raises AssertionError: if the "audio" folder or ``train.txt`` is missing
    """
    wav_dir = synthesizer_root.joinpath("audio")
    metadata_fpath = synthesizer_root.joinpath("train.txt")
    assert wav_dir.exists() and metadata_fpath.exists()
    emo_dir = synthesizer_root.joinpath("emo")
    emo_dir.mkdir(exist_ok=True)

    # Gather the input wave filepath and the target output emo filepath.
    # The output name is the audio file name with "audio-" replaced by "emo-".
    with metadata_fpath.open("r", encoding="utf-8") as metadata_file:
        metadata = [line.split("|") for line in metadata_file]
    fpaths = [(wav_dir.joinpath(m[0]), emo_dir.joinpath(m[0].replace("audio-", "emo-"))) for m in metadata]

    # TODO: improve on the multiprocessing, it's terrible. Disk I/O is the bottleneck here.
    # Process the utterances in separate processes. The pool is used as a
    # context manager so the workers are always released.
    func = partial(_emo_extract_from_utterance, hparams=hparams, skip_existing=skip_existing)
    with Pool(n_processes) as pool:
        job = pool.imap(func, fpaths)
        # Drain the iterator to drive the progress bar; the results are all None
        for _ in tqdm(job, "Emo", len(fpaths), unit="utterances"):
            pass
Init to support Chinese Dataset. 2021-08-07 11:56:00 +08:00			`from multiprocessing.pool import Pool`
Refactor preprocessor of synthesizer to prepare to supprot more datasets 2021-08-11 23:33:43 +08:00
Init to support Chinese Dataset. 2021-08-07 11:56:00 +08:00			`from functools import partial`
			`from itertools import chain`
			`from pathlib import Path`
			`from tqdm import tqdm`
			`import numpy as np`
Refactor Project to 3 parts: Models, Control, Data Need readme 2022-12-03 16:54:06 +08:00			`from models.encoder import inference as encoder`
add pretrained 2023-02-18 09:31:05 +08:00			`from models.synthesizer.preprocess_audio import preprocess_general, extract_emo`
Refactor Project to 3 parts: Models, Control, Data Need readme 2022-12-03 16:54:06 +08:00			`from models.synthesizer.preprocess_transcript import preprocess_transcript_aishell3, preprocess_transcript_magicdata`
Init to support Chinese Dataset. 2021-08-07 11:56:00 +08:00
Refactor preprocessor of synthesizer to prepare to supprot more datasets 2021-08-11 23:33:43 +08:00			`data_info = {`
			`"aidatatang_200zh": {`
			`"subfolders": ["corpus/train"],`
Supporting new dataset SLR68 ! try python synthesizer_preprocess_audio.py ...\slr --dataset SLR68 2021-08-13 00:41:58 +08:00			`"trans_filepath": "transcript/aidatatang_200_zh_transcript.txt",`
Add vits 2023-02-04 14:13:38 +08:00			`"speak_func": preprocess_general`
			`},`
			`"aidatatang_200zh_s": {`
			`"subfolders": ["corpus/train"],`
			`"trans_filepath": "transcript/aidatatang_200_zh_transcript.txt",`
			`"speak_func": preprocess_general`
Supporting new dataset SLR68 ! try python synthesizer_preprocess_audio.py ...\slr --dataset SLR68 2021-08-13 00:41:58 +08:00			`},`
rename slr68 to magicdata to keep consistent naming convention (cherry picked from commit bbdad858ebc4d0ee3b720ba22ae3e0ce9732a734) 2021-08-17 20:55:28 +08:00			`"magicdata": {`
Supporting new dataset SLR68 ! try python synthesizer_preprocess_audio.py ...\slr --dataset SLR68 2021-08-13 00:41:58 +08:00			`"subfolders": ["train"],`
			`"trans_filepath": "train/TRANS.txt",`
Add vits 2023-02-04 14:13:38 +08:00			`"speak_func": preprocess_general,`
Fix bug pre-processing magicdata 2021-10-12 20:01:37 +08:00			`"transcript_func": preprocess_transcript_magicdata,`
[dataset]support aishell3(tested) 2021-08-25 23:11:29 +08:00			`},`
			`"aishell3":{`
			`"subfolders": ["train/wav"],`
			`"trans_filepath": "train/content.txt",`
Add vits 2023-02-04 14:13:38 +08:00			`"speak_func": preprocess_general,`
[FIX] Fix preprocessing bug for aishell3 2021-09-19 00:09:16 +08:00			`"transcript_func": preprocess_transcript_aishell3,`
支持data_aishell（SLR33）数据集 (#141) * 支持data_aishell（SLR33）数据集 * 更新readme 2021-10-12 23:40:27 +08:00			`},`
			`"data_aishell":{`
			`"subfolders": ["wav/train"],`
			`"trans_filepath": "transcript/aishell_transcript_v0.8.txt",`
Add vits 2023-02-04 14:13:38 +08:00			`"speak_func": preprocess_general`
[FIX] Fix preprocessing bug for aishell3 2021-09-19 00:09:16 +08:00			`}`
Refactor preprocessor of synthesizer to prepare to supprot more datasets 2021-08-11 23:33:43 +08:00			`}`
Init to support Chinese Dataset. 2021-08-07 11:56:00 +08:00
			`def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,`
Add vits 2023-02-04 14:13:38 +08:00			`skip_existing: bool, hparams, no_alignments: bool,`
add pretrained 2023-02-18 09:31:05 +08:00			`dataset: str, emotion_extract = False, encoder_model_fpath=None):`
Supporting new dataset SLR68 ! try python synthesizer_preprocess_audio.py ...\slr --dataset SLR68 2021-08-13 00:41:58 +08:00			`dataset_info = data_info[dataset]`
Init to support Chinese Dataset. 2021-08-07 11:56:00 +08:00			`# Gather the input directories`
Refactor preprocessor of synthesizer to prepare to supprot more datasets 2021-08-11 23:33:43 +08:00			`dataset_root = datasets_root.joinpath(dataset)`
Supporting new dataset SLR68 ! try python synthesizer_preprocess_audio.py ...\slr --dataset SLR68 2021-08-13 00:41:58 +08:00			`input_dirs = [dataset_root.joinpath(subfolder.strip()) for subfolder in dataset_info["subfolders"]]`
Init to support Chinese Dataset. 2021-08-07 11:56:00 +08:00			`print("\n ".join(map(str, ["Using data from:"] + input_dirs)))`
			`assert all(input_dir.exists() for input_dir in input_dirs)`

			`# Create the output directories for each output file type`
			`out_dir.joinpath("mels").mkdir(exist_ok=True)`
			`out_dir.joinpath("audio").mkdir(exist_ok=True)`
Add vits 2023-02-04 14:13:38 +08:00			`if emotion_extract:`
			`out_dir.joinpath("emo").mkdir(exist_ok=True)`
Init to support Chinese Dataset. 2021-08-07 11:56:00 +08:00
			`# Create a metadata file`
			`metadata_fpath = out_dir.joinpath("train.txt")`
			`metadata_file = metadata_fpath.open("a" if skip_existing else "w", encoding="utf-8")`

			`# Preprocess the dataset`
Supporting new dataset SLR68 ! try python synthesizer_preprocess_audio.py ...\slr --dataset SLR68 2021-08-13 00:41:58 +08:00			`dict_info = {}`
			`transcript_dirs = dataset_root.joinpath(dataset_info["trans_filepath"])`
			`assert transcript_dirs.exists(), str(transcript_dirs)+" not exist."`
			`with open(transcript_dirs, "r", encoding="utf-8") as dict_transcript:`
[FIX] Fix preprocessing bug for aishell3 2021-09-19 00:09:16 +08:00			`# process with specific function for your dataset`
			`if "transcript_func" in dataset_info:`
			`dataset_info["transcript_func"](dict_info, dict_transcript)`
			`else:`
			`for v in dict_transcript:`
			`if not v:`
			`continue`
			`v = v.strip().replace("\n","").replace("\t"," ").split(" ")`
			`dict_info[v[0]] = " ".join(v[1:])`
Supporting new dataset SLR68 ! try python synthesizer_preprocess_audio.py ...\slr --dataset SLR68 2021-08-13 00:41:58 +08:00
Init to support Chinese Dataset. 2021-08-07 11:56:00 +08:00			`speaker_dirs = list(chain.from_iterable(input_dir.glob("*") for input_dir in input_dirs))`
Add vits 2023-02-04 14:13:38 +08:00
Supporting new dataset SLR68 ! try python synthesizer_preprocess_audio.py ...\slr --dataset SLR68 2021-08-13 00:41:58 +08:00			`func = partial(dataset_info["speak_func"], out_dir=out_dir, skip_existing=skip_existing,`
add pretrained 2023-02-18 09:31:05 +08:00			`hparams=hparams, dict_info=dict_info, no_alignments=no_alignments, encoder_model_fpath=encoder_model_fpath)`
Some changes to make it easier to install the dependencies 2023-06-02 17:22:38 +08:00			`job = Pool(n_processes).imap_unordered(func, speaker_dirs)`
Add vits 2023-02-04 14:13:38 +08:00
Refactor preprocessor of synthesizer to prepare to supprot more datasets 2021-08-11 23:33:43 +08:00			`for speaker_metadata in tqdm(job, dataset, len(speaker_dirs), unit="speakers"):`
Add vits 2023-02-04 14:13:38 +08:00			`if speaker_metadata is not None:`
			`for metadatum in speaker_metadata:`
Some changes to make it easier to install the dependencies 2023-06-02 17:22:38 +08:00			`metadata_file.write("\|".join(map(str,metadatum)) + "\n")`
Init to support Chinese Dataset. 2021-08-07 11:56:00 +08:00			`metadata_file.close()`

			`# Verify the contents of the metadata file`
			`with metadata_fpath.open("r", encoding="utf-8") as metadata_file:`
			`metadata = [line.split("\|") for line in metadata_file]`
			`mel_frames = sum([int(m[4]) for m in metadata])`
			`timesteps = sum([int(m[3]) for m in metadata])`
			`sample_rate = hparams.sample_rate`
			`hours = (timesteps / sample_rate) / 3600`
			`print("The dataset consists of %d utterances, %d mel frames, %d audio timesteps (%.2f hours)." %`
			`(len(metadata), mel_frames, timesteps, hours))`
			`print("Max input length (text chars): %d" % max(len(m[5]) for m in metadata))`
			`print("Max mel frames length: %d" % max(int(m[4]) for m in metadata))`
			`print("Max audio timesteps length: %d" % max(int(m[3]) for m in metadata))`

			`def embed_utterance(fpaths, encoder_model_fpath):`
			`if not encoder.is_loaded():`
			`encoder.load_model(encoder_model_fpath)`

			`# Compute the speaker embedding of the utterance`
			`wav_fpath, embed_fpath = fpaths`
			`wav = np.load(wav_fpath)`
			`wav = encoder.preprocess_wav(wav)`
			`embed = encoder.embed_utterance(wav)`
			`np.save(embed_fpath, embed, allow_pickle=False)`

add pretrained 2023-02-18 09:31:05 +08:00			`def _emo_extract_from_utterance(fpaths, hparams, skip_existing=False):`
			`if skip_existing and fpaths.exists():`
			`return`
			`wav_fpath, emo_fpath = fpaths`
			`wav = np.load(wav_fpath)`
			`emo = extract_emo(np.expand_dims(wav, 0), hparams.sample_rate, True)`
			`np.save(emo_fpath, emo.squeeze(0), allow_pickle=False)`
Init to support Chinese Dataset. 2021-08-07 11:56:00 +08:00
			`def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_processes: int):`
			`wav_dir = synthesizer_root.joinpath("audio")`
			`metadata_fpath = synthesizer_root.joinpath("train.txt")`
			`assert wav_dir.exists() and metadata_fpath.exists()`
			`embed_dir = synthesizer_root.joinpath("embeds")`
			`embed_dir.mkdir(exist_ok=True)`

			`# Gather the input wave filepath and the target output embed filepath`
			`with metadata_fpath.open("r", encoding="utf-8") as metadata_file:`
			`metadata = [line.split("\|") for line in metadata_file]`
			`fpaths = [(wav_dir.joinpath(m[0]), embed_dir.joinpath(m[2])) for m in metadata]`

			`# TODO: improve on the multiprocessing, it's terrible. Disk I/O is the bottleneck here.`
			`# Embed the utterances in separate threads`
			`func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)`
			`job = Pool(n_processes).imap(func, fpaths)`
Some changes to make it easier to install the dependencies 2023-06-02 17:22:38 +08:00			`tuple(tqdm(job, "Embedding", len(fpaths), unit="utterances"))`
add pretrained 2023-02-18 09:31:05 +08:00
			`def create_emo(synthesizer_root: Path, n_processes: int, skip_existing: bool, hparams):`
			`wav_dir = synthesizer_root.joinpath("audio")`
			`metadata_fpath = synthesizer_root.joinpath("train.txt")`
			`assert wav_dir.exists() and metadata_fpath.exists()`
			`emo_dir = synthesizer_root.joinpath("emo")`
			`emo_dir.mkdir(exist_ok=True)`

			`# Gather the input wave filepath and the target output embed filepath`
			`with metadata_fpath.open("r", encoding="utf-8") as metadata_file:`
			`metadata = [line.split("\|") for line in metadata_file]`
			`fpaths = [(wav_dir.joinpath(m[0]), emo_dir.joinpath(m[0].replace("audio-", "emo-"))) for m in metadata]`

			`# TODO: improve on the multiprocessing, it's terrible. Disk I/O is the bottleneck here.`
			`# Embed the utterances in separate threads`
			`func = partial(_emo_extract_from_utterance, hparams=hparams, skip_existing=skip_existing)`
			`job = Pool(n_processes).imap(func, fpaths)`
Some changes to make it easier to install the dependencies 2023-06-02 17:22:38 +08:00			`tuple(tqdm(job, "Emo", len(fpaths), unit="utterances"))`