MockingBird/synthesizer/preprocess.py

from multiprocessing.pool import Pool 

from functools import partial
from itertools import chain
from pathlib import Path
from tqdm import tqdm
import numpy as np
from encoder import inference as encoder
from synthesizer.preprocess_speaker import preprocess_speaker_general
from synthesizer.preprocess_transcript import preprocess_transcript_aishell3, preprocess_transcript_magicdata

# Registry of supported datasets, keyed by the dataset's directory name under the
# datasets root. Each entry provides:
#   "subfolders":      directories (relative to the dataset root) whose direct
#                      children are handed to "speak_func" one at a time.
#   "trans_filepath":  transcript file, relative to the dataset root.
#   "speak_func":      callable applied to each child of the subfolders.
#   "transcript_func": optional callable that fills the transcript dictionary from
#                      the opened transcript file; when absent, a generic
#                      "<key> <text...>" line parser is used instead.
data_info = {
    "aidatatang_200zh": {
        "subfolders": ["corpus/train"],
        "trans_filepath": "transcript/aidatatang_200_zh_transcript.txt",
        "speak_func": preprocess_speaker_general,
    },
    "magicdata": {
        "subfolders": ["train"],
        "trans_filepath": "train/TRANS.txt",
        "speak_func": preprocess_speaker_general,
        "transcript_func": preprocess_transcript_magicdata,
    },
    "aishell3": {
        "subfolders": ["train/wav"],
        "trans_filepath": "train/content.txt",
        "speak_func": preprocess_speaker_general,
        "transcript_func": preprocess_transcript_aishell3,
    },
    "data_aishell": {
        "subfolders": ["wav/train"],
        "trans_filepath": "transcript/aishell_transcript_v0.8.txt",
        "speak_func": preprocess_speaker_general,
    },
}

def _parse_generic_transcript(dict_info, dict_transcript):
    """Fill ``dict_info`` from transcript lines shaped like ``<key> <text...>``.

    Tabs are treated like single spaces. The first field becomes the key and the
    remaining fields, re-joined with single spaces, become the value.

    :param dict_info: dictionary updated in place.
    :param dict_transcript: iterable of transcript lines (e.g. an open text file).
    """
    for line in dict_transcript:
        # Lines read from a file keep their trailing newline, so emptiness has to be
        # tested after stripping; otherwise blank lines register an empty key.
        stripped = line.strip()
        if not stripped:
            continue
        key, *words = stripped.replace("\t", " ").split(" ")
        dict_info[key] = " ".join(words)


def _print_metadata_summary(metadata_fpath, sample_rate):
    """Read the metadata file back and print aggregate statistics about it.

    Each line is split on "|"; field 3 is read as the number of audio timesteps,
    field 4 as the number of mel frames and field 5 as the text.

    :param metadata_fpath: path of the metadata file to summarise.
    :param sample_rate: audio sample rate used to convert timesteps to hours.
    """
    with metadata_fpath.open("r", encoding="utf-8") as metadata_file:
        # Drop the line terminator so it is not counted as a text character below.
        metadata = [line.rstrip("\n").split("|") for line in metadata_file if line.strip()]

    frame_counts = [int(m[4]) for m in metadata]
    timestep_counts = [int(m[3]) for m in metadata]
    mel_frames = sum(frame_counts)
    timesteps = sum(timestep_counts)
    hours = (timesteps / sample_rate) / 3600
    print("The dataset consists of %d utterances, %d mel frames, %d audio timesteps (%.2f hours)." %
          (len(metadata), mel_frames, timesteps, hours))
    # default=0 keeps the summary from crashing when no utterance was written.
    print("Max input length (text chars): %d" % max((len(m[5]) for m in metadata), default=0))
    print("Max mel frames length: %d" % max(frame_counts, default=0))
    print("Max audio timesteps length: %d" % max(timestep_counts, default=0))


def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
                       skip_existing: bool, hparams, no_alignments: bool,
                       dataset: str):
    """Preprocess one registered dataset and write its ``train.txt`` metadata file.

    The dataset's entry in ``data_info`` selects the input subfolders, the transcript
    file and the per-speaker function. Every child of the input subfolders is passed
    to that function in a process pool, and each metadata tuple it returns is written
    to ``<out_dir>/train.txt`` as one "|"-separated line. A summary of the resulting
    file is printed at the end.

    :param datasets_root: directory containing the dataset directory named ``dataset``.
    :param out_dir: output directory; ``mels`` and ``audio`` subdirectories and
        ``train.txt`` are created inside it.
    :param n_processes: number of worker processes in the pool.
    :param skip_existing: when true, ``train.txt`` is appended to instead of being
        overwritten; the flag is also forwarded to the per-speaker function.
    :param hparams: forwarded to the per-speaker function; its ``sample_rate``
        attribute is used for the printed summary.
    :param no_alignments: forwarded to the per-speaker function.
    :param dataset: key into ``data_info``.
    :raises KeyError: if ``dataset`` is not registered in ``data_info``.
    :raises AssertionError: if an input subfolder or the transcript file is missing.
    """
    dataset_info = data_info[dataset]

    # Gather the input directories
    dataset_root = datasets_root.joinpath(dataset)
    input_dirs = [dataset_root.joinpath(subfolder.strip()) for subfolder in dataset_info["subfolders"]]
    print("\n    ".join(map(str, ["Using data from:"] + input_dirs)))
    assert all(input_dir.exists() for input_dir in input_dirs)

    # Validate the transcript before touching any output, so that a bad dataset path
    # cannot truncate an existing metadata file.
    transcript_fpath = dataset_root.joinpath(dataset_info["trans_filepath"])
    assert transcript_fpath.exists(), str(transcript_fpath)+" not exist."

    # Create the output directories for each output file type
    out_dir.joinpath("mels").mkdir(parents=True, exist_ok=True)
    out_dir.joinpath("audio").mkdir(parents=True, exist_ok=True)

    # Load the transcripts, using the dataset-specific parser when one is registered
    dict_info = {}
    transcript_func = dataset_info.get("transcript_func", _parse_generic_transcript)
    with open(transcript_fpath, "r", encoding="utf-8") as dict_transcript:
        transcript_func(dict_info, dict_transcript)

    # Preprocess the dataset
    speaker_dirs = list(chain.from_iterable(input_dir.glob("*") for input_dir in input_dirs))
    func = partial(dataset_info["speak_func"], out_dir=out_dir, skip_existing=skip_existing,
                   hparams=hparams, dict_info=dict_info, no_alignments=no_alignments)

    # Both the metadata file and the pool are context-managed so the file handle and
    # the worker processes are released even if a worker raises.
    metadata_fpath = out_dir.joinpath("train.txt")
    with metadata_fpath.open("a" if skip_existing else "w", encoding="utf-8") as metadata_file:
        with Pool(n_processes) as pool:
            job = pool.imap(func, speaker_dirs)
            for speaker_metadata in tqdm(job, dataset, len(speaker_dirs), unit="speakers"):
                for metadatum in speaker_metadata:
                    metadata_file.write("|".join(str(x) for x in metadatum) + "\n")

    # Verify the contents of the metadata file
    _print_metadata_summary(metadata_fpath, hparams.sample_rate)

def embed_utterance(fpaths, encoder_model_fpath):
    """Compute the speaker embedding of one utterance and save it to disk.

    The encoder model is loaded from ``encoder_model_fpath`` first if
    ``encoder.is_loaded()`` reports that it is not loaded yet.

    :param fpaths: pair ``(wav_fpath, embed_fpath)``; the first is read with
        ``np.load`` and the second is the target handed to ``np.save``.
    :param encoder_model_fpath: path handed to ``encoder.load_model`` when needed.
    """
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    audio_path, embedding_path = fpaths
    # Load the stored waveform, run the encoder's preprocessing, then embed it
    samples = encoder.preprocess_wav(np.load(audio_path))
    np.save(embedding_path, encoder.embed_utterance(samples), allow_pickle=False)
    
 
def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_processes: int):
    """Compute and store an embedding for every utterance listed in ``train.txt``.

    For each "|"-separated metadata line, field 0 is resolved inside
    ``<synthesizer_root>/audio`` as the input file and field 2 inside
    ``<synthesizer_root>/embeds`` as the output file; ``embed_utterance`` is then run
    on every such pair in a process pool.

    :param synthesizer_root: directory containing ``audio`` and ``train.txt``; the
        ``embeds`` directory is created inside it if missing.
    :param encoder_model_fpath: encoder model path forwarded to ``embed_utterance``.
    :param n_processes: number of worker processes in the pool.
    :raises AssertionError: if the ``audio`` directory or ``train.txt`` is missing.
    """
    wav_dir = synthesizer_root.joinpath("audio")
    metadata_fpath = synthesizer_root.joinpath("train.txt")
    assert wav_dir.exists() and metadata_fpath.exists()
    embed_dir = synthesizer_root.joinpath("embeds")
    embed_dir.mkdir(exist_ok=True)

    # Gather the input wave filepath and the target output embed filepath.
    # Blank lines are skipped so they cannot trigger an IndexError below.
    with metadata_fpath.open("r", encoding="utf-8") as metadata_file:
        metadata = [line.split("|") for line in metadata_file if line.strip()]
    fpaths = [(wav_dir.joinpath(m[0]), embed_dir.joinpath(m[2])) for m in metadata]

    # TODO: improve on the multiprocessing, it's terrible. Disk I/O is the bottleneck here.
    # Embed the utterances in separate worker processes. The pool is context-managed
    # so the workers are released once every utterance has been processed.
    func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)
    with Pool(n_processes) as pool:
        job = pool.imap(func, fpaths)
        # Drain the iterator only to drive the progress bar; the results are unused.
        for _ in tqdm(job, "Embedding", len(fpaths), unit="utterances"):
            pass
Init to support Chinese Dataset. 2021-08-07 11:56:00 +08:00			`from multiprocessing.pool import Pool`
Refactor preprocessor of synthesizer to prepare to support more datasets 2021-08-11 23:33:43 +08:00
Init to support Chinese Dataset. 2021-08-07 11:56:00 +08:00			`from functools import partial`
			`from itertools import chain`
			`from pathlib import Path`
			`from tqdm import tqdm`
			`import numpy as np`
Refactor preprocessor of synthesizer to prepare to supprot more datasets 2021-08-11 23:33:43 +08:00			`from encoder import inference as encoder`
Supporting new dataset SLR68 ! try python synthesizer_preprocess_audio.py ...\slr --dataset SLR68 2021-08-13 00:41:58 +08:00			`from synthesizer.preprocess_speaker import preprocess_speaker_general`
Fix bug pre-processing magicdata 2021-10-12 20:01:37 +08:00			`from synthesizer.preprocess_transcript import preprocess_transcript_aishell3, preprocess_transcript_magicdata`
Init to support Chinese Dataset. 2021-08-07 11:56:00 +08:00
Refactor preprocessor of synthesizer to prepare to supprot more datasets 2021-08-11 23:33:43 +08:00			`data_info = {`
			`"aidatatang_200zh": {`
			`"subfolders": ["corpus/train"],`
Supporting new dataset SLR68 ! try python synthesizer_preprocess_audio.py ...\slr --dataset SLR68 2021-08-13 00:41:58 +08:00			`"trans_filepath": "transcript/aidatatang_200_zh_transcript.txt",`
			`"speak_func": preprocess_speaker_general`
			`},`
rename slr68 to magicdata to keep consistent naming convention (cherry picked from commit bbdad858ebc4d0ee3b720ba22ae3e0ce9732a734) 2021-08-17 20:55:28 +08:00			`"magicdata": {`
Supporting new dataset SLR68 ! try python synthesizer_preprocess_audio.py ...\slr --dataset SLR68 2021-08-13 00:41:58 +08:00			`"subfolders": ["train"],`
			`"trans_filepath": "train/TRANS.txt",`
Fix bug pre-processing magicdata 2021-10-12 20:01:37 +08:00			`"speak_func": preprocess_speaker_general,`
			`"transcript_func": preprocess_transcript_magicdata,`
[dataset]support aishell3(tested) 2021-08-25 23:11:29 +08:00			`},`
			`"aishell3":{`
			`"subfolders": ["train/wav"],`
			`"trans_filepath": "train/content.txt",`
[FIX] Fix preprocessing bug for aishell3 2021-09-19 00:09:16 +08:00			`"speak_func": preprocess_speaker_general,`
			`"transcript_func": preprocess_transcript_aishell3,`
支持data_aishell（SLR33）数据集 (#141) * 支持data_aishell（SLR33）数据集 * 更新readme 2021-10-12 23:40:27 +08:00			`},`
			`"data_aishell":{`
			`"subfolders": ["wav/train"],`
			`"trans_filepath": "transcript/aishell_transcript_v0.8.txt",`
			`"speak_func": preprocess_speaker_general`
[FIX] Fix preprocessing bug for aishell3 2021-09-19 00:09:16 +08:00			`}`
Refactor preprocessor of synthesizer to prepare to supprot more datasets 2021-08-11 23:33:43 +08:00			`}`
Init to support Chinese Dataset. 2021-08-07 11:56:00 +08:00
			`def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,`
			`skip_existing: bool, hparams, no_alignments: bool,`
Refactor preprocessor of synthesizer to prepare to supprot more datasets 2021-08-11 23:33:43 +08:00			`dataset: str):`
Supporting new dataset SLR68 ! try python synthesizer_preprocess_audio.py ...\slr --dataset SLR68 2021-08-13 00:41:58 +08:00			`dataset_info = data_info[dataset]`
Init to support Chinese Dataset. 2021-08-07 11:56:00 +08:00			`# Gather the input directories`
Refactor preprocessor of synthesizer to prepare to supprot more datasets 2021-08-11 23:33:43 +08:00			`dataset_root = datasets_root.joinpath(dataset)`
Supporting new dataset SLR68 ! try python synthesizer_preprocess_audio.py ...\slr --dataset SLR68 2021-08-13 00:41:58 +08:00			`input_dirs = [dataset_root.joinpath(subfolder.strip()) for subfolder in dataset_info["subfolders"]]`
Init to support Chinese Dataset. 2021-08-07 11:56:00 +08:00			`print("\n ".join(map(str, ["Using data from:"] + input_dirs)))`
			`assert all(input_dir.exists() for input_dir in input_dirs)`

			`# Create the output directories for each output file type`
			`out_dir.joinpath("mels").mkdir(exist_ok=True)`
			`out_dir.joinpath("audio").mkdir(exist_ok=True)`

			`# Create a metadata file`
			`metadata_fpath = out_dir.joinpath("train.txt")`
			`metadata_file = metadata_fpath.open("a" if skip_existing else "w", encoding="utf-8")`

			`# Preprocess the dataset`
Supporting new dataset SLR68 ! try python synthesizer_preprocess_audio.py ...\slr --dataset SLR68 2021-08-13 00:41:58 +08:00			`dict_info = {}`
			`transcript_dirs = dataset_root.joinpath(dataset_info["trans_filepath"])`
			`assert transcript_dirs.exists(), str(transcript_dirs)+" not exist."`
			`with open(transcript_dirs, "r", encoding="utf-8") as dict_transcript:`
[FIX] Fix preprocessing bug for aishell3 2021-09-19 00:09:16 +08:00			`# process with specific function for your dataset`
			`if "transcript_func" in dataset_info:`
			`dataset_info["transcript_func"](dict_info, dict_transcript)`
			`else:`
			`for v in dict_transcript:`
			`if not v:`
			`continue`
			`v = v.strip().replace("\n","").replace("\t"," ").split(" ")`
			`dict_info[v[0]] = " ".join(v[1:])`
Supporting new dataset SLR68 ! try python synthesizer_preprocess_audio.py ...\slr --dataset SLR68 2021-08-13 00:41:58 +08:00
Init to support Chinese Dataset. 2021-08-07 11:56:00 +08:00			`speaker_dirs = list(chain.from_iterable(input_dir.glob("*") for input_dir in input_dirs))`
Supporting new dataset SLR68 ! try python synthesizer_preprocess_audio.py ...\slr --dataset SLR68 2021-08-13 00:41:58 +08:00			`func = partial(dataset_info["speak_func"], out_dir=out_dir, skip_existing=skip_existing,`
			`hparams=hparams, dict_info=dict_info, no_alignments=no_alignments)`
Init to support Chinese Dataset. 2021-08-07 11:56:00 +08:00			`job = Pool(n_processes).imap(func, speaker_dirs)`
Refactor preprocessor of synthesizer to prepare to supprot more datasets 2021-08-11 23:33:43 +08:00			`for speaker_metadata in tqdm(job, dataset, len(speaker_dirs), unit="speakers"):`
Init to support Chinese Dataset. 2021-08-07 11:56:00 +08:00			`for metadatum in speaker_metadata:`
			`metadata_file.write("\|".join(str(x) for x in metadatum) + "\n")`
			`metadata_file.close()`

			`# Verify the contents of the metadata file`
			`with metadata_fpath.open("r", encoding="utf-8") as metadata_file:`
			`metadata = [line.split("\|") for line in metadata_file]`
			`mel_frames = sum([int(m[4]) for m in metadata])`
			`timesteps = sum([int(m[3]) for m in metadata])`
			`sample_rate = hparams.sample_rate`
			`hours = (timesteps / sample_rate) / 3600`
			`print("The dataset consists of %d utterances, %d mel frames, %d audio timesteps (%.2f hours)." %`
			`(len(metadata), mel_frames, timesteps, hours))`
			`print("Max input length (text chars): %d" % max(len(m[5]) for m in metadata))`
			`print("Max mel frames length: %d" % max(int(m[4]) for m in metadata))`
			`print("Max audio timesteps length: %d" % max(int(m[3]) for m in metadata))`

			`def embed_utterance(fpaths, encoder_model_fpath):`
			`if not encoder.is_loaded():`
			`encoder.load_model(encoder_model_fpath)`

			`# Compute the speaker embedding of the utterance`
			`wav_fpath, embed_fpath = fpaths`
			`wav = np.load(wav_fpath)`
			`wav = encoder.preprocess_wav(wav)`
			`embed = encoder.embed_utterance(wav)`
			`np.save(embed_fpath, embed, allow_pickle=False)`


			`def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_processes: int):`
			`wav_dir = synthesizer_root.joinpath("audio")`
			`metadata_fpath = synthesizer_root.joinpath("train.txt")`
			`assert wav_dir.exists() and metadata_fpath.exists()`
			`embed_dir = synthesizer_root.joinpath("embeds")`
			`embed_dir.mkdir(exist_ok=True)`

			`# Gather the input wave filepath and the target output embed filepath`
			`with metadata_fpath.open("r", encoding="utf-8") as metadata_file:`
			`metadata = [line.split("\|") for line in metadata_file]`
			`fpaths = [(wav_dir.joinpath(m[0]), embed_dir.joinpath(m[2])) for m in metadata]`

			`# TODO: improve on the multiprocessing, it's terrible. Disk I/O is the bottleneck here.`
			`# Embed the utterances in separate threads`
			`func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)`
			`job = Pool(n_processes).imap(func, fpaths)`
			`list(tqdm(job, "Embedding", len(fpaths), unit="utterances"))`