# MockingBird/synthesizer_tacotron2/synthesize.py
# (original file: 93 lines, 3.9 KiB, Python)
import torch
from torch.utils.data import DataLoader
from synthesizer.synthesizer_dataset import SynthesizerDataset, collate_synthesizer
from synthesizer.models.tacotron import Tacotron
from synthesizer.hparams import hparams_debug_string
from synthesizer.utils.text import text_to_sequence
from synthesizer.utils.symbols import symbols
import numpy as np
from pathlib import Path
from tqdm import tqdm
import sys
from synthesizer.infolog import log
import os
from synthesizer.tacotron2 import Tacotron2
import time
import tensorflow as tf
def run_eval(args, checkpoint_path, output_dir, hparams, sentences):
    """Synthesize mel spectrograms for a list of sentences in eval mode.

    Args:
        args: parsed CLI arguments (unused here; kept for interface parity
            with the other run_* entry points).
        checkpoint_path: path of the Tacotron2 checkpoint to load.
        output_dir: root directory; "eval" and "logs-eval" subdirs are created.
        hparams: hyperparameter object (reads tacotron_synthesis_batch_size).
        sentences: list of text strings to synthesize.

    Returns:
        The eval output directory path (contains map.txt and the mels).
    """
    eval_dir = os.path.join(output_dir, "eval")
    log_dir = os.path.join(output_dir, "logs-eval")

    # Create output paths if they don't exist
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(os.path.join(log_dir, "wavs"), exist_ok=True)
    os.makedirs(os.path.join(log_dir, "plots"), exist_ok=True)

    log(hparams_debug_string())
    synth = Tacotron2(checkpoint_path, hparams)

    # Split the sentences into fixed-size batches (last batch may be short).
    batch_size = hparams.tacotron_synthesis_batch_size
    sentences = [sentences[i: i + batch_size]
                 for i in range(0, len(sentences), batch_size)]

    log("Starting Synthesis")
    with open(os.path.join(eval_dir, "map.txt"), "w") as file:
        for i, texts in enumerate(tqdm(sentences)):
            # NOTE: removed an unused `start = time.time()` that was never read.
            basenames = ["batch_{}_sentence_{}".format(i, j) for j in range(len(texts))]
            mel_filenames, speaker_ids = synth.synthesize(texts, basenames, eval_dir, log_dir, None)
            # One "text|mel_filename|speaker_id" line per synthesized utterance.
            for elems in zip(texts, mel_filenames, speaker_ids):
                file.write("|".join([str(x) for x in elems]) + "\n")
    log("synthesized mel spectrograms at {}".format(eval_dir))
    return eval_dir
def run_synthesis(in_dir, out_dir, model_dir, hparams):
    """Generate ground-truth-aligned (GTA) mel spectrograms for vocoder training.

    Args:
        in_dir: preprocessed dataset directory (expects train.txt, mels/, embeds/).
        out_dir: output directory; GTA mels go to out_dir/mels_gta.
        model_dir: directory containing the "taco_pretrained" checkpoint folder.
        hparams: hyperparameter object (reads hop_size, sample_rate,
            tacotron_synthesis_batch_size).

    Returns:
        Path of the written synthesized.txt metadata file.
    """
    synth_dir = os.path.join(out_dir, "mels_gta")
    os.makedirs(synth_dir, exist_ok=True)
    metadata_filename = os.path.join(in_dir, "train.txt")
    print(hparams_debug_string())

    # Load the model in memory (gta=True forces teacher-forced alignment).
    weights_dir = os.path.join(model_dir, "taco_pretrained")
    checkpoint_fpath = tf.train.get_checkpoint_state(weights_dir).model_checkpoint_path
    synth = Tacotron2(checkpoint_fpath, hparams, gta=True)

    # Load the metadata; each line is a "|"-separated record.
    # Assumes column 4 is the mel frame count and column 5 the text — TODO
    # confirm against the preprocessing step that writes train.txt.
    with open(metadata_filename, encoding="utf-8") as f:
        metadata = [line.strip().split("|") for line in f]
        frame_shift_ms = hparams.hop_size / hparams.sample_rate
        hours = sum(int(x[4]) for x in metadata) * frame_shift_ms / 3600
        print("Loaded metadata for {} examples ({:.2f} hours)".format(len(metadata), hours))

    # Split the records into fixed-size batches.
    batch_size = hparams.tacotron_synthesis_batch_size
    metadata = [metadata[i: i + batch_size] for i in
                range(0, len(metadata), batch_size)]
    # Keep only full batches: drop a trailing *partial* batch, if any.
    # (The previous code dropped the last batch unconditionally, silently
    # discarding up to batch_size utterances even when the batch was full.)
    if metadata and len(metadata[-1]) != batch_size:
        metadata = metadata[:-1]

    print("Starting Synthesis")
    mel_dir = os.path.join(in_dir, "mels")
    embed_dir = os.path.join(in_dir, "embeds")
    meta_out_fpath = os.path.join(out_dir, "synthesized.txt")
    with open(meta_out_fpath, "w") as file:
        for i, meta in enumerate(tqdm(metadata)):
            texts = [m[5] for m in meta]
            mel_filenames = [os.path.join(mel_dir, m[1]) for m in meta]
            embed_filenames = [os.path.join(embed_dir, m[2]) for m in meta]
            # Basename: strip the .npy extension and the "mel-" prefix.
            basenames = [os.path.basename(m).replace(".npy", "").replace("mel-", "")
                         for m in mel_filenames]
            synth.synthesize(texts, basenames, synth_dir, None, mel_filenames, embed_filenames)
            # Echo the processed records so the vocoder knows what was synthesized.
            for elems in meta:
                file.write("|".join([str(x) for x in elems]) + "\n")
    print("Synthesized mel spectrograms at {}".format(synth_dir))
    return meta_out_fpath