From 856793c9bd4fa19f28e974e8a928ce1b62529bd4 Mon Sep 17 00:00:00 2001
From: babysor00
Date: Wed, 11 Aug 2021 23:33:43 +0800
Subject: [PATCH] Refactor the synthesizer preprocessor to prepare for
 supporting more datasets

---
 synthesizer/preprocess.py         | 303 ++----------------------------
 synthesizer/preprocess_speaker.py | 234 +++++++++++++++++++++++
 synthesizer_preprocess_audio.py   |  19 +-
 3 files changed, 261 insertions(+), 295 deletions(-)
 create mode 100644 synthesizer/preprocess_speaker.py

diff --git a/synthesizer/preprocess.py b/synthesizer/preprocess.py
index 29ac298..b40ef7f 100644
--- a/synthesizer/preprocess.py
+++ b/synthesizer/preprocess.py
@@ -1,31 +1,27 @@
 from multiprocessing.pool import Pool
-from synthesizer import audio
+
 from functools import partial
 from itertools import chain
-from encoder import inference as encoder
 from pathlib import Path
-from utils import logmmse
 from tqdm import tqdm
 import numpy as np
-import librosa
-import platform
-from pypinyin import Style
-from pypinyin.contrib.neutral_tone import NeutralToneWith5Mixin
-from pypinyin.converter import DefaultConverter
-from pypinyin.core import Pinyin
-
-class PinyinConverter(NeutralToneWith5Mixin, DefaultConverter):
-    pass
-
-pinyin = Pinyin(PinyinConverter()).pinyin
+from encoder import inference as encoder
+from synthesizer.preprocess_speaker import preprocess_speaker_aidatatang_200zh
 
+data_info = {
+    "aidatatang_200zh": {
+        "subfolders": ["corpus/train"],
+        "speak_func": preprocess_speaker_aidatatang_200zh
+    }
+    # TODO add more
+}
 
 def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
                        skip_existing: bool, hparams, no_alignments: bool,
-                       datasets_name: str, subfolders: str):
+                       dataset: str):
     # Gather the input directories
-    dataset_root = datasets_root.joinpath(datasets_name)
-    input_dirs = [dataset_root.joinpath(subfolder.strip()) for subfolder in subfolders.split(",")]
+    dataset_root = datasets_root.joinpath(dataset)
+    input_dirs = [dataset_root.joinpath(subfolder.strip()) for subfolder in data_info[dataset]["subfolders"]]
     print("\n    ".join(map(str, ["Using data from:"] + input_dirs)))
     assert all(input_dir.exists() for input_dir in input_dirs)
 
@@ -39,10 +35,10 @@ def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
 
     # Preprocess the dataset
     speaker_dirs = list(chain.from_iterable(input_dir.glob("*") for input_dir in input_dirs))
-    func = partial(preprocess_speaker, out_dir=out_dir, skip_existing=skip_existing,
-                   hparams=hparams, no_alignments=no_alignments)
+    func = partial(data_info[dataset]["speak_func"], out_dir=out_dir, skip_existing=skip_existing,
+                   hparams=hparams, directory=dataset_root, no_alignments=no_alignments)
     job = Pool(n_processes).imap(func, speaker_dirs)
-    for speaker_metadata in tqdm(job, datasets_name, len(speaker_dirs), unit="speakers"):
+    for speaker_metadata in tqdm(job, dataset, len(speaker_dirs), unit="speakers"):
         for metadatum in speaker_metadata:
             metadata_file.write("|".join(str(x) for x in metadatum) + "\n")
     metadata_file.close()
@@ -60,183 +56,6 @@ def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
     print("Max mel frames length: %d" % max(int(m[4]) for m in metadata))
     print("Max audio timesteps length: %d" % max(int(m[3]) for m in metadata))
 
-
-def preprocess_speaker(speaker_dir, out_dir: Path, skip_existing: bool, hparams, no_alignments: bool):
-    metadata = []
-    for book_dir in speaker_dir.glob("*"):
-        if no_alignments:
-            # Gather the utterance audios and texts
-            # LibriTTS uses .wav but we will include extensions for compatibility with other datasets
-            extensions = ["*.wav", "*.flac", "*.mp3"]
-            for extension in extensions:
-                wav_fpaths = book_dir.glob(extension)
-
-                for wav_fpath in wav_fpaths:
-                    # Load the audio waveform
-                    wav, _ = librosa.load(str(wav_fpath), hparams.sample_rate)
-                    if hparams.rescale:
-                        wav = wav / np.abs(wav).max() * hparams.rescaling_max
-
-                    # Get the corresponding text
-                    # Check for .txt (for compatibility with other datasets)
-                    text_fpath = wav_fpath.with_suffix(".txt")
-                    if not text_fpath.exists():
-                        # Check for .normalized.txt (LibriTTS)
-                        text_fpath = wav_fpath.with_suffix(".normalized.txt")
-                        assert text_fpath.exists()
-                    with text_fpath.open("r") as text_file:
-                        text = "".join([line for line in text_file])
-                        text = text.replace("\"", "")
-                        text = text.strip()
-
-                    # Process the utterance
-                    metadata.append(process_utterance(wav, text, out_dir, str(wav_fpath.with_suffix("").name),
-                                                      skip_existing, hparams))
-        else:
-            # Process alignment file (LibriSpeech support)
-            # Gather the utterance audios and texts
-            try:
-                alignments_fpath = next(book_dir.glob("*.alignment.txt"))
-                with alignments_fpath.open("r") as alignments_file:
-                    alignments = [line.rstrip().split(" ") for line in alignments_file]
-            except StopIteration:
-                # A few alignment files will be missing
-                continue
-
-            # Iterate over each entry in the alignments file
-            for wav_fname, words, end_times in alignments:
-                wav_fpath = book_dir.joinpath(wav_fname + ".flac")
-                assert wav_fpath.exists()
-                words = words.replace("\"", "").split(",")
-                end_times = list(map(float, end_times.replace("\"", "").split(",")))
-
-                # Process each sub-utterance
-                wavs, texts = split_on_silences(wav_fpath, words, end_times, hparams)
-                for i, (wav, text) in enumerate(zip(wavs, texts)):
-                    sub_basename = "%s_%02d" % (wav_fname, i)
-                    metadata.append(process_utterance(wav, text, out_dir, sub_basename,
-                                                      skip_existing, hparams))
-
-    return [m for m in metadata if m is not None]
-
-
-def split_on_silences(wav_fpath, words, end_times, hparams):
-    # Load the audio waveform
-    wav, _ = librosa.load(str(wav_fpath), hparams.sample_rate)
-    if hparams.rescale:
-        wav = wav / np.abs(wav).max() * hparams.rescaling_max
-
-    words = np.array(words)
-    start_times = np.array([0.0] + end_times[:-1])
-    end_times = np.array(end_times)
-    assert len(words) == len(end_times) == len(start_times)
-    assert words[0] == "" and words[-1] == ""
-
-    # Find pauses that are too long
-    mask = (words == "") & (end_times - start_times >= hparams.silence_min_duration_split)
-    mask[0] = mask[-1] = True
-    breaks = np.where(mask)[0]
-
-    # Profile the noise from the silences and perform noise reduction on the waveform
-    silence_times = [[start_times[i], end_times[i]] for i in breaks]
-    silence_times = (np.array(silence_times) * hparams.sample_rate).astype(np.int)
-    noisy_wav = np.concatenate([wav[stime[0]:stime[1]] for stime in silence_times])
-    if len(noisy_wav) > hparams.sample_rate * 0.02:
-        profile = logmmse.profile_noise(noisy_wav, hparams.sample_rate)
-        wav = logmmse.denoise(wav, profile, eta=0)
-
-    # Re-attach segments that are too short
-    segments = list(zip(breaks[:-1], breaks[1:]))
-    segment_durations = [start_times[end] - end_times[start] for start, end in segments]
-    i = 0
-    while i < len(segments) and len(segments) > 1:
-        if segment_durations[i] < hparams.utterance_min_duration:
-            # See if the segment can be re-attached with the right or the left segment
-            left_duration = float("inf") if i == 0 else segment_durations[i - 1]
-            right_duration = float("inf") if i == len(segments) - 1 else segment_durations[i + 1]
-            joined_duration = segment_durations[i] + min(left_duration, right_duration)
-
-            # Do not re-attach if it causes the joined utterance to be too long
-            if joined_duration > hparams.hop_size * hparams.max_mel_frames / hparams.sample_rate:
-                i += 1
-                continue
-
-            # Re-attach the segment with the neighbour of shortest duration
-            j = i - 1 if left_duration <= right_duration else i
-            segments[j] = (segments[j][0], segments[j + 1][1])
-            segment_durations[j] = joined_duration
-            del segments[j + 1], segment_durations[j + 1]
-        else:
-            i += 1
-
-    # Split the utterance
-    segment_times = [[end_times[start], start_times[end]] for start, end in segments]
-    segment_times = (np.array(segment_times) * hparams.sample_rate).astype(np.int)
-    wavs = [wav[segment_time[0]:segment_time[1]] for segment_time in segment_times]
-    texts = [" ".join(words[start + 1:end]).replace("  ", " ") for start, end in segments]
-
-    # # DEBUG: play the audio segments (run with -n=1)
-    # import sounddevice as sd
-    # if len(wavs) > 1:
-    #     print("This sentence was split in %d segments:" % len(wavs))
-    # else:
-    #     print("There are no silences long enough for this sentence to be split:")
-    # for wav, text in zip(wavs, texts):
-    #     # Pad the waveform with 1 second of silence because sounddevice tends to cut them early
-    #     # when playing them. You shouldn't need to do that in your parsers.
-    #     wav = np.concatenate((wav, [0] * 16000))
-    #     print("\t%s" % text)
-    #     sd.play(wav, 16000, blocking=True)
-    # print("")
-
-    return wavs, texts
-
-
-def process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
-                      skip_existing: bool, hparams):
-    ## FOR REFERENCE:
-    # For you not to lose your head if you ever wish to change things here or implement your own
-    # synthesizer.
-    # - Both the audios and the mel spectrograms are saved as numpy arrays
-    # - There is no processing done to the audios that will be saved to disk beyond volume
-    #   normalization (in split_on_silences)
-    # - However, pre-emphasis is applied to the audios before computing the mel spectrogram. This
-    #   is why we re-apply it on the audio on the side of the vocoder.
-    # - Librosa pads the waveform before computing the mel spectrogram. Here, the waveform is saved
-    #   without extra padding. This means that you won't have an exact relation between the length
-    #   of the wav and of the mel spectrogram. See the vocoder data loader.
-
-
-    # Skip existing utterances if needed
-    mel_fpath = out_dir.joinpath("mels", "mel-%s.npy" % basename)
-    wav_fpath = out_dir.joinpath("audio", "audio-%s.npy" % basename)
-    if skip_existing and mel_fpath.exists() and wav_fpath.exists():
-        return None
-
-    # Trim silence
-    if hparams.trim_silence:
-        wav = encoder.preprocess_wav(wav, normalize=False, trim_silence=True)
-
-    # Skip utterances that are too short
-    if len(wav) < hparams.utterance_min_duration * hparams.sample_rate:
-        return None
-
-    # Compute the mel spectrogram
-    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
-    mel_frames = mel_spectrogram.shape[1]
-
-    # Skip utterances that are too long
-    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
-        return None
-
-    # Write the spectrogram, embed and audio to disk
-    np.save(mel_fpath, mel_spectrogram.T, allow_pickle=False)
-    np.save(wav_fpath, wav, allow_pickle=False)
-
-    # Return a tuple describing this training example
-    return wav_fpath.name, mel_fpath.name, "embed-%s.npy" % basename, len(wav), mel_frames, text
-
-
 def embed_utterance(fpaths, encoder_model_fpath):
     if not encoder.is_loaded():
         encoder.load_model(encoder_model_fpath)
@@ -266,93 +85,3 @@ def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_proce
     func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)
     job = Pool(n_processes).imap(func, fpaths)
     list(tqdm(job, "Embedding", len(fpaths), unit="utterances"))
-
-# aidatatang_200zh
-def preprocess_aidatatang_200zh(datasets_root: Path, out_dir: Path, n_processes: int,
-                       skip_existing: bool, hparams, no_alignments: bool, datasets_name=None, subfolders=None):
-    # Gather the input directories
-    dataset_root = datasets_root.joinpath("aidatatang_200zh")
-
-    dict_info = {}
-    transcript_dirs = dataset_root.joinpath("transcript/aidatatang_200_zh_transcript.txt")
-    with open(transcript_dirs,"rb") as fp:
-        dict_transcript = [v.decode() for v in fp]
-
-    for v in dict_transcript:
-        if not v:
-            continue
-        v = v.strip().replace("\n","").split(" ")
-        dict_info[v[0]] = " ".join(v[1:])
-
-    input_dirs = [dataset_root.joinpath("corpus/train")]
-    print("\n    ".join(map(str, ["Using data from:"] + input_dirs)))
-    assert all(input_dir.exists() for input_dir in input_dirs)
-
-    # Create the output directories for each output file type
-    out_dir.joinpath("mels").mkdir(exist_ok=True)
-    out_dir.joinpath("audio").mkdir(exist_ok=True)
-
-    # Create a metadata file
-    metadata_fpath = out_dir.joinpath("train.txt")
-    metadata_file = metadata_fpath.open("a" if skip_existing else "w", encoding="utf-8")
-
-    # Preprocess the dataset
-    speaker_dirs = list(chain.from_iterable(input_dir.glob("*") for input_dir in input_dirs))
-    func = partial(preprocess_speaker_aidatatang_200zh, out_dir=out_dir, skip_existing=skip_existing,
-                   hparams=hparams, dict_info=dict_info, no_alignments=no_alignments)
-    job = Pool(n_processes).imap(func, speaker_dirs)
-    for speaker_metadata in tqdm(job, "aidatatang_200zh", len(speaker_dirs), unit="speakers"):
-        for metadatum in speaker_metadata:
-            metadata_file.write("|".join(str(x) for x in metadatum) + "\n")
-    metadata_file.close()
-
-    # Verify the contents of the metadata file
-    with metadata_fpath.open("r", encoding="utf-8") as metadata_file:
-        metadata = [line.split("|") for line in metadata_file]
-        mel_frames = sum([int(m[4]) for m in metadata])
-        timesteps = sum([int(m[3]) for m in metadata])
-        sample_rate = hparams.sample_rate
-        hours = (timesteps / sample_rate) / 3600
-        print("The dataset consists of %d utterances, %d mel frames, %d audio timesteps (%.2f hours)." %
-              (len(metadata), mel_frames, timesteps, hours))
-        print("Max input length (text chars): %d" % max(len(m[5]) for m in metadata))
-        print("Max mel frames length: %d" % max(int(m[4]) for m in metadata))
-        print("Max audio timesteps length: %d" % max(int(m[3]) for m in metadata))
-
-def preprocess_speaker_aidatatang_200zh(speaker_dir, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool):
-    metadata = []
-    if platform.system() == "Windows":
-        split = "\\"
-    else:
-        split = "/"
-    # for book_dir in speaker_dir.glob("*"):
-    # Gather the utterance audios and texts
-
-    for wav_fpath in speaker_dir.glob("*.wav"):
-        # D:\dataset\data_aishell\wav\train\S0002\BAC009S0002W0122.wav
-
-        # Process each sub-utterance
-
-        name = str(wav_fpath).split(split)[-1]
-        key = name.split(".")[0]
-        words = dict_info.get(key)
-        if not words:
-            continue
-        sub_basename = "%s_%02d" % (name, 0)
-        wav, text = split_on_silences_aidatatang_200zh(wav_fpath, words, hparams)
-        metadata.append(process_utterance(wav, text, out_dir, sub_basename,
-                                          skip_existing, hparams))
-
-    return [m for m in metadata if m is not None]
-
-def split_on_silences_aidatatang_200zh(wav_fpath, words, hparams):
-    # Load the audio waveform
-    wav, _ = librosa.load(wav_fpath, hparams.sample_rate)
-    wav = librosa.effects.trim(wav, top_db= 40, frame_length=2048, hop_length=512)[0]
-    if hparams.rescale:
-        wav = wav / np.abs(wav).max() * hparams.rescaling_max
-    resp = pinyin(words, style=Style.TONE3)
-    res = [v[0] for v in resp if v[0].strip()]
-    res = " ".join(res)
-
-    return wav, res
\ No newline at end of file
diff --git a/synthesizer/preprocess_speaker.py b/synthesizer/preprocess_speaker.py
new file mode 100644
index 0000000..291b554
--- /dev/null
+++ b/synthesizer/preprocess_speaker.py
@@ -0,0 +1,234 @@
+import platform
+import librosa
+import numpy as np
+
+from encoder import inference as encoder
+from utils import logmmse
+from synthesizer import audio
+from pathlib import Path
+from pypinyin import Style
+from pypinyin.contrib.neutral_tone import NeutralToneWith5Mixin
+from pypinyin.converter import DefaultConverter
+from pypinyin.core import Pinyin
+
+class PinyinConverter(NeutralToneWith5Mixin, DefaultConverter):
+    pass
+
+pinyin = Pinyin(PinyinConverter()).pinyin
+
+
+def _process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
+                       skip_existing: bool, hparams):
+    ## FOR REFERENCE:
+    # For you not to lose your head if you ever wish to change things here or implement your own
+    # synthesizer.
+    # - Both the audios and the mel spectrograms are saved as numpy arrays
+    # - There is no processing done to the audios that will be saved to disk beyond volume
+    #   normalization (in split_on_silences)
+    # - However, pre-emphasis is applied to the audios before computing the mel spectrogram. This
+    #   is why we re-apply it on the audio on the side of the vocoder.
+    # - Librosa pads the waveform before computing the mel spectrogram. Here, the waveform is saved
+    #   without extra padding. This means that you won't have an exact relation between the length
+    #   of the wav and of the mel spectrogram. See the vocoder data loader.
+
+
+    # Skip existing utterances if needed
+    mel_fpath = out_dir.joinpath("mels", "mel-%s.npy" % basename)
+    wav_fpath = out_dir.joinpath("audio", "audio-%s.npy" % basename)
+    if skip_existing and mel_fpath.exists() and wav_fpath.exists():
+        return None
+
+    # Trim silence
+    if hparams.trim_silence:
+        wav = encoder.preprocess_wav(wav, normalize=False, trim_silence=True)
+
+    # Skip utterances that are too short
+    if len(wav) < hparams.utterance_min_duration * hparams.sample_rate:
+        return None
+
+    # Compute the mel spectrogram
+    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
+    mel_frames = mel_spectrogram.shape[1]
+
+    # Skip utterances that are too long
+    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
+        return None
+
+    # Write the spectrogram, embed and audio to disk
+    np.save(mel_fpath, mel_spectrogram.T, allow_pickle=False)
+    np.save(wav_fpath, wav, allow_pickle=False)
+
+    # Return a tuple describing this training example
+    return wav_fpath.name, mel_fpath.name, "embed-%s.npy" % basename, len(wav), mel_frames, text
+
+
+def _split_on_silences_aidatatang_200zh(wav_fpath, words, hparams):
+    # Load the audio waveform
+    wav, _ = librosa.load(wav_fpath, hparams.sample_rate)
+    wav = librosa.effects.trim(wav, top_db= 40, frame_length=2048, hop_length=512)[0]
+    if hparams.rescale:
+        wav = wav / np.abs(wav).max() * hparams.rescaling_max
+    resp = pinyin(words, style=Style.TONE3)
+    res = [v[0] for v in resp if v[0].strip()]
+    res = " ".join(res)
+
+    return wav, res
+
+
+def preprocess_speaker_aidatatang_200zh(speaker_dir, out_dir: Path, skip_existing: bool, hparams, directory, no_alignments: bool):
+    dict_info = {}
+    transcript_dirs = directory.joinpath("transcript/aidatatang_200_zh_transcript.txt")
+    with open(transcript_dirs,"rb") as fp:
+        dict_transcript = [v.decode() for v in fp]
+    for v in dict_transcript:
+        if not v:
+            continue
+        v = v.strip().replace("\n","").split(" ")
+        dict_info[v[0]] = " ".join(v[1:])
+
+    metadata = []
+    if platform.system() == "Windows":
+        split = "\\"
+    else:
+        split = "/"
+    for wav_fpath in speaker_dir.glob("*.wav"):
+        name = str(wav_fpath).split(split)[-1]
+        key = name.split(".")[0]
+        words = dict_info.get(key)
+        if not words:
+            continue
+        sub_basename = "%s_%02d" % (name, 0)
+        wav, text = _split_on_silences_aidatatang_200zh(wav_fpath, words, hparams)
+        metadata.append(_process_utterance(wav, text, out_dir, sub_basename,
+                                           skip_existing, hparams))
+    return [m for m in metadata if m is not None]
+
+def preprocess_speaker(speaker_dir, out_dir: Path, skip_existing: bool, hparams, no_alignments: bool):
+    metadata = []
+    for book_dir in speaker_dir.glob("*"):
+        if no_alignments:
+            # Gather the utterance audios and texts
+            # LibriTTS uses .wav but we will include extensions for compatibility with other datasets
+            extensions = ["*.wav", "*.flac", "*.mp3"]
+            for extension in extensions:
+                wav_fpaths = book_dir.glob(extension)
+
+                for wav_fpath in wav_fpaths:
+                    # Load the audio waveform
+                    wav, _ = librosa.load(str(wav_fpath), hparams.sample_rate)
+                    if hparams.rescale:
+                        wav = wav / np.abs(wav).max() * hparams.rescaling_max
+
+                    # Get the corresponding text
+                    # Check for .txt (for compatibility with other datasets)
+                    text_fpath = wav_fpath.with_suffix(".txt")
+                    if not text_fpath.exists():
+                        # Check for .normalized.txt (LibriTTS)
+                        text_fpath = wav_fpath.with_suffix(".normalized.txt")
+                        assert text_fpath.exists()
+                    with text_fpath.open("r") as text_file:
+                        text = "".join([line for line in text_file])
+                        text = text.replace("\"", "")
+                        text = text.strip()
+
+                    # Process the utterance
+                    metadata.append(_process_utterance(wav, text, out_dir, str(wav_fpath.with_suffix("").name),
+                                                       skip_existing, hparams))
+        else:
+            # Process alignment file (LibriSpeech support)
+            # Gather the utterance audios and texts
+            try:
+                alignments_fpath = next(book_dir.glob("*.alignment.txt"))
+                with alignments_fpath.open("r") as alignments_file:
+                    alignments = [line.rstrip().split(" ") for line in alignments_file]
+            except StopIteration:
+                # A few alignment files will be missing
+                continue
+
+            # Iterate over each entry in the alignments file
+            for wav_fname, words, end_times in alignments:
+                wav_fpath = book_dir.joinpath(wav_fname + ".flac")
+                assert wav_fpath.exists()
+                words = words.replace("\"", "").split(",")
+                end_times = list(map(float, end_times.replace("\"", "").split(",")))
+
+                # Process each sub-utterance
+                wavs, texts = _split_on_silences(wav_fpath, words, end_times, hparams)
+                for i, (wav, text) in enumerate(zip(wavs, texts)):
+                    sub_basename = "%s_%02d" % (wav_fname, i)
+                    metadata.append(_process_utterance(wav, text, out_dir, sub_basename,
+                                                       skip_existing, hparams))
+
+    return [m for m in metadata if m is not None]
+
+# TODO: use original split func
+def _split_on_silences(wav_fpath, words, end_times, hparams):
+    # Load the audio waveform
+    wav, _ = librosa.load(str(wav_fpath), hparams.sample_rate)
+    if hparams.rescale:
+        wav = wav / np.abs(wav).max() * hparams.rescaling_max
+
+    words = np.array(words)
+    start_times = np.array([0.0] + end_times[:-1])
+    end_times = np.array(end_times)
+    assert len(words) == len(end_times) == len(start_times)
+    assert words[0] == "" and words[-1] == ""
+
+    # Find pauses that are too long
+    mask = (words == "") & (end_times - start_times >= hparams.silence_min_duration_split)
+    mask[0] = mask[-1] = True
+    breaks = np.where(mask)[0]
+
+    # Profile the noise from the silences and perform noise reduction on the waveform
+    silence_times = [[start_times[i], end_times[i]] for i in breaks]
+    silence_times = (np.array(silence_times) * hparams.sample_rate).astype(np.int)
+    noisy_wav = np.concatenate([wav[stime[0]:stime[1]] for stime in silence_times])
+    if len(noisy_wav) > hparams.sample_rate * 0.02:
+        profile = logmmse.profile_noise(noisy_wav, hparams.sample_rate)
+        wav = logmmse.denoise(wav, profile, eta=0)
+
+    # Re-attach segments that are too short
+    segments = list(zip(breaks[:-1], breaks[1:]))
+    segment_durations = [start_times[end] - end_times[start] for start, end in segments]
+    i = 0
+    while i < len(segments) and len(segments) > 1:
+        if segment_durations[i] < hparams.utterance_min_duration:
+            # See if the segment can be re-attached with the right or the left segment
+            left_duration = float("inf") if i == 0 else segment_durations[i - 1]
+            right_duration = float("inf") if i == len(segments) - 1 else segment_durations[i + 1]
+            joined_duration = segment_durations[i] + min(left_duration, right_duration)
+
+            # Do not re-attach if it causes the joined utterance to be too long
+            if joined_duration > hparams.hop_size * hparams.max_mel_frames / hparams.sample_rate:
+                i += 1
+                continue
+
+            # Re-attach the segment with the neighbour of shortest duration
+            j = i - 1 if left_duration <= right_duration else i
+            segments[j] = (segments[j][0], segments[j + 1][1])
+            segment_durations[j] = joined_duration
+            del segments[j + 1], segment_durations[j + 1]
+        else:
+            i += 1
+
+    # Split the utterance
+    segment_times = [[end_times[start], start_times[end]] for start, end in segments]
+    segment_times = (np.array(segment_times) * hparams.sample_rate).astype(np.int)
+    wavs = [wav[segment_time[0]:segment_time[1]] for segment_time in segment_times]
+    texts = [" ".join(words[start + 1:end]).replace("  ", " ") for start, end in segments]
+
+    # # DEBUG: play the audio segments (run with -n=1)
+    # import sounddevice as sd
+    # if len(wavs) > 1:
+    #     print("This sentence was split in %d segments:" % len(wavs))
+    # else:
+    #     print("There are no silences long enough for this sentence to be split:")
+    # for wav, text in zip(wavs, texts):
+    #     # Pad the waveform with 1 second of silence because sounddevice tends to cut them early
+    #     # when playing them. You shouldn't need to do that in your parsers.
+    #     wav = np.concatenate((wav, [0] * 16000))
+    #     print("\t%s" % text)
+    #     sd.play(wav, 16000, blocking=True)
+    # print("")
+
+    return wavs, texts
\ No newline at end of file
diff --git a/synthesizer_preprocess_audio.py b/synthesizer_preprocess_audio.py
index 436609e..501c13f 100644
--- a/synthesizer_preprocess_audio.py
+++ b/synthesizer_preprocess_audio.py
@@ -1,10 +1,15 @@
-from synthesizer.preprocess import preprocess_aidatatang_200zh
+from synthesizer.preprocess import preprocess_dataset
 from synthesizer.hparams import hparams
 from utils.argutils import print_args
 from pathlib import Path
 import argparse
 
+recognized_datasets = [
+    "aidatatang_200zh",
+    "SLR68",
+]
+
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
         description="Preprocesses audio files from datasets, encodes them as mel spectrograms "
@@ -29,16 +34,14 @@ if __name__ == "__main__":
     parser.add_argument("--no_alignments", action="store_true", help=\
        "Use this option when dataset does not include alignments\
        (these are used to split long audio files into sub-utterances.)")
-    parser.add_argument("--datasets_name", type=str, default="LibriSpeech", help=\
-        "Name of the dataset directory to process.")
-    parser.add_argument("--subfolders", type=str, default="train-clean-100, train-clean-360", help=\
-        "Comma-separated list of subfolders to process inside your dataset directory")
+    parser.add_argument("--dataset", type=str, default="aidatatang_200zh", help=\
+        "Name of the dataset to process.")
     args = parser.parse_args()
 
     # Process the arguments
     if not hasattr(args, "out_dir"):
        args.out_dir = args.datasets_root.joinpath("SV2TTS", "synthesizer")
-
+    assert args.dataset in recognized_datasets, 'unsupported dataset'
     # Create directories
     assert args.datasets_root.exists()
     args.out_dir.mkdir(exist_ok=True, parents=True)
@@ -56,5 +59,5 @@ if __name__ == "__main__":
     # Preprocess the dataset
     print_args(args, parser)
     args.hparams = hparams.parse(args.hparams)
-    # preprocess_dataset(**vars(args))
-    preprocess_aidatatang_200zh(**vars(args))
+
+    preprocess_dataset(**vars(args))
\ No newline at end of file
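Usage note (not part of the patch itself): below is a minimal sketch of how the refactored entry point is expected to be driven after this change, mirroring what synthesizer_preprocess_audio.py does. The datasets_root path and n_processes value are assumptions for illustration, and the aidatatang_200zh corpus is assumed to live under datasets/aidatatang_200zh/corpus/train.

    from pathlib import Path

    from synthesizer.hparams import hparams
    from synthesizer.preprocess import data_info, preprocess_dataset

    # Assumed local layout; adjust to your own paths.
    datasets_root = Path("datasets")
    out_dir = datasets_root.joinpath("SV2TTS", "synthesizer")
    out_dir.mkdir(exist_ok=True, parents=True)

    # Each supported dataset is now a key in the data_info registry, mapping to
    # its subfolders and the per-speaker preprocessing function to run.
    assert "aidatatang_200zh" in data_info

    preprocess_dataset(datasets_root=datasets_root,
                       out_dir=out_dir,
                       n_processes=4,
                       skip_existing=False,
                       hparams=hparams,
                       no_alignments=False,
                       dataset="aidatatang_200zh")

Supporting a further dataset then amounts to adding an entry to data_info (its subfolders plus a speak_func) and implementing the corresponding preprocess_speaker_* function in synthesizer/preprocess_speaker.py.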