diff --git a/models/synthesizer/preprocess.py b/models/synthesizer/preprocess.py
index 93705d1..cede25c 100644
--- a/models/synthesizer/preprocess.py
+++ b/models/synthesizer/preprocess.py
@@ -39,6 +39,9 @@ data_info = {
     }
 }
 
+def should_skip(fpath: Path, skip_existing: bool) -> bool:
+    return skip_existing and fpath.exists()
+
 def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
                        skip_existing: bool, hparams, no_alignments: bool,
                        dataset: str, emotion_extract = False, encoder_model_fpath=None):
@@ -99,7 +102,7 @@ def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
     print("Max mel frames length: %d" % max(int(m[4]) for m in metadata))
     print("Max audio timesteps length: %d" % max(int(m[3]) for m in metadata))
 
-def embed_utterance(fpaths, encoder_model_fpath):
+def _embed_utterance(fpaths, encoder_model_fpath: Path):
     if not encoder.is_loaded():
         encoder.load_model(encoder_model_fpath)
 
@@ -110,15 +113,13 @@ def embed_utterance(fpaths, encoder_model_fpath):
     embed = encoder.embed_utterance(wav)
     np.save(embed_fpath, embed, allow_pickle=False)
 
-def _emo_extract_from_utterance(fpaths, hparams, skip_existing=False):
-    if skip_existing and fpaths.exists():
-        return
+def _emo_extract_from_utterance(fpaths, hparams):
     wav_fpath, emo_fpath = fpaths
     wav = np.load(wav_fpath)
     emo = extract_emo(np.expand_dims(wav, 0), hparams.sample_rate, True)
     np.save(emo_fpath, emo.squeeze(0), allow_pickle=False)
 
-def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_processes: int):
+def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_processes: int, skip_existing: bool):
     wav_dir = synthesizer_root.joinpath("audio")
     metadata_fpath = synthesizer_root.joinpath("train.txt")
     assert wav_dir.exists() and metadata_fpath.exists()
@@ -128,11 +129,11 @@ def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_proce
     # Gather the input wave filepath and the target output embed filepath
     with metadata_fpath.open("r", encoding="utf-8") as metadata_file:
         metadata = [line.split("|") for line in metadata_file]
-        fpaths = [(wav_dir.joinpath(m[0]), embed_dir.joinpath(m[2])) for m in metadata]
-
+        fpaths = [(wav_dir.joinpath(m[0]), embed_dir.joinpath(m[2])) for m in metadata if not should_skip(embed_dir.joinpath(m[2]), skip_existing)]
+
     # TODO: improve on the multiprocessing, it's terrible. Disk I/O is the bottleneck here.
     # Embed the utterances in separate threads
-    func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)
+    func = partial(_embed_utterance, encoder_model_fpath=encoder_model_fpath)
     job = Pool(n_processes).imap(func, fpaths)
     tuple(tqdm(job, "Embedding", len(fpaths), unit="utterances"))
 
@@ -142,14 +143,14 @@ def create_emo(synthesizer_root: Path, n_processes: int, skip_existing: bool, hp
     assert wav_dir.exists() and metadata_fpath.exists()
     emo_dir = synthesizer_root.joinpath("emo")
     emo_dir.mkdir(exist_ok=True)
-
+
     # Gather the input wave filepath and the target output embed filepath
     with metadata_fpath.open("r", encoding="utf-8") as metadata_file:
         metadata = [line.split("|") for line in metadata_file]
-        fpaths = [(wav_dir.joinpath(m[0]), emo_dir.joinpath(m[0].replace("audio-", "emo-"))) for m in metadata]
+        fpaths = [(wav_dir.joinpath(m[0]), emo_dir.joinpath(m[0].replace("audio-", "emo-"))) for m in metadata if not should_skip(emo_dir.joinpath(m[0].replace("audio-", "emo-")), skip_existing)]
 
     # TODO: improve on the multiprocessing, it's terrible. Disk I/O is the bottleneck here.
     # Embed the utterances in separate threads
-    func = partial(_emo_extract_from_utterance, hparams=hparams, skip_existing=skip_existing)
+    func = partial(_emo_extract_from_utterance, hparams=hparams)
     job = Pool(n_processes).imap(func, fpaths)
     tuple(tqdm(job, "Emo", len(fpaths), unit="utterances"))
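For reference, a minimal, self-contained sketch of how the new `should_skip` helper is meant to compose with the metadata filtering in `create_embeddings`. The toy metadata rows, directory names, and `skip_existing` value below are illustrative only, not taken from the repository.

```python
from pathlib import Path

def should_skip(fpath: Path, skip_existing: bool) -> bool:
    # Same helper as in the patch: skip only when asked to and the output exists.
    return skip_existing and fpath.exists()

# Toy metadata rows in the train.txt layout (audio name | mel name | embed name | ...).
metadata = [
    ["audio-a_00.npy", "mel-a_00.npy", "embed-a_00.npy"],
    ["audio-b_00.npy", "mel-b_00.npy", "embed-b_00.npy"],
]
wav_dir, embed_dir = Path("audio"), Path("embeds")
skip_existing = True

# Same filtering pattern as the patched create_embeddings: utterances whose
# embedding file already exists never reach the multiprocessing pool.
fpaths = [
    (wav_dir.joinpath(m[0]), embed_dir.joinpath(m[2]))
    for m in metadata
    if not should_skip(embed_dir.joinpath(m[2]), skip_existing)
]
print(fpaths)
```

Filtering up front also keeps the tqdm total (`len(fpaths)`) honest, since already-processed utterances no longer appear as instantaneous no-op jobs.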
diff --git a/models/synthesizer/preprocess_audio.py b/models/synthesizer/preprocess_audio.py
index ca2a880..a181380 100644
--- a/models/synthesizer/preprocess_audio.py
+++ b/models/synthesizer/preprocess_audio.py
@@ -45,7 +45,7 @@ def extract_emo(
     return y
 
 def _process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
-                       skip_existing: bool, hparams, encoder_model_fpath):
+                       mel_fpath: Path, wav_fpath: Path, hparams, encoder_model_fpath):
     ## FOR REFERENCE:
     # For you not to lose your head if you ever wish to change things here or implement your own
     # synthesizer.
@@ -58,13 +58,6 @@ def _process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
     # without extra padding. This means that you won't have an exact relation between the length
     # of the wav and of the mel spectrogram. See the vocoder data loader.
 
-    # Skip existing utterances if needed
-    mel_fpath = out_dir.joinpath("mels", "mel-%s.npy" % basename)
-    wav_fpath = out_dir.joinpath("audio", "audio-%s.npy" % basename)
-
-    if skip_existing and mel_fpath.exists() and wav_fpath.exists():
-        return None
-
     # Trim silence
     if hparams.trim_silence:
         if not encoder.is_loaded():
@@ -112,50 +105,28 @@ def _split_on_silences(wav_fpath, words, hparams):
 def preprocess_general(speaker_dir, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool, encoder_model_fpath: Path):
     metadata = []
     extensions = ("*.wav", "*.flac", "*.mp3")
-    if skip_existing:
-        for extension in extensions:
-            wav_fpath_list = speaker_dir.glob(extension)
-            # Iterate over each wav
-            for wav_fpath in wav_fpath_list:
-                words = dict_info.get(wav_fpath.name.split(".")[0])
-                if not words:
-                    words = dict_info.get(wav_fpath.name) # try with extension
-                if not words:
-                    print("no wordS")
-                    continue
-                sub_basename = "%s_%02d" % (wav_fpath.name, 0)
-
-                mel_fpath = out_dir.joinpath("mels", f"mel-{sub_basename}.npy")
-                wav_fpath_ = out_dir.joinpath("audio", f"audio-{sub_basename}.npy")
-
-                if mel_fpath.exists() and wav_fpath_.exists():
-                    continue
-                wav, text = _split_on_silences(wav_fpath, words, hparams)
-                result = _process_utterance(wav, text, out_dir, sub_basename,
-                                            False, hparams, encoder_model_fpath) # accelarate
-                if result is None:
-                    continue
-                wav_fpath_name, mel_fpath_name, embed_fpath_name, wav, mel_frames, text = result
-                metadata.append ((wav_fpath_name, mel_fpath_name, embed_fpath_name, len(wav), mel_frames, text))
-    else:
-        for extension in extensions:
-            wav_fpath_list = speaker_dir.glob(extension)
-            # Iterate over each wav
-            for wav_fpath in wav_fpath_list:
-                words = dict_info.get(wav_fpath.name.split(".")[0])
-                if not words:
-                    words = dict_info.get(wav_fpath.name) # try with extension
-                if not words:
-                    print("no wordS")
-                    continue
-                sub_basename = "%s_%02d" % (wav_fpath.name, 0)
-
-                wav, text = _split_on_silences(wav_fpath, words, hparams)
-                result = _process_utterance(wav, text, out_dir, sub_basename,
-                                            False, hparams, encoder_model_fpath)
-                if result is None:
-                    continue
-                wav_fpath_name, mel_fpath_name, embed_fpath_name, wav, mel_frames, text = result
-                metadata.append ((wav_fpath_name, mel_fpath_name, embed_fpath_name, len(wav), mel_frames, text))
+    for extension in extensions:
+        wav_fpath_list = speaker_dir.glob(extension)
+        # Iterate over each wav
+        for wav_fpath in wav_fpath_list:
+            words = dict_info.get(wav_fpath.name.split(".")[0])
+            if not words:
+                words = dict_info.get(wav_fpath.name) # try with extension
+            if not words:
+                print(f"No word found in dict_info for {wav_fpath.name}, skip it")
+                continue
+            sub_basename = "%s_%02d" % (wav_fpath.name, 0)
+            mel_fpath = out_dir.joinpath("mels", f"mel-{sub_basename}.npy")
+            wav_fpath_ = out_dir.joinpath("audio", f"audio-{sub_basename}.npy")
+
+            if skip_existing and mel_fpath.exists() and wav_fpath_.exists():
+                continue
+            wav, text = _split_on_silences(wav_fpath, words, hparams)
+            result = _process_utterance(wav, text, out_dir, sub_basename,
+                                        mel_fpath, wav_fpath_, hparams, encoder_model_fpath)
+            if result is None:
+                continue
+            wav_fpath_name, mel_fpath_name, embed_fpath_name, wav, mel_frames, text = result
+            metadata.append((wav_fpath_name, mel_fpath_name, embed_fpath_name, len(wav), mel_frames, text))
 
     return metadata
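To make the intent of the rewritten `preprocess_general` easier to follow, here is a reduced sketch of the single unified loop. `load_and_split` and `compute_features` are hypothetical stubs standing in for `_split_on_silences` and `_process_utterance`, included only so the sketch runs on its own.

```python
from pathlib import Path
from typing import Optional, Tuple

# Hypothetical stubs standing in for _split_on_silences / _process_utterance.
def load_and_split(wav_fpath: Path) -> Tuple[list, str]:
    return [0.0] * 16000, "placeholder text"

def compute_features(wav, text, mel_fpath: Path, wav_fpath: Path) -> Optional[tuple]:
    return (wav_fpath.name, mel_fpath.name, "embed-placeholder.npy", len(wav), 80, text)

def process_speaker(wav_paths, out_dir: Path, skip_existing: bool) -> list:
    """One loop for both modes: resolve the output paths first, skip finished
    utterances if asked to, and only then do the expensive audio work."""
    metadata = []
    for wav_fpath in wav_paths:
        sub_basename = "%s_%02d" % (wav_fpath.name, 0)
        mel_fpath = out_dir.joinpath("mels", f"mel-{sub_basename}.npy")
        out_wav_fpath = out_dir.joinpath("audio", f"audio-{sub_basename}.npy")

        # The skip-existing decision lives here, before any audio is read, so
        # the feature-extraction step no longer needs a skip_existing flag.
        if skip_existing and mel_fpath.exists() and out_wav_fpath.exists():
            continue

        wav, text = load_and_split(wav_fpath)
        result = compute_features(wav, text, mel_fpath, out_wav_fpath)
        if result is not None:
            metadata.append(result)
    return metadata

print(process_speaker([Path("sample.wav")], Path("out"), skip_existing=True))
```

Note that the output path deliberately gets its own name (`out_wav_fpath` here, `wav_fpath_` in the patched loop) so it does not shadow the source file handed to `_split_on_silences`.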
diff --git a/pre.py b/pre.py
index d2571c4..a8e5f80 100644
--- a/pre.py
+++ b/pre.py
@@ -71,7 +71,7 @@ if __name__ == "__main__":
     del args.n_processes_embed
     preprocess_dataset(**vars(args))
 
-    create_embeddings(synthesizer_root=args.out_dir, n_processes=n_processes_embed, encoder_model_fpath=encoder_model_fpath)
+    create_embeddings(synthesizer_root=args.out_dir, n_processes=n_processes_embed, encoder_model_fpath=encoder_model_fpath, skip_existing=args.skip_existing)
 
     if args.emotion_extract:
         create_emo(synthesizer_root=args.out_dir, n_processes=n_processes_embed, skip_existing=args.skip_existing, hparams=args.hparams)
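Finally, a hedged sketch of how the flag travels from the command line into all three stages after this change. The option names and defaults below are assumptions for illustration; the authoritative definitions live in pre.py's argparse setup.

```python
import argparse
from pathlib import Path

# Assumed option spelling for illustration only; check pre.py for the real flags.
parser = argparse.ArgumentParser()
parser.add_argument("datasets_root", type=Path)
parser.add_argument("-o", "--out_dir", type=Path, default=Path("synthesizer"))
parser.add_argument("-s", "--skip_existing", action="store_true",
                    help="Skip utterances whose preprocessed outputs already exist.")
parser.add_argument("--n_processes_embed", type=int, default=1)
parser.add_argument("--emotion_extract", action="store_true")
args = parser.parse_args(["./datasets", "-s"])

# After this patch, the same value reaches every stage (calls shown as comments,
# matching the signatures in the diffs above):
#   preprocess_dataset(..., skip_existing=args.skip_existing, ...)
#   create_embeddings(synthesizer_root=args.out_dir, n_processes=args.n_processes_embed,
#                     encoder_model_fpath=encoder_model_fpath, skip_existing=args.skip_existing)
#   create_emo(synthesizer_root=args.out_dir, n_processes=args.n_processes_embed,
#              skip_existing=args.skip_existing, hparams=args.hparams)
print(vars(args))
```

With this wiring, an interrupted preprocessing run can be resumed by re-issuing the same command with the skip flag: already-written mels, audio, embeddings, and emotion features are detected on disk and left alone.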