Skip embedding (#950)

* Skip embedding

* Skip earlier

* Remove unused parameter

* Pass param
main
Vega 2023-09-05 23:15:04 +08:00 committed by GitHub
parent 1862d2145b
commit 156723e37c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 35 additions and 63 deletions

View File

@ -39,6 +39,9 @@ data_info = {
} }
} }
def should_skip(fpath: Path, skip_existing: bool) -> bool:
    """Tell whether the work that produces ``fpath`` can be skipped.

    Args:
        fpath: Target output file; a ``Path`` or a plain path string.
        skip_existing: Whether outputs already present on disk should be
            left untouched.

    Returns:
        ``True`` only when ``skip_existing`` is truthy and ``fpath`` exists
        on disk; ``False`` otherwise.
    """
    # Check the flag first so the filesystem is not touched when skipping is
    # disabled; bool() keeps the result a real bool even if the flag is None.
    return bool(skip_existing) and Path(fpath).exists()
def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int, def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
skip_existing: bool, hparams, no_alignments: bool, skip_existing: bool, hparams, no_alignments: bool,
dataset: str, emotion_extract = False, encoder_model_fpath=None): dataset: str, emotion_extract = False, encoder_model_fpath=None):
@ -99,7 +102,7 @@ def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
print("Max mel frames length: %d" % max(int(m[4]) for m in metadata)) print("Max mel frames length: %d" % max(int(m[4]) for m in metadata))
print("Max audio timesteps length: %d" % max(int(m[3]) for m in metadata)) print("Max audio timesteps length: %d" % max(int(m[3]) for m in metadata))
def embed_utterance(fpaths, encoder_model_fpath): def _embed_utterance(fpaths: str, encoder_model_fpath: str):
if not encoder.is_loaded(): if not encoder.is_loaded():
encoder.load_model(encoder_model_fpath) encoder.load_model(encoder_model_fpath)
@ -110,15 +113,13 @@ def embed_utterance(fpaths, encoder_model_fpath):
embed = encoder.embed_utterance(wav) embed = encoder.embed_utterance(wav)
np.save(embed_fpath, embed, allow_pickle=False) np.save(embed_fpath, embed, allow_pickle=False)
def _emo_extract_from_utterance(fpaths, hparams, skip_existing=False): def _emo_extract_from_utterance(fpaths, hparams):
if skip_existing and fpaths.exists():
return
wav_fpath, emo_fpath = fpaths wav_fpath, emo_fpath = fpaths
wav = np.load(wav_fpath) wav = np.load(wav_fpath)
emo = extract_emo(np.expand_dims(wav, 0), hparams.sample_rate, True) emo = extract_emo(np.expand_dims(wav, 0), hparams.sample_rate, True)
np.save(emo_fpath, emo.squeeze(0), allow_pickle=False) np.save(emo_fpath, emo.squeeze(0), allow_pickle=False)
def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_processes: int): def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_processes: int, skip_existing: bool):
wav_dir = synthesizer_root.joinpath("audio") wav_dir = synthesizer_root.joinpath("audio")
metadata_fpath = synthesizer_root.joinpath("train.txt") metadata_fpath = synthesizer_root.joinpath("train.txt")
assert wav_dir.exists() and metadata_fpath.exists() assert wav_dir.exists() and metadata_fpath.exists()
@ -128,11 +129,11 @@ def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_proce
# Gather the input wave filepath and the target output embed filepath # Gather the input wave filepath and the target output embed filepath
with metadata_fpath.open("r", encoding="utf-8") as metadata_file: with metadata_fpath.open("r", encoding="utf-8") as metadata_file:
metadata = [line.split("|") for line in metadata_file] metadata = [line.split("|") for line in metadata_file]
fpaths = [(wav_dir.joinpath(m[0]), embed_dir.joinpath(m[2])) for m in metadata] fpaths = [(wav_dir.joinpath(m[0]), embed_dir.joinpath(m[2])) for m in metadata if not should_skip(embed_dir.joinpath(m[2]), skip_existing)]
# TODO: improve on the multiprocessing, it's terrible. Disk I/O is the bottleneck here. # TODO: improve on the multiprocessing, it's terrible. Disk I/O is the bottleneck here.
# Embed the utterances in separate threads # Embed the utterances in separate threads
func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath) func = partial(_embed_utterance, encoder_model_fpath=encoder_model_fpath)
job = Pool(n_processes).imap(func, fpaths) job = Pool(n_processes).imap(func, fpaths)
tuple(tqdm(job, "Embedding", len(fpaths), unit="utterances")) tuple(tqdm(job, "Embedding", len(fpaths), unit="utterances"))
@ -142,14 +143,14 @@ def create_emo(synthesizer_root: Path, n_processes: int, skip_existing: bool, hp
assert wav_dir.exists() and metadata_fpath.exists() assert wav_dir.exists() and metadata_fpath.exists()
emo_dir = synthesizer_root.joinpath("emo") emo_dir = synthesizer_root.joinpath("emo")
emo_dir.mkdir(exist_ok=True) emo_dir.mkdir(exist_ok=True)
# Gather the input wave filepath and the target output embed filepath # Gather the input wave filepath and the target output embed filepath
with metadata_fpath.open("r", encoding="utf-8") as metadata_file: with metadata_fpath.open("r", encoding="utf-8") as metadata_file:
metadata = [line.split("|") for line in metadata_file] metadata = [line.split("|") for line in metadata_file]
fpaths = [(wav_dir.joinpath(m[0]), emo_dir.joinpath(m[0].replace("audio-", "emo-"))) for m in metadata] fpaths = [(wav_dir.joinpath(m[0]), emo_dir.joinpath(m[0].replace("audio-", "emo-"))) for m in metadata if not should_skip(emo_dir.joinpath(m[0].replace("audio-", "emo-")), skip_existing)]
# TODO: improve on the multiprocessing, it's terrible. Disk I/O is the bottleneck here. # TODO: improve on the multiprocessing, it's terrible. Disk I/O is the bottleneck here.
# Embed the utterances in separate threads # Embed the utterances in separate threads
func = partial(_emo_extract_from_utterance, hparams=hparams, skip_existing=skip_existing) func = partial(_emo_extract_from_utterance, hparams=hparams)
job = Pool(n_processes).imap(func, fpaths) job = Pool(n_processes).imap(func, fpaths)
tuple(tqdm(job, "Emo", len(fpaths), unit="utterances")) tuple(tqdm(job, "Emo", len(fpaths), unit="utterances"))

View File

@ -45,7 +45,7 @@ def extract_emo(
return y return y
def _process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str, def _process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
skip_existing: bool, hparams, encoder_model_fpath): mel_fpath: str, wav_fpath: str, hparams, encoder_model_fpath):
## FOR REFERENCE: ## FOR REFERENCE:
# For you not to lose your head if you ever wish to change things here or implement your own # For you not to lose your head if you ever wish to change things here or implement your own
# synthesizer. # synthesizer.
@ -58,13 +58,6 @@ def _process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
# without extra padding. This means that you won't have an exact relation between the length # without extra padding. This means that you won't have an exact relation between the length
# of the wav and of the mel spectrogram. See the vocoder data loader. # of the wav and of the mel spectrogram. See the vocoder data loader.
# Skip existing utterances if needed
mel_fpath = out_dir.joinpath("mels", "mel-%s.npy" % basename)
wav_fpath = out_dir.joinpath("audio", "audio-%s.npy" % basename)
if skip_existing and mel_fpath.exists() and wav_fpath.exists():
return None
# Trim silence # Trim silence
if hparams.trim_silence: if hparams.trim_silence:
if not encoder.is_loaded(): if not encoder.is_loaded():
@ -112,50 +105,28 @@ def _split_on_silences(wav_fpath, words, hparams):
def preprocess_general(speaker_dir, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool, encoder_model_fpath: Path): def preprocess_general(speaker_dir, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool, encoder_model_fpath: Path):
metadata = [] metadata = []
extensions = ("*.wav", "*.flac", "*.mp3") extensions = ("*.wav", "*.flac", "*.mp3")
if skip_existing: for extension in extensions:
for extension in extensions: wav_fpath_list = speaker_dir.glob(extension)
wav_fpath_list = speaker_dir.glob(extension) # Iterate over each wav
# Iterate over each wav for wav_fpath in wav_fpath_list:
for wav_fpath in wav_fpath_list: words = dict_info.get(wav_fpath.name.split(".")[0])
words = dict_info.get(wav_fpath.name.split(".")[0]) if not words:
words = dict_info.get(wav_fpath.name) # try with extension
if not words: if not words:
words = dict_info.get(wav_fpath.name) # try with extension print(f"No word found in dict_info for {wav_fpath.name}, skip it")
if not words:
print("no wordS")
continue
sub_basename = "%s_%02d" % (wav_fpath.name, 0)
mel_fpath = out_dir.joinpath("mels", f"mel-{sub_basename}.npy")
wav_fpath_ = out_dir.joinpath("audio", f"audio-{sub_basename}.npy")
if mel_fpath.exists() and wav_fpath_.exists():
continue continue
sub_basename = "%s_%02d" % (wav_fpath.name, 0)
mel_fpath = out_dir.joinpath("mels", f"mel-{sub_basename}.npy")
wav_fpath = out_dir.joinpath("audio", f"audio-{sub_basename}.npy")
if skip_existing and mel_fpath.exists() and wav_fpath.exists():
continue
wav, text = _split_on_silences(wav_fpath, words, hparams)
result = _process_utterance(wav, text, out_dir, sub_basename,
False, hparams, encoder_model_fpath) # accelarate
if result is None:
continue
wav_fpath_name, mel_fpath_name, embed_fpath_name, wav, mel_frames, text = result
metadata.append ((wav_fpath_name, mel_fpath_name, embed_fpath_name, len(wav), mel_frames, text))
wav, text = _split_on_silences(wav_fpath, words, hparams)
result = _process_utterance(wav, text, out_dir, sub_basename,
False, hparams, encoder_model_fpath) # accelarate
if result is None:
continue
wav_fpath_name, mel_fpath_name, embed_fpath_name, wav, mel_frames, text = result
metadata.append ((wav_fpath_name, mel_fpath_name, embed_fpath_name, len(wav), mel_frames, text))
else:
for extension in extensions:
wav_fpath_list = speaker_dir.glob(extension)
# Iterate over each wav
for wav_fpath in wav_fpath_list:
words = dict_info.get(wav_fpath.name.split(".")[0])
if not words:
words = dict_info.get(wav_fpath.name) # try with extension
if not words:
print("no wordS")
continue
sub_basename = "%s_%02d" % (wav_fpath.name, 0)
wav, text = _split_on_silences(wav_fpath, words, hparams)
result = _process_utterance(wav, text, out_dir, sub_basename,
False, hparams, encoder_model_fpath)
if result is None:
continue
wav_fpath_name, mel_fpath_name, embed_fpath_name, wav, mel_frames, text = result
metadata.append ((wav_fpath_name, mel_fpath_name, embed_fpath_name, len(wav), mel_frames, text))
return metadata return metadata

2
pre.py
View File

@ -71,7 +71,7 @@ if __name__ == "__main__":
del args.n_processes_embed del args.n_processes_embed
preprocess_dataset(**vars(args)) preprocess_dataset(**vars(args))
create_embeddings(synthesizer_root=args.out_dir, n_processes=n_processes_embed, encoder_model_fpath=encoder_model_fpath) create_embeddings(synthesizer_root=args.out_dir, n_processes=n_processes_embed, encoder_model_fpath=encoder_model_fpath, skip_existing=args.skip_existing)
if args.emotion_extract: if args.emotion_extract:
create_emo(synthesizer_root=args.out_dir, n_processes=n_processes_embed, skip_existing=args.skip_existing, hparams=args.hparams) create_emo(synthesizer_root=args.out_dir, n_processes=n_processes_embed, skip_existing=args.skip_existing, hparams=args.hparams)