Skip embedding

2024-03-22 13:11:31 +08:00 · 2023-08-29 19:26:55 +08:00 · 2023-08-29 19:26:55 +08:00 · af1426a610
commit af1426a610
parent e9ce943f6c
2 changed files with 27 additions and 54 deletions
--- a/models/synthesizer/preprocess.py
+++ b/models/synthesizer/preprocess.py
@ -99,7 +99,9 @@ def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
    print("Max mel frames length: %d" % max(int(m[4]) for m in metadata))
    print("Max audio timesteps length: %d" % max(int(m[3]) for m in metadata))

-def embed_utterance(fpaths, encoder_model_fpath):
+def embed_utterance(fpaths: str, encoder_model_fpath: str, skip_existing: bool):
+    if skip_existing and fpaths.exists():
+        return
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

@ -118,7 +120,7 @@ def _emo_extract_from_utterance(fpaths, hparams, skip_existing=False):
    emo = extract_emo(np.expand_dims(wav, 0), hparams.sample_rate, True)
    np.save(emo_fpath, emo.squeeze(0), allow_pickle=False)
 
-def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_processes: int):
+def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_processes: int, skip_existing: bool):
    wav_dir = synthesizer_root.joinpath("audio")
    metadata_fpath = synthesizer_root.joinpath("train.txt")
    assert wav_dir.exists() and metadata_fpath.exists()
@ -132,7 +134,7 @@ def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_proce
        
    # TODO: improve on the multiprocessing, it's terrible. Disk I/O is the bottleneck here.
    # Embed the utterances in separate threads
-    func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)
+    func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath, skip_existing=skip_existing)
    job = Pool(n_processes).imap(func, fpaths)
    tuple(tqdm(job, "Embedding", len(fpaths), unit="utterances"))

--- a/models/synthesizer/preprocess_audio.py
+++ b/models/synthesizer/preprocess_audio.py
@ -45,7 +45,7 @@ def extract_emo(
    return y

 def _process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str, 
-                      skip_existing: bool, hparams, encoder_model_fpath):
+                      mel_fpath: str, wav_fpath: str, hparams, encoder_model_fpath):
    ## FOR REFERENCE:
    # For you not to lose your head if you ever wish to change things here or implement your own
    # synthesizer.
@ -58,13 +58,6 @@ def _process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
    #   without extra padding. This means that you won't have an exact relation between the length
    #   of the wav and of the mel spectrogram. See the vocoder data loader.
        
-    # Skip existing utterances if needed
-    mel_fpath = out_dir.joinpath("mels", "mel-%s.npy" % basename)
-    wav_fpath = out_dir.joinpath("audio", "audio-%s.npy" % basename)
-    
-    if skip_existing and mel_fpath.exists() and wav_fpath.exists():
-        return None
-
    # Trim silence
    if hparams.trim_silence:
        if not encoder.is_loaded():
@ -112,50 +105,28 @@ def _split_on_silences(wav_fpath, words, hparams):
 def preprocess_general(speaker_dir, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool, encoder_model_fpath: Path):
    metadata = []
    extensions = ("*.wav", "*.flac", "*.mp3")
-    if skip_existing:
-        for extension in extensions:
-            wav_fpath_list = speaker_dir.glob(extension)
-            # Iterate over each wav
-            for wav_fpath in wav_fpath_list:
-                words = dict_info.get(wav_fpath.name.split(".")[0])
+    for extension in extensions:
+        wav_fpath_list = speaker_dir.glob(extension)
+        # Iterate over each wav
+        for wav_fpath in wav_fpath_list:
+            words = dict_info.get(wav_fpath.name.split(".")[0])
+            if not words:
+                words = dict_info.get(wav_fpath.name) # try with extension 
                if not words:
-                    words = dict_info.get(wav_fpath.name) # try with extension 
-                    if not words:
-                        print("no wordS")
-                        continue
-                sub_basename = "%s_%02d" % (wav_fpath.name, 0)
-                
-                mel_fpath = out_dir.joinpath("mels", f"mel-{sub_basename}.npy")
-                wav_fpath_ = out_dir.joinpath("audio", f"audio-{sub_basename}.npy")
-                
-                if mel_fpath.exists() and wav_fpath_.exists():
+                    print(f"No word found in dict_info for {wav_fpath.name}, skip it")
                    continue
+            sub_basename = "%s_%02d" % (wav_fpath.name, 0)
+            # Output paths get their own names: wav_fpath must keep pointing at the source audio file.
+            mel_fpath_out = out_dir.joinpath("mels", f"mel-{sub_basename}.npy")
+            wav_fpath_out = out_dir.joinpath("audio", f"audio-{sub_basename}.npy")
+            if skip_existing and mel_fpath_out.exists() and wav_fpath_out.exists():
+                continue
+            wav, text = _split_on_silences(wav_fpath, words, hparams)
+            result = _process_utterance(wav, text, out_dir, sub_basename, mel_fpath_out,
+                                        wav_fpath_out, hparams, encoder_model_fpath)
+            if result is None:
+                continue
+            wav_fpath_name, mel_fpath_name, embed_fpath_name, wav, mel_frames, text = result
+            metadata.append ((wav_fpath_name, mel_fpath_name, embed_fpath_name, len(wav), mel_frames, text))

-                wav, text = _split_on_silences(wav_fpath, words, hparams)
-                result = _process_utterance(wav, text, out_dir, sub_basename, 
-                                                    False, hparams, encoder_model_fpath) # accelarate
-                if result is None:
-                    continue
-                wav_fpath_name, mel_fpath_name, embed_fpath_name, wav, mel_frames, text = result
-                metadata.append ((wav_fpath_name, mel_fpath_name, embed_fpath_name, len(wav), mel_frames, text))
-    else:
-        for extension in extensions:
-            wav_fpath_list = speaker_dir.glob(extension)
-            # Iterate over each wav
-            for wav_fpath in wav_fpath_list:
-                words = dict_info.get(wav_fpath.name.split(".")[0])
-                if not words:
-                    words = dict_info.get(wav_fpath.name) # try with extension 
-                    if not words:
-                        print("no wordS")
-                        continue
-                sub_basename = "%s_%02d" % (wav_fpath.name, 0)
-
-                wav, text = _split_on_silences(wav_fpath, words, hparams)
-                result = _process_utterance(wav, text, out_dir, sub_basename, 
-                                                    False, hparams, encoder_model_fpath)
-                if result is None:
-                    continue
-                wav_fpath_name, mel_fpath_name, embed_fpath_name, wav, mel_frames, text = result
-                metadata.append ((wav_fpath_name, mel_fpath_name, embed_fpath_name, len(wav), mel_frames, text))
    return metadata