Mirror of https://github.com/babysor/MockingBird.git (synced 2024-03-22 13:11:31 +08:00)
Refactor preprocessor of synthesizer to prepare to support more datasets
parent c21d2c11dd
commit 856793c9bd
@@ -1,31 +1,27 @@
from multiprocessing.pool import Pool
from synthesizer import audio
from functools import partial
from itertools import chain
from encoder import inference as encoder
from pathlib import Path
from utils import logmmse
from tqdm import tqdm
import numpy as np
import librosa
import platform
from pypinyin import Style
from pypinyin.contrib.neutral_tone import NeutralToneWith5Mixin
from pypinyin.converter import DefaultConverter
from pypinyin.core import Pinyin

class PinyinConverter(NeutralToneWith5Mixin, DefaultConverter):
    pass

pinyin = Pinyin(PinyinConverter()).pinyin
from encoder import inference as encoder
from synthesizer.preprocess_speaker import preprocess_speaker_aidatatang_200zh

data_info = {
    "aidatatang_200zh": {
        "subfolders": ["corpus/train"],
        "speak_func": preprocess_speaker_aidatatang_200zh
    }
    # TODO add more
}

def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
                       skip_existing: bool, hparams, no_alignments: bool,
                       datasets_name: str, subfolders: str):
                       dataset: str):
    # Gather the input directories
    dataset_root = datasets_root.joinpath(datasets_name)
    input_dirs = [dataset_root.joinpath(subfolder.strip()) for subfolder in subfolders.split(",")]
    dataset_root = datasets_root.joinpath(dataset)
    input_dirs = [dataset_root.joinpath(subfolder.strip()) for subfolder in data_info[dataset]["subfolders"]]
    print("\n ".join(map(str, ["Using data from:"] + input_dirs)))
    assert all(input_dir.exists() for input_dir in input_dirs)

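Note on the hunk above: the new data_info registry is what makes additional corpora pluggable, since preprocess_dataset now reads both the subfolders to scan and the per-speaker preprocessing function from it. A minimal sketch of how another corpus could be registered (the "magicdata" entry and preprocess_speaker_magicdata are hypothetical, not part of this commit):

data_info = {
    "aidatatang_200zh": {
        "subfolders": ["corpus/train"],
        "speak_func": preprocess_speaker_aidatatang_200zh
    },
    # Hypothetical extra entry, assuming preprocess_speaker_magicdata exists in
    # synthesizer/preprocess_speaker.py and follows the same keyword interface.
    "magicdata": {
        "subfolders": ["train"],
        "speak_func": preprocess_speaker_magicdata
    },
}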
@@ -39,10 +35,10 @@ def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
    # Preprocess the dataset
    speaker_dirs = list(chain.from_iterable(input_dir.glob("*") for input_dir in input_dirs))
    func = partial(preprocess_speaker, out_dir=out_dir, skip_existing=skip_existing,
                   hparams=hparams, no_alignments=no_alignments)
    func = partial(data_info[dataset]["speak_func"], out_dir=out_dir, skip_existing=skip_existing,
                   hparams=hparams, directory=dataset_root, no_alignments=no_alignments)
    job = Pool(n_processes).imap(func, speaker_dirs)
    for speaker_metadata in tqdm(job, datasets_name, len(speaker_dirs), unit="speakers"):
    for speaker_metadata in tqdm(job, dataset, len(speaker_dirs), unit="speakers"):
        for metadatum in speaker_metadata:
            metadata_file.write("|".join(str(x) for x in metadatum) + "\n")
    metadata_file.close()

@@ -60,183 +56,6 @@ def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
    print("Max mel frames length: %d" % max(int(m[4]) for m in metadata))
    print("Max audio timesteps length: %d" % max(int(m[3]) for m in metadata))


def preprocess_speaker(speaker_dir, out_dir: Path, skip_existing: bool, hparams, no_alignments: bool):
    metadata = []
    for book_dir in speaker_dir.glob("*"):
        if no_alignments:
            # Gather the utterance audios and texts
            # LibriTTS uses .wav but we will include extensions for compatibility with other datasets
            extensions = ["*.wav", "*.flac", "*.mp3"]
            for extension in extensions:
                wav_fpaths = book_dir.glob(extension)

                for wav_fpath in wav_fpaths:
                    # Load the audio waveform
                    wav, _ = librosa.load(str(wav_fpath), hparams.sample_rate)
                    if hparams.rescale:
                        wav = wav / np.abs(wav).max() * hparams.rescaling_max

                    # Get the corresponding text
                    # Check for .txt (for compatibility with other datasets)
                    text_fpath = wav_fpath.with_suffix(".txt")
                    if not text_fpath.exists():
                        # Check for .normalized.txt (LibriTTS)
                        text_fpath = wav_fpath.with_suffix(".normalized.txt")
                        assert text_fpath.exists()
                    with text_fpath.open("r") as text_file:
                        text = "".join([line for line in text_file])
                        text = text.replace("\"", "")
                        text = text.strip()

                    # Process the utterance
                    metadata.append(process_utterance(wav, text, out_dir, str(wav_fpath.with_suffix("").name),
                                                      skip_existing, hparams))
        else:
            # Process alignment file (LibriSpeech support)
            # Gather the utterance audios and texts
            try:
                alignments_fpath = next(book_dir.glob("*.alignment.txt"))
                with alignments_fpath.open("r") as alignments_file:
                    alignments = [line.rstrip().split(" ") for line in alignments_file]
            except StopIteration:
                # A few alignment files will be missing
                continue

            # Iterate over each entry in the alignments file
            for wav_fname, words, end_times in alignments:
                wav_fpath = book_dir.joinpath(wav_fname + ".flac")
                assert wav_fpath.exists()
                words = words.replace("\"", "").split(",")
                end_times = list(map(float, end_times.replace("\"", "").split(",")))

                # Process each sub-utterance
                wavs, texts = split_on_silences(wav_fpath, words, end_times, hparams)
                for i, (wav, text) in enumerate(zip(wavs, texts)):
                    sub_basename = "%s_%02d" % (wav_fname, i)
                    metadata.append(process_utterance(wav, text, out_dir, sub_basename,
                                                      skip_existing, hparams))

    return [m for m in metadata if m is not None]


def split_on_silences(wav_fpath, words, end_times, hparams):
    # Load the audio waveform
    wav, _ = librosa.load(str(wav_fpath), hparams.sample_rate)
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    words = np.array(words)
    start_times = np.array([0.0] + end_times[:-1])
    end_times = np.array(end_times)
    assert len(words) == len(end_times) == len(start_times)
    assert words[0] == "" and words[-1] == ""

    # Find pauses that are too long
    mask = (words == "") & (end_times - start_times >= hparams.silence_min_duration_split)
    mask[0] = mask[-1] = True
    breaks = np.where(mask)[0]

    # Profile the noise from the silences and perform noise reduction on the waveform
    silence_times = [[start_times[i], end_times[i]] for i in breaks]
    silence_times = (np.array(silence_times) * hparams.sample_rate).astype(np.int)
    noisy_wav = np.concatenate([wav[stime[0]:stime[1]] for stime in silence_times])
    if len(noisy_wav) > hparams.sample_rate * 0.02:
        profile = logmmse.profile_noise(noisy_wav, hparams.sample_rate)
        wav = logmmse.denoise(wav, profile, eta=0)

    # Re-attach segments that are too short
    segments = list(zip(breaks[:-1], breaks[1:]))
    segment_durations = [start_times[end] - end_times[start] for start, end in segments]
    i = 0
    while i < len(segments) and len(segments) > 1:
        if segment_durations[i] < hparams.utterance_min_duration:
            # See if the segment can be re-attached with the right or the left segment
            left_duration = float("inf") if i == 0 else segment_durations[i - 1]
            right_duration = float("inf") if i == len(segments) - 1 else segment_durations[i + 1]
            joined_duration = segment_durations[i] + min(left_duration, right_duration)

            # Do not re-attach if it causes the joined utterance to be too long
            if joined_duration > hparams.hop_size * hparams.max_mel_frames / hparams.sample_rate:
                i += 1
                continue

            # Re-attach the segment with the neighbour of shortest duration
            j = i - 1 if left_duration <= right_duration else i
            segments[j] = (segments[j][0], segments[j + 1][1])
            segment_durations[j] = joined_duration
            del segments[j + 1], segment_durations[j + 1]
        else:
            i += 1

    # Split the utterance
    segment_times = [[end_times[start], start_times[end]] for start, end in segments]
    segment_times = (np.array(segment_times) * hparams.sample_rate).astype(np.int)
    wavs = [wav[segment_time[0]:segment_time[1]] for segment_time in segment_times]
    texts = [" ".join(words[start + 1:end]).replace("  ", " ") for start, end in segments]

    # # DEBUG: play the audio segments (run with -n=1)
    # import sounddevice as sd
    # if len(wavs) > 1:
    #     print("This sentence was split in %d segments:" % len(wavs))
    # else:
    #     print("There are no silences long enough for this sentence to be split:")
    # for wav, text in zip(wavs, texts):
    #     # Pad the waveform with 1 second of silence because sounddevice tends to cut them early
    #     # when playing them. You shouldn't need to do that in your parsers.
    #     wav = np.concatenate((wav, [0] * 16000))
    #     print("\t%s" % text)
    #     sd.play(wav, 16000, blocking=True)
    # print("")

    return wavs, texts


def process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
                      skip_existing: bool, hparams):
    ## FOR REFERENCE:
    # For you not to lose your head if you ever wish to change things here or implement your own
    # synthesizer.
    # - Both the audios and the mel spectrograms are saved as numpy arrays
    # - There is no processing done to the audios that will be saved to disk beyond volume
    #   normalization (in split_on_silences)
    # - However, pre-emphasis is applied to the audios before computing the mel spectrogram. This
    #   is why we re-apply it on the audio on the side of the vocoder.
    # - Librosa pads the waveform before computing the mel spectrogram. Here, the waveform is saved
    #   without extra padding. This means that you won't have an exact relation between the length
    #   of the wav and of the mel spectrogram. See the vocoder data loader.

    # Skip existing utterances if needed
    mel_fpath = out_dir.joinpath("mels", "mel-%s.npy" % basename)
    wav_fpath = out_dir.joinpath("audio", "audio-%s.npy" % basename)
    if skip_existing and mel_fpath.exists() and wav_fpath.exists():
        return None

    # Trim silence
    if hparams.trim_silence:
        wav = encoder.preprocess_wav(wav, normalize=False, trim_silence=True)

    # Skip utterances that are too short
    if len(wav) < hparams.utterance_min_duration * hparams.sample_rate:
        return None

    # Compute the mel spectrogram
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    # Skip utterances that are too long
    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    # Write the spectrogram, embed and audio to disk
    np.save(mel_fpath, mel_spectrogram.T, allow_pickle=False)
    np.save(wav_fpath, wav, allow_pickle=False)

    # Return a tuple describing this training example
    return wav_fpath.name, mel_fpath.name, "embed-%s.npy" % basename, len(wav), mel_frames, text


def embed_utterance(fpaths, encoder_model_fpath):
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

@@ -266,93 +85,3 @@ def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_proce
    func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)
    job = Pool(n_processes).imap(func, fpaths)
    list(tqdm(job, "Embedding", len(fpaths), unit="utterances"))

# aidatatang_200zh
def preprocess_aidatatang_200zh(datasets_root: Path, out_dir: Path, n_processes: int,
                                skip_existing: bool, hparams, no_alignments: bool, datasets_name=None, subfolders=None):
    # Gather the input directories
    dataset_root = datasets_root.joinpath("aidatatang_200zh")

    dict_info = {}
    transcript_dirs = dataset_root.joinpath("transcript/aidatatang_200_zh_transcript.txt")
    with open(transcript_dirs, "rb") as fp:
        dict_transcript = [v.decode() for v in fp]

    for v in dict_transcript:
        if not v:
            continue
        v = v.strip().replace("\n", "").split(" ")
        dict_info[v[0]] = " ".join(v[1:])

    input_dirs = [dataset_root.joinpath("corpus/train")]
    print("\n ".join(map(str, ["Using data from:"] + input_dirs)))
    assert all(input_dir.exists() for input_dir in input_dirs)

    # Create the output directories for each output file type
    out_dir.joinpath("mels").mkdir(exist_ok=True)
    out_dir.joinpath("audio").mkdir(exist_ok=True)

    # Create a metadata file
    metadata_fpath = out_dir.joinpath("train.txt")
    metadata_file = metadata_fpath.open("a" if skip_existing else "w", encoding="utf-8")

    # Preprocess the dataset
    speaker_dirs = list(chain.from_iterable(input_dir.glob("*") for input_dir in input_dirs))
    func = partial(preprocess_speaker_aidatatang_200zh, out_dir=out_dir, skip_existing=skip_existing,
                   hparams=hparams, dict_info=dict_info, no_alignments=no_alignments)
    job = Pool(n_processes).imap(func, speaker_dirs)
    for speaker_metadata in tqdm(job, "aidatatang_200zh", len(speaker_dirs), unit="speakers"):
        for metadatum in speaker_metadata:
            metadata_file.write("|".join(str(x) for x in metadatum) + "\n")
    metadata_file.close()

    # Verify the contents of the metadata file
    with metadata_fpath.open("r", encoding="utf-8") as metadata_file:
        metadata = [line.split("|") for line in metadata_file]
        mel_frames = sum([int(m[4]) for m in metadata])
        timesteps = sum([int(m[3]) for m in metadata])
        sample_rate = hparams.sample_rate
        hours = (timesteps / sample_rate) / 3600
        print("The dataset consists of %d utterances, %d mel frames, %d audio timesteps (%.2f hours)." %
              (len(metadata), mel_frames, timesteps, hours))
    print("Max input length (text chars): %d" % max(len(m[5]) for m in metadata))
    print("Max mel frames length: %d" % max(int(m[4]) for m in metadata))
    print("Max audio timesteps length: %d" % max(int(m[3]) for m in metadata))

def preprocess_speaker_aidatatang_200zh(speaker_dir, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool):
    metadata = []
    if platform.system() == "Windows":
        split = "\\"
    else:
        split = "/"
    # for book_dir in speaker_dir.glob("*"):
    # Gather the utterance audios and texts

    for wav_fpath in speaker_dir.glob("*.wav"):
        # D:\dataset\data_aishell\wav\train\S0002\BAC009S0002W0122.wav

        # Process each sub-utterance

        name = str(wav_fpath).split(split)[-1]
        key = name.split(".")[0]
        words = dict_info.get(key)
        if not words:
            continue
        sub_basename = "%s_%02d" % (name, 0)
        wav, text = split_on_silences_aidatatang_200zh(wav_fpath, words, hparams)
        metadata.append(process_utterance(wav, text, out_dir, sub_basename,
                                          skip_existing, hparams))

    return [m for m in metadata if m is not None]

def split_on_silences_aidatatang_200zh(wav_fpath, words, hparams):
    # Load the audio waveform
    wav, _ = librosa.load(wav_fpath, hparams.sample_rate)
    wav = librosa.effects.trim(wav, top_db=40, frame_length=2048, hop_length=512)[0]
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max
    resp = pinyin(words, style=Style.TONE3)
    res = [v[0] for v in resp if v[0].strip()]
    res = " ".join(res)

    return wav, res

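Note on the two hunks above: the dataset-specific code is removed from this file, and preprocess_dataset now drives whatever function data_info[dataset]["speak_func"] names through Pool.imap, with the keyword arguments bound in the partial() call (out_dir, skip_existing, hparams, directory, no_alignments). A hedged stub of what a speaker function for a new dataset would have to look like, assuming it lives next to the helpers in the new synthesizer/preprocess_speaker.py shown below; lookup_transcript is a hypothetical placeholder for the dataset's own transcript lookup:

def preprocess_speaker_my_dataset(speaker_dir, out_dir: Path, skip_existing: bool, hparams,
                                  directory, no_alignments: bool):
    # speaker_dir is one folder yielded by input_dir.glob("*") in preprocess_dataset;
    # directory is the dataset root (passed as directory=dataset_root), handy for shared transcripts.
    metadata = []
    for wav_fpath in speaker_dir.glob("*.wav"):
        wav, _ = librosa.load(str(wav_fpath), hparams.sample_rate)
        text = lookup_transcript(wav_fpath)  # hypothetical helper, not part of this commit
        metadata.append(_process_utterance(wav, text, out_dir, wav_fpath.stem, skip_existing, hparams))
    # preprocess_dataset writes each returned tuple as one line of train.txt
    return [m for m in metadata if m is not None]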
synthesizer/preprocess_speaker.py (new file, 234 lines)
@@ -0,0 +1,234 @@
import platform
import librosa
import numpy as np

from encoder import inference as encoder
from utils import logmmse
from synthesizer import audio
from pathlib import Path
from pypinyin import Style
from pypinyin.contrib.neutral_tone import NeutralToneWith5Mixin
from pypinyin.converter import DefaultConverter
from pypinyin.core import Pinyin

class PinyinConverter(NeutralToneWith5Mixin, DefaultConverter):
    pass

pinyin = Pinyin(PinyinConverter()).pinyin


def _process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
                       skip_existing: bool, hparams):
    ## FOR REFERENCE:
    # For you not to lose your head if you ever wish to change things here or implement your own
    # synthesizer.
    # - Both the audios and the mel spectrograms are saved as numpy arrays
    # - There is no processing done to the audios that will be saved to disk beyond volume
    #   normalization (in split_on_silences)
    # - However, pre-emphasis is applied to the audios before computing the mel spectrogram. This
    #   is why we re-apply it on the audio on the side of the vocoder.
    # - Librosa pads the waveform before computing the mel spectrogram. Here, the waveform is saved
    #   without extra padding. This means that you won't have an exact relation between the length
    #   of the wav and of the mel spectrogram. See the vocoder data loader.

    # Skip existing utterances if needed
    mel_fpath = out_dir.joinpath("mels", "mel-%s.npy" % basename)
    wav_fpath = out_dir.joinpath("audio", "audio-%s.npy" % basename)
    if skip_existing and mel_fpath.exists() and wav_fpath.exists():
        return None

    # Trim silence
    if hparams.trim_silence:
        wav = encoder.preprocess_wav(wav, normalize=False, trim_silence=True)

    # Skip utterances that are too short
    if len(wav) < hparams.utterance_min_duration * hparams.sample_rate:
        return None

    # Compute the mel spectrogram
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    # Skip utterances that are too long
    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    # Write the spectrogram, embed and audio to disk
    np.save(mel_fpath, mel_spectrogram.T, allow_pickle=False)
    np.save(wav_fpath, wav, allow_pickle=False)

    # Return a tuple describing this training example
    return wav_fpath.name, mel_fpath.name, "embed-%s.npy" % basename, len(wav), mel_frames, text


def _split_on_silences_aidatatang_200zh(wav_fpath, words, hparams):
    # Load the audio waveform
    wav, _ = librosa.load(wav_fpath, hparams.sample_rate)
    wav = librosa.effects.trim(wav, top_db=40, frame_length=2048, hop_length=512)[0]
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max
    resp = pinyin(words, style=Style.TONE3)
    res = [v[0] for v in resp if v[0].strip()]
    res = " ".join(res)

    return wav, res


def preprocess_speaker_aidatatang_200zh(speaker_dir, out_dir: Path, skip_existing: bool, hparams, directory, no_alignments: bool):
    dict_info = {}
    transcript_dirs = directory.joinpath("transcript/aidatatang_200_zh_transcript.txt")
    with open(transcript_dirs, "rb") as fp:
        dict_transcript = [v.decode() for v in fp]
    for v in dict_transcript:
        if not v:
            continue
        v = v.strip().replace("\n", "").split(" ")
        dict_info[v[0]] = " ".join(v[1:])

    metadata = []
    if platform.system() == "Windows":
        split = "\\"
    else:
        split = "/"
    for wav_fpath in speaker_dir.glob("*.wav"):
        name = str(wav_fpath).split(split)[-1]
        key = name.split(".")[0]
        words = dict_info.get(key)
        if not words:
            continue
        sub_basename = "%s_%02d" % (name, 0)
        wav, text = _split_on_silences_aidatatang_200zh(wav_fpath, words, hparams)
        metadata.append(_process_utterance(wav, text, out_dir, sub_basename,
                                           skip_existing, hparams))
    return [m for m in metadata if m is not None]

def preprocess_speaker(speaker_dir, out_dir: Path, skip_existing: bool, hparams, no_alignments: bool):
    metadata = []
    for book_dir in speaker_dir.glob("*"):
        if no_alignments:
            # Gather the utterance audios and texts
            # LibriTTS uses .wav but we will include extensions for compatibility with other datasets
            extensions = ["*.wav", "*.flac", "*.mp3"]
            for extension in extensions:
                wav_fpaths = book_dir.glob(extension)

                for wav_fpath in wav_fpaths:
                    # Load the audio waveform
                    wav, _ = librosa.load(str(wav_fpath), hparams.sample_rate)
                    if hparams.rescale:
                        wav = wav / np.abs(wav).max() * hparams.rescaling_max

                    # Get the corresponding text
                    # Check for .txt (for compatibility with other datasets)
                    text_fpath = wav_fpath.with_suffix(".txt")
                    if not text_fpath.exists():
                        # Check for .normalized.txt (LibriTTS)
                        text_fpath = wav_fpath.with_suffix(".normalized.txt")
                        assert text_fpath.exists()
                    with text_fpath.open("r") as text_file:
                        text = "".join([line for line in text_file])
                        text = text.replace("\"", "")
                        text = text.strip()

                    # Process the utterance
                    metadata.append(_process_utterance(wav, text, out_dir, str(wav_fpath.with_suffix("").name),
                                                       skip_existing, hparams))
        else:
            # Process alignment file (LibriSpeech support)
            # Gather the utterance audios and texts
            try:
                alignments_fpath = next(book_dir.glob("*.alignment.txt"))
                with alignments_fpath.open("r") as alignments_file:
                    alignments = [line.rstrip().split(" ") for line in alignments_file]
            except StopIteration:
                # A few alignment files will be missing
                continue

            # Iterate over each entry in the alignments file
            for wav_fname, words, end_times in alignments:
                wav_fpath = book_dir.joinpath(wav_fname + ".flac")
                assert wav_fpath.exists()
                words = words.replace("\"", "").split(",")
                end_times = list(map(float, end_times.replace("\"", "").split(",")))

                # Process each sub-utterance
                wavs, texts = _split_on_silences(wav_fpath, words, end_times, hparams)
                for i, (wav, text) in enumerate(zip(wavs, texts)):
                    sub_basename = "%s_%02d" % (wav_fname, i)
                    metadata.append(_process_utterance(wav, text, out_dir, sub_basename,
                                                       skip_existing, hparams))

    return [m for m in metadata if m is not None]

# TODO: use original split func
def _split_on_silences(wav_fpath, words, end_times, hparams):
    # Load the audio waveform
    wav, _ = librosa.load(str(wav_fpath), hparams.sample_rate)
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    words = np.array(words)
    start_times = np.array([0.0] + end_times[:-1])
    end_times = np.array(end_times)
    assert len(words) == len(end_times) == len(start_times)
    assert words[0] == "" and words[-1] == ""

    # Find pauses that are too long
    mask = (words == "") & (end_times - start_times >= hparams.silence_min_duration_split)
    mask[0] = mask[-1] = True
    breaks = np.where(mask)[0]

    # Profile the noise from the silences and perform noise reduction on the waveform
    silence_times = [[start_times[i], end_times[i]] for i in breaks]
    silence_times = (np.array(silence_times) * hparams.sample_rate).astype(np.int)
    noisy_wav = np.concatenate([wav[stime[0]:stime[1]] for stime in silence_times])
    if len(noisy_wav) > hparams.sample_rate * 0.02:
        profile = logmmse.profile_noise(noisy_wav, hparams.sample_rate)
        wav = logmmse.denoise(wav, profile, eta=0)

    # Re-attach segments that are too short
    segments = list(zip(breaks[:-1], breaks[1:]))
    segment_durations = [start_times[end] - end_times[start] for start, end in segments]
    i = 0
    while i < len(segments) and len(segments) > 1:
        if segment_durations[i] < hparams.utterance_min_duration:
            # See if the segment can be re-attached with the right or the left segment
            left_duration = float("inf") if i == 0 else segment_durations[i - 1]
            right_duration = float("inf") if i == len(segments) - 1 else segment_durations[i + 1]
            joined_duration = segment_durations[i] + min(left_duration, right_duration)

            # Do not re-attach if it causes the joined utterance to be too long
            if joined_duration > hparams.hop_size * hparams.max_mel_frames / hparams.sample_rate:
                i += 1
                continue

            # Re-attach the segment with the neighbour of shortest duration
            j = i - 1 if left_duration <= right_duration else i
            segments[j] = (segments[j][0], segments[j + 1][1])
            segment_durations[j] = joined_duration
            del segments[j + 1], segment_durations[j + 1]
        else:
            i += 1

    # Split the utterance
    segment_times = [[end_times[start], start_times[end]] for start, end in segments]
    segment_times = (np.array(segment_times) * hparams.sample_rate).astype(np.int)
    wavs = [wav[segment_time[0]:segment_time[1]] for segment_time in segment_times]
    texts = [" ".join(words[start + 1:end]).replace("  ", " ") for start, end in segments]

    # # DEBUG: play the audio segments (run with -n=1)
    # import sounddevice as sd
    # if len(wavs) > 1:
    #     print("This sentence was split in %d segments:" % len(wavs))
    # else:
    #     print("There are no silences long enough for this sentence to be split:")
    # for wav, text in zip(wavs, texts):
    #     # Pad the waveform with 1 second of silence because sounddevice tends to cut them early
    #     # when playing them. You shouldn't need to do that in your parsers.
    #     wav = np.concatenate((wav, [0] * 16000))
    #     print("\t%s" % text)
    #     sd.play(wav, 16000, blocking=True)
    # print("")

    return wavs, texts

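Note on the file above: for aidatatang_200zh the stored text is just the Mandarin transcript converted to tone-numbered pinyin via Style.TONE3, with NeutralToneWith5Mixin making neutral tones explicit as a trailing 5. A rough illustration of the expected behaviour (output shown as documented for pypinyin, not captured from a run of this code):

resp = pinyin("好了", style=Style.TONE3)              # expected roughly [['hao3'], ['le5']] with the mixin
res = " ".join(v[0] for v in resp if v[0].strip())    # -> "hao3 le5", the string returned as the utterance text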
@@ -1,10 +1,15 @@
from synthesizer.preprocess import preprocess_aidatatang_200zh
from synthesizer.preprocess import preprocess_dataset
from synthesizer.hparams import hparams
from utils.argutils import print_args
from pathlib import Path
import argparse


recognized_datasets = [
    "aidatatang_200zh",
    "SLR68",
]

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Preprocesses audio files from datasets, encodes them as mel spectrograms "

@@ -29,16 +34,14 @@ if __name__ == "__main__":
    parser.add_argument("--no_alignments", action="store_true", help=\
        "Use this option when dataset does not include alignments\
        (these are used to split long audio files into sub-utterances.)")
    parser.add_argument("--datasets_name", type=str, default="LibriSpeech", help=\
        "Name of the dataset directory to process.")
    parser.add_argument("--subfolders", type=str, default="train-clean-100, train-clean-360", help=\
        "Comma-separated list of subfolders to process inside your dataset directory")
    parser.add_argument("--dataset", type=str, default="aidatatang_200zh", help=\
        "Name of the dataset to process.")
    args = parser.parse_args()

    # Process the arguments
    if not hasattr(args, "out_dir"):
        args.out_dir = args.datasets_root.joinpath("SV2TTS", "synthesizer")

    assert args.dataset in recognized_datasets, "dataset not supported"
    # Create directories
    assert args.datasets_root.exists()
    args.out_dir.mkdir(exist_ok=True, parents=True)

@@ -56,5 +59,5 @@ if __name__ == "__main__":
    # Preprocess the dataset
    print_args(args, parser)
    args.hparams = hparams.parse(args.hparams)
    # preprocess_dataset(**vars(args))
    preprocess_aidatatang_200zh(**vars(args))

    preprocess_dataset(**vars(args))

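Note on the hunks above: the preprocessing entry script (not named in this view) now exposes a single --dataset flag, checks it against recognized_datasets, and forwards everything to the refactored preprocess_dataset. A minimal sketch of the equivalent direct call under assumed paths (the /data locations and n_processes value are illustrative):

from pathlib import Path
from synthesizer.hparams import hparams
from synthesizer.preprocess import preprocess_dataset

preprocess_dataset(datasets_root=Path("/data"),              # folder containing aidatatang_200zh/
                   out_dir=Path("/data/SV2TTS/synthesizer"),
                   n_processes=4,
                   skip_existing=True,
                   hparams=hparams,
                   no_alignments=False,
                   dataset="aidatatang_200zh")               # must be a key of data_info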