Compare commits
4 Commits
46d5f655b0 ... 5f85ea8010
| Author | SHA1 | Date |
| --- | --- | --- |
| Emma Thompson | 5f85ea8010 | |
| Vega | 156723e37c | |
| Limingrui0 | dd703be2cd | |
| Limingrui0 | b9ce50f77f | |
```diff
@@ -29,6 +29,7 @@
 > If installation via pip fails with `ERROR: Could not find a version that satisfies the requirement torch==1.9.0+cu102 (from versions: 0.1.2, 0.1.2.post1, 0.1.2.post2)`, the Python version is probably too old; installation succeeds on Python 3.9.
 
 * Install [ffmpeg](https://ffmpeg.org/download.html#get-packages).
 
 * Run `pip install -r requirements.txt` to install the remaining necessary packages.
 
+> The environment recommended here is `Repo Tag 0.0.1`, `PyTorch 1.9.0 with Torchvision 0.10.0 and cudatoolkit 10.2`, `requirements.txt`, and `webrtcvad-wheels`, because `requirements.txt` was exported several months ago and does not fit newer versions.
 
 * Install webrtcvad with `pip install webrtcvad-wheels`.
 
 or
```
```diff
@@ -29,6 +29,7 @@
 > If you get an `ERROR: Could not find a version that satisfies the requirement torch==1.9.0+cu102 (from versions: 0.1.2, 0.1.2.post1, 0.1.2.post2)`, the error is probably due to a low Python version; try 3.9 and it will install successfully.
 
 * Install [ffmpeg](https://ffmpeg.org/download.html#get-packages).
 
 * Run `pip install -r requirements.txt` to install the remaining necessary packages.
 
+> The recommended environment here is `Repo Tag 0.0.1`, `PyTorch 1.9.0 with Torchvision 0.10.0 and cudatoolkit 10.2`, `requirements.txt`, and `webrtcvad-wheels`, because `requirements.txt` was exported a few months ago and does not fit newer versions.
 
 * Install webrtcvad with `pip install webrtcvad-wheels` (if you need it).
 
 or
```
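The added note pins an old environment because the exported `requirements.txt` predates newer PyTorch releases. For CUDA 10.2 specifically, a pinned install is usually done against PyTorch's historical wheel index, e.g. `pip install torch==1.9.0+cu102 torchvision==0.10.0+cu102 -f https://download.pytorch.org/whl/torch_stable.html` (the index URL follows the standard PyTorch install pattern, not something this repo documents); the `Could not find a version` error above means no wheel on the index matches the running interpreter.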
```diff
@@ -39,6 +39,9 @@ data_info = {
     }
 }
 
+def should_skip(fpath: Path, skip_existing: bool) -> bool:
+    return skip_existing and fpath.exists()
+
 
 def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
                        skip_existing: bool, hparams, no_alignments: bool,
                        dataset: str, emotion_extract=False, encoder_model_fpath=None):
```
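The new `should_skip` helper moves the resume check out of the worker functions so that work lists can be filtered before any processes are spawned. A minimal sketch of how it composes with a list comprehension (the paths here are illustrative, not from the repo):

```python
from pathlib import Path

def should_skip(fpath: Path, skip_existing: bool) -> bool:
    # Skip only when resume mode is on and the output file already exists
    return skip_existing and fpath.exists()

# Filter the work list up front instead of testing inside each worker
outputs = [Path("embeds/embed-a.npy"), Path("embeds/embed-b.npy")]  # illustrative
todo = [p for p in outputs if not should_skip(p, skip_existing=True)]
```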
```diff
@@ -99,7 +102,7 @@ def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
     print("Max mel frames length: %d" % max(int(m[4]) for m in metadata))
     print("Max audio timesteps length: %d" % max(int(m[3]) for m in metadata))
 
-def embed_utterance(fpaths, encoder_model_fpath):
+def _embed_utterance(fpaths: str, encoder_model_fpath: str):
     if not encoder.is_loaded():
         encoder.load_model(encoder_model_fpath)
```
```diff
@@ -110,15 +113,13 @@ def embed_utterance(fpaths, encoder_model_fpath):
     embed = encoder.embed_utterance(wav)
     np.save(embed_fpath, embed, allow_pickle=False)
 
-def _emo_extract_from_utterance(fpaths, hparams, skip_existing=False):
-    if skip_existing and fpaths.exists():
-        return
+def _emo_extract_from_utterance(fpaths, hparams):
     wav_fpath, emo_fpath = fpaths
     wav = np.load(wav_fpath)
     emo = extract_emo(np.expand_dims(wav, 0), hparams.sample_rate, True)
     np.save(emo_fpath, emo.squeeze(0), allow_pickle=False)
 
-def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_processes: int):
+def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_processes: int, skip_existing: bool):
     wav_dir = synthesizer_root.joinpath("audio")
     metadata_fpath = synthesizer_root.joinpath("train.txt")
     assert wav_dir.exists() and metadata_fpath.exists()
```
```diff
@@ -128,11 +129,11 @@ def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_proce
     # Gather the input wave filepath and the target output embed filepath
     with metadata_fpath.open("r", encoding="utf-8") as metadata_file:
         metadata = [line.split("|") for line in metadata_file]
-        fpaths = [(wav_dir.joinpath(m[0]), embed_dir.joinpath(m[2])) for m in metadata]
+        fpaths = [(wav_dir.joinpath(m[0]), embed_dir.joinpath(m[2])) for m in metadata if not should_skip(embed_dir.joinpath(m[2]), skip_existing)]
 
     # TODO: improve on the multiprocessing, it's terrible. Disk I/O is the bottleneck here.
     # Embed the utterances in separate threads
-    func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)
+    func = partial(_embed_utterance, encoder_model_fpath=encoder_model_fpath)
     job = Pool(n_processes).imap(func, fpaths)
     tuple(tqdm(job, "Embedding", len(fpaths), unit="utterances"))
```
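The fan-out machinery is unchanged: `partial` binds the shared arguments, `Pool.imap` streams one tuple per utterance to the workers, and `tqdm` wraps the iterator for progress. The same pattern in isolation, with a stand-in task (all names illustrative):

```python
from functools import partial
from multiprocessing import Pool

from tqdm import tqdm

def work(item, scale):  # stand-in for _embed_utterance
    return item * scale

if __name__ == "__main__":
    items = [1, 2, 3, 4]
    func = partial(work, scale=10)      # bind the argument shared by every call
    job = Pool(2).imap(func, items)     # lazily maps func over the work list
    results = tuple(tqdm(job, "Embedding", len(items), unit="items"))
```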
```diff
@@ -142,14 +143,14 @@ def create_emo(synthesizer_root: Path, n_processes: int, skip_existing: bool, hp
     assert wav_dir.exists() and metadata_fpath.exists()
     emo_dir = synthesizer_root.joinpath("emo")
     emo_dir.mkdir(exist_ok=True)
 
     # Gather the input wave filepath and the target output embed filepath
     with metadata_fpath.open("r", encoding="utf-8") as metadata_file:
         metadata = [line.split("|") for line in metadata_file]
-        fpaths = [(wav_dir.joinpath(m[0]), emo_dir.joinpath(m[0].replace("audio-", "emo-"))) for m in metadata]
+        fpaths = [(wav_dir.joinpath(m[0]), emo_dir.joinpath(m[0].replace("audio-", "emo-"))) for m in metadata if not should_skip(emo_dir.joinpath(m[0].replace("audio-", "emo-")), skip_existing)]
 
     # TODO: improve on the multiprocessing, it's terrible. Disk I/O is the bottleneck here.
     # Embed the utterances in separate threads
-    func = partial(_emo_extract_from_utterance, hparams=hparams, skip_existing=skip_existing)
+    func = partial(_emo_extract_from_utterance, hparams=hparams)
     job = Pool(n_processes).imap(func, fpaths)
     tuple(tqdm(job, "Emo", len(fpaths), unit="utterances"))
```
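Here the skip filter has to re-derive the target path because the emotion file name is obtained from the audio entry by plain string substitution. In isolation (the metadata entry and root shown are illustrative, following the repo's `audio-<basename>.npy` naming):

```python
from pathlib import Path

emo_dir = Path("SV2TTS/synthesizer/emo")   # illustrative root
m0 = "audio-speaker1_00.npy"               # illustrative metadata entry
emo_fpath = emo_dir.joinpath(m0.replace("audio-", "emo-"))
print(emo_fpath)  # SV2TTS/synthesizer/emo/emo-speaker1_00.npy
```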
```diff
@@ -45,7 +45,7 @@ def extract_emo(
     return y
 
 def _process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
-                       skip_existing: bool, hparams, encoder_model_fpath):
+                       mel_fpath: str, wav_fpath: str, hparams, encoder_model_fpath):
     ## FOR REFERENCE:
     # For you not to lose your head if you ever wish to change things here or implement your own
     # synthesizer.
```
```diff
@@ -58,13 +58,6 @@ def _process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
     # without extra padding. This means that you won't have an exact relation between the length
     # of the wav and of the mel spectrogram. See the vocoder data loader.
 
-    # Skip existing utterances if needed
-    mel_fpath = out_dir.joinpath("mels", "mel-%s.npy" % basename)
-    wav_fpath = out_dir.joinpath("audio", "audio-%s.npy" % basename)
-
-    if skip_existing and mel_fpath.exists() and wav_fpath.exists():
-        return None
-
     # Trim silence
     if hparams.trim_silence:
         if not encoder.is_loaded():
```
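With this removal, `_process_utterance` no longer decides whether to skip; the caller derives `mel_fpath` and `wav_fpath` once and filters before dispatch, so finished utterances are never loaded at all. A compact sketch of the hoisted pattern under simplified signatures (not the repo's exact ones):

```python
from pathlib import Path

def process_utterance(basename: str, mel_fpath: Path, wav_fpath: Path) -> str:
    # stand-in for the heavy feature extraction
    return basename

def run(out_dir: Path, basenames, skip_existing: bool):
    results = []
    for basename in basenames:
        mel_fpath = out_dir.joinpath("mels", f"mel-{basename}.npy")
        wav_fpath = out_dir.joinpath("audio", f"audio-{basename}.npy")
        # Caller-side skip: the workers only ever see unfinished items
        if skip_existing and mel_fpath.exists() and wav_fpath.exists():
            continue
        results.append(process_utterance(basename, mel_fpath, wav_fpath))
    return results
```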
```diff
@@ -112,50 +105,28 @@ def _split_on_silences(wav_fpath, words, hparams):
 def preprocess_general(speaker_dir, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool, encoder_model_fpath: Path):
     metadata = []
     extensions = ("*.wav", "*.flac", "*.mp3")
-    if skip_existing:
-        for extension in extensions:
-            wav_fpath_list = speaker_dir.glob(extension)
-            # Iterate over each wav
-            for wav_fpath in wav_fpath_list:
-                words = dict_info.get(wav_fpath.name.split(".")[0])
-                if not words:
-                    words = dict_info.get(wav_fpath.name)  # try with extension
-                if not words:
-                    print("no wordS")
-                    continue
-                sub_basename = "%s_%02d" % (wav_fpath.name, 0)
-
-                mel_fpath = out_dir.joinpath("mels", f"mel-{sub_basename}.npy")
-                wav_fpath_ = out_dir.joinpath("audio", f"audio-{sub_basename}.npy")
-
-                if mel_fpath.exists() and wav_fpath_.exists():
-                    continue
-
-                wav, text = _split_on_silences(wav_fpath, words, hparams)
-                result = _process_utterance(wav, text, out_dir, sub_basename,
-                                            False, hparams, encoder_model_fpath)  # accelarate
-                if result is None:
-                    continue
-                wav_fpath_name, mel_fpath_name, embed_fpath_name, wav, mel_frames, text = result
-                metadata.append((wav_fpath_name, mel_fpath_name, embed_fpath_name, len(wav), mel_frames, text))
-    else:
-        for extension in extensions:
-            wav_fpath_list = speaker_dir.glob(extension)
-            # Iterate over each wav
-            for wav_fpath in wav_fpath_list:
-                words = dict_info.get(wav_fpath.name.split(".")[0])
-                if not words:
-                    words = dict_info.get(wav_fpath.name)  # try with extension
-                if not words:
-                    print("no wordS")
-                    continue
-                sub_basename = "%s_%02d" % (wav_fpath.name, 0)
-
-                wav, text = _split_on_silences(wav_fpath, words, hparams)
-                result = _process_utterance(wav, text, out_dir, sub_basename,
-                                            False, hparams, encoder_model_fpath)
-                if result is None:
-                    continue
-                wav_fpath_name, mel_fpath_name, embed_fpath_name, wav, mel_frames, text = result
-                metadata.append((wav_fpath_name, mel_fpath_name, embed_fpath_name, len(wav), mel_frames, text))
+    for extension in extensions:
+        wav_fpath_list = speaker_dir.glob(extension)
+        # Iterate over each wav
+        for wav_fpath in wav_fpath_list:
+            words = dict_info.get(wav_fpath.name.split(".")[0])
+            if not words:
+                words = dict_info.get(wav_fpath.name)  # try with extension
+            if not words:
+                print(f"No word found in dict_info for {wav_fpath.name}, skip it")
+                continue
+            sub_basename = "%s_%02d" % (wav_fpath.name, 0)
+            mel_fpath = out_dir.joinpath("mels", f"mel-{sub_basename}.npy")
+            wav_fpath = out_dir.joinpath("audio", f"audio-{sub_basename}.npy")
+
+            if skip_existing and mel_fpath.exists() and wav_fpath.exists():
+                continue
+            wav, text = _split_on_silences(wav_fpath, words, hparams)
+            result = _process_utterance(wav, text, out_dir, sub_basename,
+                                        False, hparams, encoder_model_fpath)  # accelarate
+            if result is None:
+                continue
+            wav_fpath_name, mel_fpath_name, embed_fpath_name, wav, mel_frames, text = result
+            metadata.append((wav_fpath_name, mel_fpath_name, embed_fpath_name, len(wav), mel_frames, text))
     return metadata
```
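As captured here, the rewritten loop reuses the loop variable `wav_fpath` for the output path before handing it to `_split_on_silences`, and the call still passes `False, hparams, encoder_model_fpath` even though the new `_process_utterance` signature above expects `mel_fpath` and `wav_fpath`. If those are not artifacts of this page's rendering, a call site consistent with the new signature would look like the following sketch (my reading, not part of the commit; `wav_fpath_` keeps the source path distinct from the output path):

```python
# Sketch of the body handling one wav inside preprocess_general's inner loop,
# assuming _process_utterance(wav, text, out_dir, basename,
#                             mel_fpath, wav_fpath, hparams, encoder_model_fpath)
def handle_one(wav_fpath, words, sub_basename, out_dir, skip_existing,
               hparams, encoder_model_fpath):
    mel_fpath = out_dir.joinpath("mels", f"mel-{sub_basename}.npy")
    wav_fpath_ = out_dir.joinpath("audio", f"audio-{sub_basename}.npy")
    if skip_existing and mel_fpath.exists() and wav_fpath_.exists():
        return None
    wav, text = _split_on_silences(wav_fpath, words, hparams)  # read the source wav
    return _process_utterance(wav, text, out_dir, sub_basename,
                              mel_fpath, wav_fpath_, hparams, encoder_model_fpath)
```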
pre.py
```diff
@@ -71,7 +71,7 @@ if __name__ == "__main__":
     del args.n_processes_embed
     preprocess_dataset(**vars(args))
 
-    create_embeddings(synthesizer_root=args.out_dir, n_processes=n_processes_embed, encoder_model_fpath=encoder_model_fpath)
+    create_embeddings(synthesizer_root=args.out_dir, n_processes=n_processes_embed, encoder_model_fpath=encoder_model_fpath, skip_existing=args.skip_existing)
 
     if args.emotion_extract:
         create_emo(synthesizer_root=args.out_dir, n_processes=n_processes_embed, skip_existing=args.skip_existing, hparams=args.hparams)
```
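`preprocess_dataset(**vars(args))` forwards every attribute still on `args` as a keyword argument, which is why `n_processes_embed` is saved to a local and `del`eted first: it belongs to `create_embeddings`, not `preprocess_dataset`. A minimal reproduction of the pattern (flag names are illustrative):

```python
import argparse

def preprocess_dataset(out_dir, skip_existing):
    print(out_dir, skip_existing)

parser = argparse.ArgumentParser()
parser.add_argument("--out_dir", default="SV2TTS/synthesizer")
parser.add_argument("--skip_existing", action="store_true")
parser.add_argument("--n_processes_embed", type=int, default=4)
args = parser.parse_args([])

n_processes_embed = args.n_processes_embed
del args.n_processes_embed        # without this, **vars(args) would raise TypeError:
preprocess_dataset(**vars(args))  # unexpected keyword argument 'n_processes_embed'
```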