From 9f1dbeeecc902b2719f77e93996bb74b6d654426 Mon Sep 17 00:00:00 2001
From: 0warning0error
Date: Fri, 2 Jun 2023 17:22:38 +0800
Subject: [PATCH] Some changes to make it easier to install the dependencies

---
 README-CN.md                           | 10 ++++
 README.md                              | 10 ++++
 control/cli/vocoder_train.py           |  2 +-
 control/toolbox/ui.py                  |  6 +--
 env.yml                                | Bin 0 -> 1088 bytes
 models/encoder/audio.py                |  4 +-
 models/encoder/visualizations.py       |  2 +-
 models/ppg_extractor/log_mel.py        |  3 +-
 models/synthesizer/audio.py            |  2 +-
 models/synthesizer/preprocess.py       |  8 +--
 models/synthesizer/preprocess_audio.py | 69 ++++++++++++++++++-------
 models/vocoder/wavernn/audio.py        |  2 +-
 requirements.txt                       | 21 ++++----
 13 files changed, 93 insertions(+), 46 deletions(-)
 create mode 100644 env.yml

diff --git a/README-CN.md b/README-CN.md
index 54b769f..316edec 100644
--- a/README-CN.md
+++ b/README-CN.md
@@ -44,6 +44,16 @@
 * 运行`pip install -r requirements.txt` 来安装剩余的必要包。
 * 安装 webrtcvad `pip install webrtcvad-wheels`。
 
+或者
+- 用 `conda` 或者 `mamba` 安装依赖
+
+  ```conda env create -n env_name -f env.yml```
+
+  ```mamba env create -n env_name -f env.yml```
+
+  以上命令会创建一个新环境并安装必需的依赖，之后执行 `conda activate env_name` 切换到该环境即可。
+  > env.yml 只包含运行时必需的依赖，暂时不包括 monotonic-align；如需安装 GPU 版本的 PyTorch，请参考官网教程。
+
 #### 1.2 M1芯片Mac环境配置(Inference Time)
 > 以下环境按x86-64搭建，使用原生的`demo_toolbox.py`，可作为在不改代码情况下快速使用的workaround。
 >
diff --git a/README.md b/README.md
index 29da7d3..5d4e6c6 100644
--- a/README.md
+++ b/README.md
@@ -39,6 +39,16 @@
 * Run `pip install -r requirements.txt` to install the remaining necessary packages.
 * Install webrtcvad `pip install webrtcvad-wheels`(If you need)
 
+or
+- install the dependencies with `conda` or `mamba`
+
+  ```conda env create -n env_name -f env.yml```
+
+  ```mamba env create -n env_name -f env.yml```
+
+  Either command creates a new environment with the necessary dependencies installed. Switch to it with `conda activate env_name` and you are ready to go.
+  > env.yml only covers the dependencies needed at runtime and does not yet include monotonic-align. See the official PyTorch website if you want the GPU build of PyTorch.
+
 #### 1.2 Setup with a M1 Mac
 > The following steps are a workaround to directly use the original `demo_toolbox.py`without the changing of codes.
 >
diff --git a/control/cli/vocoder_train.py b/control/cli/vocoder_train.py
index 07e93db..3f8b6a7 100644
--- a/control/cli/vocoder_train.py
+++ b/control/cli/vocoder_train.py
@@ -78,7 +78,7 @@ if __name__ == "__main__":
         else:
             train_hifigan(0, args, h)
     elif args.vocoder_type == "fregan":
-        with open('vocoder/fregan/config.json') as f:
+        with Path('vocoder/fregan/config.json').open() as f:
             json_config = json.load(f)
         h = AttrDict(json_config)
         if h.num_gpus > 1:
diff --git a/control/toolbox/ui.py b/control/toolbox/ui.py
index e60d514..985933e 100644
--- a/control/toolbox/ui.py
+++ b/control/toolbox/ui.py
@@ -33,7 +33,7 @@ colormap = np.array([
     [0, 0, 0],
     [183, 183, 183],
     [76, 255, 0],
-], dtype=np.float) / 255
+], dtype=float) / 255
 
 default_text = \
 "欢迎使用工具箱, 现已支持中文输入!"
@@ -402,8 +402,8 @@ class UI(QDialog):
         self.app.processEvents()
 
     def set_loading(self, value, maximum=1):
-        self.loading_bar.setValue(value * 100)
-        self.loading_bar.setMaximum(maximum * 100)
+        self.loading_bar.setValue(int(value * 100))
+        self.loading_bar.setMaximum(int(maximum * 100))
         self.loading_bar.setTextVisible(value != 0)
         self.app.processEvents()
 
diff --git a/env.yml b/env.yml
new file mode 100644
index 0000000000000000000000000000000000000000..6371a3c6c124a8d75b558da83ad4f4997d4ac6f6
GIT binary patch
literal 1088
zcmZ{j;Y!0m5QO));5+yNk&5_7&=(PrG)>xIdP$ROYhPad<|=eAD&calvokxpclY&K
z*~Ur>e6@8p*`r`2Y#AM_0u
zx?#Bs-KGO8%=)|3D&WX{w2w?VCYGi6$Hr1W%8#^nX00000

literal 0
HcmV?d00001

diff --git a/models/encoder/audio.py b/models/encoder/audio.py
index 7d2fe2d..d0aaa13 100644
--- a/models/encoder/audio.py
+++ b/models/encoder/audio.py
@@ -39,7 +39,7 @@ def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray],
 
     # Resample the wav if needed
     if source_sr is not None and source_sr != sampling_rate:
-        wav = librosa.resample(wav, source_sr, sampling_rate)
+        wav = librosa.resample(wav, orig_sr=source_sr, target_sr=sampling_rate)
 
     # Apply the preprocessing: normalize volume and shorten long silences
     if normalize:
@@ -99,7 +99,7 @@ def trim_long_silences(wav):
         return ret[width - 1:] / width
 
     audio_mask = moving_average(voice_flags, vad_moving_average_width)
-    audio_mask = np.round(audio_mask).astype(np.bool)
+    audio_mask = np.round(audio_mask).astype(bool)
 
     # Dilate the voiced regions
     audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
diff --git a/models/encoder/visualizations.py b/models/encoder/visualizations.py
index f0b0b7c..caa61df 100644
--- a/models/encoder/visualizations.py
+++ b/models/encoder/visualizations.py
@@ -21,7 +21,7 @@ colormap = np.array([
     [33, 0, 127],
     [0, 0, 0],
     [183, 183, 183],
-], dtype=np.float) / 255
+], dtype=float) / 255
 
 
 class Visualizations:
diff --git a/models/ppg_extractor/log_mel.py b/models/ppg_extractor/log_mel.py
index 1e3b87d..c07c7a8 100644
--- a/models/ppg_extractor/log_mel.py
+++ b/models/ppg_extractor/log_mel.py
@@ -31,14 +31,13 @@ class LogMel(torch.nn.Module):
         fs: int = 16000,
         n_fft: int = 512,
         n_mels: int = 80,
-        fmin: float = None,
+        fmin: float = 0,
         fmax: float = None,
         htk: bool = False,
         norm=1,
     ):
         super().__init__()
 
-        fmin = 0 if fmin is None else fmin
         fmax = fs / 2 if fmax is None else fmax
         _mel_options = dict(
             sr=fs, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax, htk=htk, norm=norm
diff --git a/models/synthesizer/audio.py b/models/synthesizer/audio.py
index 2e03ae5..fb9b72c 100644
--- a/models/synthesizer/audio.py
+++ b/models/synthesizer/audio.py
@@ -107,7 +107,7 @@ def _griffin_lim(S, hparams):
     Based on https://github.com/librosa/librosa/issues/434
     """
     angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
-    S_complex = np.abs(S).astype(np.complex)
+    S_complex = np.abs(S).astype(complex)
     y = _istft(S_complex * angles, hparams)
     for i in range(hparams.griffin_lim_iters):
         angles = np.exp(1j * np.angle(_stft(y, hparams)))
diff --git a/models/synthesizer/preprocess.py b/models/synthesizer/preprocess.py
index e460393..93705d1 100644
--- a/models/synthesizer/preprocess.py
+++ b/models/synthesizer/preprocess.py
@@ -78,12 +78,12 @@ def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
         func = partial(dataset_info["speak_func"], out_dir=out_dir, skip_existing=skip_existing,
                        hparams=hparams, dict_info=dict_info, no_alignments=no_alignments,
                        encoder_model_fpath=encoder_model_fpath)
-        job = Pool(n_processes).imap(func, speaker_dirs)
+        job = Pool(n_processes).imap_unordered(func, speaker_dirs)
         for speaker_metadata in tqdm(job, dataset, len(speaker_dirs), unit="speakers"):
             if speaker_metadata is not None:
                 for metadatum in speaker_metadata:
-                    metadata_file.write("|".join(str(x) for x in metadatum) + "\n")
+                    metadata_file.write("|".join(map(str, metadatum)) + "\n")
     metadata_file.close()
 
     # Verify the contents of the metadata file
@@ -134,7 +134,7 @@ def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_proce
     # Embed the utterances in separate threads
     func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)
     job = Pool(n_processes).imap(func, fpaths)
-    list(tqdm(job, "Embedding", len(fpaths), unit="utterances"))
+    tuple(tqdm(job, "Embedding", len(fpaths), unit="utterances"))
 
 def create_emo(synthesizer_root: Path, n_processes: int, skip_existing: bool, hparams):
     wav_dir = synthesizer_root.joinpath("audio")
@@ -152,4 +152,4 @@ def create_emo(synthesizer_root: Path, n_processes: int, skip_existing: bool, hp
     # Embed the utterances in separate threads
     func = partial(_emo_extract_from_utterance, hparams=hparams, skip_existing=skip_existing)
     job = Pool(n_processes).imap(func, fpaths)
-    list(tqdm(job, "Emo", len(fpaths), unit="utterances"))
+    tuple(tqdm(job, "Emo", len(fpaths), unit="utterances"))
diff --git a/models/synthesizer/preprocess_audio.py b/models/synthesizer/preprocess_audio.py
index 4f4893f..ca2a880 100644
--- a/models/synthesizer/preprocess_audio.py
+++ b/models/synthesizer/preprocess_audio.py
@@ -104,29 +104,58 @@ def _split_on_silences(wav_fpath, words, hparams):
         wav = logmmse.denoise(wav, profile, eta=0)
 
     resp = pinyin(words, style=Style.TONE3)
-    res = [v[0] for v in resp if v[0].strip()]
+    res = filter(lambda v: v.strip(), map(lambda v: v[0], resp))
     res = " ".join(res)
 
     return wav, res
 
 def preprocess_general(speaker_dir, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool, encoder_model_fpath: Path):
     metadata = []
-    extensions = ["*.wav", "*.flac", "*.mp3"]
-    for extension in extensions:
-        wav_fpath_list = speaker_dir.glob(extension)
-        # Iterate over each wav
-        for wav_fpath in wav_fpath_list:
-            words = dict_info.get(wav_fpath.name.split(".")[0])
-            words = dict_info.get(wav_fpath.name) if not words else words # try with extension
-            if not words:
-                print("no wordS")
-                continue
-            sub_basename = "%s_%02d" % (wav_fpath.name, 0)
-            wav, text = _split_on_silences(wav_fpath, words, hparams)
-            result = _process_utterance(wav, text, out_dir, sub_basename,
-                                        skip_existing, hparams, encoder_model_fpath)
-            if result is None:
-                continue
-            wav_fpath_name, mel_fpath_name, embed_fpath_name, wav, mel_frames, text = result
-            metadata.append([wav_fpath_name, mel_fpath_name, embed_fpath_name, len(wav), mel_frames, text])
-    return [m for m in metadata if m is not None]
+    extensions = ("*.wav", "*.flac", "*.mp3")
+    if skip_existing:
+        for extension in extensions:
+            wav_fpath_list = speaker_dir.glob(extension)
+            # Iterate over each wav
+            for wav_fpath in wav_fpath_list:
+                words = dict_info.get(wav_fpath.name.split(".")[0])
+                if not words:
+                    words = dict_info.get(wav_fpath.name)  # try with extension
+                if not words:
+                    print("no words")
+                    continue
+                sub_basename = "%s_%02d" % (wav_fpath.name, 0)
+
+                mel_fpath = out_dir.joinpath("mels", f"mel-{sub_basename}.npy")
+                wav_fpath_ = out_dir.joinpath("audio", f"audio-{sub_basename}.npy")
+
+                if mel_fpath.exists() and wav_fpath_.exists():
+                    continue
+
+                wav, text = _split_on_silences(wav_fpath, words, hparams)
+                result = _process_utterance(wav, text, out_dir, sub_basename,
+                                            False, hparams, encoder_model_fpath)  # existence already checked above, so skip the re-check
+                if result is None:
+                    continue
+                wav_fpath_name, mel_fpath_name, embed_fpath_name, wav, mel_frames, text = result
+                metadata.append((wav_fpath_name, mel_fpath_name, embed_fpath_name, len(wav), mel_frames, text))
+    else:
+        for extension in extensions:
+            wav_fpath_list = speaker_dir.glob(extension)
+            # Iterate over each wav
+            for wav_fpath in wav_fpath_list:
+                words = dict_info.get(wav_fpath.name.split(".")[0])
+                if not words:
+                    words = dict_info.get(wav_fpath.name)  # try with extension
+                if not words:
+                    print("no words")
+                    continue
+                sub_basename = "%s_%02d" % (wav_fpath.name, 0)
+
+                wav, text = _split_on_silences(wav_fpath, words, hparams)
+                result = _process_utterance(wav, text, out_dir, sub_basename,
+                                            False, hparams, encoder_model_fpath)
+                if result is None:
+                    continue
+                wav_fpath_name, mel_fpath_name, embed_fpath_name, wav, mel_frames, text = result
+                metadata.append((wav_fpath_name, mel_fpath_name, embed_fpath_name, len(wav), mel_frames, text))
+    return metadata
diff --git a/models/vocoder/wavernn/audio.py b/models/vocoder/wavernn/audio.py
index 738a374..95bff5e 100644
--- a/models/vocoder/wavernn/audio.py
+++ b/models/vocoder/wavernn/audio.py
@@ -50,7 +50,7 @@ def linear_to_mel(spectrogram):
 
 
 def build_mel_basis():
-    return librosa.filters.mel(hp.sample_rate, hp.n_fft, n_mels=hp.num_mels, fmin=hp.fmin)
+    return librosa.filters.mel(sr=hp.sample_rate, n_fft=hp.n_fft, n_mels=hp.num_mels, fmin=hp.fmin)
 
 
 def normalize(S):
diff --git a/requirements.txt b/requirements.txt
index 26a102b..c692e0d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,9 +1,8 @@
 umap-learn
 visdom
-librosa==0.8.1
+librosa
 matplotlib>=3.3.0
-numpy==1.19.3; platform_system == "Windows"
-numpy==1.19.4; platform_system != "Windows"
+numpy
 scipy>=1.0.0
 tqdm
 sounddevice
@@ -13,22 +12,22 @@ inflect
 PyQt5
 multiprocess
 numba
-webrtcvad; platform_system != "Windows"
+webrtcvad
 pypinyin
 flask
 flask_wtf
-flask_cors==3.0.10
-gevent==21.8.0
+flask_cors
+gevent
 flask_restx
-tensorboard==1.15
-streamlit==1.8.0
-PyYAML==5.4.1
+tensorboard
+streamlit
+PyYAML
 torch_complex
 espnet
 PyWavelets
 monotonic-align==0.0.3
-transformers==4.26.0
+transformers
 fastapi
 loguru
 typer[all]
-click==8.0.4
+click
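
Two library-compatibility notes may help explain the call-site edits in this patch: with librosa and numpy unpinned in requirements.txt, a fresh install will likely pull librosa >= 0.10, which accepts the `resample`/`filters.mel` parameters only as keyword arguments, and NumPy >= 1.24, which removed the deprecated `np.float`/`np.bool`/`np.complex` aliases. The snippet below is only an illustration of that updated call style; the sample rates and FFT sizes are made-up values, not the project's hparams.

```python
import librosa
import numpy as np

# Toy input: one second of noise at 16 kHz; the real code reads audio from disk
# and takes its rates from the project's hparams.
wav = np.random.randn(16000).astype(np.float32)

# librosa >= 0.10 accepts these parameters by keyword only.
wav_22k = librosa.resample(wav, orig_sr=16000, target_sr=22050)
mel_basis = librosa.filters.mel(sr=16000, n_fft=512, n_mels=80, fmin=0)

# NumPy >= 1.24 removed the np.float / np.bool / np.complex aliases;
# the builtin types are used instead.
colormap = np.array([[76, 255, 0], [183, 183, 183]], dtype=float) / 255
mask = np.round(np.array([0.2, 0.8])).astype(bool)
```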
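
The synthesizer preprocessing above also switches from `Pool.imap` to `Pool.imap_unordered`, which yields each speaker's result as soon as its worker finishes rather than in submission order, so one slow speaker does not hold back the progress bar; ordering does not matter here because every metadata line is written independently. Below is a stripped-down sketch of that pattern, not the project's actual code: `process_speaker`, the directory names, and `train.txt` are hypothetical stand-ins, and the standard-library `multiprocessing.Pool` is used for simplicity.

```python
from functools import partial
from multiprocessing import Pool

from tqdm import tqdm


def process_speaker(speaker_dir, out_dir):
    # Hypothetical stand-in for the real per-speaker preprocessing: it would
    # normally load the wavs, compute mels, and return metadata rows.
    return [(f"audio-{speaker_dir}_00.npy", f"mel-{speaker_dir}_00.npy", 16000, "some text")]


if __name__ == "__main__":
    speaker_dirs = ["speaker1", "speaker2", "speaker3"]
    func = partial(process_speaker, out_dir="SV2TTS/synthesizer")
    with Pool(4) as pool, open("train.txt", "w", encoding="utf-8") as metadata_file:
        # imap_unordered hands back results in completion order, so the bar
        # advances as soon as any worker finishes.
        job = pool.imap_unordered(func, speaker_dirs)
        for speaker_metadata in tqdm(job, "Preprocessing", len(speaker_dirs), unit="speakers"):
            for metadatum in speaker_metadata:
                metadata_file.write("|".join(map(str, metadatum)) + "\n")
```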