Some changes to make it easier to install the dependencies

pull/908/head
0warning0error 2023-06-02 17:22:38 +08:00
parent b78d0d2a26
commit 9f1dbeeecc
13 changed files with 93 additions and 46 deletions

View File

@@ -44,6 +44,16 @@
* Run `pip install -r requirements.txt` to install the remaining required packages.
* Install webrtcvad with `pip install webrtcvad-wheels`
or
- install dependencies with `conda` or `mamba`
```conda env create -n env_name -f env.yml```
```mamba env create -n env_name -f env.yml```
This will create a new environment with the required dependencies installed. Then switch to it with `conda activate env_name` and you are done.
> env.yml only includes the dependencies needed at runtime (monotonic-align is not included for now). If you want to install the GPU version of pytorch, see the official tutorial.
#### 1.2 Environment Setup for an M1 Mac (Inference Time)
> The following environment is built for x86-64 and uses the original `demo_toolbox.py`; it can serve as a workaround for quick use without changing any code.
>

View File

@@ -39,6 +39,16 @@
* Run `pip install -r requirements.txt` to install the remaining necessary packages.
* Install webrtcvad with `pip install webrtcvad-wheels` (if you need it)
or
- install dependencies with `conda` or `mamba`
```conda env create -n env_name -f env.yml```
```mamba env create -n env_name -f env.yml```
will create a virtual environment with the necessary dependencies installed. Switch to the new environment with `conda activate env_name` and you are ready to go.
> env.yml only includes the dependencies necessary to run the project (monotonic-align is not included for now). Check the official website if you want to install the GPU version of pytorch.
#### 1.2 Setup with an M1 Mac
> The following steps are a workaround for using the original `demo_toolbox.py` directly, without any code changes.
>

View File

@@ -78,7 +78,7 @@ if __name__ == "__main__":
else:
train_hifigan(0, args, h)
elif args.vocoder_type == "fregan":
with open('vocoder/fregan/config.json') as f:
with Path('vocoder/fregan/config.json').open() as f:
json_config = json.load(f)
h = AttrDict(json_config)
if h.num_gpus > 1:
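For reference, a minimal sketch of the updated config loading (not project code; it assumes `from pathlib import Path` is already available in the training script). `Path.open()` behaves like the builtin `open()`, so the change mainly keeps the file handling consistent with the pathlib usage elsewhere:

```python
import json
from pathlib import Path

# Path.open() is equivalent to the builtin open() here; the path just
# mirrors the fregan config referenced in the diff above.
with Path('vocoder/fregan/config.json').open() as f:
    json_config = json.load(f)
```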

View File

@@ -33,7 +33,7 @@ colormap = np.array([
[0, 0, 0],
[183, 183, 183],
[76, 255, 0],
], dtype=np.float) / 255
], dtype=float) / 255
default_text = \
"欢迎使用工具箱, 现已支持中文输入!"
@@ -402,8 +402,8 @@ class UI(QDialog):
self.app.processEvents()
def set_loading(self, value, maximum=1):
self.loading_bar.setValue(value * 100)
self.loading_bar.setMaximum(maximum * 100)
self.loading_bar.setValue(int(value * 100))
self.loading_bar.setMaximum(int(maximum * 100))
self.loading_bar.setTextVisible(value != 0)
self.app.processEvents()
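A brief illustration of why these toolbox changes are needed (a sketch, not part of the commit): `np.float` was deprecated in NumPy 1.20 and removed in 1.24, so the builtin `float` is used instead (the same reasoning applies to the `np.bool` and `np.complex` replacements later in this commit, and it matters now that requirements.txt no longer pins numpy to 1.19). Qt's progress-bar setters expect integers, so the fractional progress values are scaled and cast to `int` first:

```python
import numpy as np

# The builtin float maps to float64 in np.array, matching the old np.float alias.
colormap = np.array([[0, 0, 0], [183, 183, 183], [76, 255, 0]], dtype=float) / 255

# Progress values are fractions; scale and truncate before handing them to Qt.
value, maximum = 0.37, 1
print(int(value * 100), int(maximum * 100))  # -> 37 100
```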

BIN
env.yml Normal file

Binary file not shown.

View File

@@ -39,7 +39,7 @@ def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray],
# Resample the wav if needed
if source_sr is not None and source_sr != sampling_rate:
wav = librosa.resample(wav, source_sr, sampling_rate)
wav = librosa.resample(wav, orig_sr = source_sr, target_sr = sampling_rate)
# Apply the preprocessing: normalize volume and shorten long silences
if normalize:
@@ -99,7 +99,7 @@ def trim_long_silences(wav):
return ret[width - 1:] / width
audio_mask = moving_average(voice_flags, vad_moving_average_width)
audio_mask = np.round(audio_mask).astype(np.bool)
audio_mask = np.round(audio_mask).astype(bool)
# Dilate the voiced regions
audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
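As a reference for these two encoder changes (a minimal sketch with synthetic audio, not project code): librosa 0.10 made the resampling rates keyword-only, and the removed `np.bool` alias is replaced by the builtin `bool`:

```python
import numpy as np
import librosa

# Synthetic one-second clip standing in for a loaded wav.
source_sr, sampling_rate = 48000, 16000
wav = np.random.randn(source_sr).astype(np.float32)

# librosa >= 0.10 rejects the old positional call resample(wav, 48000, 16000).
wav = librosa.resample(wav, orig_sr=source_sr, target_sr=sampling_rate)

# np.bool was removed in NumPy 1.24; the builtin bool behaves the same here.
audio_mask = np.round(np.array([0.2, 0.7, 0.9])).astype(bool)
```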

View File

@@ -21,7 +21,7 @@ colormap = np.array([
[33, 0, 127],
[0, 0, 0],
[183, 183, 183],
], dtype=np.float) / 255
], dtype=float) / 255
class Visualizations:

View File

@@ -31,14 +31,13 @@ class LogMel(torch.nn.Module):
fs: int = 16000,
n_fft: int = 512,
n_mels: int = 80,
fmin: float = None,
fmin: float = 0,
fmax: float = None,
htk: bool = False,
norm=1,
):
super().__init__()
fmin = 0 if fmin is None else fmin
fmax = fs / 2 if fmax is None else fmax
_mel_options = dict(
sr=fs, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax, htk=htk, norm=norm
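For clarity, a small sketch (a hypothetical helper, not the project's class) showing that defaulting `fmin` to 0 in the signature is equivalent to the removed `fmin = 0 if fmin is None else fmin` line, while `fmax` keeps its `None` check because its fallback depends on the sampling rate:

```python
# Hypothetical stand-in for LogMel.__init__'s option handling.
def mel_options(fs: int = 16000, n_fft: int = 512, n_mels: int = 80,
                fmin: float = 0, fmax: float = None, htk: bool = False, norm=1):
    fmax = fs / 2 if fmax is None else fmax  # only fmax still needs a fallback
    return dict(sr=fs, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax,
                htk=htk, norm=norm)

print(mel_options())  # fmin defaults to 0, fmax to fs / 2 = 8000.0
```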

View File

@@ -107,7 +107,7 @@ def _griffin_lim(S, hparams):
Based on https://github.com/librosa/librosa/issues/434
"""
angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
S_complex = np.abs(S).astype(np.complex)
S_complex = np.abs(S).astype(complex)
y = _istft(S_complex * angles, hparams)
for i in range(hparams.griffin_lim_iters):
angles = np.exp(1j * np.angle(_stft(y, hparams)))

View File

@@ -78,12 +78,12 @@ def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
func = partial(dataset_info["speak_func"], out_dir=out_dir, skip_existing=skip_existing,
hparams=hparams, dict_info=dict_info, no_alignments=no_alignments, encoder_model_fpath=encoder_model_fpath)
job = Pool(n_processes).imap(func, speaker_dirs)
job = Pool(n_processes).imap_unordered(func, speaker_dirs)
for speaker_metadata in tqdm(job, dataset, len(speaker_dirs), unit="speakers"):
if speaker_metadata is not None:
for metadatum in speaker_metadata:
metadata_file.write("|".join(str(x) for x in metadatum) + "\n")
metadata_file.write("|".join(map(str,metadatum)) + "\n")
metadata_file.close()
# Verify the contents of the metadata file
@@ -134,7 +134,7 @@ def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_proce
# Embed the utterances in separate threads
func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)
job = Pool(n_processes).imap(func, fpaths)
list(tqdm(job, "Embedding", len(fpaths), unit="utterances"))
tuple(tqdm(job, "Embedding", len(fpaths), unit="utterances"))
def create_emo(synthesizer_root: Path, n_processes: int, skip_existing: bool, hparams):
wav_dir = synthesizer_root.joinpath("audio")
@@ -152,4 +152,4 @@ def create_emo(synthesizer_root: Path, n_processes: int, skip_existing: bool, hp
# Embed the utterances in separate threads
func = partial(_emo_extract_from_utterance, hparams=hparams, skip_existing=skip_existing)
job = Pool(n_processes).imap(func, fpaths)
list(tqdm(job, "Emo", len(fpaths), unit="utterances"))
tuple(tqdm(job, "Emo", len(fpaths), unit="utterances"))

View File

@@ -104,29 +104,58 @@ def _split_on_silences(wav_fpath, words, hparams):
wav = logmmse.denoise(wav, profile, eta=0)
resp = pinyin(words, style=Style.TONE3)
res = [v[0] for v in resp if v[0].strip()]
res = filter(lambda v : not v.isspace(),map(lambda v: v[0],resp))
res = " ".join(res)
return wav, res
def preprocess_general(speaker_dir, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool, encoder_model_fpath: Path):
metadata = []
extensions = ["*.wav", "*.flac", "*.mp3"]
for extension in extensions:
wav_fpath_list = speaker_dir.glob(extension)
# Iterate over each wav
for wav_fpath in wav_fpath_list:
words = dict_info.get(wav_fpath.name.split(".")[0])
words = dict_info.get(wav_fpath.name) if not words else words # try with extension
if not words:
print("no wordS")
continue
sub_basename = "%s_%02d" % (wav_fpath.name, 0)
wav, text = _split_on_silences(wav_fpath, words, hparams)
result = _process_utterance(wav, text, out_dir, sub_basename,
skip_existing, hparams, encoder_model_fpath)
if result is None:
continue
wav_fpath_name, mel_fpath_name, embed_fpath_name, wav, mel_frames, text = result
metadata.append([wav_fpath_name, mel_fpath_name, embed_fpath_name, len(wav), mel_frames, text])
return [m for m in metadata if m is not None]
extensions = ("*.wav", "*.flac", "*.mp3")
if skip_existing:
for extension in extensions:
wav_fpath_list = speaker_dir.glob(extension)
# Iterate over each wav
for wav_fpath in wav_fpath_list:
words = dict_info.get(wav_fpath.name.split(".")[0])
if not words:
words = dict_info.get(wav_fpath.name) # try with extension
if not words:
print("no wordS")
continue
sub_basename = "%s_%02d" % (wav_fpath.name, 0)
mel_fpath = out_dir.joinpath("mels", f"mel-{sub_basename}.npy")
wav_fpath_ = out_dir.joinpath("audio", f"audio-{sub_basename}.npy")
if mel_fpath.exists() and wav_fpath_.exists():
continue
wav, text = _split_on_silences(wav_fpath, words, hparams)
result = _process_utterance(wav, text, out_dir, sub_basename,
False, hparams, encoder_model_fpath) # accelarate
if result is None:
continue
wav_fpath_name, mel_fpath_name, embed_fpath_name, wav, mel_frames, text = result
metadata.append ((wav_fpath_name, mel_fpath_name, embed_fpath_name, len(wav), mel_frames, text))
else:
for extension in extensions:
wav_fpath_list = speaker_dir.glob(extension)
# Iterate over each wav
for wav_fpath in wav_fpath_list:
words = dict_info.get(wav_fpath.name.split(".")[0])
if not words:
words = dict_info.get(wav_fpath.name) # try with extension
if not words:
print("no wordS")
continue
sub_basename = "%s_%02d" % (wav_fpath.name, 0)
wav, text = _split_on_silences(wav_fpath, words, hparams)
result = _process_utterance(wav, text, out_dir, sub_basename,
False, hparams, encoder_model_fpath)
if result is None:
continue
wav_fpath_name, mel_fpath_name, embed_fpath_name, wav, mel_frames, text = result
metadata.append ((wav_fpath_name, mel_fpath_name, embed_fpath_name, len(wav), mel_frames, text))
return metadata
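The reworked `skip_existing` branch checks for the cached mel/audio arrays before doing any denoising or alignment work on an utterance; a condensed sketch of that fast-path idea (the helper name here is illustrative):

```python
from pathlib import Path

def already_processed(out_dir: Path, sub_basename: str) -> bool:
    """Return True when both cached arrays from a previous run exist, so the
    expensive _split_on_silences/_process_utterance steps can be skipped."""
    mel_fpath = out_dir.joinpath("mels", f"mel-{sub_basename}.npy")
    wav_fpath = out_dir.joinpath("audio", f"audio-{sub_basename}.npy")
    return mel_fpath.exists() and wav_fpath.exists()
```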

View File

@@ -50,7 +50,7 @@ def linear_to_mel(spectrogram):
def build_mel_basis():
return librosa.filters.mel(hp.sample_rate, hp.n_fft, n_mels=hp.num_mels, fmin=hp.fmin)
return librosa.filters.mel(sr = hp.sample_rate, n_fft = hp.n_fft, n_mels=hp.num_mels, fmin=hp.fmin)
def normalize(S):
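Like `librosa.resample` above, `librosa.filters.mel` takes keyword-only arguments in librosa 0.10+, so the old positional call fails; a minimal check (parameter values here are arbitrary):

```python
import librosa

# librosa >= 0.10: sr and n_fft must be passed by keyword.
mel_basis = librosa.filters.mel(sr=16000, n_fft=800, n_mels=80, fmin=55)
print(mel_basis.shape)  # (80, 401) == (n_mels, 1 + n_fft // 2)
```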

View File

@@ -1,9 +1,8 @@
umap-learn
visdom
librosa==0.8.1
librosa
matplotlib>=3.3.0
numpy==1.19.3; platform_system == "Windows"
numpy==1.19.4; platform_system != "Windows"
numpy
scipy>=1.0.0
tqdm
sounddevice
@@ -13,22 +12,22 @@ inflect
PyQt5
multiprocess
numba
webrtcvad; platform_system != "Windows"
webrtcvad
pypinyin
flask
flask_wtf
flask_cors==3.0.10
gevent==21.8.0
flask_cors
gevent
flask_restx
tensorboard==1.15
streamlit==1.8.0
PyYAML==5.4.1
tensorboard
streamlit
PyYAML
torch_complex
espnet
PyWavelets
monotonic-align==0.0.3
transformers==4.26.0
transformers
fastapi
loguru
typer[all]
click==8.0.4
click