Some changes to make it easier to install the dependencies

0warning0error 2023-06-02 17:22:38 +08:00
parent b78d0d2a26
commit 9f1dbeeecc
13 changed files with 93 additions and 46 deletions

View File

@ -44,6 +44,16 @@
* Run `pip install -r requirements.txt` to install the remaining required packages.
* Install webrtcvad with `pip install webrtcvad-wheels`
or
* install the dependencies with `conda` or `mamba`:
`conda env create -n env_name -f env.yml`
`mamba env create -n env_name -f env.yml`
This will create a new environment with the required dependencies installed. Afterwards, switch to it with `conda activate env_name` and you are done.
> env.yml only includes the dependencies required at runtime; for now it does not include monotonic-align. If you want the GPU version of pytorch, see the official tutorial.
#### 1.2 Environment Setup on an M1 Mac (Inference Time)
> The following environment is built for x86-64 and uses the original `demo_toolbox.py`; it can serve as a workaround for quick use without changing any code.
>

View File

@ -39,6 +39,16 @@
* Run `pip install -r requirements.txt` to install the remaining necessary packages.
* Install webrtcvad with `pip install webrtcvad-wheels` (if you need it)
or
* install the dependencies with `conda` or `mamba`:
`conda env create -n env_name -f env.yml`
`mamba env create -n env_name -f env.yml`
This will create a virtual environment with the necessary dependencies installed. Switch to the new environment with `conda activate env_name` and you are ready to go.
> env.yml only includes the dependencies necessary to run the project; for now it does not include monotonic-align. Check the official website to install the GPU version of pytorch.
#### 1.2 Setup with an M1 Mac
> The following steps are a workaround to use the original `demo_toolbox.py` directly, without changing any code.
>
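A quick way to confirm the new environment works is to check that PyTorch imports and, optionally, sees a GPU. This is a minimal sanity check, not part of the commit:

```python
# Minimal sanity check for the freshly created environment (assumes PyTorch
# was installed from env.yml or via the official instructions).
import torch

print("torch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
```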

View File

@ -78,7 +78,7 @@ if __name__ == "__main__":
else:
train_hifigan(0, args, h)
elif args.vocoder_type == "fregan":
with open('vocoder/fregan/config.json') as f:
with Path('vocoder/fregan/config.json').open() as f:
json_config = json.load(f)
h = AttrDict(json_config)
if h.num_gpus > 1:

View File

@ -33,7 +33,7 @@ colormap = np.array([
[0, 0, 0],
[183, 183, 183],
[76, 255, 0],
], dtype=np.float) / 255
], dtype=float) / 255
default_text = \
"欢迎使用工具箱, 现已支持中文输入!"
@ -402,8 +402,8 @@ class UI(QDialog):
self.app.processEvents()
def set_loading(self, value, maximum=1):
self.loading_bar.setValue(value * 100)
self.loading_bar.setMaximum(maximum * 100)
self.loading_bar.setValue(int(value * 100))
self.loading_bar.setMaximum(int(maximum * 100))
self.loading_bar.setTextVisible(value != 0)
self.app.processEvents()
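The `dtype=float`, `astype(bool)`, and `astype(complex)` changes in this commit track NumPy's removal of the deprecated `np.float`, `np.bool`, and `np.complex` aliases (gone as of NumPy 1.24); the built-in types are drop-in replacements. The `int(...)` casts are presumably needed because newer PyQt releases reject float arguments to `setValue`/`setMaximum`. A small illustrative sketch of the NumPy side, not taken from the repository:

```python
import numpy as np

# The built-in types behave the same as the removed np.* aliases.
colormap = np.array([[76, 255, 0], [33, 0, 127]], dtype=float) / 255
mask = np.round(np.array([0.2, 0.8])).astype(bool)
spec = np.abs(np.array([1.0, -2.0])).astype(complex)
print(colormap.dtype, mask.dtype, spec.dtype)  # float64 bool complex128
```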

BIN
env.yml Normal file

Binary file not shown.

View File

@ -39,7 +39,7 @@ def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray],
# Resample the wav if needed
if source_sr is not None and source_sr != sampling_rate:
wav = librosa.resample(wav, source_sr, sampling_rate)
wav = librosa.resample(wav, orig_sr = source_sr, target_sr = sampling_rate)
# Apply the preprocessing: normalize volume and shorten long silences
if normalize:
@ -99,7 +99,7 @@ def trim_long_silences(wav):
return ret[width - 1:] / width
audio_mask = moving_average(voice_flags, vad_moving_average_width)
audio_mask = np.round(audio_mask).astype(np.bool)
audio_mask = np.round(audio_mask).astype(bool)
# Dilate the voiced regions
audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
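Recent librosa releases (0.10 and later) require the sample rates to be passed to `resample` as keyword arguments, which is why the call above gains `orig_sr=`/`target_sr=`. A minimal sketch with a synthetic signal, assuming such a librosa version is installed:

```python
import numpy as np
import librosa

# Resample one second of noise from 22.05 kHz to 16 kHz; the sample rates
# must be given as keywords in librosa >= 0.10.
wav = np.random.randn(22050).astype(np.float32)
wav_16k = librosa.resample(wav, orig_sr=22050, target_sr=16000)
print(wav_16k.shape)  # (16000,)
```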

View File

@ -21,7 +21,7 @@ colormap = np.array([
[33, 0, 127],
[0, 0, 0],
[183, 183, 183],
], dtype=np.float) / 255
], dtype=float) / 255
class Visualizations:

View File

@ -31,14 +31,13 @@ class LogMel(torch.nn.Module):
fs: int = 16000,
n_fft: int = 512,
n_mels: int = 80,
fmin: float = None,
fmin: float = 0,
fmax: float = None,
htk: bool = False,
norm=1,
):
super().__init__()
fmin = 0 if fmin is None else fmin
fmax = fs / 2 if fmax is None else fmax
_mel_options = dict(
sr=fs, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax, htk=htk, norm=norm

View File

@ -107,7 +107,7 @@ def _griffin_lim(S, hparams):
Based on https://github.com/librosa/librosa/issues/434
"""
angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
S_complex = np.abs(S).astype(np.complex)
S_complex = np.abs(S).astype(complex)
y = _istft(S_complex * angles, hparams)
for i in range(hparams.griffin_lim_iters):
angles = np.exp(1j * np.angle(_stft(y, hparams)))

View File

@ -78,12 +78,12 @@ def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
func = partial(dataset_info["speak_func"], out_dir=out_dir, skip_existing=skip_existing,
hparams=hparams, dict_info=dict_info, no_alignments=no_alignments, encoder_model_fpath=encoder_model_fpath)
job = Pool(n_processes).imap(func, speaker_dirs)
job = Pool(n_processes).imap_unordered(func, speaker_dirs)
for speaker_metadata in tqdm(job, dataset, len(speaker_dirs), unit="speakers"):
if speaker_metadata is not None:
for metadatum in speaker_metadata:
metadata_file.write("|".join(str(x) for x in metadatum) + "\n")
metadata_file.write("|".join(map(str,metadatum)) + "\n")
metadata_file.close()
# Verify the contents of the metadata file
@ -134,7 +134,7 @@ def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_proce
# Embed the utterances in separate threads
func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)
job = Pool(n_processes).imap(func, fpaths)
list(tqdm(job, "Embedding", len(fpaths), unit="utterances"))
tuple(tqdm(job, "Embedding", len(fpaths), unit="utterances"))
def create_emo(synthesizer_root: Path, n_processes: int, skip_existing: bool, hparams):
wav_dir = synthesizer_root.joinpath("audio")
@ -152,4 +152,4 @@ def create_emo(synthesizer_root: Path, n_processes: int, skip_existing: bool, hp
# Embed the utterances in separate threads
func = partial(_emo_extract_from_utterance, hparams=hparams, skip_existing=skip_existing)
job = Pool(n_processes).imap(func, fpaths)
list(tqdm(job, "Emo", len(fpaths), unit="utterances"))
tuple(tqdm(job, "Emo", len(fpaths), unit="utterances"))
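The switch from `imap` to `imap_unordered` lets finished speakers be consumed as soon as any worker returns them, rather than in submission order, which keeps the tqdm progress bar moving when speaker directories vary a lot in size. A standalone sketch of the pattern (illustrative names only, not code from the repository):

```python
from multiprocessing import Pool
from tqdm import tqdm

def slow_square(x):
    return x * x

if __name__ == "__main__":
    items = list(range(100))
    with Pool(4) as pool:
        # Results arrive in completion order, so the bar advances steadily
        # even when a few items take much longer than the rest.
        results = list(tqdm(pool.imap_unordered(slow_square, items), total=len(items)))
    print(len(results))
```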

View File

@ -104,29 +104,58 @@ def _split_on_silences(wav_fpath, words, hparams):
wav = logmmse.denoise(wav, profile, eta=0)
resp = pinyin(words, style=Style.TONE3)
res = [v[0] for v in resp if v[0].strip()]
res = filter(lambda v : not v.isspace(),map(lambda v: v[0],resp))
res = " ".join(res)
return wav, res
def preprocess_general(speaker_dir, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool, encoder_model_fpath: Path):
metadata = []
extensions = ["*.wav", "*.flac", "*.mp3"]
for extension in extensions:
wav_fpath_list = speaker_dir.glob(extension)
# Iterate over each wav
for wav_fpath in wav_fpath_list:
words = dict_info.get(wav_fpath.name.split(".")[0])
words = dict_info.get(wav_fpath.name) if not words else words # try with extension
if not words:
print("no wordS")
continue
sub_basename = "%s_%02d" % (wav_fpath.name, 0)
wav, text = _split_on_silences(wav_fpath, words, hparams)
result = _process_utterance(wav, text, out_dir, sub_basename,
skip_existing, hparams, encoder_model_fpath)
if result is None:
continue
wav_fpath_name, mel_fpath_name, embed_fpath_name, wav, mel_frames, text = result
metadata.append([wav_fpath_name, mel_fpath_name, embed_fpath_name, len(wav), mel_frames, text])
return [m for m in metadata if m is not None]
extensions = ("*.wav", "*.flac", "*.mp3")
if skip_existing:
for extension in extensions:
wav_fpath_list = speaker_dir.glob(extension)
# Iterate over each wav
for wav_fpath in wav_fpath_list:
words = dict_info.get(wav_fpath.name.split(".")[0])
if not words:
words = dict_info.get(wav_fpath.name) # try with extension
if not words:
print("no wordS")
continue
sub_basename = "%s_%02d" % (wav_fpath.name, 0)
mel_fpath = out_dir.joinpath("mels", f"mel-{sub_basename}.npy")
wav_fpath_ = out_dir.joinpath("audio", f"audio-{sub_basename}.npy")
if mel_fpath.exists() and wav_fpath_.exists():
continue
wav, text = _split_on_silences(wav_fpath, words, hparams)
result = _process_utterance(wav, text, out_dir, sub_basename,
False, hparams, encoder_model_fpath) # accelarate
if result is None:
continue
wav_fpath_name, mel_fpath_name, embed_fpath_name, wav, mel_frames, text = result
metadata.append ((wav_fpath_name, mel_fpath_name, embed_fpath_name, len(wav), mel_frames, text))
else:
for extension in extensions:
wav_fpath_list = speaker_dir.glob(extension)
# Iterate over each wav
for wav_fpath in wav_fpath_list:
words = dict_info.get(wav_fpath.name.split(".")[0])
if not words:
words = dict_info.get(wav_fpath.name) # try with extension
if not words:
print("no wordS")
continue
sub_basename = "%s_%02d" % (wav_fpath.name, 0)
wav, text = _split_on_silences(wav_fpath, words, hparams)
result = _process_utterance(wav, text, out_dir, sub_basename,
False, hparams, encoder_model_fpath)
if result is None:
continue
wav_fpath_name, mel_fpath_name, embed_fpath_name, wav, mel_frames, text = result
metadata.append ((wav_fpath_name, mel_fpath_name, embed_fpath_name, len(wav), mel_frames, text))
return metadata
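The refactored `preprocess_general` splits the `skip_existing` case into its own loop: it checks for the already-written mel and audio arrays up front and skips the whole utterance, instead of passing `skip_existing` down into `_process_utterance`. A hypothetical helper capturing that check (the paths mirror those used above, but this function is not part of the commit):

```python
from pathlib import Path

def already_processed(out_dir: Path, sub_basename: str) -> bool:
    # An utterance is considered done when both its mel spectrogram and its
    # preprocessed audio array already exist on disk.
    mel_fpath = out_dir.joinpath("mels", f"mel-{sub_basename}.npy")
    wav_fpath = out_dir.joinpath("audio", f"audio-{sub_basename}.npy")
    return mel_fpath.exists() and wav_fpath.exists()
```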

View File

@ -50,7 +50,7 @@ def linear_to_mel(spectrogram):
def build_mel_basis():
return librosa.filters.mel(hp.sample_rate, hp.n_fft, n_mels=hp.num_mels, fmin=hp.fmin)
return librosa.filters.mel(sr = hp.sample_rate, n_fft = hp.n_fft, n_mels=hp.num_mels, fmin=hp.fmin)
def normalize(S):
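As with `resample`, newer librosa versions only accept keyword arguments for `filters.mel`, hence the `sr=`/`n_fft=` keywords above. A small sketch with assumed values rather than the project's `hp.*` settings:

```python
import librosa

# Build an 80-band mel filterbank; all arguments must be keywords in
# librosa >= 0.10. The numbers here are placeholders.
mel_basis = librosa.filters.mel(sr=16000, n_fft=800, n_mels=80, fmin=55)
print(mel_basis.shape)  # (80, 401)
```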

View File

@ -1,9 +1,8 @@
umap-learn
visdom
librosa==0.8.1
librosa
matplotlib>=3.3.0
numpy==1.19.3; platform_system == "Windows"
numpy==1.19.4; platform_system != "Windows"
numpy
scipy>=1.0.0
tqdm
sounddevice
@ -13,22 +12,22 @@ inflect
PyQt5
multiprocess
numba
webrtcvad; platform_system != "Windows"
webrtcvad
pypinyin
flask
flask_wtf
flask_cors==3.0.10
gevent==21.8.0
flask_cors
gevent
flask_restx
tensorboard==1.15
streamlit==1.8.0
PyYAML==5.4.1
tensorboard
streamlit
PyYAML
torch_complex
espnet
PyWavelets
monotonic-align==0.0.3
transformers==4.26.0
transformers
fastapi
loguru
typer[all]
click==8.0.4
click