Some changes to make it easier to install the dependencies
parent
b78d0d2a26
commit
9f1dbeeecc
10
README-CN.md
10
README-CN.md
|
@ -44,6 +44,16 @@
|
||||||
* 运行`pip install -r requirements.txt` 来安装剩余的必要包。
|
* 运行`pip install -r requirements.txt` 来安装剩余的必要包。
|
||||||
* 安装 webrtcvad `pip install webrtcvad-wheels`。
|
* 安装 webrtcvad `pip install webrtcvad-wheels`。
|
||||||
|
|
||||||
|
或者
|
||||||
|
- 用`conda` 或者 `mamba` 安装依赖
|
||||||
|
|
||||||
|
```conda env create -n env_name -f env.yml```
|
||||||
|
|
||||||
|
```mamba env create -n env_name -f env.yml```
|
||||||
|
|
||||||
|
这会创建一个新环境并安装必需的依赖。之后用 `conda activate env_name` 切换到该环境即可。
|
||||||
|
> env.yml只包含了运行时必要的依赖,暂时不包括monotonic-align,如果想要装GPU版本的pytorch可以查看官网教程。
|
||||||
|
|
||||||
#### 1.2 M1芯片Mac环境配置(Inference Time)
|
#### 1.2 M1芯片Mac环境配置(Inference Time)
|
||||||
> 以下环境按x86-64搭建,使用原生的`demo_toolbox.py`,可作为在不改代码情况下快速使用的workaround。
|
> 以下环境按x86-64搭建,使用原生的`demo_toolbox.py`,可作为在不改代码情况下快速使用的workaround。
|
||||||
>
|
>
|
||||||
|
|
10
README.md
10
README.md
|
@ -39,6 +39,16 @@
|
||||||
* Run `pip install -r requirements.txt` to install the remaining necessary packages.
|
* Run `pip install -r requirements.txt` to install the remaining necessary packages.
|
||||||
* Install webrtcvad `pip install webrtcvad-wheels`(If you need)
|
* Install webrtcvad `pip install webrtcvad-wheels`(If you need)
|
||||||
|
|
||||||
|
or
|
||||||
|
- install dependencies with `conda` or `mamba`
|
||||||
|
|
||||||
|
```conda env create -n env_name -f env.yml```
|
||||||
|
|
||||||
|
```mamba env create -n env_name -f env.yml```
|
||||||
|
|
||||||
|
This will create a virtual environment with the necessary dependencies installed. Switch to the new environment with `conda activate env_name` and you are ready to go.
|
||||||
|
> env.yml only includes the dependencies necessary to run the project; for now it does not include monotonic-align. To install the GPU version of PyTorch, see the official website.
|
||||||
|
|
||||||
#### 1.2 Setup with a M1 Mac
|
#### 1.2 Setup with a M1 Mac
|
||||||
> The following steps are a workaround to directly use the original `demo_toolbox.py` without changing the code.
|
> The following steps are a workaround to directly use the original `demo_toolbox.py` without changing the code.
|
||||||
>
|
>
|
||||||
|
|
|
@ -78,7 +78,7 @@ if __name__ == "__main__":
|
||||||
else:
|
else:
|
||||||
train_hifigan(0, args, h)
|
train_hifigan(0, args, h)
|
||||||
elif args.vocoder_type == "fregan":
|
elif args.vocoder_type == "fregan":
|
||||||
with open('vocoder/fregan/config.json') as f:
|
with Path('vocoder/fregan/config.json').open() as f:
|
||||||
json_config = json.load(f)
|
json_config = json.load(f)
|
||||||
h = AttrDict(json_config)
|
h = AttrDict(json_config)
|
||||||
if h.num_gpus > 1:
|
if h.num_gpus > 1:
|
||||||
|
|
|
@ -33,7 +33,7 @@ colormap = np.array([
|
||||||
[0, 0, 0],
|
[0, 0, 0],
|
||||||
[183, 183, 183],
|
[183, 183, 183],
|
||||||
[76, 255, 0],
|
[76, 255, 0],
|
||||||
], dtype=np.float) / 255
|
], dtype=float) / 255
|
||||||
|
|
||||||
default_text = \
|
default_text = \
|
||||||
"欢迎使用工具箱, 现已支持中文输入!"
|
"欢迎使用工具箱, 现已支持中文输入!"
|
||||||
|
@ -402,8 +402,8 @@ class UI(QDialog):
|
||||||
self.app.processEvents()
|
self.app.processEvents()
|
||||||
|
|
||||||
def set_loading(self, value, maximum=1):
|
def set_loading(self, value, maximum=1):
|
||||||
self.loading_bar.setValue(value * 100)
|
self.loading_bar.setValue(int(value * 100))
|
||||||
self.loading_bar.setMaximum(maximum * 100)
|
self.loading_bar.setMaximum(int(maximum * 100))
|
||||||
self.loading_bar.setTextVisible(value != 0)
|
self.loading_bar.setTextVisible(value != 0)
|
||||||
self.app.processEvents()
|
self.app.processEvents()
|
||||||
|
|
||||||
|
|
|
@ -39,7 +39,7 @@ def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray],
|
||||||
|
|
||||||
# Resample the wav if needed
|
# Resample the wav if needed
|
||||||
if source_sr is not None and source_sr != sampling_rate:
|
if source_sr is not None and source_sr != sampling_rate:
|
||||||
wav = librosa.resample(wav, source_sr, sampling_rate)
|
wav = librosa.resample(wav, orig_sr = source_sr, target_sr = sampling_rate)
|
||||||
|
|
||||||
# Apply the preprocessing: normalize volume and shorten long silences
|
# Apply the preprocessing: normalize volume and shorten long silences
|
||||||
if normalize:
|
if normalize:
|
||||||
|
@ -99,7 +99,7 @@ def trim_long_silences(wav):
|
||||||
return ret[width - 1:] / width
|
return ret[width - 1:] / width
|
||||||
|
|
||||||
audio_mask = moving_average(voice_flags, vad_moving_average_width)
|
audio_mask = moving_average(voice_flags, vad_moving_average_width)
|
||||||
audio_mask = np.round(audio_mask).astype(np.bool)
|
audio_mask = np.round(audio_mask).astype(bool)
|
||||||
|
|
||||||
# Dilate the voiced regions
|
# Dilate the voiced regions
|
||||||
audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
|
audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
|
||||||
|
|
|
@ -21,7 +21,7 @@ colormap = np.array([
|
||||||
[33, 0, 127],
|
[33, 0, 127],
|
||||||
[0, 0, 0],
|
[0, 0, 0],
|
||||||
[183, 183, 183],
|
[183, 183, 183],
|
||||||
], dtype=np.float) / 255
|
], dtype=float) / 255
|
||||||
|
|
||||||
|
|
||||||
class Visualizations:
|
class Visualizations:
|
||||||
|
|
|
@ -31,14 +31,13 @@ class LogMel(torch.nn.Module):
|
||||||
fs: int = 16000,
|
fs: int = 16000,
|
||||||
n_fft: int = 512,
|
n_fft: int = 512,
|
||||||
n_mels: int = 80,
|
n_mels: int = 80,
|
||||||
fmin: float = None,
|
fmin: float = 0,
|
||||||
fmax: float = None,
|
fmax: float = None,
|
||||||
htk: bool = False,
|
htk: bool = False,
|
||||||
norm=1,
|
norm=1,
|
||||||
):
|
):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
|
||||||
fmin = 0 if fmin is None else fmin
|
|
||||||
fmax = fs / 2 if fmax is None else fmax
|
fmax = fs / 2 if fmax is None else fmax
|
||||||
_mel_options = dict(
|
_mel_options = dict(
|
||||||
sr=fs, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax, htk=htk, norm=norm
|
sr=fs, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax, htk=htk, norm=norm
|
||||||
|
|
|
@ -107,7 +107,7 @@ def _griffin_lim(S, hparams):
|
||||||
Based on https://github.com/librosa/librosa/issues/434
|
Based on https://github.com/librosa/librosa/issues/434
|
||||||
"""
|
"""
|
||||||
angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
|
angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
|
||||||
S_complex = np.abs(S).astype(np.complex)
|
S_complex = np.abs(S).astype(complex)
|
||||||
y = _istft(S_complex * angles, hparams)
|
y = _istft(S_complex * angles, hparams)
|
||||||
for i in range(hparams.griffin_lim_iters):
|
for i in range(hparams.griffin_lim_iters):
|
||||||
angles = np.exp(1j * np.angle(_stft(y, hparams)))
|
angles = np.exp(1j * np.angle(_stft(y, hparams)))
|
||||||
|
|
|
@ -78,12 +78,12 @@ def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
|
||||||
|
|
||||||
func = partial(dataset_info["speak_func"], out_dir=out_dir, skip_existing=skip_existing,
|
func = partial(dataset_info["speak_func"], out_dir=out_dir, skip_existing=skip_existing,
|
||||||
hparams=hparams, dict_info=dict_info, no_alignments=no_alignments, encoder_model_fpath=encoder_model_fpath)
|
hparams=hparams, dict_info=dict_info, no_alignments=no_alignments, encoder_model_fpath=encoder_model_fpath)
|
||||||
job = Pool(n_processes).imap(func, speaker_dirs)
|
job = Pool(n_processes).imap_unordered(func, speaker_dirs)
|
||||||
|
|
||||||
for speaker_metadata in tqdm(job, dataset, len(speaker_dirs), unit="speakers"):
|
for speaker_metadata in tqdm(job, dataset, len(speaker_dirs), unit="speakers"):
|
||||||
if speaker_metadata is not None:
|
if speaker_metadata is not None:
|
||||||
for metadatum in speaker_metadata:
|
for metadatum in speaker_metadata:
|
||||||
metadata_file.write("|".join(str(x) for x in metadatum) + "\n")
|
metadata_file.write("|".join(map(str,metadatum)) + "\n")
|
||||||
metadata_file.close()
|
metadata_file.close()
|
||||||
|
|
||||||
# Verify the contents of the metadata file
|
# Verify the contents of the metadata file
|
||||||
|
@ -134,7 +134,7 @@ def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_proce
|
||||||
# Embed the utterances in separate threads
|
# Embed the utterances in separate threads
|
||||||
func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)
|
func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)
|
||||||
job = Pool(n_processes).imap(func, fpaths)
|
job = Pool(n_processes).imap(func, fpaths)
|
||||||
list(tqdm(job, "Embedding", len(fpaths), unit="utterances"))
|
tuple(tqdm(job, "Embedding", len(fpaths), unit="utterances"))
|
||||||
|
|
||||||
def create_emo(synthesizer_root: Path, n_processes: int, skip_existing: bool, hparams):
|
def create_emo(synthesizer_root: Path, n_processes: int, skip_existing: bool, hparams):
|
||||||
wav_dir = synthesizer_root.joinpath("audio")
|
wav_dir = synthesizer_root.joinpath("audio")
|
||||||
|
@ -152,4 +152,4 @@ def create_emo(synthesizer_root: Path, n_processes: int, skip_existing: bool, hp
|
||||||
# Embed the utterances in separate threads
|
# Embed the utterances in separate threads
|
||||||
func = partial(_emo_extract_from_utterance, hparams=hparams, skip_existing=skip_existing)
|
func = partial(_emo_extract_from_utterance, hparams=hparams, skip_existing=skip_existing)
|
||||||
job = Pool(n_processes).imap(func, fpaths)
|
job = Pool(n_processes).imap(func, fpaths)
|
||||||
list(tqdm(job, "Emo", len(fpaths), unit="utterances"))
|
tuple(tqdm(job, "Emo", len(fpaths), unit="utterances"))
|
||||||
|
|
|
@ -104,29 +104,58 @@ def _split_on_silences(wav_fpath, words, hparams):
|
||||||
wav = logmmse.denoise(wav, profile, eta=0)
|
wav = logmmse.denoise(wav, profile, eta=0)
|
||||||
|
|
||||||
resp = pinyin(words, style=Style.TONE3)
|
resp = pinyin(words, style=Style.TONE3)
|
||||||
res = [v[0] for v in resp if v[0].strip()]
|
res = filter(lambda v : not v.isspace(),map(lambda v: v[0],resp))
|
||||||
res = " ".join(res)
|
res = " ".join(res)
|
||||||
|
|
||||||
return wav, res
|
return wav, res
|
||||||
|
|
||||||
def preprocess_general(speaker_dir, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool, encoder_model_fpath: Path):
|
def preprocess_general(speaker_dir, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool, encoder_model_fpath: Path):
|
||||||
metadata = []
|
metadata = []
|
||||||
extensions = ["*.wav", "*.flac", "*.mp3"]
|
extensions = ("*.wav", "*.flac", "*.mp3")
|
||||||
for extension in extensions:
|
if skip_existing:
|
||||||
wav_fpath_list = speaker_dir.glob(extension)
|
for extension in extensions:
|
||||||
# Iterate over each wav
|
wav_fpath_list = speaker_dir.glob(extension)
|
||||||
for wav_fpath in wav_fpath_list:
|
# Iterate over each wav
|
||||||
words = dict_info.get(wav_fpath.name.split(".")[0])
|
for wav_fpath in wav_fpath_list:
|
||||||
words = dict_info.get(wav_fpath.name) if not words else words # try with extension
|
words = dict_info.get(wav_fpath.name.split(".")[0])
|
||||||
if not words:
|
if not words:
|
||||||
print("no wordS")
|
words = dict_info.get(wav_fpath.name) # try with extension
|
||||||
continue
|
if not words:
|
||||||
sub_basename = "%s_%02d" % (wav_fpath.name, 0)
|
print("no wordS")
|
||||||
wav, text = _split_on_silences(wav_fpath, words, hparams)
|
continue
|
||||||
result = _process_utterance(wav, text, out_dir, sub_basename,
|
sub_basename = "%s_%02d" % (wav_fpath.name, 0)
|
||||||
skip_existing, hparams, encoder_model_fpath)
|
|
||||||
if result is None:
|
mel_fpath = out_dir.joinpath("mels", f"mel-{sub_basename}.npy")
|
||||||
continue
|
wav_fpath_ = out_dir.joinpath("audio", f"audio-{sub_basename}.npy")
|
||||||
wav_fpath_name, mel_fpath_name, embed_fpath_name, wav, mel_frames, text = result
|
|
||||||
metadata.append([wav_fpath_name, mel_fpath_name, embed_fpath_name, len(wav), mel_frames, text])
|
if mel_fpath.exists() and wav_fpath_.exists():
|
||||||
return [m for m in metadata if m is not None]
|
continue
|
||||||
|
|
||||||
|
wav, text = _split_on_silences(wav_fpath, words, hparams)
|
||||||
|
result = _process_utterance(wav, text, out_dir, sub_basename,
|
||||||
|
False, hparams, encoder_model_fpath) # accelarate
|
||||||
|
if result is None:
|
||||||
|
continue
|
||||||
|
wav_fpath_name, mel_fpath_name, embed_fpath_name, wav, mel_frames, text = result
|
||||||
|
metadata.append ((wav_fpath_name, mel_fpath_name, embed_fpath_name, len(wav), mel_frames, text))
|
||||||
|
else:
|
||||||
|
for extension in extensions:
|
||||||
|
wav_fpath_list = speaker_dir.glob(extension)
|
||||||
|
# Iterate over each wav
|
||||||
|
for wav_fpath in wav_fpath_list:
|
||||||
|
words = dict_info.get(wav_fpath.name.split(".")[0])
|
||||||
|
if not words:
|
||||||
|
words = dict_info.get(wav_fpath.name) # try with extension
|
||||||
|
if not words:
|
||||||
|
print("no wordS")
|
||||||
|
continue
|
||||||
|
sub_basename = "%s_%02d" % (wav_fpath.name, 0)
|
||||||
|
|
||||||
|
wav, text = _split_on_silences(wav_fpath, words, hparams)
|
||||||
|
result = _process_utterance(wav, text, out_dir, sub_basename,
|
||||||
|
False, hparams, encoder_model_fpath)
|
||||||
|
if result is None:
|
||||||
|
continue
|
||||||
|
wav_fpath_name, mel_fpath_name, embed_fpath_name, wav, mel_frames, text = result
|
||||||
|
metadata.append ((wav_fpath_name, mel_fpath_name, embed_fpath_name, len(wav), mel_frames, text))
|
||||||
|
return metadata
|
||||||
|
|
|
@ -50,7 +50,7 @@ def linear_to_mel(spectrogram):
|
||||||
|
|
||||||
|
|
||||||
def build_mel_basis():
|
def build_mel_basis():
|
||||||
return librosa.filters.mel(hp.sample_rate, hp.n_fft, n_mels=hp.num_mels, fmin=hp.fmin)
|
return librosa.filters.mel(sr = hp.sample_rate, n_fft = hp.n_fft, n_mels=hp.num_mels, fmin=hp.fmin)
|
||||||
|
|
||||||
|
|
||||||
def normalize(S):
|
def normalize(S):
|
||||||
|
|
|
@ -1,9 +1,8 @@
|
||||||
umap-learn
|
umap-learn
|
||||||
visdom
|
visdom
|
||||||
librosa==0.8.1
|
librosa
|
||||||
matplotlib>=3.3.0
|
matplotlib>=3.3.0
|
||||||
numpy==1.19.3; platform_system == "Windows"
|
numpy
|
||||||
numpy==1.19.4; platform_system != "Windows"
|
|
||||||
scipy>=1.0.0
|
scipy>=1.0.0
|
||||||
tqdm
|
tqdm
|
||||||
sounddevice
|
sounddevice
|
||||||
|
@ -13,22 +12,22 @@ inflect
|
||||||
PyQt5
|
PyQt5
|
||||||
multiprocess
|
multiprocess
|
||||||
numba
|
numba
|
||||||
webrtcvad; platform_system != "Windows"
|
webrtcvad
|
||||||
pypinyin
|
pypinyin
|
||||||
flask
|
flask
|
||||||
flask_wtf
|
flask_wtf
|
||||||
flask_cors==3.0.10
|
flask_cors
|
||||||
gevent==21.8.0
|
gevent
|
||||||
flask_restx
|
flask_restx
|
||||||
tensorboard==1.15
|
tensorboard
|
||||||
streamlit==1.8.0
|
streamlit
|
||||||
PyYAML==5.4.1
|
PyYAML
|
||||||
torch_complex
|
torch_complex
|
||||||
espnet
|
espnet
|
||||||
PyWavelets
|
PyWavelets
|
||||||
monotonic-align==0.0.3
|
monotonic-align==0.0.3
|
||||||
transformers==4.26.0
|
transformers
|
||||||
fastapi
|
fastapi
|
||||||
loguru
|
loguru
|
||||||
typer[all]
|
typer[all]
|
||||||
click==8.0.4
|
click
|
||||||
|
|
Loading…
Reference in New Issue