mirror of
https://github.com/babysor/MockingBird.git
synced 2024-03-22 13:11:31 +08:00
Fre-GAN (#544)
* 替换了vocoder * 修改了vocoder_train * 减谱法 * 美化UI;语音增强;MFCC特征可视化 * 修复了训练fregan模型时的报错 * 增加了可以分析音频特征的独立文件 * 现已支持Fre-GAN声码器的训练 * 修复了训练fregan时保存模型的BUG * 删除了无用的文件 * 优化了识别声码器模型的方式
This commit is contained in:
parent
875fe15069
commit
86ea11affd
@ -79,6 +79,10 @@
|
|||||||
`python vocoder_train.py <trainid> <datasets_root> hifigan`
|
`python vocoder_train.py <trainid> <datasets_root> hifigan`
|
||||||
> `<trainid>`替换为你想要的标识,同一标识再次训练时会延续原模型
|
> `<trainid>`替换为你想要的标识,同一标识再次训练时会延续原模型
|
||||||
|
|
||||||
|
* 训练Fre-GAN声码器:
|
||||||
|
`python vocoder_train.py <trainid> <datasets_root> --config config.json fregan`
|
||||||
|
> `<trainid>`替换为你想要的标识,同一标识再次训练时会延续原模型
|
||||||
|
|
||||||
### 3. 启动程序或工具箱
|
### 3. 启动程序或工具箱
|
||||||
您可以尝试使用以下命令:
|
您可以尝试使用以下命令:
|
||||||
|
|
||||||
|
43
analysis.py
Normal file
43
analysis.py
Normal file
@ -0,0 +1,43 @@
|
|||||||
|
"""Plot the waveform, power spectrum and spectrogram of a WAV file.

Stand-alone analysis script: it reads ``AudioName`` from the current working
directory and opens three matplotlib figures.
"""
import matplotlib.pyplot as plt
import numpy as np
from scipy import signal
from scipy.io import wavfile  # scipy library to read wav files

AudioName = "target.wav"  # Audio file to analyse
fs, Audiodata = wavfile.read(AudioName)

# wavfile.read returns an (n_samples, n_channels) array for multi-channel
# files. The FFT and the spectrogram below expect a 1-D signal, so only the
# first channel is analysed in that case.
if Audiodata.ndim > 1:
    Audiodata = Audiodata[:, 0]

# Smallest positive float, used to keep log10() away from exact zeros.
_TINY = np.finfo(float).tiny

# Plot the audio signal in time
plt.plot(Audiodata)
plt.title('Audio signal in time', size=16)

# Spectrum: one-sided FFT of the real-valued signal (n // 2 + 1 bins).
n = len(Audiodata)
AudioFreq = np.fft.rfft(Audiodata)
MagFreq = np.abs(AudioFreq) / float(n)  # Normalised magnitude

# Power spectrum
MagFreq = MagFreq ** 2
# Fold the discarded negative-frequency half back in: every bin is doubled
# except DC, and except the Nyquist bin, which only exists when n is even.
if n % 2 > 0:  # odd number of samples
    MagFreq[1:] *= 2
else:  # even number of samples
    MagFreq[1:-1] *= 2

plt.figure()
freqAxis = np.arange(0, int(np.ceil((n + 1) / 2.0)), 1.0) * (fs / n)
plt.plot(freqAxis / 1000.0, 10 * np.log10(np.maximum(MagFreq, _TINY)))  # Power spectrum
plt.xlabel('Frequency (kHz)')
plt.ylabel('Power spectrum (dB)')

# Spectrogram
N = 512  # Number of points in the FFT
# The window functions live in scipy.signal.windows; the old
# scipy.signal.blackman alias is not available in recent SciPy releases.
f, t, Sxx = signal.spectrogram(Audiodata, fs, window=signal.windows.blackman(N), nfft=N)
plt.figure()
plt.pcolormesh(t, f, 10 * np.log10(np.maximum(Sxx, _TINY)))  # dB spectrogram
# plt.pcolormesh(t, f, Sxx)  # Linear spectrogram
plt.ylabel('Frequency [Hz]')
plt.xlabel('Time [seg]')
plt.title('Spectrogram with scipy.signal', size=16)

plt.show()
|
BIN
fmcc_result.png
Normal file
BIN
fmcc_result.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 188 KiB |
BIN
fmcc_source.png
Normal file
BIN
fmcc_source.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 195 KiB |
188
specdeno/enhance_speach.py
Normal file
188
specdeno/enhance_speach.py
Normal file
@ -0,0 +1,188 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
import librosa
|
||||||
|
import numpy as np
|
||||||
|
import wave
|
||||||
|
import math
|
||||||
|
from synthesizer.hparams import hparams
|
||||||
|
import os
|
||||||
|
import ctypes as ct
|
||||||
|
from encoder import inference as encoder
|
||||||
|
from utils import logmmse
|
||||||
|
|
||||||
|
|
||||||
|
def _next_pow2(value):
    """Return the smallest integer ``p`` such that ``2 ** p >= abs(value)``.

    Returns 0 for ``value == 0``.
    """
    if value == 0:
        return 0
    mantissa, exponent = math.frexp(abs(value))
    # frexp gives abs(value) == mantissa * 2 ** exponent with
    # 0.5 <= mantissa < 1, so an exact power of two has mantissa 0.5 and
    # needs one power less.
    return exponent - 1 if mantissa == 0.5 else exponent


def _oversubtraction_factor(snr_db, power_domain):
    """Return the SNR-dependent over-subtraction factor (alpha).

    ``power_domain`` selects the mapping used for power-spectrum subtraction
    (exponent 2); otherwise the magnitude-spectrum mapping is used. The
    factor falls linearly with the SNR between -5 dB and 20 dB and is
    clamped outside that range.
    """
    if snr_db < -5.0:
        return 5 if power_domain else 4
    if snr_db > 20.0:
        return 1
    if power_domain:
        return 4 - snr_db * 3 / 20
    return 3 - snr_db * 2 / 20


def enhance(fpath):
    """Denoise a WAV file and return it as a preprocessed waveform.

    Processing steps, in order:

    1. Spectral subtraction on 20 ms Hamming-windowed frames with 50 %
       overlap-add. The noise spectrum is estimated from the first five
       frames (assumed to be noise/silence) and updated on frames whose
       segmental SNR is below the threshold.
    2. The result is written to a temporary WAV file and re-loaded with
       ``librosa`` at ``hparams.sample_rate``.
    3. Optional peak rescaling (``hparams.rescale``), ``logmmse`` denoising
       using the first and last 0.15 s as the noise profile, and
       ``encoder.preprocess_wav``.

    Args:
        fpath: Path (``str`` or path-like) of the WAV file to read.

    Returns:
        The waveform returned by ``encoder.preprocess_wav``.

    Raises:
        ValueError: If the file is shorter than the five frames needed for
            the initial noise estimate.
    """
    import tempfile  # local import: only needed for the intermediate file

    # Read the header and the raw frames; the context manager closes the
    # file even if reading fails.
    with wave.open(str(fpath), "rb") as reader:
        # (nchannels, sampwidth, framerate, nframes, comptype, compname)
        params = reader.getparams()
        nchannels, sampwidth, framerate, nframes = params[:4]
        raw_frames = reader.readframes(nframes)
    fs = framerate

    # NOTE(review): samples are interpreted as 16-bit PCM regardless of
    # `sampwidth`; other sample widths are not handled here.
    x = np.frombuffer(raw_frames, dtype=np.short)
    if nchannels > 1:
        # Samples are interleaved per channel; down-mix to mono so that the
        # framing below operates on consecutive samples of one signal.
        usable = len(x) - len(x) % nchannels
        x = x[:usable].reshape(-1, nchannels).mean(axis=1)

    # Frame geometry
    len_ = 20 * fs // 1000  # frame size in samples (20 ms)
    if len_ % 2:
        # Overlap-add with 50 % overlap needs two equal halves; an odd frame
        # length (e.g. at 22050 Hz) makes the halves differ in size.
        len_ += 1
    PERC = 50  # window overlap as a percentage of the frame
    len1 = len_ * PERC // 100  # overlapping part
    len2 = len_ - len1  # non-overlapping part (hop size)

    # Default algorithm parameters
    Thres = 3  # segmental SNR (dB) below which a frame updates the noise estimate
    Expnt = 2.0  # 2.0: power-spectrum subtraction, 1.0: magnitude-spectrum
    beta = 0.002  # spectral floor factor
    G = 0.9  # smoothing factor for the noise update
    noise_frames = 5
    eps = np.finfo(float).eps

    win = np.hamming(len_)
    # Normalisation gain for overlap+add with 50 % overlap
    winGain = len2 / sum(win)

    if len(x) < noise_frames * len_:
        raise ValueError(
            "audio is too short for noise estimation: need at least %d samples, got %d"
            % (noise_frames * len_, len(x))
        )

    # Noise magnitude estimate: the first five frames are assumed to contain
    # only noise/silence.
    nFFT = 2 * 2 ** _next_pow2(len_)
    noise_mean = np.zeros(nFFT)
    for i in range(noise_frames):
        segment = x[i * len_:(i + 1) * len_]
        noise_mean = noise_mean + np.abs(np.fft.fft(win * segment, nFFT))
    noise_mu = noise_mean / noise_frames

    # Overlap-add state and output buffer
    x_old = np.zeros(len1)
    Nframes = len(x) // len2 - 1
    xfinal = np.zeros(Nframes * len2)
    power_domain = Expnt != 1.0

    # ========================= Start Processing ===============================
    for frame_idx in range(Nframes):
        start = frame_idx * len2

        # Window the frame and take its spectrum; keep the noisy phase.
        spec = np.fft.fft(win * x[start:start + len_], nFFT)
        sig = np.abs(spec)
        theta = np.angle(spec)

        # Segmental SNR. eps keeps the ratio finite when the frame and/or the
        # noise estimate is all zeros (digital silence).
        SNRseg = 10 * np.log10(
            (np.linalg.norm(sig, 2) ** 2 + eps) / (np.linalg.norm(noise_mu, 2) ** 2 + eps)
        )
        alpha = _oversubtraction_factor(SNRseg, power_domain)

        noise_pow = noise_mu ** Expnt
        sub_speech = sig ** Expnt - alpha * noise_pow
        # Where the subtraction falls below the spectral floor, use the floor
        # (a scaled copy of the noise estimate) instead.
        floor = beta * noise_pow
        below_floor = (sub_speech - floor) < 0
        sub_speech[below_floor] = floor[below_floor]

        # --- simple VAD: low-SNR frames update the noise spectrum ----------
        if SNRseg < Thres:
            noise_temp = G * noise_pow + (1 - G) * sig ** Expnt  # smoothed noise power
            noise_mu = noise_temp ** (1 / Expnt)  # new noise magnitude

        # Mirror the lower half onto the upper half so the spectrum stays
        # symmetric.
        sub_speech[nFFT // 2 + 1:nFFT] = np.flipud(sub_speech[1:nFFT // 2])
        # Recombine the enhanced magnitude with the noisy phase and invert.
        xi = np.fft.ifft(sub_speech ** (1 / Expnt) * np.exp(1j * theta)).real

        # --- Overlap and add ---------------
        xfinal[start:start + len2] = x_old + xi[:len1]
        x_old = xi[len1:len_]

    # Clip before the int16 cast so loud samples saturate instead of wrapping.
    wave_data = np.clip(winGain * xfinal, -32768, 32767).astype(np.short)

    # Round-trip through a private temporary file so librosa can resample
    # the result; the file is removed even if loading fails.
    fd, tmp_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    try:
        with wave.open(tmp_path, "wb") as writer:
            writer.setparams(params)
            writer.setnchannels(1)  # the processed signal is always mono
            writer.writeframes(wave_data.tobytes())
        wav = librosa.load(tmp_path, sr=hparams.sample_rate)[0]
    finally:
        os.remove(tmp_path)

    if hparams.rescale:
        peak = np.abs(wav).max()
        if peak > 0:  # leave pure silence untouched instead of dividing by zero
            wav = wav / peak * hparams.rescaling_max

    # Denoise with logmmse. The noise profile is built from the first and
    # last 0.15 s, so the waveform must be longer than 0.4 s.
    if len(wav) > hparams.sample_rate * (0.3 + 0.1):
        noise_wav = np.concatenate([wav[:int(hparams.sample_rate * 0.15)],
                                    wav[-int(hparams.sample_rate * 0.15):]])
        profile = logmmse.profile_noise(noise_wav, hparams.sample_rate)
        wav = logmmse.denoise(wav, profile)

    # Trim excessive silences
    wav = encoder.preprocess_wav(wav)

    return wav
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -3,6 +3,7 @@ from encoder import inference as encoder
|
|||||||
from synthesizer.inference import Synthesizer
|
from synthesizer.inference import Synthesizer
|
||||||
from vocoder.wavernn import inference as rnn_vocoder
|
from vocoder.wavernn import inference as rnn_vocoder
|
||||||
from vocoder.hifigan import inference as gan_vocoder
|
from vocoder.hifigan import inference as gan_vocoder
|
||||||
|
from vocoder.fregan import inference as fgan_vocoder
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from time import perf_counter as timer
|
from time import perf_counter as timer
|
||||||
from toolbox.utterance import Utterance
|
from toolbox.utterance import Utterance
|
||||||
@ -13,6 +14,10 @@ import torch
|
|||||||
import librosa
|
import librosa
|
||||||
import re
|
import re
|
||||||
from audioread.exceptions import NoBackendError
|
from audioread.exceptions import NoBackendError
|
||||||
|
from specdeno.enhance_speach import enhance
|
||||||
|
import os
|
||||||
|
from synthesizer.hparams import hparams
|
||||||
|
import soundfile as sf
|
||||||
|
|
||||||
# 默认使用wavernn
|
# 默认使用wavernn
|
||||||
vocoder = rnn_vocoder
|
vocoder = rnn_vocoder
|
||||||
@ -109,6 +114,13 @@ class Toolbox:
|
|||||||
self.ui.stop_button.clicked.connect(self.ui.stop)
|
self.ui.stop_button.clicked.connect(self.ui.stop)
|
||||||
self.ui.record_button.clicked.connect(self.record)
|
self.ui.record_button.clicked.connect(self.record)
|
||||||
|
|
||||||
|
#添加source_mfcc分析槽
|
||||||
|
func = lambda: self.ui.plot_mfcc(self.ui.selected_utterance.wav, Synthesizer.sample_rate)
|
||||||
|
self.ui.play_button.clicked.connect(func)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#Audio
|
#Audio
|
||||||
self.ui.setup_audio_devices(Synthesizer.sample_rate)
|
self.ui.setup_audio_devices(Synthesizer.sample_rate)
|
||||||
|
|
||||||
@ -119,6 +131,8 @@ class Toolbox:
|
|||||||
self.ui.export_wav_button.clicked.connect(func)
|
self.ui.export_wav_button.clicked.connect(func)
|
||||||
self.ui.waves_cb.currentIndexChanged.connect(self.set_current_wav)
|
self.ui.waves_cb.currentIndexChanged.connect(self.set_current_wav)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Generation
|
# Generation
|
||||||
func = lambda: self.synthesize() or self.vocode()
|
func = lambda: self.synthesize() or self.vocode()
|
||||||
self.ui.generate_button.clicked.connect(func)
|
self.ui.generate_button.clicked.connect(func)
|
||||||
@ -126,6 +140,9 @@ class Toolbox:
|
|||||||
self.ui.vocode_button.clicked.connect(self.vocode)
|
self.ui.vocode_button.clicked.connect(self.vocode)
|
||||||
self.ui.random_seed_checkbox.clicked.connect(self.update_seed_textbox)
|
self.ui.random_seed_checkbox.clicked.connect(self.update_seed_textbox)
|
||||||
|
|
||||||
|
# 添加result_mfcc分析槽,该槽要在语音合成之后
|
||||||
|
func = lambda: self.ui.plot_mfcc1(self.current_wav, Synthesizer.sample_rate)
|
||||||
|
self.ui.generate_button.clicked.connect(func)
|
||||||
# UMAP legend
|
# UMAP legend
|
||||||
self.ui.clear_button.clicked.connect(self.clear_utterances)
|
self.ui.clear_button.clicked.connect(self.clear_utterances)
|
||||||
|
|
||||||
@ -167,13 +184,18 @@ class Toolbox:
|
|||||||
|
|
||||||
# Get the wav from the disk. We take the wav with the vocoder/synthesizer format for
|
# Get the wav from the disk. We take the wav with the vocoder/synthesizer format for
|
||||||
# playback, so as to have a fair comparison with the generated audio
|
# playback, so as to have a fair comparison with the generated audio
|
||||||
wav = Synthesizer.load_preprocess_wav(fpath)
|
#wav = Synthesizer.load_preprocess_wav(fpath)
|
||||||
|
wav = enhance(fpath)
|
||||||
|
|
||||||
self.ui.log("Loaded %s" % name)
|
self.ui.log("Loaded %s" % name)
|
||||||
|
|
||||||
self.add_real_utterance(wav, name, speaker_name)
|
self.add_real_utterance(wav, name, speaker_name)
|
||||||
|
|
||||||
def record(self):
|
def record(self):
|
||||||
wav = self.ui.record_one(encoder.sampling_rate, 5)
|
wav = self.ui.record_one(encoder.sampling_rate, 5)
|
||||||
|
sf.write('output1.wav', wav, hparams.sample_rate) # 先将变量wav写为文件的形式
|
||||||
|
wav = enhance('output1.wav')
|
||||||
|
os.remove("./output1.wav")
|
||||||
if wav is None:
|
if wav is None:
|
||||||
return
|
return
|
||||||
self.ui.play(wav, encoder.sampling_rate)
|
self.ui.play(wav, encoder.sampling_rate)
|
||||||
@ -285,7 +307,10 @@ class Toolbox:
|
|||||||
|
|
||||||
# Trim excessive silences
|
# Trim excessive silences
|
||||||
if self.ui.trim_silences_checkbox.isChecked():
|
if self.ui.trim_silences_checkbox.isChecked():
|
||||||
wav = encoder.preprocess_wav(wav)
|
#wav = encoder.preprocess_wav(wav)
|
||||||
|
sf.write('output.wav', wav, hparams.sample_rate) #先将变量wav写为文件的形式
|
||||||
|
wav = enhance('output.wav')
|
||||||
|
os.remove("./output.wav")
|
||||||
|
|
||||||
# Play it
|
# Play it
|
||||||
wav = wav / np.abs(wav).max() * 0.97
|
wav = wav / np.abs(wav).max() * 0.97
|
||||||
@ -360,10 +385,13 @@ class Toolbox:
|
|||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
# Sekect vocoder based on model name
|
# Select vocoder based on model name
|
||||||
if model_fpath.name[0] == "g":
|
if model_fpath.name is not None and model_fpath.name.find("hifigan") > -1:
|
||||||
vocoder = gan_vocoder
|
vocoder = gan_vocoder
|
||||||
self.ui.log("set hifigan as vocoder")
|
self.ui.log("set hifigan as vocoder")
|
||||||
|
elif model_fpath.name is not None and model_fpath.name.find("fregan") > -1:
|
||||||
|
vocoder = fgan_vocoder
|
||||||
|
self.ui.log("set fregan as vocoder")
|
||||||
else:
|
else:
|
||||||
vocoder = rnn_vocoder
|
vocoder = rnn_vocoder
|
||||||
self.ui.log("set wavernn as vocoder")
|
self.ui.log("set wavernn as vocoder")
|
||||||
@ -377,3 +405,5 @@ class Toolbox:
|
|||||||
|
|
||||||
def update_seed_textbox(self):
|
def update_seed_textbox(self):
|
||||||
self.ui.update_seed_textbox()
|
self.ui.update_seed_textbox()
|
||||||
|
|
||||||
|
|
||||||
|
BIN
toolbox/assets/1.png
Normal file
BIN
toolbox/assets/1.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 26 KiB |
BIN
toolbox/assets/2.png
Normal file
BIN
toolbox/assets/2.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 29 KiB |
BIN
toolbox/assets/picture1.jpg
Normal file
BIN
toolbox/assets/picture1.jpg
Normal file
Binary file not shown.
After Width: | Height: | Size: 841 KiB |
BIN
toolbox/assets/按钮控件.png
Normal file
BIN
toolbox/assets/按钮控件.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 578 KiB |
313
toolbox/ui.py
313
toolbox/ui.py
@ -1,4 +1,7 @@
|
|||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
|
import numpy
|
||||||
|
from scipy.fftpack import dct
|
||||||
|
from PyQt5.QtGui import QPalette, QBrush, QPixmap
|
||||||
from matplotlib.backends.backend_qt5agg import FigureCanvasQTAgg as FigureCanvas
|
from matplotlib.backends.backend_qt5agg import FigureCanvasQTAgg as FigureCanvas
|
||||||
from matplotlib.figure import Figure
|
from matplotlib.figure import Figure
|
||||||
from PyQt5.QtCore import Qt, QStringListModel
|
from PyQt5.QtCore import Qt, QStringListModel
|
||||||
@ -16,9 +19,15 @@ from time import sleep
|
|||||||
import umap
|
import umap
|
||||||
import sys
|
import sys
|
||||||
from warnings import filterwarnings, warn
|
from warnings import filterwarnings, warn
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
filterwarnings("ignore")
|
filterwarnings("ignore")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
colormap = np.array([
|
colormap = np.array([
|
||||||
[0, 127, 70],
|
[0, 127, 70],
|
||||||
[255, 0, 0],
|
[255, 0, 0],
|
||||||
@ -37,7 +46,7 @@ colormap = np.array([
|
|||||||
], dtype=np.float) / 255
|
], dtype=np.float) / 255
|
||||||
|
|
||||||
default_text = \
|
default_text = \
|
||||||
"欢迎使用工具箱, 现已支持中文输入!"
|
"请输入需要克隆的语音文本!"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@ -50,6 +59,11 @@ class UI(QDialog):
|
|||||||
self.draw_spec(utterance.spec, which)
|
self.draw_spec(utterance.spec, which)
|
||||||
self.draw_embed(utterance.embed, utterance.name, which)
|
self.draw_embed(utterance.embed, utterance.name, which)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def draw_embed(self, embed, name, which):
|
def draw_embed(self, embed, name, which):
|
||||||
embed_ax, _ = self.current_ax if which == "current" else self.gen_ax
|
embed_ax, _ = self.current_ax if which == "current" else self.gen_ax
|
||||||
embed_ax.figure.suptitle("" if embed is None else name)
|
embed_ax.figure.suptitle("" if embed is None else name)
|
||||||
@ -96,7 +110,7 @@ class UI(QDialog):
|
|||||||
|
|
||||||
# Display a message if there aren't enough points
|
# Display a message if there aren't enough points
|
||||||
if len(utterances) < self.min_umap_points:
|
if len(utterances) < self.min_umap_points:
|
||||||
self.umap_ax.text(.5, .5, "Add %d more points to\ngenerate the projections" %
|
self.umap_ax.text(.5, .5, "umap:\nAdd %d more points to\ngenerate the projections" %
|
||||||
(self.min_umap_points - len(utterances)),
|
(self.min_umap_points - len(utterances)),
|
||||||
horizontalalignment='center', fontsize=15)
|
horizontalalignment='center', fontsize=15)
|
||||||
self.umap_ax.set_title("")
|
self.umap_ax.set_title("")
|
||||||
@ -227,6 +241,110 @@ class UI(QDialog):
|
|||||||
|
|
||||||
return wav.squeeze()
|
return wav.squeeze()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#添加source_mfcc分析函数
|
||||||
|
def plot_mfcc(self, wav, sample_rate):
    """Compute MFCCs of the source audio and show them in a modal dialog.

    The first 3.5 s of ``wav`` are pre-emphasised, framed, Hamming-windowed
    and passed through a 40-filter mel bank; cepstral coefficients 2-13 are
    kept, liftered and mean-normalised. The coefficients are drawn as curves
    and as a heat map, saved to ``fmcc_source.png`` in the working directory
    and displayed in a new ``QDialog``.

    Args:
        wav: 1-D waveform; ``None`` or an empty waveform is ignored.
        sample_rate: Sampling rate of ``wav`` in Hz.
    """
    # The button slot can fire before any audio is available.
    if wav is None or len(wav) == 0:
        return

    # Only the first 3.5 s are analysed.
    signal = numpy.asarray(wav)[0:int(3.5 * sample_rate)]

    # Pre-emphasis
    pre_emphasis = 0.97
    emphasized_signal = numpy.append(signal[0], signal[1:] - pre_emphasis * signal[:-1])

    # Framing
    frame_size = 0.025
    # NOTE(review): the stride is larger than the frame size, so frames do
    # not overlap and the samples between them are skipped - confirm that
    # this is intended.
    frame_stride = 0.1
    frame_length = int(round(frame_size * sample_rate))
    frame_step = int(round(frame_stride * sample_rate))
    signal_length = len(emphasized_signal)
    # At least one frame, so that the plots below are never empty.
    num_frames = max(1, int(numpy.ceil(float(numpy.abs(signal_length - frame_length)) / frame_step)))

    # Zero-pad so that every frame has frame_length samples.
    pad_signal_length = num_frames * frame_step + frame_length
    pad_signal = numpy.append(emphasized_signal, numpy.zeros(pad_signal_length - signal_length))

    # indices[i, j] is the sample index of element j of frame i.
    frame_starts = numpy.arange(num_frames) * frame_step
    indices = frame_starts[:, None] + numpy.arange(frame_length)[None, :]
    frames = pad_signal[indices.astype(numpy.int32, copy=False)]

    # Apply a Hamming window to every frame.
    frames = frames * numpy.hamming(frame_length)

    # Fourier transform and power spectrum
    NFFT = 512
    mag_frames = numpy.absolute(numpy.fft.rfft(frames, NFFT))  # Magnitude of the FFT
    pow_frames = (1.0 / NFFT) * mag_frames ** 2  # Power spectrum

    # Triangular mel filter bank
    nfilt = 40
    low_freq_mel = 0
    high_freq_mel = 2595 * numpy.log10(1 + (sample_rate / 2) / 700)  # Hz -> mel
    mel_points = numpy.linspace(low_freq_mel, high_freq_mel, nfilt + 2)  # Equally spaced in mel scale
    hz_points = 700 * (10 ** (mel_points / 2595) - 1)  # mel -> Hz
    fft_bins = numpy.floor((NFFT + 1) * hz_points / sample_rate)

    fbank = numpy.zeros((nfilt, int(numpy.floor(NFFT / 2 + 1))))
    for m in range(1, nfilt + 1):
        left = int(fft_bins[m - 1])
        center = int(fft_bins[m])
        right = int(fft_bins[m + 1])
        for k in range(left, center):  # rising edge
            fbank[m - 1, k] = (k - fft_bins[m - 1]) / (fft_bins[m] - fft_bins[m - 1])
        for k in range(center, right):  # falling edge
            fbank[m - 1, k] = (fft_bins[m + 1] - k) / (fft_bins[m + 1] - fft_bins[m])

    filter_banks = numpy.dot(pow_frames, fbank.T)
    filter_banks = numpy.where(filter_banks == 0, numpy.finfo(float).eps, filter_banks)  # Numerical stability
    filter_banks = 20 * numpy.log10(filter_banks)  # dB

    # Keep cepstral coefficients 2-13 and discard the rest.
    num_ceps = 12
    mfcc = dct(filter_banks, type=2, axis=1, norm='ortho')[:, 1: (num_ceps + 1)]
    ncoeff = mfcc.shape[1]

    # Sinusoidal liftering
    cep_lifter = 22
    lift = 1 + (cep_lifter / 2) * numpy.sin(numpy.pi * numpy.arange(ncoeff) / cep_lifter)
    mfcc *= lift

    # Mean normalisation (applied once; both plots use the normalised values).
    mfcc -= (numpy.mean(mfcc, axis=0) + 1e-8)

    # Draw a 1x2 figure: coefficient curves on the left, heat map on the right.
    fig = plt.figure(figsize=(16, 8))
    try:
        curves_ax = fig.add_subplot(121)
        curves_ax.plot(mfcc)

        heat_ax = fig.add_subplot(122)
        heat_ax.imshow(numpy.flipud(mfcc.T), cmap=plt.cm.jet, aspect=0.2,
                       extent=[0, mfcc.shape[0], 0, mfcc.shape[1]])  # Heat map
        # Save the figure as a PNG so that it can be shown in the dialog.
        fig.savefig("fmcc_source.png")
    finally:
        # Release the figure; otherwise every click keeps one more alive.
        plt.close(fig)

    dialog_fault = QDialog()
    dialog_fault.setWindowTitle("源音频MFCC特征图及MFCC平均归一化热图")  # Window title
    pic = QPixmap("fmcc_source.png")
    label_pic = QLabel("show", dialog_fault)
    label_pic.setPixmap(pic)
    label_pic.setGeometry(0, 0, 1500, 800)
    dialog_fault.exec_()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def current_dataset_name(self):
|
def current_dataset_name(self):
|
||||||
return self.dataset_box.currentText()
|
return self.dataset_box.currentText()
|
||||||
@ -272,7 +390,7 @@ class UI(QDialog):
|
|||||||
datasets = [d.relative_to(datasets_root) for d in datasets if d.exists()]
|
datasets = [d.relative_to(datasets_root) for d in datasets if d.exists()]
|
||||||
self.browser_load_button.setDisabled(len(datasets) == 0)
|
self.browser_load_button.setDisabled(len(datasets) == 0)
|
||||||
if datasets_root is None or len(datasets) == 0:
|
if datasets_root is None or len(datasets) == 0:
|
||||||
msg = "Warning: you d" + ("id not pass a root directory for datasets as argument" \
|
msg = "Tip: Please " + (" select the voice to be cloned" \
|
||||||
if datasets_root is None else "o not have any of the recognized datasets" \
|
if datasets_root is None else "o not have any of the recognized datasets" \
|
||||||
" in %s" % datasets_root)
|
" in %s" % datasets_root)
|
||||||
self.log(msg)
|
self.log(msg)
|
||||||
@ -417,16 +535,125 @@ class UI(QDialog):
|
|||||||
self.export_wav_button.setDisabled(True)
|
self.export_wav_button.setDisabled(True)
|
||||||
[self.log("") for _ in range(self.max_log_lines)]
|
[self.log("") for _ in range(self.max_log_lines)]
|
||||||
|
|
||||||
|
|
||||||
|
#添加result_mfcc分析函数
|
||||||
|
def plot_mfcc1(self, wav, sample_rate):
    """Compute MFCCs of the synthesised audio and show them in a modal dialog.

    The first 3.5 s of ``wav`` are pre-emphasised, framed, Hamming-windowed
    and passed through a 40-filter mel bank; cepstral coefficients 2-13 are
    kept, liftered and mean-normalised. The coefficients are drawn as curves
    and as a heat map, saved to ``fmcc_result.png`` in the working directory
    and displayed in a new ``QDialog``.

    Args:
        wav: 1-D waveform; ``None`` or an empty waveform is ignored.
        sample_rate: Sampling rate of ``wav`` in Hz.
    """
    # The button slot can fire when no audio has been generated.
    if wav is None or len(wav) == 0:
        return

    # Only the first 3.5 s are analysed.
    signal = numpy.asarray(wav)[0:int(3.5 * sample_rate)]

    # Pre-emphasis
    pre_emphasis = 0.97
    emphasized_signal = numpy.append(signal[0], signal[1:] - pre_emphasis * signal[:-1])

    # Framing
    frame_size = 0.025
    # NOTE(review): the stride is larger than the frame size, so frames do
    # not overlap and the samples between them are skipped - confirm that
    # this is intended.
    frame_stride = 0.1
    frame_length = int(round(frame_size * sample_rate))
    frame_step = int(round(frame_stride * sample_rate))
    signal_length = len(emphasized_signal)
    # At least one frame, so that the plots below are never empty.
    num_frames = max(1, int(numpy.ceil(float(numpy.abs(signal_length - frame_length)) / frame_step)))

    # Zero-pad so that every frame has frame_length samples.
    pad_signal_length = num_frames * frame_step + frame_length
    pad_signal = numpy.append(emphasized_signal, numpy.zeros(pad_signal_length - signal_length))

    # indices[i, j] is the sample index of element j of frame i.
    frame_starts = numpy.arange(num_frames) * frame_step
    indices = frame_starts[:, None] + numpy.arange(frame_length)[None, :]
    frames = pad_signal[indices.astype(numpy.int32, copy=False)]

    # Apply a Hamming window to every frame.
    frames = frames * numpy.hamming(frame_length)

    # Fourier transform and power spectrum
    NFFT = 512
    mag_frames = numpy.absolute(numpy.fft.rfft(frames, NFFT))  # Magnitude of the FFT
    pow_frames = (1.0 / NFFT) * mag_frames ** 2  # Power spectrum

    # Triangular mel filter bank
    nfilt = 40
    low_freq_mel = 0
    high_freq_mel = 2595 * numpy.log10(1 + (sample_rate / 2) / 700)  # Hz -> mel
    mel_points = numpy.linspace(low_freq_mel, high_freq_mel, nfilt + 2)  # Equally spaced in mel scale
    hz_points = 700 * (10 ** (mel_points / 2595) - 1)  # mel -> Hz
    fft_bins = numpy.floor((NFFT + 1) * hz_points / sample_rate)

    fbank = numpy.zeros((nfilt, int(numpy.floor(NFFT / 2 + 1))))
    for m in range(1, nfilt + 1):
        left = int(fft_bins[m - 1])
        center = int(fft_bins[m])
        right = int(fft_bins[m + 1])
        for k in range(left, center):  # rising edge
            fbank[m - 1, k] = (k - fft_bins[m - 1]) / (fft_bins[m] - fft_bins[m - 1])
        for k in range(center, right):  # falling edge
            fbank[m - 1, k] = (fft_bins[m + 1] - k) / (fft_bins[m + 1] - fft_bins[m])

    filter_banks = numpy.dot(pow_frames, fbank.T)
    filter_banks = numpy.where(filter_banks == 0, numpy.finfo(float).eps, filter_banks)  # Numerical stability
    filter_banks = 20 * numpy.log10(filter_banks)  # dB

    # Keep cepstral coefficients 2-13 and discard the rest.
    num_ceps = 12
    mfcc = dct(filter_banks, type=2, axis=1, norm='ortho')[:, 1: (num_ceps + 1)]
    ncoeff = mfcc.shape[1]

    # Sinusoidal liftering
    cep_lifter = 22
    lift = 1 + (cep_lifter / 2) * numpy.sin(numpy.pi * numpy.arange(ncoeff) / cep_lifter)
    mfcc *= lift

    # Mean normalisation (applied once; both plots use the normalised values).
    mfcc -= (numpy.mean(mfcc, axis=0) + 1e-8)

    # Draw a 1x2 figure: coefficient curves on the left, heat map on the right.
    fig = plt.figure(figsize=(16, 8))
    try:
        curves_ax = fig.add_subplot(121)
        curves_ax.plot(mfcc)

        heat_ax = fig.add_subplot(122)
        heat_ax.imshow(numpy.flipud(mfcc.T), cmap=plt.cm.jet, aspect=0.2,
                       extent=[0, mfcc.shape[0], 0, mfcc.shape[1]])  # Heat map
        # Save the figure as a PNG so that it can be shown in the dialog.
        fig.savefig("fmcc_result.png")
    finally:
        # Release the figure; otherwise every click keeps one more alive.
        plt.close(fig)

    dialog_fault1 = QDialog()
    dialog_fault1.setWindowTitle("合成音频MFCC特征图及MFCC平均归一化热图")  # Window title
    pic = QPixmap("fmcc_result.png")
    label_pic = QLabel("show", dialog_fault1)
    label_pic.setPixmap(pic)
    label_pic.setGeometry(0, 0, 1500, 800)
    dialog_fault1.exec_()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
## Initialize the application
|
## Initialize the application
|
||||||
self.app = QApplication(sys.argv)
|
self.app = QApplication(sys.argv)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
super().__init__(None)
|
super().__init__(None)
|
||||||
self.setWindowTitle("MockingBird GUI")
|
self.setWindowTitle("中文语音克隆系统")
|
||||||
self.setWindowIcon(QtGui.QIcon('toolbox\\assets\\mb.png'))
|
self.setWindowIcon(QtGui.QIcon('toolbox\\assets\\mb.png'))
|
||||||
self.setWindowFlag(Qt.WindowMinimizeButtonHint, True)
|
self.setWindowFlag(Qt.WindowMinimizeButtonHint, True)
|
||||||
self.setWindowFlag(Qt.WindowMaximizeButtonHint, True)
|
self.setWindowFlag(Qt.WindowMaximizeButtonHint, True)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
## Main layouts
|
## Main layouts
|
||||||
# Root
|
# Root
|
||||||
root_layout = QGridLayout()
|
root_layout = QGridLayout()
|
||||||
@ -459,6 +686,7 @@ class UI(QDialog):
|
|||||||
self.projections_layout.addWidget(FigureCanvas(fig))
|
self.projections_layout.addWidget(FigureCanvas(fig))
|
||||||
self.umap_hot = False
|
self.umap_hot = False
|
||||||
self.clear_button = QPushButton("Clear")
|
self.clear_button = QPushButton("Clear")
|
||||||
|
self.clear_button.setStyleSheet('QPushButton{border-image:url(toolbox/assets/2.png)}')
|
||||||
self.projections_layout.addWidget(self.clear_button)
|
self.projections_layout.addWidget(self.clear_button)
|
||||||
|
|
||||||
|
|
||||||
@ -472,33 +700,46 @@ class UI(QDialog):
|
|||||||
browser_layout.addWidget(source_groupbox, i, 0, 1, 4)
|
browser_layout.addWidget(source_groupbox, i, 0, 1, 4)
|
||||||
|
|
||||||
self.dataset_box = QComboBox()
|
self.dataset_box = QComboBox()
|
||||||
source_layout.addWidget(QLabel("Dataset(数据集):"), i, 0)
|
# source_layout.addWidget(QLabel("Dataset(数据集):"), i, 0) #隐藏标签文字
|
||||||
source_layout.addWidget(self.dataset_box, i, 1)
|
source_layout.addWidget(self.dataset_box, i, 1)
|
||||||
self.random_dataset_button = QPushButton("Random")
|
self.random_dataset_button = QPushButton("Random")
|
||||||
source_layout.addWidget(self.random_dataset_button, i, 2)
|
source_layout.addWidget(self.random_dataset_button, i, 2)
|
||||||
|
|
||||||
|
self.random_dataset_button.hide() #隐藏按钮
|
||||||
|
self.dataset_box.hide() #隐藏选项条
|
||||||
|
|
||||||
i += 1
|
i += 1
|
||||||
self.speaker_box = QComboBox()
|
self.speaker_box = QComboBox()
|
||||||
source_layout.addWidget(QLabel("Speaker(说话者)"), i, 0)
|
# source_layout.addWidget(QLabel("Speaker(说话者)"), i, 0)
|
||||||
source_layout.addWidget(self.speaker_box, i, 1)
|
source_layout.addWidget(self.speaker_box, i, 1)
|
||||||
self.random_speaker_button = QPushButton("Random")
|
self.random_speaker_button = QPushButton("Random")
|
||||||
source_layout.addWidget(self.random_speaker_button, i, 2)
|
source_layout.addWidget(self.random_speaker_button, i, 2)
|
||||||
|
|
||||||
|
self.random_speaker_button.hide()
|
||||||
|
self.speaker_box.hide()
|
||||||
|
|
||||||
i += 1
|
i += 1
|
||||||
self.utterance_box = QComboBox()
|
self.utterance_box = QComboBox()
|
||||||
source_layout.addWidget(QLabel("Utterance(音频):"), i, 0)
|
# source_layout.addWidget(QLabel("Utterance(音频):"), i, 0)
|
||||||
source_layout.addWidget(self.utterance_box, i, 1)
|
source_layout.addWidget(self.utterance_box, i, 1)
|
||||||
self.random_utterance_button = QPushButton("Random")
|
self.random_utterance_button = QPushButton("Random")
|
||||||
source_layout.addWidget(self.random_utterance_button, i, 2)
|
source_layout.addWidget(self.random_utterance_button, i, 2)
|
||||||
|
|
||||||
|
self.random_utterance_button.hide()
|
||||||
|
self.utterance_box.hide()
|
||||||
|
|
||||||
i += 1
|
i += 1
|
||||||
source_layout.addWidget(QLabel("<b>Use(使用):</b>"), i, 0)
|
source_layout.addWidget(QLabel("<b>Use(使用):</b>"), i, 0)
|
||||||
self.browser_load_button = QPushButton("Load Above(加载上面)")
|
self.browser_load_button = QPushButton("")
|
||||||
source_layout.addWidget(self.browser_load_button, i, 1, 1, 2)
|
source_layout.addWidget(self.browser_load_button, i, 1, 1, 2)
|
||||||
self.auto_next_checkbox = QCheckBox("Auto select next")
|
self.auto_next_checkbox = QCheckBox("Auto select next")
|
||||||
self.auto_next_checkbox.setChecked(True)
|
self.auto_next_checkbox.setChecked(True)
|
||||||
source_layout.addWidget(self.auto_next_checkbox, i+1, 1)
|
source_layout.addWidget(self.auto_next_checkbox, i + 1, 1)
|
||||||
self.browser_browse_button = QPushButton("Browse(打开本地)")
|
self.browser_browse_button = QPushButton("Browse(打开本地)")
|
||||||
|
self.browser_browse_button.setStyleSheet('QPushButton{border-image:url(toolbox/assets/1.png)}')
|
||||||
source_layout.addWidget(self.browser_browse_button, i, 3)
|
source_layout.addWidget(self.browser_browse_button, i, 3)
|
||||||
self.record_button = QPushButton("Record(录音)")
|
self.record_button = QPushButton("Record(录音)")
|
||||||
|
self.record_button.setStyleSheet('QPushButton{border-image:url(toolbox/assets/1.png)}')
|
||||||
source_layout.addWidget(self.record_button, i+1, 3)
|
source_layout.addWidget(self.record_button, i+1, 3)
|
||||||
|
|
||||||
i += 2
|
i += 2
|
||||||
@ -507,8 +748,10 @@ class UI(QDialog):
|
|||||||
self.utterance_history = QComboBox()
|
self.utterance_history = QComboBox()
|
||||||
browser_layout.addWidget(self.utterance_history, i, 1)
|
browser_layout.addWidget(self.utterance_history, i, 1)
|
||||||
self.play_button = QPushButton("Play(播放)")
|
self.play_button = QPushButton("Play(播放)")
|
||||||
|
self.play_button.setStyleSheet('QPushButton{border-image:url(toolbox/assets/1.png)}')
|
||||||
browser_layout.addWidget(self.play_button, i, 2)
|
browser_layout.addWidget(self.play_button, i, 2)
|
||||||
self.stop_button = QPushButton("Stop(暂停)")
|
self.stop_button = QPushButton("Stop(暂停)")
|
||||||
|
self.stop_button.setStyleSheet('QPushButton{border-image:url(toolbox/assets/1.png)}')
|
||||||
browser_layout.addWidget(self.stop_button, i, 3)
|
browser_layout.addWidget(self.stop_button, i, 3)
|
||||||
|
|
||||||
i += 1
|
i += 1
|
||||||
@ -537,10 +780,12 @@ class UI(QDialog):
|
|||||||
self.waves_cb.setModel(self.waves_cb_model)
|
self.waves_cb.setModel(self.waves_cb_model)
|
||||||
self.waves_cb.setToolTip("Select one of the last generated waves in this section for replaying or exporting")
|
self.waves_cb.setToolTip("Select one of the last generated waves in this section for replaying or exporting")
|
||||||
output_layout.addWidget(self.waves_cb, i, 1)
|
output_layout.addWidget(self.waves_cb, i, 1)
|
||||||
self.replay_wav_button = QPushButton("Replay")
|
self.replay_wav_button = QPushButton("Replay(重播)")
|
||||||
|
self.replay_wav_button.setStyleSheet('QPushButton{border-image:url(toolbox/assets/1.png)}')
|
||||||
self.replay_wav_button.setToolTip("Replay last generated vocoder")
|
self.replay_wav_button.setToolTip("Replay last generated vocoder")
|
||||||
output_layout.addWidget(self.replay_wav_button, i, 2)
|
output_layout.addWidget(self.replay_wav_button, i, 2)
|
||||||
self.export_wav_button = QPushButton("Export")
|
self.export_wav_button = QPushButton("Export(导出)")
|
||||||
|
self.export_wav_button.setStyleSheet('QPushButton{border-image:url(toolbox/assets/1.png)}')
|
||||||
self.export_wav_button.setToolTip("Save last generated vocoder audio in filesystem as a wav file")
|
self.export_wav_button.setToolTip("Save last generated vocoder audio in filesystem as a wav file")
|
||||||
output_layout.addWidget(self.export_wav_button, i, 3)
|
output_layout.addWidget(self.export_wav_button, i, 3)
|
||||||
self.audio_out_devices_cb=QComboBox()
|
self.audio_out_devices_cb=QComboBox()
|
||||||
@ -551,14 +796,27 @@ class UI(QDialog):
|
|||||||
## Embed & spectrograms
|
## Embed & spectrograms
|
||||||
vis_layout.addStretch()
|
vis_layout.addStretch()
|
||||||
|
|
||||||
|
#添加标签控件,设置标签文字格式并且居中
|
||||||
|
label1 = QLabel("source audio")
|
||||||
|
label1.setStyleSheet("QLabel{color:red;font-size:20px;font-weight:bold;font-family:Roman times;}")
|
||||||
|
label1.setAlignment(Qt.AlignCenter)
|
||||||
|
vis_layout.addWidget(label1) #addwidget:添加控件
|
||||||
|
|
||||||
gridspec_kw = {"width_ratios": [1, 4]}
|
gridspec_kw = {"width_ratios": [1, 4]}
|
||||||
fig, self.current_ax = plt.subplots(1, 2, figsize=(10, 2.25), facecolor="#F0F0F0",
|
fig, self.current_ax = plt.subplots(1, 2, figsize=(10, 2.25), facecolor="#F0F0F0",
|
||||||
gridspec_kw=gridspec_kw)
|
gridspec_kw=gridspec_kw)
|
||||||
|
#self.current_ax[1].set_title("source audio", fontsize=50, color='red', fontstyle='italic', fontweight="heavy")
|
||||||
fig.subplots_adjust(left=0, bottom=0.1, right=1, top=0.8)
|
fig.subplots_adjust(left=0, bottom=0.1, right=1, top=0.8)
|
||||||
vis_layout.addWidget(FigureCanvas(fig))
|
vis_layout.addWidget(FigureCanvas(fig))
|
||||||
|
|
||||||
|
label2 = QLabel("target audio")
|
||||||
|
label2.setStyleSheet("QLabel{color:red;font-size:20px;font-weight:bold;font-family:Roman times;}")
|
||||||
|
label2.setAlignment(Qt.AlignCenter)
|
||||||
|
vis_layout.addWidget(label2)
|
||||||
|
|
||||||
fig, self.gen_ax = plt.subplots(1, 2, figsize=(10, 2.25), facecolor="#F0F0F0",
|
fig, self.gen_ax = plt.subplots(1, 2, figsize=(10, 2.25), facecolor="#F0F0F0",
|
||||||
gridspec_kw=gridspec_kw)
|
gridspec_kw=gridspec_kw)
|
||||||
|
#self.gen_ax[1].set_title("target audio", fontsize=50, color='red', fontstyle='italic', fontweight="heavy")
|
||||||
fig.subplots_adjust(left=0, bottom=0.1, right=1, top=0.8)
|
fig.subplots_adjust(left=0, bottom=0.1, right=1, top=0.8)
|
||||||
vis_layout.addWidget(FigureCanvas(fig))
|
vis_layout.addWidget(FigureCanvas(fig))
|
||||||
|
|
||||||
@ -567,28 +825,36 @@ class UI(QDialog):
|
|||||||
for side in ["top", "right", "bottom", "left"]:
|
for side in ["top", "right", "bottom", "left"]:
|
||||||
ax.spines[side].set_visible(False)
|
ax.spines[side].set_visible(False)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
## Generation
|
## Generation
|
||||||
self.text_prompt = QPlainTextEdit(default_text)
|
self.text_prompt = QPlainTextEdit(default_text)
|
||||||
gen_layout.addWidget(self.text_prompt, stretch=1)
|
gen_layout.addWidget(self.text_prompt, stretch=1)
|
||||||
|
|
||||||
self.generate_button = QPushButton("Synthesize and vocode")
|
self.generate_button = QPushButton("Synthesize and vocode(合成并播放)")
|
||||||
|
self.generate_button.setStyleSheet('QPushButton{border-image:url(toolbox/assets/1.png)}')
|
||||||
gen_layout.addWidget(self.generate_button)
|
gen_layout.addWidget(self.generate_button)
|
||||||
|
|
||||||
layout = QHBoxLayout()
|
layout = QHBoxLayout()
|
||||||
self.synthesize_button = QPushButton("Synthesize only")
|
self.synthesize_button = QPushButton("Synthesize only(仅合成)")
|
||||||
|
self.synthesize_button.setStyleSheet('QPushButton{border-image:url(toolbox/assets/1.png)}')
|
||||||
layout.addWidget(self.synthesize_button)
|
layout.addWidget(self.synthesize_button)
|
||||||
self.vocode_button = QPushButton("Vocode only")
|
self.vocode_button = QPushButton("Vocode only(仅播放)")
|
||||||
|
self.vocode_button.setStyleSheet('QPushButton{border-image:url(toolbox/assets/1.png)}')
|
||||||
|
|
||||||
layout.addWidget(self.vocode_button)
|
layout.addWidget(self.vocode_button)
|
||||||
gen_layout.addLayout(layout)
|
gen_layout.addLayout(layout)
|
||||||
|
|
||||||
layout_seed = QGridLayout()
|
layout_seed = QGridLayout()
|
||||||
self.random_seed_checkbox = QCheckBox("Random seed:")
|
self.random_seed_checkbox = QCheckBox("Random seed(随机数种子):")
|
||||||
self.random_seed_checkbox.setToolTip("When checked, makes the synthesizer and vocoder deterministic.")
|
self.random_seed_checkbox.setToolTip("When checked, makes the synthesizer and vocoder deterministic.")
|
||||||
layout_seed.addWidget(self.random_seed_checkbox, 0, 0)
|
layout_seed.addWidget(self.random_seed_checkbox, 0, 0)
|
||||||
self.seed_textbox = QLineEdit()
|
self.seed_textbox = QLineEdit()
|
||||||
self.seed_textbox.setMaximumWidth(80)
|
self.seed_textbox.setMaximumWidth(80)
|
||||||
layout_seed.addWidget(self.seed_textbox, 0, 1)
|
layout_seed.addWidget(self.seed_textbox, 0, 1)
|
||||||
self.trim_silences_checkbox = QCheckBox("Enhance vocoder output")
|
self.trim_silences_checkbox = QCheckBox("Enhance vocoder output(语音增强)")
|
||||||
self.trim_silences_checkbox.setToolTip("When checked, trims excess silence in vocoder output."
|
self.trim_silences_checkbox.setToolTip("When checked, trims excess silence in vocoder output."
|
||||||
" This feature requires `webrtcvad` to be installed.")
|
" This feature requires `webrtcvad` to be installed.")
|
||||||
layout_seed.addWidget(self.trim_silences_checkbox, 0, 2, 1, 2)
|
layout_seed.addWidget(self.trim_silences_checkbox, 0, 2, 1, 2)
|
||||||
@ -599,7 +865,7 @@ class UI(QDialog):
|
|||||||
self.style_slider.setRange(-1, 9)
|
self.style_slider.setRange(-1, 9)
|
||||||
self.style_value_label = QLabel("-1")
|
self.style_value_label = QLabel("-1")
|
||||||
self.style_slider.setValue(-1)
|
self.style_slider.setValue(-1)
|
||||||
layout_seed.addWidget(QLabel("Style:"), 1, 0)
|
layout_seed.addWidget(QLabel("Style(风格):"), 1, 0)
|
||||||
|
|
||||||
self.style_slider.valueChanged.connect(lambda s: self.style_value_label.setNum(s))
|
self.style_slider.valueChanged.connect(lambda s: self.style_value_label.setNum(s))
|
||||||
layout_seed.addWidget(self.style_value_label, 1, 1)
|
layout_seed.addWidget(self.style_value_label, 1, 1)
|
||||||
@ -610,7 +876,7 @@ class UI(QDialog):
|
|||||||
self.token_slider.setFocusPolicy(Qt.NoFocus)
|
self.token_slider.setFocusPolicy(Qt.NoFocus)
|
||||||
self.token_slider.setSingleStep(1)
|
self.token_slider.setSingleStep(1)
|
||||||
self.token_slider.setRange(3, 9)
|
self.token_slider.setRange(3, 9)
|
||||||
self.token_value_label = QLabel("5")
|
self.token_value_label = QLabel("4")
|
||||||
self.token_slider.setValue(4)
|
self.token_slider.setValue(4)
|
||||||
layout_seed.addWidget(QLabel("Accuracy(精度):"), 2, 0)
|
layout_seed.addWidget(QLabel("Accuracy(精度):"), 2, 0)
|
||||||
|
|
||||||
@ -651,5 +917,16 @@ class UI(QDialog):
|
|||||||
self.reset_interface()
|
self.reset_interface()
|
||||||
self.show()
|
self.show()
|
||||||
|
|
||||||
|
##set the picture of background
|
||||||
|
palette1 = QPalette()
|
||||||
|
# palette1.setColor(self.backgroundRole(), QColor(192,253,123)) # 设置背景颜色
|
||||||
|
palette1.setBrush(self.backgroundRole(), QBrush(QPixmap('toolbox\\assets\\picture1.jpg'))) # 设置背景图片
|
||||||
|
self.setPalette(palette1)
|
||||||
|
self.setAutoFillBackground(True)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def start(self):
|
def start(self):
|
||||||
self.app.exec_()
|
self.app.exec_()
|
||||||
|
129
vocoder/fregan/.gitignore
vendored
Normal file
129
vocoder/fregan/.gitignore
vendored
Normal file
@ -0,0 +1,129 @@
|
|||||||
|
# Byte-compiled / optimized / DLL files
|
||||||
|
__pycache__/
|
||||||
|
*.py[cod]
|
||||||
|
*$py.class
|
||||||
|
|
||||||
|
# C extensions
|
||||||
|
*.so
|
||||||
|
|
||||||
|
# Distribution / packaging
|
||||||
|
.Python
|
||||||
|
build/
|
||||||
|
develop-eggs/
|
||||||
|
dist/
|
||||||
|
downloads/
|
||||||
|
eggs/
|
||||||
|
.eggs/
|
||||||
|
lib/
|
||||||
|
lib64/
|
||||||
|
parts/
|
||||||
|
sdist/
|
||||||
|
var/
|
||||||
|
wheels/
|
||||||
|
pip-wheel-metadata/
|
||||||
|
share/python-wheels/
|
||||||
|
*.egg-info/
|
||||||
|
.installed.cfg
|
||||||
|
*.egg
|
||||||
|
MANIFEST
|
||||||
|
|
||||||
|
# PyInstaller
|
||||||
|
# Usually these files are written by a python script from a template
|
||||||
|
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||||
|
*.manifest
|
||||||
|
*.spec
|
||||||
|
|
||||||
|
# Installer logs
|
||||||
|
pip-log.txt
|
||||||
|
pip-delete-this-directory.txt
|
||||||
|
|
||||||
|
# Unit test / coverage reports
|
||||||
|
htmlcov/
|
||||||
|
.tox/
|
||||||
|
.nox/
|
||||||
|
.coverage
|
||||||
|
.coverage.*
|
||||||
|
.cache
|
||||||
|
nosetests.xml
|
||||||
|
coverage.xml
|
||||||
|
*.cover
|
||||||
|
*.py,cover
|
||||||
|
.hypothesis/
|
||||||
|
.pytest_cache/
|
||||||
|
|
||||||
|
# Translations
|
||||||
|
*.mo
|
||||||
|
*.pot
|
||||||
|
|
||||||
|
# Django stuff:
|
||||||
|
*.log
|
||||||
|
local_settings.py
|
||||||
|
db.sqlite3
|
||||||
|
db.sqlite3-journal
|
||||||
|
|
||||||
|
# Flask stuff:
|
||||||
|
instance/
|
||||||
|
.webassets-cache
|
||||||
|
|
||||||
|
# Scrapy stuff:
|
||||||
|
.scrapy
|
||||||
|
|
||||||
|
# Sphinx documentation
|
||||||
|
docs/_build/
|
||||||
|
|
||||||
|
# PyBuilder
|
||||||
|
target/
|
||||||
|
|
||||||
|
# Jupyter Notebook
|
||||||
|
.ipynb_checkpoints
|
||||||
|
|
||||||
|
# IPython
|
||||||
|
profile_default/
|
||||||
|
ipython_config.py
|
||||||
|
|
||||||
|
# pyenv
|
||||||
|
.python-version
|
||||||
|
|
||||||
|
# pipenv
|
||||||
|
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||||
|
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||||
|
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||||
|
# install all needed dependencies.
|
||||||
|
#Pipfile.lock
|
||||||
|
|
||||||
|
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
|
||||||
|
__pypackages__/
|
||||||
|
|
||||||
|
# Celery stuff
|
||||||
|
celerybeat-schedule
|
||||||
|
celerybeat.pid
|
||||||
|
|
||||||
|
# SageMath parsed files
|
||||||
|
*.sage.py
|
||||||
|
|
||||||
|
# Environments
|
||||||
|
.env
|
||||||
|
.venv
|
||||||
|
env/
|
||||||
|
venv/
|
||||||
|
ENV/
|
||||||
|
env.bak/
|
||||||
|
venv.bak/
|
||||||
|
|
||||||
|
# Spyder project settings
|
||||||
|
.spyderproject
|
||||||
|
.spyproject
|
||||||
|
|
||||||
|
# Rope project settings
|
||||||
|
.ropeproject
|
||||||
|
|
||||||
|
# mkdocs documentation
|
||||||
|
/site
|
||||||
|
|
||||||
|
# mypy
|
||||||
|
.mypy_cache/
|
||||||
|
.dmypy.json
|
||||||
|
dmypy.json
|
||||||
|
|
||||||
|
# Pyre type checker
|
||||||
|
.pyre/
|
21
vocoder/fregan/LICENSE
Normal file
21
vocoder/fregan/LICENSE
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
MIT License
|
||||||
|
|
||||||
|
Copyright (c) 2021 Rishikesh (ऋषिकेश)
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in all
|
||||||
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
SOFTWARE.
|
12950
vocoder/fregan/LJSpeech-1.1/training.txt
Normal file
12950
vocoder/fregan/LJSpeech-1.1/training.txt
Normal file
File diff suppressed because it is too large
Load Diff
150
vocoder/fregan/LJSpeech-1.1/validation.txt
Normal file
150
vocoder/fregan/LJSpeech-1.1/validation.txt
Normal file
@ -0,0 +1,150 @@
|
|||||||
|
LJ050-0269|The essential terms of such memoranda might well be embodied in an Executive order.|The essential terms of such memoranda might well be embodied in an Executive order.
|
||||||
|
LJ050-0270|This Commission can recommend no procedures for the future protection of our Presidents which will guarantee security.|This Commission can recommend no procedures for the future protection of our Presidents which will guarantee security.
|
||||||
|
LJ050-0271|The demands on the President in the execution of His responsibilities in today's world are so varied and complex|The demands on the President in the execution of His responsibilities in today's world are so varied and complex
|
||||||
|
LJ050-0272|and the traditions of the office in a democracy such as ours are so deep-seated as to preclude absolute security.|and the traditions of the office in a democracy such as ours are so deep-seated as to preclude absolute security.
|
||||||
|
LJ050-0273|The Commission has, however, from its examination of the facts of President Kennedy's assassination|The Commission has, however, from its examination of the facts of President Kennedy's assassination
|
||||||
|
LJ050-0274|made certain recommendations which it believes would, if adopted,|made certain recommendations which it believes would, if adopted,
|
||||||
|
LJ050-0275|materially improve upon the procedures in effect at the time of President Kennedy's assassination and result in a substantial lessening of the danger.|materially improve upon the procedures in effect at the time of President Kennedy's assassination and result in a substantial lessening of the danger.
|
||||||
|
LJ050-0276|As has been pointed out, the Commission has not resolved all the proposals which could be made. The Commission nevertheless is confident that,|As has been pointed out, the Commission has not resolved all the proposals which could be made. The Commission nevertheless is confident that,
|
||||||
|
LJ050-0277|with the active cooperation of the responsible agencies and with the understanding of the people of the United States in their demands upon their President,|with the active cooperation of the responsible agencies and with the understanding of the people of the United States in their demands upon their President,
|
||||||
|
LJ050-0278|the recommendations we have here suggested would greatly advance the security of the office without any impairment of our fundamental liberties.|the recommendations we have here suggested would greatly advance the security of the office without any impairment of our fundamental liberties.
|
||||||
|
LJ001-0028|but by printers in Strasburg, Basle, Paris, Lubeck, and other cities.|but by printers in Strasburg, Basle, Paris, Lubeck, and other cities.
|
||||||
|
LJ001-0068|The characteristic Dutch type, as represented by the excellent printer Gerard Leew, is very pronounced and uncompromising Gothic.|The characteristic Dutch type, as represented by the excellent printer Gerard Leew, is very pronounced and uncompromising Gothic.
|
||||||
|
LJ002-0149|The latter indeed hung like millstones round the neck of the unhappy insolvent wretches who found themselves in limbo.|The latter indeed hung like millstones round the neck of the unhappy insolvent wretches who found themselves in limbo.
|
||||||
|
LJ002-0157|and Susannah Evans, in October the same year, for 2 shillings, with costs of 6 shillings, 8 pence.|and Susannah Evans, in October the same year, for two shillings, with costs of six shillings, eight pence.
|
||||||
|
LJ002-0167|quotes a case which came within his own knowledge of a boy sent to prison for non-payment of one penny.|quotes a case which came within his own knowledge of a boy sent to prison for non-payment of one penny.
|
||||||
|
LJ003-0042|The completion of this very necessary building was, however, much delayed for want of funds,|The completion of this very necessary building was, however, much delayed for want of funds,
|
||||||
|
LJ003-0307|but as yet no suggestion was made to provide prison uniform.|but as yet no suggestion was made to provide prison uniform.
|
||||||
|
LJ004-0169|On the dirty bedstead lay a wretched being in the throes of severe illness.|On the dirty bedstead lay a wretched being in the throes of severe illness.
|
||||||
|
LJ004-0233|Under the new rule visitors were not allowed to pass into the interior of the prison, but were detained between the grating.|Under the new rule visitors were not allowed to pass into the interior of the prison, but were detained between the grating.
|
||||||
|
LJ005-0101|whence it deduced the practice and condition of every prison that replied.|whence it deduced the practice and condition of every prison that replied.
|
||||||
|
LJ005-0108|the prisoners, without firing, bedding, or sufficient food, spent their days "in surveying their grotesque prison,|the prisoners, without firing, bedding, or sufficient food, spent their days "in surveying their grotesque prison,
|
||||||
|
LJ005-0202|An examination of this report shows how even the most insignificant township had its jail.|An examination of this report shows how even the most insignificant township had its jail.
|
||||||
|
LJ005-0234|The visits of friends was once more unreservedly allowed, and these incomers freely brought in extra provisions and beer.|The visits of friends was once more unreservedly allowed, and these incomers freely brought in extra provisions and beer.
|
||||||
|
LJ005-0248|and stated that in his opinion Newgate, as the common jail of Middlesex, was wholly inadequate to the proper confinement of its prisoners.|and stated that in his opinion Newgate, as the common jail of Middlesex, was wholly inadequate to the proper confinement of its prisoners.
|
||||||
|
LJ006-0001|The Chronicles of Newgate, Volume 2. By Arthur Griffiths. Section 9: The first report of the inspector of prisons.|The Chronicles of Newgate, Volume two. By Arthur Griffiths. Section nine: The first report of the inspector of prisons.
|
||||||
|
LJ006-0018|One was Mr. William Crawford, the other the Rev. Whitworth Russell.|One was Mr. William Crawford, the other the Rev. Whitworth Russell.
|
||||||
|
LJ006-0034|They attended early and late; they mustered the prisoners, examined into their condition,|They attended early and late; they mustered the prisoners, examined into their condition,
|
||||||
|
LJ006-0078|A new prisoner's fate, as to location, rested really with a powerful fellow-prisoner.|A new prisoner's fate, as to location, rested really with a powerful fellow-prisoner.
|
||||||
|
LJ007-0217|They go on to say|They go on to say
|
||||||
|
LJ007-0243|It was not till the erection of the new prison at Holloway in 1850, and the entire internal reconstruction of Newgate according to new ideas,|It was not till the erection of the new prison at Holloway in eighteen fifty, and the entire internal reconstruction of Newgate according to new ideas,
|
||||||
|
LJ008-0087|The change from Tyburn to the Old Bailey had worked no improvement as regards the gathering together of the crowd or its demeanor.|The change from Tyburn to the Old Bailey had worked no improvement as regards the gathering together of the crowd or its demeanor.
|
||||||
|
LJ008-0131|the other he kept between his hands.|the other he kept between his hands.
|
||||||
|
LJ008-0140|Whenever the public attention had been specially called to a particular crime, either on account of its atrocity,|Whenever the public attention had been specially called to a particular crime, either on account of its atrocity,
|
||||||
|
LJ008-0158|The pressure soon became so frightful that many would have willingly escaped from the crowd; but their attempts only increased the general confusion.|The pressure soon became so frightful that many would have willingly escaped from the crowd; but their attempts only increased the general confusion.
|
||||||
|
LJ008-0174|One cart-load of spectators having broken down, some of its occupants fell off the vehicle, and were instantly trampled to death.|One cart-load of spectators having broken down, some of its occupants fell off the vehicle, and were instantly trampled to death.
|
||||||
|
LJ010-0047|while in 1850 Her Majesty was the victim of another outrage at the hands of one Pate.|while in eighteen fifty Her Majesty was the victim of another outrage at the hands of one Pate.
|
||||||
|
LJ010-0061|That some thirty or more needy men should hope to revolutionize England is a sufficient proof of the absurdity of their attempt.|That some thirty or more needy men should hope to revolutionize England is a sufficient proof of the absurdity of their attempt.
|
||||||
|
LJ010-0105|Thistlewood was discovered next morning in a mean house in White Street, Moorfields.|Thistlewood was discovered next morning in a mean house in White Street, Moorfields.
|
||||||
|
LJ010-0233|Here again probably it was partly the love of notoriety which was the incentive,|Here again probably it was partly the love of notoriety which was the incentive,
|
||||||
|
LJ010-0234|backed possibly with the hope that, as in a much more recent case,|backed possibly with the hope that, as in a much more recent case,
|
||||||
|
LJ010-0258|As the Queen was driving from Buckingham Palace to the Chapel Royal,|As the Queen was driving from Buckingham Palace to the Chapel Royal,
|
||||||
|
LJ010-0262|charged him with the offense.|charged him with the offense.
|
||||||
|
LJ010-0270|exactly tallied with that of the deformed person "wanted" for the assault on the Queen.|exactly tallied with that of the deformed person "wanted" for the assault on the Queen.
|
||||||
|
LJ010-0293|I have already remarked that as violence was more and more eliminated from crimes against the person,|I have already remarked that as violence was more and more eliminated from crimes against the person,
|
||||||
|
LJ011-0009|Nothing more was heard of the affair, although the lady declared that she had never instructed Fauntleroy to sell.|Nothing more was heard of the affair, although the lady declared that she had never instructed Fauntleroy to sell.
|
||||||
|
LJ011-0256|By this time the neighbors were aroused, and several people came to the scene of the affray.|By this time the neighbors were aroused, and several people came to the scene of the affray.
|
||||||
|
LJ012-0044|When his trade was busiest he set up a second establishment, at the head of which, although he was married,|When his trade was busiest he set up a second establishment, at the head of which, although he was married,
|
||||||
|
LJ012-0145|Solomons was now also admitted as a witness, and his evidence, with that of Moss, secured the transportation of the principal actors in the theft.|Solomons was now also admitted as a witness, and his evidence, with that of Moss, secured the transportation of the principal actors in the theft.
|
||||||
|
LJ013-0020|he acted in a manner which excited the suspicions of the crew.|he acted in a manner which excited the suspicions of the crew.
|
||||||
|
LJ013-0077|Barber and Fletcher were both transported for life, although Fletcher declared that Barber was innocent, and had no guilty knowledge of what was being done.|Barber and Fletcher were both transported for life, although Fletcher declared that Barber was innocent, and had no guilty knowledge of what was being done.
|
||||||
|
LJ013-0228|In the pocket of the coat Mr. Cope, the governor, found a neatly-folded cloth, and asked what it was for.|In the pocket of the coat Mr. Cope, the governor, found a neatly-folded cloth, and asked what it was for.
|
||||||
|
LJ014-0020|He was soon afterwards arrested on suspicion, and a search of his lodgings brought to light several garments saturated with blood;|He was soon afterwards arrested on suspicion, and a search of his lodgings brought to light several garments saturated with blood;
|
||||||
|
LJ014-0054|a maidservant, Sarah Thomas, murdered her mistress, an aged woman, by beating out her brains with a stone.|a maidservant, Sarah Thomas, murdered her mistress, an aged woman, by beating out her brains with a stone.
|
||||||
|
LJ014-0101|he found that it was soft and new, while elsewhere it was set and hard.|he found that it was soft and new, while elsewhere it was set and hard.
|
||||||
|
LJ014-0103|beneath them was a layer of fresh mortar, beneath that a lot of loose earth, amongst which a stocking was turned up, and presently a human toe.|beneath them was a layer of fresh mortar, beneath that a lot of loose earth, amongst which a stocking was turned up, and presently a human toe.
|
||||||
|
LJ014-0263|When other pleasures palled he took a theatre, and posed as a munificent patron of the dramatic art.|When other pleasures palled he took a theatre, and posed as a munificent patron of the dramatic art.
|
||||||
|
LJ014-0272|and 1850 to embezzle and apply to his own purposes some £71,000.|and eighteen fifty to embezzle and apply to his own purposes some seventy-one thousand pounds.
|
||||||
|
LJ014-0311|His extensive business had been carried on by fraud.|His extensive business had been carried on by fraud.
|
||||||
|
LJ015-0197|which at one time spread terror throughout London. Thieves preferred now to use ingenuity rather than brute force.|which at one time spread terror throughout London. Thieves preferred now to use ingenuity rather than brute force.
|
||||||
|
LJ016-0089|He was engaged in whitewashing and cleaning; the officer who had him in charge left him on the stairs leading to the gallery.|He was engaged in whitewashing and cleaning; the officer who had him in charge left him on the stairs leading to the gallery.
|
||||||
|
LJ016-0407|who generally attended the prison services.|who generally attended the prison services.
|
||||||
|
LJ016-0443|He was promptly rescued from his perilous condition, but not before his face and hands were badly scorched.|He was promptly rescued from his perilous condition, but not before his face and hands were badly scorched.
|
||||||
|
LJ017-0033|a medical practitioner, charged with doing to death persons who relied upon his professional skill.|a medical practitioner, charged with doing to death persons who relied upon his professional skill.
|
||||||
|
LJ017-0038|That the administration of justice should never be interfered with by local prejudice or local feeling|That the administration of justice should never be interfered with by local prejudice or local feeling
|
||||||
|
LJ018-0018|he wore gold-rimmed eye-glasses and a gold watch and chain.|he wore gold-rimmed eye-glasses and a gold watch and chain.
|
||||||
|
LJ018-0119|His offer was not, however, accepted.|His offer was not, however, accepted.
|
||||||
|
LJ018-0280|The commercial experience of these clever rogues was cosmopolitan.|The commercial experience of these clever rogues was cosmopolitan.
|
||||||
|
LJ019-0178|and abandoned because of the expense. As to the entire reconstruction of Newgate, nothing had been done as yet.|and abandoned because of the expense. As to the entire reconstruction of Newgate, nothing had been done as yet.
|
||||||
|
LJ019-0240|But no structural alterations were made from the date first quoted until the time of closing the prison in 1881.|But no structural alterations were made from the date first quoted until the time of closing the prison in eighteen eighty-one.
|
||||||
|
LJ021-0049|and the curtailment of rank stock speculation through the Securities Exchange Act.|and the curtailment of rank stock speculation through the Securities Exchange Act.
|
||||||
|
LJ021-0155|both directly on the public works themselves, and indirectly in the industries supplying the materials for these public works.|both directly on the public works themselves, and indirectly in the industries supplying the materials for these public works.
|
||||||
|
LJ022-0046|It is true that while business and industry are definitely better our relief rolls are still too large.|It is true that while business and industry are definitely better our relief rolls are still too large.
|
||||||
|
LJ022-0173|for the regulation of transportation by water, for the strengthening of our Merchant Marine and Air Transport,|for the regulation of transportation by water, for the strengthening of our Merchant Marine and Air Transport,
|
||||||
|
LJ024-0087|I have thus explained to you the reasons that lie behind our efforts to secure results by legislation within the Constitution.|I have thus explained to you the reasons that lie behind our efforts to secure results by legislation within the Constitution.
|
||||||
|
LJ024-0110|And the strategy of that last stand is to suggest the time-consuming process of amendment in order to kill off by delay|And the strategy of that last stand is to suggest the time-consuming process of amendment in order to kill off by delay
|
||||||
|
LJ024-0119|When before have you found them really at your side in your fights for progress?|When before have you found them really at your side in your fights for progress?
|
||||||
|
LJ025-0091|as it was current among contemporary chemists.|as it was current among contemporary chemists.
|
||||||
|
LJ026-0029|so in the case under discussion.|so in the case under discussion.
|
||||||
|
LJ026-0039|the earliest organisms were protists and that from them animals and plants were evolved along divergent lines of descent.|the earliest organisms were protists and that from them animals and plants were evolved along divergent lines of descent.
|
||||||
|
LJ026-0064|but unlike that of the animal, it is not chiefly an income of foods, but only of the raw materials of food.|but unlike that of the animal, it is not chiefly an income of foods, but only of the raw materials of food.
|
||||||
|
LJ026-0105|This is done by diastase, an enzyme of plant cells.|This is done by diastase, an enzyme of plant cells.
|
||||||
|
LJ026-0137|and be laid down as "reserve starch" in the cells of root or stem or elsewhere.|and be laid down as "reserve starch" in the cells of root or stem or elsewhere.
|
||||||
|
LJ027-0006|In all these lines the facts are drawn together by a strong thread of unity.|In all these lines the facts are drawn together by a strong thread of unity.
|
||||||
|
LJ028-0134|He also erected what is called a pensile paradise:|He also erected what is called a pensile paradise:
|
||||||
|
LJ028-0138|perhaps the tales that travelers told him were exaggerated as travelers' tales are likely to be,|perhaps the tales that travelers told him were exaggerated as travelers' tales are likely to be,
|
||||||
|
LJ028-0189|The fall of Babylon with its lofty walls was a most important event in the history of the ancient world.|The fall of Babylon with its lofty walls was a most important event in the history of the ancient world.
|
||||||
|
LJ028-0281|Till mules foal ye shall not take our city, he thought, as he reflected on this speech, that Babylon might now be taken,|Till mules foal ye shall not take our city, he thought, as he reflected on this speech, that Babylon might now be taken,
|
||||||
|
LJ029-0188|Stevenson was jeered, jostled, and spat upon by hostile demonstrators outside the Dallas Memorial Auditorium Theater.|Stevenson was jeered, jostled, and spat upon by hostile demonstrators outside the Dallas Memorial Auditorium Theater.
|
||||||
|
LJ030-0098|The remainder of the motorcade consisted of five cars for other dignitaries, including the mayor of Dallas and Texas Congressmen,|The remainder of the motorcade consisted of five cars for other dignitaries, including the mayor of Dallas and Texas Congressmen,
|
||||||
|
LJ031-0007|Chief of Police Curry and police motorcyclists at the head of the motorcade led the way to the hospital.|Chief of Police Curry and police motorcyclists at the head of the motorcade led the way to the hospital.
|
||||||
|
LJ031-0091|You have to determine which things, which are immediately life threatening and cope with them, before attempting to evaluate the full extent of the injuries.|You have to determine which things, which are immediately life threatening and cope with them, before attempting to evaluate the full extent of the injuries.
|
||||||
|
LJ031-0227|The doctors traced the course of the bullet through the body and, as information was received from Parkland Hospital,|The doctors traced the course of the bullet through the body and, as information was received from Parkland Hospital,
|
||||||
|
LJ032-0100|Marina Oswald|Marina Oswald
|
||||||
|
LJ032-0165|to the exclusion of all others because there are not enough microscopic characteristics present in fibers.|to the exclusion of all others because there are not enough microscopic characteristics present in fibers.
|
||||||
|
LJ032-0198|During the period from March 2, 1963, to April 24, 1963,|During the period from March two, nineteen sixty-three, to April twenty-four, nineteen sixty-three,
|
||||||
|
LJ033-0046|went out to the garage to paint some children's blocks, and worked in the garage for half an hour or so.|went out to the garage to paint some children's blocks, and worked in the garage for half an hour or so.
|
||||||
|
LJ033-0072|I then stepped off of it and the officer picked it up in the middle and it bent so.|I then stepped off of it and the officer picked it up in the middle and it bent so.
|
||||||
|
LJ033-0135|Location of Bag|Location of Bag
|
||||||
|
LJ034-0083|The significance of Givens' observation that Oswald was carrying his clipboard|The significance of Givens' observation that Oswald was carrying his clipboard
|
||||||
|
LJ034-0179|and, quote, seemed to be sitting a little forward, end quote,|and, quote, seemed to be sitting a little forward, end quote,
|
||||||
|
LJ035-0125|Victoria Adams, who worked on the fourth floor of the Depository Building,|Victoria Adams, who worked on the fourth floor of the Depository Building,
|
||||||
|
LJ035-0162|approximately 30 to 45 seconds after Oswald's lunchroom encounter with Baker and Truly.|approximately thirty to forty-five seconds after Oswald's lunchroom encounter with Baker and Truly.
|
||||||
|
LJ035-0189|Special Agent Forrest V. Sorrels of the Secret Service, who had been in the motorcade,|Special Agent Forrest V. Sorrels of the Secret Service, who had been in the motorcade,
|
||||||
|
LJ035-0208|Oswald's known actions in the building immediately after the assassination are consistent with his having been at the southeast corner window of the sixth floor|Oswald's known actions in the building immediately after the assassination are consistent with his having been at the southeast corner window of the sixth floor
|
||||||
|
LJ036-0216|Tippit got out and started to walk around the front of the car|Tippit got out and started to walk around the front of the car
|
||||||
|
LJ037-0093|William Arthur Smith was about a block east of 10th and Patton when he heard shots.|William Arthur Smith was about a block east of tenth and Patton when he heard shots.
|
||||||
|
LJ037-0157|taken from Oswald.|taken from Oswald.
|
||||||
|
LJ037-0178|or one used Remington-Peters cartridge case, which may have been in the revolver before the shooting,|or one used Remington-Peters cartridge case, which may have been in the revolver before the shooting,
|
||||||
|
LJ037-0219|Oswald's Jacket|Oswald's Jacket
|
||||||
|
LJ037-0222|When Oswald was arrested, he did not have a jacket.|When Oswald was arrested, he did not have a jacket.
|
||||||
|
LJ038-0017|Attracted by the sound of the sirens, Mrs. Postal stepped out of the box office and walked to the curb.|Attracted by the sound of the sirens, Mrs. Postal stepped out of the box office and walked to the curb.
|
||||||
|
LJ038-0052|testified regarding the arrest of Oswald, as did the various police officers who participated in the fight.|testified regarding the arrest of Oswald, as did the various police officers who participated in the fight.
|
||||||
|
LJ038-0077|Statements of Oswald during Detention.|Statements of Oswald during Detention.
|
||||||
|
LJ038-0161|and he asked me did I know which way he was coming, and I told him, yes, he probably come down Main and turn on Houston and then back again on Elm.|and he asked me did I know which way he was coming, and I told him, yes, he probably come down Main and turn on Houston and then back again on Elm.
|
||||||
|
LJ038-0212|which appeared to be the work of a man expecting to be killed, or imprisoned, or to disappear.|which appeared to be the work of a man expecting to be killed, or imprisoned, or to disappear.
|
||||||
|
LJ039-0103|Oswald, like all Marine recruits, received training on the rifle range at distances up to 500 yards,|Oswald, like all Marine recruits, received training on the rifle range at distances up to five hundred yards,
|
||||||
|
LJ039-0149|established that they had been previously loaded and ejected from the assassination rifle,|established that they had been previously loaded and ejected from the assassination rifle,
|
||||||
|
LJ040-0107|but apparently was not able to spend as much time with them as he would have liked, because of the age gaps of 5 and 7 years,|but apparently was not able to spend as much time with them as he would have liked, because of the age gaps of five and seven years,
|
||||||
|
LJ040-0119|When Pic returned home, Mrs. Oswald tried to play down the event but Mrs. Pic took a different view and asked the Oswalds to leave.|When Pic returned home, Mrs. Oswald tried to play down the event but Mrs. Pic took a different view and asked the Oswalds to leave.
|
||||||
|
LJ040-0161|Dr. Hartogs recommended that Oswald be placed on probation on condition that he seek help and guidance through a child guidance clinic.|Dr. Hartogs recommended that Oswald be placed on probation on condition that he seek help and guidance through a child guidance clinic.
|
||||||
|
LJ040-0169|She observed that since Lee's mother worked all day, he made his own meals and spent all his time alone|She observed that since Lee's mother worked all day, he made his own meals and spent all his time alone
|
||||||
|
LJ041-0098|All the Marine Corps did was to teach you to kill and after you got out of the Marines you might be good gangsters, end quote.|All the Marine Corps did was to teach you to kill and after you got out of the Marines you might be good gangsters, end quote.
|
||||||
|
LJ042-0017|and see for himself how a revolutionary society operates, a Marxist society.|and see for himself how a revolutionary society operates, a Marxist society.
|
||||||
|
LJ042-0070|Oswald was discovered in time to thwart his attempt at suicide.|Oswald was discovered in time to thwart his attempt at suicide.
|
||||||
|
LJ042-0161|Immediately after serving out his 3 years in the U.S. Marine Corps, he abandoned his American life to seek a new life in the USSR.|Immediately after serving out his three years in the U.S. Marine Corps, he abandoned his American life to seek a new life in the USSR.
|
||||||
|
LJ043-0147|He had left a note for his wife telling her what to do in case he were apprehended, as well as his notebook and the pictures of himself holding the rifle.|He had left a note for his wife telling her what to do in case he were apprehended, as well as his notebook and the pictures of himself holding the rifle.
|
||||||
|
LJ043-0178|as, in fact, one of them did appear after the assassination.|as, in fact, one of them did appear after the assassination.
|
||||||
|
LJ043-0183|Oswald did not lack the determination and other traits required|Oswald did not lack the determination and other traits required
|
||||||
|
LJ043-0185|Some idea of what he thought was sufficient reason for such an act may be found in the nature of the motive that he stated for his attack on General Walker.|Some idea of what he thought was sufficient reason for such an act may be found in the nature of the motive that he stated for his attack on General Walker.
|
||||||
|
LJ044-0057|extensive investigation was not able to connect Oswald with that address, although it did develop the fact|extensive investigation was not able to connect Oswald with that address, although it did develop the fact
|
||||||
|
LJ044-0109|It is good to know that movements in support of fair play for Cuba has developed in New Orleans as well as in other cities.|It is good to know that movements in support of fair play for Cuba has developed in New Orleans as well as in other cities.
|
||||||
|
LJ045-0081|Although she denied it in some of her testimony before the Commission,|Although she denied it in some of her testimony before the Commission,
|
||||||
|
LJ045-0147|She asked Oswald, quote,|She asked Oswald, quote,
|
||||||
|
LJ045-0204|he had never found anything to which he felt he could really belong.|he had never found anything to which he felt he could really belong.
|
||||||
|
LJ046-0193|and 12 to 15 of these cases as highly dangerous risks.|and twelve to fifteen of these cases as highly dangerous risks.
|
||||||
|
LJ046-0244|PRS should have investigated and been prepared to guard against it.|PRS should have investigated and been prepared to guard against it.
|
||||||
|
LJ047-0059|However, pursuant to a regular Bureau practice of interviewing certain immigrants from Iron Curtain countries,|However, pursuant to a regular Bureau practice of interviewing certain immigrants from Iron Curtain countries,
|
||||||
|
LJ047-0142|The Bureau had no earlier information suggesting that Oswald had left the United States.|The Bureau had no earlier information suggesting that Oswald had left the United States.
|
||||||
|
LJ048-0035|It was against this background and consistent with the criteria followed by the FBI prior to November 22|It was against this background and consistent with the criteria followed by the FBI prior to November twenty-two
|
||||||
|
LJ048-0063|The formal FBI instructions to its agents outlining the information to be referred to the Secret Service were too narrow at the time of the assassination.|The formal FBI instructions to its agents outlining the information to be referred to the Secret Service were too narrow at the time of the assassination.
|
||||||
|
LJ048-0104|There were far safer routes via freeways directly to the Trade Mart,|There were far safer routes via freeways directly to the Trade Mart,
|
||||||
|
LJ048-0187|In addition, Secret Service agents riding in the motorcade were trained to scan buildings as part of their general observation of the crowd of spectators.|In addition, Secret Service agents riding in the motorcade were trained to scan buildings as part of their general observation of the crowd of spectators.
|
||||||
|
LJ048-0271|will be cause for removal from the Service, end quote.|will be cause for removal from the Service, end quote.
|
||||||
|
LJ049-0031|The Presidential vehicle in use in Dallas, described in chapter 2,|The Presidential vehicle in use in Dallas, described in chapter two,
|
||||||
|
LJ049-0059|Agents are instructed that it is not their responsibility to investigate or evaluate a present danger,|Agents are instructed that it is not their responsibility to investigate or evaluate a present danger,
|
||||||
|
LJ049-0174|to notify the Secret Service of the substantial information about Lee Harvey Oswald which the FBI had accumulated|to notify the Secret Service of the substantial information about Lee Harvey Oswald which the FBI had accumulated
|
||||||
|
LJ050-0049|and from a specialist in psychiatric prognostication at Walter Reed Hospital.|and from a specialist in psychiatric prognostication at Walter Reed Hospital.
|
||||||
|
LJ050-0113|Such agreements should describe in detail the information which is sought, the manner in which it will be provided to the Secret Service,|Such agreements should describe in detail the information which is sought, the manner in which it will be provided to the Secret Service,
|
||||||
|
LJ050-0150|Its present manual filing system is obsolete;|Its present manual filing system is obsolete;
|
||||||
|
LJ050-0189|that written instructions might come into the hands of local newspapers, to the prejudice of the precautions described.|that written instructions might come into the hands of local newspapers, to the prejudice of the precautions described.
|
25
vocoder/fregan/README.md
Normal file
25
vocoder/fregan/README.md
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
# Fre-GAN Vocoder
|
||||||
|
[Fre-GAN: Adversarial Frequency-consistent Audio Synthesis](https://arxiv.org/abs/2106.02297)
|
||||||
|
|
||||||
|
## Training:
|
||||||
|
```
|
||||||
|
python train.py --config config.json
|
||||||
|
```
|
||||||
|
|
||||||
|
## Citation:
|
||||||
|
```
|
||||||
|
@misc{kim2021fregan,
|
||||||
|
title={Fre-GAN: Adversarial Frequency-consistent Audio Synthesis},
|
||||||
|
author={Ji-Hoon Kim and Sang-Hoon Lee and Ji-Hyun Lee and Seong-Whan Lee},
|
||||||
|
year={2021},
|
||||||
|
eprint={2106.02297},
|
||||||
|
archivePrefix={arXiv},
|
||||||
|
primaryClass={eess.AS}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
## Note
|
||||||
|
* For a more complete, end-to-end voice cloning or text-to-speech (TTS) toolbox, please visit [Deepsync Technologies](https://deepsync.co/).
|
||||||
|
|
||||||
|
## References:
|
||||||
|
* [Hi-Fi-GAN repo](https://github.com/jik876/hifi-gan)
|
||||||
|
* [WaveSNet repo](https://github.com/LiQiufu/WaveSNet)
|
41
vocoder/fregan/config.json
Normal file
41
vocoder/fregan/config.json
Normal file
@ -0,0 +1,41 @@
|
|||||||
|
{
|
||||||
|
"resblock": "1",
|
||||||
|
"num_gpus": 0,
|
||||||
|
"batch_size": 16,
|
||||||
|
"learning_rate": 0.0002,
|
||||||
|
"adam_b1": 0.8,
|
||||||
|
"adam_b2": 0.99,
|
||||||
|
"lr_decay": 0.999,
|
||||||
|
"seed": 1234,
|
||||||
|
|
||||||
|
|
||||||
|
"upsample_rates": [5,5,2,2,2],
|
||||||
|
"upsample_kernel_sizes": [10,10,4,4,4],
|
||||||
|
"upsample_initial_channel": 512,
|
||||||
|
"resblock_kernel_sizes": [3,7,11],
|
||||||
|
"resblock_dilation_sizes": [[1, 3, 5, 7], [1,3,5,7], [1,3,5,7]],
|
||||||
|
|
||||||
|
"segment_size": 6400,
|
||||||
|
"num_mels": 80,
|
||||||
|
"num_freq": 1025,
|
||||||
|
"n_fft": 1024,
|
||||||
|
"hop_size": 200,
|
||||||
|
"win_size": 800,
|
||||||
|
|
||||||
|
"sampling_rate": 16000,
|
||||||
|
|
||||||
|
"fmin": 0,
|
||||||
|
"fmax": 7600,
|
||||||
|
"fmax_for_loss": null,
|
||||||
|
|
||||||
|
"num_workers": 4,
|
||||||
|
|
||||||
|
"dist_config": {
|
||||||
|
"dist_backend": "nccl",
|
||||||
|
"dist_url": "tcp://localhost:54321",
|
||||||
|
"world_size": 1
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
}
|
303
vocoder/fregan/discriminator.py
Normal file
303
vocoder/fregan/discriminator.py
Normal file
@ -0,0 +1,303 @@
|
|||||||
|
import torch
|
||||||
|
import torch.nn.functional as F
|
||||||
|
import torch.nn as nn
|
||||||
|
from torch.nn import Conv1d, AvgPool1d, Conv2d
|
||||||
|
from torch.nn.utils import weight_norm, spectral_norm
|
||||||
|
from vocoder.fregan.utils import get_padding
|
||||||
|
from vocoder.fregan.stft_loss import stft
|
||||||
|
from vocoder.fregan.dwt import DWT_1D
|
||||||
|
LRELU_SLOPE = 0.1
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class SpecDiscriminator(nn.Module):
    """Discriminator that scores a waveform from a single STFT resolution.

    The input is transformed by ``stft`` (imported from
    ``vocoder.fregan.stft_loss``) and the result is pushed through five
    normalised 2-D convolutions followed by a one-channel output convolution.

    Args:
        fft_size: FFT size forwarded to ``stft``.
        shift_size: Hop size forwarded to ``stft``.
        win_length: Window length forwarded to ``stft``; also the length of
            the window tensor created here.
        window: Name of a ``torch`` window factory, e.g. ``"hann_window"``.
        use_spectral_norm: Wrap the convolutions in ``spectral_norm`` instead
            of ``weight_norm``.
    """

    def __init__(self, fft_size=1024, shift_size=120, win_length=600, window="hann_window", use_spectral_norm=False):
        super(SpecDiscriminator, self).__init__()
        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
        self.fft_size = fft_size
        self.shift_size = shift_size
        self.win_length = win_length
        # Plain attribute (not a buffer/parameter), so it is not part of the
        # state dict and is not moved by Module.to(); forward() moves it.
        self.window = getattr(torch, window)(win_length)
        self.discriminators = nn.ModuleList([
            norm_f(nn.Conv2d(1, 32, kernel_size=(3, 9), padding=(1, 4))),
            norm_f(nn.Conv2d(32, 32, kernel_size=(3, 9), stride=(1, 2), padding=(1, 4))),
            norm_f(nn.Conv2d(32, 32, kernel_size=(3, 9), stride=(1, 2), padding=(1, 4))),
            norm_f(nn.Conv2d(32, 32, kernel_size=(3, 9), stride=(1, 2), padding=(1, 4))),
            norm_f(nn.Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))),
        ])

        self.out = norm_f(nn.Conv2d(32, 1, 3, 1, 1))

    def forward(self, y):
        """Score ``y`` and collect the intermediate feature maps.

        Args:
            y: Waveform tensor; dimension 1 is squeezed before the STFT
                (presumably shaped (batch, 1, samples) -- TODO confirm
                against the caller).

        Returns:
            A tuple ``(score, fmap)`` where ``score`` is the output of the
            final convolution flattened from dimension 1 onwards, and ``fmap``
            is the list of activations after every layer (the final,
            un-activated output included).
        """
        fmap = []
        # NOTE(review): the STFT runs under no_grad, so no gradient reaches
        # the input waveform through this discriminator.
        with torch.no_grad():
            y = y.squeeze(1)
            # Use y.device rather than y.get_device(): get_device() returns -1
            # for CPU tensors, which Tensor.to() rejects, so the previous code
            # only worked on CUDA inputs.
            y = stft(y, self.fft_size, self.shift_size, self.win_length, self.window.to(y.device))
        y = y.unsqueeze(1)
        for d in self.discriminators:
            y = d(y)
            y = F.leaky_relu(y, LRELU_SLOPE)
            fmap.append(y)

        y = self.out(y)
        fmap.append(y)

        return torch.flatten(y, 1, -1), fmap
|
||||||
|
|
||||||
|
class MultiResSpecDiscriminator(torch.nn.Module):
    """Group of three ``SpecDiscriminator`` modules, one per STFT setting.

    Only the first three entries of ``fft_sizes``, ``hop_sizes`` and
    ``win_lengths`` are used; they are paired index by index.
    """

    def __init__(self,
                 fft_sizes=[1024, 2048, 512],
                 hop_sizes=[120, 240, 50],
                 win_lengths=[600, 1200, 240],
                 window="hann_window"):
        super(MultiResSpecDiscriminator, self).__init__()
        # The default lists are only read, never mutated.
        per_resolution = [
            SpecDiscriminator(fft_sizes[k], hop_sizes[k], win_lengths[k], window)
            for k in range(3)
        ]
        self.discriminators = nn.ModuleList(per_resolution)

    def forward(self, y, y_hat):
        """Run every sub-discriminator on ``y`` and then on ``y_hat``.

        Returns:
            Four lists with one entry per sub-discriminator: scores for ``y``,
            scores for ``y_hat``, feature maps for ``y`` and feature maps for
            ``y_hat``.
        """
        scores_real, scores_fake = [], []
        feats_real, feats_fake = [], []
        for disc in self.discriminators:
            score_r, feat_r = disc(y)
            score_g, feat_g = disc(y_hat)
            scores_real.append(score_r)
            feats_real.append(feat_r)
            scores_fake.append(score_g)
            feats_fake.append(feat_g)

        return scores_real, scores_fake, feats_real, feats_fake
|
||||||
|
|
||||||
|
|
||||||
|
class DiscriminatorP(torch.nn.Module):
    """Period sub-discriminator with three extra wavelet-derived inputs.

    The input is folded into a 2-D layout of width ``period`` and passed
    through a stack of 2-D convolutions. After each of the first three
    convolutions, a projection of a ``DWT_1D`` decomposition of the input
    (one, two and three levels deep respectively) is concatenated along
    dimension 2.

    Args:
        period: Width of the folded 2-D layout.
        kernel_size: Kernel height of the strided convolutions.
        stride: Stride (along the height) of the strided convolutions.
        use_spectral_norm: Use ``spectral_norm`` instead of ``weight_norm``.
    """

    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
        super(DiscriminatorP, self).__init__()
        self.period = period
        if use_spectral_norm == False:  # noqa: E712 - keep the original comparison
            norm_f = weight_norm
        else:
            norm_f = spectral_norm

        def strided_conv(in_ch, out_ch):
            # Padding is derived from a fixed kernel size of 5, independent of
            # the ``kernel_size`` argument.
            return norm_f(Conv2d(in_ch, out_ch, (kernel_size, 1), (stride, 1),
                                 padding=(get_padding(5, 1), 0)))

        self.dwt1d = DWT_1D()
        self.dwt_conv1 = norm_f(Conv1d(2, 1, 1))
        self.dwt_proj1 = strided_conv(1, 32)
        self.dwt_conv2 = norm_f(Conv1d(4, 1, 1))
        self.dwt_proj2 = strided_conv(1, 128)
        self.dwt_conv3 = norm_f(Conv1d(8, 1, 1))
        self.dwt_proj3 = strided_conv(1, 512)
        self.convs = nn.ModuleList([
            strided_conv(1, 32),
            strided_conv(32, 128),
            strided_conv(128, 512),
            strided_conv(512, 1024),
            norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))),
        ])
        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))

    def _fold_by_period(self, signal):
        """Reflect-pad the last axis to a multiple of ``period`` and fold it.

        A (b, c, t) tensor becomes (b, c, t_padded // period, period).
        """
        batch, channels, length = signal.shape
        leftover = length % self.period
        if leftover != 0:
            extra = self.period - leftover
            signal = F.pad(signal, (0, extra), "reflect")
            length = length + extra
        return signal.view(batch, channels, length // self.period, self.period)

    def forward(self, x):
        """Score ``x`` and return ``(flattened_score, feature_maps)``."""
        fmap = []

        # Level 1: one DWT split, mixed down to one channel, folded, projected.
        l1_a, l1_b = self.dwt1d(x)
        side1 = self.dwt_conv1(torch.cat([l1_a, l1_b], dim=1))
        side1 = self.dwt_proj1(self._fold_by_period(side1))

        # Level 2: split both level-1 outputs again (four sub-bands).
        l2_a, l2_b = self.dwt1d(l1_a)
        l2_c, l2_d = self.dwt1d(l1_b)
        side2 = self.dwt_conv2(torch.cat([l2_a, l2_b, l2_c, l2_d], dim=1))
        side2 = self.dwt_proj2(self._fold_by_period(side2))

        # Level 3: split every level-2 output once more (eight sub-bands).
        level3 = []
        for band in (l2_a, l2_b, l2_c, l2_d):
            level3.extend(self.dwt1d(band))
        side3 = self.dwt_conv3(torch.cat(level3, dim=1))
        side3 = self.dwt_proj3(self._fold_by_period(side3))

        # Main path: fold the raw input and run the convolution stack,
        # appending one side input after each of the first three layers.
        x = self._fold_by_period(x)
        side_inputs = (side1, side2, side3)
        for idx, conv in enumerate(self.convs):
            x = F.leaky_relu(conv(x), LRELU_SLOPE)
            fmap.append(x)
            if idx < len(side_inputs):
                x = torch.cat([x, side_inputs[idx]], dim=2)

        x = self.conv_post(x)
        fmap.append(x)

        return torch.flatten(x, 1, -1), fmap
|
||||||
|
|
||||||
|
|
||||||
|
class ResWiseMultiPeriodDiscriminator(torch.nn.Module):
    """Set of ``DiscriminatorP`` modules with periods 2, 3, 5, 7 and 11."""

    def __init__(self):
        super(ResWiseMultiPeriodDiscriminator, self).__init__()
        self.discriminators = nn.ModuleList(
            [DiscriminatorP(period) for period in (2, 3, 5, 7, 11)]
        )

    def forward(self, y, y_hat):
        """Run every period discriminator on ``y`` and then on ``y_hat``.

        Returns:
            Four lists with one entry per sub-discriminator: scores for ``y``,
            scores for ``y_hat``, feature maps for ``y`` and feature maps for
            ``y_hat``.
        """
        scores_real, scores_fake = [], []
        feats_real, feats_fake = [], []
        for disc in self.discriminators:
            score_r, feat_r = disc(y)
            score_g, feat_g = disc(y_hat)
            scores_real.append(score_r)
            feats_real.append(feat_r)
            scores_fake.append(score_g)
            feats_fake.append(feat_g)

        return scores_real, scores_fake, feats_real, feats_fake
|
||||||
|
|
||||||
|
|
||||||
|
class DiscriminatorS(torch.nn.Module):
    """1-D convolutional sub-discriminator with two wavelet-derived inputs.

    After the first and second convolutions, a convolved ``DWT_1D``
    decomposition of the input (one and two levels deep respectively) is
    concatenated along dimension 2.

    Args:
        use_spectral_norm: Use ``spectral_norm`` instead of ``weight_norm``.
    """

    def __init__(self, use_spectral_norm=False):
        super(DiscriminatorS, self).__init__()
        if use_spectral_norm == False:  # noqa: E712 - keep the original comparison
            norm_f = weight_norm
        else:
            norm_f = spectral_norm
        self.dwt1d = DWT_1D()
        self.dwt_conv1 = norm_f(Conv1d(2, 128, 15, 1, padding=7))
        self.dwt_conv2 = norm_f(Conv1d(4, 128, 41, 2, padding=20))
        # (in_channels, out_channels, kernel, stride, groups, padding)
        layer_specs = [
            (1, 128, 15, 1, 1, 7),
            (128, 128, 41, 2, 4, 20),
            (128, 256, 41, 2, 16, 20),
            (256, 512, 41, 4, 16, 20),
            (512, 1024, 41, 4, 16, 20),
            (1024, 1024, 41, 1, 16, 20),
            (1024, 1024, 5, 1, 1, 2),
        ]
        self.convs = nn.ModuleList([
            norm_f(Conv1d(c_in, c_out, kernel, step, groups=grp, padding=pad))
            for c_in, c_out, kernel, step, grp, pad in layer_specs
        ])
        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))

    def forward(self, x):
        """Score ``x`` and return ``(flattened_score, feature_maps)``."""
        fmap = []

        # Level 1: one DWT split, both outputs stacked on the channel axis.
        l1_a, l1_b = self.dwt1d(x)
        side1 = self.dwt_conv1(torch.cat([l1_a, l1_b], dim=1))

        # Level 2: split both level-1 outputs again (four sub-bands).
        l2_a, l2_b = self.dwt1d(l1_a)
        l2_c, l2_d = self.dwt1d(l1_b)
        side2 = self.dwt_conv2(torch.cat([l2_a, l2_b, l2_c, l2_d], dim=1))

        # Side inputs are appended after the layers with these indices.
        side_inputs = {0: side1, 1: side2}
        for idx, conv in enumerate(self.convs):
            x = F.leaky_relu(conv(x), LRELU_SLOPE)
            fmap.append(x)
            extra = side_inputs.get(idx)
            if extra is not None:
                x = torch.cat([x, extra], dim=2)

        x = self.conv_post(x)
        fmap.append(x)

        return torch.flatten(x, 1, -1), fmap
|
||||||
|
|
||||||
|
|
||||||
|
class ResWiseMultiScaleDiscriminator(torch.nn.Module):
    """Three ``DiscriminatorS`` modules fed with progressively decomposed audio.

    The first sub-discriminator (always built with spectral norm) sees the
    raw inputs. The second and third see one- and two-level ``DWT_1D``
    decompositions mixed down to a single channel by 1x1 convolutions.

    Args:
        use_spectral_norm: Normalisation used for the two 1x1 mixing
            convolutions only; it is not forwarded to the sub-discriminators.
    """

    def __init__(self, use_spectral_norm=False):
        super().__init__()
        if use_spectral_norm == False:  # noqa: E712 - keep the original comparison
            norm_f = weight_norm
        else:
            norm_f = spectral_norm
        self.dwt1d = DWT_1D()
        self.dwt_conv1 = norm_f(Conv1d(2, 1, 1))
        self.dwt_conv2 = norm_f(Conv1d(4, 1, 1))
        self.discriminators = nn.ModuleList([
            DiscriminatorS(use_spectral_norm=True),
            DiscriminatorS(),
            DiscriminatorS(),
        ])

    def forward(self, y, y_hat):
        """Score ``y`` and ``y_hat`` at three decomposition levels.

        Returns:
            Four lists with one entry per sub-discriminator: scores for ``y``,
            scores for ``y_hat``, feature maps for ``y`` and feature maps for
            ``y_hat``.
        """
        # Level 1 decomposition of both inputs.
        real_a, real_b = self.dwt1d(y)
        real_lvl1 = self.dwt_conv1(torch.cat([real_a, real_b], dim=1))
        fake_a, fake_b = self.dwt1d(y_hat)
        fake_lvl1 = self.dwt_conv1(torch.cat([fake_a, fake_b], dim=1))

        # Level 2 decomposition: split each level-1 output once more.
        real_aa, real_ab = self.dwt1d(real_a)
        real_ba, real_bb = self.dwt1d(real_b)
        real_lvl2 = self.dwt_conv2(torch.cat([real_aa, real_ab, real_ba, real_bb], dim=1))

        fake_aa, fake_ab = self.dwt1d(fake_a)
        fake_ba, fake_bb = self.dwt1d(fake_b)
        fake_lvl2 = self.dwt_conv2(torch.cat([fake_aa, fake_ab, fake_ba, fake_bb], dim=1))

        # One (real, generated) pair per sub-discriminator, in order.
        level_inputs = [(y, y_hat), (real_lvl1, fake_lvl1), (real_lvl2, fake_lvl2)]

        scores_real, scores_fake = [], []
        feats_real, feats_fake = [], []
        for (real, fake), disc in zip(level_inputs, self.discriminators):
            score_r, feat_r = disc(real)
            score_g, feat_g = disc(fake)
            scores_real.append(score_r)
            feats_real.append(feat_r)
            scores_fake.append(score_g)
            feats_fake.append(feat_g)

        return scores_real, scores_fake, feats_real, feats_fake
|
76
vocoder/fregan/dwt.py
Normal file
76
vocoder/fregan/dwt.py
Normal file
@ -0,0 +1,76 @@
|
|||||||
|
# Copyright (c) 2019, Adobe Inc. All rights reserved.
|
||||||
|
#
|
||||||
|
# This work is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike
|
||||||
|
# 4.0 International Public License. To view a copy of this license, visit
|
||||||
|
# https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.
|
||||||
|
|
||||||
|
# DWT code borrowed from https://github.com/LiQiufu/WaveSNet/blob/12cb9d24208c3d26917bf953618c30f0c6b0f03d/DWT_IDWT/DWT_IDWT_layer.py
|
||||||
|
|
||||||
|
|
||||||
|
import pywt
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
import torch.nn.functional as F
|
||||||
|
|
||||||
|
__all__ = ['DWT_1D']
|
||||||
|
Pad_Mode = ['constant', 'reflect', 'replicate', 'circular']
|
||||||
|
|
||||||
|
|
||||||
|
class DWT_1D(nn.Module):
    """Single-level 1-D discrete wavelet transform as two strided convolutions.

    The filter taps come from ``pywt.Wavelet(wavename)`` (``rec_lo`` and
    ``rec_hi``), optionally zero-padded to ``kernel_size``.

    Args:
        pad_type: Padding mode passed to ``F.pad``; must be in ``Pad_Mode``.
        wavename: Wavelet name understood by ``pywt.Wavelet``.
        stride: Convolution stride; only 2 is accepted.
        in_channels: Expected number of input channels.
        out_channels: Number of output channels (defaults to ``in_channels``).
        groups: Convolution groups (defaults to ``in_channels``).
        kernel_size: Filter length; must be ``None`` unless ``trainable``.
        trainable: Wrap the filters in ``nn.Parameter``.
    """

    def __init__(self, pad_type='reflect', wavename='haar',
                 stride=2, in_channels=1, out_channels=None, groups=None,
                 kernel_size=None, trainable=False):
        super(DWT_1D, self).__init__()
        self.trainable = trainable
        self.kernel_size = kernel_size
        if not self.trainable:
            assert self.kernel_size is None
        self.in_channels = in_channels
        self.out_channels = out_channels if out_channels is not None else self.in_channels
        self.groups = groups if groups is not None else self.in_channels
        assert isinstance(self.groups, int) and self.in_channels % self.groups == 0
        self.stride = stride
        assert self.stride == 2
        self.wavename = wavename
        self.pad_type = pad_type
        assert self.pad_type in Pad_Mode
        self.get_filters()
        self.initialization()

    def get_filters(self):
        """Build the 1-D tap vectors ``filt_low`` and ``filt_high``.

        When ``kernel_size`` exceeds the wavelet's tap count, the taps are
        placed in the middle of a zero vector (extra slot on the right when
        the slack is odd).
        """
        wavelet = pywt.Wavelet(self.wavename)
        taps_low = torch.tensor(wavelet.rec_lo)
        taps_high = torch.tensor(wavelet.rec_hi)
        n_taps = taps_low.size()[0]
        if self.kernel_size is None:
            self.kernel_size = n_taps
        assert self.kernel_size >= n_taps
        slack = self.kernel_size - n_taps
        start = slack // 2
        stop = self.kernel_size - (slack - start)
        self.filt_low = torch.zeros(self.kernel_size)
        self.filt_high = torch.zeros(self.kernel_size)
        self.filt_low[start:stop] = taps_low
        self.filt_high[start:stop] = taps_high

    def initialization(self):
        """Expand the taps to conv1d weights and compute the padding sizes."""
        weight_shape = (self.out_channels, self.in_channels // self.groups, 1)
        self.filter_low = self.filt_low[None, None, :].repeat(weight_shape)
        self.filter_high = self.filt_high[None, None, :].repeat(weight_shape)
        if torch.cuda.is_available():
            # Pre-move to the GPU; forward() still moves to the input's device.
            self.filter_low = self.filter_low.cuda()
            self.filter_high = self.filter_high.cuda()
        if self.trainable:
            self.filter_low = nn.Parameter(self.filter_low)
            self.filter_high = nn.Parameter(self.filter_high)
        half = self.kernel_size // 2
        one_side = half - 1 if self.kernel_size % 2 == 0 else half
        self.pad_sizes = [one_side, one_side]

    def forward(self, input):
        """Return ``(conv with filter_low, conv with filter_high)`` for a 3-D input."""
        assert isinstance(input, torch.Tensor)
        assert len(input.size()) == 3
        assert input.size()[1] == self.in_channels
        padded = F.pad(input, pad=self.pad_sizes, mode=self.pad_type)
        out_low = F.conv1d(padded, self.filter_low.to(padded.device),
                           stride=self.stride, groups=self.groups)
        out_high = F.conv1d(padded, self.filter_high.to(padded.device),
                            stride=self.stride, groups=self.groups)
        return out_low, out_high
|
210
vocoder/fregan/generator.py
Normal file
210
vocoder/fregan/generator.py
Normal file
@ -0,0 +1,210 @@
|
|||||||
|
import torch
|
||||||
|
import torch.nn.functional as F
|
||||||
|
import torch.nn as nn
|
||||||
|
from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
|
||||||
|
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
|
||||||
|
from vocoder.fregan.utils import init_weights, get_padding
|
||||||
|
|
||||||
|
LRELU_SLOPE = 0.1
|
||||||
|
|
||||||
|
|
||||||
|
class ResBlock1(torch.nn.Module):
    """Residual block made of four (dilated conv, plain conv) pairs.

    Each pair is applied as leaky-ReLU -> dilated conv -> leaky-ReLU ->
    dilation-1 conv, and its result is added back onto the running input.
    All convolutions are weight-normalised and keep the channel count.
    """

    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5, 7)):
        super().__init__()
        self.h = h

        def _wn_conv(rate):
            # Weight-normalised, channel-preserving conv with the given dilation.
            return weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=rate,
                                      padding=get_padding(kernel_size, rate)))

        # First conv of each pair uses the first four entries of `dilation`.
        self.convs1 = nn.ModuleList([_wn_conv(dilation[idx]) for idx in range(4)])
        self.convs1.apply(init_weights)

        # Second conv of each pair is always undilated.
        self.convs2 = nn.ModuleList([_wn_conv(1) for _ in range(4)])
        self.convs2.apply(init_weights)

    def forward(self, x):
        """Run the four residual pairs in sequence and return the result."""
        for dilated, plain in zip(self.convs1, self.convs2):
            branch = dilated(F.leaky_relu(x, LRELU_SLOPE))
            branch = plain(F.leaky_relu(branch, LRELU_SLOPE))
            x = branch + x
        return x

    def remove_weight_norm(self):
        """Strip weight normalisation from every convolution in the block."""
        for layer in list(self.convs1) + list(self.convs2):
            remove_weight_norm(layer)
|
||||||
|
|
||||||
|
|
||||||
|
class ResBlock2(torch.nn.Module):
    """Lighter residual block: two dilated, weight-normalised convolutions.

    Each convolution is applied as leaky-ReLU -> conv, and its output is
    added back onto the running input.
    """

    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)):
        super().__init__()
        self.h = h
        # Only the first two entries of `dilation` are used.
        self.convs = nn.ModuleList([
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[idx],
                               padding=get_padding(kernel_size, dilation[idx])))
            for idx in range(2)
        ])
        self.convs.apply(init_weights)

    def forward(self, x):
        """Apply both residual convolutions in order and return the result."""
        for conv in self.convs:
            x = conv(F.leaky_relu(x, LRELU_SLOPE)) + x
        return x

    def remove_weight_norm(self):
        """Strip weight normalisation from both convolutions."""
        for layer in self.convs:
            remove_weight_norm(layer)
|
||||||
|
|
||||||
|
|
||||||
|
class FreGAN(torch.nn.Module):
    """Fre-GAN generator: upsamples an 80-channel conditioning input to a 1-channel signal.

    Args:
        h: hyper-parameter object; this class reads ``resblock``,
            ``resblock_kernel_sizes``, ``resblock_dilation_sizes``,
            ``upsample_rates``, ``upsample_kernel_sizes`` and
            ``upsample_initial_channel`` from it.
        top_k: number of trailing upsampling stages that receive the
            conditioning input again (``cond_level = num_upsamples - top_k``).
    """

    def __init__(self, h, top_k=4):
        super(FreGAN, self).__init__()
        self.h = h

        self.num_kernels = len(h.resblock_kernel_sizes)
        self.num_upsamples = len(h.upsample_rates)
        self.upsample_rates = h.upsample_rates
        self.up_kernels = h.upsample_kernel_sizes
        # Index of the first upsampling stage that is conditioned on the mel input.
        self.cond_level = self.num_upsamples - top_k
        # Input projection; the input channel count (80) is hard-coded.
        self.conv_pre = weight_norm(Conv1d(80, h.upsample_initial_channel, 7, 1, padding=3))
        resblock = ResBlock1 if h.resblock == '1' else ResBlock2

        self.ups = nn.ModuleList()         # main-path transposed convolutions
        self.cond_up = nn.ModuleList()     # upsamplers for the conditioning branch
        self.res_output = nn.ModuleList()  # upsamplers for the accumulated output branch
        upsample_ = 1  # running product of upsample rates; not read anywhere in this class
        kr = 80        # channel count currently carried by the conditioning branch

        for i, (u, k) in enumerate(zip(self.upsample_rates, self.up_kernels)):
            # Previous padding scheme, kept for reference:
            # self.ups.append(weight_norm(
            #     ConvTranspose1d(h.upsample_initial_channel // (2 ** i), h.upsample_initial_channel // (2 ** (i + 1)),
            #                     k, u, padding=(k - u) // 2)))
            # Each stage halves the channel count while upsampling with stride u.
            self.ups.append(weight_norm(ConvTranspose1d(h.upsample_initial_channel//(2**i),
                                                        h.upsample_initial_channel//(2**(i+1)),
                                                        k, u, padding=(u//2 + u%2), output_padding=u%2)))

            if i > (self.num_upsamples - top_k):
                # Output branch: nearest-neighbour upsample by u, then a 1x1 conv
                # that matches the channel count produced by self.ups[i].
                self.res_output.append(
                    nn.Sequential(
                        nn.Upsample(scale_factor=u, mode='nearest'),
                        weight_norm(nn.Conv1d(h.upsample_initial_channel // (2 ** i),
                                              h.upsample_initial_channel // (2 ** (i + 1)), 1))
                    )
                )
            if i >= (self.num_upsamples - top_k):
                # Conditioning branch uses the kernel and rate of stage i - 1.
                # NOTE(review): when i == 0 the index i - 1 is -1, i.e. the LAST
                # entry of up_kernels / upsample_rates — confirm this is intended.
                self.cond_up.append(
                    weight_norm(
                        ConvTranspose1d(kr, h.upsample_initial_channel // (2 ** i),
                                        self.up_kernels[i - 1], self.upsample_rates[i - 1],
                                        padding=(self.upsample_rates[i-1]//2+self.upsample_rates[i-1]%2), output_padding=self.upsample_rates[i-1]%2))
                )
                kr = h.upsample_initial_channel // (2 ** i)

            upsample_ *= u

        # num_kernels residual blocks per upsampling stage, stored in one flat list.
        self.resblocks = nn.ModuleList()
        for i in range(len(self.ups)):
            ch = h.upsample_initial_channel // (2 ** (i + 1))
            for j, (k, d) in enumerate(zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)):
                self.resblocks.append(resblock(h, ch, k, d))

        # `ch` still holds the channel count of the final stage at this point.
        self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
        self.ups.apply(init_weights)
        self.conv_post.apply(init_weights)
        self.cond_up.apply(init_weights)
        self.res_output.apply(init_weights)

    def forward(self, x):
        """Generate the output signal from conditioning input ``x``.

        ``x`` is fed to ``conv_pre`` (80 input channels). The result is the
        tanh of ``conv_post`` applied to the accumulated output branch.
        """
        mel = x
        x = self.conv_pre(x)
        output = None
        for i in range(self.num_upsamples):
            if i >= self.cond_level:
                # Upsample the conditioning branch and add it to the main path in place.
                mel = self.cond_up[i - self.cond_level](mel)
                x += mel
            if i > self.cond_level:
                # Seed the output branch from x on first use, then keep upsampling it.
                if output is None:
                    output = self.res_output[i - self.cond_level - 1](x)
                else:
                    output = self.res_output[i - self.cond_level - 1](output)
            x = F.leaky_relu(x, LRELU_SLOPE)
            x = self.ups[i](x)
            # Average the outputs of this stage's residual blocks.
            xs = None
            for j in range(self.num_kernels):
                if xs is None:
                    xs = self.resblocks[i * self.num_kernels + j](x)
                else:
                    xs += self.resblocks[i * self.num_kernels + j](x)
            x = xs / self.num_kernels
            if output is not None:
                output = output + x

        # NOTE(review): `output` is still None here if no stage index exceeded
        # cond_level; also this leaky_relu uses the default slope, not LRELU_SLOPE.
        x = F.leaky_relu(output)
        x = self.conv_post(x)
        x = torch.tanh(x)

        return x

    def remove_weight_norm(self):
        """Strip weight normalisation from every weight-normalised layer of the generator."""
        print('Removing weight norm...')
        for l in self.ups:
            remove_weight_norm(l)
        for l in self.resblocks:
            l.remove_weight_norm()
        for l in self.cond_up:
            remove_weight_norm(l)
        for l in self.res_output:
            # Index 1 of each Sequential is the weight-normalised 1x1 conv.
            remove_weight_norm(l[1])
        remove_weight_norm(self.conv_pre)
        remove_weight_norm(self.conv_post)
|
||||||
|
|
||||||
|
|
||||||
|
'''
|
||||||
|
to run this, fix
|
||||||
|
from . import ResStack
|
||||||
|
into
|
||||||
|
from res_stack import ResStack
|
||||||
|
'''
|
||||||
|
# Manual smoke test: build the generator from ./config.json, run a random
# input through it, and print the output shape and trainable-parameter count.
if __name__ == '__main__':
    '''
    torch.Size([3, 80, 10])
    torch.Size([3, 1, 2000])
    4527362
    '''
    # NOTE(review): the sample output in the string above shows a length of
    # 2000, but the assertion below expects 2560 — confirm which one matches
    # the shipped config.
    with open('config.json') as f:
        data = f.read()
    from utils import AttrDict
    import json
    json_config = json.loads(data)
    h = AttrDict(json_config)
    model = FreGAN(h)

    c = torch.randn(3, 80, 10) # (B, channels, T)
    print(c.shape)

    y = model(c) # presumably (B, 1, T * prod(upsample_rates)) — TODO confirm
    print(y.shape)
    assert y.shape == torch.Size([3, 1, 2560]) # For normal melgan torch.Size([3, 1, 2560])

    # Count only parameters that require gradients.
    pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(pytorch_total_params)
|
70
vocoder/fregan/inference.py
Normal file
70
vocoder/fregan/inference.py
Normal file
@ -0,0 +1,70 @@
|
|||||||
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||||
|
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
import torch
|
||||||
|
from scipy.io.wavfile import write
|
||||||
|
from vocoder.hifigan.env import AttrDict
|
||||||
|
from vocoder.hifigan.meldataset import mel_spectrogram, MAX_WAV_VALUE, load_wav
|
||||||
|
from vocoder.fregan.generator import FreGAN
|
||||||
|
import soundfile as sf
|
||||||
|
|
||||||
|
|
||||||
|
# Module-level generator instance; stays None until load_model() assigns it.
generator = None # type: FreGAN
# Torch device selected by load_model(); stays None until then.
_device = None
|
||||||
|
|
||||||
|
|
||||||
|
def load_checkpoint(filepath, device):
    """Deserialize the checkpoint at ``filepath`` onto ``device`` and return it.

    Asserts that ``filepath`` is an existing file and prints progress
    messages before and after loading.
    """
    assert os.path.isfile(filepath)
    print("Loading '{}'".format(filepath))
    loaded = torch.load(filepath, map_location=device)
    print("Complete.")
    return loaded
|
||||||
|
|
||||||
|
|
||||||
|
def load_model(weights_fpath, verbose=True):
    """Build the Fre-GAN generator and load its weights from ``weights_fpath``.

    Reads hyper-parameters from ``./vocoder/fregan/config.json`` (relative
    to the current working directory), seeds torch with ``h.seed``, picks
    CUDA when available, and stores the ready generator (eval mode, weight
    norm removed) in the module-level ``generator`` / ``_device`` globals.
    """
    global generator, _device

    if verbose:
        print("Building fregan")

    with open("./vocoder/fregan/config.json") as f:
        json_config = json.loads(f.read())
    h = AttrDict(json_config)
    torch.manual_seed(h.seed)

    _device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    generator = FreGAN(h).to(_device)
    checkpoint = load_checkpoint(weights_fpath, _device)
    generator.load_state_dict(checkpoint['generator'])
    generator.eval()
    generator.remove_weight_norm()
|
||||||
|
|
||||||
|
|
||||||
|
def is_loaded():
    """Return True once the module-level ``generator`` has been set."""
    loaded = generator is not None
    return loaded
|
||||||
|
|
||||||
|
|
||||||
|
def infer_waveform(mel, progress_callback=None):
    """Run the loaded generator on ``mel`` and return the audio as a NumPy array.

    ``mel`` is converted to a float tensor, moved to the module device and
    given a leading batch axis. ``progress_callback`` is accepted but unused.

    Raises:
        Exception: if no generator has been loaded yet.
    """
    if generator is None:
        raise Exception("Please load fre-gan in memory before using it")

    batch = torch.FloatTensor(mel).to(_device).unsqueeze(0)

    with torch.no_grad():
        y_g_hat = generator(batch)
        audio = y_g_hat.squeeze().cpu().numpy()

    return audio
|
||||||
|
|
35
vocoder/fregan/loss.py
Normal file
35
vocoder/fregan/loss.py
Normal file
@ -0,0 +1,35 @@
|
|||||||
|
import torch
|
||||||
|
|
||||||
|
|
||||||
|
def feature_loss(fmap_r, fmap_g):
    """Return twice the summed mean-absolute difference between paired feature maps.

    ``fmap_r`` and ``fmap_g`` are parallel sequences (one entry per
    discriminator) of parallel sequences of tensors (one per layer).
    """
    per_layer = (
        torch.mean(torch.abs(real - fake))
        for real_maps, fake_maps in zip(fmap_r, fmap_g)
        for real, fake in zip(real_maps, fake_maps)
    )
    return sum(per_layer) * 2
|
||||||
|
|
||||||
|
|
||||||
|
def discriminator_loss(disc_real_outputs, disc_generated_outputs):
    """Least-squares discriminator loss over paired real / generated outputs.

    Returns:
        ``(loss, r_losses, g_losses)``: the summed loss, plus the per-pair
        real and generated terms as Python floats.
    """
    total = 0
    real_terms, fake_terms = [], []
    for real_out, fake_out in zip(disc_real_outputs, disc_generated_outputs):
        # Real outputs are pushed towards 1, generated outputs towards 0.
        real_term = torch.mean((1 - real_out) ** 2)
        fake_term = torch.mean(fake_out ** 2)
        total = total + (real_term + fake_term)
        real_terms.append(real_term.item())
        fake_terms.append(fake_term.item())

    return total, real_terms, fake_terms
|
||||||
|
|
||||||
|
|
||||||
|
def generator_loss(disc_outputs):
    """Least-squares generator loss: each discriminator output is pushed towards 1.

    Returns:
        ``(loss, gen_losses)``: the summed loss and the list of per-output
        loss tensors.
    """
    gen_losses = [torch.mean((1 - fake_out) ** 2) for fake_out in disc_outputs]
    return sum(gen_losses), gen_losses
|
176
vocoder/fregan/meldataset.py
Normal file
176
vocoder/fregan/meldataset.py
Normal file
@ -0,0 +1,176 @@
|
|||||||
|
import math
|
||||||
|
import os
|
||||||
|
import random
|
||||||
|
import torch
|
||||||
|
import torch.utils.data
|
||||||
|
import numpy as np
|
||||||
|
from librosa.util import normalize
|
||||||
|
from scipy.io.wavfile import read
|
||||||
|
from librosa.filters import mel as librosa_mel_fn
|
||||||
|
|
||||||
|
MAX_WAV_VALUE = 32768.0
|
||||||
|
|
||||||
|
|
||||||
|
def load_wav(full_path):
    """Read a WAV file and return ``(samples, sampling_rate)``."""
    rate, samples = read(full_path)
    return samples, rate
|
||||||
|
|
||||||
|
|
||||||
|
def dynamic_range_compression(x, C=1, clip_val=1e-5):
    """Return ``log(max(x, clip_val) * C)`` element-wise (NumPy)."""
    floored = np.clip(x, a_min=clip_val, a_max=None)
    return np.log(floored * C)
|
||||||
|
|
||||||
|
|
||||||
|
def dynamic_range_decompression(x, C=1):
    """Return ``exp(x) / C`` element-wise (NumPy); the inverse of the compression above the clip floor."""
    expanded = np.exp(x)
    return expanded / C
|
||||||
|
|
||||||
|
|
||||||
|
def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
    """Return ``log(max(x, clip_val) * C)`` element-wise (torch)."""
    floored = torch.clamp(x, min=clip_val)
    return torch.log(floored * C)
|
||||||
|
|
||||||
|
|
||||||
|
def dynamic_range_decompression_torch(x, C=1):
    """Return ``exp(x) / C`` element-wise (torch)."""
    expanded = torch.exp(x)
    return expanded / C
|
||||||
|
|
||||||
|
|
||||||
|
def spectral_normalize_torch(magnitudes):
    """Log-compress ``magnitudes`` using ``dynamic_range_compression_torch`` with its defaults."""
    return dynamic_range_compression_torch(magnitudes)
|
||||||
|
|
||||||
|
|
||||||
|
def spectral_de_normalize_torch(magnitudes):
    """Undo the log compression using ``dynamic_range_decompression_torch`` with its defaults."""
    return dynamic_range_decompression_torch(magnitudes)
|
||||||
|
|
||||||
|
|
||||||
|
# Module-level dictionaries that mel_spectrogram() fills with the mel
# filterbank and Hann window tensors it builds.
mel_basis = {}
hann_window = {}
|
||||||
|
|
||||||
|
|
||||||
|
def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
    """Compute a log-compressed mel spectrogram of ``y``.

    Args:
        y: waveform tensor; a warning line is printed when values fall
            outside ``[-1, 1]``. It is unsqueezed at dim 1 for padding, so a
            2D tensor is assumed (``(batch, samples)`` presumably — confirm
            against callers).
        n_fft: FFT size.
        num_mels: number of mel bands.
        sampling_rate: sampling rate used to build the mel filterbank.
        hop_size: STFT hop length.
        win_size: STFT window length.
        fmin: lowest mel filter frequency.
        fmax: highest mel filter frequency.
        center: forwarded to ``torch.stft``.

    Returns:
        Tensor: mel filterbank applied to the STFT magnitude, then
        log-compressed with ``spectral_normalize_torch``.
    """
    if torch.min(y) < -1.:
        print('min value is ', torch.min(y))
    if torch.max(y) > 1.:
        print('max value is ', torch.max(y))

    global mel_basis, hann_window
    device_name = str(y.device)
    # The cache used to be tested with the bare `fmax` while entries were
    # stored under a string key, so the lookup never hit and the filterbank
    # was rebuilt on every call. Key on everything that determines the
    # tensors so that a cache hit is always valid.
    mel_key = (sampling_rate, n_fft, num_mels, fmin, fmax, device_name)
    window_key = (win_size, device_name)
    if mel_key not in mel_basis:
        # Keyword arguments work on both old and new librosa releases
        # (positional use was removed in librosa 0.10).
        mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
        mel_basis[mel_key] = torch.from_numpy(mel).float().to(y.device)
    if window_key not in hann_window:
        hann_window[window_key] = torch.hann_window(win_size).to(y.device)

    # Reflect-pad manually so that, with center=False, frames line up with hop_size.
    pad_amount = int((n_fft - hop_size) / 2)
    y = torch.nn.functional.pad(y.unsqueeze(1), (pad_amount, pad_amount), mode='reflect')
    y = y.squeeze(1)

    # return_complex must be given explicitly on current PyTorch; view_as_real
    # restores the trailing (real, imag) axis the magnitude computation expects.
    spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[window_key],
                      center=center, pad_mode='reflect', normalized=False, onesided=True,
                      return_complex=True)
    spec = torch.view_as_real(spec)

    # Magnitude, with a small epsilon to keep sqrt differentiable at zero.
    spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))

    spec = torch.matmul(mel_basis[mel_key], spec)
    spec = spectral_normalize_torch(spec)

    return spec
|
||||||
|
|
||||||
|
|
||||||
|
def get_dataset_filelist(a):
    """Split the files in ``a.input_wavs_dir`` into training and validation lists.

    The directory listing is shuffled and 5% of the files (rounded down) are
    held out for validation. When that rounds to zero but at least two files
    exist, one file is held out so that neither list is empty.

    The previous slicing (``files[:-n]`` / ``files[-n:]``) returned an EMPTY
    training list and put every file into validation whenever ``n`` was 0,
    i.e. for any directory with fewer than 20 files.

    Args:
        a: object exposing an ``input_wavs_dir`` attribute.

    Returns:
        ``(training_files, validation_files)``: two lists of joined paths.
    """
    files = os.listdir(a.input_wavs_dir)
    random.shuffle(files)
    files = [os.path.join(a.input_wavs_dir, f) for f in files]

    n_validation = int(len(files) * 0.05)
    if n_validation == 0 and len(files) > 1:
        n_validation = 1

    # Use an explicit split index: a negative-zero slice would select the wrong side.
    split_at = len(files) - n_validation
    training_files = files[:split_at]
    validation_files = files[split_at:]
    return training_files, validation_files
|
||||||
|
|
||||||
|
|
||||||
|
class MelDataset(torch.utils.data.Dataset):
    """Dataset yielding ``(mel, audio, filename, mel_loss)`` tuples.

    Audio is read with ``np.load`` from the paths in ``training_files``.
    Outside fine-tuning the mel is computed from the audio; in fine-tuning
    mode it is loaded from ``base_mels_path``.
    """

    def __init__(self, training_files, segment_size, n_fft, num_mels,
                 hop_size, win_size, sampling_rate, fmin, fmax, split=True, shuffle=True, n_cache_reuse=1,
                 device=None, fmax_loss=None, fine_tuning=False, base_mels_path=None):
        self.audio_files = training_files
        # NOTE: this reseeds the global `random` module, and the shuffle below
        # reorders the caller's list in place (no copy is taken).
        random.seed(1234)
        if shuffle:
            random.shuffle(self.audio_files)
        self.segment_size = segment_size
        self.sampling_rate = sampling_rate
        self.split = split
        self.n_fft = n_fft
        self.num_mels = num_mels
        self.hop_size = hop_size
        self.win_size = win_size
        self.fmin = fmin
        self.fmax = fmax
        self.fmax_loss = fmax_loss
        # Last loaded audio and how many more times it will be reused.
        self.cached_wav = None
        self.n_cache_reuse = n_cache_reuse
        self._cache_ref_count = 0
        self.device = device
        self.fine_tuning = fine_tuning
        self.base_mels_path = base_mels_path

    def __getitem__(self, index):
        """Return ``(mel, audio, filename, mel_loss)`` for item ``index``."""
        filename = self.audio_files[index]
        if self._cache_ref_count == 0:
            # Earlier WAV-based loading, replaced by np.load:
            #audio, sampling_rate = load_wav(filename)
            #audio = audio / MAX_WAV_VALUE
            audio = np.load(filename)
            if not self.fine_tuning:
                audio = normalize(audio) * 0.95
            self.cached_wav = audio
            # Sampling-rate check is disabled because np.load carries no rate:
            #if sampling_rate != self.sampling_rate:
            #    raise ValueError("{} SR doesn't match target {} SR".format(
            #        sampling_rate, self.sampling_rate))
            self._cache_ref_count = self.n_cache_reuse
        else:
            # NOTE(review): the cached audio is reused regardless of `index`,
            # so it may not belong to `filename` — confirm this is intended.
            audio = self.cached_wav
            self._cache_ref_count -= 1

        audio = torch.FloatTensor(audio)
        audio = audio.unsqueeze(0)

        if not self.fine_tuning:
            if self.split:
                # Random crop to segment_size samples, or zero-pad on the right if shorter.
                if audio.size(1) >= self.segment_size:
                    max_audio_start = audio.size(1) - self.segment_size
                    audio_start = random.randint(0, max_audio_start)
                    audio = audio[:, audio_start:audio_start+self.segment_size]
                else:
                    audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), 'constant')

            mel = mel_spectrogram(audio, self.n_fft, self.num_mels,
                                  self.sampling_rate, self.hop_size, self.win_size, self.fmin, self.fmax,
                                  center=False)
        else:
            # The mel file name is "mel-" plus the part of the audio file name after its last "-".
            mel_path = os.path.join(self.base_mels_path, "mel" + "-" + filename.split("/")[-1].split("-")[-1])
            mel = np.load(mel_path).T
            # Earlier lookup by audio basename:
            #mel = np.load(
            #    os.path.join(self.base_mels_path, os.path.splitext(os.path.split(filename)[-1])[0] + '.npy'))
            mel = torch.from_numpy(mel)

            if len(mel.shape) < 3:
                mel = mel.unsqueeze(0)

            if self.split:
                frames_per_seg = math.ceil(self.segment_size / self.hop_size)

                if audio.size(1) >= self.segment_size:
                    # Crop mel and audio together so the two stay aligned.
                    mel_start = random.randint(0, mel.size(2) - frames_per_seg - 1)
                    mel = mel[:, :, mel_start:mel_start + frames_per_seg]
                    audio = audio[:, mel_start * self.hop_size:(mel_start + frames_per_seg) * self.hop_size]
                else:
                    mel = torch.nn.functional.pad(mel, (0, frames_per_seg - mel.size(2)), 'constant')
                    audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), 'constant')

        # Second mel of the same audio, computed with fmax_loss instead of fmax.
        mel_loss = mel_spectrogram(audio, self.n_fft, self.num_mels,
                                   self.sampling_rate, self.hop_size, self.win_size, self.fmin, self.fmax_loss,
                                   center=False)

        return (mel.squeeze(), audio.squeeze(0), filename, mel_loss.squeeze())

    def __len__(self):
        """Return the number of audio files."""
        return len(self.audio_files)
|
201
vocoder/fregan/modules.py
Normal file
201
vocoder/fregan/modules.py
Normal file
@ -0,0 +1,201 @@
|
|||||||
|
import torch
|
||||||
|
import torch.nn.functional as F
|
||||||
|
|
||||||
|
class KernelPredictor(torch.nn.Module):
    ''' Kernel predictor for the location-variable convolutions
    '''

    def __init__(self,
                 cond_channels,
                 conv_in_channels,
                 conv_out_channels,
                 conv_layers,
                 conv_kernel_size=3,
                 kpnet_hidden_channels=64,
                 kpnet_conv_size=3,
                 kpnet_dropout=0.0,
                 kpnet_nonlinear_activation="LeakyReLU",
                 kpnet_nonlinear_activation_params=None
                 ):
        '''
        Args:
            cond_channels (int): number of channels of the conditioning sequence
            conv_in_channels (int): number of input channels of the predicted convolutions
            conv_out_channels (int): number of output channels of the predicted convolutions
            conv_layers (int): number of convolution layers to predict kernels and biases for
            conv_kernel_size (int): kernel size of the predicted convolutions
            kpnet_hidden_channels (int): hidden channel count of the predictor network
            kpnet_conv_size (int): kernel size of the predictor's own convolutions
            kpnet_dropout (float): dropout probability inside the residual stack
            kpnet_nonlinear_activation (str): name of the activation class in ``torch.nn``
            kpnet_nonlinear_activation_params (dict or None): keyword arguments for the
                activation; ``None`` means ``{"negative_slope": 0.1}``
        '''
        super().__init__()

        # Build the default per call instead of sharing one mutable dict
        # across every instance through the signature.
        if kpnet_nonlinear_activation_params is None:
            kpnet_nonlinear_activation_params = {"negative_slope": 0.1}

        self.conv_in_channels = conv_in_channels
        self.conv_out_channels = conv_out_channels
        self.conv_kernel_size = conv_kernel_size
        self.conv_layers = conv_layers

        # Flattened sizes of all predicted kernels / biases per conditioning frame.
        l_w = conv_in_channels * conv_out_channels * conv_kernel_size * conv_layers
        l_b = conv_out_channels * conv_layers

        padding = (kpnet_conv_size - 1) // 2

        def _activation():
            # Fresh activation module of the configured type.
            return getattr(torch.nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params)

        def _hidden_conv():
            # Length-preserving conv between hidden feature maps.
            return torch.nn.Conv1d(kpnet_hidden_channels, kpnet_hidden_channels, kpnet_conv_size,
                                   padding=padding, bias=True)

        self.input_conv = torch.nn.Sequential(
            torch.nn.Conv1d(cond_channels, kpnet_hidden_channels, 5, padding=(5 - 1) // 2, bias=True),
            _activation(),
        )

        # Three groups of (dropout, conv, act, conv, act). The layer order is
        # unchanged, so the Sequential indices used in state_dict keys match
        # the previous hand-written layout.
        residual_layers = []
        for _ in range(3):
            residual_layers.extend([
                torch.nn.Dropout(kpnet_dropout),
                _hidden_conv(),
                _activation(),
                _hidden_conv(),
                _activation(),
            ])
        self.residual_conv = torch.nn.Sequential(*residual_layers)

        self.kernel_conv = torch.nn.Conv1d(kpnet_hidden_channels, l_w, kpnet_conv_size,
                                           padding=padding, bias=True)
        self.bias_conv = torch.nn.Conv1d(kpnet_hidden_channels, l_b, kpnet_conv_size, padding=padding,
                                         bias=True)

    def forward(self, c):
        '''
        Args:
            c (Tensor): the conditioning sequence (batch, cond_channels, cond_length)
        Returns:
            kernels (Tensor): (batch, conv_layers, conv_in_channels, conv_out_channels,
                conv_kernel_size, cond_length)
            bias (Tensor): (batch, conv_layers, conv_out_channels, cond_length)
        '''
        batch, cond_channels, cond_length = c.shape

        c = self.input_conv(c)
        c = c + self.residual_conv(c)
        k = self.kernel_conv(c)
        b = self.bias_conv(c)

        kernels = k.contiguous().view(batch,
                                      self.conv_layers,
                                      self.conv_in_channels,
                                      self.conv_out_channels,
                                      self.conv_kernel_size,
                                      cond_length)
        bias = b.contiguous().view(batch,
                                   self.conv_layers,
                                   self.conv_out_channels,
                                   cond_length)
        return kernels, bias
|
||||||
|
|
||||||
|
|
||||||
|
class LVCBlock(torch.nn.Module):
    ''' Location-variable convolution block.

    Upsamples the input with a transposed convolution, then applies
    ``conv_layers`` gated residual steps whose convolution kernels and biases
    are predicted from the conditioning sequence by a ``KernelPredictor``.
    '''

    def __init__(self,
                 in_channels,
                 cond_channels,
                 upsample_ratio,
                 conv_layers=4,
                 conv_kernel_size=3,
                 cond_hop_length=256,
                 kpnet_hidden_channels=64,
                 kpnet_conv_size=3,
                 kpnet_dropout=0.0
                 ):
        super().__init__()

        self.cond_hop_length = cond_hop_length
        self.conv_layers = conv_layers
        self.conv_kernel_size = conv_kernel_size
        self.convs = torch.nn.ModuleList()

        # Transposed convolution with stride upsample_ratio.
        self.upsample = torch.nn.ConvTranspose1d(in_channels, in_channels,
                                                 kernel_size=upsample_ratio*2, stride=upsample_ratio,
                                                 padding=upsample_ratio // 2 + upsample_ratio % 2,
                                                 output_padding=upsample_ratio % 2)

        # Predicts 2 * in_channels outputs per layer: one half feeds the
        # sigmoid gate and the other the tanh in forward().
        self.kernel_predictor = KernelPredictor(
            cond_channels=cond_channels,
            conv_in_channels=in_channels,
            conv_out_channels=2 * in_channels,
            conv_layers=conv_layers,
            conv_kernel_size=conv_kernel_size,
            kpnet_hidden_channels=kpnet_hidden_channels,
            kpnet_conv_size=kpnet_conv_size,
            kpnet_dropout=kpnet_dropout
        )

        # Ordinary convolutions with dilation 3**i, padded to preserve length.
        for i in range(conv_layers):
            padding = (3 ** i) * int((conv_kernel_size - 1) / 2)
            conv = torch.nn.Conv1d(in_channels, in_channels, kernel_size=conv_kernel_size, padding=padding, dilation=3 ** i)

            self.convs.append(conv)

    def forward(self, x, c):
        ''' forward propagation of the location-variable convolutions.
        Args:
            x (Tensor): the input sequence (batch, in_channels, in_length)
            c (Tensor): the conditioning sequence (batch, cond_channels, cond_length)

        Returns:
            Tensor: the output sequence (batch, in_channels, out_length), where
                out_length is the length produced by self.upsample. The
                location-variable convolution asserts that this equals
                cond_length * cond_hop_length.
        '''
        batch, in_channels, in_length = x.shape

        kernels, bias = self.kernel_predictor(c)

        x = F.leaky_relu(x, 0.2)
        x = self.upsample(x)

        for i in range(self.conv_layers):
            y = F.leaky_relu(x, 0.2)
            y = self.convs[i](y)
            y = F.leaky_relu(y, 0.2)

            # Select the kernels and biases predicted for layer i.
            k = kernels[:, i, :, :, :, :]
            b = bias[:, i, :, :]
            y = self.location_variable_convolution(y, k, b, 1, self.cond_hop_length)
            # Gated residual update: first half of the channels gates the second half.
            x = x + torch.sigmoid(y[:, :in_channels, :]) * torch.tanh(y[:, in_channels:, :])
        return x

    def location_variable_convolution(self, x, kernel, bias, dilation, hop_size):
        ''' perform location-variable convolution operation on the input sequence (x) using the local convolution kernel.
        Time: 414 μs ± 309 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each), test on NVIDIA V100.
        Args:
            x (Tensor): the input sequence (batch, in_channels, in_length).
            kernel (Tensor): the local convolution kernel (batch, in_channel, out_channels, kernel_size, kernel_length)
            bias (Tensor): the bias for the local convolution (batch, out_channels, kernel_length)
            dilation (int): the dilation of convolution.
            hop_size (int): the hop_size of the conditioning sequence.
        Returns:
            (Tensor): the output sequence after performing local convolution. (batch, out_channels, in_length).
        '''
        batch, in_channels, in_length = x.shape
        batch, in_channels, out_channels, kernel_size, kernel_length = kernel.shape

        # Each predicted kernel covers exactly hop_size input samples.
        assert in_length == (kernel_length * hop_size), "length of (x, kernel) is not matched"

        padding = dilation * int((kernel_size - 1) / 2)
        x = F.pad(x, (padding, padding), 'constant', 0)  # (batch, in_channels, in_length + 2*padding)
        x = x.unfold(2, hop_size + 2 * padding, hop_size)  # (batch, in_channels, kernel_length, hop_size + 2*padding)

        if hop_size < dilation:
            x = F.pad(x, (0, dilation), 'constant', 0)
        x = x.unfold(3, dilation,
                     dilation)  # (batch, in_channels, kernel_length, (hop_size + 2*padding)/dilation, dilation)
        x = x[:, :, :, :, :hop_size]
        x = x.transpose(3, 4)  # (batch, in_channels, kernel_length, dilation, (hop_size + 2*padding)/dilation)
        x = x.unfold(4, kernel_size, 1)  # (batch, in_channels, kernel_length, dilation, _, kernel_size)

        # Contract over input channels (i) and kernel taps (k), per conditioning frame (l).
        o = torch.einsum('bildsk,biokl->bolsd', x, kernel)
        o = o + bias.unsqueeze(-1).unsqueeze(-1)
        o = o.contiguous().view(batch, out_channels, -1)
        return o
|
1
vocoder/fregan/requirements.txt
Normal file
1
vocoder/fregan/requirements.txt
Normal file
@ -0,0 +1 @@
|
|||||||
|
PyWavelets
|
136
vocoder/fregan/stft_loss.py
Normal file
136
vocoder/fregan/stft_loss.py
Normal file
@ -0,0 +1,136 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
# Copyright 2019 Tomoki Hayashi
|
||||||
|
# MIT License (https://opensource.org/licenses/MIT)
|
||||||
|
|
||||||
|
"""STFT-based Loss modules."""
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import torch.nn.functional as F
|
||||||
|
|
||||||
|
|
||||||
|
def stft(x, fft_size, hop_size, win_length, window):
    """Perform STFT and convert to magnitude spectrogram.

    Args:
        x (Tensor): Input signal tensor (B, T).
        fft_size (int): FFT size.
        hop_size (int): Hop size.
        win_length (int): Window length.
        window (Tensor): Window tensor passed straight to ``torch.stft``.

    Returns:
        Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1).

    """
    # Current PyTorch raises for real inputs unless return_complex is given,
    # so request the complex result and read its real / imaginary parts.
    x_stft = torch.stft(x, fft_size, hop_size, win_length, window, return_complex=True)
    real = x_stft.real
    imag = x_stft.imag

    # NOTE(kan-bayashi): clamp is needed to avoid nan or inf
    return torch.sqrt(torch.clamp(real ** 2 + imag ** 2, min=1e-7)).transpose(2, 1)
|
||||||
|
|
||||||
|
|
||||||
|
class SpectralConvergengeLoss(torch.nn.Module):
    """Spectral convergence loss: relative Frobenius error between magnitude spectrograms."""

    def __init__(self):
        """Set up the (parameter-free) module."""
        super().__init__()

    def forward(self, x_mag, y_mag):
        """Compute the spectral convergence loss.

        Args:
            x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
            y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).

        Returns:
            Tensor: Frobenius norm of ``y_mag - x_mag`` divided by that of ``y_mag``.

        """
        error_norm = torch.norm(y_mag - x_mag, p="fro")
        reference_norm = torch.norm(y_mag, p="fro")
        return error_norm / reference_norm
|
||||||
|
|
||||||
|
|
||||||
|
class LogSTFTMagnitudeLoss(torch.nn.Module):
    """Log STFT magnitude loss: L1 distance between log-magnitude spectrograms."""

    def __init__(self):
        """Set up the module; it holds no parameters or state."""
        super().__init__()

    def forward(self, x_mag, y_mag):
        """Compute the log STFT magnitude loss.

        Args:
            x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
            y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).

        Returns:
            Tensor: Mean absolute difference of ``log(y_mag)`` and ``log(x_mag)``.

        """
        log_target = torch.log(y_mag)
        log_predicted = torch.log(x_mag)
        return F.l1_loss(log_target, log_predicted)
|
||||||
|
|
||||||
|
|
||||||
|
class STFTLoss(torch.nn.Module):
    """STFT loss module: spectral convergence + log magnitude at one resolution."""

    def __init__(self, fft_size=1024, shift_size=120, win_length=600, window="hann_window"):
        """Initialize STFT loss module.

        Args:
            fft_size (int): FFT size.
            shift_size (int): Hop size between frames.
            win_length (int): Window length.
            window (str): Name of a ``torch`` window factory (looked up with
                ``getattr(torch, window)`` and called with ``win_length``).

        """
        super(STFTLoss, self).__init__()
        self.fft_size = fft_size
        self.shift_size = shift_size
        self.win_length = win_length
        self.window = getattr(torch, window)(win_length)
        self.spectral_convergenge_loss = SpectralConvergengeLoss()
        self.log_stft_magnitude_loss = LogSTFTMagnitudeLoss()

    def forward(self, x, y):
        """Calculate forward propagation.

        Args:
            x (Tensor): Predicted signal (B, T).
            y (Tensor): Groundtruth signal (B, T).

        Returns:
            Tensor: Spectral convergence loss value.
            Tensor: Log STFT magnitude loss value.

        """
        # Use ``x.device`` rather than ``x.get_device()``: the latter returns -1
        # for CPU tensors, which is not a valid target for ``Tensor.to``.
        window = self.window.to(x.device)
        x_mag = stft(x, self.fft_size, self.shift_size, self.win_length, window)
        y_mag = stft(y, self.fft_size, self.shift_size, self.win_length, window)
        sc_loss = self.spectral_convergenge_loss(x_mag, y_mag)
        mag_loss = self.log_stft_magnitude_loss(x_mag, y_mag)

        return sc_loss, mag_loss
|
||||||
|
|
||||||
|
|
||||||
|
class MultiResolutionSTFTLoss(torch.nn.Module):
    """Average of several ``STFTLoss`` modules run at different resolutions."""

    def __init__(self,
                 fft_sizes=[1024, 2048, 512],
                 hop_sizes=[120, 240, 50],
                 win_lengths=[600, 1200, 240],
                 window="hann_window"):
        """Build one ``STFTLoss`` per (fft size, hop size, window length) triple.

        Args:
            fft_sizes (list): List of FFT sizes.
            hop_sizes (list): List of hop sizes.
            win_lengths (list): List of window lengths.
            window (str): Window function type.

        """
        super().__init__()
        # The three lists are consumed in lockstep, so they must line up.
        assert len(fft_sizes) == len(hop_sizes) == len(win_lengths)
        resolutions = zip(fft_sizes, hop_sizes, win_lengths)
        self.stft_losses = torch.nn.ModuleList(
            [STFTLoss(n_fft, hop, win, window) for n_fft, hop, win in resolutions]
        )

    def forward(self, x, y):
        """Average both loss terms over every configured resolution.

        Args:
            x (Tensor): Predicted signal (B, T).
            y (Tensor): Groundtruth signal (B, T).

        Returns:
            Tensor: Multi resolution spectral convergence loss value.
            Tensor: Multi resolution log STFT magnitude loss value.

        """
        sc_total = 0.0
        mag_total = 0.0
        for resolution_loss in self.stft_losses:
            sc_part, mag_part = resolution_loss(x, y)
            sc_total = sc_total + sc_part
            mag_total = mag_total + mag_part

        n_resolutions = len(self.stft_losses)
        return sc_total / n_resolutions, mag_total / n_resolutions
|
253
vocoder/fregan/train.py
Normal file
253
vocoder/fregan/train.py
Normal file
@ -0,0 +1,253 @@
|
|||||||
|
import warnings
|
||||||
|
|
||||||
|
warnings.simplefilter(action='ignore', category=FutureWarning)
|
||||||
|
import itertools
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import torch
|
||||||
|
import torch.nn.functional as F
|
||||||
|
from torch.utils.tensorboard import SummaryWriter
|
||||||
|
from torch.utils.data import DistributedSampler, DataLoader
|
||||||
|
import torch.multiprocessing as mp
|
||||||
|
from torch.distributed import init_process_group
|
||||||
|
from torch.nn.parallel import DistributedDataParallel
|
||||||
|
from vocoder.fregan.utils import AttrDict, build_env
|
||||||
|
from vocoder.fregan.meldataset import MelDataset, mel_spectrogram, get_dataset_filelist
|
||||||
|
from vocoder.fregan.generator import FreGAN
|
||||||
|
from vocoder.fregan.discriminator import ResWiseMultiPeriodDiscriminator, ResWiseMultiScaleDiscriminator
|
||||||
|
from vocoder.fregan.loss import feature_loss, generator_loss, discriminator_loss
|
||||||
|
from vocoder.fregan.utils import plot_spectrogram, scan_checkpoint, load_checkpoint, save_checkpoint
|
||||||
|
from vocoder.fregan.stft_loss import MultiResolutionSTFTLoss
|
||||||
|
|
||||||
|
torch.backends.cudnn.benchmark = True
|
||||||
|
|
||||||
|
|
||||||
|
def train(rank, a, h):
    """Train the Fre-GAN generator together with its two discriminators.

    Args:
        rank: Index of this process; also selects the CUDA device
            (``cuda:<rank>``). Only rank 0 prints, checkpoints, writes
            TensorBoard summaries and runs validation.
        a: Argument namespace. Reads ``models_dir``, ``run_id``, ``syn_dir`` and
            ``backup_every``; several attributes (``checkpoint_path``,
            ``training_epochs``, the logging intervals, ``fine_tuning``,
            ``input_wavs_dir``, ``input_mels_dir``) are overwritten here.
        h: Hyper-parameter object with attribute access (``num_gpus``,
            ``dist_config``, ``seed``, ``learning_rate``, ``adam_b1``,
            ``adam_b2``, ``lr_decay``, the mel/STFT settings, ``batch_size``,
            ``num_workers``, ...).

    Checkpoints are written to ``<models_dir>/<run_id>_fregan`` as
    ``m_fregan_<steps>`` (generator) and ``do_fregan_<steps>`` (discriminators,
    optimizers, step and epoch counters). If both kinds are present when
    training starts, the newest pair is loaded and training resumes from it.
    """
    a.checkpoint_path = a.models_dir.joinpath(a.run_id+'_fregan')
    a.checkpoint_path.mkdir(exist_ok=True)
    a.training_epochs = 3100
    a.stdout_interval = 5
    a.checkpoint_interval = a.backup_every
    a.summary_interval = 5000
    a.validation_interval = 1000
    a.fine_tuning = True

    a.input_wavs_dir = a.syn_dir.joinpath("audio")
    a.input_mels_dir = a.syn_dir.joinpath("mels")

    if h.num_gpus > 1:
        init_process_group(backend=h.dist_config['dist_backend'], init_method=h.dist_config['dist_url'],
                           world_size=h.dist_config['world_size'] * h.num_gpus, rank=rank)

    torch.cuda.manual_seed(h.seed)
    device = torch.device('cuda:{:d}'.format(rank))

    generator = FreGAN(h).to(device)
    mpd = ResWiseMultiPeriodDiscriminator().to(device)
    msd = ResWiseMultiScaleDiscriminator().to(device)

    if rank == 0:
        print(generator)
        os.makedirs(a.checkpoint_path, exist_ok=True)
        print("checkpoints directory : ", a.checkpoint_path)

    # Defaults keep the names bound even if the directory check below fails.
    cp_g = None
    cp_do = None
    if os.path.isdir(a.checkpoint_path):
        # The prefixes must match the file names written in the checkpointing
        # step further down ("m_fregan_<steps>" / "do_fregan_<steps>");
        # scan_checkpoint appends exactly eight wildcard characters to them.
        cp_g = scan_checkpoint(a.checkpoint_path, 'm_fregan_')
        cp_do = scan_checkpoint(a.checkpoint_path, 'do_fregan_')

    steps = 0
    if cp_g is None or cp_do is None:
        state_dict_do = None
        last_epoch = -1
    else:
        state_dict_g = load_checkpoint(cp_g, device)
        state_dict_do = load_checkpoint(cp_do, device)
        generator.load_state_dict(state_dict_g['generator'])
        mpd.load_state_dict(state_dict_do['mpd'])
        msd.load_state_dict(state_dict_do['msd'])
        steps = state_dict_do['steps'] + 1
        last_epoch = state_dict_do['epoch']

    if h.num_gpus > 1:
        generator = DistributedDataParallel(generator, device_ids=[rank]).to(device)
        mpd = DistributedDataParallel(mpd, device_ids=[rank]).to(device)
        msd = DistributedDataParallel(msd, device_ids=[rank]).to(device)

    optim_g = torch.optim.AdamW(generator.parameters(), h.learning_rate, betas=[h.adam_b1, h.adam_b2])
    optim_d = torch.optim.AdamW(itertools.chain(msd.parameters(), mpd.parameters()),
                                h.learning_rate, betas=[h.adam_b1, h.adam_b2])

    if state_dict_do is not None:
        optim_g.load_state_dict(state_dict_do['optim_g'])
        optim_d.load_state_dict(state_dict_do['optim_d'])

    scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=h.lr_decay, last_epoch=last_epoch)
    scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma=h.lr_decay, last_epoch=last_epoch)

    training_filelist, validation_filelist = get_dataset_filelist(a)

    trainset = MelDataset(training_filelist, h.segment_size, h.n_fft, h.num_mels,
                          h.hop_size, h.win_size, h.sampling_rate, h.fmin, h.fmax, n_cache_reuse=0,
                          shuffle=False if h.num_gpus > 1 else True, fmax_loss=h.fmax_for_loss, device=device,
                          fine_tuning=a.fine_tuning, base_mels_path=a.input_mels_dir)

    train_sampler = DistributedSampler(trainset) if h.num_gpus > 1 else None

    train_loader = DataLoader(trainset, num_workers=h.num_workers, shuffle=False,
                              sampler=train_sampler,
                              batch_size=h.batch_size,
                              pin_memory=True,
                              drop_last=True)

    if rank == 0:
        validset = MelDataset(validation_filelist, h.segment_size, h.n_fft, h.num_mels,
                              h.hop_size, h.win_size, h.sampling_rate, h.fmin, h.fmax, False, False, n_cache_reuse=0,
                              fmax_loss=h.fmax_for_loss, device=device, fine_tuning=a.fine_tuning,
                              base_mels_path=a.input_mels_dir)
        validation_loader = DataLoader(validset, num_workers=1, shuffle=False,
                                       sampler=None,
                                       batch_size=1,
                                       pin_memory=True,
                                       drop_last=True)

        sw = SummaryWriter(os.path.join(a.checkpoint_path, 'logs'))

    generator.train()
    mpd.train()
    msd.train()
    for epoch in range(max(0, last_epoch), a.training_epochs):
        if rank == 0:
            start = time.time()
            print("Epoch: {}".format(epoch + 1))

        if h.num_gpus > 1:
            train_sampler.set_epoch(epoch)

        for batch in train_loader:
            if rank == 0:
                start_b = time.time()
            x, y, _, y_mel = batch
            # Plain tensors suffice; the old autograd.Variable wrapper is a no-op.
            x = x.to(device, non_blocking=True)
            y = y.to(device, non_blocking=True)
            y_mel = y_mel.to(device, non_blocking=True)

            y = y.unsqueeze(1)

            y_g_hat = generator(x)

            y_g_hat_mel = mel_spectrogram(y_g_hat.squeeze(1), h.n_fft, h.num_mels, h.sampling_rate, h.hop_size,
                                          h.win_size,
                                          h.fmin, h.fmax_for_loss)

            # Discriminator step: generator output is detached so only the
            # discriminators receive gradients here.
            optim_d.zero_grad()

            # MPD
            y_df_hat_r, y_df_hat_g, _, _ = mpd(y, y_g_hat.detach())
            loss_disc_f, losses_disc_f_r, losses_disc_f_g = discriminator_loss(y_df_hat_r, y_df_hat_g)

            # MSD
            y_ds_hat_r, y_ds_hat_g, _, _ = msd(y, y_g_hat.detach())
            loss_disc_s, losses_disc_s_r, losses_disc_s_g = discriminator_loss(y_ds_hat_r, y_ds_hat_g)

            loss_disc_all = loss_disc_s + loss_disc_f

            loss_disc_all.backward()
            optim_d.step()

            # Generator
            optim_g.zero_grad()

            # L1 Mel-Spectrogram Loss
            loss_mel = F.l1_loss(y_mel, y_g_hat_mel) * 45

            # sc_loss, mag_loss = stft_loss(y_g_hat[:, :, :y.size(2)].squeeze(1), y.squeeze(1))
            # loss_mel = h.lambda_aux * (sc_loss + mag_loss)  # STFT Loss

            y_df_hat_r, y_df_hat_g, fmap_f_r, fmap_f_g = mpd(y, y_g_hat)
            y_ds_hat_r, y_ds_hat_g, fmap_s_r, fmap_s_g = msd(y, y_g_hat)
            loss_fm_f = feature_loss(fmap_f_r, fmap_f_g)
            loss_fm_s = feature_loss(fmap_s_r, fmap_s_g)
            loss_gen_f, losses_gen_f = generator_loss(y_df_hat_g)
            loss_gen_s, losses_gen_s = generator_loss(y_ds_hat_g)
            loss_gen_all = loss_gen_s + loss_gen_f + (2 * (loss_fm_s + loss_fm_f)) + loss_mel

            loss_gen_all.backward()
            optim_g.step()

            if rank == 0:
                # STDOUT logging
                if steps % a.stdout_interval == 0:
                    with torch.no_grad():
                        mel_error = F.l1_loss(y_mel, y_g_hat_mel).item()

                    print('Steps : {:d}, Gen Loss Total : {:4.3f}, Mel-Spec. Error : {:4.3f}, s/b : {:4.3f}'.
                          format(steps, loss_gen_all, mel_error, time.time() - start_b))

                # checkpointing
                if steps % a.checkpoint_interval == 0 and steps != 0:
                    checkpoint_path = "{}/m_fregan_{:08d}".format(a.checkpoint_path, steps)
                    save_checkpoint(checkpoint_path,
                                    {'generator': (generator.module if h.num_gpus > 1 else generator).state_dict()})
                    checkpoint_path = "{}/do_fregan_{:08d}".format(a.checkpoint_path, steps)
                    save_checkpoint(checkpoint_path,
                                    {'mpd': (mpd.module if h.num_gpus > 1
                                             else mpd).state_dict(),
                                     'msd': (msd.module if h.num_gpus > 1
                                             else msd).state_dict(),
                                     'optim_g': optim_g.state_dict(), 'optim_d': optim_d.state_dict(), 'steps': steps,
                                     'epoch': epoch})

                # Tensorboard summary logging
                # (mel_error comes from the STDOUT branch above; summary_interval
                # is a multiple of stdout_interval, so it is always set here.)
                if steps % a.summary_interval == 0:
                    sw.add_scalar("training/gen_loss_total", loss_gen_all, steps)
                    sw.add_scalar("training/mel_spec_error", mel_error, steps)

                # Validation
                if steps % a.validation_interval == 0:  # and steps != 0:
                    generator.eval()
                    torch.cuda.empty_cache()
                    val_err_tot = 0
                    with torch.no_grad():
                        for j, batch in enumerate(validation_loader):
                            x, y, _, y_mel = batch
                            y_g_hat = generator(x.to(device))
                            y_mel = y_mel.to(device, non_blocking=True)
                            y_g_hat_mel = mel_spectrogram(y_g_hat.squeeze(1), h.n_fft, h.num_mels, h.sampling_rate,
                                                          h.hop_size, h.win_size,
                                                          h.fmin, h.fmax_for_loss)
                            # NOTE(review): the accumulation below was disabled by
                            # the author, so the logged validation error is always
                            # 0. Presumably y_mel and y_g_hat_mel can differ in
                            # length here - confirm before re-enabling.
                            # val_err_tot += F.l1_loss(y_mel, y_g_hat_mel).item()

                            if j <= 4:
                                if steps == 0:
                                    sw.add_audio('gt/y_{}'.format(j), y[0], steps, h.sampling_rate)
                                    sw.add_figure('gt/y_spec_{}'.format(j), plot_spectrogram(x[0]), steps)

                                sw.add_audio('generated/y_hat_{}'.format(j), y_g_hat[0], steps, h.sampling_rate)
                                y_hat_spec = mel_spectrogram(y_g_hat.squeeze(1), h.n_fft, h.num_mels,
                                                             h.sampling_rate, h.hop_size, h.win_size,
                                                             h.fmin, h.fmax)
                                sw.add_figure('generated/y_hat_spec_{}'.format(j),
                                              plot_spectrogram(y_hat_spec.squeeze(0).cpu().numpy()), steps)

                        val_err = val_err_tot / (j + 1)
                        sw.add_scalar("validation/mel_spec_error", val_err, steps)

                    generator.train()

            steps += 1

        scheduler_g.step()
        scheduler_d.step()

        if rank == 0:
            print('Time taken for epoch {} is {} sec\n'.format(epoch + 1, int(time.time() - start)))
|
||||||
|
|
||||||
|
|
71
vocoder/fregan/utils.py
Normal file
71
vocoder/fregan/utils.py
Normal file
@ -0,0 +1,71 @@
|
|||||||
|
import glob
|
||||||
|
import os
|
||||||
|
import matplotlib
|
||||||
|
import torch
|
||||||
|
from torch.nn.utils import weight_norm
|
||||||
|
matplotlib.use("Agg")
|
||||||
|
import matplotlib.pylab as plt
|
||||||
|
import shutil
|
||||||
|
|
||||||
|
|
||||||
|
class AttrDict(dict):
    """Dictionary whose entries can also be read and written as attributes."""

    def __init__(self, *args, **kwargs):
        """Accept the same arguments as ``dict``."""
        super().__init__(*args, **kwargs)
        # Point the instance namespace at the dict itself so that ``d.key`` and
        # ``d["key"]`` share one storage.
        self.__dict__ = self
|
||||||
|
|
||||||
|
|
||||||
|
def build_env(config, config_name, path):
    """Copy the config file into ``path`` under the name ``config_name``.

    Nothing happens when ``config`` already is that destination path;
    otherwise ``path`` is created if needed and the file is copied.
    """
    destination = os.path.join(path, config_name)
    if config == destination:
        return
    os.makedirs(path, exist_ok=True)
    shutil.copyfile(config, destination)
|
||||||
|
|
||||||
|
|
||||||
|
def plot_spectrogram(spectrogram):
    """Render ``spectrogram`` as an image with a colorbar and return the figure.

    The figure is drawn and then closed in pyplot before being returned, so it
    is no longer tracked as pyplot's current figure.
    """
    figure, axes = plt.subplots(figsize=(10, 2))
    image = axes.imshow(
        spectrogram,
        aspect="auto",
        origin="lower",
        interpolation='none',
    )
    plt.colorbar(image, ax=axes)

    figure.canvas.draw()
    plt.close()

    return figure
|
||||||
|
|
||||||
|
|
||||||
|
def init_weights(m, mean=0.0, std=0.01):
    """Re-draw ``m.weight`` from N(mean, std) when the class name contains "Conv".

    Objects of any other class are left untouched.
    """
    is_conv = "Conv" in m.__class__.__name__
    if is_conv:
        m.weight.data.normal_(mean, std)
|
||||||
|
|
||||||
|
|
||||||
|
def apply_weight_norm(m):
    """Apply ``weight_norm`` to ``m`` when its class name contains "Conv".

    Objects of any other class are left untouched.
    """
    if "Conv" in m.__class__.__name__:
        weight_norm(m)
|
||||||
|
|
||||||
|
|
||||||
|
def get_padding(kernel_size, dilation=1):
    """Return ``(kernel_size * dilation - dilation) / 2`` truncated to an int."""
    dilated_span = kernel_size * dilation - dilation
    return int(dilated_span / 2)
|
||||||
|
|
||||||
|
|
||||||
|
def load_checkpoint(filepath, device):
    """Load a checkpoint with ``torch.load``, mapping its storages to ``device``.

    Asserts that ``filepath`` is an existing file and prints progress messages
    before and after loading.
    """
    assert os.path.isfile(filepath)
    print("Loading '{}'".format(filepath))
    loaded = torch.load(filepath, map_location=device)
    print("Complete.")
    return loaded
|
||||||
|
|
||||||
|
|
||||||
|
def save_checkpoint(filepath, obj):
    """Serialize ``obj`` to ``filepath`` with ``torch.save``, printing progress."""
    announcement = "Saving checkpoint to {}".format(filepath)
    print(announcement)
    torch.save(obj, filepath)
    print("Complete.")
|
||||||
|
|
||||||
|
|
||||||
|
def scan_checkpoint(cp_dir, prefix):
    """Return the lexicographically last ``<prefix>????????`` path in ``cp_dir``.

    The glob pattern is ``prefix`` followed by exactly eight wildcard
    characters. Returns ``None`` when nothing matches.
    """
    candidates = glob.glob(os.path.join(cp_dir, prefix + '????????'))
    if not candidates:
        return None
    return max(candidates)
|
@ -27,5 +27,10 @@
|
|||||||
"fmax": 7600,
|
"fmax": 7600,
|
||||||
"fmax_for_loss": null,
|
"fmax_for_loss": null,
|
||||||
|
|
||||||
"num_workers": 4
|
"num_workers": 4,
|
||||||
|
"dist_config": {
|
||||||
|
"dist_backend": "nccl",
|
||||||
|
"dist_url": "tcp://localhost:54321",
|
||||||
|
"world_size": 1
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -52,6 +52,7 @@ def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin,
|
|||||||
if torch.max(y) > 1.:
|
if torch.max(y) > 1.:
|
||||||
print('max value is ', torch.max(y))
|
print('max value is ', torch.max(y))
|
||||||
|
|
||||||
|
|
||||||
global mel_basis, hann_window
|
global mel_basis, hann_window
|
||||||
if fmax not in mel_basis:
|
if fmax not in mel_basis:
|
||||||
mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax)
|
mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax)
|
||||||
|
@ -2,6 +2,7 @@ from utils.argutils import print_args
|
|||||||
from vocoder.wavernn.train import train
|
from vocoder.wavernn.train import train
|
||||||
from vocoder.hifigan.train import train as train_hifigan
|
from vocoder.hifigan.train import train as train_hifigan
|
||||||
from vocoder.hifigan.env import AttrDict
|
from vocoder.hifigan.env import AttrDict
|
||||||
|
from vocoder.fregan.train import train as train_fregan
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import argparse
|
import argparse
|
||||||
import json
|
import json
|
||||||
@ -61,11 +62,18 @@ if __name__ == "__main__":
|
|||||||
# Process the arguments
|
# Process the arguments
|
||||||
if args.vocoder_type == "wavernn":
|
if args.vocoder_type == "wavernn":
|
||||||
# Run the training wavernn
|
# Run the training wavernn
|
||||||
|
delattr(args,'vocoder_type')
|
||||||
|
delattr(args,'config')
|
||||||
train(**vars(args))
|
train(**vars(args))
|
||||||
elif args.vocoder_type == "hifigan":
|
elif args.vocoder_type == "hifigan":
|
||||||
with open(args.config) as f:
|
with open(args.config) as f:
|
||||||
json_config = json.load(f)
|
json_config = json.load(f)
|
||||||
h = AttrDict(json_config)
|
h = AttrDict(json_config)
|
||||||
train_hifigan(0, args, h)
|
train_hifigan(0, args, h)
|
||||||
|
elif args.vocoder_type == "fregan":
|
||||||
|
with open('vocoder/fregan/config.json') as f:
|
||||||
|
json_config = json.load(f)
|
||||||
|
h = AttrDict(json_config)
|
||||||
|
train_fregan(0, args, h)
|
||||||
|
|
||||||
|
|
Loading…
x
Reference in New Issue
Block a user