* 替换了vocoder

* 修改了vocoder_train

* 减谱法

* 美化UI;语音增强;MFCC特征可视化

* 修复了训练fregan模型时的报错

* 增加了可以分析音频特征的独立文件

* 现已支持Fre-GAN声码器的训练

* 修复了训练fregan时保存模型的BUG

* 删除了无用的文件

* 优化了识别声码器模型的方式
This commit is contained in:
flysmart 2022-05-09 13:22:55 +08:00 committed by GitHub
parent 875fe15069
commit 86ea11affd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
31 changed files with 15435 additions and 31 deletions

View File

@ -79,6 +79,10 @@
`python vocoder_train.py <trainid> <datasets_root> hifigan` `python vocoder_train.py <trainid> <datasets_root> hifigan`
> `<trainid>`替换为你想要的标识,同一标识再次训练时会延续原模型 > `<trainid>`替换为你想要的标识,同一标识再次训练时会延续原模型
* 训练Fre-GAN声码器:
`python vocoder_train.py <trainid> <datasets_root> --config config.json fregan`
> `<trainid>`替换为你想要的标识,同一标识再次训练时会延续原模型
### 3. 启动程序或工具箱 ### 3. 启动程序或工具箱
您可以尝试使用以下命令: 您可以尝试使用以下命令:

43
analysis.py Normal file
View File

@ -0,0 +1,43 @@
from scipy.io import wavfile # scipy library to read wav files
import numpy as np
AudioName = "target.wav" # Audio File
fs, Audiodata = wavfile.read(AudioName)
# Plot the audio signal in time
import matplotlib.pyplot as plt
plt.plot(Audiodata)
plt.title('Audio signal in time',size=16)
# spectrum
from scipy.fftpack import fft # fourier transform
n = len(Audiodata)
AudioFreq = fft(Audiodata)
AudioFreq = AudioFreq[0:int(np.ceil((n+1)/2.0))] #Half of the spectrum
MagFreq = np.abs(AudioFreq) # Magnitude
MagFreq = MagFreq / float(n)
# power spectrum
MagFreq = MagFreq**2
if n % 2 > 0: # ffte odd
MagFreq[1:len(MagFreq)] = MagFreq[1:len(MagFreq)] * 2
else:# fft even
MagFreq[1:len(MagFreq) -1] = MagFreq[1:len(MagFreq) - 1] * 2
plt.figure()
freqAxis = np.arange(0,int(np.ceil((n+1)/2.0)), 1.0) * (fs / n);
plt.plot(freqAxis/1000.0, 10*np.log10(MagFreq)) #Power spectrum
plt.xlabel('Frequency (kHz)'); plt.ylabel('Power spectrum (dB)');
#Spectrogram
from scipy import signal
N = 512 #Number of point in the fft
f, t, Sxx = signal.spectrogram(Audiodata, fs,window = signal.blackman(N),nfft=N)
plt.figure()
plt.pcolormesh(t, f,10*np.log10(Sxx)) # dB spectrogram
#plt.pcolormesh(t, f,Sxx) # Lineal spectrogram
plt.ylabel('Frequency [Hz]')
plt.xlabel('Time [seg]')
plt.title('Spectrogram with scipy.signal',size=16);
plt.show()

BIN
fmcc_result.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 188 KiB

BIN
fmcc_source.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 195 KiB

188
specdeno/enhance_speach.py Normal file
View File

@ -0,0 +1,188 @@
#!/usr/bin/env python
import librosa
import numpy as np
import wave
import math
from synthesizer.hparams import hparams
import os
import ctypes as ct
from encoder import inference as encoder
from utils import logmmse
def enhance(fpath):
class FloatBits(ct.Structure):
_fields_ = [
('M', ct.c_uint, 23),
('E', ct.c_uint, 8),
('S', ct.c_uint, 1)
]
class Float(ct.Union):
_anonymous_ = ('bits',)
_fields_ = [
('value', ct.c_float),
('bits', FloatBits)
]
def nextpow2(x):
if x < 0:
x = -x
if x == 0:
return 0
d = Float()
d.value = x
if d.M == 0:
return d.E - 127
return d.E - 127 + 1
# 打开WAV文档
f = wave.open(str(fpath))
# 读取格式信息
# (nchannels, sampwidth, framerate, nframes, comptype, compname)
params = f.getparams()
nchannels, sampwidth, framerate, nframes = params[:4]
fs = framerate
# 读取波形数据
str_data = f.readframes(nframes)
f.close()
# 将波形数据转换为数组
x = np.fromstring(str_data, dtype=np.short)
# 计算参数
len_ = 20 * fs // 1000 # 样本中帧的大小
PERC = 50 # 窗口重叠占帧的百分比
len1 = len_ * PERC // 100 # 重叠窗口
len2 = len_ - len1 # 非重叠窗口
# 设置默认参数
Thres = 3
Expnt = 2.0
beta = 0.002
G = 0.9
# 初始化汉明窗
win = np.hamming(len_)
# normalization gain for overlap+add with 50% overlap
winGain = len2 / sum(win)
# Noise magnitude calculations - assuming that the first 5 frames is noise/silence
nFFT = 2 * 2 ** (nextpow2(len_))
noise_mean = np.zeros(nFFT)
j = 0
for k in range(1, 6):
noise_mean = noise_mean + abs(np.fft.fft(win * x[j:j + len_], nFFT))
j = j + len_
noise_mu = noise_mean / 5
# --- allocate memory and initialize various variables
k = 1
img = 1j
x_old = np.zeros(len1)
Nframes = len(x) // len2 - 1
xfinal = np.zeros(Nframes * len2)
# ========================= Start Processing ===============================
for n in range(0, Nframes):
# Windowing
insign = win * x[k-1:k + len_ - 1]
# compute fourier transform of a frame
spec = np.fft.fft(insign, nFFT)
# compute the magnitude
sig = abs(spec)
# save the noisy phase information
theta = np.angle(spec)
SNRseg = 10 * np.log10(np.linalg.norm(sig, 2) ** 2 / np.linalg.norm(noise_mu, 2) ** 2)
def berouti(SNR):
if -5.0 <= SNR <= 20.0:
a = 4 - SNR * 3 / 20
else:
if SNR < -5.0:
a = 5
if SNR > 20:
a = 1
return a
def berouti1(SNR):
if -5.0 <= SNR <= 20.0:
a = 3 - SNR * 2 / 20
else:
if SNR < -5.0:
a = 4
if SNR > 20:
a = 1
return a
if Expnt == 1.0: # 幅度谱
alpha = berouti1(SNRseg)
else: # 功率谱
alpha = berouti(SNRseg)
#############
sub_speech = sig ** Expnt - alpha * noise_mu ** Expnt;
# 当纯净信号小于噪声信号的功率时
diffw = sub_speech - beta * noise_mu ** Expnt
# beta negative components
def find_index(x_list):
index_list = []
for i in range(len(x_list)):
if x_list[i] < 0:
index_list.append(i)
return index_list
z = find_index(diffw)
if len(z) > 0:
# 用估计出来的噪声信号表示下限值
sub_speech[z] = beta * noise_mu[z] ** Expnt
# --- implement a simple VAD detector --------------
if SNRseg < Thres: # Update noise spectrum
noise_temp = G * noise_mu ** Expnt + (1 - G) * sig ** Expnt # 平滑处理噪声功率谱
noise_mu = noise_temp ** (1 / Expnt) # 新的噪声幅度谱
# flipud函数实现矩阵的上下翻转是以矩阵的“水平中线”为对称轴
# 交换上下对称元素
sub_speech[nFFT // 2 + 1:nFFT] = np.flipud(sub_speech[1:nFFT // 2])
x_phase = (sub_speech ** (1 / Expnt)) * (np.array([math.cos(x) for x in theta]) + img * (np.array([math.sin(x) for x in theta])))
# take the IFFT
xi = np.fft.ifft(x_phase).real
# --- Overlap and add ---------------
xfinal[k-1:k + len2 - 1] = x_old + xi[0:len1]
x_old = xi[0 + len1:len_]
k = k + len2
# 保存文件
wf = wave.open('out.wav', 'wb')
# 设置参数
wf.setparams(params)
# 设置波形文件 .tostring()将array转换为data
wave_data = (winGain * xfinal).astype(np.short)
wf.writeframes(wave_data.tostring())
wf.close()
wav = librosa.load("./out.wav", hparams.sample_rate)[0]
#在给定噪声配置文件的情况下清除语音波形中的噪声。 波形必须有与用于创建噪声配置文件的采样率相同
if hparams.rescale:
wav = wav / np.abs(wav).max() * hparams.rescaling_max
# denoise
if len(wav) > hparams.sample_rate * (0.3 + 0.1):
noise_wav = np.concatenate([wav[:int(hparams.sample_rate * 0.15)],
wav[-int(hparams.sample_rate * 0.15):]])
profile = logmmse.profile_noise(noise_wav, hparams.sample_rate)
wav = logmmse.denoise(wav, profile)
# Trim excessive silences
wav = encoder.preprocess_wav(wav)
#删除保存的输出文件
os.remove("./out.wav")
return wav

View File

@ -3,6 +3,7 @@ from encoder import inference as encoder
from synthesizer.inference import Synthesizer from synthesizer.inference import Synthesizer
from vocoder.wavernn import inference as rnn_vocoder from vocoder.wavernn import inference as rnn_vocoder
from vocoder.hifigan import inference as gan_vocoder from vocoder.hifigan import inference as gan_vocoder
from vocoder.fregan import inference as fgan_vocoder
from pathlib import Path from pathlib import Path
from time import perf_counter as timer from time import perf_counter as timer
from toolbox.utterance import Utterance from toolbox.utterance import Utterance
@ -13,6 +14,10 @@ import torch
import librosa import librosa
import re import re
from audioread.exceptions import NoBackendError from audioread.exceptions import NoBackendError
from specdeno.enhance_speach import enhance
import os
from synthesizer.hparams import hparams
import soundfile as sf
# 默认使用wavernn # 默认使用wavernn
vocoder = rnn_vocoder vocoder = rnn_vocoder
@ -109,6 +114,13 @@ class Toolbox:
self.ui.stop_button.clicked.connect(self.ui.stop) self.ui.stop_button.clicked.connect(self.ui.stop)
self.ui.record_button.clicked.connect(self.record) self.ui.record_button.clicked.connect(self.record)
#添加source_mfcc分析槽
func = lambda: self.ui.plot_mfcc(self.ui.selected_utterance.wav, Synthesizer.sample_rate)
self.ui.play_button.clicked.connect(func)
#Audio #Audio
self.ui.setup_audio_devices(Synthesizer.sample_rate) self.ui.setup_audio_devices(Synthesizer.sample_rate)
@ -119,6 +131,8 @@ class Toolbox:
self.ui.export_wav_button.clicked.connect(func) self.ui.export_wav_button.clicked.connect(func)
self.ui.waves_cb.currentIndexChanged.connect(self.set_current_wav) self.ui.waves_cb.currentIndexChanged.connect(self.set_current_wav)
# Generation # Generation
func = lambda: self.synthesize() or self.vocode() func = lambda: self.synthesize() or self.vocode()
self.ui.generate_button.clicked.connect(func) self.ui.generate_button.clicked.connect(func)
@ -126,6 +140,9 @@ class Toolbox:
self.ui.vocode_button.clicked.connect(self.vocode) self.ui.vocode_button.clicked.connect(self.vocode)
self.ui.random_seed_checkbox.clicked.connect(self.update_seed_textbox) self.ui.random_seed_checkbox.clicked.connect(self.update_seed_textbox)
# 添加result_mfcc分析槽,该槽要在语音合成之后
func = lambda: self.ui.plot_mfcc1(self.current_wav, Synthesizer.sample_rate)
self.ui.generate_button.clicked.connect(func)
# UMAP legend # UMAP legend
self.ui.clear_button.clicked.connect(self.clear_utterances) self.ui.clear_button.clicked.connect(self.clear_utterances)
@ -167,13 +184,18 @@ class Toolbox:
# Get the wav from the disk. We take the wav with the vocoder/synthesizer format for # Get the wav from the disk. We take the wav with the vocoder/synthesizer format for
# playback, so as to have a fair comparison with the generated audio # playback, so as to have a fair comparison with the generated audio
wav = Synthesizer.load_preprocess_wav(fpath) #wav = Synthesizer.load_preprocess_wav(fpath)
wav = enhance(fpath)
self.ui.log("Loaded %s" % name) self.ui.log("Loaded %s" % name)
self.add_real_utterance(wav, name, speaker_name) self.add_real_utterance(wav, name, speaker_name)
def record(self): def record(self):
wav = self.ui.record_one(encoder.sampling_rate, 5) wav = self.ui.record_one(encoder.sampling_rate, 5)
sf.write('output1.wav', wav, hparams.sample_rate) # 先将变量wav写为文件的形式
wav = enhance('output1.wav')
os.remove("./output1.wav")
if wav is None: if wav is None:
return return
self.ui.play(wav, encoder.sampling_rate) self.ui.play(wav, encoder.sampling_rate)
@ -285,7 +307,10 @@ class Toolbox:
# Trim excessive silences # Trim excessive silences
if self.ui.trim_silences_checkbox.isChecked(): if self.ui.trim_silences_checkbox.isChecked():
wav = encoder.preprocess_wav(wav) #wav = encoder.preprocess_wav(wav)
sf.write('output.wav', wav, hparams.sample_rate) #先将变量wav写为文件的形式
wav = enhance('output.wav')
os.remove("./output.wav")
# Play it # Play it
wav = wav / np.abs(wav).max() * 0.97 wav = wav / np.abs(wav).max() * 0.97
@ -360,10 +385,13 @@ class Toolbox:
return return
# Sekect vocoder based on model name # Select vocoder based on model name
if model_fpath.name[0] == "g": if model_fpath.name is not None and model_fpath.name.find("hifigan") > -1:
vocoder = gan_vocoder vocoder = gan_vocoder
self.ui.log("set hifigan as vocoder") self.ui.log("set hifigan as vocoder")
elif model_fpath.name is not None and model_fpath.name.find("fregan") > -1:
vocoder = fgan_vocoder
self.ui.log("set fregan as vocoder")
else: else:
vocoder = rnn_vocoder vocoder = rnn_vocoder
self.ui.log("set wavernn as vocoder") self.ui.log("set wavernn as vocoder")
@ -376,4 +404,6 @@ class Toolbox:
self.ui.set_loading(0) self.ui.set_loading(0)
def update_seed_textbox(self): def update_seed_textbox(self):
self.ui.update_seed_textbox() self.ui.update_seed_textbox()

BIN
toolbox/assets/1.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 26 KiB

BIN
toolbox/assets/2.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 29 KiB

BIN
toolbox/assets/picture1.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 841 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 578 KiB

View File

@ -1,4 +1,7 @@
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import numpy
from scipy.fftpack import dct
from PyQt5.QtGui import QPalette, QBrush, QPixmap
from matplotlib.backends.backend_qt5agg import FigureCanvasQTAgg as FigureCanvas from matplotlib.backends.backend_qt5agg import FigureCanvasQTAgg as FigureCanvas
from matplotlib.figure import Figure from matplotlib.figure import Figure
from PyQt5.QtCore import Qt, QStringListModel from PyQt5.QtCore import Qt, QStringListModel
@ -16,9 +19,15 @@ from time import sleep
import umap import umap
import sys import sys
from warnings import filterwarnings, warn from warnings import filterwarnings, warn
filterwarnings("ignore") filterwarnings("ignore")
colormap = np.array([ colormap = np.array([
[0, 127, 70], [0, 127, 70],
[255, 0, 0], [255, 0, 0],
@ -37,7 +46,7 @@ colormap = np.array([
], dtype=np.float) / 255 ], dtype=np.float) / 255
default_text = \ default_text = \
"欢迎使用工具箱, 现已支持中文输入" "请输入需要克隆的语音文本"
@ -49,7 +58,12 @@ class UI(QDialog):
def draw_utterance(self, utterance: Utterance, which): def draw_utterance(self, utterance: Utterance, which):
self.draw_spec(utterance.spec, which) self.draw_spec(utterance.spec, which)
self.draw_embed(utterance.embed, utterance.name, which) self.draw_embed(utterance.embed, utterance.name, which)
def draw_embed(self, embed, name, which): def draw_embed(self, embed, name, which):
embed_ax, _ = self.current_ax if which == "current" else self.gen_ax embed_ax, _ = self.current_ax if which == "current" else self.gen_ax
embed_ax.figure.suptitle("" if embed is None else name) embed_ax.figure.suptitle("" if embed is None else name)
@ -96,7 +110,7 @@ class UI(QDialog):
# Display a message if there aren't enough points # Display a message if there aren't enough points
if len(utterances) < self.min_umap_points: if len(utterances) < self.min_umap_points:
self.umap_ax.text(.5, .5, "Add %d more points to\ngenerate the projections" % self.umap_ax.text(.5, .5, "umap:\nAdd %d more points to\ngenerate the projections" %
(self.min_umap_points - len(utterances)), (self.min_umap_points - len(utterances)),
horizontalalignment='center', fontsize=15) horizontalalignment='center', fontsize=15)
self.umap_ax.set_title("") self.umap_ax.set_title("")
@ -227,6 +241,110 @@ class UI(QDialog):
return wav.squeeze() return wav.squeeze()
#添加source_mfcc分析函数
def plot_mfcc(self, wav, sample_rate):
signal = wav
print(sample_rate, len(signal))
# 读取前3.5s 的数据
signal = signal[0:int(3.5 * sample_rate)]
print(signal)
# 预先处理
pre_emphasis = 0.97
emphasized_signal = numpy.append(signal[0], signal[1:] - pre_emphasis * signal[:-1])
frame_size = 0.025
frame_stride = 0.1
frame_length, frame_step = frame_size * sample_rate, frame_stride * sample_rate
signal_length = len(emphasized_signal)
frame_length = int(round(frame_length))
frame_step = int(round(frame_step))
num_frames = int(numpy.ceil(float(numpy.abs(signal_length - frame_length)) / frame_step))
pad_signal_length = num_frames * frame_step + frame_length
z = numpy.zeros((pad_signal_length - signal_length))
pad_signal = numpy.append(emphasized_signal, z)
indices = numpy.tile(numpy.arange(0, frame_length), (num_frames, 1)) + numpy.tile(
numpy.arange(0, num_frames * frame_step, frame_step), (frame_length, 1)).T
frames = pad_signal[numpy.mat(indices).astype(numpy.int32, copy=False)]
# 加上汉明窗
frames *= numpy.hamming(frame_length)
# frames *= 0.54 - 0.46 * numpy.cos((2 * numpy.pi * n) / (frame_length - 1)) # Explicit Implementation **
# 傅立叶变换和功率谱
NFFT = 512
mag_frames = numpy.absolute(numpy.fft.rfft(frames, NFFT)) # Magnitude of the FFT
# print(mag_frames.shape)
pow_frames = ((1.0 / NFFT) * ((mag_frames) ** 2)) # Power Spectrum
low_freq_mel = 0
# 将频率转换为Mel
nfilt = 40
high_freq_mel = (2595 * numpy.log10(1 + (sample_rate / 2) / 700))
mel_points = numpy.linspace(low_freq_mel, high_freq_mel, nfilt + 2) # Equally spaced in Mel scale
hz_points = (700 * (10 ** (mel_points / 2595) - 1)) # Convert Mel to Hz
bin = numpy.floor((NFFT + 1) * hz_points / sample_rate)
fbank = numpy.zeros((nfilt, int(numpy.floor(NFFT / 2 + 1))))
for m in range(1, nfilt + 1):
f_m_minus = int(bin[m - 1]) # left
f_m = int(bin[m]) # center
f_m_plus = int(bin[m + 1]) # right
for k in range(f_m_minus, f_m):
fbank[m - 1, k] = (k - bin[m - 1]) / (bin[m] - bin[m - 1])
for k in range(f_m, f_m_plus):
fbank[m - 1, k] = (bin[m + 1] - k) / (bin[m + 1] - bin[m])
filter_banks = numpy.dot(pow_frames, fbank.T)
filter_banks = numpy.where(filter_banks == 0, numpy.finfo(float).eps, filter_banks) # Numerical Stability
filter_banks = 20 * numpy.log10(filter_banks) # dB
# 所得到的倒谱系数2-13被保留其余的被丢弃
num_ceps = 12
mfcc = dct(filter_banks, type=2, axis=1, norm='ortho')[:, 1: (num_ceps + 1)]
(nframes, ncoeff) = mfcc.shape
n = numpy.arange(ncoeff)
cep_lifter = 22
lift = 1 + (cep_lifter / 2) * numpy.sin(numpy.pi * n / cep_lifter)
mfcc *= lift # *
# filter_banks -= (numpy.mean(filter_banks, axis=0) + 1e-8)
mfcc -= (numpy.mean(mfcc, axis=0) + 1e-8)
print(mfcc.shape)
# 创建新的figure
fig10 = plt.figure(figsize=(16,8))
# 绘制1x2两行两列共四个图编号从1开始
ax = fig10.add_subplot(121)
plt.plot(mfcc)
ax = fig10.add_subplot(122)
# 平均归一化MFCC
mfcc -= (numpy.mean(mfcc, axis=0) + 1e-8)
plt.imshow(numpy.flipud(mfcc.T), cmap=plt.cm.jet, aspect=0.2,
extent=[0, mfcc.shape[0], 0, mfcc.shape[1]]) # 热力图
#将figure保存为png并显示在新创建的子窗口上
plt.savefig("fmcc_source.png")
dialog_fault = QDialog()
dialog_fault.setWindowTitle("源音频MFCC特征图及MFCC平均归一化热图") # 设置窗口名
pic = QPixmap("fmcc_source.png")
label_pic = QLabel("show", dialog_fault)
label_pic.setPixmap(pic)
label_pic.setGeometry(0,0,1500,800)
dialog_fault.exec_()
@property @property
def current_dataset_name(self): def current_dataset_name(self):
return self.dataset_box.currentText() return self.dataset_box.currentText()
@ -272,7 +390,7 @@ class UI(QDialog):
datasets = [d.relative_to(datasets_root) for d in datasets if d.exists()] datasets = [d.relative_to(datasets_root) for d in datasets if d.exists()]
self.browser_load_button.setDisabled(len(datasets) == 0) self.browser_load_button.setDisabled(len(datasets) == 0)
if datasets_root is None or len(datasets) == 0: if datasets_root is None or len(datasets) == 0:
msg = "Warning: you d" + ("id not pass a root directory for datasets as argument" \ msg = "Tip: Please " + (" select the voice to be cloned" \
if datasets_root is None else "o not have any of the recognized datasets" \ if datasets_root is None else "o not have any of the recognized datasets" \
" in %s" % datasets_root) " in %s" % datasets_root)
self.log(msg) self.log(msg)
@ -417,16 +535,125 @@ class UI(QDialog):
self.export_wav_button.setDisabled(True) self.export_wav_button.setDisabled(True)
[self.log("") for _ in range(self.max_log_lines)] [self.log("") for _ in range(self.max_log_lines)]
#添加result_mfcc分析函数
def plot_mfcc1(self, wav, sample_rate):
signal = wav
print(sample_rate, len(signal))
# 读取前3.5s 的数据
signal = signal[0:int(3.5 * sample_rate)]
print(signal)
# 预先处理
pre_emphasis = 0.97
emphasized_signal = numpy.append(signal[0], signal[1:] - pre_emphasis * signal[:-1])
frame_size = 0.025
frame_stride = 0.1
frame_length, frame_step = frame_size * sample_rate, frame_stride * sample_rate
signal_length = len(emphasized_signal)
frame_length = int(round(frame_length))
frame_step = int(round(frame_step))
num_frames = int(numpy.ceil(float(numpy.abs(signal_length - frame_length)) / frame_step))
pad_signal_length = num_frames * frame_step + frame_length
z = numpy.zeros((pad_signal_length - signal_length))
pad_signal = numpy.append(emphasized_signal, z)
indices = numpy.tile(numpy.arange(0, frame_length), (num_frames, 1)) + numpy.tile(
numpy.arange(0, num_frames * frame_step, frame_step), (frame_length, 1)).T
frames = pad_signal[numpy.mat(indices).astype(numpy.int32, copy=False)]
# 加上汉明窗
frames *= numpy.hamming(frame_length)
# frames *= 0.54 - 0.46 * numpy.cos((2 * numpy.pi * n) / (frame_length - 1)) # Explicit Implementation **
# 傅立叶变换和功率谱
NFFT = 512
mag_frames = numpy.absolute(numpy.fft.rfft(frames, NFFT)) # Magnitude of the FFT
# print(mag_frames.shape)
pow_frames = ((1.0 / NFFT) * ((mag_frames) ** 2)) # Power Spectrum
low_freq_mel = 0
# 将频率转换为Mel
nfilt = 40
high_freq_mel = (2595 * numpy.log10(1 + (sample_rate / 2) / 700))
mel_points = numpy.linspace(low_freq_mel, high_freq_mel, nfilt + 2) # Equally spaced in Mel scale
hz_points = (700 * (10 ** (mel_points / 2595) - 1)) # Convert Mel to Hz
bin = numpy.floor((NFFT + 1) * hz_points / sample_rate)
fbank = numpy.zeros((nfilt, int(numpy.floor(NFFT / 2 + 1))))
for m in range(1, nfilt + 1):
f_m_minus = int(bin[m - 1]) # left
f_m = int(bin[m]) # center
f_m_plus = int(bin[m + 1]) # right
for k in range(f_m_minus, f_m):
fbank[m - 1, k] = (k - bin[m - 1]) / (bin[m] - bin[m - 1])
for k in range(f_m, f_m_plus):
fbank[m - 1, k] = (bin[m + 1] - k) / (bin[m + 1] - bin[m])
filter_banks = numpy.dot(pow_frames, fbank.T)
filter_banks = numpy.where(filter_banks == 0, numpy.finfo(float).eps, filter_banks) # Numerical Stability
filter_banks = 20 * numpy.log10(filter_banks) # dB
# 所得到的倒谱系数2-13被保留其余的被丢弃
num_ceps = 12
mfcc = dct(filter_banks, type=2, axis=1, norm='ortho')[:, 1: (num_ceps + 1)]
(nframes, ncoeff) = mfcc.shape
n = numpy.arange(ncoeff)
cep_lifter = 22
lift = 1 + (cep_lifter / 2) * numpy.sin(numpy.pi * n / cep_lifter)
mfcc *= lift # *
# filter_banks -= (numpy.mean(filter_banks, axis=0) + 1e-8)
mfcc -= (numpy.mean(mfcc, axis=0) + 1e-8)
print(mfcc.shape)
# 创建新的figure
fig11 = plt.figure(figsize=(16,8))
# 绘制1x2两行两列共四个图编号从1开始
ax = fig11.add_subplot(121)
plt.plot(mfcc)
ax = fig11.add_subplot(122)
# 平均归一化MFCC
mfcc -= (numpy.mean(mfcc, axis=0) + 1e-8)
plt.imshow(numpy.flipud(mfcc.T), cmap=plt.cm.jet, aspect=0.2,
extent=[0, mfcc.shape[0], 0, mfcc.shape[1]]) # 热力图
#将figure保存为png并显示在新创建的子窗口上
plt.savefig("fmcc_result.png")
dialog_fault1 = QDialog()
dialog_fault1.setWindowTitle("合成音频MFCC特征图及MFCC平均归一化热图") # 设置窗口名
pic = QPixmap("fmcc_result.png")
label_pic = QLabel("show", dialog_fault1)
label_pic.setPixmap(pic)
label_pic.setGeometry(0,0,1500,800)
dialog_fault1.exec_()
def __init__(self): def __init__(self):
## Initialize the application ## Initialize the application
self.app = QApplication(sys.argv) self.app = QApplication(sys.argv)
super().__init__(None) super().__init__(None)
self.setWindowTitle("MockingBird GUI") self.setWindowTitle("中文语音克隆系统")
self.setWindowIcon(QtGui.QIcon('toolbox\\assets\\mb.png')) self.setWindowIcon(QtGui.QIcon('toolbox\\assets\\mb.png'))
self.setWindowFlag(Qt.WindowMinimizeButtonHint, True) self.setWindowFlag(Qt.WindowMinimizeButtonHint, True)
self.setWindowFlag(Qt.WindowMaximizeButtonHint, True) self.setWindowFlag(Qt.WindowMaximizeButtonHint, True)
## Main layouts ## Main layouts
# Root # Root
root_layout = QGridLayout() root_layout = QGridLayout()
@ -459,46 +686,60 @@ class UI(QDialog):
self.projections_layout.addWidget(FigureCanvas(fig)) self.projections_layout.addWidget(FigureCanvas(fig))
self.umap_hot = False self.umap_hot = False
self.clear_button = QPushButton("Clear") self.clear_button = QPushButton("Clear")
self.clear_button.setStyleSheet('QPushButton{border-image:url(toolbox/assets/2.png)}')
self.projections_layout.addWidget(self.clear_button) self.projections_layout.addWidget(self.clear_button)
## Browser ## Browser
# Dataset, speaker and utterance selection # Dataset, speaker and utterance selection
i = 0 i = 0
source_groupbox = QGroupBox('Source(源音频)') source_groupbox = QGroupBox('Source(源音频)')
source_layout = QGridLayout() source_layout = QGridLayout()
source_groupbox.setLayout(source_layout) source_groupbox.setLayout(source_layout)
browser_layout.addWidget(source_groupbox, i, 0, 1, 4) browser_layout.addWidget(source_groupbox, i, 0, 1, 4)
self.dataset_box = QComboBox() self.dataset_box = QComboBox()
source_layout.addWidget(QLabel("Dataset(数据集):"), i, 0) # source_layout.addWidget(QLabel("Dataset(数据集):"), i, 0) #隐藏标签文字
source_layout.addWidget(self.dataset_box, i, 1) source_layout.addWidget(self.dataset_box, i, 1)
self.random_dataset_button = QPushButton("Random") self.random_dataset_button = QPushButton("Random")
source_layout.addWidget(self.random_dataset_button, i, 2) source_layout.addWidget(self.random_dataset_button, i, 2)
self.random_dataset_button.hide() #隐藏按钮
self.dataset_box.hide() #隐藏选项条
i += 1 i += 1
self.speaker_box = QComboBox() self.speaker_box = QComboBox()
source_layout.addWidget(QLabel("Speaker(说话者)"), i, 0) # source_layout.addWidget(QLabel("Speaker(说话者)"), i, 0)
source_layout.addWidget(self.speaker_box, i, 1) source_layout.addWidget(self.speaker_box, i, 1)
self.random_speaker_button = QPushButton("Random") self.random_speaker_button = QPushButton("Random")
source_layout.addWidget(self.random_speaker_button, i, 2) source_layout.addWidget(self.random_speaker_button, i, 2)
self.random_speaker_button.hide()
self.speaker_box.hide()
i += 1 i += 1
self.utterance_box = QComboBox() self.utterance_box = QComboBox()
source_layout.addWidget(QLabel("Utterance(音频):"), i, 0) # source_layout.addWidget(QLabel("Utterance(音频):"), i, 0)
source_layout.addWidget(self.utterance_box, i, 1) source_layout.addWidget(self.utterance_box, i, 1)
self.random_utterance_button = QPushButton("Random") self.random_utterance_button = QPushButton("Random")
source_layout.addWidget(self.random_utterance_button, i, 2) source_layout.addWidget(self.random_utterance_button, i, 2)
self.random_utterance_button.hide()
self.utterance_box.hide()
i += 1 i += 1
source_layout.addWidget(QLabel("<b>Use(使用):</b>"), i, 0) source_layout.addWidget(QLabel("<b>Use(使用):</b>"), i, 0)
self.browser_load_button = QPushButton("Load Above(加载上面)") self.browser_load_button = QPushButton("")
source_layout.addWidget(self.browser_load_button, i, 1, 1, 2) source_layout.addWidget(self.browser_load_button, i, 1, 1, 2)
self.auto_next_checkbox = QCheckBox("Auto select next") self.auto_next_checkbox = QCheckBox("Auto select next")
self.auto_next_checkbox.setChecked(True) self.auto_next_checkbox.setChecked(True)
source_layout.addWidget(self.auto_next_checkbox, i+1, 1) source_layout.addWidget(self.auto_next_checkbox, i + 1, 1)
self.browser_browse_button = QPushButton("Browse(打开本地)") self.browser_browse_button = QPushButton("Browse(打开本地)")
self.browser_browse_button.setStyleSheet('QPushButton{border-image:url(toolbox/assets/1.png)}')
source_layout.addWidget(self.browser_browse_button, i, 3) source_layout.addWidget(self.browser_browse_button, i, 3)
self.record_button = QPushButton("Record(录音)") self.record_button = QPushButton("Record(录音)")
self.record_button.setStyleSheet('QPushButton{border-image:url(toolbox/assets/1.png)}')
source_layout.addWidget(self.record_button, i+1, 3) source_layout.addWidget(self.record_button, i+1, 3)
i += 2 i += 2
@ -507,8 +748,10 @@ class UI(QDialog):
self.utterance_history = QComboBox() self.utterance_history = QComboBox()
browser_layout.addWidget(self.utterance_history, i, 1) browser_layout.addWidget(self.utterance_history, i, 1)
self.play_button = QPushButton("Play(播放)") self.play_button = QPushButton("Play(播放)")
self.play_button.setStyleSheet('QPushButton{border-image:url(toolbox/assets/1.png)}')
browser_layout.addWidget(self.play_button, i, 2) browser_layout.addWidget(self.play_button, i, 2)
self.stop_button = QPushButton("Stop(暂停)") self.stop_button = QPushButton("Stop(暂停)")
self.stop_button.setStyleSheet('QPushButton{border-image:url(toolbox/assets/1.png)}')
browser_layout.addWidget(self.stop_button, i, 3) browser_layout.addWidget(self.stop_button, i, 3)
i += 1 i += 1
@ -537,10 +780,12 @@ class UI(QDialog):
self.waves_cb.setModel(self.waves_cb_model) self.waves_cb.setModel(self.waves_cb_model)
self.waves_cb.setToolTip("Select one of the last generated waves in this section for replaying or exporting") self.waves_cb.setToolTip("Select one of the last generated waves in this section for replaying or exporting")
output_layout.addWidget(self.waves_cb, i, 1) output_layout.addWidget(self.waves_cb, i, 1)
self.replay_wav_button = QPushButton("Replay") self.replay_wav_button = QPushButton("Replay(重播)")
self.replay_wav_button.setStyleSheet('QPushButton{border-image:url(toolbox/assets/1.png)}')
self.replay_wav_button.setToolTip("Replay last generated vocoder") self.replay_wav_button.setToolTip("Replay last generated vocoder")
output_layout.addWidget(self.replay_wav_button, i, 2) output_layout.addWidget(self.replay_wav_button, i, 2)
self.export_wav_button = QPushButton("Export") self.export_wav_button = QPushButton("Export(导出)")
self.export_wav_button.setStyleSheet('QPushButton{border-image:url(toolbox/assets/1.png)}')
self.export_wav_button.setToolTip("Save last generated vocoder audio in filesystem as a wav file") self.export_wav_button.setToolTip("Save last generated vocoder audio in filesystem as a wav file")
output_layout.addWidget(self.export_wav_button, i, 3) output_layout.addWidget(self.export_wav_button, i, 3)
self.audio_out_devices_cb=QComboBox() self.audio_out_devices_cb=QComboBox()
@ -551,14 +796,27 @@ class UI(QDialog):
## Embed & spectrograms ## Embed & spectrograms
vis_layout.addStretch() vis_layout.addStretch()
#添加标签控件,设置标签文字格式并且居中
label1 = QLabel("source audio")
label1.setStyleSheet("QLabel{color:red;font-size:20px;font-weight:bold;font-family:Roman times;}")
label1.setAlignment(Qt.AlignCenter)
vis_layout.addWidget(label1) #addwidget:添加控件
gridspec_kw = {"width_ratios": [1, 4]} gridspec_kw = {"width_ratios": [1, 4]}
fig, self.current_ax = plt.subplots(1, 2, figsize=(10, 2.25), facecolor="#F0F0F0", fig, self.current_ax = plt.subplots(1, 2, figsize=(10, 2.25), facecolor="#F0F0F0",
gridspec_kw=gridspec_kw) gridspec_kw=gridspec_kw)
#self.current_ax[1].set_title("source audio", fontsize=50, color='red', fontstyle='italic', fontweight="heavy")
fig.subplots_adjust(left=0, bottom=0.1, right=1, top=0.8) fig.subplots_adjust(left=0, bottom=0.1, right=1, top=0.8)
vis_layout.addWidget(FigureCanvas(fig)) vis_layout.addWidget(FigureCanvas(fig))
fig, self.gen_ax = plt.subplots(1, 2, figsize=(10, 2.25), facecolor="#F0F0F0", label2 = QLabel("target audio")
label2.setStyleSheet("QLabel{color:red;font-size:20px;font-weight:bold;font-family:Roman times;}")
label2.setAlignment(Qt.AlignCenter)
vis_layout.addWidget(label2)
fig, self.gen_ax = plt.subplots(1, 2, figsize=(10, 2.25), facecolor="#F0F0F0",
gridspec_kw=gridspec_kw) gridspec_kw=gridspec_kw)
#self.gen_ax[1].set_title("target audio", fontsize=50, color='red', fontstyle='italic', fontweight="heavy")
fig.subplots_adjust(left=0, bottom=0.1, right=1, top=0.8) fig.subplots_adjust(left=0, bottom=0.1, right=1, top=0.8)
vis_layout.addWidget(FigureCanvas(fig)) vis_layout.addWidget(FigureCanvas(fig))
@ -566,29 +824,37 @@ class UI(QDialog):
ax.set_facecolor("#F0F0F0") ax.set_facecolor("#F0F0F0")
for side in ["top", "right", "bottom", "left"]: for side in ["top", "right", "bottom", "left"]:
ax.spines[side].set_visible(False) ax.spines[side].set_visible(False)
## Generation ## Generation
self.text_prompt = QPlainTextEdit(default_text) self.text_prompt = QPlainTextEdit(default_text)
gen_layout.addWidget(self.text_prompt, stretch=1) gen_layout.addWidget(self.text_prompt, stretch=1)
self.generate_button = QPushButton("Synthesize and vocode") self.generate_button = QPushButton("Synthesize and vocode(合成并播放)")
self.generate_button.setStyleSheet('QPushButton{border-image:url(toolbox/assets/1.png)}')
gen_layout.addWidget(self.generate_button) gen_layout.addWidget(self.generate_button)
layout = QHBoxLayout() layout = QHBoxLayout()
self.synthesize_button = QPushButton("Synthesize only") self.synthesize_button = QPushButton("Synthesize only(仅合成)")
self.synthesize_button.setStyleSheet('QPushButton{border-image:url(toolbox/assets/1.png)}')
layout.addWidget(self.synthesize_button) layout.addWidget(self.synthesize_button)
self.vocode_button = QPushButton("Vocode only") self.vocode_button = QPushButton("Vocode only(仅播放)")
self.vocode_button.setStyleSheet('QPushButton{border-image:url(toolbox/assets/1.png)}')
layout.addWidget(self.vocode_button) layout.addWidget(self.vocode_button)
gen_layout.addLayout(layout) gen_layout.addLayout(layout)
layout_seed = QGridLayout() layout_seed = QGridLayout()
self.random_seed_checkbox = QCheckBox("Random seed:") self.random_seed_checkbox = QCheckBox("Random seed(随机数种子):")
self.random_seed_checkbox.setToolTip("When checked, makes the synthesizer and vocoder deterministic.") self.random_seed_checkbox.setToolTip("When checked, makes the synthesizer and vocoder deterministic.")
layout_seed.addWidget(self.random_seed_checkbox, 0, 0) layout_seed.addWidget(self.random_seed_checkbox, 0, 0)
self.seed_textbox = QLineEdit() self.seed_textbox = QLineEdit()
self.seed_textbox.setMaximumWidth(80) self.seed_textbox.setMaximumWidth(80)
layout_seed.addWidget(self.seed_textbox, 0, 1) layout_seed.addWidget(self.seed_textbox, 0, 1)
self.trim_silences_checkbox = QCheckBox("Enhance vocoder output") self.trim_silences_checkbox = QCheckBox("Enhance vocoder output(语音增强)")
self.trim_silences_checkbox.setToolTip("When checked, trims excess silence in vocoder output." self.trim_silences_checkbox.setToolTip("When checked, trims excess silence in vocoder output."
" This feature requires `webrtcvad` to be installed.") " This feature requires `webrtcvad` to be installed.")
layout_seed.addWidget(self.trim_silences_checkbox, 0, 2, 1, 2) layout_seed.addWidget(self.trim_silences_checkbox, 0, 2, 1, 2)
@ -599,7 +865,7 @@ class UI(QDialog):
self.style_slider.setRange(-1, 9) self.style_slider.setRange(-1, 9)
self.style_value_label = QLabel("-1") self.style_value_label = QLabel("-1")
self.style_slider.setValue(-1) self.style_slider.setValue(-1)
layout_seed.addWidget(QLabel("Style:"), 1, 0) layout_seed.addWidget(QLabel("Style(风格):"), 1, 0)
self.style_slider.valueChanged.connect(lambda s: self.style_value_label.setNum(s)) self.style_slider.valueChanged.connect(lambda s: self.style_value_label.setNum(s))
layout_seed.addWidget(self.style_value_label, 1, 1) layout_seed.addWidget(self.style_value_label, 1, 1)
@ -610,7 +876,7 @@ class UI(QDialog):
self.token_slider.setFocusPolicy(Qt.NoFocus) self.token_slider.setFocusPolicy(Qt.NoFocus)
self.token_slider.setSingleStep(1) self.token_slider.setSingleStep(1)
self.token_slider.setRange(3, 9) self.token_slider.setRange(3, 9)
self.token_value_label = QLabel("5") self.token_value_label = QLabel("4")
self.token_slider.setValue(4) self.token_slider.setValue(4)
layout_seed.addWidget(QLabel("Accuracy(精度):"), 2, 0) layout_seed.addWidget(QLabel("Accuracy(精度):"), 2, 0)
@ -651,5 +917,16 @@ class UI(QDialog):
self.reset_interface() self.reset_interface()
self.show() self.show()
##set the picture of background
palette1 = QPalette()
# palette1.setColor(self.backgroundRole(), QColor(192,253,123)) # 设置背景颜色
palette1.setBrush(self.backgroundRole(), QBrush(QPixmap('toolbox\\assets\\picture1.jpg'))) # 设置背景图片
self.setPalette(palette1)
self.setAutoFillBackground(True)
def start(self): def start(self):
self.app.exec_() self.app.exec_()

129
vocoder/fregan/.gitignore vendored Normal file
View File

@ -0,0 +1,129 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/

21
vocoder/fregan/LICENSE Normal file
View File

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2021 Rishikesh (ऋषिकेश)
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,150 @@
LJ050-0269|The essential terms of such memoranda might well be embodied in an Executive order.|The essential terms of such memoranda might well be embodied in an Executive order.
LJ050-0270|This Commission can recommend no procedures for the future protection of our Presidents which will guarantee security.|This Commission can recommend no procedures for the future protection of our Presidents which will guarantee security.
LJ050-0271|The demands on the President in the execution of His responsibilities in today's world are so varied and complex|The demands on the President in the execution of His responsibilities in today's world are so varied and complex
LJ050-0272|and the traditions of the office in a democracy such as ours are so deep-seated as to preclude absolute security.|and the traditions of the office in a democracy such as ours are so deep-seated as to preclude absolute security.
LJ050-0273|The Commission has, however, from its examination of the facts of President Kennedy's assassination|The Commission has, however, from its examination of the facts of President Kennedy's assassination
LJ050-0274|made certain recommendations which it believes would, if adopted,|made certain recommendations which it believes would, if adopted,
LJ050-0275|materially improve upon the procedures in effect at the time of President Kennedy's assassination and result in a substantial lessening of the danger.|materially improve upon the procedures in effect at the time of President Kennedy's assassination and result in a substantial lessening of the danger.
LJ050-0276|As has been pointed out, the Commission has not resolved all the proposals which could be made. The Commission nevertheless is confident that,|As has been pointed out, the Commission has not resolved all the proposals which could be made. The Commission nevertheless is confident that,
LJ050-0277|with the active cooperation of the responsible agencies and with the understanding of the people of the United States in their demands upon their President,|with the active cooperation of the responsible agencies and with the understanding of the people of the United States in their demands upon their President,
LJ050-0278|the recommendations we have here suggested would greatly advance the security of the office without any impairment of our fundamental liberties.|the recommendations we have here suggested would greatly advance the security of the office without any impairment of our fundamental liberties.
LJ001-0028|but by printers in Strasburg, Basle, Paris, Lubeck, and other cities.|but by printers in Strasburg, Basle, Paris, Lubeck, and other cities.
LJ001-0068|The characteristic Dutch type, as represented by the excellent printer Gerard Leew, is very pronounced and uncompromising Gothic.|The characteristic Dutch type, as represented by the excellent printer Gerard Leew, is very pronounced and uncompromising Gothic.
LJ002-0149|The latter indeed hung like millstones round the neck of the unhappy insolvent wretches who found themselves in limbo.|The latter indeed hung like millstones round the neck of the unhappy insolvent wretches who found themselves in limbo.
LJ002-0157|and Susannah Evans, in October the same year, for 2 shillings, with costs of 6 shillings, 8 pence.|and Susannah Evans, in October the same year, for two shillings, with costs of six shillings, eight pence.
LJ002-0167|quotes a case which came within his own knowledge of a boy sent to prison for non-payment of one penny.|quotes a case which came within his own knowledge of a boy sent to prison for non-payment of one penny.
LJ003-0042|The completion of this very necessary building was, however, much delayed for want of funds,|The completion of this very necessary building was, however, much delayed for want of funds,
LJ003-0307|but as yet no suggestion was made to provide prison uniform.|but as yet no suggestion was made to provide prison uniform.
LJ004-0169|On the dirty bedstead lay a wretched being in the throes of severe illness.|On the dirty bedstead lay a wretched being in the throes of severe illness.
LJ004-0233|Under the new rule visitors were not allowed to pass into the interior of the prison, but were detained between the grating.|Under the new rule visitors were not allowed to pass into the interior of the prison, but were detained between the grating.
LJ005-0101|whence it deduced the practice and condition of every prison that replied.|whence it deduced the practice and condition of every prison that replied.
LJ005-0108|the prisoners, without firing, bedding, or sufficient food, spent their days "in surveying their grotesque prison,|the prisoners, without firing, bedding, or sufficient food, spent their days "in surveying their grotesque prison,
LJ005-0202|An examination of this report shows how even the most insignificant township had its jail.|An examination of this report shows how even the most insignificant township had its jail.
LJ005-0234|The visits of friends was once more unreservedly allowed, and these incomers freely brought in extra provisions and beer.|The visits of friends was once more unreservedly allowed, and these incomers freely brought in extra provisions and beer.
LJ005-0248|and stated that in his opinion Newgate, as the common jail of Middlesex, was wholly inadequate to the proper confinement of its prisoners.|and stated that in his opinion Newgate, as the common jail of Middlesex, was wholly inadequate to the proper confinement of its prisoners.
LJ006-0001|The Chronicles of Newgate, Volume 2. By Arthur Griffiths. Section 9: The first report of the inspector of prisons.|The Chronicles of Newgate, Volume two. By Arthur Griffiths. Section nine: The first report of the inspector of prisons.
LJ006-0018|One was Mr. William Crawford, the other the Rev. Whitworth Russell.|One was Mr. William Crawford, the other the Rev. Whitworth Russell.
LJ006-0034|They attended early and late; they mustered the prisoners, examined into their condition,|They attended early and late; they mustered the prisoners, examined into their condition,
LJ006-0078|A new prisoner's fate, as to location, rested really with a powerful fellow-prisoner.|A new prisoner's fate, as to location, rested really with a powerful fellow-prisoner.
LJ007-0217|They go on to say|They go on to say
LJ007-0243|It was not till the erection of the new prison at Holloway in 1850, and the entire internal reconstruction of Newgate according to new ideas,|It was not till the erection of the new prison at Holloway in eighteen fifty, and the entire internal reconstruction of Newgate according to new ideas,
LJ008-0087|The change from Tyburn to the Old Bailey had worked no improvement as regards the gathering together of the crowd or its demeanor.|The change from Tyburn to the Old Bailey had worked no improvement as regards the gathering together of the crowd or its demeanor.
LJ008-0131|the other he kept between his hands.|the other he kept between his hands.
LJ008-0140|Whenever the public attention had been specially called to a particular crime, either on account of its atrocity,|Whenever the public attention had been specially called to a particular crime, either on account of its atrocity,
LJ008-0158|The pressure soon became so frightful that many would have willingly escaped from the crowd; but their attempts only increased the general confusion.|The pressure soon became so frightful that many would have willingly escaped from the crowd; but their attempts only increased the general confusion.
LJ008-0174|One cart-load of spectators having broken down, some of its occupants fell off the vehicle, and were instantly trampled to death.|One cart-load of spectators having broken down, some of its occupants fell off the vehicle, and were instantly trampled to death.
LJ010-0047|while in 1850 Her Majesty was the victim of another outrage at the hands of one Pate.|while in eighteen fifty Her Majesty was the victim of another outrage at the hands of one Pate.
LJ010-0061|That some thirty or more needy men should hope to revolutionize England is a sufficient proof of the absurdity of their attempt.|That some thirty or more needy men should hope to revolutionize England is a sufficient proof of the absurdity of their attempt.
LJ010-0105|Thistlewood was discovered next morning in a mean house in White Street, Moorfields.|Thistlewood was discovered next morning in a mean house in White Street, Moorfields.
LJ010-0233|Here again probably it was partly the love of notoriety which was the incentive,|Here again probably it was partly the love of notoriety which was the incentive,
LJ010-0234|backed possibly with the hope that, as in a much more recent case,|backed possibly with the hope that, as in a much more recent case,
LJ010-0258|As the Queen was driving from Buckingham Palace to the Chapel Royal,|As the Queen was driving from Buckingham Palace to the Chapel Royal,
LJ010-0262|charged him with the offense.|charged him with the offense.
LJ010-0270|exactly tallied with that of the deformed person "wanted" for the assault on the Queen.|exactly tallied with that of the deformed person "wanted" for the assault on the Queen.
LJ010-0293|I have already remarked that as violence was more and more eliminated from crimes against the person,|I have already remarked that as violence was more and more eliminated from crimes against the person,
LJ011-0009|Nothing more was heard of the affair, although the lady declared that she had never instructed Fauntleroy to sell.|Nothing more was heard of the affair, although the lady declared that she had never instructed Fauntleroy to sell.
LJ011-0256|By this time the neighbors were aroused, and several people came to the scene of the affray.|By this time the neighbors were aroused, and several people came to the scene of the affray.
LJ012-0044|When his trade was busiest he set up a second establishment, at the head of which, although he was married,|When his trade was busiest he set up a second establishment, at the head of which, although he was married,
LJ012-0145|Solomons was now also admitted as a witness, and his evidence, with that of Moss, secured the transportation of the principal actors in the theft.|Solomons was now also admitted as a witness, and his evidence, with that of Moss, secured the transportation of the principal actors in the theft.
LJ013-0020|he acted in a manner which excited the suspicions of the crew.|he acted in a manner which excited the suspicions of the crew.
LJ013-0077|Barber and Fletcher were both transported for life, although Fletcher declared that Barber was innocent, and had no guilty knowledge of what was being done.|Barber and Fletcher were both transported for life, although Fletcher declared that Barber was innocent, and had no guilty knowledge of what was being done.
LJ013-0228|In the pocket of the coat Mr. Cope, the governor, found a neatly-folded cloth, and asked what it was for.|In the pocket of the coat Mr. Cope, the governor, found a neatly-folded cloth, and asked what it was for.
LJ014-0020|He was soon afterwards arrested on suspicion, and a search of his lodgings brought to light several garments saturated with blood;|He was soon afterwards arrested on suspicion, and a search of his lodgings brought to light several garments saturated with blood;
LJ014-0054|a maidservant, Sarah Thomas, murdered her mistress, an aged woman, by beating out her brains with a stone.|a maidservant, Sarah Thomas, murdered her mistress, an aged woman, by beating out her brains with a stone.
LJ014-0101|he found that it was soft and new, while elsewhere it was set and hard.|he found that it was soft and new, while elsewhere it was set and hard.
LJ014-0103|beneath them was a layer of fresh mortar, beneath that a lot of loose earth, amongst which a stocking was turned up, and presently a human toe.|beneath them was a layer of fresh mortar, beneath that a lot of loose earth, amongst which a stocking was turned up, and presently a human toe.
LJ014-0263|When other pleasures palled he took a theatre, and posed as a munificent patron of the dramatic art.|When other pleasures palled he took a theatre, and posed as a munificent patron of the dramatic art.
LJ014-0272|and 1850 to embezzle and apply to his own purposes some £71,000.|and eighteen fifty to embezzle and apply to his own purposes some seventy-one thousand pounds.
LJ014-0311|His extensive business had been carried on by fraud.|His extensive business had been carried on by fraud.
LJ015-0197|which at one time spread terror throughout London. Thieves preferred now to use ingenuity rather than brute force.|which at one time spread terror throughout London. Thieves preferred now to use ingenuity rather than brute force.
LJ016-0089|He was engaged in whitewashing and cleaning; the officer who had him in charge left him on the stairs leading to the gallery.|He was engaged in whitewashing and cleaning; the officer who had him in charge left him on the stairs leading to the gallery.
LJ016-0407|who generally attended the prison services.|who generally attended the prison services.
LJ016-0443|He was promptly rescued from his perilous condition, but not before his face and hands were badly scorched.|He was promptly rescued from his perilous condition, but not before his face and hands were badly scorched.
LJ017-0033|a medical practitioner, charged with doing to death persons who relied upon his professional skill.|a medical practitioner, charged with doing to death persons who relied upon his professional skill.
LJ017-0038|That the administration of justice should never be interfered with by local prejudice or local feeling|That the administration of justice should never be interfered with by local prejudice or local feeling
LJ018-0018|he wore gold-rimmed eye-glasses and a gold watch and chain.|he wore gold-rimmed eye-glasses and a gold watch and chain.
LJ018-0119|His offer was not, however, accepted.|His offer was not, however, accepted.
LJ018-0280|The commercial experience of these clever rogues was cosmopolitan.|The commercial experience of these clever rogues was cosmopolitan.
LJ019-0178|and abandoned because of the expense. As to the entire reconstruction of Newgate, nothing had been done as yet.|and abandoned because of the expense. As to the entire reconstruction of Newgate, nothing had been done as yet.
LJ019-0240|But no structural alterations were made from the date first quoted until the time of closing the prison in 1881.|But no structural alterations were made from the date first quoted until the time of closing the prison in eighteen eighty-one.
LJ021-0049|and the curtailment of rank stock speculation through the Securities Exchange Act.|and the curtailment of rank stock speculation through the Securities Exchange Act.
LJ021-0155|both directly on the public works themselves, and indirectly in the industries supplying the materials for these public works.|both directly on the public works themselves, and indirectly in the industries supplying the materials for these public works.
LJ022-0046|It is true that while business and industry are definitely better our relief rolls are still too large.|It is true that while business and industry are definitely better our relief rolls are still too large.
LJ022-0173|for the regulation of transportation by water, for the strengthening of our Merchant Marine and Air Transport,|for the regulation of transportation by water, for the strengthening of our Merchant Marine and Air Transport,
LJ024-0087|I have thus explained to you the reasons that lie behind our efforts to secure results by legislation within the Constitution.|I have thus explained to you the reasons that lie behind our efforts to secure results by legislation within the Constitution.
LJ024-0110|And the strategy of that last stand is to suggest the time-consuming process of amendment in order to kill off by delay|And the strategy of that last stand is to suggest the time-consuming process of amendment in order to kill off by delay
LJ024-0119|When before have you found them really at your side in your fights for progress?|When before have you found them really at your side in your fights for progress?
LJ025-0091|as it was current among contemporary chemists.|as it was current among contemporary chemists.
LJ026-0029|so in the case under discussion.|so in the case under discussion.
LJ026-0039|the earliest organisms were protists and that from them animals and plants were evolved along divergent lines of descent.|the earliest organisms were protists and that from them animals and plants were evolved along divergent lines of descent.
LJ026-0064|but unlike that of the animal, it is not chiefly an income of foods, but only of the raw materials of food.|but unlike that of the animal, it is not chiefly an income of foods, but only of the raw materials of food.
LJ026-0105|This is done by diastase, an enzyme of plant cells.|This is done by diastase, an enzyme of plant cells.
LJ026-0137|and be laid down as "reserve starch" in the cells of root or stem or elsewhere.|and be laid down as "reserve starch" in the cells of root or stem or elsewhere.
LJ027-0006|In all these lines the facts are drawn together by a strong thread of unity.|In all these lines the facts are drawn together by a strong thread of unity.
LJ028-0134|He also erected what is called a pensile paradise:|He also erected what is called a pensile paradise:
LJ028-0138|perhaps the tales that travelers told him were exaggerated as travelers' tales are likely to be,|perhaps the tales that travelers told him were exaggerated as travelers' tales are likely to be,
LJ028-0189|The fall of Babylon with its lofty walls was a most important event in the history of the ancient world.|The fall of Babylon with its lofty walls was a most important event in the history of the ancient world.
LJ028-0281|Till mules foal ye shall not take our city, he thought, as he reflected on this speech, that Babylon might now be taken,|Till mules foal ye shall not take our city, he thought, as he reflected on this speech, that Babylon might now be taken,
LJ029-0188|Stevenson was jeered, jostled, and spat upon by hostile demonstrators outside the Dallas Memorial Auditorium Theater.|Stevenson was jeered, jostled, and spat upon by hostile demonstrators outside the Dallas Memorial Auditorium Theater.
LJ030-0098|The remainder of the motorcade consisted of five cars for other dignitaries, including the mayor of Dallas and Texas Congressmen,|The remainder of the motorcade consisted of five cars for other dignitaries, including the mayor of Dallas and Texas Congressmen,
LJ031-0007|Chief of Police Curry and police motorcyclists at the head of the motorcade led the way to the hospital.|Chief of Police Curry and police motorcyclists at the head of the motorcade led the way to the hospital.
LJ031-0091|You have to determine which things, which are immediately life threatening and cope with them, before attempting to evaluate the full extent of the injuries.|You have to determine which things, which are immediately life threatening and cope with them, before attempting to evaluate the full extent of the injuries.
LJ031-0227|The doctors traced the course of the bullet through the body and, as information was received from Parkland Hospital,|The doctors traced the course of the bullet through the body and, as information was received from Parkland Hospital,
LJ032-0100|Marina Oswald|Marina Oswald
LJ032-0165|to the exclusion of all others because there are not enough microscopic characteristics present in fibers.|to the exclusion of all others because there are not enough microscopic characteristics present in fibers.
LJ032-0198|During the period from March 2, 1963, to April 24, 1963,|During the period from March two, nineteen sixty-three, to April twenty-four, nineteen sixty-three,
LJ033-0046|went out to the garage to paint some children's blocks, and worked in the garage for half an hour or so.|went out to the garage to paint some children's blocks, and worked in the garage for half an hour or so.
LJ033-0072|I then stepped off of it and the officer picked it up in the middle and it bent so.|I then stepped off of it and the officer picked it up in the middle and it bent so.
LJ033-0135|Location of Bag|Location of Bag
LJ034-0083|The significance of Givens' observation that Oswald was carrying his clipboard|The significance of Givens' observation that Oswald was carrying his clipboard
LJ034-0179|and, quote, seemed to be sitting a little forward, end quote,|and, quote, seemed to be sitting a little forward, end quote,
LJ035-0125|Victoria Adams, who worked on the fourth floor of the Depository Building,|Victoria Adams, who worked on the fourth floor of the Depository Building,
LJ035-0162|approximately 30 to 45 seconds after Oswald's lunchroom encounter with Baker and Truly.|approximately thirty to forty-five seconds after Oswald's lunchroom encounter with Baker and Truly.
LJ035-0189|Special Agent Forrest V. Sorrels of the Secret Service, who had been in the motorcade,|Special Agent Forrest V. Sorrels of the Secret Service, who had been in the motorcade,
LJ035-0208|Oswald's known actions in the building immediately after the assassination are consistent with his having been at the southeast corner window of the sixth floor|Oswald's known actions in the building immediately after the assassination are consistent with his having been at the southeast corner window of the sixth floor
LJ036-0216|Tippit got out and started to walk around the front of the car|Tippit got out and started to walk around the front of the car
LJ037-0093|William Arthur Smith was about a block east of 10th and Patton when he heard shots.|William Arthur Smith was about a block east of tenth and Patton when he heard shots.
LJ037-0157|taken from Oswald.|taken from Oswald.
LJ037-0178|or one used Remington-Peters cartridge case, which may have been in the revolver before the shooting,|or one used Remington-Peters cartridge case, which may have been in the revolver before the shooting,
LJ037-0219|Oswald's Jacket|Oswald's Jacket
LJ037-0222|When Oswald was arrested, he did not have a jacket.|When Oswald was arrested, he did not have a jacket.
LJ038-0017|Attracted by the sound of the sirens, Mrs. Postal stepped out of the box office and walked to the curb.|Attracted by the sound of the sirens, Mrs. Postal stepped out of the box office and walked to the curb.
LJ038-0052|testified regarding the arrest of Oswald, as did the various police officers who participated in the fight.|testified regarding the arrest of Oswald, as did the various police officers who participated in the fight.
LJ038-0077|Statements of Oswald during Detention.|Statements of Oswald during Detention.
LJ038-0161|and he asked me did I know which way he was coming, and I told him, yes, he probably come down Main and turn on Houston and then back again on Elm.|and he asked me did I know which way he was coming, and I told him, yes, he probably come down Main and turn on Houston and then back again on Elm.
LJ038-0212|which appeared to be the work of a man expecting to be killed, or imprisoned, or to disappear.|which appeared to be the work of a man expecting to be killed, or imprisoned, or to disappear.
LJ039-0103|Oswald, like all Marine recruits, received training on the rifle range at distances up to 500 yards,|Oswald, like all Marine recruits, received training on the rifle range at distances up to five hundred yards,
LJ039-0149|established that they had been previously loaded and ejected from the assassination rifle,|established that they had been previously loaded and ejected from the assassination rifle,
LJ040-0107|but apparently was not able to spend as much time with them as he would have liked, because of the age gaps of 5 and 7 years,|but apparently was not able to spend as much time with them as he would have liked, because of the age gaps of five and seven years,
LJ040-0119|When Pic returned home, Mrs. Oswald tried to play down the event but Mrs. Pic took a different view and asked the Oswalds to leave.|When Pic returned home, Mrs. Oswald tried to play down the event but Mrs. Pic took a different view and asked the Oswalds to leave.
LJ040-0161|Dr. Hartogs recommended that Oswald be placed on probation on condition that he seek help and guidance through a child guidance clinic.|Dr. Hartogs recommended that Oswald be placed on probation on condition that he seek help and guidance through a child guidance clinic.
LJ040-0169|She observed that since Lee's mother worked all day, he made his own meals and spent all his time alone|She observed that since Lee's mother worked all day, he made his own meals and spent all his time alone
LJ041-0098|All the Marine Corps did was to teach you to kill and after you got out of the Marines you might be good gangsters, end quote.|All the Marine Corps did was to teach you to kill and after you got out of the Marines you might be good gangsters, end quote.
LJ042-0017|and see for himself how a revolutionary society operates, a Marxist society.|and see for himself how a revolutionary society operates, a Marxist society.
LJ042-0070|Oswald was discovered in time to thwart his attempt at suicide.|Oswald was discovered in time to thwart his attempt at suicide.
LJ042-0161|Immediately after serving out his 3 years in the U.S. Marine Corps, he abandoned his American life to seek a new life in the USSR.|Immediately after serving out his three years in the U.S. Marine Corps, he abandoned his American life to seek a new life in the USSR.
LJ043-0147|He had left a note for his wife telling her what to do in case he were apprehended, as well as his notebook and the pictures of himself holding the rifle.|He had left a note for his wife telling her what to do in case he were apprehended, as well as his notebook and the pictures of himself holding the rifle.
LJ043-0178|as, in fact, one of them did appear after the assassination.|as, in fact, one of them did appear after the assassination.
LJ043-0183|Oswald did not lack the determination and other traits required|Oswald did not lack the determination and other traits required
LJ043-0185|Some idea of what he thought was sufficient reason for such an act may be found in the nature of the motive that he stated for his attack on General Walker.|Some idea of what he thought was sufficient reason for such an act may be found in the nature of the motive that he stated for his attack on General Walker.
LJ044-0057|extensive investigation was not able to connect Oswald with that address, although it did develop the fact|extensive investigation was not able to connect Oswald with that address, although it did develop the fact
LJ044-0109|It is good to know that movements in support of fair play for Cuba has developed in New Orleans as well as in other cities.|It is good to know that movements in support of fair play for Cuba has developed in New Orleans as well as in other cities.
LJ045-0081|Although she denied it in some of her testimony before the Commission,|Although she denied it in some of her testimony before the Commission,
LJ045-0147|She asked Oswald, quote,|She asked Oswald, quote,
LJ045-0204|he had never found anything to which he felt he could really belong.|he had never found anything to which he felt he could really belong.
LJ046-0193|and 12 to 15 of these cases as highly dangerous risks.|and twelve to fifteen of these cases as highly dangerous risks.
LJ046-0244|PRS should have investigated and been prepared to guard against it.|PRS should have investigated and been prepared to guard against it.
LJ047-0059|However, pursuant to a regular Bureau practice of interviewing certain immigrants from Iron Curtain countries,|However, pursuant to a regular Bureau practice of interviewing certain immigrants from Iron Curtain countries,
LJ047-0142|The Bureau had no earlier information suggesting that Oswald had left the United States.|The Bureau had no earlier information suggesting that Oswald had left the United States.
LJ048-0035|It was against this background and consistent with the criteria followed by the FBI prior to November 22|It was against this background and consistent with the criteria followed by the FBI prior to November twenty-two
LJ048-0063|The formal FBI instructions to its agents outlining the information to be referred to the Secret Service were too narrow at the time of the assassination.|The formal FBI instructions to its agents outlining the information to be referred to the Secret Service were too narrow at the time of the assassination.
LJ048-0104|There were far safer routes via freeways directly to the Trade Mart,|There were far safer routes via freeways directly to the Trade Mart,
LJ048-0187|In addition, Secret Service agents riding in the motorcade were trained to scan buildings as part of their general observation of the crowd of spectators.|In addition, Secret Service agents riding in the motorcade were trained to scan buildings as part of their general observation of the crowd of spectators.
LJ048-0271|will be cause for removal from the Service, end quote.|will be cause for removal from the Service, end quote.
LJ049-0031|The Presidential vehicle in use in Dallas, described in chapter 2,|The Presidential vehicle in use in Dallas, described in chapter two,
LJ049-0059|Agents are instructed that it is not their responsibility to investigate or evaluate a present danger,|Agents are instructed that it is not their responsibility to investigate or evaluate a present danger,
LJ049-0174|to notify the Secret Service of the substantial information about Lee Harvey Oswald which the FBI had accumulated|to notify the Secret Service of the substantial information about Lee Harvey Oswald which the FBI had accumulated
LJ050-0049|and from a specialist in psychiatric prognostication at Walter Reed Hospital.|and from a specialist in psychiatric prognostication at Walter Reed Hospital.
LJ050-0113|Such agreements should describe in detail the information which is sought, the manner in which it will be provided to the Secret Service,|Such agreements should describe in detail the information which is sought, the manner in which it will be provided to the Secret Service,
LJ050-0150|Its present manual filing system is obsolete;|Its present manual filing system is obsolete;
LJ050-0189|that written instructions might come into the hands of local newspapers, to the prejudice of the precautions described.|that written instructions might come into the hands of local newspapers, to the prejudice of the precautions described.

25
vocoder/fregan/README.md Normal file
View File

@ -0,0 +1,25 @@
# Fre-GAN Vocoder
[Fre-GAN: Adversarial Frequency-consistent Audio Synthesis](https://arxiv.org/abs/2106.02297)
## Training:
```
python train.py --config config.json
```
## Citation:
```
@misc{kim2021fregan,
title={Fre-GAN: Adversarial Frequency-consistent Audio Synthesis},
author={Ji-Hoon Kim and Sang-Hoon Lee and Ji-Hyun Lee and Seong-Whan Lee},
year={2021},
eprint={2106.02297},
archivePrefix={arXiv},
primaryClass={eess.AS}
}
```
## Note
* For more complete and end to end Voice cloning or Text to Speech (TTS) toolbox please visit [Deepsync Technologies](https://deepsync.co/).
## References:
* [Hi-Fi-GAN repo](https://github.com/jik876/hifi-gan)
* [WaveSNet repo](https://github.com/LiQiufu/WaveSNet)

View File

@ -0,0 +1,41 @@
{
"resblock": "1",
"num_gpus": 0,
"batch_size": 16,
"learning_rate": 0.0002,
"adam_b1": 0.8,
"adam_b2": 0.99,
"lr_decay": 0.999,
"seed": 1234,
"upsample_rates": [5,5,2,2,2],
"upsample_kernel_sizes": [10,10,4,4,4],
"upsample_initial_channel": 512,
"resblock_kernel_sizes": [3,7,11],
"resblock_dilation_sizes": [[1, 3, 5, 7], [1,3,5,7], [1,3,5,7]],
"segment_size": 6400,
"num_mels": 80,
"num_freq": 1025,
"n_fft": 1024,
"hop_size": 200,
"win_size": 800,
"sampling_rate": 16000,
"fmin": 0,
"fmax": 7600,
"fmax_for_loss": null,
"num_workers": 4,
"dist_config": {
"dist_backend": "nccl",
"dist_url": "tcp://localhost:54321",
"world_size": 1
}
}

View File

@ -0,0 +1,303 @@
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.nn import Conv1d, AvgPool1d, Conv2d
from torch.nn.utils import weight_norm, spectral_norm
from vocoder.fregan.utils import get_padding
from vocoder.fregan.stft_loss import stft
from vocoder.fregan.dwt import DWT_1D
LRELU_SLOPE = 0.1
class SpecDiscriminator(nn.Module):
"""docstring for Discriminator."""
def __init__(self, fft_size=1024, shift_size=120, win_length=600, window="hann_window", use_spectral_norm=False):
super(SpecDiscriminator, self).__init__()
norm_f = weight_norm if use_spectral_norm == False else spectral_norm
self.fft_size = fft_size
self.shift_size = shift_size
self.win_length = win_length
self.window = getattr(torch, window)(win_length)
self.discriminators = nn.ModuleList([
norm_f(nn.Conv2d(1, 32, kernel_size=(3, 9), padding=(1, 4))),
norm_f(nn.Conv2d(32, 32, kernel_size=(3, 9), stride=(1,2), padding=(1, 4))),
norm_f(nn.Conv2d(32, 32, kernel_size=(3, 9), stride=(1,2), padding=(1, 4))),
norm_f(nn.Conv2d(32, 32, kernel_size=(3, 9), stride=(1,2), padding=(1, 4))),
norm_f(nn.Conv2d(32, 32, kernel_size=(3, 3), stride=(1,1), padding=(1, 1))),
])
self.out = norm_f(nn.Conv2d(32, 1, 3, 1, 1))
def forward(self, y):
fmap = []
with torch.no_grad():
y = y.squeeze(1)
y = stft(y, self.fft_size, self.shift_size, self.win_length, self.window.to(y.get_device()))
y = y.unsqueeze(1)
for i, d in enumerate(self.discriminators):
y = d(y)
y = F.leaky_relu(y, LRELU_SLOPE)
fmap.append(y)
y = self.out(y)
fmap.append(y)
return torch.flatten(y, 1, -1), fmap
class MultiResSpecDiscriminator(torch.nn.Module):
def __init__(self,
fft_sizes=[1024, 2048, 512],
hop_sizes=[120, 240, 50],
win_lengths=[600, 1200, 240],
window="hann_window"):
super(MultiResSpecDiscriminator, self).__init__()
self.discriminators = nn.ModuleList([
SpecDiscriminator(fft_sizes[0], hop_sizes[0], win_lengths[0], window),
SpecDiscriminator(fft_sizes[1], hop_sizes[1], win_lengths[1], window),
SpecDiscriminator(fft_sizes[2], hop_sizes[2], win_lengths[2], window)
])
def forward(self, y, y_hat):
y_d_rs = []
y_d_gs = []
fmap_rs = []
fmap_gs = []
for i, d in enumerate(self.discriminators):
y_d_r, fmap_r = d(y)
y_d_g, fmap_g = d(y_hat)
y_d_rs.append(y_d_r)
fmap_rs.append(fmap_r)
y_d_gs.append(y_d_g)
fmap_gs.append(fmap_g)
return y_d_rs, y_d_gs, fmap_rs, fmap_gs
class DiscriminatorP(torch.nn.Module):
def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
super(DiscriminatorP, self).__init__()
self.period = period
norm_f = weight_norm if use_spectral_norm == False else spectral_norm
self.dwt1d = DWT_1D()
self.dwt_conv1 = norm_f(Conv1d(2, 1, 1))
self.dwt_proj1 = norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0)))
self.dwt_conv2 = norm_f(Conv1d(4, 1, 1))
self.dwt_proj2 = norm_f(Conv2d(1, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0)))
self.dwt_conv3 = norm_f(Conv1d(8, 1, 1))
self.dwt_proj3 = norm_f(Conv2d(1, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0)))
self.convs = nn.ModuleList([
norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))),
])
self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
def forward(self, x):
fmap = []
# DWT 1
x_d1_high1, x_d1_low1 = self.dwt1d(x)
x_d1 = self.dwt_conv1(torch.cat([x_d1_high1, x_d1_low1], dim=1))
# 1d to 2d
b, c, t = x_d1.shape
if t % self.period != 0: # pad first
n_pad = self.period - (t % self.period)
x_d1 = F.pad(x_d1, (0, n_pad), "reflect")
t = t + n_pad
x_d1 = x_d1.view(b, c, t // self.period, self.period)
x_d1 = self.dwt_proj1(x_d1)
# DWT 2
x_d2_high1, x_d2_low1 = self.dwt1d(x_d1_high1)
x_d2_high2, x_d2_low2 = self.dwt1d(x_d1_low1)
x_d2 = self.dwt_conv2(torch.cat([x_d2_high1, x_d2_low1, x_d2_high2, x_d2_low2], dim=1))
# 1d to 2d
b, c, t = x_d2.shape
if t % self.period != 0: # pad first
n_pad = self.period - (t % self.period)
x_d2 = F.pad(x_d2, (0, n_pad), "reflect")
t = t + n_pad
x_d2 = x_d2.view(b, c, t // self.period, self.period)
x_d2 = self.dwt_proj2(x_d2)
# DWT 3
x_d3_high1, x_d3_low1 = self.dwt1d(x_d2_high1)
x_d3_high2, x_d3_low2 = self.dwt1d(x_d2_low1)
x_d3_high3, x_d3_low3 = self.dwt1d(x_d2_high2)
x_d3_high4, x_d3_low4 = self.dwt1d(x_d2_low2)
x_d3 = self.dwt_conv3(
torch.cat([x_d3_high1, x_d3_low1, x_d3_high2, x_d3_low2, x_d3_high3, x_d3_low3, x_d3_high4, x_d3_low4],
dim=1))
# 1d to 2d
b, c, t = x_d3.shape
if t % self.period != 0: # pad first
n_pad = self.period - (t % self.period)
x_d3 = F.pad(x_d3, (0, n_pad), "reflect")
t = t + n_pad
x_d3 = x_d3.view(b, c, t // self.period, self.period)
x_d3 = self.dwt_proj3(x_d3)
# 1d to 2d
b, c, t = x.shape
if t % self.period != 0: # pad first
n_pad = self.period - (t % self.period)
x = F.pad(x, (0, n_pad), "reflect")
t = t + n_pad
x = x.view(b, c, t // self.period, self.period)
i = 0
for l in self.convs:
x = l(x)
x = F.leaky_relu(x, LRELU_SLOPE)
fmap.append(x)
if i == 0:
x = torch.cat([x, x_d1], dim=2)
elif i == 1:
x = torch.cat([x, x_d2], dim=2)
elif i == 2:
x = torch.cat([x, x_d3], dim=2)
else:
x = x
i = i + 1
x = self.conv_post(x)
fmap.append(x)
x = torch.flatten(x, 1, -1)
return x, fmap
class ResWiseMultiPeriodDiscriminator(torch.nn.Module):
def __init__(self):
super(ResWiseMultiPeriodDiscriminator, self).__init__()
self.discriminators = nn.ModuleList([
DiscriminatorP(2),
DiscriminatorP(3),
DiscriminatorP(5),
DiscriminatorP(7),
DiscriminatorP(11),
])
def forward(self, y, y_hat):
y_d_rs = []
y_d_gs = []
fmap_rs = []
fmap_gs = []
for i, d in enumerate(self.discriminators):
y_d_r, fmap_r = d(y)
y_d_g, fmap_g = d(y_hat)
y_d_rs.append(y_d_r)
fmap_rs.append(fmap_r)
y_d_gs.append(y_d_g)
fmap_gs.append(fmap_g)
return y_d_rs, y_d_gs, fmap_rs, fmap_gs
class DiscriminatorS(torch.nn.Module):
def __init__(self, use_spectral_norm=False):
super(DiscriminatorS, self).__init__()
norm_f = weight_norm if use_spectral_norm == False else spectral_norm
self.dwt1d = DWT_1D()
self.dwt_conv1 = norm_f(Conv1d(2, 128, 15, 1, padding=7))
self.dwt_conv2 = norm_f(Conv1d(4, 128, 41, 2, padding=20))
self.convs = nn.ModuleList([
norm_f(Conv1d(1, 128, 15, 1, padding=7)),
norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)),
norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)),
norm_f(Conv1d(256, 512, 41, 4, groups=16, padding=20)),
norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)),
norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)),
norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
])
self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
def forward(self, x):
fmap = []
# DWT 1
x_d1_high1, x_d1_low1 = self.dwt1d(x)
x_d1 = self.dwt_conv1(torch.cat([x_d1_high1, x_d1_low1], dim=1))
# DWT 2
x_d2_high1, x_d2_low1 = self.dwt1d(x_d1_high1)
x_d2_high2, x_d2_low2 = self.dwt1d(x_d1_low1)
x_d2 = self.dwt_conv2(torch.cat([x_d2_high1, x_d2_low1, x_d2_high2, x_d2_low2], dim=1))
i = 0
for l in self.convs:
x = l(x)
x = F.leaky_relu(x, LRELU_SLOPE)
fmap.append(x)
if i == 0:
x = torch.cat([x, x_d1], dim=2)
if i == 1:
x = torch.cat([x, x_d2], dim=2)
i = i + 1
x = self.conv_post(x)
fmap.append(x)
x = torch.flatten(x, 1, -1)
return x, fmap
class ResWiseMultiScaleDiscriminator(torch.nn.Module):
def __init__(self, use_spectral_norm=False):
super(ResWiseMultiScaleDiscriminator, self).__init__()
norm_f = weight_norm if use_spectral_norm == False else spectral_norm
self.dwt1d = DWT_1D()
self.dwt_conv1 = norm_f(Conv1d(2, 1, 1))
self.dwt_conv2 = norm_f(Conv1d(4, 1, 1))
self.discriminators = nn.ModuleList([
DiscriminatorS(use_spectral_norm=True),
DiscriminatorS(),
DiscriminatorS(),
])
def forward(self, y, y_hat):
y_d_rs = []
y_d_gs = []
fmap_rs = []
fmap_gs = []
# DWT 1
y_hi, y_lo = self.dwt1d(y)
y_1 = self.dwt_conv1(torch.cat([y_hi, y_lo], dim=1))
x_d1_high1, x_d1_low1 = self.dwt1d(y_hat)
y_hat_1 = self.dwt_conv1(torch.cat([x_d1_high1, x_d1_low1], dim=1))
# DWT 2
x_d2_high1, x_d2_low1 = self.dwt1d(y_hi)
x_d2_high2, x_d2_low2 = self.dwt1d(y_lo)
y_2 = self.dwt_conv2(torch.cat([x_d2_high1, x_d2_low1, x_d2_high2, x_d2_low2], dim=1))
x_d2_high1, x_d2_low1 = self.dwt1d(x_d1_high1)
x_d2_high2, x_d2_low2 = self.dwt1d(x_d1_low1)
y_hat_2 = self.dwt_conv2(torch.cat([x_d2_high1, x_d2_low1, x_d2_high2, x_d2_low2], dim=1))
for i, d in enumerate(self.discriminators):
if i == 1:
y = y_1
y_hat = y_hat_1
if i == 2:
y = y_2
y_hat = y_hat_2
y_d_r, fmap_r = d(y)
y_d_g, fmap_g = d(y_hat)
y_d_rs.append(y_d_r)
fmap_rs.append(fmap_r)
y_d_gs.append(y_d_g)
fmap_gs.append(fmap_g)
return y_d_rs, y_d_gs, fmap_rs, fmap_gs

76
vocoder/fregan/dwt.py Normal file
View File

@ -0,0 +1,76 @@
# Copyright (c) 2019, Adobe Inc. All rights reserved.
#
# This work is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike
# 4.0 International Public License. To view a copy of this license, visit
# https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.
# DWT code borrow from https://github.com/LiQiufu/WaveSNet/blob/12cb9d24208c3d26917bf953618c30f0c6b0f03d/DWT_IDWT/DWT_IDWT_layer.py
import pywt
import torch
import torch.nn as nn
import torch.nn.functional as F
__all__ = ['DWT_1D']
Pad_Mode = ['constant', 'reflect', 'replicate', 'circular']
class DWT_1D(nn.Module):
def __init__(self, pad_type='reflect', wavename='haar',
stride=2, in_channels=1, out_channels=None, groups=None,
kernel_size=None, trainable=False):
super(DWT_1D, self).__init__()
self.trainable = trainable
self.kernel_size = kernel_size
if not self.trainable:
assert self.kernel_size == None
self.in_channels = in_channels
self.out_channels = self.in_channels if out_channels == None else out_channels
self.groups = self.in_channels if groups == None else groups
assert isinstance(self.groups, int) and self.in_channels % self.groups == 0
self.stride = stride
assert self.stride == 2
self.wavename = wavename
self.pad_type = pad_type
assert self.pad_type in Pad_Mode
self.get_filters()
self.initialization()
def get_filters(self):
wavelet = pywt.Wavelet(self.wavename)
band_low = torch.tensor(wavelet.rec_lo)
band_high = torch.tensor(wavelet.rec_hi)
length_band = band_low.size()[0]
self.kernel_size = length_band if self.kernel_size == None else self.kernel_size
assert self.kernel_size >= length_band
a = (self.kernel_size - length_band) // 2
b = - (self.kernel_size - length_band - a)
b = None if b == 0 else b
self.filt_low = torch.zeros(self.kernel_size)
self.filt_high = torch.zeros(self.kernel_size)
self.filt_low[a:b] = band_low
self.filt_high[a:b] = band_high
def initialization(self):
self.filter_low = self.filt_low[None, None, :].repeat((self.out_channels, self.in_channels // self.groups, 1))
self.filter_high = self.filt_high[None, None, :].repeat((self.out_channels, self.in_channels // self.groups, 1))
if torch.cuda.is_available():
self.filter_low = self.filter_low.cuda()
self.filter_high = self.filter_high.cuda()
if self.trainable:
self.filter_low = nn.Parameter(self.filter_low)
self.filter_high = nn.Parameter(self.filter_high)
if self.kernel_size % 2 == 0:
self.pad_sizes = [self.kernel_size // 2 - 1, self.kernel_size // 2 - 1]
else:
self.pad_sizes = [self.kernel_size // 2, self.kernel_size // 2]
def forward(self, input):
assert isinstance(input, torch.Tensor)
assert len(input.size()) == 3
assert input.size()[1] == self.in_channels
input = F.pad(input, pad=self.pad_sizes, mode=self.pad_type)
return F.conv1d(input, self.filter_low.to(input.device), stride=self.stride, groups=self.groups), \
F.conv1d(input, self.filter_high.to(input.device), stride=self.stride, groups=self.groups)

210
vocoder/fregan/generator.py Normal file
View File

@ -0,0 +1,210 @@
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
from vocoder.fregan.utils import init_weights, get_padding
LRELU_SLOPE = 0.1
class ResBlock1(torch.nn.Module):
def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5, 7)):
super(ResBlock1, self).__init__()
self.h = h
self.convs1 = nn.ModuleList([
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
padding=get_padding(kernel_size, dilation[0]))),
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
padding=get_padding(kernel_size, dilation[1]))),
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
padding=get_padding(kernel_size, dilation[2]))),
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[3],
padding=get_padding(kernel_size, dilation[3])))
])
self.convs1.apply(init_weights)
self.convs2 = nn.ModuleList([
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
padding=get_padding(kernel_size, 1))),
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
padding=get_padding(kernel_size, 1))),
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
padding=get_padding(kernel_size, 1))),
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
padding=get_padding(kernel_size, 1)))
])
self.convs2.apply(init_weights)
def forward(self, x):
for c1, c2 in zip(self.convs1, self.convs2):
xt = F.leaky_relu(x, LRELU_SLOPE)
xt = c1(xt)
xt = F.leaky_relu(xt, LRELU_SLOPE)
xt = c2(xt)
x = xt + x
return x
def remove_weight_norm(self):
for l in self.convs1:
remove_weight_norm(l)
for l in self.convs2:
remove_weight_norm(l)
class ResBlock2(torch.nn.Module):
def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)):
super(ResBlock2, self).__init__()
self.h = h
self.convs = nn.ModuleList([
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
padding=get_padding(kernel_size, dilation[0]))),
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
padding=get_padding(kernel_size, dilation[1])))
])
self.convs.apply(init_weights)
def forward(self, x):
for c in self.convs:
xt = F.leaky_relu(x, LRELU_SLOPE)
xt = c(xt)
x = xt + x
return x
def remove_weight_norm(self):
for l in self.convs:
remove_weight_norm(l)
class FreGAN(torch.nn.Module):
def __init__(self, h, top_k=4):
super(FreGAN, self).__init__()
self.h = h
self.num_kernels = len(h.resblock_kernel_sizes)
self.num_upsamples = len(h.upsample_rates)
self.upsample_rates = h.upsample_rates
self.up_kernels = h.upsample_kernel_sizes
self.cond_level = self.num_upsamples - top_k
self.conv_pre = weight_norm(Conv1d(80, h.upsample_initial_channel, 7, 1, padding=3))
resblock = ResBlock1 if h.resblock == '1' else ResBlock2
self.ups = nn.ModuleList()
self.cond_up = nn.ModuleList()
self.res_output = nn.ModuleList()
upsample_ = 1
kr = 80
for i, (u, k) in enumerate(zip(self.upsample_rates, self.up_kernels)):
# self.ups.append(weight_norm(
# ConvTranspose1d(h.upsample_initial_channel // (2 ** i), h.upsample_initial_channel // (2 ** (i + 1)),
# k, u, padding=(k - u) // 2)))
self.ups.append(weight_norm(ConvTranspose1d(h.upsample_initial_channel//(2**i),
h.upsample_initial_channel//(2**(i+1)),
k, u, padding=(u//2 + u%2), output_padding=u%2)))
if i > (self.num_upsamples - top_k):
self.res_output.append(
nn.Sequential(
nn.Upsample(scale_factor=u, mode='nearest'),
weight_norm(nn.Conv1d(h.upsample_initial_channel // (2 ** i),
h.upsample_initial_channel // (2 ** (i + 1)), 1))
)
)
if i >= (self.num_upsamples - top_k):
self.cond_up.append(
weight_norm(
ConvTranspose1d(kr, h.upsample_initial_channel // (2 ** i),
self.up_kernels[i - 1], self.upsample_rates[i - 1],
padding=(self.upsample_rates[i-1]//2+self.upsample_rates[i-1]%2), output_padding=self.upsample_rates[i-1]%2))
)
kr = h.upsample_initial_channel // (2 ** i)
upsample_ *= u
self.resblocks = nn.ModuleList()
for i in range(len(self.ups)):
ch = h.upsample_initial_channel // (2 ** (i + 1))
for j, (k, d) in enumerate(zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)):
self.resblocks.append(resblock(h, ch, k, d))
self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
self.ups.apply(init_weights)
self.conv_post.apply(init_weights)
self.cond_up.apply(init_weights)
self.res_output.apply(init_weights)
def forward(self, x):
mel = x
x = self.conv_pre(x)
output = None
for i in range(self.num_upsamples):
if i >= self.cond_level:
mel = self.cond_up[i - self.cond_level](mel)
x += mel
if i > self.cond_level:
if output is None:
output = self.res_output[i - self.cond_level - 1](x)
else:
output = self.res_output[i - self.cond_level - 1](output)
x = F.leaky_relu(x, LRELU_SLOPE)
x = self.ups[i](x)
xs = None
for j in range(self.num_kernels):
if xs is None:
xs = self.resblocks[i * self.num_kernels + j](x)
else:
xs += self.resblocks[i * self.num_kernels + j](x)
x = xs / self.num_kernels
if output is not None:
output = output + x
x = F.leaky_relu(output)
x = self.conv_post(x)
x = torch.tanh(x)
return x
def remove_weight_norm(self):
print('Removing weight norm...')
for l in self.ups:
remove_weight_norm(l)
for l in self.resblocks:
l.remove_weight_norm()
for l in self.cond_up:
remove_weight_norm(l)
for l in self.res_output:
remove_weight_norm(l[1])
remove_weight_norm(self.conv_pre)
remove_weight_norm(self.conv_post)
'''
to run this, fix
from . import ResStack
into
from res_stack import ResStack
'''
if __name__ == '__main__':
'''
torch.Size([3, 80, 10])
torch.Size([3, 1, 2000])
4527362
'''
with open('config.json') as f:
data = f.read()
from utils import AttrDict
import json
json_config = json.loads(data)
h = AttrDict(json_config)
model = FreGAN(h)
c = torch.randn(3, 80, 10) # (B, channels, T).
print(c.shape)
y = model(c) # (B, 1, T ** prod(upsample_scales)
print(y.shape)
assert y.shape == torch.Size([3, 1, 2560]) # For normal melgan torch.Size([3, 1, 2560])
pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(pytorch_total_params)

View File

@ -0,0 +1,70 @@
from __future__ import absolute_import, division, print_function, unicode_literals
import os
import json
import torch
from scipy.io.wavfile import write
from vocoder.hifigan.env import AttrDict
from vocoder.hifigan.meldataset import mel_spectrogram, MAX_WAV_VALUE, load_wav
from vocoder.fregan.generator import FreGAN
import soundfile as sf
generator = None # type: FreGAN
_device = None
def load_checkpoint(filepath, device):
assert os.path.isfile(filepath)
print("Loading '{}'".format(filepath))
checkpoint_dict = torch.load(filepath, map_location=device)
print("Complete.")
return checkpoint_dict
def load_model(weights_fpath, verbose=True):
global generator, _device
if verbose:
print("Building fregan")
with open("./vocoder/fregan/config.json") as f:
data = f.read()
json_config = json.loads(data)
h = AttrDict(json_config)
torch.manual_seed(h.seed)
if torch.cuda.is_available():
# _model = _model.cuda()
_device = torch.device('cuda')
else:
_device = torch.device('cpu')
generator = FreGAN(h).to(_device)
state_dict_g = load_checkpoint(
weights_fpath, _device
)
generator.load_state_dict(state_dict_g['generator'])
generator.eval()
generator.remove_weight_norm()
def is_loaded():
return generator is not None
def infer_waveform(mel, progress_callback=None):
if generator is None:
raise Exception("Please load fre-gan in memory before using it")
mel = torch.FloatTensor(mel).to(_device)
mel = mel.unsqueeze(0)
with torch.no_grad():
y_g_hat = generator(mel)
audio = y_g_hat.squeeze()
audio = audio.cpu().numpy()
return audio

35
vocoder/fregan/loss.py Normal file
View File

@ -0,0 +1,35 @@
import torch
def feature_loss(fmap_r, fmap_g):
loss = 0
for dr, dg in zip(fmap_r, fmap_g):
for rl, gl in zip(dr, dg):
loss += torch.mean(torch.abs(rl - gl))
return loss*2
def discriminator_loss(disc_real_outputs, disc_generated_outputs):
loss = 0
r_losses = []
g_losses = []
for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
r_loss = torch.mean((1-dr)**2)
g_loss = torch.mean(dg**2)
loss += (r_loss + g_loss)
r_losses.append(r_loss.item())
g_losses.append(g_loss.item())
return loss, r_losses, g_losses
def generator_loss(disc_outputs):
loss = 0
gen_losses = []
for dg in disc_outputs:
l = torch.mean((1-dg)**2)
gen_losses.append(l)
loss += l
return loss, gen_losses

View File

@ -0,0 +1,176 @@
import math
import os
import random
import torch
import torch.utils.data
import numpy as np
from librosa.util import normalize
from scipy.io.wavfile import read
from librosa.filters import mel as librosa_mel_fn
MAX_WAV_VALUE = 32768.0
def load_wav(full_path):
sampling_rate, data = read(full_path)
return data, sampling_rate
def dynamic_range_compression(x, C=1, clip_val=1e-5):
return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)
def dynamic_range_decompression(x, C=1):
return np.exp(x) / C
def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
return torch.log(torch.clamp(x, min=clip_val) * C)
def dynamic_range_decompression_torch(x, C=1):
return torch.exp(x) / C
def spectral_normalize_torch(magnitudes):
output = dynamic_range_compression_torch(magnitudes)
return output
def spectral_de_normalize_torch(magnitudes):
output = dynamic_range_decompression_torch(magnitudes)
return output
mel_basis = {}
hann_window = {}
def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
if torch.min(y) < -1.:
print('min value is ', torch.min(y))
if torch.max(y) > 1.:
print('max value is ', torch.max(y))
global mel_basis, hann_window
if fmax not in mel_basis:
mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax)
mel_basis[str(fmax)+'_'+str(y.device)] = torch.from_numpy(mel).float().to(y.device)
hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device)
y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
y = y.squeeze(1)
spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[str(y.device)],
center=center, pad_mode='reflect', normalized=False, onesided=True)
spec = torch.sqrt(spec.pow(2).sum(-1)+(1e-9))
spec = torch.matmul(mel_basis[str(fmax)+'_'+str(y.device)], spec)
spec = spectral_normalize_torch(spec)
return spec
def get_dataset_filelist(a):
#with open(a.input_training_file, 'r', encoding='utf-8') as fi:
# training_files = [os.path.join(a.input_wavs_dir, x.split('|')[0] + '.wav')
# for x in fi.read().split('\n') if len(x) > 0]
#with open(a.input_validation_file, 'r', encoding='utf-8') as fi:
# validation_files = [os.path.join(a.input_wavs_dir, x.split('|')[0] + '.wav')
# for x in fi.read().split('\n') if len(x) > 0]
files = os.listdir(a.input_wavs_dir)
random.shuffle(files)
files = [os.path.join(a.input_wavs_dir, f) for f in files]
training_files = files[: -int(len(files) * 0.05)]
validation_files = files[-int(len(files) * 0.05):]
return training_files, validation_files
class MelDataset(torch.utils.data.Dataset):
def __init__(self, training_files, segment_size, n_fft, num_mels,
hop_size, win_size, sampling_rate, fmin, fmax, split=True, shuffle=True, n_cache_reuse=1,
device=None, fmax_loss=None, fine_tuning=False, base_mels_path=None):
self.audio_files = training_files
random.seed(1234)
if shuffle:
random.shuffle(self.audio_files)
self.segment_size = segment_size
self.sampling_rate = sampling_rate
self.split = split
self.n_fft = n_fft
self.num_mels = num_mels
self.hop_size = hop_size
self.win_size = win_size
self.fmin = fmin
self.fmax = fmax
self.fmax_loss = fmax_loss
self.cached_wav = None
self.n_cache_reuse = n_cache_reuse
self._cache_ref_count = 0
self.device = device
self.fine_tuning = fine_tuning
self.base_mels_path = base_mels_path
def __getitem__(self, index):
filename = self.audio_files[index]
if self._cache_ref_count == 0:
#audio, sampling_rate = load_wav(filename)
#audio = audio / MAX_WAV_VALUE
audio = np.load(filename)
if not self.fine_tuning:
audio = normalize(audio) * 0.95
self.cached_wav = audio
#if sampling_rate != self.sampling_rate:
# raise ValueError("{} SR doesn't match target {} SR".format(
# sampling_rate, self.sampling_rate))
self._cache_ref_count = self.n_cache_reuse
else:
audio = self.cached_wav
self._cache_ref_count -= 1
audio = torch.FloatTensor(audio)
audio = audio.unsqueeze(0)
if not self.fine_tuning:
if self.split:
if audio.size(1) >= self.segment_size:
max_audio_start = audio.size(1) - self.segment_size
audio_start = random.randint(0, max_audio_start)
audio = audio[:, audio_start:audio_start+self.segment_size]
else:
audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), 'constant')
mel = mel_spectrogram(audio, self.n_fft, self.num_mels,
self.sampling_rate, self.hop_size, self.win_size, self.fmin, self.fmax,
center=False)
else:
mel_path = os.path.join(self.base_mels_path, "mel" + "-" + filename.split("/")[-1].split("-")[-1])
mel = np.load(mel_path).T
#mel = np.load(
# os.path.join(self.base_mels_path, os.path.splitext(os.path.split(filename)[-1])[0] + '.npy'))
mel = torch.from_numpy(mel)
if len(mel.shape) < 3:
mel = mel.unsqueeze(0)
if self.split:
frames_per_seg = math.ceil(self.segment_size / self.hop_size)
if audio.size(1) >= self.segment_size:
mel_start = random.randint(0, mel.size(2) - frames_per_seg - 1)
mel = mel[:, :, mel_start:mel_start + frames_per_seg]
audio = audio[:, mel_start * self.hop_size:(mel_start + frames_per_seg) * self.hop_size]
else:
mel = torch.nn.functional.pad(mel, (0, frames_per_seg - mel.size(2)), 'constant')
audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), 'constant')
mel_loss = mel_spectrogram(audio, self.n_fft, self.num_mels,
self.sampling_rate, self.hop_size, self.win_size, self.fmin, self.fmax_loss,
center=False)
return (mel.squeeze(), audio.squeeze(0), filename, mel_loss.squeeze())
def __len__(self):
return len(self.audio_files)

201
vocoder/fregan/modules.py Normal file
View File

@ -0,0 +1,201 @@
import torch
import torch.nn.functional as F
class KernelPredictor(torch.nn.Module):
''' Kernel predictor for the location-variable convolutions
'''
def __init__(self,
cond_channels,
conv_in_channels,
conv_out_channels,
conv_layers,
conv_kernel_size=3,
kpnet_hidden_channels=64,
kpnet_conv_size=3,
kpnet_dropout=0.0,
kpnet_nonlinear_activation="LeakyReLU",
kpnet_nonlinear_activation_params={"negative_slope": 0.1}
):
'''
Args:
cond_channels (int): number of channel for the conditioning sequence,
conv_in_channels (int): number of channel for the input sequence,
conv_out_channels (int): number of channel for the output sequence,
conv_layers (int):
kpnet_
'''
super().__init__()
self.conv_in_channels = conv_in_channels
self.conv_out_channels = conv_out_channels
self.conv_kernel_size = conv_kernel_size
self.conv_layers = conv_layers
l_w = conv_in_channels * conv_out_channels * conv_kernel_size * conv_layers
l_b = conv_out_channels * conv_layers
padding = (kpnet_conv_size - 1) // 2
self.input_conv = torch.nn.Sequential(
torch.nn.Conv1d(cond_channels, kpnet_hidden_channels, 5, padding=(5 - 1) // 2, bias=True),
getattr(torch.nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params),
)
self.residual_conv = torch.nn.Sequential(
torch.nn.Dropout(kpnet_dropout),
torch.nn.Conv1d(kpnet_hidden_channels, kpnet_hidden_channels, kpnet_conv_size, padding=padding, bias=True),
getattr(torch.nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params),
torch.nn.Conv1d(kpnet_hidden_channels, kpnet_hidden_channels, kpnet_conv_size, padding=padding, bias=True),
getattr(torch.nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params),
torch.nn.Dropout(kpnet_dropout),
torch.nn.Conv1d(kpnet_hidden_channels, kpnet_hidden_channels, kpnet_conv_size, padding=padding, bias=True),
getattr(torch.nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params),
torch.nn.Conv1d(kpnet_hidden_channels, kpnet_hidden_channels, kpnet_conv_size, padding=padding, bias=True),
getattr(torch.nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params),
torch.nn.Dropout(kpnet_dropout),
torch.nn.Conv1d(kpnet_hidden_channels, kpnet_hidden_channels, kpnet_conv_size, padding=padding, bias=True),
getattr(torch.nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params),
torch.nn.Conv1d(kpnet_hidden_channels, kpnet_hidden_channels, kpnet_conv_size, padding=padding, bias=True),
getattr(torch.nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params),
)
self.kernel_conv = torch.nn.Conv1d(kpnet_hidden_channels, l_w, kpnet_conv_size,
padding=padding, bias=True)
self.bias_conv = torch.nn.Conv1d(kpnet_hidden_channels, l_b, kpnet_conv_size, padding=padding,
bias=True)
def forward(self, c):
'''
Args:
c (Tensor): the conditioning sequence (batch, cond_channels, cond_length)
Returns:
'''
batch, cond_channels, cond_length = c.shape
c = self.input_conv(c)
c = c + self.residual_conv(c)
k = self.kernel_conv(c)
b = self.bias_conv(c)
kernels = k.contiguous().view(batch,
self.conv_layers,
self.conv_in_channels,
self.conv_out_channels,
self.conv_kernel_size,
cond_length)
bias = b.contiguous().view(batch,
self.conv_layers,
self.conv_out_channels,
cond_length)
return kernels, bias
class LVCBlock(torch.nn.Module):
''' the location-variable convolutions
'''
def __init__(self,
in_channels,
cond_channels,
upsample_ratio,
conv_layers=4,
conv_kernel_size=3,
cond_hop_length=256,
kpnet_hidden_channels=64,
kpnet_conv_size=3,
kpnet_dropout=0.0
):
super().__init__()
self.cond_hop_length = cond_hop_length
self.conv_layers = conv_layers
self.conv_kernel_size = conv_kernel_size
self.convs = torch.nn.ModuleList()
self.upsample = torch.nn.ConvTranspose1d(in_channels, in_channels,
kernel_size=upsample_ratio*2, stride=upsample_ratio,
padding=upsample_ratio // 2 + upsample_ratio % 2,
output_padding=upsample_ratio % 2)
self.kernel_predictor = KernelPredictor(
cond_channels=cond_channels,
conv_in_channels=in_channels,
conv_out_channels=2 * in_channels,
conv_layers=conv_layers,
conv_kernel_size=conv_kernel_size,
kpnet_hidden_channels=kpnet_hidden_channels,
kpnet_conv_size=kpnet_conv_size,
kpnet_dropout=kpnet_dropout
)
for i in range(conv_layers):
padding = (3 ** i) * int((conv_kernel_size - 1) / 2)
conv = torch.nn.Conv1d(in_channels, in_channels, kernel_size=conv_kernel_size, padding=padding, dilation=3 ** i)
self.convs.append(conv)
def forward(self, x, c):
''' forward propagation of the location-variable convolutions.
Args:
x (Tensor): the input sequence (batch, in_channels, in_length)
c (Tensor): the conditioning sequence (batch, cond_channels, cond_length)
Returns:
Tensor: the output sequence (batch, in_channels, in_length)
'''
batch, in_channels, in_length = x.shape
kernels, bias = self.kernel_predictor(c)
x = F.leaky_relu(x, 0.2)
x = self.upsample(x)
for i in range(self.conv_layers):
y = F.leaky_relu(x, 0.2)
y = self.convs[i](y)
y = F.leaky_relu(y, 0.2)
k = kernels[:, i, :, :, :, :]
b = bias[:, i, :, :]
y = self.location_variable_convolution(y, k, b, 1, self.cond_hop_length)
x = x + torch.sigmoid(y[:, :in_channels, :]) * torch.tanh(y[:, in_channels:, :])
return x
def location_variable_convolution(self, x, kernel, bias, dilation, hop_size):
''' perform location-variable convolution operation on the input sequence (x) using the local convolution kernl.
Time: 414 μs ± 309 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each), test on NVIDIA V100.
Args:
x (Tensor): the input sequence (batch, in_channels, in_length).
kernel (Tensor): the local convolution kernel (batch, in_channel, out_channels, kernel_size, kernel_length)
bias (Tensor): the bias for the local convolution (batch, out_channels, kernel_length)
dilation (int): the dilation of convolution.
hop_size (int): the hop_size of the conditioning sequence.
Returns:
(Tensor): the output sequence after performing local convolution. (batch, out_channels, in_length).
'''
batch, in_channels, in_length = x.shape
batch, in_channels, out_channels, kernel_size, kernel_length = kernel.shape
assert in_length == (kernel_length * hop_size), "length of (x, kernel) is not matched"
padding = dilation * int((kernel_size - 1) / 2)
x = F.pad(x, (padding, padding), 'constant', 0) # (batch, in_channels, in_length + 2*padding)
x = x.unfold(2, hop_size + 2 * padding, hop_size) # (batch, in_channels, kernel_length, hop_size + 2*padding)
if hop_size < dilation:
x = F.pad(x, (0, dilation), 'constant', 0)
x = x.unfold(3, dilation,
dilation) # (batch, in_channels, kernel_length, (hop_size + 2*padding)/dilation, dilation)
x = x[:, :, :, :, :hop_size]
x = x.transpose(3, 4) # (batch, in_channels, kernel_length, dilation, (hop_size + 2*padding)/dilation)
x = x.unfold(4, kernel_size, 1) # (batch, in_channels, kernel_length, dilation, _, kernel_size)
o = torch.einsum('bildsk,biokl->bolsd', x, kernel)
o = o + bias.unsqueeze(-1).unsqueeze(-1)
o = o.contiguous().view(batch, out_channels, -1)
return o

View File

@ -0,0 +1 @@
PyWavelets

136
vocoder/fregan/stft_loss.py Normal file
View File

@ -0,0 +1,136 @@
# -*- coding: utf-8 -*-
# Copyright 2019 Tomoki Hayashi
# MIT License (https://opensource.org/licenses/MIT)
"""STFT-based Loss modules."""
import torch
import torch.nn.functional as F
def stft(x, fft_size, hop_size, win_length, window):
"""Perform STFT and convert to magnitude spectrogram.
Args:
x (Tensor): Input signal tensor (B, T).
fft_size (int): FFT size.
hop_size (int): Hop size.
win_length (int): Window length.
window (str): Window function type.
Returns:
Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1).
"""
x_stft = torch.stft(x, fft_size, hop_size, win_length, window)
real = x_stft[..., 0]
imag = x_stft[..., 1]
# NOTE(kan-bayashi): clamp is needed to avoid nan or inf
return torch.sqrt(torch.clamp(real ** 2 + imag ** 2, min=1e-7)).transpose(2, 1)
class SpectralConvergengeLoss(torch.nn.Module):
"""Spectral convergence loss module."""
def __init__(self):
"""Initilize spectral convergence loss module."""
super(SpectralConvergengeLoss, self).__init__()
def forward(self, x_mag, y_mag):
"""Calculate forward propagation.
Args:
x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
Returns:
Tensor: Spectral convergence loss value.
"""
return torch.norm(y_mag - x_mag, p="fro") / torch.norm(y_mag, p="fro")
class LogSTFTMagnitudeLoss(torch.nn.Module):
"""Log STFT magnitude loss module."""
def __init__(self):
"""Initilize los STFT magnitude loss module."""
super(LogSTFTMagnitudeLoss, self).__init__()
def forward(self, x_mag, y_mag):
"""Calculate forward propagation.
Args:
x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
Returns:
Tensor: Log STFT magnitude loss value.
"""
return F.l1_loss(torch.log(y_mag), torch.log(x_mag))
class STFTLoss(torch.nn.Module):
"""STFT loss module."""
def __init__(self, fft_size=1024, shift_size=120, win_length=600, window="hann_window"):
"""Initialize STFT loss module."""
super(STFTLoss, self).__init__()
self.fft_size = fft_size
self.shift_size = shift_size
self.win_length = win_length
self.window = getattr(torch, window)(win_length)
self.spectral_convergenge_loss = SpectralConvergengeLoss()
self.log_stft_magnitude_loss = LogSTFTMagnitudeLoss()
def forward(self, x, y):
"""Calculate forward propagation.
Args:
x (Tensor): Predicted signal (B, T).
y (Tensor): Groundtruth signal (B, T).
Returns:
Tensor: Spectral convergence loss value.
Tensor: Log STFT magnitude loss value.
"""
x_mag = stft(x, self.fft_size, self.shift_size, self.win_length, self.window.to(x.get_device()))
y_mag = stft(y, self.fft_size, self.shift_size, self.win_length, self.window.to(x.get_device()))
sc_loss = self.spectral_convergenge_loss(x_mag, y_mag)
mag_loss = self.log_stft_magnitude_loss(x_mag, y_mag)
return sc_loss, mag_loss
class MultiResolutionSTFTLoss(torch.nn.Module):
"""Multi resolution STFT loss module."""
def __init__(self,
fft_sizes=[1024, 2048, 512],
hop_sizes=[120, 240, 50],
win_lengths=[600, 1200, 240],
window="hann_window"):
"""Initialize Multi resolution STFT loss module.
Args:
fft_sizes (list): List of FFT sizes.
hop_sizes (list): List of hop sizes.
win_lengths (list): List of window lengths.
window (str): Window function type.
"""
super(MultiResolutionSTFTLoss, self).__init__()
assert len(fft_sizes) == len(hop_sizes) == len(win_lengths)
self.stft_losses = torch.nn.ModuleList()
for fs, ss, wl in zip(fft_sizes, hop_sizes, win_lengths):
self.stft_losses += [STFTLoss(fs, ss, wl, window)]
def forward(self, x, y):
"""Calculate forward propagation.
Args:
x (Tensor): Predicted signal (B, T).
y (Tensor): Groundtruth signal (B, T).
Returns:
Tensor: Multi resolution spectral convergence loss value.
Tensor: Multi resolution log STFT magnitude loss value.
"""
sc_loss = 0.0
mag_loss = 0.0
for f in self.stft_losses:
sc_l, mag_l = f(x, y)
sc_loss += sc_l
mag_loss += mag_l
sc_loss /= len(self.stft_losses)
mag_loss /= len(self.stft_losses)
return sc_loss, mag_loss

253
vocoder/fregan/train.py Normal file
View File

@ -0,0 +1,253 @@
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import itertools
import os
import time
import argparse
import json
import torch
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import DistributedSampler, DataLoader
import torch.multiprocessing as mp
from torch.distributed import init_process_group
from torch.nn.parallel import DistributedDataParallel
from vocoder.fregan.utils import AttrDict, build_env
from vocoder.fregan.meldataset import MelDataset, mel_spectrogram, get_dataset_filelist
from vocoder.fregan.generator import FreGAN
from vocoder.fregan.discriminator import ResWiseMultiPeriodDiscriminator, ResWiseMultiScaleDiscriminator
from vocoder.fregan.loss import feature_loss, generator_loss, discriminator_loss
from vocoder.fregan.utils import plot_spectrogram, scan_checkpoint, load_checkpoint, save_checkpoint
from vocoder.fregan.stft_loss import MultiResolutionSTFTLoss
torch.backends.cudnn.benchmark = True
def train(rank, a, h):
a.checkpoint_path = a.models_dir.joinpath(a.run_id+'_fregan')
a.checkpoint_path.mkdir(exist_ok=True)
a.training_epochs = 3100
a.stdout_interval = 5
a.checkpoint_interval = a.backup_every
a.summary_interval = 5000
a.validation_interval = 1000
a.fine_tuning = True
a.input_wavs_dir = a.syn_dir.joinpath("audio")
a.input_mels_dir = a.syn_dir.joinpath("mels")
if h.num_gpus > 1:
init_process_group(backend=h.dist_config['dist_backend'], init_method=h.dist_config['dist_url'],
world_size=h.dist_config['world_size'] * h.num_gpus, rank=rank)
torch.cuda.manual_seed(h.seed)
device = torch.device('cuda:{:d}'.format(rank))
generator = FreGAN(h).to(device)
mpd = ResWiseMultiPeriodDiscriminator().to(device)
msd = ResWiseMultiScaleDiscriminator().to(device)
if rank == 0:
print(generator)
os.makedirs(a.checkpoint_path, exist_ok=True)
print("checkpoints directory : ", a.checkpoint_path)
if os.path.isdir(a.checkpoint_path):
cp_g = scan_checkpoint(a.checkpoint_path, 'g_')
cp_do = scan_checkpoint(a.checkpoint_path, 'do_')
steps = 0
if cp_g is None or cp_do is None:
state_dict_do = None
last_epoch = -1
else:
state_dict_g = load_checkpoint(cp_g, device)
state_dict_do = load_checkpoint(cp_do, device)
generator.load_state_dict(state_dict_g['generator'])
mpd.load_state_dict(state_dict_do['mpd'])
msd.load_state_dict(state_dict_do['msd'])
steps = state_dict_do['steps'] + 1
last_epoch = state_dict_do['epoch']
if h.num_gpus > 1:
generator = DistributedDataParallel(generator, device_ids=[rank]).to(device)
mpd = DistributedDataParallel(mpd, device_ids=[rank]).to(device)
msd = DistributedDataParallel(msd, device_ids=[rank]).to(device)
optim_g = torch.optim.AdamW(generator.parameters(), h.learning_rate, betas=[h.adam_b1, h.adam_b2])
optim_d = torch.optim.AdamW(itertools.chain(msd.parameters(), mpd.parameters()),
h.learning_rate, betas=[h.adam_b1, h.adam_b2])
if state_dict_do is not None:
optim_g.load_state_dict(state_dict_do['optim_g'])
optim_d.load_state_dict(state_dict_do['optim_d'])
scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=h.lr_decay, last_epoch=last_epoch)
scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma=h.lr_decay, last_epoch=last_epoch)
training_filelist, validation_filelist = get_dataset_filelist(a)
trainset = MelDataset(training_filelist, h.segment_size, h.n_fft, h.num_mels,
h.hop_size, h.win_size, h.sampling_rate, h.fmin, h.fmax, n_cache_reuse=0,
shuffle=False if h.num_gpus > 1 else True, fmax_loss=h.fmax_for_loss, device=device,
fine_tuning=a.fine_tuning, base_mels_path=a.input_mels_dir)
train_sampler = DistributedSampler(trainset) if h.num_gpus > 1 else None
train_loader = DataLoader(trainset, num_workers=h.num_workers, shuffle=False,
sampler=train_sampler,
batch_size=h.batch_size,
pin_memory=True,
drop_last=True)
if rank == 0:
validset = MelDataset(validation_filelist, h.segment_size, h.n_fft, h.num_mels,
h.hop_size, h.win_size, h.sampling_rate, h.fmin, h.fmax, False, False, n_cache_reuse=0,
fmax_loss=h.fmax_for_loss, device=device, fine_tuning=a.fine_tuning,
base_mels_path=a.input_mels_dir)
validation_loader = DataLoader(validset, num_workers=1, shuffle=False,
sampler=None,
batch_size=1,
pin_memory=True,
drop_last=True)
sw = SummaryWriter(os.path.join(a.checkpoint_path, 'logs'))
generator.train()
mpd.train()
msd.train()
for epoch in range(max(0, last_epoch), a.training_epochs):
if rank == 0:
start = time.time()
print("Epoch: {}".format(epoch + 1))
if h.num_gpus > 1:
train_sampler.set_epoch(epoch)
for i, batch in enumerate(train_loader):
if rank == 0:
start_b = time.time()
x, y, _, y_mel = batch
x = torch.autograd.Variable(x.to(device, non_blocking=True))
y = torch.autograd.Variable(y.to(device, non_blocking=True))
y_mel = torch.autograd.Variable(y_mel.to(device, non_blocking=True))
y = y.unsqueeze(1)
y_g_hat = generator(x)
y_g_hat_mel = mel_spectrogram(y_g_hat.squeeze(1), h.n_fft, h.num_mels, h.sampling_rate, h.hop_size,
h.win_size,
h.fmin, h.fmax_for_loss)
optim_d.zero_grad()
# MPD
y_df_hat_r, y_df_hat_g, _, _ = mpd(y, y_g_hat.detach())
loss_disc_f, losses_disc_f_r, losses_disc_f_g = discriminator_loss(y_df_hat_r, y_df_hat_g)
# MSD
y_ds_hat_r, y_ds_hat_g, _, _ = msd(y, y_g_hat.detach())
loss_disc_s, losses_disc_s_r, losses_disc_s_g = discriminator_loss(y_ds_hat_r, y_ds_hat_g)
loss_disc_all = loss_disc_s + loss_disc_f
loss_disc_all.backward()
optim_d.step()
# Generator
optim_g.zero_grad()
# L1 Mel-Spectrogram Loss
loss_mel = F.l1_loss(y_mel, y_g_hat_mel) * 45
# sc_loss, mag_loss = stft_loss(y_g_hat[:, :, :y.size(2)].squeeze(1), y.squeeze(1))
# loss_mel = h.lambda_aux * (sc_loss + mag_loss) # STFT Loss
y_df_hat_r, y_df_hat_g, fmap_f_r, fmap_f_g = mpd(y, y_g_hat)
y_ds_hat_r, y_ds_hat_g, fmap_s_r, fmap_s_g = msd(y, y_g_hat)
loss_fm_f = feature_loss(fmap_f_r, fmap_f_g)
loss_fm_s = feature_loss(fmap_s_r, fmap_s_g)
loss_gen_f, losses_gen_f = generator_loss(y_df_hat_g)
loss_gen_s, losses_gen_s = generator_loss(y_ds_hat_g)
loss_gen_all = loss_gen_s + loss_gen_f + (2 * (loss_fm_s + loss_fm_f)) + loss_mel
loss_gen_all.backward()
optim_g.step()
if rank == 0:
# STDOUT logging
if steps % a.stdout_interval == 0:
with torch.no_grad():
mel_error = F.l1_loss(y_mel, y_g_hat_mel).item()
print('Steps : {:d}, Gen Loss Total : {:4.3f}, Mel-Spec. Error : {:4.3f}, s/b : {:4.3f}'.
format(steps, loss_gen_all, mel_error, time.time() - start_b))
# checkpointing
if steps % a.checkpoint_interval == 0 and steps != 0:
checkpoint_path = "{}/m_fregan_{:08d}".format(a.checkpoint_path, steps)
save_checkpoint(checkpoint_path,
{'generator': (generator.module if h.num_gpus > 1 else generator).state_dict()})
checkpoint_path = "{}/do_fregan_{:08d}".format(a.checkpoint_path, steps)
save_checkpoint(checkpoint_path,
{'mpd': (mpd.module if h.num_gpus > 1
else mpd).state_dict(),
'msd': (msd.module if h.num_gpus > 1
else msd).state_dict(),
'optim_g': optim_g.state_dict(), 'optim_d': optim_d.state_dict(), 'steps': steps,
'epoch': epoch})
# Tensorboard summary logging
if steps % a.summary_interval == 0:
sw.add_scalar("training/gen_loss_total", loss_gen_all, steps)
sw.add_scalar("training/mel_spec_error", mel_error, steps)
# Validation
if steps % a.validation_interval == 0: # and steps != 0:
generator.eval()
torch.cuda.empty_cache()
val_err_tot = 0
with torch.no_grad():
for j, batch in enumerate(validation_loader):
x, y, _, y_mel = batch
y_g_hat = generator(x.to(device))
y_mel = torch.autograd.Variable(y_mel.to(device, non_blocking=True))
y_g_hat_mel = mel_spectrogram(y_g_hat.squeeze(1), h.n_fft, h.num_mels, h.sampling_rate,
h.hop_size, h.win_size,
h.fmin, h.fmax_for_loss)
#val_err_tot += F.l1_loss(y_mel, y_g_hat_mel).item()
if j <= 4:
if steps == 0:
sw.add_audio('gt/y_{}'.format(j), y[0], steps, h.sampling_rate)
sw.add_figure('gt/y_spec_{}'.format(j), plot_spectrogram(x[0]), steps)
sw.add_audio('generated/y_hat_{}'.format(j), y_g_hat[0], steps, h.sampling_rate)
y_hat_spec = mel_spectrogram(y_g_hat.squeeze(1), h.n_fft, h.num_mels,
h.sampling_rate, h.hop_size, h.win_size,
h.fmin, h.fmax)
sw.add_figure('generated/y_hat_spec_{}'.format(j),
plot_spectrogram(y_hat_spec.squeeze(0).cpu().numpy()), steps)
val_err = val_err_tot / (j + 1)
sw.add_scalar("validation/mel_spec_error", val_err, steps)
generator.train()
steps += 1
scheduler_g.step()
scheduler_d.step()
if rank == 0:
print('Time taken for epoch {} is {} sec\n'.format(epoch + 1, int(time.time() - start)))

71
vocoder/fregan/utils.py Normal file
View File

@ -0,0 +1,71 @@
import glob
import os
import matplotlib
import torch
from torch.nn.utils import weight_norm
matplotlib.use("Agg")
import matplotlib.pylab as plt
import shutil
class AttrDict(dict):
def __init__(self, *args, **kwargs):
super(AttrDict, self).__init__(*args, **kwargs)
self.__dict__ = self
def build_env(config, config_name, path):
t_path = os.path.join(path, config_name)
if config != t_path:
os.makedirs(path, exist_ok=True)
shutil.copyfile(config, os.path.join(path, config_name))
def plot_spectrogram(spectrogram):
fig, ax = plt.subplots(figsize=(10, 2))
im = ax.imshow(spectrogram, aspect="auto", origin="lower",
interpolation='none')
plt.colorbar(im, ax=ax)
fig.canvas.draw()
plt.close()
return fig
def init_weights(m, mean=0.0, std=0.01):
classname = m.__class__.__name__
if classname.find("Conv") != -1:
m.weight.data.normal_(mean, std)
def apply_weight_norm(m):
classname = m.__class__.__name__
if classname.find("Conv") != -1:
weight_norm(m)
def get_padding(kernel_size, dilation=1):
return int((kernel_size*dilation - dilation)/2)
def load_checkpoint(filepath, device):
assert os.path.isfile(filepath)
print("Loading '{}'".format(filepath))
checkpoint_dict = torch.load(filepath, map_location=device)
print("Complete.")
return checkpoint_dict
def save_checkpoint(filepath, obj):
print("Saving checkpoint to {}".format(filepath))
torch.save(obj, filepath)
print("Complete.")
def scan_checkpoint(cp_dir, prefix):
pattern = os.path.join(cp_dir, prefix + '????????')
cp_list = glob.glob(pattern)
if len(cp_list) == 0:
return None
return sorted(cp_list)[-1]

View File

@ -27,5 +27,10 @@
"fmax": 7600, "fmax": 7600,
"fmax_for_loss": null, "fmax_for_loss": null,
"num_workers": 4 "num_workers": 4,
"dist_config": {
"dist_backend": "nccl",
"dist_url": "tcp://localhost:54321",
"world_size": 1
}
} }

View File

@ -52,6 +52,7 @@ def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin,
if torch.max(y) > 1.: if torch.max(y) > 1.:
print('max value is ', torch.max(y)) print('max value is ', torch.max(y))
global mel_basis, hann_window global mel_basis, hann_window
if fmax not in mel_basis: if fmax not in mel_basis:
mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax) mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax)

View File

@ -2,6 +2,7 @@ from utils.argutils import print_args
from vocoder.wavernn.train import train from vocoder.wavernn.train import train
from vocoder.hifigan.train import train as train_hifigan from vocoder.hifigan.train import train as train_hifigan
from vocoder.hifigan.env import AttrDict from vocoder.hifigan.env import AttrDict
from vocoder.fregan.train import train as train_fregan
from pathlib import Path from pathlib import Path
import argparse import argparse
import json import json
@ -61,11 +62,18 @@ if __name__ == "__main__":
# Process the arguments # Process the arguments
if args.vocoder_type == "wavernn": if args.vocoder_type == "wavernn":
# Run the training wavernn # Run the training wavernn
delattr(args,'vocoder_type')
delattr(args,'config')
train(**vars(args)) train(**vars(args))
elif args.vocoder_type == "hifigan": elif args.vocoder_type == "hifigan":
with open(args.config) as f: with open(args.config) as f:
json_config = json.load(f) json_config = json.load(f)
h = AttrDict(json_config) h = AttrDict(json_config)
train_hifigan(0, args, h) train_hifigan(0, args, h)
elif args.vocoder_type == "fregan":
with open('vocoder/fregan/config.json') as f:
json_config = json.load(f)
h = AttrDict(json_config)
train_fregan(0, args, h)