mirror of
https://github.com/babysor/MockingBird.git
synced 2024-03-22 13:11:31 +08:00
from encoder.params_model import model_embedding_size as speaker_embedding_size
from utils.argutils import print_args
from utils.modelutils import check_model_paths
from synthesizer.inference import Synthesizer
from encoder import inference as encoder
from vocoder.wavernn import inference as rnn_vocoder
from vocoder.hifigan import inference as gan_vocoder
from pathlib import Path
import numpy as np
import soundfile as sf
import librosa
import argparse
import torch
import sys
import os
import re
import cn2an
import glob

from audioread.exceptions import NoBackendError

vocoder = gan_vocoder
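# HiFi-GAN is the vocoder used below. If you would rather try the WaveRNN vocoder
# imported above as rnn_vocoder, this untested sketch shows the idea: rebind the
# module-level name and pass a matching WaveRNN checkpoint to generate_wav() at the
# bottom of the file.
# vocoder = rnn_vocoder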


def gen_one_wav(synthesizer, in_fpath, embed, texts, file_name, seq):
    embeds = [embed] * len(texts)
    # If you know what the attention layer alignments are, you can retrieve them here by
    # passing return_alignments=True
    specs = synthesizer.synthesize_spectrograms(texts, embeds, style_idx=-1, min_stop_token=4, steps=400)
    breaks = [spec.shape[1] for spec in specs]
    spec = np.concatenate(specs, axis=1)
    # Synthesizing the waveform is fairly straightforward. Remember that the longer the
    # spectrogram, the more time-efficient the vocoder.
    generated_wav, output_sample_rate = vocoder.infer_waveform(spec)

    # Add breaks: cut the vocoded waveform back into one chunk per input text and
    # insert 0.15 s of silence between consecutive chunks.
    b_ends = np.cumsum(np.array(breaks) * synthesizer.hparams.hop_size)
    b_starts = np.concatenate(([0], b_ends[:-1]))
    wavs = [generated_wav[start:end] for start, end in zip(b_starts, b_ends)]
    breaks = [np.zeros(int(0.15 * synthesizer.sample_rate))] * len(breaks)
    generated_wav = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])
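    # Illustration only (assumed values): if hop_size were 200 samples and the
    # spectrograms had 400 and 300 frames, breaks would be [400, 300],
    # b_ends [80000, 140000] and b_starts [0, 80000], giving
    # chunk_1 + 0.15 s silence + chunk_2 + 0.15 s silence.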

    ## Post-generation
    # Trim excess silences to compensate for gaps in spectrograms (issue #53)
    generated_wav = encoder.preprocess_wav(generated_wav)
    # Normalize the peak amplitude to 0.97 to avoid clipping
    generated_wav = generated_wav / np.abs(generated_wav).max() * 0.97

    # Save it on the disk
    model = os.path.basename(in_fpath)
    filename = "%s_%d_%s.wav" % (file_name, seq, model)
    sf.write(filename, generated_wav, synthesizer.sample_rate)

    print("\nSaved output as %s\n\n" % filename)


def generate_wav(enc_model_fpath, syn_model_fpath, voc_model_fpath, in_fpath, input_txt, file_name):
    if torch.cuda.is_available():
        device_id = torch.cuda.current_device()
        gpu_properties = torch.cuda.get_device_properties(device_id)
        ## Print some environment information (for debugging purposes)
        print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
              "%.1fGb total memory.\n" %
              (torch.cuda.device_count(),
               device_id,
               gpu_properties.name,
               gpu_properties.major,
               gpu_properties.minor,
               gpu_properties.total_memory / 1e9))
    else:
        print("Using CPU for inference.\n")

    print("Preparing the encoder, the synthesizer and the vocoder...")
    encoder.load_model(enc_model_fpath)
    synthesizer = Synthesizer(syn_model_fpath)
    vocoder.load_model(voc_model_fpath)

    # Compute the speaker embedding from the reference utterance
    encoder_wav = synthesizer.load_preprocess_wav(in_fpath)
    embed, partial_embeds, _ = encoder.embed_utterance(encoder_wav, return_partials=True)

    texts = input_txt.split("\n")
    seq = 0
    each_num = 1500  # maximum number of characters synthesized into a single output wav

    punctuation = '!,。、,'  # split the text on these punctuation marks
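    # Illustration only: with the punctuation set above, a line such as
    # "你好,世界。今天天气不错" is split into ["你好", "世界", "今天天气不错"],
    # and each piece becomes one entry of processed_texts below.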
    processed_texts = []
    cur_num = 0
    for text in texts:
        for processed_text in re.sub(r'[{}]+'.format(punctuation), '\n', text).split('\n'):
            if processed_text:
                processed_texts.append(processed_text.strip())
                cur_num += len(processed_text.strip())
        # Once enough text has accumulated, synthesize it as one wav and start a new batch
        if cur_num > each_num:
            seq = seq + 1
            gen_one_wav(synthesizer, in_fpath, embed, processed_texts, file_name, seq)
            processed_texts = []
            cur_num = 0

    # Synthesize whatever is left over
    if len(processed_texts) > 0:
        seq = seq + 1
        gen_one_wav(synthesizer, in_fpath, embed, processed_texts, file_name, seq)

if len(sys.argv) >= 3:
    my_txt = ""
    print("Reading from:", sys.argv[1])
    with open(sys.argv[1], "r") as f:
        for line in f.readlines():
            my_txt += line
    txt_file_name = sys.argv[1]
    wav_file_name = sys.argv[2]

    # Rewrite Arabic numerals as Chinese numerals so they are read out correctly
    output = cn2an.transform(my_txt, "an2cn")
    print(output)
    generate_wav(
        Path("encoder/saved_models/pretrained.pt"),
        Path("synthesizer/saved_models/mandarin.pt"),
        Path("vocoder/saved_models/pretrained/g_hifigan.pt"),
        wav_file_name, output, txt_file_name
    )
else:
    print("Usage: %s <text_file> <reference_wav>" % sys.argv[0])
    sys.exit(1)
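
# Usage sketch (assuming this file is saved as gen_voice.py at the repository root,
# next to the encoder/, synthesizer/ and vocoder/ packages, with the pretrained
# checkpoints referenced above already in place):
#
#   python gen_voice.py input.txt reference_speaker.wav
#
# The script reads input.txt, converts Arabic numerals to Chinese numerals with
# cn2an (e.g. "100" is read as "一百"), clones the voice of reference_speaker.wav,
# and writes files named "input.txt_<seq>_reference_speaker.wav", one per text chunk.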