from models.synthesizer.inference import Synthesizer
from models.encoder import inference as encoder
from models.vocoder.hifigan import inference as gan_vocoder
from pathlib import Path
import numpy as np
import soundfile as sf
import torch
import sys
import os
import re
import cn2an

vocoder = gan_vocoder


def gen_one_wav(synthesizer, in_fpath, embed, texts, file_name, seq):
    # Use the same speaker embedding for every text chunk.
    embeds = [embed] * len(texts)

    # If you know what the attention layer alignments are, you can retrieve them here by
    # passing return_alignments=True.
    specs = synthesizer.synthesize_spectrograms(texts, embeds, style_idx=-1,
                                                min_stop_token=4, steps=400)
    breaks = [spec.shape[1] for spec in specs]
    spec = np.concatenate(specs, axis=1)

    # Synthesize the waveform for the concatenated spectrogram. The longer the
    # spectrogram, the more time-efficient the vocoder is.
    generated_wav, output_sample_rate = vocoder.infer_waveform(spec)

    # Re-insert short pauses between the chunks that were concatenated above.
    b_ends = np.cumsum(np.array(breaks) * synthesizer.hparams.hop_size)
    b_starts = np.concatenate(([0], b_ends[:-1]))
    wavs = [generated_wav[start:end] for start, end in zip(b_starts, b_ends)]
    breaks = [np.zeros(int(0.15 * synthesizer.sample_rate))] * len(breaks)
    generated_wav = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])

    ## Post-generation
    # Trim excess silences to compensate for gaps in spectrograms (issue #53),
    # then normalize the peak level.
    generated_wav = encoder.preprocess_wav(generated_wav)
    generated_wav = generated_wav / np.abs(generated_wav).max() * 0.97

    # Save it on the disk.
    model = os.path.basename(in_fpath)
    filename = "%s_%d_%s.wav" % (file_name, seq, model)
    sf.write(filename, generated_wav, synthesizer.sample_rate)
    print("\nSaved output as %s\n\n" % filename)


def generate_wav(enc_model_fpath, syn_model_fpath, voc_model_fpath,
                 in_fpath, input_txt, file_name):
    if torch.cuda.is_available():
        device_id = torch.cuda.current_device()
        gpu_properties = torch.cuda.get_device_properties(device_id)
        ## Print some environment information (for debugging purposes)
        print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
              "%.1fGb total memory.\n" %
              (torch.cuda.device_count(),
               device_id,
               gpu_properties.name,
               gpu_properties.major,
               gpu_properties.minor,
               gpu_properties.total_memory / 1e9))
    else:
        print("Using CPU for inference.\n")

    print("Preparing the encoder, the synthesizer and the vocoder...")
    encoder.load_model(enc_model_fpath)
    synthesizer = Synthesizer(syn_model_fpath)
    vocoder.load_model(voc_model_fpath)

    # Embed the reference utterance with the speaker encoder.
    encoder_wav = synthesizer.load_preprocess_wav(in_fpath)
    embed, partial_embeds, _ = encoder.embed_utterance(encoder_wav, return_partials=True)

    texts = input_txt.split("\n")
    seq = 0
    each_num = 1500  # maximum number of characters per generated wav file

    # Split the text on punctuation and batch the pieces into chunks of at most
    # each_num characters; each chunk is rendered to its own wav file.
    punctuation = '!,。、,'
    processed_texts = []
    cur_num = 0
    for text in texts:
        for processed_text in re.sub(r'[{}]+'.format(punctuation), '\n', text).split('\n'):
            if processed_text:
                processed_texts.append(processed_text.strip())
                cur_num += len(processed_text.strip())
                if cur_num > each_num:
                    seq = seq + 1
                    gen_one_wav(synthesizer, in_fpath, embed, processed_texts, file_name, seq)
                    processed_texts = []
                    cur_num = 0

    if len(processed_texts) > 0:
        seq = seq + 1
        gen_one_wav(synthesizer, in_fpath, embed, processed_texts, file_name, seq)


if len(sys.argv) >= 3:
    my_txt = ""
    print("reading from :", sys.argv[1])
    with open(sys.argv[1], "r") as f:
        for line in f.readlines():
            my_txt += line

    txt_file_name = sys.argv[1]
    wav_file_name = sys.argv[2]

    # Convert Arabic numerals in the text to Chinese numerals before synthesis.
    output = cn2an.transform(my_txt, "an2cn")
    print(output)
    generate_wav(
        Path("encoder/saved_models/pretrained.pt"),
        Path("synthesizer/saved_models/mandarin.pt"),
        Path("vocoder/saved_models/pretrained/g_hifigan.pt"),
        wav_file_name, output, txt_file_name
    )
else:
    print("please provide an input text file and a reference wav file")
    sys.exit(1)
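# Usage sketch. The script name "gen_voice.py" and the example file names below are
# assumptions, not part of the original; only the argument order (text file first,
# reference wav second) follows the argv handling above.
#
#   python gen_voice.py input.txt reference.wav
#
# This reads input.txt, converts Arabic numerals to Chinese numerals with cn2an,
# clones the voice of reference.wav, and writes one or more files named
# "<text file>_<seq>_<reference wav basename>.wav" in the current directory.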