From 7317ba5ffe22d488a4a96707143918ceb6ab5299 Mon Sep 17 00:00:00 2001
From: Evers
Date: Sun, 22 May 2022 16:28:58 +0800
Subject: [PATCH] add gen_voice.py to run synthesis from the command line
 instead of the demo_toolbox GUI (#560)

---
 README.md    |   5 ++
 gen_voice.py | 116 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 121 insertions(+)
 create mode 100644 gen_voice.py

diff --git a/README.md b/README.md
index 1538255..c1a85d2 100644
--- a/README.md
+++ b/README.md
@@ -90,6 +90,11 @@ You can then try to run:`python web.py` and open it in browser, default as `http://localhost:8080`
 
 You can then try the toolbox:
 `python demo_toolbox.py -d <datasets_root>`
 
+#### 3.3 Using the command line
+You can then try the command:
+`python gen_voice.py your_text_file.txt your_reference_wav.wav`
+You need to install cn2an (`pip install cn2an`); it converts Arabic numerals in the text to Chinese numerals so they are read out correctly.
+
 ## Reference
 > This repository is forked from [Real-Time-Voice-Cloning](https://github.com/CorentinJ/Real-Time-Voice-Cloning) which only support English.
diff --git a/gen_voice.py b/gen_voice.py
new file mode 100644
index 0000000..3be4159
--- /dev/null
+++ b/gen_voice.py
@@ -0,0 +1,116 @@
+from synthesizer.inference import Synthesizer
+from encoder import inference as encoder
+from vocoder.hifigan import inference as gan_vocoder
+from pathlib import Path
+import numpy as np
+import soundfile as sf
+import torch
+import sys
+import os
+import re
+import cn2an
+
+vocoder = gan_vocoder
+
+
+def gen_one_wav(synthesizer, in_fpath, embed, texts, file_name, seq):
+    embeds = [embed] * len(texts)
+    # If you know what the attention layer alignments are, you can retrieve them
+    # here by passing return_alignments=True
+    specs = synthesizer.synthesize_spectrograms(texts, embeds, style_idx=-1, min_stop_token=4, steps=400)
+    breaks = [spec.shape[1] for spec in specs]
+    spec = np.concatenate(specs, axis=1)
+
+    # Synthesizing the waveform is fairly straightforward. Remember that the longer
+    # the spectrogram, the more time-efficient the vocoder.
+    generated_wav, output_sample_rate = vocoder.infer_waveform(spec)
+
+    # Re-insert 0.15 s of silence between the individual sentences
+    b_ends = np.cumsum(np.array(breaks) * synthesizer.hparams.hop_size)
+    b_starts = np.concatenate(([0], b_ends[:-1]))
+    wavs = [generated_wav[start:end] for start, end in zip(b_starts, b_ends)]
+    breaks = [np.zeros(int(0.15 * synthesizer.sample_rate))] * len(breaks)
+    generated_wav = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])
+
+    ## Post-generation
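+    # preprocess_wav re-applies the encoder's silence trimming and volume
+    # normalization; the 0.97 peak scaling leaves headroom against clipping.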
+    # Trim excess silences to compensate for gaps in spectrograms (issue #53)
+    generated_wav = encoder.preprocess_wav(generated_wav)
+    generated_wav = generated_wav / np.abs(generated_wav).max() * 0.97
+
+    # Save it on the disk
+    ref_name = os.path.basename(in_fpath)
+    filename = "%s_%d_%s.wav" % (file_name, seq, ref_name)
+    sf.write(filename, generated_wav, synthesizer.sample_rate)
+
+    print("\nSaved output as %s\n\n" % filename)
+
+
+def generate_wav(enc_model_fpath, syn_model_fpath, voc_model_fpath, in_fpath, input_txt, file_name):
+    if torch.cuda.is_available():
+        device_id = torch.cuda.current_device()
+        gpu_properties = torch.cuda.get_device_properties(device_id)
+        # Print some environment information (for debugging purposes)
+        print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
+              "%.1fGb total memory.\n" %
+              (torch.cuda.device_count(),
+               device_id,
+               gpu_properties.name,
+               gpu_properties.major,
+               gpu_properties.minor,
+               gpu_properties.total_memory / 1e9))
+    else:
+        print("Using CPU for inference.\n")
+
+    print("Preparing the encoder, the synthesizer and the vocoder...")
+    encoder.load_model(enc_model_fpath)
+    synthesizer = Synthesizer(syn_model_fpath)
+    vocoder.load_model(voc_model_fpath)
+
+    encoder_wav = synthesizer.load_preprocess_wav(in_fpath)
+    embed, partial_embeds, _ = encoder.embed_utterance(encoder_wav, return_partials=True)
+
+    texts = input_txt.split("\n")
+    seq = 0
+    each_num = 1500  # synthesize a wav roughly every 1500 characters
+
+    punctuation = '!,。、,'  # split and clean the text on these marks
+    processed_texts = []
+    cur_num = 0
+    for text in texts:
+        for processed_text in re.sub(r'[{}]+'.format(punctuation), '\n', text).split('\n'):
+            if processed_text:
+                processed_texts.append(processed_text.strip())
+                cur_num += len(processed_text.strip())
+                if cur_num > each_num:
+                    seq += 1
+                    gen_one_wav(synthesizer, in_fpath, embed, processed_texts, file_name, seq)
+                    processed_texts = []
+                    cur_num = 0
+
+    if len(processed_texts) > 0:
+        seq += 1
+        gen_one_wav(synthesizer, in_fpath, embed, processed_texts, file_name, seq)
+
+
+if __name__ == "__main__":
+    if len(sys.argv) >= 3:
+        print("reading from:", sys.argv[1])
+        with open(sys.argv[1], "r", encoding="utf-8") as f:
+            my_txt = f.read()
+        txt_file_name = sys.argv[1]
+        wav_file_name = sys.argv[2]
+
+        # Convert Arabic numerals to Chinese numerals so they are read out correctly
+        output = cn2an.transform(my_txt, "an2cn")
+        print(output)
+        generate_wav(
+            Path("encoder/saved_models/pretrained.pt"),
+            Path("synthesizer/saved_models/mandarin.pt"),
+            Path("vocoder/saved_models/pretrained/g_hifigan.pt"),
+            wav_file_name, output, txt_file_name
+        )
+    else:
+        print("Usage: python gen_voice.py <text_file> <reference_wav>")
+        sys.exit(1)
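
Note: gen_voice.py runs its argument handling only under `if __name__ == "__main__":`, so generate_wav() can also be imported and driven from other Python code. A minimal sketch of such a call; the reference wav path, sample text, and output prefix below are placeholders, not part of this patch:

    from pathlib import Path
    import cn2an
    from gen_voice import generate_wav

    # Convert Arabic numerals to Chinese numerals first, as the CLI path does
    text = cn2an.transform("今天是2022年5月22日。", "an2cn")

    # Model paths are the defaults hard-coded in gen_voice.py
    generate_wav(
        Path("encoder/saved_models/pretrained.pt"),
        Path("synthesizer/saved_models/mandarin.pt"),
        Path("vocoder/saved_models/pretrained/g_hifigan.pt"),
        "reference_speaker.wav",  # placeholder: a short clip of the voice to clone
        text,                     # the digit-normalized text to synthesize
        "my_output",              # placeholder prefix for the generated wav files
    )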