Add gen_voice.py to generate voice from the Python command line instead of the demo_toolbox GUI. (#560)

2024-03-22 13:11:31 +08:00 · 2022-05-22 16:28:58 +08:00 · 2022-05-22 16:28:58 +08:00 · 7317ba5ffe
commit 7317ba5ffe
parent 05f886162c
2 changed files with 133 additions and 0 deletions
--- a/README.md
+++ b/README.md
@ -90,6 +90,11 @@ You can then try to run:`python web.py` and open it in browser, default as `http
 You can then try the toolbox:
 `python demo_toolbox.py -d <datasets_root>`

+#### 3.3 Using the command line
+You can then try the command:
+`python gen_voice.py <text_file.txt> your_wav_file.wav`
+This script requires cn2an (install it with `pip install cn2an`), which converts Arabic numerals in the text to Chinese numerals for better results on numbers.
+
 ## Reference
 > This repository is forked from [Real-Time-Voice-Cloning](https://github.com/CorentinJ/Real-Time-Voice-Cloning) which only support English.

--- a/gen_voice.py
+++ b/gen_voice.py
@ -0,0 +1,128 @@
+from encoder.params_model import model_embedding_size as speaker_embedding_size
+from utils.argutils import print_args
+from utils.modelutils import check_model_paths
+from synthesizer.inference import Synthesizer
+from encoder import inference as encoder
+from vocoder.wavernn import inference as rnn_vocoder
+from vocoder.hifigan import inference as gan_vocoder
+from pathlib import Path
+import numpy as np
+import soundfile as sf
+import librosa
+import argparse
+import torch
+import sys
+import os
+import re
+import cn2an
+import glob
+
+from audioread.exceptions import NoBackendError
+vocoder = gan_vocoder  # Vocoder backend used below for load_model/infer_waveform: the HiFi-GAN inference module.
+
+def gen_one_wav(synthesizer, in_fpath, embed, texts, file_name, seq):
+    embeds = [embed] * len(texts)
+    # If you know what the attention layer alignments are, you can retrieve them here by
+    # passing return_alignments=True
+    specs = synthesizer.synthesize_spectrograms(texts, embeds, style_idx=-1, min_stop_token=4, steps=400)
+    #spec = specs[0]
+    breaks = [spec.shape[1] for spec in specs]
+    spec = np.concatenate(specs, axis=1)
+
+    # If seed is specified, reset torch seed and reload vocoder
+    # Synthesizing the waveform is fairly straightforward. Remember that the longer the
+    # spectrogram, the more time-efficient the vocoder.
+    generated_wav, output_sample_rate = vocoder.infer_waveform(spec)
+    
+    # Add breaks
+    b_ends = np.cumsum(np.array(breaks) * synthesizer.hparams.hop_size)
+    b_starts = np.concatenate(([0], b_ends[:-1]))
+    wavs = [generated_wav[start:end] for start, end, in zip(b_starts, b_ends)]
+    breaks = [np.zeros(int(0.15 * synthesizer.sample_rate))] * len(breaks)
+    generated_wav = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])
+    
+    ## Post-generation
+    # There's a bug with sounddevice that makes the audio cut one second earlier, so we
+    # pad it.
+
+    # Trim excess silences to compensate for gaps in spectrograms (issue #53)
+    generated_wav = encoder.preprocess_wav(generated_wav)
+    generated_wav = generated_wav / np.abs(generated_wav).max() * 0.97
+        
+    # Save it on the disk
+    model=os.path.basename(in_fpath)
+    filename = "%s_%d_%s.wav" %(file_name, seq, model)
+    sf.write(filename, generated_wav, synthesizer.sample_rate)
+
+    print("\nSaved output as %s\n\n" % filename)
+    
+    
+def generate_wav(enc_model_fpath, syn_model_fpath, voc_model_fpath, in_fpath, input_txt, file_name): 
+    if torch.cuda.is_available():
+        device_id = torch.cuda.current_device()
+        gpu_properties = torch.cuda.get_device_properties(device_id)
+        ## Print some environment information (for debugging purposes)
+        print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
+            "%.1fGb total memory.\n" % 
+            (torch.cuda.device_count(),
+            device_id,
+            gpu_properties.name,
+            gpu_properties.major,
+            gpu_properties.minor,
+            gpu_properties.total_memory / 1e9))
+    else:
+        print("Using CPU for inference.\n")
+
+    print("Preparing the encoder, the synthesizer and the vocoder...")
+    encoder.load_model(enc_model_fpath)
+    synthesizer = Synthesizer(syn_model_fpath)
+    vocoder.load_model(voc_model_fpath)
+
+    encoder_wav = synthesizer.load_preprocess_wav(in_fpath)
+    embed, partial_embeds, _ = encoder.embed_utterance(encoder_wav, return_partials=True)
+
+    texts = input_txt.split("\n")
+    seq=0
+    each_num=1500
+    
+    punctuation = '！，。、,' # punctuate and split/clean text
+    processed_texts = []
+    cur_num = 0
+    for text in texts:
+      for processed_text in re.sub(r'[{}]+'.format(punctuation), '\n', text).split('\n'):
+        if processed_text:
+            processed_texts.append(processed_text.strip())
+            cur_num += len(processed_text.strip())
+      if cur_num > each_num:
+        seq = seq +1
+        gen_one_wav(synthesizer, in_fpath, embed, processed_texts, file_name, seq)
+        processed_texts = []
+        cur_num = 0
+    
+    if len(processed_texts)>0:
+      seq = seq +1
+      gen_one_wav(synthesizer, in_fpath, embed, processed_texts, file_name, seq)
+
+if (len(sys.argv)>=3):
+    my_txt = ""
+    print("reading from :", sys.argv[1])
+    with open(sys.argv[1], "r") as f:
+        for line in f.readlines():
+            #line = line.strip('\n')
+            my_txt += line
+    txt_file_name = sys.argv[1]
+    wav_file_name = sys.argv[2]
+
+    output = cn2an.transform(my_txt, "an2cn")
+    print(output)
+    generate_wav(
+    Path("encoder/saved_models/pretrained.pt"),
+    Path("synthesizer/saved_models/mandarin.pt"),
+    Path("vocoder/saved_models/pretrained/g_hifigan.pt"), wav_file_name, output, txt_file_name
+    )
+
+else:
+    print("please input the file name")
+    exit(1)
+
+