diff --git a/README-CN.md b/README-CN.md
index d679aa3..263c9f7 100644
--- a/README-CN.md
+++ b/README-CN.md
@@ -45,6 +45,8 @@
 然后您可以尝试使用工具箱:
 `python demo_toolbox.py -d <datasets_root>`
 
+> Good news🤩: 可直接使用中文
+
 ## TODO
 - [X] 允许直接使用中文
 - [X] 添加演示视频
diff --git a/README.md b/README.md
index 3d15b54..37b22e9 100644
--- a/README.md
+++ b/README.md
@@ -51,6 +51,8 @@ You can then try the toolbox:
 or
 `python demo_toolbox.py`
 
+> Good news🤩: Chinese Characters are supported
+
 ## TODO
 - [x] Add demo video
 - [X] Add support for more dataset
diff --git a/synthesizer/inference.py b/synthesizer/inference.py
index af7bf08..07cf881 100644
--- a/synthesizer/inference.py
+++ b/synthesizer/inference.py
@@ -9,7 +9,7 @@ from pathlib import Path
 from typing import Union, List
 import numpy as np
 import librosa
-
+from pypinyin import lazy_pinyin, Style
 
 class Synthesizer:
     sample_rate = hparams.sample_rate
@@ -91,6 +91,10 @@ class Synthesizer:
             simple_table([("Tacotron", str(tts_k) + "k"),
                           ("r", self._model.r)])
 
+        # convert chinese char to pinyin
+        list_of_pinyin = lazy_pinyin(texts, style=Style.TONE3)
+        texts = [" ".join([v for v in list_of_pinyin if v.strip()])]
+
         # Preprocess text inputs
         inputs = [text_to_sequence(text.strip(), hparams.tts_cleaner_names) for text in texts]
         if not isinstance(embeddings, list):
diff --git a/toolbox/ui.py b/toolbox/ui.py
index d56b574..6ae6a7e 100644
--- a/toolbox/ui.py
+++ b/toolbox/ui.py
@@ -36,17 +36,8 @@ colormap = np.array([
 ], dtype=np.float) / 255
 
 default_text = \
-    "Welcome to the toolbox! To begin, load an utterance from your datasets or record one " \
-    "yourself.\nOnce its embedding has been created, you can synthesize any text written here.\n" \
-    "The synthesizer expects to generate " \
-    "outputs that are somewhere between 5 and 12 seconds.\nTo mark breaks, write a new line. " \
-    "Each line will be treated separately.\nThen, they are joined together to make the final " \
-    "spectrogram. Use the vocoder to generate audio.\nThe vocoder generates almost in constant " \
-    "time, so it will be more time efficient for longer inputs like this one.\nOn the left you " \
-    "have the embedding projections. Load or record more utterances to see them.\nIf you have " \
-    "at least 2 or 3 utterances from a same speaker, a cluster should form.\nSynthesized " \
-    "utterances are of the same color as the speaker whose voice was used, but they're " \
-    "represented with a cross."
+    "欢迎使用工具箱, 现已支持中文输入!"
+
 
 
 class UI(QDialog):
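
For context, the key change above routes input text through `pypinyin` before it reaches `text_to_sequence`: `lazy_pinyin(..., style=Style.TONE3)` converts Chinese characters into pinyin syllables with trailing tone numbers, and the hunk then joins them into a single space-separated string. Below is a minimal, illustrative sketch of that step as a standalone function; the helper name `to_pinyin_text` and the sample sentence are not part of the patch.

```python
# Illustrative sketch of the preprocessing added in synthesizer/inference.py above.
# The helper name and the example sentence are made up for demonstration only.
from pypinyin import lazy_pinyin, Style

def to_pinyin_text(texts):
    """Turn a list of (possibly Chinese) strings into one space-separated pinyin string."""
    # Style.TONE3 appends the tone number to each syllable, e.g. "中文" -> ["zhong1", "wen2"].
    list_of_pinyin = lazy_pinyin(texts, style=Style.TONE3)
    # Drop empty items and merge everything into a single text for the synthesizer.
    return [" ".join(v for v in list_of_pinyin if v.strip())]

print(to_pinyin_text(["欢迎使用工具箱"]))
# Expected output (approximately): ['huan1 ying2 shi3 yong4 gong1 ju4 xiang1']
```

One apparent side effect of this approach: because every pinyin token is joined into a single string, multiple input texts (or lines) passed to the synthesizer appear to be merged into one utterance on this code path.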