diff --git a/.vscode/launch.json b/.vscode/launch.json
index f821057..7e69fb8 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -1,38 +1,39 @@
 {
-    // Use IntelliSense to learn about possible attributes.
-    // Hover to view descriptions of existing attributes.
-    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
-    "version": "0.2.0",
-    "configurations": [
-        {
-            "name": "Python: Vocoder Preprocess",
-            "type": "python",
-            "request": "launch",
-            "program": "vocoder_preprocess.py",
-            "console": "integratedTerminal",
-            "args": [
-                "..\\..\\chs1"
-            ],
-        },
-        {
-            "name": "Python: Vocoder Train",
-            "type": "python",
-            "request": "launch",
-            "program": "vocoder_train.py",
-            "console": "integratedTerminal",
-            "args": [
-                "dev", "..\\..\\chs1"
-            ],
-        },
-        {
-            "name": "Python: demo box",
-            "type": "python",
-            "request": "launch",
-            "program": "demo_toolbox.py",
-            "console": "integratedTerminal",
-            "args": [
-                "-d", "..\\..\\chs"
-            ],
-        }
-    ]
-}
\ No newline at end of file
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Python: Web",
+            "type": "python",
+            "request": "launch",
+            "program": "${workspaceFolder}/web/app.py",
+            "console": "integratedTerminal"
+        },
+        {
+            "name": "Python: Vocoder Preprocess",
+            "type": "python",
+            "request": "launch",
+            "program": "vocoder_preprocess.py",
+            "console": "integratedTerminal",
+            "args": ["..\\..\\chs1"]
+        },
+        {
+            "name": "Python: Vocoder Train",
+            "type": "python",
+            "request": "launch",
+            "program": "vocoder_train.py",
+            "console": "integratedTerminal",
+            "args": ["dev", "..\\..\\chs1"]
+        },
+        {
+            "name": "Python: demo box",
+            "type": "python",
+            "request": "launch",
+            "program": "demo_toolbox.py",
+            "console": "integratedTerminal",
+            "args": ["-d", "..\\..\\chs"]
+        }
+    ]
+}
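Note: the new "Python: Web" configuration launches `${workspaceFolder}/web/app.py`, which is not part of this diff. As a rough sketch of the kind of entry point that configuration expects (Flask, the single route, and the port are illustrative assumptions, not confirmed by this change):

```
# Hypothetical web/app.py that the "Python: Web" launch configuration
# points at. Flask, the route, and the port are assumptions for
# illustration only; the actual file may differ.
from flask import Flask

app = Flask(__name__)

@app.route("/")
def index():
    # Placeholder response; a real UI would expose the toolbox workflow.
    return "Voice cloning web UI"

if __name__ == "__main__":
    # The launch configuration runs this file directly, so the server
    # start is guarded behind __main__ and kept on localhost for development.
    app.run(host="127.0.0.1", port=8080, debug=True)
```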
diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..de288e1
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,3 @@
+{
+    "python.formatting.provider": "black"
+}
\ No newline at end of file
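Note: this setting makes VS Code format Python files with black, which must be installed in the active environment (`pip install black`) for it to take effect. A quick illustration of what the formatter does (the example function is made up):

```
# Before black (illustrative): irregular spacing, single quotes.
def load(path,verbose = False):
    return open( path ,'r').read()

# After black: normalized spacing and double quotes (88-column default).
def load(path, verbose=False):
    return open(path, "r").read()
```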
diff --git a/samples/1320_00000.mp3 b/samples/1320_00000.mp3
deleted file mode 100644
index f0791b0..0000000
Binary files a/samples/1320_00000.mp3 and /dev/null differ
diff --git a/samples/3575_00000.mp3 b/samples/3575_00000.mp3
deleted file mode 100644
index 545d784..0000000
Binary files a/samples/3575_00000.mp3 and /dev/null differ
diff --git a/samples/6829_00000.mp3 b/samples/6829_00000.mp3
deleted file mode 100644
index 34f0382..0000000
Binary files a/samples/6829_00000.mp3 and /dev/null differ
diff --git a/samples/8230_00000.mp3 b/samples/8230_00000.mp3
deleted file mode 100644
index b7c5620..0000000
Binary files a/samples/8230_00000.mp3 and /dev/null differ
diff --git a/samples/README.md b/samples/README.md
deleted file mode 100644
index 1a392d8..0000000
--- a/samples/README.md
+++ /dev/null
@@ -1,22 +0,0 @@
-The audio files in this folder are provided for toolbox testing and
-benchmarking purposes. These are the same reference utterances
-used by the SV2TTS authors to generate the audio samples located at:
-https://google.github.io/tacotron/publications/speaker_adaptation/index.html
-
-The `p240_00000.mp3` and `p260_00000.mp3` files are compressed
-versions of audios from the VCTK corpus available at:
-https://datashare.is.ed.ac.uk/handle/10283/3443
-VCTK.txt contains the copyright notices and licensing information.
-
-The `1320_00000.mp3`, `3575_00000.mp3`, `6829_00000.mp3`
-and `8230_00000.mp3` files are compressed versions of audios
-from the LibriSpeech dataset available at: https://openslr.org/12
-For these files, the following notice applies:
-```
-LibriSpeech (c) 2014 by Vassil Panayotov
-
-LibriSpeech ASR corpus is licensed under a
-Creative Commons Attribution 4.0 International License.
-
-See <http://creativecommons.org/licenses/by/4.0/>.
-```
diff --git a/samples/VCTK.txt b/samples/VCTK.txt
deleted file mode 100644
index b51455a..0000000
--- a/samples/VCTK.txt
+++ /dev/null
@@ -1,94 +0,0 @@
----------------------------------------------------------------------
-  CSTR VCTK Corpus
-  English Multi-speaker Corpus for CSTR Voice Cloning Toolkit
-
-  (Version 0.92)
-  RELEASE September 2019
-  The Centre for Speech Technology Research
-  University of Edinburgh
-  Copyright (c) 2019
-
-  Junichi Yamagishi
-  jyamagis@inf.ed.ac.uk
----------------------------------------------------------------------
-
-Overview
-
-This CSTR VCTK Corpus includes speech data uttered by 110 English
-speakers with various accents. Each speaker reads out about 400
-sentences, which were selected from a newspaper, the rainbow passage
-and an elicitation paragraph used for the speech accent archive.
-
-The newspaper texts were taken from Herald Glasgow, with permission
-from Herald & Times Group. Each speaker has a different set of the
-newspaper texts, selected based on a greedy algorithm that increases
-the contextual and phonetic coverage. The details of the text selection
-algorithms are described in the following paper:
-
-C. Veaux, J. Yamagishi and S. King,
-"The voice bank corpus: Design, collection and data analysis of
-a large regional accent speech database,"
-https://doi.org/10.1109/ICSDA.2013.6709856
-
-The rainbow passage and elicitation paragraph are the same for all
-speakers. The rainbow passage can be found at the International Dialects
-of English Archive
-(http://web.ku.edu/~idea/readings/rainbow.htm). The elicitation
-paragraph is identical to the one used for the speech accent archive
-(http://accent.gmu.edu). The details of the speech accent archive
-can be found at
-http://www.ualberta.ca/~aacl2009/PDFs/WeinbergerKunath2009AACL.pdf
-
-All speech data was recorded using an identical recording setup: an
-omni-directional microphone (DPA 4035) and a small diaphragm condenser
-microphone with very wide bandwidth (Sennheiser MKH 800), at a 96 kHz
-sampling frequency and 24 bits, in a hemi-anechoic chamber of
-the University of Edinburgh. (However, two speakers, p280 and p315,
-had technical issues with the audio recordings made with the MKH 800.)
-All recordings were converted into 16 bits, downsampled to
-48 kHz, and manually end-pointed.
-
-This corpus was originally aimed at HMM-based text-to-speech synthesis
-systems, especially speaker-adaptive HMM-based speech synthesis
-that uses average voice models trained on multiple speakers and speaker
-adaptation technologies. This corpus is also suitable for DNN-based
-multi-speaker text-to-speech synthesis systems and waveform modeling.
-
-COPYING
-
-This corpus is licensed under the Creative Commons License: Attribution 4.0 International
-http://creativecommons.org/licenses/by/4.0/legalcode
-
-VCTK VARIANTS
-There are several variants of the VCTK corpus:
-Speech enhancement
-- Noisy speech database for training speech enhancement algorithms and TTS models, where we added various types of noise to VCTK artificially: http://dx.doi.org/10.7488/ds/2117
-- Reverberant speech database for training speech dereverberation algorithms and TTS models, where we added various types of reverberation to VCTK artificially: http://dx.doi.org/10.7488/ds/1425
-- Noisy reverberant speech database for training speech enhancement algorithms and TTS models: http://dx.doi.org/10.7488/ds/2139
-- Device Recorded VCTK, where speech signals of the VCTK corpus were played back and re-recorded in office environments using relatively inexpensive consumer devices: http://dx.doi.org/10.7488/ds/2316
-- The Microsoft Scalable Noisy Speech Dataset (MS-SNSD): https://github.com/microsoft/MS-SNSD
-
-ASV and anti-spoofing
-- Spoofing and Anti-Spoofing (SAS) corpus, a collection of synthetic speech signals produced by nine techniques, two of which are speech synthesis and seven of which are voice conversion, all built using the VCTK corpus: http://dx.doi.org/10.7488/ds/252
-- Automatic Speaker Verification Spoofing and Countermeasures Challenge (ASVspoof 2015) database, consisting of synthetic speech signals produced by ten techniques; it was used in the first Automatic Speaker Verification Spoofing and Countermeasures Challenge (ASVspoof 2015): http://dx.doi.org/10.7488/ds/298
-- ASVspoof 2019: the 3rd Automatic Speaker Verification Spoofing and Countermeasures Challenge database, used in the 3rd Automatic Speaker Verification Spoofing and Countermeasures Challenge (ASVspoof 2019): https://doi.org/10.7488/ds/2555
-
-
-ACKNOWLEDGEMENTS
-
-The CSTR VCTK Corpus was constructed by:
-
-  Christophe Veaux (University of Edinburgh)
-  Junichi Yamagishi (University of Edinburgh)
-  Kirsten MacDonald
-
-The research leading to these results was partly funded by EPSRC
-grants EP/I031022/1 (NST) and EP/J002526/1 (CAF), by the RSE-NSFC
-grant (61111130120), and by JST CREST (uDialogue).
-
-Please cite this corpus as follows:
-Christophe Veaux, Junichi Yamagishi, Kirsten MacDonald,
-"CSTR VCTK Corpus: English Multi-speaker Corpus for CSTR Voice Cloning Toolkit",
-The Centre for Speech Technology Research (CSTR),
-University of Edinburgh
-
diff --git a/samples/p240_00000.mp3 b/samples/p240_00000.mp3
deleted file mode 100644
index 4787405..0000000
Binary files a/samples/p240_00000.mp3 and /dev/null differ
diff --git a/samples/p260_00000.mp3 b/samples/p260_00000.mp3
deleted file mode 100644
index ff5f503..0000000
Binary files a/samples/p260_00000.mp3 and /dev/null differ
diff --git a/synthesizer_preprocess_audio.py b/synthesizer_preprocess_audio.py
index 7c322e7..51d92f9 100644
--- a/synthesizer_preprocess_audio.py
+++ b/synthesizer_preprocess_audio.py
@@ -12,6 +12,7 @@ recognized_datasets = [
 ]
 
 if __name__ == "__main__":
+    print("This script is deprecated and will no longer be supported, please use 'pre.py'")
     parser = argparse.ArgumentParser(
         description="Preprocesses audio files from datasets, encodes them as mel spectrograms "
                     "and writes them to the disk. Audio files are also saved, to be used by the "
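Note: both preprocessing scripts now print the deprecation notice unconditionally on every invocation, including for `--help`. If a filterable warning is preferred over plain stdout output, Python's standard `warnings` module is the conventional tool; a minimal sketch under that assumption, with the message text taken from this diff:

```
# Sketch: emitting the same deprecation notice via the warnings module
# instead of print(). Unlike print(), a DeprecationWarning goes to stderr
# and can be filtered, promoted to an error, or silenced by the caller.
import warnings

def warn_deprecated() -> None:
    warnings.warn(
        "This script is deprecated and will no longer be supported, "
        "please use 'pre.py'",
        DeprecationWarning,
        stacklevel=2,
    )

if __name__ == "__main__":
    # Ensure the warning is visible even under Python's default filters.
    warnings.simplefilter("default", DeprecationWarning)
    warn_deprecated()
```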
diff --git a/synthesizer_preprocess_embeds.py b/synthesizer_preprocess_embeds.py
index 94f864d..7276626 100644
--- a/synthesizer_preprocess_embeds.py
+++ b/synthesizer_preprocess_embeds.py
@@ -5,6 +5,7 @@ import argparse
 
 if __name__ == "__main__":
+    print("This script is deprecated and will no longer be supported, please use 'pre.py'")
     parser = argparse.ArgumentParser(
         description="Creates embeddings for the synthesizer from the LibriSpeech utterances.",
         formatter_class=argparse.ArgumentDefaultsHelpFormatter