From 245099c7402ec2d37a0e49b640cdc5f26b1d0620 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=9D=8E=E5=AD=90?=
 <54951765+kslz@users.noreply.github.com>
Date: Tue, 12 Oct 2021 23:40:27 +0800
Subject: [PATCH] =?UTF-8?q?=E6=94=AF=E6=8C=81data=5Faishell=EF=BC=88SLR33?=
 =?UTF-8?q?=EF=BC=89=E6=95=B0=E6=8D=AE=E9=9B=86=20(#141)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* 支持data_aishell（SLR33）数据集

* 更新readme
---
 README-CN.md              | 5 +++--
 README.md                 | 5 +++--
 pre.py                    | 3 ++-
 synthesizer/preprocess.py | 5 +++++
 4 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/README-CN.md b/README-CN.md
index 974d32e..448fbd6 100644
--- a/README-CN.md
+++ b/README-CN.md
@@ -8,7 +8,7 @@
 ### [DEMO VIDEO](https://www.bilibili.com/video/BV17Q4y1B7mY/)
 
 ## 特性
-🌍 **中文** 支持普通话并使用多种中文数据集进行测试：aidatatang_200zh, magicdata, aishell3， biaobei，MozillaCommonVoice 等
+🌍 **中文** 支持普通话并使用多种中文数据集进行测试：aidatatang_200zh, magicdata, aishell3, biaobei, MozillaCommonVoice, data_aishell 等
 
 🤩 **PyTorch** 适用于 pytorch，已在 1.9.0 版本（最新于 2021 年 8 月）中测试，GPU Tesla T4 和 GTX 2060
 
@@ -36,7 +36,7 @@
 * 进行音频和梅尔频谱图预处理：
 `python pre.py <datasets_root> -d {dataset} -n {number}`
 可传入参数：
-* -d`{dataset}` 指定数据集，支持 aidatatang_200zh, magicdata, aishell3, 不传默认为aidatatang_200zh
+* -d `{dataset}` 指定数据集，支持 aidatatang_200zh, magicdata, aishell3, data_aishell, 不传默认为aidatatang_200zh
 * -n `{number}` 指定并行数，CPU 11770k + 32GB实测10没有问题
 > 假如你下载的 `aidatatang_200zh`文件放在D盘，`train`文件路径为 `D:\data\aidatatang_200zh\corpus\train` , 你的`datasets_root`就是 `D:\data\`
 
@@ -135,6 +135,7 @@
 | aidatatang_200zh | [OpenSLR](http://www.openslr.org/62/) | [Google Drive](https://drive.google.com/file/d/110A11KZoVe7vy6kXlLb6zVPLb_J91I_t/view?usp=sharing) |
 | magicdata | [OpenSLR](http://www.openslr.org/68/) | [Google Drive (Dev set)](https://drive.google.com/file/d/1g5bWRUSNH68ycC6eNvtwh07nX3QhOOlo/view?usp=sharing) |
 | aishell3 | [OpenSLR](https://www.openslr.org/93/) | [Google Drive](https://drive.google.com/file/d/1shYp_o4Z0X0cZSKQDtFirct2luFUwKzZ/view?usp=sharing) |
+| data_aishell | [OpenSLR](https://www.openslr.org/33/) |  |
 > 解壓 aidatatang_200zh 後，還需將 `aidatatang_200zh\corpus\train`下的檔案全選解壓縮
 
 #### 2.`<datasets_root>`是什麼意思?
diff --git a/README.md b/README.md
index 1e7c938..1a56fe6 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@
 > English | [中文](README-CN.md)
 
 ## Features
-🌍 **Chinese** supported mandarin and tested with multiple datasets: aidatatang_200zh, magicdata, aishell3, and etc.
+🌍 **Chinese** supports Mandarin and has been tested with multiple datasets: aidatatang_200zh, magicdata, aishell3, data_aishell, etc.
 
 🤩 **PyTorch** worked for pytorch, tested in version of 1.9.0(latest in August 2021), with GPU Tesla T4 and GTX 2060
 
@@ -36,7 +36,7 @@ You can either train your models or use existing ones:
 * Download dataset and unzip: make sure you can access all .wav in folder
 * Preprocess with the audios and the mel spectrograms:
 `python pre.py <datasets_root>`
-Allowing parameter `--dataset {dataset}` to support aidatatang_200zh, magicdata, aishell3, etc.
+Allowing parameter `--dataset {dataset}` to support aidatatang_200zh, magicdata, aishell3, data_aishell, etc. If this parameter is not passed, the default dataset will be aidatatang_200zh.
 
 * Train the synthesizer:
 `python synthesizer_train.py mandarin <datasets_root>/SV2TTS/synthesizer`
@@ -91,6 +91,7 @@ You can then try the toolbox:
 | aidatatang_200zh | [OpenSLR](http://www.openslr.org/62/) | [Google Drive](https://drive.google.com/file/d/110A11KZoVe7vy6kXlLb6zVPLb_J91I_t/view?usp=sharing) |
 | magicdata | [OpenSLR](http://www.openslr.org/68/) | [Google Drive (Dev set)](https://drive.google.com/file/d/1g5bWRUSNH68ycC6eNvtwh07nX3QhOOlo/view?usp=sharing) |
 | aishell3 | [OpenSLR](https://www.openslr.org/93/) | [Google Drive](https://drive.google.com/file/d/1shYp_o4Z0X0cZSKQDtFirct2luFUwKzZ/view?usp=sharing) |
+| data_aishell | [OpenSLR](https://www.openslr.org/33/) |  |
 > After unzip aidatatang_200zh, you need to unzip all the files under `aidatatang_200zh\corpus\train`
 
 #### 2.What is`<datasets_root>`?
diff --git a/pre.py b/pre.py
index 1d78a12..2ea21ed 100644
--- a/pre.py
+++ b/pre.py
@@ -12,7 +12,8 @@ import argparse
 recognized_datasets = [
     "aidatatang_200zh",
     "magicdata",
-    "aishell3"
+    "aishell3",
+    "data_aishell"
 ]
 
 if __name__ == "__main__":
diff --git a/synthesizer/preprocess.py b/synthesizer/preprocess.py
index c4779ed..dc305e4 100644
--- a/synthesizer/preprocess.py
+++ b/synthesizer/preprocess.py
@@ -26,6 +26,11 @@ data_info = {
         "trans_filepath": "train/content.txt",
         "speak_func": preprocess_speaker_general,
         "transcript_func": preprocess_transcript_aishell3,
+    },
+    "data_aishell":{
+        "subfolders": ["wav/train"],
+        "trans_filepath": "transcript/aishell_transcript_v0.8.txt",
+        "speak_func": preprocess_speaker_general
     }
 }