1 /*
2 * Copyright (C) 2015-2020 Alibaba Group Holding Limited
3 *
4 */
5
6 #include <stdio.h>
7 #include <stdlib.h>
8 #include <stdint.h>
9 #include <string.h>
10
11 #include "uvoice_types.h"
12 #include "uvoice_player.h"
13 #include "uvoice_tts.h"
14
15 #include "../../../internal/uvoice_os.h"
16 #include "../../../internal/uvoice_common.h"
17 #include "../../../internal/uvoice_play.h"
18
19 #include "alicloudtts_intf.h"
20 #include "alicloudtts.h"
21 #include "httpclient.h"
22
23 #define ALIYUN_TTS_HTTP_URL "https://nls-gateway.cn-shanghai.aliyuncs.com/stream/v1/tts?"
24
25 /* https://help.aliyun.com/document_detail/84435.html?spm=a2c4g.11186623.6.581.16a75275MJHPrH */
26 static const voice_spec_t voice_spec[] = {
27 { "xiaoyun",
28 { VOICE_CHINESE, VOICE_CN_EN_MIX, VOICE_NULL },
29 { TTS_ALIYUN_SAMPLE_RATE_8K, TTS_ALIYUN_SAMPLE_RATE_16K, 0 } },
30 { "xiaogang",
31 { VOICE_CHINESE, VOICE_CN_EN_MIX, VOICE_NULL },
32 { TTS_ALIYUN_SAMPLE_RATE_8K, TTS_ALIYUN_SAMPLE_RATE_16K, 0 } },
33 { "ruoxi",
34 { VOICE_CHINESE, VOICE_CN_EN_MIX, VOICE_NULL },
35 { TTS_ALIYUN_SAMPLE_RATE_8K, TTS_ALIYUN_SAMPLE_RATE_16K, TTS_ALIYUN_SAMPLE_RATE_24K } },
36 { "xiaomeng",
37 { VOICE_CHINESE, VOICE_CN_EN_MIX, VOICE_NULL },
38 { TTS_ALIYUN_SAMPLE_RATE_8K, TTS_ALIYUN_SAMPLE_RATE_16K, 0 } },
39 { "xiaowei",
40 { VOICE_CHINESE, VOICE_CN_EN_MIX, VOICE_NULL },
41 { TTS_ALIYUN_SAMPLE_RATE_8K, TTS_ALIYUN_SAMPLE_RATE_16K, 0 } },
42 { "amei",
43 { VOICE_CHINESE, VOICE_CN_EN_MIX, VOICE_NULL },
44 { TTS_ALIYUN_SAMPLE_RATE_8K, TTS_ALIYUN_SAMPLE_RATE_16K, 0 } },
45 { "xiaoxue",
46 { VOICE_CHINESE, VOICE_CN_EN_MIX, VOICE_NULL },
47 { TTS_ALIYUN_SAMPLE_RATE_8K, TTS_ALIYUN_SAMPLE_RATE_16K, 0 } },
48 { "siqi",
49 { VOICE_CHINESE, VOICE_CN_EN_MIX, VOICE_NULL },
50 { TTS_ALIYUN_SAMPLE_RATE_8K, TTS_ALIYUN_SAMPLE_RATE_16K, TTS_ALIYUN_SAMPLE_RATE_24K } },
51 { "sijia",
52 { VOICE_CHINESE, VOICE_CN_EN_MIX, VOICE_NULL },
53 { TTS_ALIYUN_SAMPLE_RATE_8K, TTS_ALIYUN_SAMPLE_RATE_16K, TTS_ALIYUN_SAMPLE_RATE_24K } },
54 { "sicheng",
55 { VOICE_CHINESE, VOICE_CN_EN_MIX, VOICE_NULL },
56 { TTS_ALIYUN_SAMPLE_RATE_8K, TTS_ALIYUN_SAMPLE_RATE_16K, TTS_ALIYUN_SAMPLE_RATE_24K } },
57 { "siyue",
58 { VOICE_CHINESE, VOICE_CN_EN_MIX, VOICE_NULL },
59 { TTS_ALIYUN_SAMPLE_RATE_8K, TTS_ALIYUN_SAMPLE_RATE_16K, TTS_ALIYUN_SAMPLE_RATE_24K } },
60 { "xiaomei",
61 { VOICE_CHINESE, VOICE_CN_EN_MIX, VOICE_NULL },
62 { TTS_ALIYUN_SAMPLE_RATE_8K, TTS_ALIYUN_SAMPLE_RATE_16K, TTS_ALIYUN_SAMPLE_RATE_24K } },
63 { "sitong",
64 { VOICE_CHINESE, VOICE_NULL, VOICE_NULL },
65 { TTS_ALIYUN_SAMPLE_RATE_8K, TTS_ALIYUN_SAMPLE_RATE_16K, TTS_ALIYUN_SAMPLE_RATE_24K } },
66 { "ninger",
67 { VOICE_CHINESE, VOICE_NULL, VOICE_NULL },
68 { TTS_ALIYUN_SAMPLE_RATE_8K, TTS_ALIYUN_SAMPLE_RATE_16K, TTS_ALIYUN_SAMPLE_RATE_24K } },
69 { "xiaobei",
70 { VOICE_CHINESE, VOICE_NULL, VOICE_NULL },
71 { TTS_ALIYUN_SAMPLE_RATE_8K, TTS_ALIYUN_SAMPLE_RATE_16K, TTS_ALIYUN_SAMPLE_RATE_24K } },
72 { "yina",
73 { VOICE_CHINESE, VOICE_NULL, VOICE_NULL },
74 { TTS_ALIYUN_SAMPLE_RATE_8K, TTS_ALIYUN_SAMPLE_RATE_16K, TTS_ALIYUN_SAMPLE_RATE_24K } },
75 { "sijing",
76 { VOICE_CHINESE, VOICE_NULL, VOICE_NULL },
77 { TTS_ALIYUN_SAMPLE_RATE_8K, TTS_ALIYUN_SAMPLE_RATE_16K, TTS_ALIYUN_SAMPLE_RATE_24K } },
78 { "wendy",
79 { VOICE_ENGLISH, VOICE_NULL, VOICE_NULL },
80 { TTS_ALIYUN_SAMPLE_RATE_8K, TTS_ALIYUN_SAMPLE_RATE_16K, TTS_ALIYUN_SAMPLE_RATE_24K } },
81 { "william",
82 { VOICE_ENGLISH, VOICE_NULL, VOICE_NULL },
83 { TTS_ALIYUN_SAMPLE_RATE_8K, TTS_ALIYUN_SAMPLE_RATE_16K, TTS_ALIYUN_SAMPLE_RATE_24K } },
84 { "halen", { VOICE_ENGLISH, VOICE_NULL, VOICE_NULL }, { TTS_ALIYUN_SAMPLE_RATE_8K, TTS_ALIYUN_SAMPLE_RATE_16K, 0 } },
85 { "harry", { VOICE_ENGLISH, VOICE_NULL, VOICE_NULL }, { TTS_ALIYUN_SAMPLE_RATE_8K, TTS_ALIYUN_SAMPLE_RATE_16K, 0 } },
86 };
87
/* Active TTS settings: written by uvoice_tts_aliyun_init(), read when the request URL is built. */
static aliyun_tts_config_t g_tts_config;
/* Module state: set to ALIYUN_TTS_STATE_INITED by a successful init and checked before each request. */
static int tts_state = ALIYUN_TTS_STATE_NULL;
/*
 * Text used for the "format" query parameter, indexed by the stored format code.
 * NOTE(review): assumes MEDIA_FMT_PCM / MEDIA_FMT_WAV / MEDIA_FMT_MP3 are 1 / 2 / 3
 * so that they select "pcm" / "wav" / "mp3" -- confirm against the enum definition.
 * The table is read-only, so both the pointers and the strings are const.
 */
static const char *const tts_format[4] = { "null", "pcm", "wav", "mp3" };
91
/**
 * Validate the caller's TTS settings and store them as the active Aliyun TTS
 * configuration.
 *
 * Nothing is written to the stored configuration unless every check passes,
 * so a failed call leaves any previously stored configuration untouched.
 *
 * @param config  settings to apply; app_key, token and voice must be non-NULL
 *                strings that fit the corresponding g_tts_config buffers.
 * @return 0 on success (module state becomes ALIYUN_TTS_STATE_INITED);
 *         -1 when a field is missing, out of range or too long, or when the
 *         voice / sample-rate pair is not listed in voice_spec[].
 */
int uvoice_tts_aliyun_init(tts_config_t *config)
{
    size_t i;
    int j;
    int voice_found = 0;
    int rate_ok = 0;

    if (!config) {
        M_LOGE("config is null!");
        return -1;
    }

    if (!config->app_key) {
        M_LOGE("app_key is null!");
        return -1;
    }

    if (!config->token) {
        M_LOGE("token is null!");
        return -1;
    }

    /* voice is handed to strcmp() below, so it must be checked like the other strings. */
    if (!config->voice) {
        M_LOGE("voice is null!");
        return -1;
    }

    if ((config->format != MEDIA_FMT_PCM) && (config->format != MEDIA_FMT_WAV) && (config->format != MEDIA_FMT_MP3)) {
        M_LOGE("format %d is not supported !", config->format);
        return -1;
    }

    if ((config->sample_rate != TTS_ALIYUN_SAMPLE_RATE_8K) && (config->sample_rate != TTS_ALIYUN_SAMPLE_RATE_16K) &&
        (config->sample_rate != TTS_ALIYUN_SAMPLE_RATE_24K)) {
        M_LOGE("sample rate %d is not supported !", config->sample_rate);
        return -1;
    }

    if ((config->speech_rate < TTS_SPEECH_RATE_MIN) || (config->speech_rate > TTS_SPEECH_RATE_MAX)) {
        M_LOGE("speech rate %d is not supported !", config->speech_rate);
        return -1;
    }

    if ((config->pitch_rate < TTS_PITCH_RATE_MIN) || (config->pitch_rate > TTS_PITCH_RATE_MAX)) {
        M_LOGE("pitch rate %d is not supported !", config->pitch_rate);
        return -1;
    }

    if ((config->volume < TTS_VOLUME_MIN) || (config->volume > TTS_VOLUME_MAX)) {
        M_LOGE("volume %d is not supported !", config->volume);
        return -1;
    }

    /* Look the voice up by name, then check the requested rate against its rate slots. */
    for (i = 0; i < sizeof(voice_spec) / sizeof(voice_spec[0]); i++) {
        if (strcmp(voice_spec[i].voice_people, config->voice) != 0) {
            continue;
        }

        voice_found = 1;
        /* Each voice_spec[] entry carries three rate slots; 0 marks an unused slot. */
        for (j = 0; j < 3; j++) {
            if ((voice_spec[i].sample_rate[j] != 0) && (voice_spec[i].sample_rate[j] == config->sample_rate)) {
                rate_ok = 1;
                break;
            }
        }
        break;
    }

    if (!voice_found) {
        M_LOGE("voice %s is not supported !", config->voice);
        return -1;
    }

    if (!rate_ok) {
        M_LOGE("sample rate %d not match %s !", config->sample_rate, config->voice);
        return -1;
    }

    /* Reject strings that would be cut short: a truncated key or token can only fail later. */
    if ((strlen(config->app_key) >= sizeof(g_tts_config.app_key)) ||
        (strlen(config->token) >= sizeof(g_tts_config.token)) ||
        (strlen(config->voice) >= sizeof(g_tts_config.voice))) {
        M_LOGE("app_key, token or voice is too long !");
        return -1;
    }

    memset(&g_tts_config, 0, sizeof(g_tts_config));
    snprintf(g_tts_config.app_key, sizeof(g_tts_config.app_key), "%s", config->app_key);
    snprintf(g_tts_config.token, sizeof(g_tts_config.token), "%s", config->token);
    snprintf(g_tts_config.voice, sizeof(g_tts_config.voice), "%s", config->voice);
    g_tts_config.format = config->format;           /* one of MEDIA_FMT_PCM / WAV / MP3 */
    g_tts_config.sample_rate = config->sample_rate; /* 8K, 16K or 24K, as allowed for the voice */
    g_tts_config.volume = config->volume;           /* within TTS_VOLUME_MIN .. TTS_VOLUME_MAX */
    g_tts_config.speech_rate = config->speech_rate; /* within TTS_SPEECH_RATE_MIN .. TTS_SPEECH_RATE_MAX */
    g_tts_config.pitch_rate = config->pitch_rate;   /* within TTS_PITCH_RATE_MIN .. TTS_PITCH_RATE_MAX */
    g_tts_config.text_encode_type = config->text_encode_type;

    tts_state = ALIYUN_TTS_STATE_INITED;
    return 0;
}
175
/**
 * Fetch req_url with an HTTP GET and hand every received body block to
 * recv_cb->recv_data(), together with a running block index that starts at 0.
 *
 * The function returns nothing, so failures are only logged; the transfer
 * stops at the first allocation, send or receive error.
 *
 * @param req_url  complete request URL.
 * @param recv_cb  callbacks; recv_data must be set.
 */
static void http_download(char *req_url, tts_recv_callback_t *recv_cb)
{
    const size_t rsp_buf_len = 2048;
    size_t req_buf_len;
    char *rsp_buf = NULL;
    char *req_buf = NULL;
    httpclient_t client = { 0 };
    httpclient_data_t client_data = { 0 };
    int num = 0;
    int ret = 0;
    int idx = 0;

    if (!req_url || !recv_cb || !recv_cb->recv_data) {
        M_LOGE("invalid http download arguments !\n");
        return;
    }

    req_buf_len = 1024 + strlen(req_url);

    /* calloc clears the whole buffer; the old memset(req_buf, 0, sizeof(req_buf)) cleared only pointer-size bytes. */
    rsp_buf = calloc(1, rsp_buf_len);
    req_buf = calloc(1, req_buf_len);
    if (!rsp_buf || !req_buf) {
        M_LOGE("alloc http buffers fail !\n");
        goto exit_free; /* nothing was connected, so there is nothing to close */
    }

    client_data.header_buf = req_buf;
    client_data.header_buf_len = req_buf_len;
    client_data.response_buf = rsp_buf;
    client_data.response_buf_len = rsp_buf_len;

    ret = httpclient_conn(&client, req_url);
    if (ret) {
        M_LOGE("http connect failed %d !\n", ret);
        goto exit_close;
    }

    /* NOTE(review): negative httpclient return codes are treated as errors -- confirm against httpclient.h. */
    ret = httpclient_send(&client, req_url, HTTP_GET, &client_data);
    if (ret < 0) {
        M_LOGE("http send failed %d !\n", ret);
        goto exit_close;
    }

    do {
        ret = httpclient_recv(&client, &client_data);
        if (ret < 0) {
            /* Do not pass stale buffer contents to the callback or loop on a stale is_more flag. */
            M_LOGE("http recv failed %d !\n", ret);
            break;
        }
        M_LOGD("response_content_len=%d, retrieve_len=%d,content_block_len=%d\n", client_data.response_content_len,
               client_data.retrieve_len, client_data.content_block_len);
        M_LOGD("ismore = %d\n", client_data.is_more);

        num = recv_cb->recv_data(client_data.response_buf, client_data.content_block_len, idx++);
        if (num > 0) {
            printf("aos_write num=%d\n", num);
        }
    } while (client_data.is_more);

exit_close:
    httpclient_clse(&client);

exit_free:
    free(rsp_buf);
    free(req_buf);
}
218
219 //by: alicloud_tts_download_task()
/**
 * Build the Aliyun TTS HTTP GET URL for text and either hand the URL to the
 * caller or download the synthesized audio.
 *
 * Text longer than UVOICE_TTS_MAX_TEXT_LEN bytes is cut to that length and a
 * TTS_WARNING event is raised. The cut is byte based.
 * NOTE(review): a byte-based cut may split a multi-byte character -- confirm
 * whether callers rely on character-aligned truncation.
 *
 * @param text       NUL-terminated text to synthesize (GBK or UTF-8, as
 *                   selected by the stored text_encode_type).
 * @param recv_type  TTS_RECV_URL: pass the URL to recv_cb->recv_url and raise
 *                   TTS_TRANS_COMPLETE.
 *                   TTS_RECV_DATA: download the audio and feed it to
 *                   recv_cb->recv_data.
 *                   Any other value builds the URL and returns 0 without
 *                   using it.
 * @param recv_cb    callbacks; must be non-NULL, and recv_data must be set for
 *                   TTS_RECV_DATA.
 * @return 0 on success; -1 when the module is not initialized, an argument is
 *         missing, an allocation fails or the URL does not fit its buffer.
 */
int uvoice_tts_aliyun_request(char *text, tts_recv_type_e recv_type, tts_recv_callback_t *recv_cb)
{
    int ret = 0;
    int url_len;
    int tts_text_len;
    char *tts_text = NULL;
    char *utf8_text = NULL;
    char *tts_get_url = NULL;

    if (tts_state != ALIYUN_TTS_STATE_INITED) {
        M_LOGE("aliyun tts: not initialized\n");
        return -1;
    }

    if (!recv_cb) {
        M_LOGE("aliyun tts: recv callback is null\n");
        return -1;
    }

    if (!text) {
        M_LOGE("aliyun tts: text is null\n");
        return -1;
    }

    tts_text_len = strlen(text);
    if (tts_text_len > UVOICE_TTS_MAX_TEXT_LEN) {
        tts_text_len = UVOICE_TTS_MAX_TEXT_LEN;
        if (recv_cb->event) {
            recv_cb->event(TTS_WARNING, "text length too long, cut to " UVOICE_TTS_MAX_TEXT_LEN_STR);
        }
    }

    /* Three bytes per input byte leaves room for the encoded form written by uvoice_urlencode(). */
    tts_text = snd_zalloc(tts_text_len * 3 + 1, AFM_EXTN);
    if (!tts_text) {
        M_LOGE("alloc tts text buffer fail !\n");
        return -1;
    }

    /*
     * Copy at most tts_text_len bytes. Copying up to the full buffer size, as
     * before, ignored the length cap and left no headroom for URL encoding.
     */
    snprintf(tts_text, tts_text_len + 1, "%s", text);

#ifdef UVOICE_TTS_SUPPORT_GBK_ENCODE
    if (g_tts_config.text_encode_type == TTS_ENCODE_GBK) {
        utf8_text = snd_zalloc(tts_text_len * 2 * 3 + 1, AFM_EXTN);
        if (!utf8_text) {
            M_LOGE("alloc utf8 text buffer fail !\n");
            ret = -1;
            goto exit_free;
        }
        uvoice_gbk2utf8(tts_text, utf8_text, tts_text_len * 2);
        M_LOGI("text encode type GBK\n");
    } else
#endif
    {
        /* No conversion needed: utf8_text aliases tts_text and must not be freed separately. */
        utf8_text = tts_text;
        M_LOGI("text encode type UTF-8\n");
    }

    uvoice_urlencode(utf8_text);

    tts_get_url = snd_zalloc(UVOICE_TTS_HTTPGET_URL_LENGTH, AFM_MAIN);
    if (!tts_get_url) {
        M_LOGE("alloc url buffer fail !\n");
        ret = -1;
        goto exit_free;
    }

    url_len = snprintf(tts_get_url, UVOICE_TTS_HTTPGET_URL_LENGTH,
                       "%sappkey=%s&token=%s&text=%s&format=%s&sample_rate=%u&voice=%s&volume=%d&speech_rate=%d&pitch_rate=%d",
                       ALIYUN_TTS_HTTP_URL, g_tts_config.app_key, g_tts_config.token, utf8_text,
                       tts_format[g_tts_config.format], g_tts_config.sample_rate, g_tts_config.voice,
                       g_tts_config.volume, g_tts_config.speech_rate, g_tts_config.pitch_rate);
    if ((url_len < 0) || (url_len >= UVOICE_TTS_HTTPGET_URL_LENGTH)) {
        /* A truncated URL would silently drop trailing parameters or split an escape sequence. */
        M_LOGE("tts url does not fit in url buffer !\n");
        ret = -1;
        goto exit_free;
    }

    if (recv_type == TTS_RECV_URL) {
        if (recv_cb->recv_url) {
            /* NOTE(review): this prints the access token as part of the URL. */
            printf("tts url is %s\n", tts_get_url);
            recv_cb->recv_url(tts_get_url);
        }
        if (recv_cb->event) {
            recv_cb->event(TTS_TRANS_COMPLETE, "OK");
        }
        goto exit_free;
    }

    if (recv_type == TTS_RECV_DATA) {
        if (recv_cb->recv_data == NULL) {
            M_LOGE("recv_cb->recv_data is null !\n");
            ret = -1;
            goto exit_free;
        }

        /* The unreachable media-loader download path that used to follow this call has been removed. */
        http_download(tts_get_url, recv_cb);
    }

exit_free:
    /* Free the converted text only when it is a separate buffer, to avoid a double free. */
    if (utf8_text && (utf8_text != tts_text)) {
        snd_free(utf8_text);
    }
    if (tts_text) {
        snd_free(tts_text);
    }
    if (tts_get_url) {
        snd_free(tts_get_url);
    }
    return ret;
}
375
/**
 * Stop hook for the Aliyun TTS backend. It currently does nothing: requests
 * run synchronously inside uvoice_tts_aliyun_request().
 *
 * @return always 0.
 */
int uvoice_tts_aliyun_stop(void)
{
    return 0;
}