1 /*
2 Simple DirectMedia Layer
3 Copyright (C) 1997-2020 Sam Lantinga <slouken@libsdl.org>
4
5 This software is provided 'as-is', without any express or implied
6 warranty. In no event will the authors be held liable for any damages
7 arising from the use of this software.
8
9 Permission is granted to anyone to use this software for any purpose,
10 including commercial applications, and to alter it and redistribute it
11 freely, subject to the following restrictions:
12
13 1. The origin of this software must not be misrepresented; you must not
14 claim that you wrote the original software. If you use this software
15 in a product, an acknowledgment in the product documentation would be
16 appreciated but is not required.
17 2. Altered source versions must be plainly marked as such, and must not be
18 misrepresented as being the original software.
19 3. This notice may not be removed or altered from any source distribution.
20 */
21
22 #include "../SDL_internal.h"
23 #include "SDL_audio.h"
24 #include "SDL_audio_c.h"
25 #include "SDL_cpuinfo.h"
26 #include "SDL_assert.h"
27
28 #ifdef __ARM_NEON
29 #define HAVE_NEON_INTRINSICS 1
30 #endif
31
32 #ifdef __SSE2__
33 #define HAVE_SSE2_INTRINSICS 1
34 #endif
35
36 #if defined(__x86_64__) && HAVE_SSE2_INTRINSICS
37 #define NEED_SCALAR_CONVERTER_FALLBACKS 0 /* x86_64 guarantees SSE2. */
38 #elif __MACOSX__ && HAVE_SSE2_INTRINSICS
39 #define NEED_SCALAR_CONVERTER_FALLBACKS 0 /* Mac OS X/Intel guarantees SSE2. */
40 #elif defined(__ARM_ARCH) && (__ARM_ARCH >= 8) && HAVE_NEON_INTRINSICS
41 #define NEED_SCALAR_CONVERTER_FALLBACKS 0 /* ARMv8+ promise NEON. */
42 #elif defined(__APPLE__) && defined(__ARM_ARCH) && (__ARM_ARCH >= 7) && HAVE_NEON_INTRINSICS
43 #define NEED_SCALAR_CONVERTER_FALLBACKS 0 /* All Apple ARMv7 chips promise NEON support. */
44 #endif
45
46 /* Set to zero if platform is guaranteed to use a SIMD codepath here. */
47 #ifndef NEED_SCALAR_CONVERTER_FALLBACKS
48 #define NEED_SCALAR_CONVERTER_FALLBACKS 1
49 #endif
50
/* Function pointers set to a CPU-specific implementation. */
/* Each one starts out NULL here; the code that assigns them is not part of
   this section of the file. */
SDL_AudioFilter SDL_Convert_S8_to_F32 = NULL;
SDL_AudioFilter SDL_Convert_U8_to_F32 = NULL;
SDL_AudioFilter SDL_Convert_S16_to_F32 = NULL;
SDL_AudioFilter SDL_Convert_U16_to_F32 = NULL;
SDL_AudioFilter SDL_Convert_S32_to_F32 = NULL;
SDL_AudioFilter SDL_Convert_F32_to_S8 = NULL;
SDL_AudioFilter SDL_Convert_F32_to_U8 = NULL;
SDL_AudioFilter SDL_Convert_F32_to_S16 = NULL;
SDL_AudioFilter SDL_Convert_F32_to_U16 = NULL;
SDL_AudioFilter SDL_Convert_F32_to_S32 = NULL;


/* Reciprocal scale factors the converters multiply by when turning integer
   samples into floats: 1/128 for 8-bit data, 1/32768 for 16-bit data, and
   1/8388607 for 32-bit data after it has been shifted right by 8 bits. */
#define DIVBY128 0.0078125f
#define DIVBY32768 0.000030517578125f
#define DIVBY8388607 0.00000011920930376163766f
67
68
69 #if NEED_SCALAR_CONVERTER_FALLBACKS
70 static void SDLCALL
SDL_Convert_S8_to_F32_Scalar(SDL_AudioCVT * cvt,SDL_AudioFormat format)71 SDL_Convert_S8_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
72 {
73 const Sint8 *src = ((const Sint8 *) (cvt->buf + cvt->len_cvt)) - 1;
74 float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
75 int i;
76
77 LOG_DEBUG_CONVERT("AUDIO_S8", "AUDIO_F32");
78
79 for (i = cvt->len_cvt; i; --i, --src, --dst) {
80 *dst = ((float) *src) * DIVBY128;
81 }
82
83 cvt->len_cvt *= 4;
84 if (cvt->filters[++cvt->filter_index]) {
85 cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
86 }
87 }
88
89 static void SDLCALL
SDL_Convert_U8_to_F32_Scalar(SDL_AudioCVT * cvt,SDL_AudioFormat format)90 SDL_Convert_U8_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
91 {
92 const Uint8 *src = ((const Uint8 *) (cvt->buf + cvt->len_cvt)) - 1;
93 float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
94 int i;
95
96 LOG_DEBUG_CONVERT("AUDIO_U8", "AUDIO_F32");
97
98 for (i = cvt->len_cvt; i; --i, --src, --dst) {
99 *dst = (((float) *src) * DIVBY128) - 1.0f;
100 }
101
102 cvt->len_cvt *= 4;
103 if (cvt->filters[++cvt->filter_index]) {
104 cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
105 }
106 }
107
108 static void SDLCALL
SDL_Convert_S16_to_F32_Scalar(SDL_AudioCVT * cvt,SDL_AudioFormat format)109 SDL_Convert_S16_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
110 {
111 const Sint16 *src = ((const Sint16 *) (cvt->buf + cvt->len_cvt)) - 1;
112 float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
113 int i;
114
115 LOG_DEBUG_CONVERT("AUDIO_S16", "AUDIO_F32");
116
117 for (i = cvt->len_cvt / sizeof (Sint16); i; --i, --src, --dst) {
118 *dst = ((float) *src) * DIVBY32768;
119 }
120
121 cvt->len_cvt *= 2;
122 if (cvt->filters[++cvt->filter_index]) {
123 cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
124 }
125 }
126
127 static void SDLCALL
SDL_Convert_U16_to_F32_Scalar(SDL_AudioCVT * cvt,SDL_AudioFormat format)128 SDL_Convert_U16_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
129 {
130 const Uint16 *src = ((const Uint16 *) (cvt->buf + cvt->len_cvt)) - 1;
131 float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
132 int i;
133
134 LOG_DEBUG_CONVERT("AUDIO_U16", "AUDIO_F32");
135
136 for (i = cvt->len_cvt / sizeof (Uint16); i; --i, --src, --dst) {
137 *dst = (((float) *src) * DIVBY32768) - 1.0f;
138 }
139
140 cvt->len_cvt *= 2;
141 if (cvt->filters[++cvt->filter_index]) {
142 cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
143 }
144 }
145
146 static void SDLCALL
SDL_Convert_S32_to_F32_Scalar(SDL_AudioCVT * cvt,SDL_AudioFormat format)147 SDL_Convert_S32_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
148 {
149 const Sint32 *src = (const Sint32 *) cvt->buf;
150 float *dst = (float *) cvt->buf;
151 int i;
152
153 LOG_DEBUG_CONVERT("AUDIO_S32", "AUDIO_F32");
154
155 for (i = cvt->len_cvt / sizeof (Sint32); i; --i, ++src, ++dst) {
156 *dst = ((float) (*src>>8)) * DIVBY8388607;
157 }
158
159 if (cvt->filters[++cvt->filter_index]) {
160 cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
161 }
162 }
163
164 static void SDLCALL
SDL_Convert_F32_to_S8_Scalar(SDL_AudioCVT * cvt,SDL_AudioFormat format)165 SDL_Convert_F32_to_S8_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
166 {
167 const float *src = (const float *) cvt->buf;
168 Sint8 *dst = (Sint8 *) cvt->buf;
169 int i;
170
171 LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S8");
172
173 for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
174 const float sample = *src;
175 if (sample >= 1.0f) {
176 *dst = 127;
177 } else if (sample <= -1.0f) {
178 *dst = -128;
179 } else {
180 *dst = (Sint8)(sample * 127.0f);
181 }
182 }
183
184 cvt->len_cvt /= 4;
185 if (cvt->filters[++cvt->filter_index]) {
186 cvt->filters[cvt->filter_index](cvt, AUDIO_S8);
187 }
188 }
189
190 static void SDLCALL
SDL_Convert_F32_to_U8_Scalar(SDL_AudioCVT * cvt,SDL_AudioFormat format)191 SDL_Convert_F32_to_U8_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
192 {
193 const float *src = (const float *) cvt->buf;
194 Uint8 *dst = (Uint8 *) cvt->buf;
195 int i;
196
197 LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U8");
198
199 for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
200 const float sample = *src;
201 if (sample >= 1.0f) {
202 *dst = 255;
203 } else if (sample <= -1.0f) {
204 *dst = 0;
205 } else {
206 *dst = (Uint8)((sample + 1.0f) * 127.0f);
207 }
208 }
209
210 cvt->len_cvt /= 4;
211 if (cvt->filters[++cvt->filter_index]) {
212 cvt->filters[cvt->filter_index](cvt, AUDIO_U8);
213 }
214 }
215
216 static void SDLCALL
SDL_Convert_F32_to_S16_Scalar(SDL_AudioCVT * cvt,SDL_AudioFormat format)217 SDL_Convert_F32_to_S16_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
218 {
219 const float *src = (const float *) cvt->buf;
220 Sint16 *dst = (Sint16 *) cvt->buf;
221 int i;
222
223 LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S16");
224
225 for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
226 const float sample = *src;
227 if (sample >= 1.0f) {
228 *dst = 32767;
229 } else if (sample <= -1.0f) {
230 *dst = -32768;
231 } else {
232 *dst = (Sint16)(sample * 32767.0f);
233 }
234 }
235
236 cvt->len_cvt /= 2;
237 if (cvt->filters[++cvt->filter_index]) {
238 cvt->filters[cvt->filter_index](cvt, AUDIO_S16SYS);
239 }
240 }
241
242 static void SDLCALL
SDL_Convert_F32_to_U16_Scalar(SDL_AudioCVT * cvt,SDL_AudioFormat format)243 SDL_Convert_F32_to_U16_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
244 {
245 const float *src = (const float *) cvt->buf;
246 Uint16 *dst = (Uint16 *) cvt->buf;
247 int i;
248
249 LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U16");
250
251 for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
252 const float sample = *src;
253 if (sample >= 1.0f) {
254 *dst = 65535;
255 } else if (sample <= -1.0f) {
256 *dst = 0;
257 } else {
258 *dst = (Uint16)((sample + 1.0f) * 32767.0f);
259 }
260 }
261
262 cvt->len_cvt /= 2;
263 if (cvt->filters[++cvt->filter_index]) {
264 cvt->filters[cvt->filter_index](cvt, AUDIO_U16SYS);
265 }
266 }
267
268 static void SDLCALL
SDL_Convert_F32_to_S32_Scalar(SDL_AudioCVT * cvt,SDL_AudioFormat format)269 SDL_Convert_F32_to_S32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
270 {
271 const float *src = (const float *) cvt->buf;
272 Sint32 *dst = (Sint32 *) cvt->buf;
273 int i;
274
275 LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S32");
276
277 for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
278 const float sample = *src;
279 if (sample >= 1.0f) {
280 *dst = 2147483647;
281 } else if (sample <= -1.0f) {
282 *dst = (Sint32) -2147483648LL;
283 } else {
284 *dst = ((Sint32)(sample * 8388607.0f)) << 8;
285 }
286 }
287
288 if (cvt->filters[++cvt->filter_index]) {
289 cvt->filters[cvt->filter_index](cvt, AUDIO_S32SYS);
290 }
291 }
292 #endif
293
294
295 #if HAVE_SSE2_INTRINSICS
296 static void SDLCALL
SDL_Convert_S8_to_F32_SSE2(SDL_AudioCVT * cvt,SDL_AudioFormat format)297 SDL_Convert_S8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
298 {
299 const Sint8 *src = ((const Sint8 *) (cvt->buf + cvt->len_cvt)) - 1;
300 float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
301 int i;
302
303 LOG_DEBUG_CONVERT("AUDIO_S8", "AUDIO_F32 (using SSE2)");
304
305 /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
306 for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
307 *dst = ((float) *src) * DIVBY128;
308 }
309
310 src -= 15; dst -= 15; /* adjust to read SSE blocks from the start. */
311 SDL_assert(!i || ((((size_t) dst) & 15) == 0));
312
313 /* Make sure src is aligned too. */
314 if ((((size_t) src) & 15) == 0) {
315 /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
316 const __m128i *mmsrc = (const __m128i *) src;
317 const __m128i zero = _mm_setzero_si128();
318 const __m128 divby128 = _mm_set1_ps(DIVBY128);
319 while (i >= 16) { /* 16 * 8-bit */
320 const __m128i bytes = _mm_load_si128(mmsrc); /* get 16 sint8 into an XMM register. */
321 /* treat as int16, shift left to clear every other sint16, then back right with sign-extend. Now sint16. */
322 const __m128i shorts1 = _mm_srai_epi16(_mm_slli_epi16(bytes, 8), 8);
323 /* right-shift-sign-extend gets us sint16 with the other set of values. */
324 const __m128i shorts2 = _mm_srai_epi16(bytes, 8);
325 /* unpack against zero to make these int32, shift to make them sign-extend, convert to float, multiply. Whew! */
326 const __m128 floats1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpacklo_epi16(shorts1, zero), 16), 16)), divby128);
327 const __m128 floats2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpacklo_epi16(shorts2, zero), 16), 16)), divby128);
328 const __m128 floats3 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpackhi_epi16(shorts1, zero), 16), 16)), divby128);
329 const __m128 floats4 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpackhi_epi16(shorts2, zero), 16), 16)), divby128);
330 /* Interleave back into correct order, store. */
331 _mm_store_ps(dst, _mm_unpacklo_ps(floats1, floats2));
332 _mm_store_ps(dst+4, _mm_unpackhi_ps(floats1, floats2));
333 _mm_store_ps(dst+8, _mm_unpacklo_ps(floats3, floats4));
334 _mm_store_ps(dst+12, _mm_unpackhi_ps(floats3, floats4));
335 i -= 16; mmsrc--; dst -= 16;
336 }
337
338 src = (const Sint8 *) mmsrc;
339 }
340
341 src += 15; dst += 15; /* adjust for any scalar finishing. */
342
343 /* Finish off any leftovers with scalar operations. */
344 while (i) {
345 *dst = ((float) *src) * DIVBY128;
346 i--; src--; dst--;
347 }
348
349 cvt->len_cvt *= 4;
350 if (cvt->filters[++cvt->filter_index]) {
351 cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
352 }
353 }
354
355 static void SDLCALL
SDL_Convert_U8_to_F32_SSE2(SDL_AudioCVT * cvt,SDL_AudioFormat format)356 SDL_Convert_U8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
357 {
358 const Uint8 *src = ((const Uint8 *) (cvt->buf + cvt->len_cvt)) - 1;
359 float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
360 int i;
361
362 LOG_DEBUG_CONVERT("AUDIO_U8", "AUDIO_F32 (using SSE2)");
363
364 /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
365 for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
366 *dst = (((float) *src) * DIVBY128) - 1.0f;
367 }
368
369 src -= 15; dst -= 15; /* adjust to read SSE blocks from the start. */
370 SDL_assert(!i || ((((size_t) dst) & 15) == 0));
371
372 /* Make sure src is aligned too. */
373 if ((((size_t) src) & 15) == 0) {
374 /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
375 const __m128i *mmsrc = (const __m128i *) src;
376 const __m128i zero = _mm_setzero_si128();
377 const __m128 divby128 = _mm_set1_ps(DIVBY128);
378 const __m128 minus1 = _mm_set1_ps(-1.0f);
379 while (i >= 16) { /* 16 * 8-bit */
380 const __m128i bytes = _mm_load_si128(mmsrc); /* get 16 uint8 into an XMM register. */
381 /* treat as int16, shift left to clear every other sint16, then back right with zero-extend. Now uint16. */
382 const __m128i shorts1 = _mm_srli_epi16(_mm_slli_epi16(bytes, 8), 8);
383 /* right-shift-zero-extend gets us uint16 with the other set of values. */
384 const __m128i shorts2 = _mm_srli_epi16(bytes, 8);
385 /* unpack against zero to make these int32, convert to float, multiply, add. Whew! */
386 /* Note that AVX2 can do floating point multiply+add in one instruction, fwiw. SSE2 cannot. */
387 const __m128 floats1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts1, zero)), divby128), minus1);
388 const __m128 floats2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts2, zero)), divby128), minus1);
389 const __m128 floats3 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts1, zero)), divby128), minus1);
390 const __m128 floats4 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts2, zero)), divby128), minus1);
391 /* Interleave back into correct order, store. */
392 _mm_store_ps(dst, _mm_unpacklo_ps(floats1, floats2));
393 _mm_store_ps(dst+4, _mm_unpackhi_ps(floats1, floats2));
394 _mm_store_ps(dst+8, _mm_unpacklo_ps(floats3, floats4));
395 _mm_store_ps(dst+12, _mm_unpackhi_ps(floats3, floats4));
396 i -= 16; mmsrc--; dst -= 16;
397 }
398
399 src = (const Uint8 *) mmsrc;
400 }
401
402 src += 15; dst += 15; /* adjust for any scalar finishing. */
403
404 /* Finish off any leftovers with scalar operations. */
405 while (i) {
406 *dst = (((float) *src) * DIVBY128) - 1.0f;
407 i--; src--; dst--;
408 }
409
410 cvt->len_cvt *= 4;
411 if (cvt->filters[++cvt->filter_index]) {
412 cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
413 }
414 }
415
416 static void SDLCALL
SDL_Convert_S16_to_F32_SSE2(SDL_AudioCVT * cvt,SDL_AudioFormat format)417 SDL_Convert_S16_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
418 {
419 const Sint16 *src = ((const Sint16 *) (cvt->buf + cvt->len_cvt)) - 1;
420 float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
421 int i;
422
423 LOG_DEBUG_CONVERT("AUDIO_S16", "AUDIO_F32 (using SSE2)");
424
425 /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
426 for (i = cvt->len_cvt / sizeof (Sint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
427 *dst = ((float) *src) * DIVBY32768;
428 }
429
430 src -= 7; dst -= 7; /* adjust to read SSE blocks from the start. */
431 SDL_assert(!i || ((((size_t) dst) & 15) == 0));
432
433 /* Make sure src is aligned too. */
434 if ((((size_t) src) & 15) == 0) {
435 /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
436 const __m128 divby32768 = _mm_set1_ps(DIVBY32768);
437 while (i >= 8) { /* 8 * 16-bit */
438 const __m128i ints = _mm_load_si128((__m128i const *) src); /* get 8 sint16 into an XMM register. */
439 /* treat as int32, shift left to clear every other sint16, then back right with sign-extend. Now sint32. */
440 const __m128i a = _mm_srai_epi32(_mm_slli_epi32(ints, 16), 16);
441 /* right-shift-sign-extend gets us sint32 with the other set of values. */
442 const __m128i b = _mm_srai_epi32(ints, 16);
443 /* Interleave these back into the right order, convert to float, multiply, store. */
444 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi32(a, b)), divby32768));
445 _mm_store_ps(dst+4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi32(a, b)), divby32768));
446 i -= 8; src -= 8; dst -= 8;
447 }
448 }
449
450 src += 7; dst += 7; /* adjust for any scalar finishing. */
451
452 /* Finish off any leftovers with scalar operations. */
453 while (i) {
454 *dst = ((float) *src) * DIVBY32768;
455 i--; src--; dst--;
456 }
457
458 cvt->len_cvt *= 2;
459 if (cvt->filters[++cvt->filter_index]) {
460 cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
461 }
462 }
463
464 static void SDLCALL
SDL_Convert_U16_to_F32_SSE2(SDL_AudioCVT * cvt,SDL_AudioFormat format)465 SDL_Convert_U16_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
466 {
467 const Uint16 *src = ((const Uint16 *) (cvt->buf + cvt->len_cvt)) - 1;
468 float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
469 int i;
470
471 LOG_DEBUG_CONVERT("AUDIO_U16", "AUDIO_F32 (using SSE2)");
472
473 /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
474 for (i = cvt->len_cvt / sizeof (Sint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
475 *dst = (((float) *src) * DIVBY32768) - 1.0f;
476 }
477
478 src -= 7; dst -= 7; /* adjust to read SSE blocks from the start. */
479 SDL_assert(!i || ((((size_t) dst) & 15) == 0));
480
481 /* Make sure src is aligned too. */
482 if ((((size_t) src) & 15) == 0) {
483 /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
484 const __m128 divby32768 = _mm_set1_ps(DIVBY32768);
485 const __m128 minus1 = _mm_set1_ps(-1.0f);
486 while (i >= 8) { /* 8 * 16-bit */
487 const __m128i ints = _mm_load_si128((__m128i const *) src); /* get 8 sint16 into an XMM register. */
488 /* treat as int32, shift left to clear every other sint16, then back right with zero-extend. Now sint32. */
489 const __m128i a = _mm_srli_epi32(_mm_slli_epi32(ints, 16), 16);
490 /* right-shift-sign-extend gets us sint32 with the other set of values. */
491 const __m128i b = _mm_srli_epi32(ints, 16);
492 /* Interleave these back into the right order, convert to float, multiply, store. */
493 _mm_store_ps(dst, _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi32(a, b)), divby32768), minus1));
494 _mm_store_ps(dst+4, _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi32(a, b)), divby32768), minus1));
495 i -= 8; src -= 8; dst -= 8;
496 }
497 }
498
499 src += 7; dst += 7; /* adjust for any scalar finishing. */
500
501 /* Finish off any leftovers with scalar operations. */
502 while (i) {
503 *dst = (((float) *src) * DIVBY32768) - 1.0f;
504 i--; src--; dst--;
505 }
506
507 cvt->len_cvt *= 2;
508 if (cvt->filters[++cvt->filter_index]) {
509 cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
510 }
511 }
512
513 static void SDLCALL
SDL_Convert_S32_to_F32_SSE2(SDL_AudioCVT * cvt,SDL_AudioFormat format)514 SDL_Convert_S32_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
515 {
516 const Sint32 *src = (const Sint32 *) cvt->buf;
517 float *dst = (float *) cvt->buf;
518 int i;
519
520 LOG_DEBUG_CONVERT("AUDIO_S32", "AUDIO_F32 (using SSE2)");
521
522 /* Get dst aligned to 16 bytes */
523 for (i = cvt->len_cvt / sizeof (Sint32); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
524 *dst = ((float) (*src>>8)) * DIVBY8388607;
525 }
526
527 SDL_assert(!i || ((((size_t) dst) & 15) == 0));
528
529 /* Make sure src is aligned too. */
530 if ((((size_t) src) & 15) == 0) {
531 /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
532 const __m128 divby8388607 = _mm_set1_ps(DIVBY8388607);
533 const __m128i *mmsrc = (const __m128i *) src;
534 while (i >= 4) { /* 4 * sint32 */
535 /* shift out lowest bits so int fits in a float32. Small precision loss, but much faster. */
536 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_load_si128(mmsrc), 8)), divby8388607));
537 i -= 4; mmsrc++; dst += 4;
538 }
539 src = (const Sint32 *) mmsrc;
540 }
541
542 /* Finish off any leftovers with scalar operations. */
543 while (i) {
544 *dst = ((float) (*src>>8)) * DIVBY8388607;
545 i--; src++; dst++;
546 }
547
548 if (cvt->filters[++cvt->filter_index]) {
549 cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
550 }
551 }
552
553 static void SDLCALL
SDL_Convert_F32_to_S8_SSE2(SDL_AudioCVT * cvt,SDL_AudioFormat format)554 SDL_Convert_F32_to_S8_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
555 {
556 const float *src = (const float *) cvt->buf;
557 Sint8 *dst = (Sint8 *) cvt->buf;
558 int i;
559
560 LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S8 (using SSE2)");
561
562 /* Get dst aligned to 16 bytes */
563 for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
564 const float sample = *src;
565 if (sample >= 1.0f) {
566 *dst = 127;
567 } else if (sample <= -1.0f) {
568 *dst = -128;
569 } else {
570 *dst = (Sint8)(sample * 127.0f);
571 }
572 }
573
574 SDL_assert(!i || ((((size_t) dst) & 15) == 0));
575
576 /* Make sure src is aligned too. */
577 if ((((size_t) src) & 15) == 0) {
578 /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
579 const __m128 one = _mm_set1_ps(1.0f);
580 const __m128 negone = _mm_set1_ps(-1.0f);
581 const __m128 mulby127 = _mm_set1_ps(127.0f);
582 __m128i *mmdst = (__m128i *) dst;
583 while (i >= 16) { /* 16 * float32 */
584 const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), mulby127)); /* load 4 floats, clamp, convert to sint32 */
585 const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+4)), one), mulby127)); /* load 4 floats, clamp, convert to sint32 */
586 const __m128i ints3 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+8)), one), mulby127)); /* load 4 floats, clamp, convert to sint32 */
587 const __m128i ints4 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+12)), one), mulby127)); /* load 4 floats, clamp, convert to sint32 */
588 _mm_store_si128(mmdst, _mm_packs_epi16(_mm_packs_epi32(ints1, ints2), _mm_packs_epi32(ints3, ints4))); /* pack down, store out. */
589 i -= 16; src += 16; mmdst++;
590 }
591 dst = (Sint8 *) mmdst;
592 }
593
594 /* Finish off any leftovers with scalar operations. */
595 while (i) {
596 const float sample = *src;
597 if (sample >= 1.0f) {
598 *dst = 127;
599 } else if (sample <= -1.0f) {
600 *dst = -128;
601 } else {
602 *dst = (Sint8)(sample * 127.0f);
603 }
604 i--; src++; dst++;
605 }
606
607 cvt->len_cvt /= 4;
608 if (cvt->filters[++cvt->filter_index]) {
609 cvt->filters[cvt->filter_index](cvt, AUDIO_S8);
610 }
611 }
612
613 static void SDLCALL
SDL_Convert_F32_to_U8_SSE2(SDL_AudioCVT * cvt,SDL_AudioFormat format)614 SDL_Convert_F32_to_U8_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
615 {
616 const float *src = (const float *) cvt->buf;
617 Uint8 *dst = cvt->buf;
618 int i;
619
620 LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U8 (using SSE2)");
621
622 /* Get dst aligned to 16 bytes */
623 for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
624 const float sample = *src;
625 if (sample >= 1.0f) {
626 *dst = 255;
627 } else if (sample <= -1.0f) {
628 *dst = 0;
629 } else {
630 *dst = (Uint8)((sample + 1.0f) * 127.0f);
631 }
632 }
633
634 SDL_assert(!i || ((((size_t) dst) & 15) == 0));
635
636 /* Make sure src is aligned too. */
637 if ((((size_t) src) & 15) == 0) {
638 /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
639 const __m128 one = _mm_set1_ps(1.0f);
640 const __m128 negone = _mm_set1_ps(-1.0f);
641 const __m128 mulby127 = _mm_set1_ps(127.0f);
642 __m128i *mmdst = (__m128i *) dst;
643 while (i >= 16) { /* 16 * float32 */
644 const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), one), mulby127)); /* load 4 floats, clamp, convert to sint32 */
645 const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+4)), one), one), mulby127)); /* load 4 floats, clamp, convert to sint32 */
646 const __m128i ints3 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+8)), one), one), mulby127)); /* load 4 floats, clamp, convert to sint32 */
647 const __m128i ints4 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+12)), one), one), mulby127)); /* load 4 floats, clamp, convert to sint32 */
648 _mm_store_si128(mmdst, _mm_packus_epi16(_mm_packs_epi32(ints1, ints2), _mm_packs_epi32(ints3, ints4))); /* pack down, store out. */
649 i -= 16; src += 16; mmdst++;
650 }
651 dst = (Uint8 *) mmdst;
652 }
653
654 /* Finish off any leftovers with scalar operations. */
655 while (i) {
656 const float sample = *src;
657 if (sample >= 1.0f) {
658 *dst = 255;
659 } else if (sample <= -1.0f) {
660 *dst = 0;
661 } else {
662 *dst = (Uint8)((sample + 1.0f) * 127.0f);
663 }
664 i--; src++; dst++;
665 }
666
667 cvt->len_cvt /= 4;
668 if (cvt->filters[++cvt->filter_index]) {
669 cvt->filters[cvt->filter_index](cvt, AUDIO_U8);
670 }
671 }
672
673 static void SDLCALL
SDL_Convert_F32_to_S16_SSE2(SDL_AudioCVT * cvt,SDL_AudioFormat format)674 SDL_Convert_F32_to_S16_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
675 {
676 const float *src = (const float *) cvt->buf;
677 Sint16 *dst = (Sint16 *) cvt->buf;
678 int i;
679
680 LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S16 (using SSE2)");
681
682 /* Get dst aligned to 16 bytes */
683 for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
684 const float sample = *src;
685 if (sample >= 1.0f) {
686 *dst = 32767;
687 } else if (sample <= -1.0f) {
688 *dst = -32768;
689 } else {
690 *dst = (Sint16)(sample * 32767.0f);
691 }
692 }
693
694 SDL_assert(!i || ((((size_t) dst) & 15) == 0));
695
696 /* Make sure src is aligned too. */
697 if ((((size_t) src) & 15) == 0) {
698 /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
699 const __m128 one = _mm_set1_ps(1.0f);
700 const __m128 negone = _mm_set1_ps(-1.0f);
701 const __m128 mulby32767 = _mm_set1_ps(32767.0f);
702 __m128i *mmdst = (__m128i *) dst;
703 while (i >= 8) { /* 8 * float32 */
704 const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), mulby32767)); /* load 4 floats, clamp, convert to sint32 */
705 const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+4)), one), mulby32767)); /* load 4 floats, clamp, convert to sint32 */
706 _mm_store_si128(mmdst, _mm_packs_epi32(ints1, ints2)); /* pack to sint16, store out. */
707 i -= 8; src += 8; mmdst++;
708 }
709 dst = (Sint16 *) mmdst;
710 }
711
712 /* Finish off any leftovers with scalar operations. */
713 while (i) {
714 const float sample = *src;
715 if (sample >= 1.0f) {
716 *dst = 32767;
717 } else if (sample <= -1.0f) {
718 *dst = -32768;
719 } else {
720 *dst = (Sint16)(sample * 32767.0f);
721 }
722 i--; src++; dst++;
723 }
724
725 cvt->len_cvt /= 2;
726 if (cvt->filters[++cvt->filter_index]) {
727 cvt->filters[cvt->filter_index](cvt, AUDIO_S16SYS);
728 }
729 }
730
731 static void SDLCALL
SDL_Convert_F32_to_U16_SSE2(SDL_AudioCVT * cvt,SDL_AudioFormat format)732 SDL_Convert_F32_to_U16_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
733 {
734 const float *src = (const float *) cvt->buf;
735 Uint16 *dst = (Uint16 *) cvt->buf;
736 int i;
737
738 LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U16 (using SSE2)");
739
740 /* Get dst aligned to 16 bytes */
741 for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
742 const float sample = *src;
743 if (sample >= 1.0f) {
744 *dst = 65535;
745 } else if (sample <= -1.0f) {
746 *dst = 0;
747 } else {
748 *dst = (Uint16)((sample + 1.0f) * 32767.0f);
749 }
750 }
751
752 SDL_assert(!i || ((((size_t) dst) & 15) == 0));
753
754 /* Make sure src is aligned too. */
755 if ((((size_t) src) & 15) == 0) {
756 /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
757 /* This calculates differently than the scalar path because SSE2 can't
758 pack int32 data down to unsigned int16. _mm_packs_epi32 does signed
759 saturation, so that would corrupt our data. _mm_packus_epi32 exists,
760 but not before SSE 4.1. So we convert from float to sint16, packing
761 that down with legit signed saturation, and then xor the top bit
762 against 1. This results in the correct unsigned 16-bit value, even
763 though it looks like dark magic. */
764 const __m128 mulby32767 = _mm_set1_ps(32767.0f);
765 const __m128i topbit = _mm_set1_epi16(-32768);
766 const __m128 one = _mm_set1_ps(1.0f);
767 const __m128 negone = _mm_set1_ps(-1.0f);
768 __m128i *mmdst = (__m128i *) dst;
769 while (i >= 8) { /* 8 * float32 */
770 const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), mulby32767)); /* load 4 floats, clamp, convert to sint32 */
771 const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+4)), one), mulby32767)); /* load 4 floats, clamp, convert to sint32 */
772 _mm_store_si128(mmdst, _mm_xor_si128(_mm_packs_epi32(ints1, ints2), topbit)); /* pack to sint16, xor top bit, store out. */
773 i -= 8; src += 8; mmdst++;
774 }
775 dst = (Uint16 *) mmdst;
776 }
777
778 /* Finish off any leftovers with scalar operations. */
779 while (i) {
780 const float sample = *src;
781 if (sample >= 1.0f) {
782 *dst = 65535;
783 } else if (sample <= -1.0f) {
784 *dst = 0;
785 } else {
786 *dst = (Uint16)((sample + 1.0f) * 32767.0f);
787 }
788 i--; src++; dst++;
789 }
790
791 cvt->len_cvt /= 2;
792 if (cvt->filters[++cvt->filter_index]) {
793 cvt->filters[cvt->filter_index](cvt, AUDIO_U16SYS);
794 }
795 }
796
797 static void SDLCALL
SDL_Convert_F32_to_S32_SSE2(SDL_AudioCVT * cvt,SDL_AudioFormat format)798 SDL_Convert_F32_to_S32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
799 {
800 const float *src = (const float *) cvt->buf;
801 Sint32 *dst = (Sint32 *) cvt->buf;
802 int i;
803
804 LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S32 (using SSE2)");
805
806 /* Get dst aligned to 16 bytes */
807 for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
808 const float sample = *src;
809 if (sample >= 1.0f) {
810 *dst = 2147483647;
811 } else if (sample <= -1.0f) {
812 *dst = (Sint32) -2147483648LL;
813 } else {
814 *dst = ((Sint32)(sample * 8388607.0f)) << 8;
815 }
816 }
817
818 SDL_assert(!i || ((((size_t) dst) & 15) == 0));
819 SDL_assert(!i || ((((size_t) src) & 15) == 0));
820
821 {
822 /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
823 const __m128 one = _mm_set1_ps(1.0f);
824 const __m128 negone = _mm_set1_ps(-1.0f);
825 const __m128 mulby8388607 = _mm_set1_ps(8388607.0f);
826 __m128i *mmdst = (__m128i *) dst;
827 while (i >= 4) { /* 4 * float32 */
828 _mm_store_si128(mmdst, _mm_slli_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), mulby8388607)), 8)); /* load 4 floats, clamp, convert to sint32 */
829 i -= 4; src += 4; mmdst++;
830 }
831 dst = (Sint32 *) mmdst;
832 }
833
834 /* Finish off any leftovers with scalar operations. */
835 while (i) {
836 const float sample = *src;
837 if (sample >= 1.0f) {
838 *dst = 2147483647;
839 } else if (sample <= -1.0f) {
840 *dst = (Sint32) -2147483648LL;
841 } else {
842 *dst = ((Sint32)(sample * 8388607.0f)) << 8;
843 }
844 i--; src++; dst++;
845 }
846
847 if (cvt->filters[++cvt->filter_index]) {
848 cvt->filters[cvt->filter_index](cvt, AUDIO_S32SYS);
849 }
850 }
851 #endif
852
853
854 #if HAVE_NEON_INTRINSICS
855 static void SDLCALL
SDL_Convert_S8_to_F32_NEON(SDL_AudioCVT * cvt,SDL_AudioFormat format)856 SDL_Convert_S8_to_F32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
857 {
858 const Sint8 *src = ((const Sint8 *) (cvt->buf + cvt->len_cvt)) - 1;
859 float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
860 int i;
861
862 LOG_DEBUG_CONVERT("AUDIO_S8", "AUDIO_F32 (using NEON)");
863
864 /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
865 for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
866 *dst = ((float) *src) * DIVBY128;
867 }
868
869 src -= 15; dst -= 15; /* adjust to read NEON blocks from the start. */
870 SDL_assert(!i || ((((size_t) dst) & 15) == 0));
871
872 /* Make sure src is aligned too. */
873 if ((((size_t) src) & 15) == 0) {
874 /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
875 const int8_t *mmsrc = (const int8_t *) src;
876 const float32x4_t divby128 = vdupq_n_f32(DIVBY128);
877 while (i >= 16) { /* 16 * 8-bit */
878 const int8x16_t bytes = vld1q_s8(mmsrc); /* get 16 sint8 into a NEON register. */
879 const int16x8_t int16hi = vmovl_s8(vget_high_s8(bytes)); /* convert top 8 bytes to 8 int16 */
880 const int16x8_t int16lo = vmovl_s8(vget_low_s8(bytes)); /* convert bottom 8 bytes to 8 int16 */
881 /* split int16 to two int32, then convert to float, then multiply to normalize, store. */
882 vst1q_f32(dst, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(int16lo))), divby128));
883 vst1q_f32(dst+4, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(int16lo))), divby128));
884 vst1q_f32(dst+8, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(int16hi))), divby128));
885 vst1q_f32(dst+12, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(int16hi))), divby128));
886 i -= 16; mmsrc -= 16; dst -= 16;
887 }
888
889 src = (const Sint8 *) mmsrc;
890 }
891
892 src += 15; dst += 15; /* adjust for any scalar finishing. */
893
894 /* Finish off any leftovers with scalar operations. */
895 while (i) {
896 *dst = ((float) *src) * DIVBY128;
897 i--; src--; dst--;
898 }
899
900 cvt->len_cvt *= 4;
901 if (cvt->filters[++cvt->filter_index]) {
902 cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
903 }
904 }
905
906 static void SDLCALL
SDL_Convert_U8_to_F32_NEON(SDL_AudioCVT * cvt,SDL_AudioFormat format)907 SDL_Convert_U8_to_F32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
908 {
909 const Uint8 *src = ((const Uint8 *) (cvt->buf + cvt->len_cvt)) - 1;
910 float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
911 int i;
912
913 LOG_DEBUG_CONVERT("AUDIO_U8", "AUDIO_F32 (using NEON)");
914
915 /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
916 for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
917 *dst = (((float) *src) * DIVBY128) - 1.0f;
918 }
919
920 src -= 15; dst -= 15; /* adjust to read NEON blocks from the start. */
921 SDL_assert(!i || ((((size_t) dst) & 15) == 0));
922
923 /* Make sure src is aligned too. */
924 if ((((size_t) src) & 15) == 0) {
925 /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
926 const uint8_t *mmsrc = (const uint8_t *) src;
927 const float32x4_t divby128 = vdupq_n_f32(DIVBY128);
928 const float32x4_t negone = vdupq_n_f32(-1.0f);
929 while (i >= 16) { /* 16 * 8-bit */
930 const uint8x16_t bytes = vld1q_u8(mmsrc); /* get 16 uint8 into a NEON register. */
931 const uint16x8_t uint16hi = vmovl_u8(vget_high_u8(bytes)); /* convert top 8 bytes to 8 uint16 */
932 const uint16x8_t uint16lo = vmovl_u8(vget_low_u8(bytes)); /* convert bottom 8 bytes to 8 uint16 */
933 /* split uint16 to two uint32, then convert to float, then multiply to normalize, subtract to adjust for sign, store. */
934 vst1q_f32(dst, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_low_u16(uint16lo))), divby128));
935 vst1q_f32(dst+4, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_high_u16(uint16lo))), divby128));
936 vst1q_f32(dst+8, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_low_u16(uint16hi))), divby128));
937 vst1q_f32(dst+12, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_high_u16(uint16hi))), divby128));
938 i -= 16; mmsrc -= 16; dst -= 16;
939 }
940
941 src = (const Uint8 *) mmsrc;
942 }
943
944 src += 15; dst += 15; /* adjust for any scalar finishing. */
945
946 /* Finish off any leftovers with scalar operations. */
947 while (i) {
948 *dst = (((float) *src) * DIVBY128) - 1.0f;
949 i--; src--; dst--;
950 }
951
952 cvt->len_cvt *= 4;
953 if (cvt->filters[++cvt->filter_index]) {
954 cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
955 }
956 }
957
958 static void SDLCALL
SDL_Convert_S16_to_F32_NEON(SDL_AudioCVT * cvt,SDL_AudioFormat format)959 SDL_Convert_S16_to_F32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
960 {
961 const Sint16 *src = ((const Sint16 *) (cvt->buf + cvt->len_cvt)) - 1;
962 float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
963 int i;
964
965 LOG_DEBUG_CONVERT("AUDIO_S16", "AUDIO_F32 (using NEON)");
966
967 /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
968 for (i = cvt->len_cvt / sizeof (Sint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
969 *dst = ((float) *src) * DIVBY32768;
970 }
971
972 src -= 7; dst -= 7; /* adjust to read NEON blocks from the start. */
973 SDL_assert(!i || ((((size_t) dst) & 15) == 0));
974
975 /* Make sure src is aligned too. */
976 if ((((size_t) src) & 15) == 0) {
977 /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
978 const float32x4_t divby32768 = vdupq_n_f32(DIVBY32768);
979 while (i >= 8) { /* 8 * 16-bit */
980 const int16x8_t ints = vld1q_s16((int16_t const *) src); /* get 8 sint16 into a NEON register. */
981 /* split int16 to two int32, then convert to float, then multiply to normalize, store. */
982 vst1q_f32(dst, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(ints))), divby32768));
983 vst1q_f32(dst+4, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(ints))), divby32768));
984 i -= 8; src -= 8; dst -= 8;
985 }
986 }
987
988 src += 7; dst += 7; /* adjust for any scalar finishing. */
989
990 /* Finish off any leftovers with scalar operations. */
991 while (i) {
992 *dst = ((float) *src) * DIVBY32768;
993 i--; src--; dst--;
994 }
995
996 cvt->len_cvt *= 2;
997 if (cvt->filters[++cvt->filter_index]) {
998 cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
999 }
1000 }
1001
1002 static void SDLCALL
SDL_Convert_U16_to_F32_NEON(SDL_AudioCVT * cvt,SDL_AudioFormat format)1003 SDL_Convert_U16_to_F32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
1004 {
1005 const Uint16 *src = ((const Uint16 *) (cvt->buf + cvt->len_cvt)) - 1;
1006 float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
1007 int i;
1008
1009 LOG_DEBUG_CONVERT("AUDIO_U16", "AUDIO_F32 (using NEON)");
1010
1011 /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
1012 for (i = cvt->len_cvt / sizeof (Sint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
1013 *dst = (((float) *src) * DIVBY32768) - 1.0f;
1014 }
1015
1016 src -= 7; dst -= 7; /* adjust to read NEON blocks from the start. */
1017 SDL_assert(!i || ((((size_t) dst) & 15) == 0));
1018
1019 /* Make sure src is aligned too. */
1020 if ((((size_t) src) & 15) == 0) {
1021 /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
1022 const float32x4_t divby32768 = vdupq_n_f32(DIVBY32768);
1023 const float32x4_t negone = vdupq_n_f32(-1.0f);
1024 while (i >= 8) { /* 8 * 16-bit */
1025 const uint16x8_t uints = vld1q_u16((uint16_t const *) src); /* get 8 uint16 into a NEON register. */
1026 /* split uint16 to two int32, then convert to float, then multiply to normalize, subtract for sign, store. */
1027 vst1q_f32(dst, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_low_u16(uints))), divby32768));
1028 vst1q_f32(dst+4, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_high_u16(uints))), divby32768));
1029 i -= 8; src -= 8; dst -= 8;
1030 }
1031 }
1032
1033 src += 7; dst += 7; /* adjust for any scalar finishing. */
1034
1035 /* Finish off any leftovers with scalar operations. */
1036 while (i) {
1037 *dst = (((float) *src) * DIVBY32768) - 1.0f;
1038 i--; src--; dst--;
1039 }
1040
1041 cvt->len_cvt *= 2;
1042 if (cvt->filters[++cvt->filter_index]) {
1043 cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
1044 }
1045 }
1046
1047 static void SDLCALL
SDL_Convert_S32_to_F32_NEON(SDL_AudioCVT * cvt,SDL_AudioFormat format)1048 SDL_Convert_S32_to_F32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
1049 {
1050 const Sint32 *src = (const Sint32 *) cvt->buf;
1051 float *dst = (float *) cvt->buf;
1052 int i;
1053
1054 LOG_DEBUG_CONVERT("AUDIO_S32", "AUDIO_F32 (using NEON)");
1055
1056 /* Get dst aligned to 16 bytes */
1057 for (i = cvt->len_cvt / sizeof (Sint32); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
1058 *dst = ((float) (*src>>8)) * DIVBY8388607;
1059 }
1060
1061 SDL_assert(!i || ((((size_t) dst) & 15) == 0));
1062
1063 /* Make sure src is aligned too. */
1064 if ((((size_t) src) & 15) == 0) {
1065 /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
1066 const float32x4_t divby8388607 = vdupq_n_f32(DIVBY8388607);
1067 const int32_t *mmsrc = (const int32_t *) src;
1068 while (i >= 4) { /* 4 * sint32 */
1069 /* shift out lowest bits so int fits in a float32. Small precision loss, but much faster. */
1070 vst1q_f32(dst, vmulq_f32(vcvtq_f32_s32(vshrq_n_s32(vld1q_s32(mmsrc), 8)), divby8388607));
1071 i -= 4; mmsrc += 4; dst += 4;
1072 }
1073 src = (const Sint32 *) mmsrc;
1074 }
1075
1076 /* Finish off any leftovers with scalar operations. */
1077 while (i) {
1078 *dst = ((float) (*src>>8)) * DIVBY8388607;
1079 i--; src++; dst++;
1080 }
1081
1082 if (cvt->filters[++cvt->filter_index]) {
1083 cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
1084 }
1085 }
1086
1087 static void SDLCALL
SDL_Convert_F32_to_S8_NEON(SDL_AudioCVT * cvt,SDL_AudioFormat format)1088 SDL_Convert_F32_to_S8_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
1089 {
1090 const float *src = (const float *) cvt->buf;
1091 Sint8 *dst = (Sint8 *) cvt->buf;
1092 int i;
1093
1094 LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S8 (using NEON)");
1095
1096 /* Get dst aligned to 16 bytes */
1097 for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
1098 const float sample = *src;
1099 if (sample >= 1.0f) {
1100 *dst = 127;
1101 } else if (sample <= -1.0f) {
1102 *dst = -128;
1103 } else {
1104 *dst = (Sint8)(sample * 127.0f);
1105 }
1106 }
1107
1108 SDL_assert(!i || ((((size_t) dst) & 15) == 0));
1109
1110 /* Make sure src is aligned too. */
1111 if ((((size_t) src) & 15) == 0) {
1112 /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
1113 const float32x4_t one = vdupq_n_f32(1.0f);
1114 const float32x4_t negone = vdupq_n_f32(-1.0f);
1115 const float32x4_t mulby127 = vdupq_n_f32(127.0f);
1116 int8_t *mmdst = (int8_t *) dst;
1117 while (i >= 16) { /* 16 * float32 */
1118 const int32x4_t ints1 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src)), one), mulby127)); /* load 4 floats, clamp, convert to sint32 */
1119 const int32x4_t ints2 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+4)), one), mulby127)); /* load 4 floats, clamp, convert to sint32 */
1120 const int32x4_t ints3 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+8)), one), mulby127)); /* load 4 floats, clamp, convert to sint32 */
1121 const int32x4_t ints4 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+12)), one), mulby127)); /* load 4 floats, clamp, convert to sint32 */
1122 const int8x8_t i8lo = vmovn_s16(vcombine_s16(vmovn_s32(ints1), vmovn_s32(ints2))); /* narrow to sint16, combine, narrow to sint8 */
1123 const int8x8_t i8hi = vmovn_s16(vcombine_s16(vmovn_s32(ints3), vmovn_s32(ints4))); /* narrow to sint16, combine, narrow to sint8 */
1124 vst1q_s8(mmdst, vcombine_s8(i8lo, i8hi)); /* combine to int8x16_t, store out */
1125 i -= 16; src += 16; mmdst += 16;
1126 }
1127 dst = (Sint8 *) mmdst;
1128 }
1129
1130 /* Finish off any leftovers with scalar operations. */
1131 while (i) {
1132 const float sample = *src;
1133 if (sample >= 1.0f) {
1134 *dst = 127;
1135 } else if (sample <= -1.0f) {
1136 *dst = -128;
1137 } else {
1138 *dst = (Sint8)(sample * 127.0f);
1139 }
1140 i--; src++; dst++;
1141 }
1142
1143 cvt->len_cvt /= 4;
1144 if (cvt->filters[++cvt->filter_index]) {
1145 cvt->filters[cvt->filter_index](cvt, AUDIO_S8);
1146 }
1147 }
1148
1149 static void SDLCALL
SDL_Convert_F32_to_U8_NEON(SDL_AudioCVT * cvt,SDL_AudioFormat format)1150 SDL_Convert_F32_to_U8_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
1151 {
1152 const float *src = (const float *) cvt->buf;
1153 Uint8 *dst = (Uint8 *) cvt->buf;
1154 int i;
1155
1156 LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U8 (using NEON)");
1157
1158 /* Get dst aligned to 16 bytes */
1159 for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
1160 const float sample = *src;
1161 if (sample >= 1.0f) {
1162 *dst = 255;
1163 } else if (sample <= -1.0f) {
1164 *dst = 0;
1165 } else {
1166 *dst = (Uint8)((sample + 1.0f) * 127.0f);
1167 }
1168 }
1169
1170 SDL_assert(!i || ((((size_t) dst) & 15) == 0));
1171
1172 /* Make sure src is aligned too. */
1173 if ((((size_t) src) & 15) == 0) {
1174 /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
1175 const float32x4_t one = vdupq_n_f32(1.0f);
1176 const float32x4_t negone = vdupq_n_f32(-1.0f);
1177 const float32x4_t mulby127 = vdupq_n_f32(127.0f);
1178 uint8_t *mmdst = (uint8_t *) dst;
1179 while (i >= 16) { /* 16 * float32 */
1180 const uint32x4_t uints1 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src)), one), one), mulby127)); /* load 4 floats, clamp, convert to uint32 */
1181 const uint32x4_t uints2 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+4)), one), one), mulby127)); /* load 4 floats, clamp, convert to uint32 */
1182 const uint32x4_t uints3 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+8)), one), one), mulby127)); /* load 4 floats, clamp, convert to uint32 */
1183 const uint32x4_t uints4 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+12)), one), one), mulby127)); /* load 4 floats, clamp, convert to uint32 */
1184 const uint8x8_t ui8lo = vmovn_u16(vcombine_u16(vmovn_u32(uints1), vmovn_u32(uints2))); /* narrow to uint16, combine, narrow to uint8 */
1185 const uint8x8_t ui8hi = vmovn_u16(vcombine_u16(vmovn_u32(uints3), vmovn_u32(uints4))); /* narrow to uint16, combine, narrow to uint8 */
1186 vst1q_u8(mmdst, vcombine_u8(ui8lo, ui8hi)); /* combine to uint8x16_t, store out */
1187 i -= 16; src += 16; mmdst += 16;
1188 }
1189
1190 dst = (Uint8 *) mmdst;
1191 }
1192
1193 /* Finish off any leftovers with scalar operations. */
1194 while (i) {
1195 const float sample = *src;
1196 if (sample >= 1.0f) {
1197 *dst = 255;
1198 } else if (sample <= -1.0f) {
1199 *dst = 0;
1200 } else {
1201 *dst = (Uint8)((sample + 1.0f) * 127.0f);
1202 }
1203 i--; src++; dst++;
1204 }
1205
1206 cvt->len_cvt /= 4;
1207 if (cvt->filters[++cvt->filter_index]) {
1208 cvt->filters[cvt->filter_index](cvt, AUDIO_U8);
1209 }
1210 }
1211
1212 static void SDLCALL
SDL_Convert_F32_to_S16_NEON(SDL_AudioCVT * cvt,SDL_AudioFormat format)1213 SDL_Convert_F32_to_S16_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
1214 {
1215 const float *src = (const float *) cvt->buf;
1216 Sint16 *dst = (Sint16 *) cvt->buf;
1217 int i;
1218
1219 LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S16 (using NEON)");
1220
1221 /* Get dst aligned to 16 bytes */
1222 for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
1223 const float sample = *src;
1224 if (sample >= 1.0f) {
1225 *dst = 32767;
1226 } else if (sample <= -1.0f) {
1227 *dst = -32768;
1228 } else {
1229 *dst = (Sint16)(sample * 32767.0f);
1230 }
1231 }
1232
1233 SDL_assert(!i || ((((size_t) dst) & 15) == 0));
1234
1235 /* Make sure src is aligned too. */
1236 if ((((size_t) src) & 15) == 0) {
1237 /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
1238 const float32x4_t one = vdupq_n_f32(1.0f);
1239 const float32x4_t negone = vdupq_n_f32(-1.0f);
1240 const float32x4_t mulby32767 = vdupq_n_f32(32767.0f);
1241 int16_t *mmdst = (int16_t *) dst;
1242 while (i >= 8) { /* 8 * float32 */
1243 const int32x4_t ints1 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src)), one), mulby32767)); /* load 4 floats, clamp, convert to sint32 */
1244 const int32x4_t ints2 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+4)), one), mulby32767)); /* load 4 floats, clamp, convert to sint32 */
1245 vst1q_s16(mmdst, vcombine_s16(vmovn_s32(ints1), vmovn_s32(ints2))); /* narrow to sint16, combine, store out. */
1246 i -= 8; src += 8; mmdst += 8;
1247 }
1248 dst = (Sint16 *) mmdst;
1249 }
1250
1251 /* Finish off any leftovers with scalar operations. */
1252 while (i) {
1253 const float sample = *src;
1254 if (sample >= 1.0f) {
1255 *dst = 32767;
1256 } else if (sample <= -1.0f) {
1257 *dst = -32768;
1258 } else {
1259 *dst = (Sint16)(sample * 32767.0f);
1260 }
1261 i--; src++; dst++;
1262 }
1263
1264 cvt->len_cvt /= 2;
1265 if (cvt->filters[++cvt->filter_index]) {
1266 cvt->filters[cvt->filter_index](cvt, AUDIO_S16SYS);
1267 }
1268 }
1269
1270 static void SDLCALL
SDL_Convert_F32_to_U16_NEON(SDL_AudioCVT * cvt,SDL_AudioFormat format)1271 SDL_Convert_F32_to_U16_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
1272 {
1273 const float *src = (const float *) cvt->buf;
1274 Uint16 *dst = (Uint16 *) cvt->buf;
1275 int i;
1276
1277 LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U16 (using NEON)");
1278
1279 /* Get dst aligned to 16 bytes */
1280 for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
1281 const float sample = *src;
1282 if (sample >= 1.0f) {
1283 *dst = 65535;
1284 } else if (sample <= -1.0f) {
1285 *dst = 0;
1286 } else {
1287 *dst = (Uint16)((sample + 1.0f) * 32767.0f);
1288 }
1289 }
1290
1291 SDL_assert(!i || ((((size_t) dst) & 15) == 0));
1292
1293 /* Make sure src is aligned too. */
1294 if ((((size_t) src) & 15) == 0) {
1295 /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
1296 const float32x4_t one = vdupq_n_f32(1.0f);
1297 const float32x4_t negone = vdupq_n_f32(-1.0f);
1298 const float32x4_t mulby32767 = vdupq_n_f32(32767.0f);
1299 uint16_t *mmdst = (uint16_t *) dst;
1300 while (i >= 8) { /* 8 * float32 */
1301 const uint32x4_t uints1 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src)), one), one), mulby32767)); /* load 4 floats, clamp, convert to uint32 */
1302 const uint32x4_t uints2 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+4)), one), one), mulby32767)); /* load 4 floats, clamp, convert to uint32 */
1303 vst1q_u16(mmdst, vcombine_u16(vmovn_u32(uints1), vmovn_u32(uints2))); /* narrow to uint16, combine, store out. */
1304 i -= 8; src += 8; mmdst += 8;
1305 }
1306 dst = (Uint16 *) mmdst;
1307 }
1308
1309 /* Finish off any leftovers with scalar operations. */
1310 while (i) {
1311 const float sample = *src;
1312 if (sample >= 1.0f) {
1313 *dst = 65535;
1314 } else if (sample <= -1.0f) {
1315 *dst = 0;
1316 } else {
1317 *dst = (Uint16)((sample + 1.0f) * 32767.0f);
1318 }
1319 i--; src++; dst++;
1320 }
1321
1322 cvt->len_cvt /= 2;
1323 if (cvt->filters[++cvt->filter_index]) {
1324 cvt->filters[cvt->filter_index](cvt, AUDIO_U16SYS);
1325 }
1326 }
1327
1328 static void SDLCALL
SDL_Convert_F32_to_S32_NEON(SDL_AudioCVT * cvt,SDL_AudioFormat format)1329 SDL_Convert_F32_to_S32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
1330 {
1331 const float *src = (const float *) cvt->buf;
1332 Sint32 *dst = (Sint32 *) cvt->buf;
1333 int i;
1334
1335 LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S32 (using NEON)");
1336
1337 /* Get dst aligned to 16 bytes */
1338 for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
1339 const float sample = *src;
1340 if (sample >= 1.0f) {
1341 *dst = 2147483647;
1342 } else if (sample <= -1.0f) {
1343 *dst = (-2147483647) - 1;
1344 } else {
1345 *dst = ((Sint32)(sample * 8388607.0f)) << 8;
1346 }
1347 }
1348
1349 SDL_assert(!i || ((((size_t) dst) & 15) == 0));
1350 SDL_assert(!i || ((((size_t) src) & 15) == 0));
1351
1352 {
1353 /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
1354 const float32x4_t one = vdupq_n_f32(1.0f);
1355 const float32x4_t negone = vdupq_n_f32(-1.0f);
1356 const float32x4_t mulby8388607 = vdupq_n_f32(8388607.0f);
1357 int32_t *mmdst = (int32_t *) dst;
1358 while (i >= 4) { /* 4 * float32 */
1359 vst1q_s32(mmdst, vshlq_n_s32(vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src)), one), mulby8388607)), 8));
1360 i -= 4; src += 4; mmdst += 4;
1361 }
1362 dst = (Sint32 *) mmdst;
1363 }
1364
1365 /* Finish off any leftovers with scalar operations. */
1366 while (i) {
1367 const float sample = *src;
1368 if (sample >= 1.0f) {
1369 *dst = 2147483647;
1370 } else if (sample <= -1.0f) {
1371 *dst = (-2147483647) - 1;
1372 } else {
1373 *dst = ((Sint32)(sample * 8388607.0f)) << 8;
1374 }
1375 i--; src++; dst++;
1376 }
1377
1378 if (cvt->filters[++cvt->filter_index]) {
1379 cvt->filters[cvt->filter_index](cvt, AUDIO_S32SYS);
1380 }
1381 }
1382 #endif
1383
1384
1385
SDL_ChooseAudioConverters(void)1386 void SDL_ChooseAudioConverters(void)
1387 {
1388 static SDL_bool converters_chosen = SDL_FALSE;
1389
1390 if (converters_chosen) {
1391 return;
1392 }
1393
1394 #define SET_CONVERTER_FUNCS(fntype) \
1395 SDL_Convert_S8_to_F32 = SDL_Convert_S8_to_F32_##fntype; \
1396 SDL_Convert_U8_to_F32 = SDL_Convert_U8_to_F32_##fntype; \
1397 SDL_Convert_S16_to_F32 = SDL_Convert_S16_to_F32_##fntype; \
1398 SDL_Convert_U16_to_F32 = SDL_Convert_U16_to_F32_##fntype; \
1399 SDL_Convert_S32_to_F32 = SDL_Convert_S32_to_F32_##fntype; \
1400 SDL_Convert_F32_to_S8 = SDL_Convert_F32_to_S8_##fntype; \
1401 SDL_Convert_F32_to_U8 = SDL_Convert_F32_to_U8_##fntype; \
1402 SDL_Convert_F32_to_S16 = SDL_Convert_F32_to_S16_##fntype; \
1403 SDL_Convert_F32_to_U16 = SDL_Convert_F32_to_U16_##fntype; \
1404 SDL_Convert_F32_to_S32 = SDL_Convert_F32_to_S32_##fntype; \
1405 converters_chosen = SDL_TRUE
1406
1407 #if HAVE_SSE2_INTRINSICS
1408 if (SDL_HasSSE2()) {
1409 SET_CONVERTER_FUNCS(SSE2);
1410 return;
1411 }
1412 #endif
1413
1414 #if HAVE_NEON_INTRINSICS
1415 if (SDL_HasNEON()) {
1416 SET_CONVERTER_FUNCS(NEON);
1417 return;
1418 }
1419 #endif
1420
1421 #if NEED_SCALAR_CONVERTER_FALLBACKS
1422 SET_CONVERTER_FUNCS(Scalar);
1423 #endif
1424
1425 #undef SET_CONVERTER_FUNCS
1426
1427 SDL_assert(converters_chosen == SDL_TRUE);
1428 }
1429
1430 /* vi: set ts=4 sw=4 expandtab: */
1431