1 /*
2   Simple DirectMedia Layer
3   Copyright (C) 1997-2020 Sam Lantinga <slouken@libsdl.org>
4 
5   This software is provided 'as-is', without any express or implied
6   warranty.  In no event will the authors be held liable for any damages
7   arising from the use of this software.
8 
9   Permission is granted to anyone to use this software for any purpose,
10   including commercial applications, and to alter it and redistribute it
11   freely, subject to the following restrictions:
12 
13   1. The origin of this software must not be misrepresented; you must not
14      claim that you wrote the original software. If you use this software
15      in a product, an acknowledgment in the product documentation would be
16      appreciated but is not required.
17   2. Altered source versions must be plainly marked as such, and must not be
18      misrepresented as being the original software.
19   3. This notice may not be removed or altered from any source distribution.
20 */
21 
22 #include "../SDL_internal.h"
23 #include "SDL_audio.h"
24 #include "SDL_audio_c.h"
25 #include "SDL_cpuinfo.h"
26 #include "SDL_assert.h"
27 
28 #ifdef __ARM_NEON
29 #define HAVE_NEON_INTRINSICS 1
30 #endif
31 
32 #ifdef __SSE2__
33 #define HAVE_SSE2_INTRINSICS 1
34 #endif
35 
36 #if defined(__x86_64__) && HAVE_SSE2_INTRINSICS
37 #define NEED_SCALAR_CONVERTER_FALLBACKS 0  /* x86_64 guarantees SSE2. */
38 #elif __MACOSX__ && HAVE_SSE2_INTRINSICS
39 #define NEED_SCALAR_CONVERTER_FALLBACKS 0  /* Mac OS X/Intel guarantees SSE2. */
40 #elif defined(__ARM_ARCH) && (__ARM_ARCH >= 8) && HAVE_NEON_INTRINSICS
41 #define NEED_SCALAR_CONVERTER_FALLBACKS 0  /* ARMv8+ promise NEON. */
42 #elif defined(__APPLE__) && defined(__ARM_ARCH) && (__ARM_ARCH >= 7) && HAVE_NEON_INTRINSICS
43 #define NEED_SCALAR_CONVERTER_FALLBACKS 0  /* All Apple ARMv7 chips promise NEON support. */
44 #endif
45 
46 /* Set to zero if platform is guaranteed to use a SIMD codepath here. */
47 #ifndef NEED_SCALAR_CONVERTER_FALLBACKS
48 #define NEED_SCALAR_CONVERTER_FALLBACKS 1
49 #endif
50 
51 /* Function pointers set to a CPU-specific implementation. */
52 SDL_AudioFilter SDL_Convert_S8_to_F32 = NULL;
53 SDL_AudioFilter SDL_Convert_U8_to_F32 = NULL;
54 SDL_AudioFilter SDL_Convert_S16_to_F32 = NULL;
55 SDL_AudioFilter SDL_Convert_U16_to_F32 = NULL;
56 SDL_AudioFilter SDL_Convert_S32_to_F32 = NULL;
57 SDL_AudioFilter SDL_Convert_F32_to_S8 = NULL;
58 SDL_AudioFilter SDL_Convert_F32_to_U8 = NULL;
59 SDL_AudioFilter SDL_Convert_F32_to_S16 = NULL;
60 SDL_AudioFilter SDL_Convert_F32_to_U16 = NULL;
61 SDL_AudioFilter SDL_Convert_F32_to_S32 = NULL;
62 
63 
64 #define DIVBY128 0.0078125f
65 #define DIVBY32768 0.000030517578125f
66 #define DIVBY8388607 0.00000011920930376163766f
67 
68 
69 #if NEED_SCALAR_CONVERTER_FALLBACKS
70 static void SDLCALL
SDL_Convert_S8_to_F32_Scalar(SDL_AudioCVT * cvt,SDL_AudioFormat format)71 SDL_Convert_S8_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
72 {
73     const Sint8 *src = ((const Sint8 *) (cvt->buf + cvt->len_cvt)) - 1;
74     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
75     int i;
76 
77     LOG_DEBUG_CONVERT("AUDIO_S8", "AUDIO_F32");
78 
79     for (i = cvt->len_cvt; i; --i, --src, --dst) {
80         *dst = ((float) *src) * DIVBY128;
81     }
82 
83     cvt->len_cvt *= 4;
84     if (cvt->filters[++cvt->filter_index]) {
85         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
86     }
87 }
88 
89 static void SDLCALL
SDL_Convert_U8_to_F32_Scalar(SDL_AudioCVT * cvt,SDL_AudioFormat format)90 SDL_Convert_U8_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
91 {
92     const Uint8 *src = ((const Uint8 *) (cvt->buf + cvt->len_cvt)) - 1;
93     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
94     int i;
95 
96     LOG_DEBUG_CONVERT("AUDIO_U8", "AUDIO_F32");
97 
98     for (i = cvt->len_cvt; i; --i, --src, --dst) {
99         *dst = (((float) *src) * DIVBY128) - 1.0f;
100     }
101 
102     cvt->len_cvt *= 4;
103     if (cvt->filters[++cvt->filter_index]) {
104         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
105     }
106 }
107 
108 static void SDLCALL
SDL_Convert_S16_to_F32_Scalar(SDL_AudioCVT * cvt,SDL_AudioFormat format)109 SDL_Convert_S16_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
110 {
111     const Sint16 *src = ((const Sint16 *) (cvt->buf + cvt->len_cvt)) - 1;
112     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
113     int i;
114 
115     LOG_DEBUG_CONVERT("AUDIO_S16", "AUDIO_F32");
116 
117     for (i = cvt->len_cvt / sizeof (Sint16); i; --i, --src, --dst) {
118         *dst = ((float) *src) * DIVBY32768;
119     }
120 
121     cvt->len_cvt *= 2;
122     if (cvt->filters[++cvt->filter_index]) {
123         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
124     }
125 }
126 
127 static void SDLCALL
SDL_Convert_U16_to_F32_Scalar(SDL_AudioCVT * cvt,SDL_AudioFormat format)128 SDL_Convert_U16_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
129 {
130     const Uint16 *src = ((const Uint16 *) (cvt->buf + cvt->len_cvt)) - 1;
131     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
132     int i;
133 
134     LOG_DEBUG_CONVERT("AUDIO_U16", "AUDIO_F32");
135 
136     for (i = cvt->len_cvt / sizeof (Uint16); i; --i, --src, --dst) {
137         *dst = (((float) *src) * DIVBY32768) - 1.0f;
138     }
139 
140     cvt->len_cvt *= 2;
141     if (cvt->filters[++cvt->filter_index]) {
142         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
143     }
144 }
145 
146 static void SDLCALL
SDL_Convert_S32_to_F32_Scalar(SDL_AudioCVT * cvt,SDL_AudioFormat format)147 SDL_Convert_S32_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
148 {
149     const Sint32 *src = (const Sint32 *) cvt->buf;
150     float *dst = (float *) cvt->buf;
151     int i;
152 
153     LOG_DEBUG_CONVERT("AUDIO_S32", "AUDIO_F32");
154 
155     for (i = cvt->len_cvt / sizeof (Sint32); i; --i, ++src, ++dst) {
156         *dst = ((float) (*src>>8)) * DIVBY8388607;
157     }
158 
159     if (cvt->filters[++cvt->filter_index]) {
160         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
161     }
162 }
163 
164 static void SDLCALL
SDL_Convert_F32_to_S8_Scalar(SDL_AudioCVT * cvt,SDL_AudioFormat format)165 SDL_Convert_F32_to_S8_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
166 {
167     const float *src = (const float *) cvt->buf;
168     Sint8 *dst = (Sint8 *) cvt->buf;
169     int i;
170 
171     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S8");
172 
173     for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
174         const float sample = *src;
175         if (sample >= 1.0f) {
176             *dst = 127;
177         } else if (sample <= -1.0f) {
178             *dst = -128;
179         } else {
180             *dst = (Sint8)(sample * 127.0f);
181         }
182     }
183 
184     cvt->len_cvt /= 4;
185     if (cvt->filters[++cvt->filter_index]) {
186         cvt->filters[cvt->filter_index](cvt, AUDIO_S8);
187     }
188 }
189 
190 static void SDLCALL
SDL_Convert_F32_to_U8_Scalar(SDL_AudioCVT * cvt,SDL_AudioFormat format)191 SDL_Convert_F32_to_U8_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
192 {
193     const float *src = (const float *) cvt->buf;
194     Uint8 *dst = (Uint8 *) cvt->buf;
195     int i;
196 
197     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U8");
198 
199     for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
200         const float sample = *src;
201         if (sample >= 1.0f) {
202             *dst = 255;
203         } else if (sample <= -1.0f) {
204             *dst = 0;
205         } else {
206             *dst = (Uint8)((sample + 1.0f) * 127.0f);
207         }
208     }
209 
210     cvt->len_cvt /= 4;
211     if (cvt->filters[++cvt->filter_index]) {
212         cvt->filters[cvt->filter_index](cvt, AUDIO_U8);
213     }
214 }
215 
216 static void SDLCALL
SDL_Convert_F32_to_S16_Scalar(SDL_AudioCVT * cvt,SDL_AudioFormat format)217 SDL_Convert_F32_to_S16_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
218 {
219     const float *src = (const float *) cvt->buf;
220     Sint16 *dst = (Sint16 *) cvt->buf;
221     int i;
222 
223     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S16");
224 
225     for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
226         const float sample = *src;
227         if (sample >= 1.0f) {
228             *dst = 32767;
229         } else if (sample <= -1.0f) {
230             *dst = -32768;
231         } else {
232             *dst = (Sint16)(sample * 32767.0f);
233         }
234     }
235 
236     cvt->len_cvt /= 2;
237     if (cvt->filters[++cvt->filter_index]) {
238         cvt->filters[cvt->filter_index](cvt, AUDIO_S16SYS);
239     }
240 }
241 
242 static void SDLCALL
SDL_Convert_F32_to_U16_Scalar(SDL_AudioCVT * cvt,SDL_AudioFormat format)243 SDL_Convert_F32_to_U16_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
244 {
245     const float *src = (const float *) cvt->buf;
246     Uint16 *dst = (Uint16 *) cvt->buf;
247     int i;
248 
249     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U16");
250 
251     for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
252         const float sample = *src;
253         if (sample >= 1.0f) {
254             *dst = 65535;
255         } else if (sample <= -1.0f) {
256             *dst = 0;
257         } else {
258             *dst = (Uint16)((sample + 1.0f) * 32767.0f);
259         }
260     }
261 
262     cvt->len_cvt /= 2;
263     if (cvt->filters[++cvt->filter_index]) {
264         cvt->filters[cvt->filter_index](cvt, AUDIO_U16SYS);
265     }
266 }
267 
268 static void SDLCALL
SDL_Convert_F32_to_S32_Scalar(SDL_AudioCVT * cvt,SDL_AudioFormat format)269 SDL_Convert_F32_to_S32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
270 {
271     const float *src = (const float *) cvt->buf;
272     Sint32 *dst = (Sint32 *) cvt->buf;
273     int i;
274 
275     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S32");
276 
277     for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
278         const float sample = *src;
279         if (sample >= 1.0f) {
280             *dst = 2147483647;
281         } else if (sample <= -1.0f) {
282             *dst = (Sint32) -2147483648LL;
283         } else {
284             *dst = ((Sint32)(sample * 8388607.0f)) << 8;
285         }
286     }
287 
288     if (cvt->filters[++cvt->filter_index]) {
289         cvt->filters[cvt->filter_index](cvt, AUDIO_S32SYS);
290     }
291 }
292 #endif
293 
294 
295 #if HAVE_SSE2_INTRINSICS
296 static void SDLCALL
SDL_Convert_S8_to_F32_SSE2(SDL_AudioCVT * cvt,SDL_AudioFormat format)297 SDL_Convert_S8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
298 {
299     const Sint8 *src = ((const Sint8 *) (cvt->buf + cvt->len_cvt)) - 1;
300     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
301     int i;
302 
303     LOG_DEBUG_CONVERT("AUDIO_S8", "AUDIO_F32 (using SSE2)");
304 
305     /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
306     for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
307         *dst = ((float) *src) * DIVBY128;
308     }
309 
310     src -= 15; dst -= 15;  /* adjust to read SSE blocks from the start. */
311     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
312 
313     /* Make sure src is aligned too. */
314     if ((((size_t) src) & 15) == 0) {
315         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
316         const __m128i *mmsrc = (const __m128i *) src;
317         const __m128i zero = _mm_setzero_si128();
318         const __m128 divby128 = _mm_set1_ps(DIVBY128);
319         while (i >= 16) {   /* 16 * 8-bit */
320             const __m128i bytes = _mm_load_si128(mmsrc);  /* get 16 sint8 into an XMM register. */
321             /* treat as int16, shift left to clear every other sint16, then back right with sign-extend. Now sint16. */
322             const __m128i shorts1 = _mm_srai_epi16(_mm_slli_epi16(bytes, 8), 8);
323             /* right-shift-sign-extend gets us sint16 with the other set of values. */
324             const __m128i shorts2 = _mm_srai_epi16(bytes, 8);
325             /* unpack against zero to make these int32, shift to make them sign-extend, convert to float, multiply. Whew! */
326             const __m128 floats1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpacklo_epi16(shorts1, zero), 16), 16)), divby128);
327             const __m128 floats2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpacklo_epi16(shorts2, zero), 16), 16)), divby128);
328             const __m128 floats3 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpackhi_epi16(shorts1, zero), 16), 16)), divby128);
329             const __m128 floats4 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpackhi_epi16(shorts2, zero), 16), 16)), divby128);
330             /* Interleave back into correct order, store. */
331             _mm_store_ps(dst, _mm_unpacklo_ps(floats1, floats2));
332             _mm_store_ps(dst+4, _mm_unpackhi_ps(floats1, floats2));
333             _mm_store_ps(dst+8, _mm_unpacklo_ps(floats3, floats4));
334             _mm_store_ps(dst+12, _mm_unpackhi_ps(floats3, floats4));
335             i -= 16; mmsrc--; dst -= 16;
336         }
337 
338         src = (const Sint8 *) mmsrc;
339     }
340 
341     src += 15; dst += 15;  /* adjust for any scalar finishing. */
342 
343     /* Finish off any leftovers with scalar operations. */
344     while (i) {
345         *dst = ((float) *src) * DIVBY128;
346         i--; src--; dst--;
347     }
348 
349     cvt->len_cvt *= 4;
350     if (cvt->filters[++cvt->filter_index]) {
351         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
352     }
353 }
354 
355 static void SDLCALL
SDL_Convert_U8_to_F32_SSE2(SDL_AudioCVT * cvt,SDL_AudioFormat format)356 SDL_Convert_U8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
357 {
358     const Uint8 *src = ((const Uint8 *) (cvt->buf + cvt->len_cvt)) - 1;
359     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
360     int i;
361 
362     LOG_DEBUG_CONVERT("AUDIO_U8", "AUDIO_F32 (using SSE2)");
363 
364     /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
365     for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
366         *dst = (((float) *src) * DIVBY128) - 1.0f;
367     }
368 
369     src -= 15; dst -= 15;  /* adjust to read SSE blocks from the start. */
370     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
371 
372     /* Make sure src is aligned too. */
373     if ((((size_t) src) & 15) == 0) {
374         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
375         const __m128i *mmsrc = (const __m128i *) src;
376         const __m128i zero = _mm_setzero_si128();
377         const __m128 divby128 = _mm_set1_ps(DIVBY128);
378         const __m128 minus1 = _mm_set1_ps(-1.0f);
379         while (i >= 16) {   /* 16 * 8-bit */
380             const __m128i bytes = _mm_load_si128(mmsrc);  /* get 16 uint8 into an XMM register. */
381             /* treat as int16, shift left to clear every other sint16, then back right with zero-extend. Now uint16. */
382             const __m128i shorts1 = _mm_srli_epi16(_mm_slli_epi16(bytes, 8), 8);
383             /* right-shift-zero-extend gets us uint16 with the other set of values. */
384             const __m128i shorts2 = _mm_srli_epi16(bytes, 8);
385             /* unpack against zero to make these int32, convert to float, multiply, add. Whew! */
386             /* Note that AVX2 can do floating point multiply+add in one instruction, fwiw. SSE2 cannot. */
387             const __m128 floats1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts1, zero)), divby128), minus1);
388             const __m128 floats2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts2, zero)), divby128), minus1);
389             const __m128 floats3 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts1, zero)), divby128), minus1);
390             const __m128 floats4 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts2, zero)), divby128), minus1);
391             /* Interleave back into correct order, store. */
392             _mm_store_ps(dst, _mm_unpacklo_ps(floats1, floats2));
393             _mm_store_ps(dst+4, _mm_unpackhi_ps(floats1, floats2));
394             _mm_store_ps(dst+8, _mm_unpacklo_ps(floats3, floats4));
395             _mm_store_ps(dst+12, _mm_unpackhi_ps(floats3, floats4));
396             i -= 16; mmsrc--; dst -= 16;
397         }
398 
399         src = (const Uint8 *) mmsrc;
400     }
401 
402     src += 15; dst += 15;  /* adjust for any scalar finishing. */
403 
404     /* Finish off any leftovers with scalar operations. */
405     while (i) {
406         *dst = (((float) *src) * DIVBY128) - 1.0f;
407         i--; src--; dst--;
408     }
409 
410     cvt->len_cvt *= 4;
411     if (cvt->filters[++cvt->filter_index]) {
412         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
413     }
414 }
415 
416 static void SDLCALL
SDL_Convert_S16_to_F32_SSE2(SDL_AudioCVT * cvt,SDL_AudioFormat format)417 SDL_Convert_S16_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
418 {
419     const Sint16 *src = ((const Sint16 *) (cvt->buf + cvt->len_cvt)) - 1;
420     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
421     int i;
422 
423     LOG_DEBUG_CONVERT("AUDIO_S16", "AUDIO_F32 (using SSE2)");
424 
425     /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
426     for (i = cvt->len_cvt / sizeof (Sint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
427         *dst = ((float) *src) * DIVBY32768;
428     }
429 
430     src -= 7; dst -= 7;  /* adjust to read SSE blocks from the start. */
431     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
432 
433     /* Make sure src is aligned too. */
434     if ((((size_t) src) & 15) == 0) {
435         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
436         const __m128 divby32768 = _mm_set1_ps(DIVBY32768);
437         while (i >= 8) {   /* 8 * 16-bit */
438             const __m128i ints = _mm_load_si128((__m128i const *) src);  /* get 8 sint16 into an XMM register. */
439             /* treat as int32, shift left to clear every other sint16, then back right with sign-extend. Now sint32. */
440             const __m128i a = _mm_srai_epi32(_mm_slli_epi32(ints, 16), 16);
441             /* right-shift-sign-extend gets us sint32 with the other set of values. */
442             const __m128i b = _mm_srai_epi32(ints, 16);
443             /* Interleave these back into the right order, convert to float, multiply, store. */
444             _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi32(a, b)), divby32768));
445             _mm_store_ps(dst+4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi32(a, b)), divby32768));
446             i -= 8; src -= 8; dst -= 8;
447         }
448     }
449 
450     src += 7; dst += 7;  /* adjust for any scalar finishing. */
451 
452     /* Finish off any leftovers with scalar operations. */
453     while (i) {
454         *dst = ((float) *src) * DIVBY32768;
455         i--; src--; dst--;
456     }
457 
458     cvt->len_cvt *= 2;
459     if (cvt->filters[++cvt->filter_index]) {
460         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
461     }
462 }
463 
464 static void SDLCALL
SDL_Convert_U16_to_F32_SSE2(SDL_AudioCVT * cvt,SDL_AudioFormat format)465 SDL_Convert_U16_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
466 {
467     const Uint16 *src = ((const Uint16 *) (cvt->buf + cvt->len_cvt)) - 1;
468     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
469     int i;
470 
471     LOG_DEBUG_CONVERT("AUDIO_U16", "AUDIO_F32 (using SSE2)");
472 
473     /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
474     for (i = cvt->len_cvt / sizeof (Sint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
475         *dst = (((float) *src) * DIVBY32768) - 1.0f;
476     }
477 
478     src -= 7; dst -= 7;  /* adjust to read SSE blocks from the start. */
479     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
480 
481     /* Make sure src is aligned too. */
482     if ((((size_t) src) & 15) == 0) {
483         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
484         const __m128 divby32768 = _mm_set1_ps(DIVBY32768);
485         const __m128 minus1 = _mm_set1_ps(-1.0f);
486         while (i >= 8) {   /* 8 * 16-bit */
487             const __m128i ints = _mm_load_si128((__m128i const *) src);  /* get 8 sint16 into an XMM register. */
488             /* treat as int32, shift left to clear every other sint16, then back right with zero-extend. Now sint32. */
489             const __m128i a = _mm_srli_epi32(_mm_slli_epi32(ints, 16), 16);
490             /* right-shift-sign-extend gets us sint32 with the other set of values. */
491             const __m128i b = _mm_srli_epi32(ints, 16);
492             /* Interleave these back into the right order, convert to float, multiply, store. */
493             _mm_store_ps(dst, _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi32(a, b)), divby32768), minus1));
494             _mm_store_ps(dst+4, _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi32(a, b)), divby32768), minus1));
495             i -= 8; src -= 8; dst -= 8;
496         }
497     }
498 
499     src += 7; dst += 7;  /* adjust for any scalar finishing. */
500 
501     /* Finish off any leftovers with scalar operations. */
502     while (i) {
503         *dst = (((float) *src) * DIVBY32768) - 1.0f;
504         i--; src--; dst--;
505     }
506 
507     cvt->len_cvt *= 2;
508     if (cvt->filters[++cvt->filter_index]) {
509         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
510     }
511 }
512 
513 static void SDLCALL
SDL_Convert_S32_to_F32_SSE2(SDL_AudioCVT * cvt,SDL_AudioFormat format)514 SDL_Convert_S32_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
515 {
516     const Sint32 *src = (const Sint32 *) cvt->buf;
517     float *dst = (float *) cvt->buf;
518     int i;
519 
520     LOG_DEBUG_CONVERT("AUDIO_S32", "AUDIO_F32 (using SSE2)");
521 
522     /* Get dst aligned to 16 bytes */
523     for (i = cvt->len_cvt / sizeof (Sint32); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
524         *dst = ((float) (*src>>8)) * DIVBY8388607;
525     }
526 
527     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
528 
529     /* Make sure src is aligned too. */
530     if ((((size_t) src) & 15) == 0) {
531         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
532         const __m128 divby8388607 = _mm_set1_ps(DIVBY8388607);
533         const __m128i *mmsrc = (const __m128i *) src;
534         while (i >= 4) {   /* 4 * sint32 */
535             /* shift out lowest bits so int fits in a float32. Small precision loss, but much faster. */
536             _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_load_si128(mmsrc), 8)), divby8388607));
537             i -= 4; mmsrc++; dst += 4;
538         }
539         src = (const Sint32 *) mmsrc;
540     }
541 
542     /* Finish off any leftovers with scalar operations. */
543     while (i) {
544         *dst = ((float) (*src>>8)) * DIVBY8388607;
545         i--; src++; dst++;
546     }
547 
548     if (cvt->filters[++cvt->filter_index]) {
549         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
550     }
551 }
552 
553 static void SDLCALL
SDL_Convert_F32_to_S8_SSE2(SDL_AudioCVT * cvt,SDL_AudioFormat format)554 SDL_Convert_F32_to_S8_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
555 {
556     const float *src = (const float *) cvt->buf;
557     Sint8 *dst = (Sint8 *) cvt->buf;
558     int i;
559 
560     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S8 (using SSE2)");
561 
562     /* Get dst aligned to 16 bytes */
563     for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
564         const float sample = *src;
565         if (sample >= 1.0f) {
566             *dst = 127;
567         } else if (sample <= -1.0f) {
568             *dst = -128;
569         } else {
570             *dst = (Sint8)(sample * 127.0f);
571         }
572     }
573 
574     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
575 
576     /* Make sure src is aligned too. */
577     if ((((size_t) src) & 15) == 0) {
578         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
579         const __m128 one = _mm_set1_ps(1.0f);
580         const __m128 negone = _mm_set1_ps(-1.0f);
581         const __m128 mulby127 = _mm_set1_ps(127.0f);
582         __m128i *mmdst = (__m128i *) dst;
583         while (i >= 16) {   /* 16 * float32 */
584             const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
585             const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+4)), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
586             const __m128i ints3 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+8)), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
587             const __m128i ints4 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+12)), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
588             _mm_store_si128(mmdst, _mm_packs_epi16(_mm_packs_epi32(ints1, ints2), _mm_packs_epi32(ints3, ints4)));  /* pack down, store out. */
589             i -= 16; src += 16; mmdst++;
590         }
591         dst = (Sint8 *) mmdst;
592     }
593 
594     /* Finish off any leftovers with scalar operations. */
595     while (i) {
596         const float sample = *src;
597         if (sample >= 1.0f) {
598             *dst = 127;
599         } else if (sample <= -1.0f) {
600             *dst = -128;
601         } else {
602             *dst = (Sint8)(sample * 127.0f);
603         }
604         i--; src++; dst++;
605     }
606 
607     cvt->len_cvt /= 4;
608     if (cvt->filters[++cvt->filter_index]) {
609         cvt->filters[cvt->filter_index](cvt, AUDIO_S8);
610     }
611 }
612 
613 static void SDLCALL
SDL_Convert_F32_to_U8_SSE2(SDL_AudioCVT * cvt,SDL_AudioFormat format)614 SDL_Convert_F32_to_U8_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
615 {
616     const float *src = (const float *) cvt->buf;
617     Uint8 *dst = cvt->buf;
618     int i;
619 
620     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U8 (using SSE2)");
621 
622     /* Get dst aligned to 16 bytes */
623     for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
624         const float sample = *src;
625         if (sample >= 1.0f) {
626             *dst = 255;
627         } else if (sample <= -1.0f) {
628             *dst = 0;
629         } else {
630             *dst = (Uint8)((sample + 1.0f) * 127.0f);
631         }
632     }
633 
634     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
635 
636     /* Make sure src is aligned too. */
637     if ((((size_t) src) & 15) == 0) {
638         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
639         const __m128 one = _mm_set1_ps(1.0f);
640         const __m128 negone = _mm_set1_ps(-1.0f);
641         const __m128 mulby127 = _mm_set1_ps(127.0f);
642         __m128i *mmdst = (__m128i *) dst;
643         while (i >= 16) {   /* 16 * float32 */
644             const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
645             const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+4)), one), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
646             const __m128i ints3 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+8)), one), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
647             const __m128i ints4 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+12)), one), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
648             _mm_store_si128(mmdst, _mm_packus_epi16(_mm_packs_epi32(ints1, ints2), _mm_packs_epi32(ints3, ints4)));  /* pack down, store out. */
649             i -= 16; src += 16; mmdst++;
650         }
651         dst = (Uint8 *) mmdst;
652     }
653 
654     /* Finish off any leftovers with scalar operations. */
655     while (i) {
656         const float sample = *src;
657         if (sample >= 1.0f) {
658             *dst = 255;
659         } else if (sample <= -1.0f) {
660             *dst = 0;
661         } else {
662             *dst = (Uint8)((sample + 1.0f) * 127.0f);
663         }
664         i--; src++; dst++;
665     }
666 
667     cvt->len_cvt /= 4;
668     if (cvt->filters[++cvt->filter_index]) {
669         cvt->filters[cvt->filter_index](cvt, AUDIO_U8);
670     }
671 }
672 
673 static void SDLCALL
SDL_Convert_F32_to_S16_SSE2(SDL_AudioCVT * cvt,SDL_AudioFormat format)674 SDL_Convert_F32_to_S16_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
675 {
676     const float *src = (const float *) cvt->buf;
677     Sint16 *dst = (Sint16 *) cvt->buf;
678     int i;
679 
680     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S16 (using SSE2)");
681 
682     /* Get dst aligned to 16 bytes */
683     for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
684         const float sample = *src;
685         if (sample >= 1.0f) {
686             *dst = 32767;
687         } else if (sample <= -1.0f) {
688             *dst = -32768;
689         } else {
690             *dst = (Sint16)(sample * 32767.0f);
691         }
692     }
693 
694     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
695 
696     /* Make sure src is aligned too. */
697     if ((((size_t) src) & 15) == 0) {
698         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
699         const __m128 one = _mm_set1_ps(1.0f);
700         const __m128 negone = _mm_set1_ps(-1.0f);
701         const __m128 mulby32767 = _mm_set1_ps(32767.0f);
702         __m128i *mmdst = (__m128i *) dst;
703         while (i >= 8) {   /* 8 * float32 */
704             const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), mulby32767));  /* load 4 floats, clamp, convert to sint32 */
705             const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+4)), one), mulby32767));  /* load 4 floats, clamp, convert to sint32 */
706             _mm_store_si128(mmdst, _mm_packs_epi32(ints1, ints2));  /* pack to sint16, store out. */
707             i -= 8; src += 8; mmdst++;
708         }
709         dst = (Sint16 *) mmdst;
710     }
711 
712     /* Finish off any leftovers with scalar operations. */
713     while (i) {
714         const float sample = *src;
715         if (sample >= 1.0f) {
716             *dst = 32767;
717         } else if (sample <= -1.0f) {
718             *dst = -32768;
719         } else {
720             *dst = (Sint16)(sample * 32767.0f);
721         }
722         i--; src++; dst++;
723     }
724 
725     cvt->len_cvt /= 2;
726     if (cvt->filters[++cvt->filter_index]) {
727         cvt->filters[cvt->filter_index](cvt, AUDIO_S16SYS);
728     }
729 }
730 
731 static void SDLCALL
SDL_Convert_F32_to_U16_SSE2(SDL_AudioCVT * cvt,SDL_AudioFormat format)732 SDL_Convert_F32_to_U16_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
733 {
734     const float *src = (const float *) cvt->buf;
735     Uint16 *dst = (Uint16 *) cvt->buf;
736     int i;
737 
738     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U16 (using SSE2)");
739 
740     /* Get dst aligned to 16 bytes */
741     for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
742         const float sample = *src;
743         if (sample >= 1.0f) {
744             *dst = 65535;
745         } else if (sample <= -1.0f) {
746             *dst = 0;
747         } else {
748             *dst = (Uint16)((sample + 1.0f) * 32767.0f);
749         }
750     }
751 
752     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
753 
754     /* Make sure src is aligned too. */
755     if ((((size_t) src) & 15) == 0) {
756         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
757         /* This calculates differently than the scalar path because SSE2 can't
758            pack int32 data down to unsigned int16. _mm_packs_epi32 does signed
759            saturation, so that would corrupt our data. _mm_packus_epi32 exists,
760            but not before SSE 4.1. So we convert from float to sint16, packing
761            that down with legit signed saturation, and then xor the top bit
762            against 1. This results in the correct unsigned 16-bit value, even
763            though it looks like dark magic. */
764         const __m128 mulby32767 = _mm_set1_ps(32767.0f);
765         const __m128i topbit = _mm_set1_epi16(-32768);
766         const __m128 one = _mm_set1_ps(1.0f);
767         const __m128 negone = _mm_set1_ps(-1.0f);
768         __m128i *mmdst = (__m128i *) dst;
769         while (i >= 8) {   /* 8 * float32 */
770             const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), mulby32767));  /* load 4 floats, clamp, convert to sint32 */
771             const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+4)), one), mulby32767));  /* load 4 floats, clamp, convert to sint32 */
772             _mm_store_si128(mmdst, _mm_xor_si128(_mm_packs_epi32(ints1, ints2), topbit));  /* pack to sint16, xor top bit, store out. */
773             i -= 8; src += 8; mmdst++;
774         }
775         dst = (Uint16 *) mmdst;
776     }
777 
778     /* Finish off any leftovers with scalar operations. */
779     while (i) {
780         const float sample = *src;
781         if (sample >= 1.0f) {
782             *dst = 65535;
783         } else if (sample <= -1.0f) {
784             *dst = 0;
785         } else {
786             *dst = (Uint16)((sample + 1.0f) * 32767.0f);
787         }
788         i--; src++; dst++;
789     }
790 
791     cvt->len_cvt /= 2;
792     if (cvt->filters[++cvt->filter_index]) {
793         cvt->filters[cvt->filter_index](cvt, AUDIO_U16SYS);
794     }
795 }
796 
797 static void SDLCALL
SDL_Convert_F32_to_S32_SSE2(SDL_AudioCVT * cvt,SDL_AudioFormat format)798 SDL_Convert_F32_to_S32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
799 {
800     const float *src = (const float *) cvt->buf;
801     Sint32 *dst = (Sint32 *) cvt->buf;
802     int i;
803 
804     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S32 (using SSE2)");
805 
806     /* Get dst aligned to 16 bytes */
807     for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
808         const float sample = *src;
809         if (sample >= 1.0f) {
810             *dst = 2147483647;
811         } else if (sample <= -1.0f) {
812             *dst = (Sint32) -2147483648LL;
813         } else {
814             *dst = ((Sint32)(sample * 8388607.0f)) << 8;
815         }
816     }
817 
818     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
819     SDL_assert(!i || ((((size_t) src) & 15) == 0));
820 
821     {
822         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
823         const __m128 one = _mm_set1_ps(1.0f);
824         const __m128 negone = _mm_set1_ps(-1.0f);
825         const __m128 mulby8388607 = _mm_set1_ps(8388607.0f);
826         __m128i *mmdst = (__m128i *) dst;
827         while (i >= 4) {   /* 4 * float32 */
828             _mm_store_si128(mmdst, _mm_slli_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), mulby8388607)), 8));  /* load 4 floats, clamp, convert to sint32 */
829             i -= 4; src += 4; mmdst++;
830         }
831         dst = (Sint32 *) mmdst;
832     }
833 
834     /* Finish off any leftovers with scalar operations. */
835     while (i) {
836         const float sample = *src;
837         if (sample >= 1.0f) {
838             *dst = 2147483647;
839         } else if (sample <= -1.0f) {
840             *dst = (Sint32) -2147483648LL;
841         } else {
842             *dst = ((Sint32)(sample * 8388607.0f)) << 8;
843         }
844         i--; src++; dst++;
845     }
846 
847     if (cvt->filters[++cvt->filter_index]) {
848         cvt->filters[cvt->filter_index](cvt, AUDIO_S32SYS);
849     }
850 }
851 #endif
852 
853 
854 #if HAVE_NEON_INTRINSICS
/* Expand the signed 8-bit samples in cvt->buf to float32, in place. The
   output is four times larger than the input, so the buffer is walked from
   its last sample back to its first to avoid overwriting unread input.
   Scalar code runs until a 16-float output block is 16-byte aligned; if the
   matching input block is aligned too, 16 samples at a time go through NEON;
   the rest is finished with scalar code. cvt->len_cvt is multiplied by 4 and
   the next filter in the chain (if any) is invoked with AUDIO_F32SYS. */
static void SDLCALL
SDL_Convert_S8_to_F32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{
    const Sint8 *in = ((const Sint8 *) (cvt->buf + cvt->len_cvt)) - 1;   /* last input sample */
    float *out = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;          /* last output slot */
    int remaining;

    LOG_DEBUG_CONVERT("AUDIO_S8", "AUDIO_F32 (using NEON)");

    /* Scalar lead-in (from the end): stop once the 16-float block ending at `out` starts on a 16-byte boundary. */
    remaining = cvt->len_cvt;
    while (remaining && (((size_t) (out-15)) & 15)) {
        *out = ((float) *in) * DIVBY128;
        --remaining; --in; --out;
    }

    /* Step back to the first element of the block each pointer currently ends. */
    in -= 15; out -= 15;
    SDL_assert(!remaining || ((((size_t) out) & 15) == 0));

    /* Only take the vector path when the input block is aligned as well. */
    if ((((size_t) in) & 15) == 0) {
        const float32x4_t scale = vdupq_n_f32(DIVBY128);
        const int8_t *vin = (const int8_t *) in;
        for (; remaining >= 16; remaining -= 16, vin -= 16, out -= 16) {   /* 16 samples per pass */
            const int8x16_t bytes = vld1q_s8(vin);
            /* Widen sint8 -> sint16 in two halves. */
            const int16x8_t wide_lo = vmovl_s8(vget_low_s8(bytes));
            const int16x8_t wide_hi = vmovl_s8(vget_high_s8(bytes));
            /* Widen each half to sint32, convert to float, normalize. */
            const float32x4_t f0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(wide_lo))), scale);
            const float32x4_t f1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(wide_lo))), scale);
            const float32x4_t f2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(wide_hi))), scale);
            const float32x4_t f3 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(wide_hi))), scale);
            vst1q_f32(out, f0);
            vst1q_f32(out+4, f1);
            vst1q_f32(out+8, f2);
            vst1q_f32(out+12, f3);
        }
        in = (const Sint8 *) vin;
    }

    /* Undo the block-start adjustment so the pointers name single samples again. */
    in += 15; out += 15;

    /* Scalar tail. */
    for (; remaining; --remaining, --in, --out) {
        *out = ((float) *in) * DIVBY128;
    }

    cvt->len_cvt *= 4;
    ++cvt->filter_index;
    if (cvt->filters[cvt->filter_index]) {
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
    }
}
905 
/* Expand the unsigned 8-bit samples in cvt->buf to float32, in place. The
   output is four times larger than the input, so the buffer is walked from
   its last sample back to its first. Each sample is scaled by DIVBY128 and
   then offset by -1.0f. Scalar code runs until a 16-float output block is
   16-byte aligned; if the matching input block is aligned too, 16 samples at
   a time go through NEON; the rest is finished with scalar code.
   cvt->len_cvt is multiplied by 4 and the next filter in the chain (if any)
   is invoked with AUDIO_F32SYS. */
static void SDLCALL
SDL_Convert_U8_to_F32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{
    const Uint8 *in = ((const Uint8 *) (cvt->buf + cvt->len_cvt)) - 1;   /* last input sample */
    float *out = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;          /* last output slot */
    int remaining;

    LOG_DEBUG_CONVERT("AUDIO_U8", "AUDIO_F32 (using NEON)");

    /* Scalar lead-in (from the end): stop once the 16-float block ending at `out` starts on a 16-byte boundary. */
    remaining = cvt->len_cvt;
    while (remaining && (((size_t) (out-15)) & 15)) {
        *out = (((float) *in) * DIVBY128) - 1.0f;
        --remaining; --in; --out;
    }

    /* Step back to the first element of the block each pointer currently ends. */
    in -= 15; out -= 15;
    SDL_assert(!remaining || ((((size_t) out) & 15) == 0));

    /* Only take the vector path when the input block is aligned as well. */
    if ((((size_t) in) & 15) == 0) {
        const float32x4_t bias = vdupq_n_f32(-1.0f);
        const float32x4_t scale = vdupq_n_f32(DIVBY128);
        const uint8_t *vin = (const uint8_t *) in;
        for (; remaining >= 16; remaining -= 16, vin -= 16, out -= 16) {   /* 16 samples per pass */
            const uint8x16_t bytes = vld1q_u8(vin);
            /* Widen uint8 -> uint16 in two halves. */
            const uint16x8_t wide_lo = vmovl_u8(vget_low_u8(bytes));
            const uint16x8_t wide_hi = vmovl_u8(vget_high_u8(bytes));
            /* Widen to uint32, convert to float, then bias + value * scale. */
            const float32x4_t f0 = vmlaq_f32(bias, vcvtq_f32_u32(vmovl_u16(vget_low_u16(wide_lo))), scale);
            const float32x4_t f1 = vmlaq_f32(bias, vcvtq_f32_u32(vmovl_u16(vget_high_u16(wide_lo))), scale);
            const float32x4_t f2 = vmlaq_f32(bias, vcvtq_f32_u32(vmovl_u16(vget_low_u16(wide_hi))), scale);
            const float32x4_t f3 = vmlaq_f32(bias, vcvtq_f32_u32(vmovl_u16(vget_high_u16(wide_hi))), scale);
            vst1q_f32(out, f0);
            vst1q_f32(out+4, f1);
            vst1q_f32(out+8, f2);
            vst1q_f32(out+12, f3);
        }
        in = (const Uint8 *) vin;
    }

    /* Undo the block-start adjustment so the pointers name single samples again. */
    in += 15; out += 15;

    /* Scalar tail. */
    for (; remaining; --remaining, --in, --out) {
        *out = (((float) *in) * DIVBY128) - 1.0f;
    }

    cvt->len_cvt *= 4;
    ++cvt->filter_index;
    if (cvt->filters[cvt->filter_index]) {
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
    }
}
957 
/* Expand the signed 16-bit samples in cvt->buf to float32, in place. The
   output is twice as large as the input, so the buffer is walked from its
   last sample back to its first. Scalar code runs until an 8-float output
   block is 16-byte aligned; if the matching input block is aligned too,
   8 samples at a time go through NEON; the rest is finished with scalar code.
   cvt->len_cvt is doubled and the next filter in the chain (if any) is
   invoked with AUDIO_F32SYS. */
static void SDLCALL
SDL_Convert_S16_to_F32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{
    const Sint16 *in = ((const Sint16 *) (cvt->buf + cvt->len_cvt)) - 1;   /* last input sample */
    float *out = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;            /* last output slot */
    int remaining;

    LOG_DEBUG_CONVERT("AUDIO_S16", "AUDIO_F32 (using NEON)");

    /* Scalar lead-in (from the end): stop once the 8-float block ending at `out` starts on a 16-byte boundary. */
    remaining = cvt->len_cvt / sizeof (Sint16);
    while (remaining && (((size_t) (out-7)) & 15)) {
        *out = ((float) *in) * DIVBY32768;
        --remaining; --in; --out;
    }

    /* Step back to the first element of the block each pointer currently ends. */
    in -= 7; out -= 7;
    SDL_assert(!remaining || ((((size_t) out) & 15) == 0));

    /* Only take the vector path when the input block is aligned as well. */
    if ((((size_t) in) & 15) == 0) {
        const float32x4_t scale = vdupq_n_f32(DIVBY32768);
        for (; remaining >= 8; remaining -= 8, in -= 8, out -= 8) {   /* 8 samples per pass */
            const int16x8_t ints = vld1q_s16((int16_t const *) in);
            /* Widen each half to sint32, convert to float, normalize. */
            const float32x4_t f_lo = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(ints))), scale);
            const float32x4_t f_hi = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(ints))), scale);
            vst1q_f32(out, f_lo);
            vst1q_f32(out+4, f_hi);
        }
    }

    /* Undo the block-start adjustment so the pointers name single samples again. */
    in += 7; out += 7;

    /* Scalar tail. */
    for (; remaining; --remaining, --in, --out) {
        *out = ((float) *in) * DIVBY32768;
    }

    cvt->len_cvt *= 2;
    ++cvt->filter_index;
    if (cvt->filters[cvt->filter_index]) {
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
    }
}
1001 
/* Expand the unsigned 16-bit samples in cvt->buf to float32, in place. The
   output is twice as large as the input, so the buffer is walked from its
   last sample back to its first. Each sample is scaled by DIVBY32768 and then
   offset by -1.0f. Scalar code runs until an 8-float output block is 16-byte
   aligned; if the matching input block is aligned too, 8 samples at a time go
   through NEON; the rest is finished with scalar code. cvt->len_cvt is
   doubled and the next filter in the chain (if any) is invoked with
   AUDIO_F32SYS. */
static void SDLCALL
SDL_Convert_U16_to_F32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{
    const Uint16 *in = ((const Uint16 *) (cvt->buf + cvt->len_cvt)) - 1;   /* last input sample */
    float *out = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;            /* last output slot */
    int remaining;

    LOG_DEBUG_CONVERT("AUDIO_U16", "AUDIO_F32 (using NEON)");

    /* Scalar lead-in (from the end): stop once the 8-float block ending at `out` starts on a 16-byte boundary. */
    remaining = cvt->len_cvt / sizeof (Uint16);
    while (remaining && (((size_t) (out-7)) & 15)) {
        *out = (((float) *in) * DIVBY32768) - 1.0f;
        --remaining; --in; --out;
    }

    /* Step back to the first element of the block each pointer currently ends. */
    in -= 7; out -= 7;
    SDL_assert(!remaining || ((((size_t) out) & 15) == 0));

    /* Only take the vector path when the input block is aligned as well. */
    if ((((size_t) in) & 15) == 0) {
        const float32x4_t bias = vdupq_n_f32(-1.0f);
        const float32x4_t scale = vdupq_n_f32(DIVBY32768);
        for (; remaining >= 8; remaining -= 8, in -= 8, out -= 8) {   /* 8 samples per pass */
            const uint16x8_t uints = vld1q_u16((uint16_t const *) in);
            /* Widen each half to uint32, convert to float, then bias + value * scale. */
            const float32x4_t f_lo = vmlaq_f32(bias, vcvtq_f32_u32(vmovl_u16(vget_low_u16(uints))), scale);
            const float32x4_t f_hi = vmlaq_f32(bias, vcvtq_f32_u32(vmovl_u16(vget_high_u16(uints))), scale);
            vst1q_f32(out, f_lo);
            vst1q_f32(out+4, f_hi);
        }
    }

    /* Undo the block-start adjustment so the pointers name single samples again. */
    in += 7; out += 7;

    /* Scalar tail. */
    for (; remaining; --remaining, --in, --out) {
        *out = (((float) *in) * DIVBY32768) - 1.0f;
    }

    cvt->len_cvt *= 2;
    ++cvt->filter_index;
    if (cvt->filters[cvt->filter_index]) {
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
    }
}
1046 
/* Convert the signed 32-bit samples in cvt->buf to float32, in place. Both
   formats are 4 bytes per sample, so the buffer is walked front to back and
   cvt->len_cvt is left unchanged. Each sample is shifted right by 8 bits
   before conversion and then scaled by DIVBY8388607. Scalar code runs until
   the output pointer is 16-byte aligned, NEON handles 4 samples at a time
   when the input pointer is aligned too, and scalar code finishes the rest.
   The next filter in the chain (if any) is invoked with AUDIO_F32SYS. */
static void SDLCALL
SDL_Convert_S32_to_F32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{
    const Sint32 *in = (const Sint32 *) cvt->buf;
    float *out = (float *) cvt->buf;
    int remaining;

    LOG_DEBUG_CONVERT("AUDIO_S32", "AUDIO_F32 (using NEON)");

    /* Scalar lead-in: advance until the output pointer hits a 16-byte boundary. */
    remaining = cvt->len_cvt / sizeof (Sint32);
    while (remaining && (((size_t) out) & 15)) {
        *out = ((float) (*in>>8)) * DIVBY8388607;
        --remaining; ++in; ++out;
    }

    SDL_assert(!remaining || ((((size_t) out) & 15) == 0));

    /* Only take the vector path when the input pointer is aligned as well. */
    if ((((size_t) in) & 15) == 0) {
        const float32x4_t scale = vdupq_n_f32(DIVBY8388607);
        const int32_t *vin = (const int32_t *) in;
        for (; remaining >= 4; remaining -= 4, vin += 4, out += 4) {   /* 4 samples per pass */
            /* Drop the low 8 bits so the integer fits a float32 mantissa; small precision loss. */
            const int32x4_t shifted = vshrq_n_s32(vld1q_s32(vin), 8);
            vst1q_f32(out, vmulq_f32(vcvtq_f32_s32(shifted), scale));
        }
        in = (const Sint32 *) vin;
    }

    /* Scalar tail. */
    for (; remaining; --remaining, ++in, ++out) {
        *out = ((float) (*in>>8)) * DIVBY8388607;
    }

    ++cvt->filter_index;
    if (cvt->filters[cvt->filter_index]) {
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
    }
}
1086 
/* Convert the float32 samples in cvt->buf to signed 8-bit samples, in place.
   Scalar code runs until the output pointer is 16-byte aligned; if the input
   pointer is then aligned as well, 16 samples at a time go through NEON; the
   rest is finished with scalar code. Afterwards cvt->len_cvt is divided by 4
   and the next filter in the chain (if any) is invoked with AUDIO_S8. */
static void SDLCALL
SDL_Convert_F32_to_S8_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{
    const float *in = (const float *) cvt->buf;
    Sint8 *out = (Sint8 *) cvt->buf;
    int remaining;

    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S8 (using NEON)");

    /* Scalar lead-in: advance until the output pointer hits a 16-byte boundary. */
    remaining = cvt->len_cvt / sizeof (float);
    while (remaining && (((size_t) out) & 15)) {
        const float sample = *in;
        if (sample <= -1.0f) {
            *out = -128;
        } else if (sample >= 1.0f) {
            *out = 127;
        } else {
            *out = (Sint8)(sample * 127.0f);
        }
        --remaining; ++in; ++out;
    }

    SDL_assert(!remaining || ((((size_t) out) & 15) == 0));

    /* Only take the vector path when the input pointer is aligned as well. */
    if ((((size_t) in) & 15) == 0) {
        const float32x4_t lower = vdupq_n_f32(-1.0f);
        const float32x4_t upper = vdupq_n_f32(1.0f);
        const float32x4_t scale = vdupq_n_f32(127.0f);
        int8_t *vout = (int8_t *) out;
        for (; remaining >= 16; remaining -= 16, in += 16, vout += 16) {   /* 16 floats per pass */
            /* All four loads happen before the store, since the buffer is converted in place. */
            const float32x4_t c0 = vminq_f32(vmaxq_f32(lower, vld1q_f32(in)), upper);
            const float32x4_t c1 = vminq_f32(vmaxq_f32(lower, vld1q_f32(in+4)), upper);
            const float32x4_t c2 = vminq_f32(vmaxq_f32(lower, vld1q_f32(in+8)), upper);
            const float32x4_t c3 = vminq_f32(vmaxq_f32(lower, vld1q_f32(in+12)), upper);
            /* Scale, convert to sint32, narrow to sint16. */
            const int16x4_t n0 = vmovn_s32(vcvtq_s32_f32(vmulq_f32(c0, scale)));
            const int16x4_t n1 = vmovn_s32(vcvtq_s32_f32(vmulq_f32(c1, scale)));
            const int16x4_t n2 = vmovn_s32(vcvtq_s32_f32(vmulq_f32(c2, scale)));
            const int16x4_t n3 = vmovn_s32(vcvtq_s32_f32(vmulq_f32(c3, scale)));
            /* Pair up, narrow to sint8, and store all 16 bytes. */
            const int8x8_t bytes_lo = vmovn_s16(vcombine_s16(n0, n1));
            const int8x8_t bytes_hi = vmovn_s16(vcombine_s16(n2, n3));
            vst1q_s8(vout, vcombine_s8(bytes_lo, bytes_hi));
        }
        out = (Sint8 *) vout;
    }

    /* Scalar tail. */
    for (; remaining; --remaining, ++in, ++out) {
        const float sample = *in;
        if (sample <= -1.0f) {
            *out = -128;
        } else if (sample >= 1.0f) {
            *out = 127;
        } else {
            *out = (Sint8)(sample * 127.0f);
        }
    }

    cvt->len_cvt /= 4;
    ++cvt->filter_index;
    if (cvt->filters[cvt->filter_index]) {
        cvt->filters[cvt->filter_index](cvt, AUDIO_S8);
    }
}
1148 
/* Convert the float32 samples in cvt->buf to unsigned 8-bit samples, in
   place. Scalar code runs until the output pointer is 16-byte aligned; if the
   input pointer is then aligned as well, 16 samples at a time go through
   NEON; the rest is finished with scalar code. Afterwards cvt->len_cvt is
   divided by 4 and the next filter in the chain (if any) is invoked with
   AUDIO_U8. */
static void SDLCALL
SDL_Convert_F32_to_U8_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{
    const float *in = (const float *) cvt->buf;
    Uint8 *out = (Uint8 *) cvt->buf;
    int remaining;

    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U8 (using NEON)");

    /* Scalar lead-in: advance until the output pointer hits a 16-byte boundary. */
    remaining = cvt->len_cvt / sizeof (float);
    while (remaining && (((size_t) out) & 15)) {
        const float sample = *in;
        if (sample <= -1.0f) {
            *out = 0;
        } else if (sample >= 1.0f) {
            *out = 255;
        } else {
            *out = (Uint8)((sample + 1.0f) * 127.0f);
        }
        --remaining; ++in; ++out;
    }

    SDL_assert(!remaining || ((((size_t) out) & 15) == 0));

    /* Only take the vector path when the input pointer is aligned as well. */
    if ((((size_t) in) & 15) == 0) {
        const float32x4_t lower = vdupq_n_f32(-1.0f);
        const float32x4_t upper = vdupq_n_f32(1.0f);
        const float32x4_t scale = vdupq_n_f32(127.0f);
        uint8_t *vout = (uint8_t *) out;
        for (; remaining >= 16; remaining -= 16, in += 16, vout += 16) {   /* 16 floats per pass */
            /* All four loads happen before the store, since the buffer is converted in place. */
            const float32x4_t c0 = vminq_f32(vmaxq_f32(lower, vld1q_f32(in)), upper);
            const float32x4_t c1 = vminq_f32(vmaxq_f32(lower, vld1q_f32(in+4)), upper);
            const float32x4_t c2 = vminq_f32(vmaxq_f32(lower, vld1q_f32(in+8)), upper);
            const float32x4_t c3 = vminq_f32(vmaxq_f32(lower, vld1q_f32(in+12)), upper);
            /* Shift into [0, 2], scale, convert to uint32, narrow to uint16. */
            const uint16x4_t n0 = vmovn_u32(vcvtq_u32_f32(vmulq_f32(vaddq_f32(c0, upper), scale)));
            const uint16x4_t n1 = vmovn_u32(vcvtq_u32_f32(vmulq_f32(vaddq_f32(c1, upper), scale)));
            const uint16x4_t n2 = vmovn_u32(vcvtq_u32_f32(vmulq_f32(vaddq_f32(c2, upper), scale)));
            const uint16x4_t n3 = vmovn_u32(vcvtq_u32_f32(vmulq_f32(vaddq_f32(c3, upper), scale)));
            /* Pair up, narrow to uint8, and store all 16 bytes. */
            const uint8x8_t bytes_lo = vmovn_u16(vcombine_u16(n0, n1));
            const uint8x8_t bytes_hi = vmovn_u16(vcombine_u16(n2, n3));
            vst1q_u8(vout, vcombine_u8(bytes_lo, bytes_hi));
        }
        out = (Uint8 *) vout;
    }

    /* Scalar tail. */
    for (; remaining; --remaining, ++in, ++out) {
        const float sample = *in;
        if (sample <= -1.0f) {
            *out = 0;
        } else if (sample >= 1.0f) {
            *out = 255;
        } else {
            *out = (Uint8)((sample + 1.0f) * 127.0f);
        }
    }

    cvt->len_cvt /= 4;
    ++cvt->filter_index;
    if (cvt->filters[cvt->filter_index]) {
        cvt->filters[cvt->filter_index](cvt, AUDIO_U8);
    }
}
1211 
/* Convert the float32 samples in cvt->buf to signed 16-bit samples, in place.
   Scalar code runs until the output pointer is 16-byte aligned; if the input
   pointer is then aligned as well, 8 samples at a time go through NEON; the
   rest is finished with scalar code. Afterwards cvt->len_cvt is halved and
   the next filter in the chain (if any) is invoked with AUDIO_S16SYS. */
static void SDLCALL
SDL_Convert_F32_to_S16_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{
    const float *in = (const float *) cvt->buf;
    Sint16 *out = (Sint16 *) cvt->buf;
    int remaining;

    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S16 (using NEON)");

    /* Scalar lead-in: advance until the output pointer hits a 16-byte boundary. */
    remaining = cvt->len_cvt / sizeof (float);
    while (remaining && (((size_t) out) & 15)) {
        const float sample = *in;
        if (sample <= -1.0f) {
            *out = -32768;
        } else if (sample >= 1.0f) {
            *out = 32767;
        } else {
            *out = (Sint16)(sample * 32767.0f);
        }
        --remaining; ++in; ++out;
    }

    SDL_assert(!remaining || ((((size_t) out) & 15) == 0));

    /* Only take the vector path when the input pointer is aligned as well. */
    if ((((size_t) in) & 15) == 0) {
        const float32x4_t lower = vdupq_n_f32(-1.0f);
        const float32x4_t upper = vdupq_n_f32(1.0f);
        const float32x4_t scale = vdupq_n_f32(32767.0f);
        int16_t *vout = (int16_t *) out;
        for (; remaining >= 8; remaining -= 8, in += 8, vout += 8) {   /* 8 floats per pass */
            /* Both loads happen before the store, since the buffer is converted in place. */
            const float32x4_t c0 = vminq_f32(vmaxq_f32(lower, vld1q_f32(in)), upper);
            const float32x4_t c1 = vminq_f32(vmaxq_f32(lower, vld1q_f32(in+4)), upper);
            /* Scale, convert to sint32, narrow to sint16. */
            const int16x4_t n0 = vmovn_s32(vcvtq_s32_f32(vmulq_f32(c0, scale)));
            const int16x4_t n1 = vmovn_s32(vcvtq_s32_f32(vmulq_f32(c1, scale)));
            vst1q_s16(vout, vcombine_s16(n0, n1));
        }
        out = (Sint16 *) vout;
    }

    /* Scalar tail. */
    for (; remaining; --remaining, ++in, ++out) {
        const float sample = *in;
        if (sample <= -1.0f) {
            *out = -32768;
        } else if (sample >= 1.0f) {
            *out = 32767;
        } else {
            *out = (Sint16)(sample * 32767.0f);
        }
    }

    cvt->len_cvt /= 2;
    ++cvt->filter_index;
    if (cvt->filters[cvt->filter_index]) {
        cvt->filters[cvt->filter_index](cvt, AUDIO_S16SYS);
    }
}
1269 
/* Convert the float32 samples in cvt->buf to unsigned 16-bit samples, in
   place. Scalar code runs until the output pointer is 16-byte aligned; if the
   input pointer is then aligned as well, 8 samples at a time go through NEON;
   the rest is finished with scalar code. Afterwards cvt->len_cvt is halved
   and the next filter in the chain (if any) is invoked with AUDIO_U16SYS. */
static void SDLCALL
SDL_Convert_F32_to_U16_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{
    const float *in = (const float *) cvt->buf;
    Uint16 *out = (Uint16 *) cvt->buf;
    int remaining;

    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U16 (using NEON)");

    /* Scalar lead-in: advance until the output pointer hits a 16-byte boundary. */
    remaining = cvt->len_cvt / sizeof (float);
    while (remaining && (((size_t) out) & 15)) {
        const float sample = *in;
        if (sample <= -1.0f) {
            *out = 0;
        } else if (sample >= 1.0f) {
            *out = 65535;
        } else {
            *out = (Uint16)((sample + 1.0f) * 32767.0f);
        }
        --remaining; ++in; ++out;
    }

    SDL_assert(!remaining || ((((size_t) out) & 15) == 0));

    /* Only take the vector path when the input pointer is aligned as well. */
    if ((((size_t) in) & 15) == 0) {
        const float32x4_t lower = vdupq_n_f32(-1.0f);
        const float32x4_t upper = vdupq_n_f32(1.0f);
        const float32x4_t scale = vdupq_n_f32(32767.0f);
        uint16_t *vout = (uint16_t *) out;
        for (; remaining >= 8; remaining -= 8, in += 8, vout += 8) {   /* 8 floats per pass */
            /* Both loads happen before the store, since the buffer is converted in place. */
            const float32x4_t c0 = vminq_f32(vmaxq_f32(lower, vld1q_f32(in)), upper);
            const float32x4_t c1 = vminq_f32(vmaxq_f32(lower, vld1q_f32(in+4)), upper);
            /* Shift into [0, 2], scale, convert to uint32, narrow to uint16. */
            const uint16x4_t n0 = vmovn_u32(vcvtq_u32_f32(vmulq_f32(vaddq_f32(c0, upper), scale)));
            const uint16x4_t n1 = vmovn_u32(vcvtq_u32_f32(vmulq_f32(vaddq_f32(c1, upper), scale)));
            vst1q_u16(vout, vcombine_u16(n0, n1));
        }
        out = (Uint16 *) vout;
    }

    /* Scalar tail. */
    for (; remaining; --remaining, ++in, ++out) {
        const float sample = *in;
        if (sample <= -1.0f) {
            *out = 0;
        } else if (sample >= 1.0f) {
            *out = 65535;
        } else {
            *out = (Uint16)((sample + 1.0f) * 32767.0f);
        }
    }

    cvt->len_cvt /= 2;
    ++cvt->filter_index;
    if (cvt->filters[cvt->filter_index]) {
        cvt->filters[cvt->filter_index](cvt, AUDIO_U16SYS);
    }
}
1327 
/* Convert the float32 samples in cvt->buf to signed 32-bit samples, in place.
   Input and output samples have the same size, so cvt->len_cvt is unchanged
   and both pointers reach 16-byte alignment together. Leading samples are
   converted with scalar code until the pointers are aligned, groups of 4 go
   through NEON, and any leftovers are finished with scalar code. The next
   filter in the chain (if any) is then invoked with AUDIO_S32SYS.

   In-range samples are scaled to 24 bits and then moved up by 8 bits. The
   scalar paths multiply by 256 instead of using "<< 8": left-shifting a
   negative signed integer is undefined behavior in C, while the product is
   well defined here because the 24-bit magnitude times 256 fits in Sint32. */
static void SDLCALL
SDL_Convert_F32_to_S32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{
    const float *src = (const float *) cvt->buf;
    Sint32 *dst = (Sint32 *) cvt->buf;
    int i;

    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S32 (using NEON)");

    /* Get dst aligned to 16 bytes */
    for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
        const float sample = *src;
        if (sample >= 1.0f) {
            *dst = 2147483647;
        } else if (sample <= -1.0f) {
            *dst = (-2147483647) - 1;
        } else {
            *dst = ((Sint32)(sample * 8388607.0f)) * 256;
        }
    }

    /* src and dst advance in lockstep by 4 bytes, so they share alignment. */
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
    SDL_assert(!i || ((((size_t) src) & 15) == 0));

    {
        /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
        const float32x4_t one = vdupq_n_f32(1.0f);
        const float32x4_t negone = vdupq_n_f32(-1.0f);
        const float32x4_t mulby8388607 = vdupq_n_f32(8388607.0f);
        int32_t *mmdst = (int32_t *) dst;
        while (i >= 4) {   /* 4 * float32 */
            /* load 4 floats, clamp to [-1, 1], scale, convert to sint32, shift up 8 bits, store. */
            vst1q_s32(mmdst, vshlq_n_s32(vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src)), one), mulby8388607)), 8));
            i -= 4; src += 4; mmdst += 4;
        }
        dst = (Sint32 *) mmdst;
    }

    /* Finish off any leftovers with scalar operations. */
    while (i) {
        const float sample = *src;
        if (sample >= 1.0f) {
            *dst = 2147483647;
        } else if (sample <= -1.0f) {
            *dst = (-2147483647) - 1;
        } else {
            *dst = ((Sint32)(sample * 8388607.0f)) * 256;
        }
        i--; src++; dst++;
    }

    if (cvt->filters[++cvt->filter_index]) {
        cvt->filters[cvt->filter_index](cvt, AUDIO_S32SYS);
    }
}
1382 #endif
1383 
1384 
1385 
SDL_ChooseAudioConverters(void)1386 void SDL_ChooseAudioConverters(void)
1387 {
1388     static SDL_bool converters_chosen = SDL_FALSE;
1389 
1390     if (converters_chosen) {
1391         return;
1392     }
1393 
1394 #define SET_CONVERTER_FUNCS(fntype) \
1395         SDL_Convert_S8_to_F32 = SDL_Convert_S8_to_F32_##fntype; \
1396         SDL_Convert_U8_to_F32 = SDL_Convert_U8_to_F32_##fntype; \
1397         SDL_Convert_S16_to_F32 = SDL_Convert_S16_to_F32_##fntype; \
1398         SDL_Convert_U16_to_F32 = SDL_Convert_U16_to_F32_##fntype; \
1399         SDL_Convert_S32_to_F32 = SDL_Convert_S32_to_F32_##fntype; \
1400         SDL_Convert_F32_to_S8 = SDL_Convert_F32_to_S8_##fntype; \
1401         SDL_Convert_F32_to_U8 = SDL_Convert_F32_to_U8_##fntype; \
1402         SDL_Convert_F32_to_S16 = SDL_Convert_F32_to_S16_##fntype; \
1403         SDL_Convert_F32_to_U16 = SDL_Convert_F32_to_U16_##fntype; \
1404         SDL_Convert_F32_to_S32 = SDL_Convert_F32_to_S32_##fntype; \
1405         converters_chosen = SDL_TRUE
1406 
1407 #if HAVE_SSE2_INTRINSICS
1408     if (SDL_HasSSE2()) {
1409         SET_CONVERTER_FUNCS(SSE2);
1410         return;
1411     }
1412 #endif
1413 
1414 #if HAVE_NEON_INTRINSICS
1415     if (SDL_HasNEON()) {
1416         SET_CONVERTER_FUNCS(NEON);
1417         return;
1418     }
1419 #endif
1420 
1421 #if NEED_SCALAR_CONVERTER_FALLBACKS
1422     SET_CONVERTER_FUNCS(Scalar);
1423 #endif
1424 
1425 #undef SET_CONVERTER_FUNCS
1426 
1427     SDL_assert(converters_chosen == SDL_TRUE);
1428 }
1429 
1430 /* vi: set ts=4 sw=4 expandtab: */
1431