sdl

FORK: Simple DirectMedia Layer
git clone https://git.neptards.moe/neptards/sdl.git
Log | Files | Refs

SDL_audiotypecvt.c (54841B)


      1 /*
      2   Simple DirectMedia Layer
      3   Copyright (C) 1997-2020 Sam Lantinga <slouken@libsdl.org>
      4 
      5   This software is provided 'as-is', without any express or implied
      6   warranty.  In no event will the authors be held liable for any damages
      7   arising from the use of this software.
      8 
      9   Permission is granted to anyone to use this software for any purpose,
     10   including commercial applications, and to alter it and redistribute it
     11   freely, subject to the following restrictions:
     12 
     13   1. The origin of this software must not be misrepresented; you must not
     14      claim that you wrote the original software. If you use this software
     15      in a product, an acknowledgment in the product documentation would be
     16      appreciated but is not required.
     17   2. Altered source versions must be plainly marked as such, and must not be
     18      misrepresented as being the original software.
     19   3. This notice may not be removed or altered from any source distribution.
     20 */
     21 #include "../SDL_internal.h"
     22 
     23 #include "SDL_audio.h"
     24 #include "SDL_audio_c.h"
     25 #include "SDL_cpuinfo.h"
     26 
     27 #ifdef __ARM_NEON
     28 #define HAVE_NEON_INTRINSICS 1  /* compiler is targeting ARM NEON (__ARM_NEON is defined). */
     29 #endif
     30 
     31 #ifdef __SSE2__
     32 #define HAVE_SSE2_INTRINSICS 1  /* compiler is targeting SSE2 (__SSE2__ is defined). */
     33 #endif
     34 
     35 #if defined(__x86_64__) && HAVE_SSE2_INTRINSICS
     36 #define NEED_SCALAR_CONVERTER_FALLBACKS 0  /* x86_64 guarantees SSE2. */
     37 #elif __MACOSX__ && HAVE_SSE2_INTRINSICS
     38 #define NEED_SCALAR_CONVERTER_FALLBACKS 0  /* Mac OS X/Intel guarantees SSE2. */
     39 #elif defined(__ARM_ARCH) && (__ARM_ARCH >= 8) && HAVE_NEON_INTRINSICS
     40 #define NEED_SCALAR_CONVERTER_FALLBACKS 0  /* ARMv8+ promise NEON. */
     41 #elif defined(__APPLE__) && defined(__ARM_ARCH) && (__ARM_ARCH >= 7) && HAVE_NEON_INTRINSICS
     42 #define NEED_SCALAR_CONVERTER_FALLBACKS 0  /* All Apple ARMv7 chips promise NEON support. */
     43 #endif
     44 
     45 /* Default: build the scalar converters. The checks above have already set this to zero on platforms where a SIMD codepath is guaranteed to be used, in which case the *_Scalar functions below are compiled out. */
     46 #ifndef NEED_SCALAR_CONVERTER_FALLBACKS
     47 #define NEED_SCALAR_CONVERTER_FALLBACKS 1
     48 #endif
     49 
     50 /* Converter entry points, one per sample-format pair (to and from float32). Each starts out NULL and is meant to be pointed at a CPU-specific implementation; NOTE(review): the code that assigns them is not visible in this part of the file - confirm where that happens. */
     51 SDL_AudioFilter SDL_Convert_S8_to_F32 = NULL;
     52 SDL_AudioFilter SDL_Convert_U8_to_F32 = NULL;
     53 SDL_AudioFilter SDL_Convert_S16_to_F32 = NULL;
     54 SDL_AudioFilter SDL_Convert_U16_to_F32 = NULL;
     55 SDL_AudioFilter SDL_Convert_S32_to_F32 = NULL;
     56 SDL_AudioFilter SDL_Convert_F32_to_S8 = NULL;
     57 SDL_AudioFilter SDL_Convert_F32_to_U8 = NULL;
     58 SDL_AudioFilter SDL_Convert_F32_to_S16 = NULL;
     59 SDL_AudioFilter SDL_Convert_F32_to_U16 = NULL;
     60 SDL_AudioFilter SDL_Convert_F32_to_S32 = NULL;
     61 
     62 
     63 #define DIVBY128 0.0078125f  /* 1.0 / 128: scales 8-bit samples to float. */
     64 #define DIVBY32768 0.000030517578125f  /* 1.0 / 32768: scales 16-bit samples to float. */
     65 #define DIVBY8388607 0.00000011920930376163766f  /* 1.0 / 8388607: scales 32-bit samples (after dropping the low 8 bits) to float. */
     66 
     67 
     68 #if NEED_SCALAR_CONVERTER_FALLBACKS
     69 static void SDLCALL
     70 SDL_Convert_S8_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
     71 {
     72     const Sint8 *src = ((const Sint8 *) (cvt->buf + cvt->len_cvt)) - 1;
     73     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
     74     int i;
     75 
     76     LOG_DEBUG_CONVERT("AUDIO_S8", "AUDIO_F32");
     77 
     78     for (i = cvt->len_cvt; i; --i, --src, --dst) {
     79         *dst = ((float) *src) * DIVBY128;
     80     }
     81 
     82     cvt->len_cvt *= 4;
     83     if (cvt->filters[++cvt->filter_index]) {
     84         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
     85     }
     86 }
     87 
     88 static void SDLCALL
     89 SDL_Convert_U8_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
     90 {
     91     const Uint8 *src = ((const Uint8 *) (cvt->buf + cvt->len_cvt)) - 1;
     92     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
     93     int i;
     94 
     95     LOG_DEBUG_CONVERT("AUDIO_U8", "AUDIO_F32");
     96 
     97     for (i = cvt->len_cvt; i; --i, --src, --dst) {
     98         *dst = (((float) *src) * DIVBY128) - 1.0f;
     99     }
    100 
    101     cvt->len_cvt *= 4;
    102     if (cvt->filters[++cvt->filter_index]) {
    103         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
    104     }
    105 }
    106 
    107 static void SDLCALL
    108 SDL_Convert_S16_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
    109 {
    110     const Sint16 *src = ((const Sint16 *) (cvt->buf + cvt->len_cvt)) - 1;
    111     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
    112     int i;
    113 
    114     LOG_DEBUG_CONVERT("AUDIO_S16", "AUDIO_F32");
    115 
    116     for (i = cvt->len_cvt / sizeof (Sint16); i; --i, --src, --dst) {
    117         *dst = ((float) *src) * DIVBY32768;
    118     }
    119 
    120     cvt->len_cvt *= 2;
    121     if (cvt->filters[++cvt->filter_index]) {
    122         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
    123     }
    124 }
    125 
    126 static void SDLCALL
    127 SDL_Convert_U16_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
    128 {
    129     const Uint16 *src = ((const Uint16 *) (cvt->buf + cvt->len_cvt)) - 1;
    130     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
    131     int i;
    132 
    133     LOG_DEBUG_CONVERT("AUDIO_U16", "AUDIO_F32");
    134 
    135     for (i = cvt->len_cvt / sizeof (Uint16); i; --i, --src, --dst) {
    136         *dst = (((float) *src) * DIVBY32768) - 1.0f;
    137     }
    138 
    139     cvt->len_cvt *= 2;
    140     if (cvt->filters[++cvt->filter_index]) {
    141         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
    142     }
    143 }
    144 
    145 static void SDLCALL
    146 SDL_Convert_S32_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
    147 {
    148     const Sint32 *src = (const Sint32 *) cvt->buf;
    149     float *dst = (float *) cvt->buf;
    150     int i;
    151 
    152     LOG_DEBUG_CONVERT("AUDIO_S32", "AUDIO_F32");
    153 
    154     for (i = cvt->len_cvt / sizeof (Sint32); i; --i, ++src, ++dst) {
    155         *dst = ((float) (*src>>8)) * DIVBY8388607;
    156     }
    157 
    158     if (cvt->filters[++cvt->filter_index]) {
    159         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
    160     }
    161 }
    162 
    163 static void SDLCALL
    164 SDL_Convert_F32_to_S8_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
    165 {
    166     const float *src = (const float *) cvt->buf;
    167     Sint8 *dst = (Sint8 *) cvt->buf;
    168     int i;
    169 
    170     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S8");
    171 
    172     for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
    173         const float sample = *src;
    174         if (sample >= 1.0f) {
    175             *dst = 127;
    176         } else if (sample <= -1.0f) {
    177             *dst = -128;
    178         } else {
    179             *dst = (Sint8)(sample * 127.0f);
    180         }
    181     }
    182 
    183     cvt->len_cvt /= 4;
    184     if (cvt->filters[++cvt->filter_index]) {
    185         cvt->filters[cvt->filter_index](cvt, AUDIO_S8);
    186     }
    187 }
    188 
    189 static void SDLCALL
    190 SDL_Convert_F32_to_U8_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
    191 {
    192     const float *src = (const float *) cvt->buf;
    193     Uint8 *dst = (Uint8 *) cvt->buf;
    194     int i;
    195 
    196     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U8");
    197 
    198     for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
    199         const float sample = *src;
    200         if (sample >= 1.0f) {
    201             *dst = 255;
    202         } else if (sample <= -1.0f) {
    203             *dst = 0;
    204         } else {
    205             *dst = (Uint8)((sample + 1.0f) * 127.0f);
    206         }
    207     }
    208 
    209     cvt->len_cvt /= 4;
    210     if (cvt->filters[++cvt->filter_index]) {
    211         cvt->filters[cvt->filter_index](cvt, AUDIO_U8);
    212     }
    213 }
    214 
    215 static void SDLCALL
    216 SDL_Convert_F32_to_S16_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
    217 {
    218     const float *src = (const float *) cvt->buf;
    219     Sint16 *dst = (Sint16 *) cvt->buf;
    220     int i;
    221 
    222     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S16");
    223 
    224     for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
    225         const float sample = *src;
    226         if (sample >= 1.0f) {
    227             *dst = 32767;
    228         } else if (sample <= -1.0f) {
    229             *dst = -32768;
    230         } else {
    231             *dst = (Sint16)(sample * 32767.0f);
    232         }
    233     }
    234 
    235     cvt->len_cvt /= 2;
    236     if (cvt->filters[++cvt->filter_index]) {
    237         cvt->filters[cvt->filter_index](cvt, AUDIO_S16SYS);
    238     }
    239 }
    240 
    241 static void SDLCALL
    242 SDL_Convert_F32_to_U16_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
    243 {
    244     const float *src = (const float *) cvt->buf;
    245     Uint16 *dst = (Uint16 *) cvt->buf;
    246     int i;
    247 
    248     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U16");
    249 
    250     for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
    251         const float sample = *src;
    252         if (sample >= 1.0f) {
    253             *dst = 65535;
    254         } else if (sample <= -1.0f) {
    255             *dst = 0;
    256         } else {
    257             *dst = (Uint16)((sample + 1.0f) * 32767.0f);
    258         }
    259     }
    260 
    261     cvt->len_cvt /= 2;
    262     if (cvt->filters[++cvt->filter_index]) {
    263         cvt->filters[cvt->filter_index](cvt, AUDIO_U16SYS);
    264     }
    265 }
    266 
    267 static void SDLCALL
    268 SDL_Convert_F32_to_S32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
    269 {
    270     const float *src = (const float *) cvt->buf;
    271     Sint32 *dst = (Sint32 *) cvt->buf;
    272     int i;
    273 
    274     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S32");
    275 
    276     for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
    277         const float sample = *src;
    278         if (sample >= 1.0f) {
    279             *dst = 2147483647;
    280         } else if (sample <= -1.0f) {
    281             *dst = (Sint32) -2147483648LL;
    282         } else {
    283             *dst = ((Sint32)(sample * 8388607.0f)) << 8;
    284         }
    285     }
    286 
    287     if (cvt->filters[++cvt->filter_index]) {
    288         cvt->filters[cvt->filter_index](cvt, AUDIO_S32SYS);
    289     }
    290 }
    291 #endif
    292 
    293 
    294 #if HAVE_SSE2_INTRINSICS
    295 static void SDLCALL
    296 SDL_Convert_S8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
    297 {
    298     const Sint8 *src = ((const Sint8 *) (cvt->buf + cvt->len_cvt)) - 1;
    299     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
    300     int i;
    301 
    302     LOG_DEBUG_CONVERT("AUDIO_S8", "AUDIO_F32 (using SSE2)");
    303 
    304     /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
    305     for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
    306         *dst = ((float) *src) * DIVBY128;
    307     }
    308 
    309     src -= 15; dst -= 15;  /* adjust to read SSE blocks from the start. */
    310     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
    311 
    312     /* Make sure src is aligned too. */
    313     if ((((size_t) src) & 15) == 0) {
    314         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
    315         const __m128i *mmsrc = (const __m128i *) src;
    316         const __m128i zero = _mm_setzero_si128();
    317         const __m128 divby128 = _mm_set1_ps(DIVBY128);
    318         while (i >= 16) {   /* 16 * 8-bit */
    319             const __m128i bytes = _mm_load_si128(mmsrc);  /* get 16 sint8 into an XMM register. */
    320             /* treat as int16, shift left to clear every other sint16, then back right with sign-extend. Now sint16. */
    321             const __m128i shorts1 = _mm_srai_epi16(_mm_slli_epi16(bytes, 8), 8);
    322             /* right-shift-sign-extend gets us sint16 with the other set of values. */
    323             const __m128i shorts2 = _mm_srai_epi16(bytes, 8);
    324             /* unpack against zero to make these int32, shift to make them sign-extend, convert to float, multiply. Whew! */
    325             const __m128 floats1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpacklo_epi16(shorts1, zero), 16), 16)), divby128);
    326             const __m128 floats2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpacklo_epi16(shorts2, zero), 16), 16)), divby128);
    327             const __m128 floats3 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpackhi_epi16(shorts1, zero), 16), 16)), divby128);
    328             const __m128 floats4 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpackhi_epi16(shorts2, zero), 16), 16)), divby128);
    329             /* Interleave back into correct order, store. */
    330             _mm_store_ps(dst, _mm_unpacklo_ps(floats1, floats2));
    331             _mm_store_ps(dst+4, _mm_unpackhi_ps(floats1, floats2));
    332             _mm_store_ps(dst+8, _mm_unpacklo_ps(floats3, floats4));
    333             _mm_store_ps(dst+12, _mm_unpackhi_ps(floats3, floats4));
    334             i -= 16; mmsrc--; dst -= 16;
    335         }
    336 
    337         src = (const Sint8 *) mmsrc;
    338     }
    339 
    340     src += 15; dst += 15;  /* adjust for any scalar finishing. */
    341 
    342     /* Finish off any leftovers with scalar operations. */
    343     while (i) {
    344         *dst = ((float) *src) * DIVBY128;
    345         i--; src--; dst--;
    346     }
    347 
    348     cvt->len_cvt *= 4;
    349     if (cvt->filters[++cvt->filter_index]) {
    350         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
    351     }
    352 }
    353 
    354 static void SDLCALL
    355 SDL_Convert_U8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
    356 {
    357     const Uint8 *src = ((const Uint8 *) (cvt->buf + cvt->len_cvt)) - 1;
    358     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
    359     int i;
    360 
    361     LOG_DEBUG_CONVERT("AUDIO_U8", "AUDIO_F32 (using SSE2)");
    362 
    363     /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
    364     for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
    365         *dst = (((float) *src) * DIVBY128) - 1.0f;
    366     }
    367 
    368     src -= 15; dst -= 15;  /* adjust to read SSE blocks from the start. */
    369     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
    370 
    371     /* Make sure src is aligned too. */
    372     if ((((size_t) src) & 15) == 0) {
    373         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
    374         const __m128i *mmsrc = (const __m128i *) src;
    375         const __m128i zero = _mm_setzero_si128();
    376         const __m128 divby128 = _mm_set1_ps(DIVBY128);
    377         const __m128 minus1 = _mm_set1_ps(-1.0f);
    378         while (i >= 16) {   /* 16 * 8-bit */
    379             const __m128i bytes = _mm_load_si128(mmsrc);  /* get 16 uint8 into an XMM register. */
    380             /* treat as int16, shift left to clear every other sint16, then back right with zero-extend. Now uint16. */
    381             const __m128i shorts1 = _mm_srli_epi16(_mm_slli_epi16(bytes, 8), 8);
    382             /* right-shift-zero-extend gets us uint16 with the other set of values. */
    383             const __m128i shorts2 = _mm_srli_epi16(bytes, 8);
    384             /* unpack against zero to make these int32, convert to float, multiply, add. Whew! */
    385             /* Note that AVX2 can do floating point multiply+add in one instruction, fwiw. SSE2 cannot. */
    386             const __m128 floats1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts1, zero)), divby128), minus1);
    387             const __m128 floats2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts2, zero)), divby128), minus1);
    388             const __m128 floats3 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts1, zero)), divby128), minus1);
    389             const __m128 floats4 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts2, zero)), divby128), minus1);
    390             /* Interleave back into correct order, store. */
    391             _mm_store_ps(dst, _mm_unpacklo_ps(floats1, floats2));
    392             _mm_store_ps(dst+4, _mm_unpackhi_ps(floats1, floats2));
    393             _mm_store_ps(dst+8, _mm_unpacklo_ps(floats3, floats4));
    394             _mm_store_ps(dst+12, _mm_unpackhi_ps(floats3, floats4));
    395             i -= 16; mmsrc--; dst -= 16;
    396         }
    397 
    398         src = (const Uint8 *) mmsrc;
    399     }
    400 
    401     src += 15; dst += 15;  /* adjust for any scalar finishing. */
    402 
    403     /* Finish off any leftovers with scalar operations. */
    404     while (i) {
    405         *dst = (((float) *src) * DIVBY128) - 1.0f;
    406         i--; src--; dst--;
    407     }
    408 
    409     cvt->len_cvt *= 4;
    410     if (cvt->filters[++cvt->filter_index]) {
    411         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
    412     }
    413 }
    414 
    415 static void SDLCALL
    416 SDL_Convert_S16_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
    417 {
    418     const Sint16 *src = ((const Sint16 *) (cvt->buf + cvt->len_cvt)) - 1;
    419     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
    420     int i;
    421 
    422     LOG_DEBUG_CONVERT("AUDIO_S16", "AUDIO_F32 (using SSE2)");
    423 
    424     /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
    425     for (i = cvt->len_cvt / sizeof (Sint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
    426         *dst = ((float) *src) * DIVBY32768;
    427     }
    428 
    429     src -= 7; dst -= 7;  /* adjust to read SSE blocks from the start. */
    430     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
    431 
    432     /* Make sure src is aligned too. */
    433     if ((((size_t) src) & 15) == 0) {
    434         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
    435         const __m128 divby32768 = _mm_set1_ps(DIVBY32768);
    436         while (i >= 8) {   /* 8 * 16-bit */
    437             const __m128i ints = _mm_load_si128((__m128i const *) src);  /* get 8 sint16 into an XMM register. */
    438             /* treat as int32, shift left to clear every other sint16, then back right with sign-extend. Now sint32. */
    439             const __m128i a = _mm_srai_epi32(_mm_slli_epi32(ints, 16), 16);
    440             /* right-shift-sign-extend gets us sint32 with the other set of values. */
    441             const __m128i b = _mm_srai_epi32(ints, 16);
    442             /* Interleave these back into the right order, convert to float, multiply, store. */
    443             _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi32(a, b)), divby32768));
    444             _mm_store_ps(dst+4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi32(a, b)), divby32768));
    445             i -= 8; src -= 8; dst -= 8;
    446         }
    447     }
    448 
    449     src += 7; dst += 7;  /* adjust for any scalar finishing. */
    450 
    451     /* Finish off any leftovers with scalar operations. */
    452     while (i) {
    453         *dst = ((float) *src) * DIVBY32768;
    454         i--; src--; dst--;
    455     }
    456 
    457     cvt->len_cvt *= 2;
    458     if (cvt->filters[++cvt->filter_index]) {
    459         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
    460     }
    461 }
    462 
    463 static void SDLCALL
    464 SDL_Convert_U16_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
    465 {
    466     const Uint16 *src = ((const Uint16 *) (cvt->buf + cvt->len_cvt)) - 1;
    467     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
    468     int i;
    469 
    470     LOG_DEBUG_CONVERT("AUDIO_U16", "AUDIO_F32 (using SSE2)");
    471 
    472     /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
    473     for (i = cvt->len_cvt / sizeof (Sint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
    474         *dst = (((float) *src) * DIVBY32768) - 1.0f;
    475     }
    476 
    477     src -= 7; dst -= 7;  /* adjust to read SSE blocks from the start. */
    478     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
    479 
    480     /* Make sure src is aligned too. */
    481     if ((((size_t) src) & 15) == 0) {
    482         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
    483         const __m128 divby32768 = _mm_set1_ps(DIVBY32768);
    484         const __m128 minus1 = _mm_set1_ps(-1.0f);
    485         while (i >= 8) {   /* 8 * 16-bit */
    486             const __m128i ints = _mm_load_si128((__m128i const *) src);  /* get 8 sint16 into an XMM register. */
    487             /* treat as int32, shift left to clear every other sint16, then back right with zero-extend. Now sint32. */
    488             const __m128i a = _mm_srli_epi32(_mm_slli_epi32(ints, 16), 16);
    489             /* right-shift-sign-extend gets us sint32 with the other set of values. */
    490             const __m128i b = _mm_srli_epi32(ints, 16);
    491             /* Interleave these back into the right order, convert to float, multiply, store. */
    492             _mm_store_ps(dst, _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi32(a, b)), divby32768), minus1));
    493             _mm_store_ps(dst+4, _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi32(a, b)), divby32768), minus1));
    494             i -= 8; src -= 8; dst -= 8;
    495         }
    496     }
    497 
    498     src += 7; dst += 7;  /* adjust for any scalar finishing. */
    499 
    500     /* Finish off any leftovers with scalar operations. */
    501     while (i) {
    502         *dst = (((float) *src) * DIVBY32768) - 1.0f;
    503         i--; src--; dst--;
    504     }
    505 
    506     cvt->len_cvt *= 2;
    507     if (cvt->filters[++cvt->filter_index]) {
    508         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
    509     }
    510 }
    511 
    512 static void SDLCALL
    513 SDL_Convert_S32_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
    514 {
    515     const Sint32 *src = (const Sint32 *) cvt->buf;
    516     float *dst = (float *) cvt->buf;
    517     int i;
    518 
    519     LOG_DEBUG_CONVERT("AUDIO_S32", "AUDIO_F32 (using SSE2)");
    520 
    521     /* Get dst aligned to 16 bytes */
    522     for (i = cvt->len_cvt / sizeof (Sint32); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
    523         *dst = ((float) (*src>>8)) * DIVBY8388607;
    524     }
    525 
    526     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
    527 
    528     /* Make sure src is aligned too. */
    529     if ((((size_t) src) & 15) == 0) {
    530         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
    531         const __m128 divby8388607 = _mm_set1_ps(DIVBY8388607);
    532         const __m128i *mmsrc = (const __m128i *) src;
    533         while (i >= 4) {   /* 4 * sint32 */
    534             /* shift out lowest bits so int fits in a float32. Small precision loss, but much faster. */
    535             _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_load_si128(mmsrc), 8)), divby8388607));
    536             i -= 4; mmsrc++; dst += 4;
    537         }
    538         src = (const Sint32 *) mmsrc;
    539     }
    540 
    541     /* Finish off any leftovers with scalar operations. */
    542     while (i) {
    543         *dst = ((float) (*src>>8)) * DIVBY8388607;
    544         i--; src++; dst++;
    545     }
    546 
    547     if (cvt->filters[++cvt->filter_index]) {
    548         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
    549     }
    550 }
    551 
    552 static void SDLCALL
    553 SDL_Convert_F32_to_S8_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
    554 {
    555     const float *src = (const float *) cvt->buf;
    556     Sint8 *dst = (Sint8 *) cvt->buf;
    557     int i;
    558 
    559     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S8 (using SSE2)");
    560 
    561     /* Get dst aligned to 16 bytes */
    562     for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
    563         const float sample = *src;
    564         if (sample >= 1.0f) {
    565             *dst = 127;
    566         } else if (sample <= -1.0f) {
    567             *dst = -128;
    568         } else {
    569             *dst = (Sint8)(sample * 127.0f);
    570         }
    571     }
    572 
    573     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
    574 
    575     /* Make sure src is aligned too. */
    576     if ((((size_t) src) & 15) == 0) {
    577         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
    578         const __m128 one = _mm_set1_ps(1.0f);
    579         const __m128 negone = _mm_set1_ps(-1.0f);
    580         const __m128 mulby127 = _mm_set1_ps(127.0f);
    581         __m128i *mmdst = (__m128i *) dst;
    582         while (i >= 16) {   /* 16 * float32 */
    583             const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
    584             const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+4)), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
    585             const __m128i ints3 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+8)), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
    586             const __m128i ints4 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+12)), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
    587             _mm_store_si128(mmdst, _mm_packs_epi16(_mm_packs_epi32(ints1, ints2), _mm_packs_epi32(ints3, ints4)));  /* pack down, store out. */
    588             i -= 16; src += 16; mmdst++;
    589         }
    590         dst = (Sint8 *) mmdst;
    591     }
    592 
    593     /* Finish off any leftovers with scalar operations. */
    594     while (i) {
    595         const float sample = *src;
    596         if (sample >= 1.0f) {
    597             *dst = 127;
    598         } else if (sample <= -1.0f) {
    599             *dst = -128;
    600         } else {
    601             *dst = (Sint8)(sample * 127.0f);
    602         }
    603         i--; src++; dst++;
    604     }
    605 
    606     cvt->len_cvt /= 4;
    607     if (cvt->filters[++cvt->filter_index]) {
    608         cvt->filters[cvt->filter_index](cvt, AUDIO_S8);
    609     }
    610 }
    611 
    612 static void SDLCALL
    613 SDL_Convert_F32_to_U8_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
    614 {
    615     const float *src = (const float *) cvt->buf;
    616     Uint8 *dst = cvt->buf;
    617     int i;
    618 
    619     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U8 (using SSE2)");
    620 
    621     /* Get dst aligned to 16 bytes */
    622     for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
    623         const float sample = *src;
    624         if (sample >= 1.0f) {
    625             *dst = 255;
    626         } else if (sample <= -1.0f) {
    627             *dst = 0;
    628         } else {
    629             *dst = (Uint8)((sample + 1.0f) * 127.0f);
    630         }
    631     }
    632 
    633     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
    634 
    635     /* Make sure src is aligned too. */
    636     if ((((size_t) src) & 15) == 0) {
    637         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
    638         const __m128 one = _mm_set1_ps(1.0f);
    639         const __m128 negone = _mm_set1_ps(-1.0f);
    640         const __m128 mulby127 = _mm_set1_ps(127.0f);
    641         __m128i *mmdst = (__m128i *) dst;
    642         while (i >= 16) {   /* 16 * float32 */
    643             const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
    644             const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+4)), one), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
    645             const __m128i ints3 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+8)), one), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
    646             const __m128i ints4 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+12)), one), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
    647             _mm_store_si128(mmdst, _mm_packus_epi16(_mm_packs_epi32(ints1, ints2), _mm_packs_epi32(ints3, ints4)));  /* pack down, store out. */
    648             i -= 16; src += 16; mmdst++;
    649         }
    650         dst = (Uint8 *) mmdst;
    651     }
    652 
    653     /* Finish off any leftovers with scalar operations. */
    654     while (i) {
    655         const float sample = *src;
    656         if (sample >= 1.0f) {
    657             *dst = 255;
    658         } else if (sample <= -1.0f) {
    659             *dst = 0;
    660         } else {
    661             *dst = (Uint8)((sample + 1.0f) * 127.0f);
    662         }
    663         i--; src++; dst++;
    664     }
    665 
    666     cvt->len_cvt /= 4;
    667     if (cvt->filters[++cvt->filter_index]) {
    668         cvt->filters[cvt->filter_index](cvt, AUDIO_U8);
    669     }
    670 }
    671 
    672 static void SDLCALL
    673 SDL_Convert_F32_to_S16_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
    674 {
    675     const float *src = (const float *) cvt->buf;
    676     Sint16 *dst = (Sint16 *) cvt->buf;
    677     int i;
    678 
    679     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S16 (using SSE2)");
    680 
    681     /* Get dst aligned to 16 bytes */
    682     for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
    683         const float sample = *src;
    684         if (sample >= 1.0f) {
    685             *dst = 32767;
    686         } else if (sample <= -1.0f) {
    687             *dst = -32768;
    688         } else {
    689             *dst = (Sint16)(sample * 32767.0f);
    690         }
    691     }
    692 
    693     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
    694 
    695     /* Make sure src is aligned too. */
    696     if ((((size_t) src) & 15) == 0) {
    697         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
    698         const __m128 one = _mm_set1_ps(1.0f);
    699         const __m128 negone = _mm_set1_ps(-1.0f);
    700         const __m128 mulby32767 = _mm_set1_ps(32767.0f);
    701         __m128i *mmdst = (__m128i *) dst;
    702         while (i >= 8) {   /* 8 * float32 */
    703             const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), mulby32767));  /* load 4 floats, clamp, convert to sint32 */
    704             const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+4)), one), mulby32767));  /* load 4 floats, clamp, convert to sint32 */
    705             _mm_store_si128(mmdst, _mm_packs_epi32(ints1, ints2));  /* pack to sint16, store out. */
    706             i -= 8; src += 8; mmdst++;
    707         }
    708         dst = (Sint16 *) mmdst;
    709     }
    710 
    711     /* Finish off any leftovers with scalar operations. */
    712     while (i) {
    713         const float sample = *src;
    714         if (sample >= 1.0f) {
    715             *dst = 32767;
    716         } else if (sample <= -1.0f) {
    717             *dst = -32768;
    718         } else {
    719             *dst = (Sint16)(sample * 32767.0f);
    720         }
    721         i--; src++; dst++;
    722     }
    723 
    724     cvt->len_cvt /= 2;
    725     if (cvt->filters[++cvt->filter_index]) {
    726         cvt->filters[cvt->filter_index](cvt, AUDIO_S16SYS);
    727     }
    728 }
    729 
    730 static void SDLCALL
    731 SDL_Convert_F32_to_U16_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
    732 {
    733     const float *src = (const float *) cvt->buf;
    734     Uint16 *dst = (Uint16 *) cvt->buf;
    735     int i;
    736 
    737     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U16 (using SSE2)");
    738 
    739     /* Get dst aligned to 16 bytes */
    740     for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
    741         const float sample = *src;
    742         if (sample >= 1.0f) {
    743             *dst = 65535;
    744         } else if (sample <= -1.0f) {
    745             *dst = 0;
    746         } else {
    747             *dst = (Uint16)((sample + 1.0f) * 32767.0f);
    748         }
    749     }
    750 
    751     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
    752 
    753     /* Make sure src is aligned too. */
    754     if ((((size_t) src) & 15) == 0) {
    755         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
    756         /* This calculates differently than the scalar path because SSE2 can't
    757            pack int32 data down to unsigned int16. _mm_packs_epi32 does signed
    758            saturation, so that would corrupt our data. _mm_packus_epi32 exists,
    759            but not before SSE 4.1. So we convert from float to sint16, packing
    760            that down with legit signed saturation, and then xor the top bit
    761            against 1. This results in the correct unsigned 16-bit value, even
    762            though it looks like dark magic. */
    763         const __m128 mulby32767 = _mm_set1_ps(32767.0f);
    764         const __m128i topbit = _mm_set1_epi16(-32768);
    765         const __m128 one = _mm_set1_ps(1.0f);
    766         const __m128 negone = _mm_set1_ps(-1.0f);
    767         __m128i *mmdst = (__m128i *) dst;
    768         while (i >= 8) {   /* 8 * float32 */
    769             const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), mulby32767));  /* load 4 floats, clamp, convert to sint32 */
    770             const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+4)), one), mulby32767));  /* load 4 floats, clamp, convert to sint32 */
    771             _mm_store_si128(mmdst, _mm_xor_si128(_mm_packs_epi32(ints1, ints2), topbit));  /* pack to sint16, xor top bit, store out. */
    772             i -= 8; src += 8; mmdst++;
    773         }
    774         dst = (Uint16 *) mmdst;
    775     }
    776 
    777     /* Finish off any leftovers with scalar operations. */
    778     while (i) {
    779         const float sample = *src;
    780         if (sample >= 1.0f) {
    781             *dst = 65535;
    782         } else if (sample <= -1.0f) {
    783             *dst = 0;
    784         } else {
    785             *dst = (Uint16)((sample + 1.0f) * 32767.0f);
    786         }
    787         i--; src++; dst++;
    788     }
    789 
    790     cvt->len_cvt /= 2;
    791     if (cvt->filters[++cvt->filter_index]) {
    792         cvt->filters[cvt->filter_index](cvt, AUDIO_U16SYS);
    793     }
    794 }
    795 
    796 static void SDLCALL
    797 SDL_Convert_F32_to_S32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
    798 {
    799     const float *src = (const float *) cvt->buf;
    800     Sint32 *dst = (Sint32 *) cvt->buf;
    801     int i;
    802 
    803     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S32 (using SSE2)");
    804 
    805     /* Get dst aligned to 16 bytes */
    806     for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
    807         const float sample = *src;
    808         if (sample >= 1.0f) {
    809             *dst = 2147483647;
    810         } else if (sample <= -1.0f) {
    811             *dst = (Sint32) -2147483648LL;
    812         } else {
    813             *dst = ((Sint32)(sample * 8388607.0f)) << 8;
    814         }
    815     }
    816 
    817     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
    818     SDL_assert(!i || ((((size_t) src) & 15) == 0));
    819 
    820     {
    821         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
    822         const __m128 one = _mm_set1_ps(1.0f);
    823         const __m128 negone = _mm_set1_ps(-1.0f);
    824         const __m128 mulby8388607 = _mm_set1_ps(8388607.0f);
    825         __m128i *mmdst = (__m128i *) dst;
    826         while (i >= 4) {   /* 4 * float32 */
    827             _mm_store_si128(mmdst, _mm_slli_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), mulby8388607)), 8));  /* load 4 floats, clamp, convert to sint32 */
    828             i -= 4; src += 4; mmdst++;
    829         }
    830         dst = (Sint32 *) mmdst;
    831     }
    832 
    833     /* Finish off any leftovers with scalar operations. */
    834     while (i) {
    835         const float sample = *src;
    836         if (sample >= 1.0f) {
    837             *dst = 2147483647;
    838         } else if (sample <= -1.0f) {
    839             *dst = (Sint32) -2147483648LL;
    840         } else {
    841             *dst = ((Sint32)(sample * 8388607.0f)) << 8;
    842         }
    843         i--; src++; dst++;
    844     }
    845 
    846     if (cvt->filters[++cvt->filter_index]) {
    847         cvt->filters[cvt->filter_index](cvt, AUDIO_S32SYS);
    848     }
    849 }
    850 #endif
    851 
    852 
    853 #if HAVE_NEON_INTRINSICS
    854 static void SDLCALL
    855 SDL_Convert_S8_to_F32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
    856 {
    857     const Sint8 *src = ((const Sint8 *) (cvt->buf + cvt->len_cvt)) - 1;
    858     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
    859     int i;
    860 
    861     LOG_DEBUG_CONVERT("AUDIO_S8", "AUDIO_F32 (using NEON)");
    862 
    863     /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
    864     for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
    865         *dst = ((float) *src) * DIVBY128;
    866     }
    867 
    868     src -= 15; dst -= 15;  /* adjust to read NEON blocks from the start. */
    869     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
    870 
    871     /* Make sure src is aligned too. */
    872     if ((((size_t) src) & 15) == 0) {
    873         /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
    874         const int8_t *mmsrc = (const int8_t *) src;
    875         const float32x4_t divby128 = vdupq_n_f32(DIVBY128);
    876         while (i >= 16) {   /* 16 * 8-bit */
    877             const int8x16_t bytes = vld1q_s8(mmsrc);  /* get 16 sint8 into a NEON register. */
    878             const int16x8_t int16hi = vmovl_s8(vget_high_s8(bytes));  /* convert top 8 bytes to 8 int16 */
    879             const int16x8_t int16lo = vmovl_s8(vget_low_s8(bytes));   /* convert bottom 8 bytes to 8 int16 */
    880             /* split int16 to two int32, then convert to float, then multiply to normalize, store. */
    881             vst1q_f32(dst, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(int16lo))), divby128));
    882             vst1q_f32(dst+4, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(int16lo))), divby128));
    883             vst1q_f32(dst+8, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(int16hi))), divby128));
    884             vst1q_f32(dst+12, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(int16hi))), divby128));
    885             i -= 16; mmsrc -= 16; dst -= 16;
    886         }
    887 
    888         src = (const Sint8 *) mmsrc;
    889     }
    890 
    891     src += 15; dst += 15;  /* adjust for any scalar finishing. */
    892 
    893     /* Finish off any leftovers with scalar operations. */
    894     while (i) {
    895         *dst = ((float) *src) * DIVBY128;
    896         i--; src--; dst--;
    897     }
    898 
    899     cvt->len_cvt *= 4;
    900     if (cvt->filters[++cvt->filter_index]) {
    901         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
    902     }
    903 }
    904 
    905 static void SDLCALL
    906 SDL_Convert_U8_to_F32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
    907 {
    908     const Uint8 *src = ((const Uint8 *) (cvt->buf + cvt->len_cvt)) - 1;
    909     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
    910     int i;
    911 
    912     LOG_DEBUG_CONVERT("AUDIO_U8", "AUDIO_F32 (using NEON)");
    913 
    914     /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
    915     for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
    916         *dst = (((float) *src) * DIVBY128) - 1.0f;
    917     }
    918 
    919     src -= 15; dst -= 15;  /* adjust to read NEON blocks from the start. */
    920     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
    921 
    922     /* Make sure src is aligned too. */
    923     if ((((size_t) src) & 15) == 0) {
    924         /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
    925         const uint8_t *mmsrc = (const uint8_t *) src;
    926         const float32x4_t divby128 = vdupq_n_f32(DIVBY128);
    927         const float32x4_t negone = vdupq_n_f32(-1.0f);
    928         while (i >= 16) {   /* 16 * 8-bit */
    929             const uint8x16_t bytes = vld1q_u8(mmsrc);  /* get 16 uint8 into a NEON register. */
    930             const uint16x8_t uint16hi = vmovl_u8(vget_high_u8(bytes));  /* convert top 8 bytes to 8 uint16 */
    931             const uint16x8_t uint16lo = vmovl_u8(vget_low_u8(bytes));   /* convert bottom 8 bytes to 8 uint16 */
    932             /* split uint16 to two uint32, then convert to float, then multiply to normalize, subtract to adjust for sign, store. */
    933             vst1q_f32(dst, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_low_u16(uint16lo))), divby128));
    934             vst1q_f32(dst+4, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_high_u16(uint16lo))), divby128));
    935             vst1q_f32(dst+8, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_low_u16(uint16hi))), divby128));
    936             vst1q_f32(dst+12, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_high_u16(uint16hi))), divby128));
    937             i -= 16; mmsrc -= 16; dst -= 16;
    938         }
    939 
    940         src = (const Uint8 *) mmsrc;
    941     }
    942 
    943     src += 15; dst += 15;  /* adjust for any scalar finishing. */
    944 
    945     /* Finish off any leftovers with scalar operations. */
    946     while (i) {
    947         *dst = (((float) *src) * DIVBY128) - 1.0f;
    948         i--; src--; dst--;
    949     }
    950 
    951     cvt->len_cvt *= 4;
    952     if (cvt->filters[++cvt->filter_index]) {
    953         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
    954     }
    955 }
    956 
    957 static void SDLCALL
    958 SDL_Convert_S16_to_F32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
    959 {
    960     const Sint16 *src = ((const Sint16 *) (cvt->buf + cvt->len_cvt)) - 1;
    961     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
    962     int i;
    963 
    964     LOG_DEBUG_CONVERT("AUDIO_S16", "AUDIO_F32 (using NEON)");
    965 
    966     /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
    967     for (i = cvt->len_cvt / sizeof (Sint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
    968         *dst = ((float) *src) * DIVBY32768;
    969     }
    970 
    971     src -= 7; dst -= 7;  /* adjust to read NEON blocks from the start. */
    972     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
    973 
    974     /* Make sure src is aligned too. */
    975     if ((((size_t) src) & 15) == 0) {
    976         /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
    977         const float32x4_t divby32768 = vdupq_n_f32(DIVBY32768);
    978         while (i >= 8) {   /* 8 * 16-bit */
    979             const int16x8_t ints = vld1q_s16((int16_t const *) src);  /* get 8 sint16 into a NEON register. */
    980             /* split int16 to two int32, then convert to float, then multiply to normalize, store. */
    981             vst1q_f32(dst, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(ints))), divby32768));
    982             vst1q_f32(dst+4, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(ints))), divby32768));
    983             i -= 8; src -= 8; dst -= 8;
    984         }
    985     }
    986 
    987     src += 7; dst += 7;  /* adjust for any scalar finishing. */
    988 
    989     /* Finish off any leftovers with scalar operations. */
    990     while (i) {
    991         *dst = ((float) *src) * DIVBY32768;
    992         i--; src--; dst--;
    993     }
    994 
    995     cvt->len_cvt *= 2;
    996     if (cvt->filters[++cvt->filter_index]) {
    997         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
    998     }
    999 }
   1000 
   1001 static void SDLCALL
   1002 SDL_Convert_U16_to_F32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   1003 {
   1004     const Uint16 *src = ((const Uint16 *) (cvt->buf + cvt->len_cvt)) - 1;
   1005     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
   1006     int i;
   1007 
   1008     LOG_DEBUG_CONVERT("AUDIO_U16", "AUDIO_F32 (using NEON)");
   1009 
   1010     /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
   1011     for (i = cvt->len_cvt / sizeof (Sint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
   1012         *dst = (((float) *src) * DIVBY32768) - 1.0f;
   1013     }
   1014 
   1015     src -= 7; dst -= 7;  /* adjust to read NEON blocks from the start. */
   1016     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   1017 
   1018     /* Make sure src is aligned too. */
   1019     if ((((size_t) src) & 15) == 0) {
   1020         /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
   1021         const float32x4_t divby32768 = vdupq_n_f32(DIVBY32768);
   1022         const float32x4_t negone = vdupq_n_f32(-1.0f);
   1023         while (i >= 8) {   /* 8 * 16-bit */
   1024             const uint16x8_t uints = vld1q_u16((uint16_t const *) src);  /* get 8 uint16 into a NEON register. */
   1025             /* split uint16 to two int32, then convert to float, then multiply to normalize, subtract for sign, store. */
   1026             vst1q_f32(dst, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_low_u16(uints))), divby32768));
   1027             vst1q_f32(dst+4, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_high_u16(uints))), divby32768));
   1028             i -= 8; src -= 8; dst -= 8;
   1029         }
   1030     }
   1031 
   1032     src += 7; dst += 7;  /* adjust for any scalar finishing. */
   1033 
   1034     /* Finish off any leftovers with scalar operations. */
   1035     while (i) {
   1036         *dst = (((float) *src) * DIVBY32768) - 1.0f;
   1037         i--; src--; dst--;
   1038     }
   1039 
   1040     cvt->len_cvt *= 2;
   1041     if (cvt->filters[++cvt->filter_index]) {
   1042         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   1043     }
   1044 }
   1045 
   1046 static void SDLCALL
   1047 SDL_Convert_S32_to_F32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   1048 {
   1049     const Sint32 *src = (const Sint32 *) cvt->buf;
   1050     float *dst = (float *) cvt->buf;
   1051     int i;
   1052 
   1053     LOG_DEBUG_CONVERT("AUDIO_S32", "AUDIO_F32 (using NEON)");
   1054 
   1055     /* Get dst aligned to 16 bytes */
   1056     for (i = cvt->len_cvt / sizeof (Sint32); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
   1057         *dst = ((float) (*src>>8)) * DIVBY8388607;
   1058     }
   1059 
   1060     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   1061 
   1062     /* Make sure src is aligned too. */
   1063     if ((((size_t) src) & 15) == 0) {
   1064         /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
   1065         const float32x4_t divby8388607 = vdupq_n_f32(DIVBY8388607);
   1066         const int32_t *mmsrc = (const int32_t *) src;
   1067         while (i >= 4) {   /* 4 * sint32 */
   1068             /* shift out lowest bits so int fits in a float32. Small precision loss, but much faster. */
   1069             vst1q_f32(dst, vmulq_f32(vcvtq_f32_s32(vshrq_n_s32(vld1q_s32(mmsrc), 8)), divby8388607));
   1070             i -= 4; mmsrc += 4; dst += 4;
   1071         }
   1072         src = (const Sint32 *) mmsrc;
   1073     }
   1074 
   1075     /* Finish off any leftovers with scalar operations. */
   1076     while (i) {
   1077         *dst = ((float) (*src>>8)) * DIVBY8388607;
   1078         i--; src++; dst++;
   1079     }
   1080 
   1081     if (cvt->filters[++cvt->filter_index]) {
   1082         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   1083     }
   1084 }
   1085 
   1086 static void SDLCALL
   1087 SDL_Convert_F32_to_S8_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   1088 {
   1089     const float *src = (const float *) cvt->buf;
   1090     Sint8 *dst = (Sint8 *) cvt->buf;
   1091     int i;
   1092 
   1093     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S8 (using NEON)");
   1094 
   1095     /* Get dst aligned to 16 bytes */
   1096     for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
   1097         const float sample = *src;
   1098         if (sample >= 1.0f) {
   1099             *dst = 127;
   1100         } else if (sample <= -1.0f) {
   1101             *dst = -128;
   1102         } else {
   1103             *dst = (Sint8)(sample * 127.0f);
   1104         }
   1105     }
   1106 
   1107     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   1108 
   1109     /* Make sure src is aligned too. */
   1110     if ((((size_t) src) & 15) == 0) {
   1111         /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
   1112         const float32x4_t one = vdupq_n_f32(1.0f);
   1113         const float32x4_t negone = vdupq_n_f32(-1.0f);
   1114         const float32x4_t mulby127 = vdupq_n_f32(127.0f);
   1115         int8_t *mmdst = (int8_t *) dst;
   1116         while (i >= 16) {   /* 16 * float32 */
   1117             const int32x4_t ints1 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src)), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
   1118             const int32x4_t ints2 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+4)), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
   1119             const int32x4_t ints3 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+8)), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
   1120             const int32x4_t ints4 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+12)), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
   1121             const int8x8_t i8lo = vmovn_s16(vcombine_s16(vmovn_s32(ints1), vmovn_s32(ints2))); /* narrow to sint16, combine, narrow to sint8 */
   1122             const int8x8_t i8hi = vmovn_s16(vcombine_s16(vmovn_s32(ints3), vmovn_s32(ints4))); /* narrow to sint16, combine, narrow to sint8 */
   1123             vst1q_s8(mmdst, vcombine_s8(i8lo, i8hi));  /* combine to int8x16_t, store out */
   1124             i -= 16; src += 16; mmdst += 16;
   1125         }
   1126         dst = (Sint8 *) mmdst;
   1127     }
   1128 
   1129     /* Finish off any leftovers with scalar operations. */
   1130     while (i) {
   1131         const float sample = *src;
   1132         if (sample >= 1.0f) {
   1133             *dst = 127;
   1134         } else if (sample <= -1.0f) {
   1135             *dst = -128;
   1136         } else {
   1137             *dst = (Sint8)(sample * 127.0f);
   1138         }
   1139         i--; src++; dst++;
   1140     }
   1141 
   1142     cvt->len_cvt /= 4;
   1143     if (cvt->filters[++cvt->filter_index]) {
   1144         cvt->filters[cvt->filter_index](cvt, AUDIO_S8);
   1145     }
   1146 }
   1147 
   1148 static void SDLCALL
   1149 SDL_Convert_F32_to_U8_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   1150 {
   1151     const float *src = (const float *) cvt->buf;
   1152     Uint8 *dst = (Uint8 *) cvt->buf;
   1153     int i;
   1154 
   1155     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U8 (using NEON)");
   1156 
   1157     /* Get dst aligned to 16 bytes */
   1158     for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
   1159         const float sample = *src;
   1160         if (sample >= 1.0f) {
   1161             *dst = 255;
   1162         } else if (sample <= -1.0f) {
   1163             *dst = 0;
   1164         } else {
   1165             *dst = (Uint8)((sample + 1.0f) * 127.0f);
   1166         }
   1167     }
   1168 
   1169     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   1170 
   1171     /* Make sure src is aligned too. */
   1172     if ((((size_t) src) & 15) == 0) {
   1173         /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
   1174         const float32x4_t one = vdupq_n_f32(1.0f);
   1175         const float32x4_t negone = vdupq_n_f32(-1.0f);
   1176         const float32x4_t mulby127 = vdupq_n_f32(127.0f);
   1177         uint8_t *mmdst = (uint8_t *) dst;
   1178         while (i >= 16) {   /* 16 * float32 */
   1179             const uint32x4_t uints1 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src)), one), one), mulby127));  /* load 4 floats, clamp, convert to uint32 */
   1180             const uint32x4_t uints2 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+4)), one), one), mulby127));  /* load 4 floats, clamp, convert to uint32 */
   1181             const uint32x4_t uints3 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+8)), one), one), mulby127));  /* load 4 floats, clamp, convert to uint32 */
   1182             const uint32x4_t uints4 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+12)), one), one), mulby127));  /* load 4 floats, clamp, convert to uint32 */
   1183             const uint8x8_t ui8lo = vmovn_u16(vcombine_u16(vmovn_u32(uints1), vmovn_u32(uints2))); /* narrow to uint16, combine, narrow to uint8 */
   1184             const uint8x8_t ui8hi = vmovn_u16(vcombine_u16(vmovn_u32(uints3), vmovn_u32(uints4))); /* narrow to uint16, combine, narrow to uint8 */
   1185             vst1q_u8(mmdst, vcombine_u8(ui8lo, ui8hi));  /* combine to uint8x16_t, store out */
   1186             i -= 16; src += 16; mmdst += 16;
   1187         }
   1188 
   1189         dst = (Uint8 *) mmdst;
   1190     }
   1191 
   1192     /* Finish off any leftovers with scalar operations. */
   1193     while (i) {
   1194         const float sample = *src;
   1195         if (sample >= 1.0f) {
   1196             *dst = 255;
   1197         } else if (sample <= -1.0f) {
   1198             *dst = 0;
   1199         } else {
   1200             *dst = (Uint8)((sample + 1.0f) * 127.0f);
   1201         }
   1202         i--; src++; dst++;
   1203     }
   1204 
   1205     cvt->len_cvt /= 4;
   1206     if (cvt->filters[++cvt->filter_index]) {
   1207         cvt->filters[cvt->filter_index](cvt, AUDIO_U8);
   1208     }
   1209 }
   1210 
   1211 static void SDLCALL
   1212 SDL_Convert_F32_to_S16_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   1213 {
   1214     const float *src = (const float *) cvt->buf;
   1215     Sint16 *dst = (Sint16 *) cvt->buf;
   1216     int i;
   1217 
   1218     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S16 (using NEON)");
   1219 
   1220     /* Get dst aligned to 16 bytes */
   1221     for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
   1222         const float sample = *src;
   1223         if (sample >= 1.0f) {
   1224             *dst = 32767;
   1225         } else if (sample <= -1.0f) {
   1226             *dst = -32768;
   1227         } else {
   1228             *dst = (Sint16)(sample * 32767.0f);
   1229         }
   1230     }
   1231 
   1232     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   1233 
   1234     /* Make sure src is aligned too. */
   1235     if ((((size_t) src) & 15) == 0) {
   1236         /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
   1237         const float32x4_t one = vdupq_n_f32(1.0f);
   1238         const float32x4_t negone = vdupq_n_f32(-1.0f);
   1239         const float32x4_t mulby32767 = vdupq_n_f32(32767.0f);
   1240         int16_t *mmdst = (int16_t *) dst;
   1241         while (i >= 8) {   /* 8 * float32 */
   1242             const int32x4_t ints1 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src)), one), mulby32767));  /* load 4 floats, clamp, convert to sint32 */
   1243             const int32x4_t ints2 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+4)), one), mulby32767));  /* load 4 floats, clamp, convert to sint32 */
   1244             vst1q_s16(mmdst, vcombine_s16(vmovn_s32(ints1), vmovn_s32(ints2)));  /* narrow to sint16, combine, store out. */
   1245             i -= 8; src += 8; mmdst += 8;
   1246         }
   1247         dst = (Sint16 *) mmdst;
   1248     }
   1249 
   1250     /* Finish off any leftovers with scalar operations. */
   1251     while (i) {
   1252         const float sample = *src;
   1253         if (sample >= 1.0f) {
   1254             *dst = 32767;
   1255         } else if (sample <= -1.0f) {
   1256             *dst = -32768;
   1257         } else {
   1258             *dst = (Sint16)(sample * 32767.0f);
   1259         }
   1260         i--; src++; dst++;
   1261     }
   1262 
   1263     cvt->len_cvt /= 2;
   1264     if (cvt->filters[++cvt->filter_index]) {
   1265         cvt->filters[cvt->filter_index](cvt, AUDIO_S16SYS);
   1266     }
   1267 }
   1268 
   1269 static void SDLCALL
   1270 SDL_Convert_F32_to_U16_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   1271 {
   1272     const float *src = (const float *) cvt->buf;
   1273     Uint16 *dst = (Uint16 *) cvt->buf;
   1274     int i;
   1275 
   1276     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U16 (using NEON)");
   1277 
   1278     /* Get dst aligned to 16 bytes */
   1279     for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
   1280         const float sample = *src;
   1281         if (sample >= 1.0f) {
   1282             *dst = 65535;
   1283         } else if (sample <= -1.0f) {
   1284             *dst = 0;
   1285         } else {
   1286             *dst = (Uint16)((sample + 1.0f) * 32767.0f);
   1287         }
   1288     }
   1289 
   1290     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   1291 
   1292     /* Make sure src is aligned too. */
   1293     if ((((size_t) src) & 15) == 0) {
   1294         /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
   1295         const float32x4_t one = vdupq_n_f32(1.0f);
   1296         const float32x4_t negone = vdupq_n_f32(-1.0f);
   1297         const float32x4_t mulby32767 = vdupq_n_f32(32767.0f);
   1298         uint16_t *mmdst = (uint16_t *) dst;
   1299         while (i >= 8) {   /* 8 * float32 */
   1300             const uint32x4_t uints1 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src)), one), one), mulby32767));  /* load 4 floats, clamp, convert to uint32 */
   1301             const uint32x4_t uints2 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+4)), one), one), mulby32767));  /* load 4 floats, clamp, convert to uint32 */
   1302             vst1q_u16(mmdst, vcombine_u16(vmovn_u32(uints1), vmovn_u32(uints2)));  /* narrow to uint16, combine, store out. */
   1303             i -= 8; src += 8; mmdst += 8;
   1304         }
   1305         dst = (Uint16 *) mmdst;
   1306     }
   1307 
   1308     /* Finish off any leftovers with scalar operations. */
   1309     while (i) {
   1310         const float sample = *src;
   1311         if (sample >= 1.0f) {
   1312             *dst = 65535;
   1313         } else if (sample <= -1.0f) {
   1314             *dst = 0;
   1315         } else {
   1316             *dst = (Uint16)((sample + 1.0f) * 32767.0f);
   1317         }
   1318         i--; src++; dst++;
   1319     }
   1320 
   1321     cvt->len_cvt /= 2;
   1322     if (cvt->filters[++cvt->filter_index]) {
   1323         cvt->filters[cvt->filter_index](cvt, AUDIO_U16SYS);
   1324     }
   1325 }
   1326 
   1327 static void SDLCALL
   1328 SDL_Convert_F32_to_S32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   1329 {
   1330     const float *src = (const float *) cvt->buf;
   1331     Sint32 *dst = (Sint32 *) cvt->buf;
   1332     int i;
   1333 
   1334     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S32 (using NEON)");
   1335 
   1336     /* Get dst aligned to 16 bytes */
   1337     for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
   1338         const float sample = *src;
   1339         if (sample >= 1.0f) {
   1340             *dst = 2147483647;
   1341         } else if (sample <= -1.0f) {
   1342             *dst = (-2147483647) - 1;
   1343         } else {
   1344             *dst = ((Sint32)(sample * 8388607.0f)) << 8;
   1345         }
   1346     }
   1347 
   1348     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   1349     SDL_assert(!i || ((((size_t) src) & 15) == 0));
   1350 
   1351     {
   1352         /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
   1353         const float32x4_t one = vdupq_n_f32(1.0f);
   1354         const float32x4_t negone = vdupq_n_f32(-1.0f);
   1355         const float32x4_t mulby8388607 = vdupq_n_f32(8388607.0f);
   1356         int32_t *mmdst = (int32_t *) dst;
   1357         while (i >= 4) {   /* 4 * float32 */
   1358             vst1q_s32(mmdst, vshlq_n_s32(vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src)), one), mulby8388607)), 8));
   1359             i -= 4; src += 4; mmdst += 4;
   1360         }
   1361         dst = (Sint32 *) mmdst;
   1362     }
   1363 
   1364     /* Finish off any leftovers with scalar operations. */
   1365     while (i) {
   1366         const float sample = *src;
   1367         if (sample >= 1.0f) {
   1368             *dst = 2147483647;
   1369         } else if (sample <= -1.0f) {
   1370             *dst = (-2147483647) - 1;
   1371         } else {
   1372             *dst = ((Sint32)(sample * 8388607.0f)) << 8;
   1373         }
   1374         i--; src++; dst++;
   1375     }
   1376 
   1377     if (cvt->filters[++cvt->filter_index]) {
   1378         cvt->filters[cvt->filter_index](cvt, AUDIO_S32SYS);
   1379     }
   1380 }
   1381 #endif
   1382 
   1383 
   1384 
   1385 void SDL_ChooseAudioConverters(void)
   1386 {
   1387     static SDL_bool converters_chosen = SDL_FALSE;
   1388 
   1389     if (converters_chosen) {
   1390         return;
   1391     }
   1392 
   1393 #define SET_CONVERTER_FUNCS(fntype) \
   1394         SDL_Convert_S8_to_F32 = SDL_Convert_S8_to_F32_##fntype; \
   1395         SDL_Convert_U8_to_F32 = SDL_Convert_U8_to_F32_##fntype; \
   1396         SDL_Convert_S16_to_F32 = SDL_Convert_S16_to_F32_##fntype; \
   1397         SDL_Convert_U16_to_F32 = SDL_Convert_U16_to_F32_##fntype; \
   1398         SDL_Convert_S32_to_F32 = SDL_Convert_S32_to_F32_##fntype; \
   1399         SDL_Convert_F32_to_S8 = SDL_Convert_F32_to_S8_##fntype; \
   1400         SDL_Convert_F32_to_U8 = SDL_Convert_F32_to_U8_##fntype; \
   1401         SDL_Convert_F32_to_S16 = SDL_Convert_F32_to_S16_##fntype; \
   1402         SDL_Convert_F32_to_U16 = SDL_Convert_F32_to_U16_##fntype; \
   1403         SDL_Convert_F32_to_S32 = SDL_Convert_F32_to_S32_##fntype; \
   1404         converters_chosen = SDL_TRUE
   1405 
   1406 #if HAVE_SSE2_INTRINSICS
   1407     if (SDL_HasSSE2()) {
   1408         SET_CONVERTER_FUNCS(SSE2);
   1409         return;
   1410     }
   1411 #endif
   1412 
   1413 #if HAVE_NEON_INTRINSICS
   1414     if (SDL_HasNEON()) {
   1415         SET_CONVERTER_FUNCS(NEON);
   1416         return;
   1417     }
   1418 #endif
   1419 
   1420 #if NEED_SCALAR_CONVERTER_FALLBACKS
   1421     SET_CONVERTER_FUNCS(Scalar);
   1422 #endif
   1423 
   1424 #undef SET_CONVERTER_FUNCS
   1425 
   1426     SDL_assert(converters_chosen == SDL_TRUE);
   1427 }
   1428 
   1429 /* vi: set ts=4 sw=4 expandtab: */