SDL_audiotypecvt.c (54841B)
1 /* 2 Simple DirectMedia Layer 3 Copyright (C) 1997-2020 Sam Lantinga <slouken@libsdl.org> 4 5 This software is provided 'as-is', without any express or implied 6 warranty. In no event will the authors be held liable for any damages 7 arising from the use of this software. 8 9 Permission is granted to anyone to use this software for any purpose, 10 including commercial applications, and to alter it and redistribute it 11 freely, subject to the following restrictions: 12 13 1. The origin of this software must not be misrepresented; you must not 14 claim that you wrote the original software. If you use this software 15 in a product, an acknowledgment in the product documentation would be 16 appreciated but is not required. 17 2. Altered source versions must be plainly marked as such, and must not be 18 misrepresented as being the original software. 19 3. This notice may not be removed or altered from any source distribution. 20 */ 21 #include "../SDL_internal.h" 22 23 #include "SDL_audio.h" 24 #include "SDL_audio_c.h" 25 #include "SDL_cpuinfo.h" 26 27 #ifdef __ARM_NEON 28 #define HAVE_NEON_INTRINSICS 1 29 #endif 30 31 #ifdef __SSE2__ 32 #define HAVE_SSE2_INTRINSICS 1 33 #endif 34 35 #if defined(__x86_64__) && HAVE_SSE2_INTRINSICS 36 #define NEED_SCALAR_CONVERTER_FALLBACKS 0 /* x86_64 guarantees SSE2. */ 37 #elif __MACOSX__ && HAVE_SSE2_INTRINSICS 38 #define NEED_SCALAR_CONVERTER_FALLBACKS 0 /* Mac OS X/Intel guarantees SSE2. */ 39 #elif defined(__ARM_ARCH) && (__ARM_ARCH >= 8) && HAVE_NEON_INTRINSICS 40 #define NEED_SCALAR_CONVERTER_FALLBACKS 0 /* ARMv8+ promise NEON. */ 41 #elif defined(__APPLE__) && defined(__ARM_ARCH) && (__ARM_ARCH >= 7) && HAVE_NEON_INTRINSICS 42 #define NEED_SCALAR_CONVERTER_FALLBACKS 0 /* All Apple ARMv7 chips promise NEON support. */ 43 #endif 44 45 /* Set to zero if platform is guaranteed to use a SIMD codepath here. */ 46 #ifndef NEED_SCALAR_CONVERTER_FALLBACKS 47 #define NEED_SCALAR_CONVERTER_FALLBACKS 1 48 #endif 49 50 /* Function pointers set to a CPU-specific implementation. */ 51 SDL_AudioFilter SDL_Convert_S8_to_F32 = NULL; 52 SDL_AudioFilter SDL_Convert_U8_to_F32 = NULL; 53 SDL_AudioFilter SDL_Convert_S16_to_F32 = NULL; 54 SDL_AudioFilter SDL_Convert_U16_to_F32 = NULL; 55 SDL_AudioFilter SDL_Convert_S32_to_F32 = NULL; 56 SDL_AudioFilter SDL_Convert_F32_to_S8 = NULL; 57 SDL_AudioFilter SDL_Convert_F32_to_U8 = NULL; 58 SDL_AudioFilter SDL_Convert_F32_to_S16 = NULL; 59 SDL_AudioFilter SDL_Convert_F32_to_U16 = NULL; 60 SDL_AudioFilter SDL_Convert_F32_to_S32 = NULL; 61 62 63 #define DIVBY128 0.0078125f 64 #define DIVBY32768 0.000030517578125f 65 #define DIVBY8388607 0.00000011920930376163766f 66 67 68 #if NEED_SCALAR_CONVERTER_FALLBACKS 69 static void SDLCALL 70 SDL_Convert_S8_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format) 71 { 72 const Sint8 *src = ((const Sint8 *) (cvt->buf + cvt->len_cvt)) - 1; 73 float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1; 74 int i; 75 76 LOG_DEBUG_CONVERT("AUDIO_S8", "AUDIO_F32"); 77 78 for (i = cvt->len_cvt; i; --i, --src, --dst) { 79 *dst = ((float) *src) * DIVBY128; 80 } 81 82 cvt->len_cvt *= 4; 83 if (cvt->filters[++cvt->filter_index]) { 84 cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS); 85 } 86 } 87 88 static void SDLCALL 89 SDL_Convert_U8_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format) 90 { 91 const Uint8 *src = ((const Uint8 *) (cvt->buf + cvt->len_cvt)) - 1; 92 float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1; 93 int i; 94 95 LOG_DEBUG_CONVERT("AUDIO_U8", "AUDIO_F32"); 96 97 for (i = cvt->len_cvt; i; --i, --src, --dst) { 98 *dst = (((float) *src) * DIVBY128) - 1.0f; 99 } 100 101 cvt->len_cvt *= 4; 102 if (cvt->filters[++cvt->filter_index]) { 103 cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS); 104 } 105 } 106 107 static void SDLCALL 108 SDL_Convert_S16_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format) 109 { 110 const Sint16 *src = ((const Sint16 *) (cvt->buf + cvt->len_cvt)) - 1; 111 float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1; 112 int i; 113 114 LOG_DEBUG_CONVERT("AUDIO_S16", "AUDIO_F32"); 115 116 for (i = cvt->len_cvt / sizeof (Sint16); i; --i, --src, --dst) { 117 *dst = ((float) *src) * DIVBY32768; 118 } 119 120 cvt->len_cvt *= 2; 121 if (cvt->filters[++cvt->filter_index]) { 122 cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS); 123 } 124 } 125 126 static void SDLCALL 127 SDL_Convert_U16_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format) 128 { 129 const Uint16 *src = ((const Uint16 *) (cvt->buf + cvt->len_cvt)) - 1; 130 float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1; 131 int i; 132 133 LOG_DEBUG_CONVERT("AUDIO_U16", "AUDIO_F32"); 134 135 for (i = cvt->len_cvt / sizeof (Uint16); i; --i, --src, --dst) { 136 *dst = (((float) *src) * DIVBY32768) - 1.0f; 137 } 138 139 cvt->len_cvt *= 2; 140 if (cvt->filters[++cvt->filter_index]) { 141 cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS); 142 } 143 } 144 145 static void SDLCALL 146 SDL_Convert_S32_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format) 147 { 148 const Sint32 *src = (const Sint32 *) cvt->buf; 149 float *dst = (float *) cvt->buf; 150 int i; 151 152 LOG_DEBUG_CONVERT("AUDIO_S32", "AUDIO_F32"); 153 154 for (i = cvt->len_cvt / sizeof (Sint32); i; --i, ++src, ++dst) { 155 *dst = ((float) (*src>>8)) * DIVBY8388607; 156 } 157 158 if (cvt->filters[++cvt->filter_index]) { 159 cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS); 160 } 161 } 162 163 static void SDLCALL 164 SDL_Convert_F32_to_S8_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format) 165 { 166 const float *src = (const float *) cvt->buf; 167 Sint8 *dst = (Sint8 *) cvt->buf; 168 int i; 169 170 LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S8"); 171 172 for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) { 173 const float sample = *src; 174 if (sample >= 1.0f) { 175 *dst = 127; 176 } else if (sample <= -1.0f) { 177 *dst = -128; 178 } else { 179 *dst = (Sint8)(sample * 127.0f); 180 } 181 } 182 183 cvt->len_cvt /= 4; 184 if (cvt->filters[++cvt->filter_index]) { 185 cvt->filters[cvt->filter_index](cvt, AUDIO_S8); 186 } 187 } 188 189 static void SDLCALL 190 SDL_Convert_F32_to_U8_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format) 191 { 192 const float *src = (const float *) cvt->buf; 193 Uint8 *dst = (Uint8 *) cvt->buf; 194 int i; 195 196 LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U8"); 197 198 for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) { 199 const float sample = *src; 200 if (sample >= 1.0f) { 201 *dst = 255; 202 } else if (sample <= -1.0f) { 203 *dst = 0; 204 } else { 205 *dst = (Uint8)((sample + 1.0f) * 127.0f); 206 } 207 } 208 209 cvt->len_cvt /= 4; 210 if (cvt->filters[++cvt->filter_index]) { 211 cvt->filters[cvt->filter_index](cvt, AUDIO_U8); 212 } 213 } 214 215 static void SDLCALL 216 SDL_Convert_F32_to_S16_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format) 217 { 218 const float *src = (const float *) cvt->buf; 219 Sint16 *dst = (Sint16 *) cvt->buf; 220 int i; 221 222 LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S16"); 223 224 for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) { 225 const float sample = *src; 226 if (sample >= 1.0f) { 227 *dst = 32767; 228 } else if (sample <= -1.0f) { 229 *dst = -32768; 230 } else { 231 *dst = (Sint16)(sample * 32767.0f); 232 } 233 } 234 235 cvt->len_cvt /= 2; 236 if (cvt->filters[++cvt->filter_index]) { 237 cvt->filters[cvt->filter_index](cvt, AUDIO_S16SYS); 238 } 239 } 240 241 static void SDLCALL 242 SDL_Convert_F32_to_U16_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format) 243 { 244 const float *src = (const float *) cvt->buf; 245 Uint16 *dst = (Uint16 *) cvt->buf; 246 int i; 247 248 LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U16"); 249 250 for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) { 251 const float sample = *src; 252 if (sample >= 1.0f) { 253 *dst = 65535; 254 } else if (sample <= -1.0f) { 255 *dst = 0; 256 } else { 257 *dst = (Uint16)((sample + 1.0f) * 32767.0f); 258 } 259 } 260 261 cvt->len_cvt /= 2; 262 if (cvt->filters[++cvt->filter_index]) { 263 cvt->filters[cvt->filter_index](cvt, AUDIO_U16SYS); 264 } 265 } 266 267 static void SDLCALL 268 SDL_Convert_F32_to_S32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format) 269 { 270 const float *src = (const float *) cvt->buf; 271 Sint32 *dst = (Sint32 *) cvt->buf; 272 int i; 273 274 LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S32"); 275 276 for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) { 277 const float sample = *src; 278 if (sample >= 1.0f) { 279 *dst = 2147483647; 280 } else if (sample <= -1.0f) { 281 *dst = (Sint32) -2147483648LL; 282 } else { 283 *dst = ((Sint32)(sample * 8388607.0f)) << 8; 284 } 285 } 286 287 if (cvt->filters[++cvt->filter_index]) { 288 cvt->filters[cvt->filter_index](cvt, AUDIO_S32SYS); 289 } 290 } 291 #endif 292 293 294 #if HAVE_SSE2_INTRINSICS 295 static void SDLCALL 296 SDL_Convert_S8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format) 297 { 298 const Sint8 *src = ((const Sint8 *) (cvt->buf + cvt->len_cvt)) - 1; 299 float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1; 300 int i; 301 302 LOG_DEBUG_CONVERT("AUDIO_S8", "AUDIO_F32 (using SSE2)"); 303 304 /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */ 305 for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) { 306 *dst = ((float) *src) * DIVBY128; 307 } 308 309 src -= 15; dst -= 15; /* adjust to read SSE blocks from the start. */ 310 SDL_assert(!i || ((((size_t) dst) & 15) == 0)); 311 312 /* Make sure src is aligned too. */ 313 if ((((size_t) src) & 15) == 0) { 314 /* Aligned! Do SSE blocks as long as we have 16 bytes available. */ 315 const __m128i *mmsrc = (const __m128i *) src; 316 const __m128i zero = _mm_setzero_si128(); 317 const __m128 divby128 = _mm_set1_ps(DIVBY128); 318 while (i >= 16) { /* 16 * 8-bit */ 319 const __m128i bytes = _mm_load_si128(mmsrc); /* get 16 sint8 into an XMM register. */ 320 /* treat as int16, shift left to clear every other sint16, then back right with sign-extend. Now sint16. */ 321 const __m128i shorts1 = _mm_srai_epi16(_mm_slli_epi16(bytes, 8), 8); 322 /* right-shift-sign-extend gets us sint16 with the other set of values. */ 323 const __m128i shorts2 = _mm_srai_epi16(bytes, 8); 324 /* unpack against zero to make these int32, shift to make them sign-extend, convert to float, multiply. Whew! */ 325 const __m128 floats1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpacklo_epi16(shorts1, zero), 16), 16)), divby128); 326 const __m128 floats2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpacklo_epi16(shorts2, zero), 16), 16)), divby128); 327 const __m128 floats3 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpackhi_epi16(shorts1, zero), 16), 16)), divby128); 328 const __m128 floats4 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpackhi_epi16(shorts2, zero), 16), 16)), divby128); 329 /* Interleave back into correct order, store. */ 330 _mm_store_ps(dst, _mm_unpacklo_ps(floats1, floats2)); 331 _mm_store_ps(dst+4, _mm_unpackhi_ps(floats1, floats2)); 332 _mm_store_ps(dst+8, _mm_unpacklo_ps(floats3, floats4)); 333 _mm_store_ps(dst+12, _mm_unpackhi_ps(floats3, floats4)); 334 i -= 16; mmsrc--; dst -= 16; 335 } 336 337 src = (const Sint8 *) mmsrc; 338 } 339 340 src += 15; dst += 15; /* adjust for any scalar finishing. */ 341 342 /* Finish off any leftovers with scalar operations. */ 343 while (i) { 344 *dst = ((float) *src) * DIVBY128; 345 i--; src--; dst--; 346 } 347 348 cvt->len_cvt *= 4; 349 if (cvt->filters[++cvt->filter_index]) { 350 cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS); 351 } 352 } 353 354 static void SDLCALL 355 SDL_Convert_U8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format) 356 { 357 const Uint8 *src = ((const Uint8 *) (cvt->buf + cvt->len_cvt)) - 1; 358 float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1; 359 int i; 360 361 LOG_DEBUG_CONVERT("AUDIO_U8", "AUDIO_F32 (using SSE2)"); 362 363 /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */ 364 for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) { 365 *dst = (((float) *src) * DIVBY128) - 1.0f; 366 } 367 368 src -= 15; dst -= 15; /* adjust to read SSE blocks from the start. */ 369 SDL_assert(!i || ((((size_t) dst) & 15) == 0)); 370 371 /* Make sure src is aligned too. */ 372 if ((((size_t) src) & 15) == 0) { 373 /* Aligned! Do SSE blocks as long as we have 16 bytes available. */ 374 const __m128i *mmsrc = (const __m128i *) src; 375 const __m128i zero = _mm_setzero_si128(); 376 const __m128 divby128 = _mm_set1_ps(DIVBY128); 377 const __m128 minus1 = _mm_set1_ps(-1.0f); 378 while (i >= 16) { /* 16 * 8-bit */ 379 const __m128i bytes = _mm_load_si128(mmsrc); /* get 16 uint8 into an XMM register. */ 380 /* treat as int16, shift left to clear every other sint16, then back right with zero-extend. Now uint16. */ 381 const __m128i shorts1 = _mm_srli_epi16(_mm_slli_epi16(bytes, 8), 8); 382 /* right-shift-zero-extend gets us uint16 with the other set of values. */ 383 const __m128i shorts2 = _mm_srli_epi16(bytes, 8); 384 /* unpack against zero to make these int32, convert to float, multiply, add. Whew! */ 385 /* Note that AVX2 can do floating point multiply+add in one instruction, fwiw. SSE2 cannot. */ 386 const __m128 floats1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts1, zero)), divby128), minus1); 387 const __m128 floats2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts2, zero)), divby128), minus1); 388 const __m128 floats3 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts1, zero)), divby128), minus1); 389 const __m128 floats4 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts2, zero)), divby128), minus1); 390 /* Interleave back into correct order, store. */ 391 _mm_store_ps(dst, _mm_unpacklo_ps(floats1, floats2)); 392 _mm_store_ps(dst+4, _mm_unpackhi_ps(floats1, floats2)); 393 _mm_store_ps(dst+8, _mm_unpacklo_ps(floats3, floats4)); 394 _mm_store_ps(dst+12, _mm_unpackhi_ps(floats3, floats4)); 395 i -= 16; mmsrc--; dst -= 16; 396 } 397 398 src = (const Uint8 *) mmsrc; 399 } 400 401 src += 15; dst += 15; /* adjust for any scalar finishing. */ 402 403 /* Finish off any leftovers with scalar operations. */ 404 while (i) { 405 *dst = (((float) *src) * DIVBY128) - 1.0f; 406 i--; src--; dst--; 407 } 408 409 cvt->len_cvt *= 4; 410 if (cvt->filters[++cvt->filter_index]) { 411 cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS); 412 } 413 } 414 415 static void SDLCALL 416 SDL_Convert_S16_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format) 417 { 418 const Sint16 *src = ((const Sint16 *) (cvt->buf + cvt->len_cvt)) - 1; 419 float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1; 420 int i; 421 422 LOG_DEBUG_CONVERT("AUDIO_S16", "AUDIO_F32 (using SSE2)"); 423 424 /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */ 425 for (i = cvt->len_cvt / sizeof (Sint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) { 426 *dst = ((float) *src) * DIVBY32768; 427 } 428 429 src -= 7; dst -= 7; /* adjust to read SSE blocks from the start. */ 430 SDL_assert(!i || ((((size_t) dst) & 15) == 0)); 431 432 /* Make sure src is aligned too. */ 433 if ((((size_t) src) & 15) == 0) { 434 /* Aligned! Do SSE blocks as long as we have 16 bytes available. */ 435 const __m128 divby32768 = _mm_set1_ps(DIVBY32768); 436 while (i >= 8) { /* 8 * 16-bit */ 437 const __m128i ints = _mm_load_si128((__m128i const *) src); /* get 8 sint16 into an XMM register. */ 438 /* treat as int32, shift left to clear every other sint16, then back right with sign-extend. Now sint32. */ 439 const __m128i a = _mm_srai_epi32(_mm_slli_epi32(ints, 16), 16); 440 /* right-shift-sign-extend gets us sint32 with the other set of values. */ 441 const __m128i b = _mm_srai_epi32(ints, 16); 442 /* Interleave these back into the right order, convert to float, multiply, store. */ 443 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi32(a, b)), divby32768)); 444 _mm_store_ps(dst+4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi32(a, b)), divby32768)); 445 i -= 8; src -= 8; dst -= 8; 446 } 447 } 448 449 src += 7; dst += 7; /* adjust for any scalar finishing. */ 450 451 /* Finish off any leftovers with scalar operations. */ 452 while (i) { 453 *dst = ((float) *src) * DIVBY32768; 454 i--; src--; dst--; 455 } 456 457 cvt->len_cvt *= 2; 458 if (cvt->filters[++cvt->filter_index]) { 459 cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS); 460 } 461 } 462 463 static void SDLCALL 464 SDL_Convert_U16_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format) 465 { 466 const Uint16 *src = ((const Uint16 *) (cvt->buf + cvt->len_cvt)) - 1; 467 float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1; 468 int i; 469 470 LOG_DEBUG_CONVERT("AUDIO_U16", "AUDIO_F32 (using SSE2)"); 471 472 /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */ 473 for (i = cvt->len_cvt / sizeof (Sint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) { 474 *dst = (((float) *src) * DIVBY32768) - 1.0f; 475 } 476 477 src -= 7; dst -= 7; /* adjust to read SSE blocks from the start. */ 478 SDL_assert(!i || ((((size_t) dst) & 15) == 0)); 479 480 /* Make sure src is aligned too. */ 481 if ((((size_t) src) & 15) == 0) { 482 /* Aligned! Do SSE blocks as long as we have 16 bytes available. */ 483 const __m128 divby32768 = _mm_set1_ps(DIVBY32768); 484 const __m128 minus1 = _mm_set1_ps(-1.0f); 485 while (i >= 8) { /* 8 * 16-bit */ 486 const __m128i ints = _mm_load_si128((__m128i const *) src); /* get 8 sint16 into an XMM register. */ 487 /* treat as int32, shift left to clear every other sint16, then back right with zero-extend. Now sint32. */ 488 const __m128i a = _mm_srli_epi32(_mm_slli_epi32(ints, 16), 16); 489 /* right-shift-sign-extend gets us sint32 with the other set of values. */ 490 const __m128i b = _mm_srli_epi32(ints, 16); 491 /* Interleave these back into the right order, convert to float, multiply, store. */ 492 _mm_store_ps(dst, _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi32(a, b)), divby32768), minus1)); 493 _mm_store_ps(dst+4, _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi32(a, b)), divby32768), minus1)); 494 i -= 8; src -= 8; dst -= 8; 495 } 496 } 497 498 src += 7; dst += 7; /* adjust for any scalar finishing. */ 499 500 /* Finish off any leftovers with scalar operations. */ 501 while (i) { 502 *dst = (((float) *src) * DIVBY32768) - 1.0f; 503 i--; src--; dst--; 504 } 505 506 cvt->len_cvt *= 2; 507 if (cvt->filters[++cvt->filter_index]) { 508 cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS); 509 } 510 } 511 512 static void SDLCALL 513 SDL_Convert_S32_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format) 514 { 515 const Sint32 *src = (const Sint32 *) cvt->buf; 516 float *dst = (float *) cvt->buf; 517 int i; 518 519 LOG_DEBUG_CONVERT("AUDIO_S32", "AUDIO_F32 (using SSE2)"); 520 521 /* Get dst aligned to 16 bytes */ 522 for (i = cvt->len_cvt / sizeof (Sint32); i && (((size_t) dst) & 15); --i, ++src, ++dst) { 523 *dst = ((float) (*src>>8)) * DIVBY8388607; 524 } 525 526 SDL_assert(!i || ((((size_t) dst) & 15) == 0)); 527 528 /* Make sure src is aligned too. */ 529 if ((((size_t) src) & 15) == 0) { 530 /* Aligned! Do SSE blocks as long as we have 16 bytes available. */ 531 const __m128 divby8388607 = _mm_set1_ps(DIVBY8388607); 532 const __m128i *mmsrc = (const __m128i *) src; 533 while (i >= 4) { /* 4 * sint32 */ 534 /* shift out lowest bits so int fits in a float32. Small precision loss, but much faster. */ 535 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_load_si128(mmsrc), 8)), divby8388607)); 536 i -= 4; mmsrc++; dst += 4; 537 } 538 src = (const Sint32 *) mmsrc; 539 } 540 541 /* Finish off any leftovers with scalar operations. */ 542 while (i) { 543 *dst = ((float) (*src>>8)) * DIVBY8388607; 544 i--; src++; dst++; 545 } 546 547 if (cvt->filters[++cvt->filter_index]) { 548 cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS); 549 } 550 } 551 552 static void SDLCALL 553 SDL_Convert_F32_to_S8_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format) 554 { 555 const float *src = (const float *) cvt->buf; 556 Sint8 *dst = (Sint8 *) cvt->buf; 557 int i; 558 559 LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S8 (using SSE2)"); 560 561 /* Get dst aligned to 16 bytes */ 562 for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) { 563 const float sample = *src; 564 if (sample >= 1.0f) { 565 *dst = 127; 566 } else if (sample <= -1.0f) { 567 *dst = -128; 568 } else { 569 *dst = (Sint8)(sample * 127.0f); 570 } 571 } 572 573 SDL_assert(!i || ((((size_t) dst) & 15) == 0)); 574 575 /* Make sure src is aligned too. */ 576 if ((((size_t) src) & 15) == 0) { 577 /* Aligned! Do SSE blocks as long as we have 16 bytes available. */ 578 const __m128 one = _mm_set1_ps(1.0f); 579 const __m128 negone = _mm_set1_ps(-1.0f); 580 const __m128 mulby127 = _mm_set1_ps(127.0f); 581 __m128i *mmdst = (__m128i *) dst; 582 while (i >= 16) { /* 16 * float32 */ 583 const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), mulby127)); /* load 4 floats, clamp, convert to sint32 */ 584 const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+4)), one), mulby127)); /* load 4 floats, clamp, convert to sint32 */ 585 const __m128i ints3 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+8)), one), mulby127)); /* load 4 floats, clamp, convert to sint32 */ 586 const __m128i ints4 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+12)), one), mulby127)); /* load 4 floats, clamp, convert to sint32 */ 587 _mm_store_si128(mmdst, _mm_packs_epi16(_mm_packs_epi32(ints1, ints2), _mm_packs_epi32(ints3, ints4))); /* pack down, store out. */ 588 i -= 16; src += 16; mmdst++; 589 } 590 dst = (Sint8 *) mmdst; 591 } 592 593 /* Finish off any leftovers with scalar operations. */ 594 while (i) { 595 const float sample = *src; 596 if (sample >= 1.0f) { 597 *dst = 127; 598 } else if (sample <= -1.0f) { 599 *dst = -128; 600 } else { 601 *dst = (Sint8)(sample * 127.0f); 602 } 603 i--; src++; dst++; 604 } 605 606 cvt->len_cvt /= 4; 607 if (cvt->filters[++cvt->filter_index]) { 608 cvt->filters[cvt->filter_index](cvt, AUDIO_S8); 609 } 610 } 611 612 static void SDLCALL 613 SDL_Convert_F32_to_U8_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format) 614 { 615 const float *src = (const float *) cvt->buf; 616 Uint8 *dst = cvt->buf; 617 int i; 618 619 LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U8 (using SSE2)"); 620 621 /* Get dst aligned to 16 bytes */ 622 for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) { 623 const float sample = *src; 624 if (sample >= 1.0f) { 625 *dst = 255; 626 } else if (sample <= -1.0f) { 627 *dst = 0; 628 } else { 629 *dst = (Uint8)((sample + 1.0f) * 127.0f); 630 } 631 } 632 633 SDL_assert(!i || ((((size_t) dst) & 15) == 0)); 634 635 /* Make sure src is aligned too. */ 636 if ((((size_t) src) & 15) == 0) { 637 /* Aligned! Do SSE blocks as long as we have 16 bytes available. */ 638 const __m128 one = _mm_set1_ps(1.0f); 639 const __m128 negone = _mm_set1_ps(-1.0f); 640 const __m128 mulby127 = _mm_set1_ps(127.0f); 641 __m128i *mmdst = (__m128i *) dst; 642 while (i >= 16) { /* 16 * float32 */ 643 const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), one), mulby127)); /* load 4 floats, clamp, convert to sint32 */ 644 const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+4)), one), one), mulby127)); /* load 4 floats, clamp, convert to sint32 */ 645 const __m128i ints3 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+8)), one), one), mulby127)); /* load 4 floats, clamp, convert to sint32 */ 646 const __m128i ints4 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+12)), one), one), mulby127)); /* load 4 floats, clamp, convert to sint32 */ 647 _mm_store_si128(mmdst, _mm_packus_epi16(_mm_packs_epi32(ints1, ints2), _mm_packs_epi32(ints3, ints4))); /* pack down, store out. */ 648 i -= 16; src += 16; mmdst++; 649 } 650 dst = (Uint8 *) mmdst; 651 } 652 653 /* Finish off any leftovers with scalar operations. */ 654 while (i) { 655 const float sample = *src; 656 if (sample >= 1.0f) { 657 *dst = 255; 658 } else if (sample <= -1.0f) { 659 *dst = 0; 660 } else { 661 *dst = (Uint8)((sample + 1.0f) * 127.0f); 662 } 663 i--; src++; dst++; 664 } 665 666 cvt->len_cvt /= 4; 667 if (cvt->filters[++cvt->filter_index]) { 668 cvt->filters[cvt->filter_index](cvt, AUDIO_U8); 669 } 670 } 671 672 static void SDLCALL 673 SDL_Convert_F32_to_S16_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format) 674 { 675 const float *src = (const float *) cvt->buf; 676 Sint16 *dst = (Sint16 *) cvt->buf; 677 int i; 678 679 LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S16 (using SSE2)"); 680 681 /* Get dst aligned to 16 bytes */ 682 for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) { 683 const float sample = *src; 684 if (sample >= 1.0f) { 685 *dst = 32767; 686 } else if (sample <= -1.0f) { 687 *dst = -32768; 688 } else { 689 *dst = (Sint16)(sample * 32767.0f); 690 } 691 } 692 693 SDL_assert(!i || ((((size_t) dst) & 15) == 0)); 694 695 /* Make sure src is aligned too. */ 696 if ((((size_t) src) & 15) == 0) { 697 /* Aligned! Do SSE blocks as long as we have 16 bytes available. */ 698 const __m128 one = _mm_set1_ps(1.0f); 699 const __m128 negone = _mm_set1_ps(-1.0f); 700 const __m128 mulby32767 = _mm_set1_ps(32767.0f); 701 __m128i *mmdst = (__m128i *) dst; 702 while (i >= 8) { /* 8 * float32 */ 703 const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), mulby32767)); /* load 4 floats, clamp, convert to sint32 */ 704 const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+4)), one), mulby32767)); /* load 4 floats, clamp, convert to sint32 */ 705 _mm_store_si128(mmdst, _mm_packs_epi32(ints1, ints2)); /* pack to sint16, store out. */ 706 i -= 8; src += 8; mmdst++; 707 } 708 dst = (Sint16 *) mmdst; 709 } 710 711 /* Finish off any leftovers with scalar operations. */ 712 while (i) { 713 const float sample = *src; 714 if (sample >= 1.0f) { 715 *dst = 32767; 716 } else if (sample <= -1.0f) { 717 *dst = -32768; 718 } else { 719 *dst = (Sint16)(sample * 32767.0f); 720 } 721 i--; src++; dst++; 722 } 723 724 cvt->len_cvt /= 2; 725 if (cvt->filters[++cvt->filter_index]) { 726 cvt->filters[cvt->filter_index](cvt, AUDIO_S16SYS); 727 } 728 } 729 730 static void SDLCALL 731 SDL_Convert_F32_to_U16_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format) 732 { 733 const float *src = (const float *) cvt->buf; 734 Uint16 *dst = (Uint16 *) cvt->buf; 735 int i; 736 737 LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U16 (using SSE2)"); 738 739 /* Get dst aligned to 16 bytes */ 740 for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) { 741 const float sample = *src; 742 if (sample >= 1.0f) { 743 *dst = 65535; 744 } else if (sample <= -1.0f) { 745 *dst = 0; 746 } else { 747 *dst = (Uint16)((sample + 1.0f) * 32767.0f); 748 } 749 } 750 751 SDL_assert(!i || ((((size_t) dst) & 15) == 0)); 752 753 /* Make sure src is aligned too. */ 754 if ((((size_t) src) & 15) == 0) { 755 /* Aligned! Do SSE blocks as long as we have 16 bytes available. */ 756 /* This calculates differently than the scalar path because SSE2 can't 757 pack int32 data down to unsigned int16. _mm_packs_epi32 does signed 758 saturation, so that would corrupt our data. _mm_packus_epi32 exists, 759 but not before SSE 4.1. So we convert from float to sint16, packing 760 that down with legit signed saturation, and then xor the top bit 761 against 1. This results in the correct unsigned 16-bit value, even 762 though it looks like dark magic. */ 763 const __m128 mulby32767 = _mm_set1_ps(32767.0f); 764 const __m128i topbit = _mm_set1_epi16(-32768); 765 const __m128 one = _mm_set1_ps(1.0f); 766 const __m128 negone = _mm_set1_ps(-1.0f); 767 __m128i *mmdst = (__m128i *) dst; 768 while (i >= 8) { /* 8 * float32 */ 769 const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), mulby32767)); /* load 4 floats, clamp, convert to sint32 */ 770 const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+4)), one), mulby32767)); /* load 4 floats, clamp, convert to sint32 */ 771 _mm_store_si128(mmdst, _mm_xor_si128(_mm_packs_epi32(ints1, ints2), topbit)); /* pack to sint16, xor top bit, store out. */ 772 i -= 8; src += 8; mmdst++; 773 } 774 dst = (Uint16 *) mmdst; 775 } 776 777 /* Finish off any leftovers with scalar operations. */ 778 while (i) { 779 const float sample = *src; 780 if (sample >= 1.0f) { 781 *dst = 65535; 782 } else if (sample <= -1.0f) { 783 *dst = 0; 784 } else { 785 *dst = (Uint16)((sample + 1.0f) * 32767.0f); 786 } 787 i--; src++; dst++; 788 } 789 790 cvt->len_cvt /= 2; 791 if (cvt->filters[++cvt->filter_index]) { 792 cvt->filters[cvt->filter_index](cvt, AUDIO_U16SYS); 793 } 794 } 795 796 static void SDLCALL 797 SDL_Convert_F32_to_S32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format) 798 { 799 const float *src = (const float *) cvt->buf; 800 Sint32 *dst = (Sint32 *) cvt->buf; 801 int i; 802 803 LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S32 (using SSE2)"); 804 805 /* Get dst aligned to 16 bytes */ 806 for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) { 807 const float sample = *src; 808 if (sample >= 1.0f) { 809 *dst = 2147483647; 810 } else if (sample <= -1.0f) { 811 *dst = (Sint32) -2147483648LL; 812 } else { 813 *dst = ((Sint32)(sample * 8388607.0f)) << 8; 814 } 815 } 816 817 SDL_assert(!i || ((((size_t) dst) & 15) == 0)); 818 SDL_assert(!i || ((((size_t) src) & 15) == 0)); 819 820 { 821 /* Aligned! Do SSE blocks as long as we have 16 bytes available. */ 822 const __m128 one = _mm_set1_ps(1.0f); 823 const __m128 negone = _mm_set1_ps(-1.0f); 824 const __m128 mulby8388607 = _mm_set1_ps(8388607.0f); 825 __m128i *mmdst = (__m128i *) dst; 826 while (i >= 4) { /* 4 * float32 */ 827 _mm_store_si128(mmdst, _mm_slli_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), mulby8388607)), 8)); /* load 4 floats, clamp, convert to sint32 */ 828 i -= 4; src += 4; mmdst++; 829 } 830 dst = (Sint32 *) mmdst; 831 } 832 833 /* Finish off any leftovers with scalar operations. */ 834 while (i) { 835 const float sample = *src; 836 if (sample >= 1.0f) { 837 *dst = 2147483647; 838 } else if (sample <= -1.0f) { 839 *dst = (Sint32) -2147483648LL; 840 } else { 841 *dst = ((Sint32)(sample * 8388607.0f)) << 8; 842 } 843 i--; src++; dst++; 844 } 845 846 if (cvt->filters[++cvt->filter_index]) { 847 cvt->filters[cvt->filter_index](cvt, AUDIO_S32SYS); 848 } 849 } 850 #endif 851 852 853 #if HAVE_NEON_INTRINSICS 854 static void SDLCALL 855 SDL_Convert_S8_to_F32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format) 856 { 857 const Sint8 *src = ((const Sint8 *) (cvt->buf + cvt->len_cvt)) - 1; 858 float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1; 859 int i; 860 861 LOG_DEBUG_CONVERT("AUDIO_S8", "AUDIO_F32 (using NEON)"); 862 863 /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */ 864 for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) { 865 *dst = ((float) *src) * DIVBY128; 866 } 867 868 src -= 15; dst -= 15; /* adjust to read NEON blocks from the start. */ 869 SDL_assert(!i || ((((size_t) dst) & 15) == 0)); 870 871 /* Make sure src is aligned too. */ 872 if ((((size_t) src) & 15) == 0) { 873 /* Aligned! Do NEON blocks as long as we have 16 bytes available. */ 874 const int8_t *mmsrc = (const int8_t *) src; 875 const float32x4_t divby128 = vdupq_n_f32(DIVBY128); 876 while (i >= 16) { /* 16 * 8-bit */ 877 const int8x16_t bytes = vld1q_s8(mmsrc); /* get 16 sint8 into a NEON register. */ 878 const int16x8_t int16hi = vmovl_s8(vget_high_s8(bytes)); /* convert top 8 bytes to 8 int16 */ 879 const int16x8_t int16lo = vmovl_s8(vget_low_s8(bytes)); /* convert bottom 8 bytes to 8 int16 */ 880 /* split int16 to two int32, then convert to float, then multiply to normalize, store. */ 881 vst1q_f32(dst, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(int16lo))), divby128)); 882 vst1q_f32(dst+4, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(int16lo))), divby128)); 883 vst1q_f32(dst+8, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(int16hi))), divby128)); 884 vst1q_f32(dst+12, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(int16hi))), divby128)); 885 i -= 16; mmsrc -= 16; dst -= 16; 886 } 887 888 src = (const Sint8 *) mmsrc; 889 } 890 891 src += 15; dst += 15; /* adjust for any scalar finishing. */ 892 893 /* Finish off any leftovers with scalar operations. */ 894 while (i) { 895 *dst = ((float) *src) * DIVBY128; 896 i--; src--; dst--; 897 } 898 899 cvt->len_cvt *= 4; 900 if (cvt->filters[++cvt->filter_index]) { 901 cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS); 902 } 903 } 904 905 static void SDLCALL 906 SDL_Convert_U8_to_F32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format) 907 { 908 const Uint8 *src = ((const Uint8 *) (cvt->buf + cvt->len_cvt)) - 1; 909 float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1; 910 int i; 911 912 LOG_DEBUG_CONVERT("AUDIO_U8", "AUDIO_F32 (using NEON)"); 913 914 /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */ 915 for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) { 916 *dst = (((float) *src) * DIVBY128) - 1.0f; 917 } 918 919 src -= 15; dst -= 15; /* adjust to read NEON blocks from the start. */ 920 SDL_assert(!i || ((((size_t) dst) & 15) == 0)); 921 922 /* Make sure src is aligned too. */ 923 if ((((size_t) src) & 15) == 0) { 924 /* Aligned! Do NEON blocks as long as we have 16 bytes available. */ 925 const uint8_t *mmsrc = (const uint8_t *) src; 926 const float32x4_t divby128 = vdupq_n_f32(DIVBY128); 927 const float32x4_t negone = vdupq_n_f32(-1.0f); 928 while (i >= 16) { /* 16 * 8-bit */ 929 const uint8x16_t bytes = vld1q_u8(mmsrc); /* get 16 uint8 into a NEON register. */ 930 const uint16x8_t uint16hi = vmovl_u8(vget_high_u8(bytes)); /* convert top 8 bytes to 8 uint16 */ 931 const uint16x8_t uint16lo = vmovl_u8(vget_low_u8(bytes)); /* convert bottom 8 bytes to 8 uint16 */ 932 /* split uint16 to two uint32, then convert to float, then multiply to normalize, subtract to adjust for sign, store. */ 933 vst1q_f32(dst, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_low_u16(uint16lo))), divby128)); 934 vst1q_f32(dst+4, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_high_u16(uint16lo))), divby128)); 935 vst1q_f32(dst+8, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_low_u16(uint16hi))), divby128)); 936 vst1q_f32(dst+12, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_high_u16(uint16hi))), divby128)); 937 i -= 16; mmsrc -= 16; dst -= 16; 938 } 939 940 src = (const Uint8 *) mmsrc; 941 } 942 943 src += 15; dst += 15; /* adjust for any scalar finishing. */ 944 945 /* Finish off any leftovers with scalar operations. */ 946 while (i) { 947 *dst = (((float) *src) * DIVBY128) - 1.0f; 948 i--; src--; dst--; 949 } 950 951 cvt->len_cvt *= 4; 952 if (cvt->filters[++cvt->filter_index]) { 953 cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS); 954 } 955 } 956 957 static void SDLCALL 958 SDL_Convert_S16_to_F32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format) 959 { 960 const Sint16 *src = ((const Sint16 *) (cvt->buf + cvt->len_cvt)) - 1; 961 float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1; 962 int i; 963 964 LOG_DEBUG_CONVERT("AUDIO_S16", "AUDIO_F32 (using NEON)"); 965 966 /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */ 967 for (i = cvt->len_cvt / sizeof (Sint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) { 968 *dst = ((float) *src) * DIVBY32768; 969 } 970 971 src -= 7; dst -= 7; /* adjust to read NEON blocks from the start. */ 972 SDL_assert(!i || ((((size_t) dst) & 15) == 0)); 973 974 /* Make sure src is aligned too. */ 975 if ((((size_t) src) & 15) == 0) { 976 /* Aligned! Do NEON blocks as long as we have 16 bytes available. */ 977 const float32x4_t divby32768 = vdupq_n_f32(DIVBY32768); 978 while (i >= 8) { /* 8 * 16-bit */ 979 const int16x8_t ints = vld1q_s16((int16_t const *) src); /* get 8 sint16 into a NEON register. */ 980 /* split int16 to two int32, then convert to float, then multiply to normalize, store. */ 981 vst1q_f32(dst, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(ints))), divby32768)); 982 vst1q_f32(dst+4, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(ints))), divby32768)); 983 i -= 8; src -= 8; dst -= 8; 984 } 985 } 986 987 src += 7; dst += 7; /* adjust for any scalar finishing. */ 988 989 /* Finish off any leftovers with scalar operations. */ 990 while (i) { 991 *dst = ((float) *src) * DIVBY32768; 992 i--; src--; dst--; 993 } 994 995 cvt->len_cvt *= 2; 996 if (cvt->filters[++cvt->filter_index]) { 997 cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS); 998 } 999 } 1000 1001 static void SDLCALL 1002 SDL_Convert_U16_to_F32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format) 1003 { 1004 const Uint16 *src = ((const Uint16 *) (cvt->buf + cvt->len_cvt)) - 1; 1005 float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1; 1006 int i; 1007 1008 LOG_DEBUG_CONVERT("AUDIO_U16", "AUDIO_F32 (using NEON)"); 1009 1010 /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */ 1011 for (i = cvt->len_cvt / sizeof (Sint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) { 1012 *dst = (((float) *src) * DIVBY32768) - 1.0f; 1013 } 1014 1015 src -= 7; dst -= 7; /* adjust to read NEON blocks from the start. */ 1016 SDL_assert(!i || ((((size_t) dst) & 15) == 0)); 1017 1018 /* Make sure src is aligned too. */ 1019 if ((((size_t) src) & 15) == 0) { 1020 /* Aligned! Do NEON blocks as long as we have 16 bytes available. */ 1021 const float32x4_t divby32768 = vdupq_n_f32(DIVBY32768); 1022 const float32x4_t negone = vdupq_n_f32(-1.0f); 1023 while (i >= 8) { /* 8 * 16-bit */ 1024 const uint16x8_t uints = vld1q_u16((uint16_t const *) src); /* get 8 uint16 into a NEON register. */ 1025 /* split uint16 to two int32, then convert to float, then multiply to normalize, subtract for sign, store. */ 1026 vst1q_f32(dst, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_low_u16(uints))), divby32768)); 1027 vst1q_f32(dst+4, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_high_u16(uints))), divby32768)); 1028 i -= 8; src -= 8; dst -= 8; 1029 } 1030 } 1031 1032 src += 7; dst += 7; /* adjust for any scalar finishing. */ 1033 1034 /* Finish off any leftovers with scalar operations. */ 1035 while (i) { 1036 *dst = (((float) *src) * DIVBY32768) - 1.0f; 1037 i--; src--; dst--; 1038 } 1039 1040 cvt->len_cvt *= 2; 1041 if (cvt->filters[++cvt->filter_index]) { 1042 cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS); 1043 } 1044 } 1045 1046 static void SDLCALL 1047 SDL_Convert_S32_to_F32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format) 1048 { 1049 const Sint32 *src = (const Sint32 *) cvt->buf; 1050 float *dst = (float *) cvt->buf; 1051 int i; 1052 1053 LOG_DEBUG_CONVERT("AUDIO_S32", "AUDIO_F32 (using NEON)"); 1054 1055 /* Get dst aligned to 16 bytes */ 1056 for (i = cvt->len_cvt / sizeof (Sint32); i && (((size_t) dst) & 15); --i, ++src, ++dst) { 1057 *dst = ((float) (*src>>8)) * DIVBY8388607; 1058 } 1059 1060 SDL_assert(!i || ((((size_t) dst) & 15) == 0)); 1061 1062 /* Make sure src is aligned too. */ 1063 if ((((size_t) src) & 15) == 0) { 1064 /* Aligned! Do NEON blocks as long as we have 16 bytes available. */ 1065 const float32x4_t divby8388607 = vdupq_n_f32(DIVBY8388607); 1066 const int32_t *mmsrc = (const int32_t *) src; 1067 while (i >= 4) { /* 4 * sint32 */ 1068 /* shift out lowest bits so int fits in a float32. Small precision loss, but much faster. */ 1069 vst1q_f32(dst, vmulq_f32(vcvtq_f32_s32(vshrq_n_s32(vld1q_s32(mmsrc), 8)), divby8388607)); 1070 i -= 4; mmsrc += 4; dst += 4; 1071 } 1072 src = (const Sint32 *) mmsrc; 1073 } 1074 1075 /* Finish off any leftovers with scalar operations. */ 1076 while (i) { 1077 *dst = ((float) (*src>>8)) * DIVBY8388607; 1078 i--; src++; dst++; 1079 } 1080 1081 if (cvt->filters[++cvt->filter_index]) { 1082 cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS); 1083 } 1084 } 1085 1086 static void SDLCALL 1087 SDL_Convert_F32_to_S8_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format) 1088 { 1089 const float *src = (const float *) cvt->buf; 1090 Sint8 *dst = (Sint8 *) cvt->buf; 1091 int i; 1092 1093 LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S8 (using NEON)"); 1094 1095 /* Get dst aligned to 16 bytes */ 1096 for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) { 1097 const float sample = *src; 1098 if (sample >= 1.0f) { 1099 *dst = 127; 1100 } else if (sample <= -1.0f) { 1101 *dst = -128; 1102 } else { 1103 *dst = (Sint8)(sample * 127.0f); 1104 } 1105 } 1106 1107 SDL_assert(!i || ((((size_t) dst) & 15) == 0)); 1108 1109 /* Make sure src is aligned too. */ 1110 if ((((size_t) src) & 15) == 0) { 1111 /* Aligned! Do NEON blocks as long as we have 16 bytes available. */ 1112 const float32x4_t one = vdupq_n_f32(1.0f); 1113 const float32x4_t negone = vdupq_n_f32(-1.0f); 1114 const float32x4_t mulby127 = vdupq_n_f32(127.0f); 1115 int8_t *mmdst = (int8_t *) dst; 1116 while (i >= 16) { /* 16 * float32 */ 1117 const int32x4_t ints1 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src)), one), mulby127)); /* load 4 floats, clamp, convert to sint32 */ 1118 const int32x4_t ints2 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+4)), one), mulby127)); /* load 4 floats, clamp, convert to sint32 */ 1119 const int32x4_t ints3 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+8)), one), mulby127)); /* load 4 floats, clamp, convert to sint32 */ 1120 const int32x4_t ints4 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+12)), one), mulby127)); /* load 4 floats, clamp, convert to sint32 */ 1121 const int8x8_t i8lo = vmovn_s16(vcombine_s16(vmovn_s32(ints1), vmovn_s32(ints2))); /* narrow to sint16, combine, narrow to sint8 */ 1122 const int8x8_t i8hi = vmovn_s16(vcombine_s16(vmovn_s32(ints3), vmovn_s32(ints4))); /* narrow to sint16, combine, narrow to sint8 */ 1123 vst1q_s8(mmdst, vcombine_s8(i8lo, i8hi)); /* combine to int8x16_t, store out */ 1124 i -= 16; src += 16; mmdst += 16; 1125 } 1126 dst = (Sint8 *) mmdst; 1127 } 1128 1129 /* Finish off any leftovers with scalar operations. */ 1130 while (i) { 1131 const float sample = *src; 1132 if (sample >= 1.0f) { 1133 *dst = 127; 1134 } else if (sample <= -1.0f) { 1135 *dst = -128; 1136 } else { 1137 *dst = (Sint8)(sample * 127.0f); 1138 } 1139 i--; src++; dst++; 1140 } 1141 1142 cvt->len_cvt /= 4; 1143 if (cvt->filters[++cvt->filter_index]) { 1144 cvt->filters[cvt->filter_index](cvt, AUDIO_S8); 1145 } 1146 } 1147 1148 static void SDLCALL 1149 SDL_Convert_F32_to_U8_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format) 1150 { 1151 const float *src = (const float *) cvt->buf; 1152 Uint8 *dst = (Uint8 *) cvt->buf; 1153 int i; 1154 1155 LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U8 (using NEON)"); 1156 1157 /* Get dst aligned to 16 bytes */ 1158 for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) { 1159 const float sample = *src; 1160 if (sample >= 1.0f) { 1161 *dst = 255; 1162 } else if (sample <= -1.0f) { 1163 *dst = 0; 1164 } else { 1165 *dst = (Uint8)((sample + 1.0f) * 127.0f); 1166 } 1167 } 1168 1169 SDL_assert(!i || ((((size_t) dst) & 15) == 0)); 1170 1171 /* Make sure src is aligned too. */ 1172 if ((((size_t) src) & 15) == 0) { 1173 /* Aligned! Do NEON blocks as long as we have 16 bytes available. */ 1174 const float32x4_t one = vdupq_n_f32(1.0f); 1175 const float32x4_t negone = vdupq_n_f32(-1.0f); 1176 const float32x4_t mulby127 = vdupq_n_f32(127.0f); 1177 uint8_t *mmdst = (uint8_t *) dst; 1178 while (i >= 16) { /* 16 * float32 */ 1179 const uint32x4_t uints1 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src)), one), one), mulby127)); /* load 4 floats, clamp, convert to uint32 */ 1180 const uint32x4_t uints2 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+4)), one), one), mulby127)); /* load 4 floats, clamp, convert to uint32 */ 1181 const uint32x4_t uints3 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+8)), one), one), mulby127)); /* load 4 floats, clamp, convert to uint32 */ 1182 const uint32x4_t uints4 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+12)), one), one), mulby127)); /* load 4 floats, clamp, convert to uint32 */ 1183 const uint8x8_t ui8lo = vmovn_u16(vcombine_u16(vmovn_u32(uints1), vmovn_u32(uints2))); /* narrow to uint16, combine, narrow to uint8 */ 1184 const uint8x8_t ui8hi = vmovn_u16(vcombine_u16(vmovn_u32(uints3), vmovn_u32(uints4))); /* narrow to uint16, combine, narrow to uint8 */ 1185 vst1q_u8(mmdst, vcombine_u8(ui8lo, ui8hi)); /* combine to uint8x16_t, store out */ 1186 i -= 16; src += 16; mmdst += 16; 1187 } 1188 1189 dst = (Uint8 *) mmdst; 1190 } 1191 1192 /* Finish off any leftovers with scalar operations. */ 1193 while (i) { 1194 const float sample = *src; 1195 if (sample >= 1.0f) { 1196 *dst = 255; 1197 } else if (sample <= -1.0f) { 1198 *dst = 0; 1199 } else { 1200 *dst = (Uint8)((sample + 1.0f) * 127.0f); 1201 } 1202 i--; src++; dst++; 1203 } 1204 1205 cvt->len_cvt /= 4; 1206 if (cvt->filters[++cvt->filter_index]) { 1207 cvt->filters[cvt->filter_index](cvt, AUDIO_U8); 1208 } 1209 } 1210 1211 static void SDLCALL 1212 SDL_Convert_F32_to_S16_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format) 1213 { 1214 const float *src = (const float *) cvt->buf; 1215 Sint16 *dst = (Sint16 *) cvt->buf; 1216 int i; 1217 1218 LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S16 (using NEON)"); 1219 1220 /* Get dst aligned to 16 bytes */ 1221 for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) { 1222 const float sample = *src; 1223 if (sample >= 1.0f) { 1224 *dst = 32767; 1225 } else if (sample <= -1.0f) { 1226 *dst = -32768; 1227 } else { 1228 *dst = (Sint16)(sample * 32767.0f); 1229 } 1230 } 1231 1232 SDL_assert(!i || ((((size_t) dst) & 15) == 0)); 1233 1234 /* Make sure src is aligned too. */ 1235 if ((((size_t) src) & 15) == 0) { 1236 /* Aligned! Do NEON blocks as long as we have 16 bytes available. */ 1237 const float32x4_t one = vdupq_n_f32(1.0f); 1238 const float32x4_t negone = vdupq_n_f32(-1.0f); 1239 const float32x4_t mulby32767 = vdupq_n_f32(32767.0f); 1240 int16_t *mmdst = (int16_t *) dst; 1241 while (i >= 8) { /* 8 * float32 */ 1242 const int32x4_t ints1 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src)), one), mulby32767)); /* load 4 floats, clamp, convert to sint32 */ 1243 const int32x4_t ints2 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+4)), one), mulby32767)); /* load 4 floats, clamp, convert to sint32 */ 1244 vst1q_s16(mmdst, vcombine_s16(vmovn_s32(ints1), vmovn_s32(ints2))); /* narrow to sint16, combine, store out. */ 1245 i -= 8; src += 8; mmdst += 8; 1246 } 1247 dst = (Sint16 *) mmdst; 1248 } 1249 1250 /* Finish off any leftovers with scalar operations. */ 1251 while (i) { 1252 const float sample = *src; 1253 if (sample >= 1.0f) { 1254 *dst = 32767; 1255 } else if (sample <= -1.0f) { 1256 *dst = -32768; 1257 } else { 1258 *dst = (Sint16)(sample * 32767.0f); 1259 } 1260 i--; src++; dst++; 1261 } 1262 1263 cvt->len_cvt /= 2; 1264 if (cvt->filters[++cvt->filter_index]) { 1265 cvt->filters[cvt->filter_index](cvt, AUDIO_S16SYS); 1266 } 1267 } 1268 1269 static void SDLCALL 1270 SDL_Convert_F32_to_U16_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format) 1271 { 1272 const float *src = (const float *) cvt->buf; 1273 Uint16 *dst = (Uint16 *) cvt->buf; 1274 int i; 1275 1276 LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U16 (using NEON)"); 1277 1278 /* Get dst aligned to 16 bytes */ 1279 for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) { 1280 const float sample = *src; 1281 if (sample >= 1.0f) { 1282 *dst = 65535; 1283 } else if (sample <= -1.0f) { 1284 *dst = 0; 1285 } else { 1286 *dst = (Uint16)((sample + 1.0f) * 32767.0f); 1287 } 1288 } 1289 1290 SDL_assert(!i || ((((size_t) dst) & 15) == 0)); 1291 1292 /* Make sure src is aligned too. */ 1293 if ((((size_t) src) & 15) == 0) { 1294 /* Aligned! Do NEON blocks as long as we have 16 bytes available. */ 1295 const float32x4_t one = vdupq_n_f32(1.0f); 1296 const float32x4_t negone = vdupq_n_f32(-1.0f); 1297 const float32x4_t mulby32767 = vdupq_n_f32(32767.0f); 1298 uint16_t *mmdst = (uint16_t *) dst; 1299 while (i >= 8) { /* 8 * float32 */ 1300 const uint32x4_t uints1 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src)), one), one), mulby32767)); /* load 4 floats, clamp, convert to uint32 */ 1301 const uint32x4_t uints2 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+4)), one), one), mulby32767)); /* load 4 floats, clamp, convert to uint32 */ 1302 vst1q_u16(mmdst, vcombine_u16(vmovn_u32(uints1), vmovn_u32(uints2))); /* narrow to uint16, combine, store out. */ 1303 i -= 8; src += 8; mmdst += 8; 1304 } 1305 dst = (Uint16 *) mmdst; 1306 } 1307 1308 /* Finish off any leftovers with scalar operations. */ 1309 while (i) { 1310 const float sample = *src; 1311 if (sample >= 1.0f) { 1312 *dst = 65535; 1313 } else if (sample <= -1.0f) { 1314 *dst = 0; 1315 } else { 1316 *dst = (Uint16)((sample + 1.0f) * 32767.0f); 1317 } 1318 i--; src++; dst++; 1319 } 1320 1321 cvt->len_cvt /= 2; 1322 if (cvt->filters[++cvt->filter_index]) { 1323 cvt->filters[cvt->filter_index](cvt, AUDIO_U16SYS); 1324 } 1325 } 1326 1327 static void SDLCALL 1328 SDL_Convert_F32_to_S32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format) 1329 { 1330 const float *src = (const float *) cvt->buf; 1331 Sint32 *dst = (Sint32 *) cvt->buf; 1332 int i; 1333 1334 LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S32 (using NEON)"); 1335 1336 /* Get dst aligned to 16 bytes */ 1337 for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) { 1338 const float sample = *src; 1339 if (sample >= 1.0f) { 1340 *dst = 2147483647; 1341 } else if (sample <= -1.0f) { 1342 *dst = (-2147483647) - 1; 1343 } else { 1344 *dst = ((Sint32)(sample * 8388607.0f)) << 8; 1345 } 1346 } 1347 1348 SDL_assert(!i || ((((size_t) dst) & 15) == 0)); 1349 SDL_assert(!i || ((((size_t) src) & 15) == 0)); 1350 1351 { 1352 /* Aligned! Do NEON blocks as long as we have 16 bytes available. */ 1353 const float32x4_t one = vdupq_n_f32(1.0f); 1354 const float32x4_t negone = vdupq_n_f32(-1.0f); 1355 const float32x4_t mulby8388607 = vdupq_n_f32(8388607.0f); 1356 int32_t *mmdst = (int32_t *) dst; 1357 while (i >= 4) { /* 4 * float32 */ 1358 vst1q_s32(mmdst, vshlq_n_s32(vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src)), one), mulby8388607)), 8)); 1359 i -= 4; src += 4; mmdst += 4; 1360 } 1361 dst = (Sint32 *) mmdst; 1362 } 1363 1364 /* Finish off any leftovers with scalar operations. */ 1365 while (i) { 1366 const float sample = *src; 1367 if (sample >= 1.0f) { 1368 *dst = 2147483647; 1369 } else if (sample <= -1.0f) { 1370 *dst = (-2147483647) - 1; 1371 } else { 1372 *dst = ((Sint32)(sample * 8388607.0f)) << 8; 1373 } 1374 i--; src++; dst++; 1375 } 1376 1377 if (cvt->filters[++cvt->filter_index]) { 1378 cvt->filters[cvt->filter_index](cvt, AUDIO_S32SYS); 1379 } 1380 } 1381 #endif 1382 1383 1384 1385 void SDL_ChooseAudioConverters(void) 1386 { 1387 static SDL_bool converters_chosen = SDL_FALSE; 1388 1389 if (converters_chosen) { 1390 return; 1391 } 1392 1393 #define SET_CONVERTER_FUNCS(fntype) \ 1394 SDL_Convert_S8_to_F32 = SDL_Convert_S8_to_F32_##fntype; \ 1395 SDL_Convert_U8_to_F32 = SDL_Convert_U8_to_F32_##fntype; \ 1396 SDL_Convert_S16_to_F32 = SDL_Convert_S16_to_F32_##fntype; \ 1397 SDL_Convert_U16_to_F32 = SDL_Convert_U16_to_F32_##fntype; \ 1398 SDL_Convert_S32_to_F32 = SDL_Convert_S32_to_F32_##fntype; \ 1399 SDL_Convert_F32_to_S8 = SDL_Convert_F32_to_S8_##fntype; \ 1400 SDL_Convert_F32_to_U8 = SDL_Convert_F32_to_U8_##fntype; \ 1401 SDL_Convert_F32_to_S16 = SDL_Convert_F32_to_S16_##fntype; \ 1402 SDL_Convert_F32_to_U16 = SDL_Convert_F32_to_U16_##fntype; \ 1403 SDL_Convert_F32_to_S32 = SDL_Convert_F32_to_S32_##fntype; \ 1404 converters_chosen = SDL_TRUE 1405 1406 #if HAVE_SSE2_INTRINSICS 1407 if (SDL_HasSSE2()) { 1408 SET_CONVERTER_FUNCS(SSE2); 1409 return; 1410 } 1411 #endif 1412 1413 #if HAVE_NEON_INTRINSICS 1414 if (SDL_HasNEON()) { 1415 SET_CONVERTER_FUNCS(NEON); 1416 return; 1417 } 1418 #endif 1419 1420 #if NEED_SCALAR_CONVERTER_FALLBACKS 1421 SET_CONVERTER_FUNCS(Scalar); 1422 #endif 1423 1424 #undef SET_CONVERTER_FUNCS 1425 1426 SDL_assert(converters_chosen == SDL_TRUE); 1427 } 1428 1429 /* vi: set ts=4 sw=4 expandtab: */