// icbc.h v1.05
// A High Quality SIMD BC1 Encoder by Ignacio Castano <castano@gmail.com>.
//
// LICENSE:
//  MIT license at the end of this file.

#ifndef ICBC_H
#define ICBC_H

namespace icbc {

    enum Decoder {
        Decoder_D3D10 = 0,
        Decoder_NVIDIA = 1,
        Decoder_AMD = 2,        // Apple's M1 decoder appears to match AMD's
        Decoder_Intel = 3
    };

    void init(Decoder decoder = Decoder_D3D10);

    enum Quality {
        Quality_Level0,     // Box fit only.
        Quality_Level1,     // Box fit + least squares fit.
        Quality_Level2,     // Cluster fit 4, threshold = 24.
        Quality_Level3,     // Cluster fit 4, threshold = 32.
        Quality_Level4,     // Cluster fit 4, threshold = 48.
        Quality_Level5,     // Cluster fit 4, threshold = 64.
        Quality_Level6,     // Cluster fit 4, threshold = 96.
        Quality_Level7,     // Cluster fit 4, threshold = 128.
        Quality_Level8,     // Cluster fit 4+3, threshold = 256.
        Quality_Level9,     // Cluster fit 4+3, threshold = 256 + Refinement.

        Quality_Fast = Quality_Level1,
        Quality_Default = Quality_Level8,
        Quality_Max = Quality_Level9,
    };

    void decode_bc1(const void * block, unsigned char rgba_block[16 * 4], Decoder decoder = Decoder_D3D10);
    void decode_bc3(const void * block, unsigned char rgba_block[16 * 4], Decoder decoder = Decoder_D3D10);
    //void decode_bc4(const void * block, bool snorm, unsigned char rgba_block[16 * 4], Decoder decoder = Decoder_D3D10);
    //void decode_bc5(const void * block, bool snorm, unsigned char rgba_block[16 * 4], Decoder decoder = Decoder_D3D10);

    float evaluate_bc1_error(const unsigned char rgba_block[16 * 4], const void * block, Decoder decoder = Decoder_D3D10);
    float evaluate_bc3_error(const unsigned char rgba_block[16 * 4], const void * block, bool alpha_blend, Decoder decoder = Decoder_D3D10);
    //float evaluate_bc4_error(const float rgba_block[16 * 4], const void * block, bool snorm, Decoder decoder = Decoder_D3D10);
    //float evaluate_bc5_error(const float rgba_block[16 * 4], const void * block, bool snorm, Decoder decoder = Decoder_D3D10);

    float compress_bc1(Quality level, const float * input_colors, const float * input_weights, const float color_weights[3], bool three_color_mode, bool three_color_black, void * output);
    float compress_bc3(Quality level, const float * input_colors, const float * input_weights, const float color_weights[3], bool six_alpha_mode, void * output);
    //float compress_bc4(Quality level, const float * input_colors, const float * input_weights, bool snorm, bool six_alpha_mode, void * output);
    //float compress_bc5(Quality level, const float * input_colors, const float * input_weights, bool snorm, bool six_alpha_mode, void * output);
}

#endif // ICBC_H
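// A minimal usage sketch (illustrative only, not part of the library; the call pattern
// is assumed from the declarations above): define ICBC_IMPLEMENTATION in exactly one
// translation unit, call icbc::init() once before compressing, then feed compress_bc1()
// one 4x4 block at a time as 16 RGBA floats in [0, 1] plus 16 per-texel weights.
// ICBC_SIMD may be defined before including this file to force a specific code path.
//
//     // #define ICBC_SIMD ICBC_AVX2       // optional: override the auto-detected path
//     #define ICBC_IMPLEMENTATION
//     #include "icbc.h"
//
//     void compress_block(const float rgba[16 * 4], void * bc1_out /* 8 bytes */) {
//         icbc::init(icbc::Decoder_D3D10);
//         float weights[16] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
//         const float color_weights[3] = { 1, 1, 1 };
//         icbc::compress_bc1(icbc::Quality_Default, rgba, weights, color_weights,
//                            /*three_color_mode=*/true, /*three_color_black=*/true, bc1_out);
//     }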

#ifdef ICBC_IMPLEMENTATION

// Instruction level support must be chosen at compile time setting ICBC_SIMD to one of these values:
#define ICBC_SCALAR 0
#define ICBC_SSE2 1
#define ICBC_SSE41 2
#define ICBC_AVX1 3
#define ICBC_AVX2 4
#define ICBC_AVX512 5
#define ICBC_NEON -1
#define ICBC_VMX -2

#if defined(__i386__) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)
#define ICBC_X86 1
#endif

#if defined(__x86_64__) || defined(_M_X64)
#define ICBC_X64 1
#endif

#if defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) || defined(_M_ARM64)
#define ICBC_ARM 1
#endif

#if defined(__aarch64__) || defined(_M_ARM64)
#define ICBC_ARM64 1
#endif

#if (defined(__PPC__) || defined(_M_PPC))
#define ICBC_PPC 1
#endif

// SIMD version.
#ifndef ICBC_SIMD
#if ICBC_X86
#if __AVX512F__
#define ICBC_SIMD ICBC_AVX512
#elif __AVX2__
#define ICBC_SIMD ICBC_AVX2
#elif __AVX__
#define ICBC_SIMD ICBC_AVX1
#elif __SSE4_1__
#define ICBC_SIMD ICBC_SSE41
#elif __SSE2__ || ICBC_X64
#define ICBC_SIMD ICBC_SSE2
#else
#define ICBC_SIMD ICBC_SCALAR
#endif
#endif

#if ICBC_ARM
#if __ARM_NEON__
#define ICBC_SIMD ICBC_NEON
#else
#define ICBC_SIMD ICBC_SCALAR
#endif
#endif

#if ICBC_PPC
#define ICBC_SIMD ICBC_VMX
#endif
#endif

// AVX1 does not require FMA, and depending on whether it's Intel or AMD you may have FMA3 or FMA4. What a mess.
#ifndef ICBC_USE_FMA
//#define ICBC_USE_FMA 3
//#define ICBC_USE_FMA 4
#endif

#if ICBC_SIMD >= ICBC_AVX2
#define ICBC_BMI2 1
#endif

// Apparently rcp is not deterministic (different precision on Intel and AMD), enable if you don't care about that for a small performance boost.
//#define ICBC_USE_RCP 1

#if ICBC_SIMD == ICBC_AVX2
#define ICBC_USE_AVX2_PERMUTE2 1    // Using permutevar8x32 and bitops.
#endif

#if ICBC_SIMD == ICBC_AVX512
#define ICBC_USE_AVX512_PERMUTE 1
#endif

#if ICBC_SIMD == ICBC_NEON
#define ICBC_USE_NEON_VTL 1         // Enable this for a 0.89x performance improvement.
#endif

#if ICBC_SIMD == ICBC_SSE41
#define ICBC_USE_SSSE3_SHUFFLEB 1   // Enable this for 0.88x performance improvement.
#endif


// Some experimental knobs:
#define ICBC_PERFECT_ROUND 0        // Enable perfect rounding to compute cluster fit residual.


#if ICBC_SIMD >= ICBC_SSE2
#include <emmintrin.h>
#endif

#if ICBC_SIMD >= ICBC_SSE41
#include <smmintrin.h>
#endif

#if ICBC_SIMD >= ICBC_AVX1
#include <immintrin.h>
#endif

#if ICBC_SIMD >= ICBC_AVX512 && _MSC_VER
#include <zmmintrin.h>
#endif

#if ICBC_SIMD == ICBC_NEON
#include <arm_neon.h>
#endif

#if ICBC_SIMD == ICBC_VMX
#include <altivec.h>
#endif

#if _MSC_VER
#include <intrin.h> // _BitScanForward
#endif

#include <stdint.h>
#include <stdlib.h> // abs
#include <string.h> // memset
#include <math.h>   // fabsf
#include <float.h>  // FLT_MAX

#ifndef ICBC_ASSERT
#if _DEBUG
#define ICBC_ASSERT assert
#include <assert.h>
#else
#define ICBC_ASSERT(x)
#endif
#endif

namespace icbc {

///////////////////////////////////////////////////////////////////////////////////////////////////
// Basic Templates

template <typename T> inline void swap(T & a, T & b) {
    T temp(a);
    a = b;
    b = temp;
}

template <typename T> inline T max(const T & a, const T & b) {
    return (b < a) ? a : b;
}
template <typename T> inline T min(const T & a, const T & b) {
    return (a < b) ? a : b;
}

template <typename T> inline T clamp(const T & x, const T & a, const T & b) {
    return min(max(x, a), b);
}

template <typename T> inline T square(const T & a) {
    return a * a;
}


///////////////////////////////////////////////////////////////////////////////////////////////////
// Basic Types

typedef uint8_t uint8;
typedef int8_t int8;
typedef uint16_t uint16;
typedef uint32_t uint32;
typedef uint32_t uint;


struct Color16 {
    union {
        struct {
            uint16 b : 5;
            uint16 g : 6;
            uint16 r : 5;
        };
        uint16 u;
    };
};

struct Color32 {
    union {
        struct {
            uint8 b, g, r, a;
        };
        uint32 u;
    };
};

struct BlockBC1 {
    Color16 col0;
    Color16 col1;
    uint32 indices;
};

struct BlockBC4 {
    uint8 alpha0;
    uint8 alpha1;
    uint8 indices[6];
};

struct BlockBC3 {
    BlockBC4 alpha;
    BlockBC1 rgb;
};

struct BlockBC5 {
    BlockBC4 x;
    BlockBC4 y;     // Two independent BC4-format channel blocks.
};



struct Vector3 {
    float x;
    float y;
    float z;

    inline void operator+=(Vector3 v) {
        x += v.x; y += v.y; z += v.z;
    }
    inline void operator*=(Vector3 v) {
        x *= v.x; y *= v.y; z *= v.z;
    }
    inline void operator*=(float s) {
        x *= s; y *= s; z *= s;
    }
};

struct Vector4 {
    union {
        struct {
            float x, y, z, w;
        };
        Vector3 xyz;
    };
};


inline Vector3 operator*(Vector3 v, float s) {
    return { v.x * s, v.y * s, v.z * s };
}

inline Vector3 operator*(float s, Vector3 v) {
    return { v.x * s, v.y * s, v.z * s };
}

inline Vector3 operator*(Vector3 a, Vector3 b) {
    return { a.x * b.x, a.y * b.y, a.z * b.z };
}

inline float dot(Vector3 a, Vector3 b) {
    return a.x * b.x + a.y * b.y + a.z * b.z;
}

inline Vector3 operator+(Vector3 a, Vector3 b) {
    return { a.x + b.x, a.y + b.y, a.z + b.z };
}

inline Vector3 operator-(Vector3 a, Vector3 b) {
    return { a.x - b.x, a.y - b.y, a.z - b.z };
}

inline Vector3 operator/(Vector3 v, float s) {
    return { v.x / s, v.y / s, v.z / s };
}

inline float saturate(float x) {
    return clamp(x, 0.0f, 1.0f);
}

inline Vector3 saturate(Vector3 v) {
    return { saturate(v.x), saturate(v.y), saturate(v.z) };
}

inline Vector3 min(Vector3 a, Vector3 b) {
    return { min(a.x, b.x), min(a.y, b.y), min(a.z, b.z) };
}

inline Vector3 max(Vector3 a, Vector3 b) {
    return { max(a.x, b.x), max(a.y, b.y), max(a.z, b.z) };
}

inline bool operator==(const Vector3 & a, const Vector3 & b) {
    return a.x == b.x && a.y == b.y && a.z == b.z;
}

inline Vector3 scalar_to_vector3(float f) {
    return {f, f, f};
}

inline float lengthSquared(Vector3 v) {
    return dot(v, v);
}

inline bool equal(float a, float b, float epsilon = 0.0001) {
    // http://realtimecollisiondetection.net/blog/?p=89
    //return fabsf(a - b) < epsilon * max(1.0f, max(fabsf(a), fabsf(b)));
    return fabsf(a - b) < epsilon;
}

inline bool equal(Vector3 a, Vector3 b, float epsilon) {
    return equal(a.x, b.x, epsilon) && equal(a.y, b.y, epsilon) && equal(a.z, b.z, epsilon);
}

///////////////////////////////////////////////////////////////////////////////////////////////////
// SIMD

#ifndef ICBC_ALIGN
#if __GNUC__
#   define ICBC_ALIGN __attribute__ ((__aligned__ (icbc::VEC_SIZE*4)))
#else // _MSC_VER
#   define ICBC_ALIGN __declspec(align(icbc::VEC_SIZE*4))
#endif
#endif

#if __GNUC__
#define ICBC_FORCEINLINE inline __attribute__((always_inline))
#else
#define ICBC_FORCEINLINE __forceinline
#endif


// Count trailing zeros (BSF).
ICBC_FORCEINLINE int ctz(uint mask) {
#if __GNUC__
    return __builtin_ctz(mask);
#else
    unsigned long index;
    _BitScanForward(&index, mask);
    return (int)index;
#endif
}


#if ICBC_SIMD == ICBC_SCALAR    // Purely scalar version.

constexpr int VEC_SIZE = 1;

using VFloat = float;
using VMask = bool;

ICBC_FORCEINLINE float & lane(VFloat & v, int i) { return v; }
ICBC_FORCEINLINE VFloat vzero() { return 0.0f; }
ICBC_FORCEINLINE VFloat vbroadcast(float x) { return x; }
ICBC_FORCEINLINE VFloat vload(const float * ptr) { return *ptr; }
ICBC_FORCEINLINE VFloat vrcp(VFloat a) { return 1.0f / a; }
ICBC_FORCEINLINE VFloat vmadd(VFloat a, VFloat b, VFloat c) { return a * b + c; }
ICBC_FORCEINLINE VFloat vmsub(VFloat a, VFloat b, VFloat c) { return a * b - c; }
ICBC_FORCEINLINE VFloat vm2sub(VFloat a, VFloat b, VFloat c, VFloat d) { return a * b - c * d; }
ICBC_FORCEINLINE VFloat vsaturate(VFloat a) { return min(max(a, 0.0f), 1.0f); }
ICBC_FORCEINLINE VFloat vround01(VFloat a) { return float(int(a + 0.5f)); }
ICBC_FORCEINLINE VFloat lane_id() { return 0; }
ICBC_FORCEINLINE VFloat vselect(VMask mask, VFloat a, VFloat b) { return mask ? b : a; }
ICBC_FORCEINLINE VMask vbroadcast(bool b) { return b; }
ICBC_FORCEINLINE bool all(VMask m) { return m; }
ICBC_FORCEINLINE bool any(VMask m) { return m; }
ICBC_FORCEINLINE uint mask(VMask m) { return (uint)m; }
ICBC_FORCEINLINE int reduce_min_index(VFloat v) { return 0; }
ICBC_FORCEINLINE void vtranspose4(VFloat & a, VFloat & b, VFloat & c, VFloat & d) {}
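// The scalar path above doubles as the reference semantics that every backend below is
// expected to match: a VFloat holds VEC_SIZE lanes, comparisons produce a VMask, and
// vselect(mask, a, b) returns b in the lanes where the mask is set and a elsewhere.
// A tiny sketch of the contract (illustrative only, shown for a 4-wide backend):
//
//     VFloat x = lane_id();                 // lanes hold 0, 1, 2, 3
//     VMask  m = x > vbroadcast(1.5f);      // true in lanes 2 and 3
//     VFloat y = vselect(m, vzero(), x);    // lanes 2 and 3 keep x, the rest become 0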
#elif ICBC_SIMD == ICBC_SSE2 || ICBC_SIMD == ICBC_SSE41

constexpr int VEC_SIZE = 4;

#if __GNUC__
// GCC needs a struct so that we can overload operators.
union VFloat {
    __m128 v;
    float m128_f32[VEC_SIZE];

    VFloat() {}
    VFloat(__m128 v) : v(v) {}
    operator __m128 & () { return v; }
};
union VMask {
    __m128 m;

    VMask() {}
    VMask(__m128 m) : m(m) {}
    operator __m128 & () { return m; }
};
#else
using VFloat = __m128;
using VMask = __m128;
#endif

ICBC_FORCEINLINE float & lane(VFloat & v, int i) {
    return v.m128_f32[i];
}

ICBC_FORCEINLINE VFloat vzero() {
    return _mm_setzero_ps();
}

ICBC_FORCEINLINE VFloat vbroadcast(float x) {
    return _mm_set1_ps(x);
}

ICBC_FORCEINLINE VFloat vload(const float * ptr) {
    return _mm_load_ps(ptr);
}

ICBC_FORCEINLINE VFloat vgather(const float * base, VFloat index) {
    VFloat v;
    for (int i = 0; i < VEC_SIZE; i++) {
        lane(v, i) = base[int(lane(index, i))];
    }
    return v;
}

ICBC_FORCEINLINE VFloat operator+(VFloat a, VFloat b) {
    return _mm_add_ps(a, b);
}

ICBC_FORCEINLINE VFloat operator-(VFloat a, VFloat b) {
    return _mm_sub_ps(a, b);
}

ICBC_FORCEINLINE VFloat operator*(VFloat a, VFloat b) {
    return _mm_mul_ps(a, b);
}

ICBC_FORCEINLINE VFloat vrcp(VFloat a) {
#if ICBC_USE_RCP
    VFloat r = _mm_rcp_ps(a);
    return _mm_mul_ps(r, _mm_sub_ps(vbroadcast(2.0f), _mm_mul_ps(r, a)));   // r * (2 - r * a)
#else
    return _mm_div_ps(vbroadcast(1.0f), a);
#endif
}

// a*b+c
ICBC_FORCEINLINE VFloat vmadd(VFloat a, VFloat b, VFloat c) {
    return a * b + c;
}

ICBC_FORCEINLINE VFloat vmsub(VFloat a, VFloat b, VFloat c) {
    return a * b - c;
}

ICBC_FORCEINLINE VFloat vm2sub(VFloat a, VFloat b, VFloat c, VFloat d) {
    return a * b - c * d;
}

ICBC_FORCEINLINE VFloat vsaturate(VFloat a) {
    auto zero = _mm_setzero_ps();
    auto one = _mm_set1_ps(1.0f);
    return _mm_min_ps(_mm_max_ps(a, zero), one);
}

// Assumes a is in [0, 1] range.
ICBC_FORCEINLINE VFloat vround01(VFloat a) {
#if ICBC_SIMD == ICBC_SSE41
    return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
#else
    return _mm_cvtepi32_ps(_mm_cvttps_epi32(a + vbroadcast(0.5f)));
#endif
}

ICBC_FORCEINLINE VFloat vtruncate(VFloat a) {
#if ICBC_SIMD == ICBC_SSE41
    return _mm_round_ps(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
#else
    return _mm_cvtepi32_ps(_mm_cvttps_epi32(a));
#endif
}

ICBC_FORCEINLINE VFloat lane_id() {
    return _mm_set_ps(3, 2, 1, 0);
}

ICBC_FORCEINLINE VMask operator> (VFloat A, VFloat B) { return _mm_cmpgt_ps(A, B); }
ICBC_FORCEINLINE VMask operator>=(VFloat A, VFloat B) { return _mm_cmpge_ps(A, B); }
ICBC_FORCEINLINE VMask operator< (VFloat A, VFloat B) { return _mm_cmplt_ps(A, B); }
ICBC_FORCEINLINE VMask operator<=(VFloat A, VFloat B) { return _mm_cmple_ps(A, B); }

ICBC_FORCEINLINE VMask operator| (VMask A, VMask B) { return _mm_or_ps(A, B); }
ICBC_FORCEINLINE VMask operator& (VMask A, VMask B) { return _mm_and_ps(A, B); }
ICBC_FORCEINLINE VMask operator^ (VMask A, VMask B) { return _mm_xor_ps(A, B); }
// mask ? b : a
ICBC_FORCEINLINE VFloat vselect(VMask mask, VFloat a, VFloat b) {
#if ICBC_SIMD == ICBC_SSE41
    return _mm_blendv_ps(a, b, mask);
#else
    return _mm_or_ps(_mm_andnot_ps(mask, a), _mm_and_ps(mask, b));
#endif
}

ICBC_FORCEINLINE VMask vbroadcast(bool b) {
    return _mm_castsi128_ps(_mm_set1_epi32(-int32_t(b)));
}

ICBC_FORCEINLINE bool all(VMask m) {
    int value = _mm_movemask_ps(m);
    return value == 0xF;    // All four lanes set.
}

ICBC_FORCEINLINE bool any(VMask m) {
    int value = _mm_movemask_ps(m);
    return value != 0;
}

ICBC_FORCEINLINE uint mask(VMask m) {
    return (uint)_mm_movemask_ps(m);
}

ICBC_FORCEINLINE int reduce_min_index(VFloat v) {

    // First do a horizontal reduction.                             // v    = [ D C | B A ]
    VFloat shuf = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 3, 0, 1));    //        [ C D | A B ]
    VFloat mins = _mm_min_ps(v, shuf);                              // mins = [ min(D,C) min(C,D) | min(B,A) min(A,B) ]
    shuf = _mm_movehl_ps(shuf, mins);                               //        [ C D | min(D,C) min(C,D) ]   // let the compiler avoid a mov by reusing shuf
    mins = _mm_min_ss(mins, shuf);
    mins = _mm_shuffle_ps(mins, mins, _MM_SHUFFLE(0, 0, 0, 0));

    // Then find the index.
    uint mask = _mm_movemask_ps(v <= mins);
    return ctz(mask);
}

// https://gcc.gnu.org/legacy-ml/gcc-patches/2005-10/msg00324.html
ICBC_FORCEINLINE void vtranspose4(VFloat & r0, VFloat & r1, VFloat & r2, VFloat & r3) {
    VFloat t0 = _mm_unpacklo_ps(r0, r1);
    VFloat t1 = _mm_unpacklo_ps(r2, r3);
    VFloat t2 = _mm_unpackhi_ps(r0, r1);
    VFloat t3 = _mm_unpackhi_ps(r2, r3);
    r0 = _mm_movelh_ps(t0, t1);
    r1 = _mm_movehl_ps(t1, t0);
    r2 = _mm_movelh_ps(t2, t3);
    r3 = _mm_movehl_ps(t3, t2);
}


#if ICBC_SIMD == ICBC_SSE41

ICBC_FORCEINLINE VFloat vpermute2(VFloat tab0, VFloat tab1, __m128i idx) {

    // @@ Precompute this:
    tab1 = _mm_xor_ps(tab1, tab0);

    __m128i result = _mm_shuffle_epi8(_mm_castps_si128(tab0), idx);
    idx = _mm_sub_epi8(idx, _mm_set1_epi8(16));

    result = _mm_xor_si128(result, _mm_shuffle_epi8(_mm_castps_si128(tab1), idx));

    return _mm_castsi128_ps(result);
}

ICBC_FORCEINLINE VFloat vpermute4(VFloat tab0, VFloat tab1, VFloat tab2, VFloat tab3, __m128i idx) {

    // @@ Precompute this:
    tab3 = _mm_xor_ps(tab3, tab2);
    tab2 = _mm_xor_ps(tab2, tab1);
    tab1 = _mm_xor_ps(tab1, tab0);

    __m128i result = _mm_shuffle_epi8(_mm_castps_si128(tab0), idx);
    idx = _mm_sub_epi8(idx, _mm_set1_epi8(16));

    result = _mm_xor_si128(result, _mm_shuffle_epi8(_mm_castps_si128(tab1), idx));
    idx = _mm_sub_epi8(idx, _mm_set1_epi8(16));

    result = _mm_xor_si128(result, _mm_shuffle_epi8(_mm_castps_si128(tab2), idx));
    idx = _mm_sub_epi8(idx, _mm_set1_epi8(16));

    result = _mm_xor_si128(result, _mm_shuffle_epi8(_mm_castps_si128(tab3), idx));

    return _mm_castsi128_ps(result);
}

#endif


#elif ICBC_SIMD == ICBC_AVX1 || ICBC_SIMD == ICBC_AVX2

constexpr int VEC_SIZE = 8;

#if __GNUC__
union VFloat {
    __m256 v;
    float m256_f32[VEC_SIZE];

    VFloat() {}
    VFloat(__m256 v) : v(v) {}
    operator __m256 & () { return v; }
};
union VInt {
    __m256i v;
    int m256_i32[VEC_SIZE];

    VInt() {}
    VInt(__m256i v) : v(v) {}
    operator __m256i & () { return v; }
};
union VMask {
    __m256 m;

    VMask() {}
    VMask(__m256 m) : m(m) {}
    operator __m256 & () { return m; }
};
#else
using VFloat = __m256;
using VInt = __m256i;
using VMask = __m256;   // Emulate mask vector using packed float.
#endif

ICBC_FORCEINLINE float & lane(VFloat & v, int i) {
    return v.m256_f32[i];
}

ICBC_FORCEINLINE VFloat vzero() {
    return _mm256_setzero_ps();
}

ICBC_FORCEINLINE VFloat vbroadcast(float a) {
    return _mm256_set1_ps(a);
}

ICBC_FORCEINLINE VFloat vload(const float * ptr) {
    return _mm256_load_ps(ptr);
}

ICBC_FORCEINLINE VFloat vgather(const float * base, VFloat index) {
#if ICBC_SIMD == ICBC_AVX2
    return _mm256_i32gather_ps(base, _mm256_cvtps_epi32(index), 4);
#else
    VFloat v;
    for (int i = 0; i < VEC_SIZE; i++) {
        lane(v, i) = base[int(lane(index, i))];
    }
    return v;
#endif
}

ICBC_FORCEINLINE VFloat operator+(VFloat a, VFloat b) {
    return _mm256_add_ps(a, b);
}

ICBC_FORCEINLINE VFloat operator-(VFloat a, VFloat b) {
    return _mm256_sub_ps(a, b);
}

ICBC_FORCEINLINE VFloat operator*(VFloat a, VFloat b) {
    return _mm256_mul_ps(a, b);
}

ICBC_FORCEINLINE VFloat vrcp(VFloat a) {
#if ICBC_USE_RCP
#if ICBC_SIMD == ICBC_AVX512
    VFloat r = _mm256_rcp14_ps(a);
#else
    VFloat r = _mm256_rcp_ps(a);
#endif

    // r = r * (2 - r * a)
#if ICBC_USE_FMA == 3 || ICBC_SIMD == ICBC_AVX2
    return _mm256_mul_ps(r, _mm256_fnmadd_ps(r, a, vbroadcast(2.0f)));
#else
    return _mm256_mul_ps(r, _mm256_sub_ps(vbroadcast(2.0f), _mm256_mul_ps(r, a)));
#endif
#else
    return _mm256_div_ps(vbroadcast(1.0f), a);
#endif
}

// a*b+c
ICBC_FORCEINLINE VFloat vmadd(VFloat a, VFloat b, VFloat c) {
#if ICBC_USE_FMA == 3 || ICBC_SIMD == ICBC_AVX2
    return _mm256_fmadd_ps(a, b, c);
#elif ICBC_USE_FMA == 4
    return _mm256_macc_ps(a, b, c);
#else
    return ((a * b) + c);
#endif
}

ICBC_FORCEINLINE VFloat vmsub(VFloat a, VFloat b, VFloat c) {
#if ICBC_USE_FMA == 3 || ICBC_SIMD == ICBC_AVX2
    return _mm256_fmsub_ps(a, b, c);
#elif ICBC_USE_FMA == 4
    return _mm256_msub_ps(a, b, c);
#else
    return ((a * b) - c);
#endif
}

ICBC_FORCEINLINE VFloat vm2sub(VFloat a, VFloat b, VFloat c, VFloat d) {
    return vmsub(a, b, c * d);
}

ICBC_FORCEINLINE VFloat vsaturate(VFloat a) {
    __m256 zero = _mm256_setzero_ps();
    __m256 one = _mm256_set1_ps(1.0f);
    return _mm256_min_ps(_mm256_max_ps(a, zero), one);
}

ICBC_FORCEINLINE VFloat vround01(VFloat a) {
    return _mm256_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}

ICBC_FORCEINLINE VFloat vtruncate(VFloat a) {
    return _mm256_round_ps(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
}

ICBC_FORCEINLINE VFloat lane_id() {
    return _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0);
}

ICBC_FORCEINLINE VMask operator> (VFloat A, VFloat B) { return _mm256_cmp_ps(A, B, _CMP_GT_OQ); }
ICBC_FORCEINLINE VMask operator>=(VFloat A, VFloat B) { return _mm256_cmp_ps(A, B, _CMP_GE_OQ); }
ICBC_FORCEINLINE VMask operator< (VFloat A, VFloat B) { return _mm256_cmp_ps(A, B, _CMP_LT_OQ); }
ICBC_FORCEINLINE VMask operator<=(VFloat A, VFloat B) { return _mm256_cmp_ps(A, B, _CMP_LE_OQ); }

ICBC_FORCEINLINE VMask operator| (VMask A, VMask B) { return _mm256_or_ps(A, B); }
ICBC_FORCEINLINE VMask operator& (VMask A, VMask B) { return _mm256_and_ps(A, B); }
ICBC_FORCEINLINE VMask operator^ (VMask A, VMask B) { return _mm256_xor_ps(A, B); }
// mask ? b : a
ICBC_FORCEINLINE VFloat vselect(VMask mask, VFloat a, VFloat b) {
    return _mm256_blendv_ps(a, b, mask);
}

ICBC_FORCEINLINE VMask vbroadcast(bool b) {
    return _mm256_castsi256_ps(_mm256_set1_epi32(-int32_t(b)));
}

ICBC_FORCEINLINE bool all(VMask m) {
    __m256 zero = _mm256_setzero_ps();
    return _mm256_testc_ps(_mm256_cmp_ps(zero, zero, _CMP_EQ_UQ), m) == 0;
}

ICBC_FORCEINLINE bool any(VMask m) {
    return _mm256_testz_ps(m, m) == 0;
}

ICBC_FORCEINLINE uint mask(VMask m) {
    return (uint)_mm256_movemask_ps(m);
}

// This is missing on some GCC versions.
#if !defined _mm256_set_m128
#define _mm256_set_m128(hi, lo) _mm256_insertf128_ps(_mm256_castps128_ps256(lo), (hi), 0x1)
#endif

ICBC_FORCEINLINE int reduce_min_index(VFloat v) {

    __m128 vlow  = _mm256_castps256_ps128(v);
    __m128 vhigh = _mm256_extractf128_ps(v, 1);
    vlow = _mm_min_ps(vlow, vhigh);

    // First do a horizontal reduction.                                 // v    = [ D C | B A ]
    __m128 shuf = _mm_shuffle_ps(vlow, vlow, _MM_SHUFFLE(2, 3, 0, 1));  //        [ C D | A B ]
    __m128 mins = _mm_min_ps(vlow, shuf);                               // mins = [ min(D,C) min(C,D) | min(B,A) min(A,B) ]
    shuf = _mm_movehl_ps(shuf, mins);                                   //        [ C D | min(D,C) min(C,D) ]
    mins = _mm_min_ss(mins, shuf);

    VFloat vmin = _mm256_permute_ps(_mm256_set_m128(mins, mins), 0);    // _MM256_PERMUTE(0, 0, 0, 0, 0, 0, 0, 0)

    // Then find the index.
    uint mask = _mm256_movemask_ps(v <= vmin);
    return ctz(mask);
}

// AoS to SoA
ICBC_FORCEINLINE void vtranspose4(VFloat & a, VFloat & b, VFloat & c, VFloat & d) {
    VFloat r0 = _mm256_unpacklo_ps(a, b);
    VFloat r1 = _mm256_unpacklo_ps(c, d);
    VFloat r2 = _mm256_permute2f128_ps(r0, r1, 0x20);
    VFloat r3 = _mm256_permute2f128_ps(r0, r1, 0x31);
    r0 = _mm256_unpackhi_ps(a, b);
    r1 = _mm256_unpackhi_ps(c, d);
    a = _mm256_unpacklo_ps(r2, r3);
    b = _mm256_unpackhi_ps(r2, r3);
    r2 = _mm256_permute2f128_ps(r0, r1, 0x20);
    r3 = _mm256_permute2f128_ps(r0, r1, 0x31);
    c = _mm256_unpacklo_ps(r2, r3);
    d = _mm256_unpackhi_ps(r2, r3);
}

ICBC_FORCEINLINE VInt vzeroi() {
    return _mm256_setzero_si256();
}

ICBC_FORCEINLINE VInt vbroadcast(int a) {
    return _mm256_set1_epi32(a);
}

ICBC_FORCEINLINE VInt vload(const int * ptr) {
    return _mm256_load_si256((const __m256i*)ptr);
}

ICBC_FORCEINLINE VInt operator- (VInt A, int b) { return _mm256_sub_epi32(A, _mm256_set1_epi32(b)); }
ICBC_FORCEINLINE VInt operator& (VInt A, int b) { return _mm256_and_si256(A, _mm256_set1_epi32(b)); }
ICBC_FORCEINLINE VInt operator>> (VInt A, int b) { return _mm256_srli_epi32(A, b); }

ICBC_FORCEINLINE VMask operator> (VInt A, int b) { return _mm256_castsi256_ps(_mm256_cmpgt_epi32(A, _mm256_set1_epi32(b))); }
ICBC_FORCEINLINE VMask operator>= (VInt A, int b) { return _mm256_castsi256_ps(_mm256_cmpgt_epi32(A, _mm256_set1_epi32(b-1))); }
ICBC_FORCEINLINE VMask operator== (VInt A, int b) { return _mm256_castsi256_ps(_mm256_cmpeq_epi32(A, _mm256_set1_epi32(b))); }

// mask ? v[idx] : 0
ICBC_FORCEINLINE VFloat vpermuteif(VMask mask, VFloat v, VInt idx) {
    return _mm256_and_ps(_mm256_permutevar8x32_ps(v, idx), mask);
}

// mask ? (idx > 7 ? vhi[idx] : vlo[idx]) : 0
ICBC_FORCEINLINE VFloat vpermute2if(VMask mask, VFloat vlo, VFloat vhi, VInt idx) {
#if 0
    VMask mhi = idx > 7;
    vlo = _mm256_permutevar8x32_ps(vlo, idx);
    vhi = _mm256_permutevar8x32_ps(vhi, idx);
    VFloat v = _mm256_blendv_ps(vlo, vhi, mhi);
    return _mm256_and_ps(v, mask);
#else
    // Fabian Giesen says not to mix _mm256_blendv_ps and _mm256_permutevar8x32_ps since they contend for the same gates and instead suggests the following:
    vhi = _mm256_xor_ps(vhi, vlo);
    VFloat v = _mm256_permutevar8x32_ps(vlo, idx);
    VMask mhi = idx > 7;
    v = _mm256_xor_ps(v, _mm256_and_ps(_mm256_permutevar8x32_ps(vhi, idx), mhi));
    return _mm256_and_ps(v, mask);
#endif
}


#elif ICBC_SIMD == ICBC_AVX512

constexpr int VEC_SIZE = 16;

#if __GNUC__
union VFloat {
    __m512 v;
    float m512_f32[VEC_SIZE];

    VFloat() {}
    VFloat(__m512 v) : v(v) {}
    operator __m512 & () { return v; }
};
union VInt {
    __m512i v;
    int m512i_i32[VEC_SIZE];

    VInt() {}
    VInt(__m512i v) : v(v) {}
    operator __m512i & () { return v; }
};
#else
using VFloat = __m512;
using VInt = __m512i;
#endif
struct VMask { __mmask16 m; };

ICBC_FORCEINLINE float & lane(VFloat & v, int i) {
    return v.m512_f32[i];
}

ICBC_FORCEINLINE VFloat vzero() {
    return _mm512_setzero_ps();
}

ICBC_FORCEINLINE VFloat vbroadcast(float a) {
    return _mm512_set1_ps(a);
}

ICBC_FORCEINLINE VFloat vload(const float * ptr) {
    return _mm512_load_ps(ptr);
}

ICBC_FORCEINLINE VFloat vload(VMask mask, const float * ptr) {
    return _mm512_mask_load_ps(_mm512_undefined(), mask.m, ptr);
}

ICBC_FORCEINLINE VFloat vload(VMask mask, const float * ptr, float fallback) {
    return _mm512_mask_load_ps(_mm512_set1_ps(fallback), mask.m, ptr);
}

ICBC_FORCEINLINE VFloat vgather(const float * base, VFloat index) {
    return _mm512_i32gather_ps(_mm512_cvtps_epi32(index), base, 4);
}

ICBC_FORCEINLINE VFloat operator+(VFloat a, VFloat b) {
    return _mm512_add_ps(a, b);
}

ICBC_FORCEINLINE VFloat operator-(VFloat a, VFloat b) {
    return _mm512_sub_ps(a, b);
}

ICBC_FORCEINLINE VFloat operator*(VFloat a, VFloat b) {
    return _mm512_mul_ps(a, b);
}

ICBC_FORCEINLINE VFloat vrcp(VFloat a) {
#if ICBC_USE_RCP
    VFloat r = _mm512_rcp14_ps(a);

    // r = r * (2 - r * a)
    return _mm512_mul_ps(r, _mm512_fnmadd_ps(r, a, vbroadcast(2.0f)));
#else
    return _mm512_div_ps(vbroadcast(1.0f), a);
#endif
}

// a*b+c
ICBC_FORCEINLINE VFloat vmadd(VFloat a, VFloat b, VFloat c) {
    return _mm512_fmadd_ps(a, b, c);
}

ICBC_FORCEINLINE VFloat vmsub(VFloat a, VFloat b, VFloat c) {
    return _mm512_fmsub_ps(a, b, c);
}

ICBC_FORCEINLINE VFloat vm2sub(VFloat a, VFloat b, VFloat c, VFloat d) {
    return vmsub(a, b, c * d);
}

ICBC_FORCEINLINE VFloat vsaturate(VFloat a) {
    auto zero = _mm512_setzero_ps();
    auto one = _mm512_set1_ps(1.0f);
    return _mm512_min_ps(_mm512_max_ps(a, zero), one);
}

ICBC_FORCEINLINE VFloat vround01(VFloat a) {
    return _mm512_roundscale_ps(a, _MM_FROUND_TO_NEAREST_INT);
}

ICBC_FORCEINLINE VFloat vtruncate(VFloat a) {
    return _mm512_roundscale_ps(a, _MM_FROUND_TO_ZERO);
}

ICBC_FORCEINLINE VFloat lane_id() {
    return _mm512_set_ps(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
}

ICBC_FORCEINLINE VMask operator> (VFloat A, VFloat B) { return { _mm512_cmp_ps_mask(A, B, _CMP_GT_OQ) }; }
ICBC_FORCEINLINE VMask operator>=(VFloat A, VFloat B) { return { _mm512_cmp_ps_mask(A, B, _CMP_GE_OQ) }; }
ICBC_FORCEINLINE VMask operator< (VFloat A, VFloat B) { return { _mm512_cmp_ps_mask(A, B, _CMP_LT_OQ) }; }
ICBC_FORCEINLINE VMask operator<=(VFloat A, VFloat B) { return { _mm512_cmp_ps_mask(A, B, _CMP_LE_OQ) }; }

ICBC_FORCEINLINE VMask operator! (VMask A) { return { _mm512_knot(A.m) }; }
ICBC_FORCEINLINE VMask operator| (VMask A, VMask B) { return { _mm512_kor(A.m, B.m) }; }
ICBC_FORCEINLINE VMask operator& (VMask A, VMask B) { return { _mm512_kand(A.m, B.m) }; }
ICBC_FORCEINLINE VMask operator^ (VMask A, VMask B) { return { _mm512_kxor(A.m, B.m) }; }

// mask ? b : a
ICBC_FORCEINLINE VFloat vselect(VMask mask, VFloat a, VFloat b) {
    return _mm512_mask_blend_ps(mask.m, a, b);
}

ICBC_FORCEINLINE VMask vbroadcast(bool b) {
    return { __mmask16(-int16_t(b)) };
}

ICBC_FORCEINLINE bool all(VMask mask) {
    return mask.m == 0xFFFF;    // All 16 lanes set.
}

ICBC_FORCEINLINE bool any(VMask mask) {
    return mask.m != 0;
}

ICBC_FORCEINLINE uint mask(VMask mask) {
    return mask.m;
}

ICBC_FORCEINLINE int reduce_min_index(VFloat v) {

    // First do a horizontal reduction.
    VFloat vmin = vbroadcast(_mm512_reduce_min_ps(v));

    // Then find the index.
    VMask mask = (v <= vmin);
    return ctz(mask.m);
}

//ICBC_FORCEINLINE void vtranspose4(VFloat & a, VFloat & b, VFloat & c, VFloat & d); // @@

ICBC_FORCEINLINE int lane(VInt v, int i) {
    //return _mm256_extract_epi32(v, i);
    return v.m512i_i32[i];
}

ICBC_FORCEINLINE VInt vzeroi() {
    return _mm512_setzero_epi32();
}

ICBC_FORCEINLINE VInt vbroadcast(int a) {
    return _mm512_set1_epi32(a);
}

ICBC_FORCEINLINE VInt vload(const int * ptr) {
    return _mm512_load_epi32(ptr);
}

ICBC_FORCEINLINE VInt operator- (VInt A, int b) { return _mm512_sub_epi32(A, vbroadcast(b)); }
ICBC_FORCEINLINE VInt operator& (VInt A, int b) { return _mm512_and_epi32(A, vbroadcast(b)); }
ICBC_FORCEINLINE VInt operator>> (VInt A, int b) { return _mm512_srli_epi32(A, b); }

ICBC_FORCEINLINE VMask operator> (VInt A, int b) { return { _mm512_cmpgt_epi32_mask(A, vbroadcast(b)) }; }
ICBC_FORCEINLINE VMask operator>=(VInt A, int b) { return { _mm512_cmpge_epi32_mask(A, vbroadcast(b)) }; }
ICBC_FORCEINLINE VMask operator== (VInt A, int b) { return { _mm512_cmpeq_epi32_mask(A, vbroadcast(b)) }; }
// mask ? v[idx] : 0
ICBC_FORCEINLINE VFloat vpermuteif(VMask mask, VFloat v, VInt idx) {
    return _mm512_maskz_permutexvar_ps(mask.m, idx, v);
}


#elif ICBC_SIMD == ICBC_NEON

constexpr int VEC_SIZE = 4;

#if __GNUC__
union VFloat {
    float32x4_t v;
    float e[4];
    VFloat() {}
    VFloat(float32x4_t v) : v(v) {}
    operator float32x4_t & () { return v; }
};
struct VMask {
    uint32x4_t v;
    VMask() {}
    VMask(uint32x4_t v) : v(v) {}
    operator uint32x4_t & () { return v; }
};
#else
using VFloat = float32x4_t;
using VMask = uint32x4_t;
#endif

ICBC_FORCEINLINE float & lane(VFloat & v, int i) {
#if defined(__clang__)
    return v.e[i];
#elif defined(_MSC_VER)
    return v.n128_f32[i];
#else
    return v.v[i];
#endif
}

ICBC_FORCEINLINE VFloat vzero() {
    return vdupq_n_f32(0.0f);
}

ICBC_FORCEINLINE VFloat vbroadcast(float a) {
    return vdupq_n_f32(a);
}

ICBC_FORCEINLINE VFloat vload(const float * ptr) {
    return vld1q_f32(ptr);
}

ICBC_FORCEINLINE VFloat operator+(VFloat a, VFloat b) {
    return vaddq_f32(a, b);
}

ICBC_FORCEINLINE VFloat operator-(VFloat a, VFloat b) {
    return vsubq_f32(a, b);
}

ICBC_FORCEINLINE VFloat operator*(VFloat a, VFloat b) {
    return vmulq_f32(a, b);
}

ICBC_FORCEINLINE VFloat vrcp(VFloat a) {
#if ICBC_USE_RCP
    return vrecpeq_f32(a);
#else
    //VFloat rcp = vrecpeq_f32(a);
    //return vmulq_f32(vrecpsq_f32(a, rcp), rcp);
    return vdivq_f32(vbroadcast(1.0f), a);
#endif
}

// a*b+c
ICBC_FORCEINLINE VFloat vmadd(VFloat a, VFloat b, VFloat c) {
    return vmlaq_f32(c, a, b);
}

ICBC_FORCEINLINE VFloat vmsub(VFloat a, VFloat b, VFloat c) {
    return a * b - c;
}

ICBC_FORCEINLINE VFloat vm2sub(VFloat a, VFloat b, VFloat c, VFloat d) {
    // vmlsq_f32(a, b, c) == a - (b * c)
    return vmlsq_f32(a * b, c, d);
}

ICBC_FORCEINLINE VFloat vsaturate(VFloat a) {
    return vminq_f32(vmaxq_f32(a, vzero()), vbroadcast(1));
}

ICBC_FORCEINLINE VFloat vround01(VFloat a) {
    return vrndnq_f32(a);   // Round to integral (to nearest, ties to even)
    //return vcvtq_f32_s32(vcvtq_s32_f32(a + vbroadcast(0.5)));
}

ICBC_FORCEINLINE VFloat vtruncate(VFloat a) {
    return vrndq_f32(a);
    //return vcvtq_f32_s32(vcvtq_s32_f32(a));
}

ICBC_FORCEINLINE VFloat lane_id() {
    ICBC_ALIGN float data[4] = { 0, 1, 2, 3 };
    return vld1q_f32(data);
}

ICBC_FORCEINLINE VMask operator> (VFloat A, VFloat B) { return { vcgtq_f32(A, B) }; }
ICBC_FORCEINLINE VMask operator>=(VFloat A, VFloat B) { return { vcgeq_f32(A, B) }; }
ICBC_FORCEINLINE VMask operator< (VFloat A, VFloat B) { return { vcltq_f32(A, B) }; }
ICBC_FORCEINLINE VMask operator<=(VFloat A, VFloat B) { return { vcleq_f32(A, B) }; }

ICBC_FORCEINLINE VMask operator| (VMask A, VMask B) { return { vorrq_u32(A, B) }; }
ICBC_FORCEINLINE VMask operator& (VMask A, VMask B) { return { vandq_u32(A, B) }; }
ICBC_FORCEINLINE VMask operator^ (VMask A, VMask B) { return { veorq_u32(A, B) }; }
// mask ? b : a
ICBC_FORCEINLINE VFloat vselect(VMask mask, VFloat a, VFloat b) {
    return vbslq_f32(mask, b, a);
}

ICBC_FORCEINLINE bool all(VMask mask) {
    return vminvq_u32(mask) != 0;
    // uint32x2_t m2 = vpmin_u32(vget_low_u32(mask), vget_high_u32(mask));
    // uint32x2_t m1 = vpmin_u32(m2, m2);
    // return vget_lane_u32(m1, 0) != 0;
}

ICBC_FORCEINLINE bool any(VMask mask) {
    return vmaxvq_u32(mask) != 0;
    // uint32x2_t m2 = vpmax_u32(vget_low_u32(mask), vget_high_u32(mask));
    // uint32x2_t m1 = vpmax_u32(m2, m2);
    // return vget_lane_u32(m1, 0) != 0;
}

ICBC_FORCEINLINE uint mask(VMask b) {
    const uint32x4_t movemask = { 1, 2, 4, 8 };
    return vaddvq_u32(vandq_u32(b, movemask));
}

ICBC_FORCEINLINE int reduce_min_index(VFloat v) {
    // float32x2_t m2 = vpmin_f32(vget_low_u32(v), vget_high_u32(v));
    // float32x2_t m1 = vpmin_f32(m2, m2);
    // VFloat vmin = vdupq_lane_f32(m1, 0);
    VFloat vmin = vdupq_n_f32(vminvq_f32(v));
    return ctz(mask(v <= vmin));
}

// https://github.com/Maratyszcza/NNPACK/blob/master/src/neon/transpose.h
ICBC_FORCEINLINE void vtranspose4(VFloat & a, VFloat & b, VFloat & c, VFloat & d) {
    // row0 = ( x00 x01 x02 x03 )
    // row1 = ( x10 x11 x12 x13 )
    // row2 = ( x20 x21 x22 x23 )
    // row3 = ( x30 x31 x32 x33 )

    // row01 = ( x00 x10 x02 x12 ), ( x01 x11 x03, x13 )
    // row23 = ( x20 x30 x22 x32 ), ( x21 x31 x23, x33 )
    float32x4x2_t row01 = vtrnq_f32(a, b);
    float32x4x2_t row23 = vtrnq_f32(c, d);

    // row0 = ( x00 x10 x20 x30 )
    // row1 = ( x01 x11 x21 x31 )
    // row2 = ( x02 x12 x22 x32 )
    // row3 = ( x03 x13 x23 x33 )
    a = vcombine_f32(vget_low_f32(row01.val[0]), vget_low_f32(row23.val[0]));
    b = vcombine_f32(vget_low_f32(row01.val[1]), vget_low_f32(row23.val[1]));
    c = vcombine_f32(vget_high_f32(row01.val[0]), vget_high_f32(row23.val[0]));
    d = vcombine_f32(vget_high_f32(row01.val[1]), vget_high_f32(row23.val[1]));
}

#elif ICBC_SIMD == ICBC_VMX

constexpr int VEC_SIZE = 4;

union VFloat {
    vector float v;
    float e[4];
    VFloat() {}
    VFloat(vector float v) : v(v) {}
    operator vector float & () { return v; }
};
struct VMask {
    vector unsigned int v;
    VMask() {}
    VMask(vector unsigned int v) : v(v) {}
    operator vector unsigned int & () { return v; }
};

ICBC_FORCEINLINE float & lane(VFloat & v, int i) {
    return v.e[i];
}

ICBC_FORCEINLINE VFloat vzero() {
    return vec_splats(0.0f);
}

ICBC_FORCEINLINE VFloat vbroadcast(float a) {
    return vec_splats(a);
}

ICBC_FORCEINLINE VFloat vload(const float * ptr) {
    return vec_ld(0, ptr);
}

ICBC_FORCEINLINE VFloat operator+(VFloat a, VFloat b) {
    return vec_add(a, b);
}

ICBC_FORCEINLINE VFloat operator-(VFloat a, VFloat b) {
    return vec_sub(a, b);
}

ICBC_FORCEINLINE VFloat operator*(VFloat a, VFloat b) {
    return vec_madd(a, b, vec_splats(-0.0f));
}

ICBC_FORCEINLINE VFloat vrcp(VFloat a) {
#if ICBC_USE_RCP
    // get the reciprocal estimate
    vector float estimate = vec_re(a.v);

    // one round of Newton-Raphson refinement
    vector float diff = vec_nmsub(estimate, a.v, vec_splats(1.0f));
    return vec_madd(diff, estimate, estimate);
#else
    return vec_div(vec_splats(1.0f), a);
#endif
}

// a*b+c
ICBC_FORCEINLINE VFloat vmadd(VFloat a, VFloat b, VFloat c) {
    return vec_madd(a, b, c);
}

ICBC_FORCEINLINE VFloat vmsub(VFloat a, VFloat b, VFloat c) {
    return vec_msub(a, b, c);   // @@ Is this right?
}

ICBC_FORCEINLINE VFloat vm2sub(VFloat a, VFloat b, VFloat c, VFloat d) {
    return vmsub(a, b, c * d);
}

ICBC_FORCEINLINE VFloat vsaturate(VFloat a) {
    return vec_min(vec_max(a, vzero()), vbroadcast(1));
}

ICBC_FORCEINLINE VFloat vround01(VFloat a) {
    // @@ Assumes a is positive and ~small
    return vec_trunc(a + vbroadcast(0.5));
}

ICBC_FORCEINLINE VFloat vtruncate(VFloat a) {
    return vec_trunc(a);
}

ICBC_FORCEINLINE VFloat lane_id() {
    return (VFloat){ 0, 1, 2, 3 };
}

ICBC_FORCEINLINE VMask operator> (VFloat A, VFloat B) { return { vec_cmpgt(A, B) }; }
ICBC_FORCEINLINE VMask operator>=(VFloat A, VFloat B) { return { vec_cmpge(A, B) }; }
ICBC_FORCEINLINE VMask operator< (VFloat A, VFloat B) { return { vec_cmplt(A, B) }; }
ICBC_FORCEINLINE VMask operator<=(VFloat A, VFloat B) { return { vec_cmple(A, B) }; }

ICBC_FORCEINLINE VMask operator| (VMask A, VMask B) { return { vec_or(A, B) }; }
ICBC_FORCEINLINE VMask operator& (VMask A, VMask B) { return { vec_and(A, B) }; }
ICBC_FORCEINLINE VMask operator^ (VMask A, VMask B) { return { vec_xor(A, B) }; }

// mask ? b : a
ICBC_FORCEINLINE VFloat vselect(VMask mask, VFloat a, VFloat b) {
    return vec_sel(a, b, mask);
}

ICBC_FORCEINLINE int reduce_min_index(VFloat v) {

    //VFloat vmin = //@@ Horizontal min?
    //return vec_cmpeq_idx(v, vmin);

    // @@ Is there a better way to do this reduction?
    int min_idx = 0;
    float min_value = lane(v, 0);

    for (int i = 1; i < VEC_SIZE; i++) {
        float value = lane(v, i);
        if (value < min_value) {
            min_value = value;
            min_idx = i;
        }
    }

    return min_idx;
}

ICBC_FORCEINLINE void vtranspose4(VFloat & a, VFloat & b, VFloat & c, VFloat & d) {
    VFloat t1 = vec_mergeh(a, c);
    VFloat t2 = vec_mergel(a, c);
    VFloat t3 = vec_mergeh(b, d);
    VFloat t4 = vec_mergel(b, d);
    a = vec_mergeh(t1, t3);
    b = vec_mergel(t1, t3);
    c = vec_mergeh(t2, t4);
    d = vec_mergel(t2, t4);
}

#endif // ICBC_SIMD == *

#if ICBC_SIMD != ICBC_SCALAR
ICBC_FORCEINLINE VFloat vmadd(VFloat a, float b, VFloat c) {
    VFloat vb = vbroadcast(b);
    return vmadd(a, vb, c);
}
#endif

struct VVector3 {
    VFloat x;
    VFloat y;
    VFloat z;
};

ICBC_FORCEINLINE VVector3 vbroadcast(Vector3 v) {
    VVector3 v8;
    v8.x = vbroadcast(v.x);
    v8.y = vbroadcast(v.y);
    v8.z = vbroadcast(v.z);
    return v8;
}

ICBC_FORCEINLINE VVector3 vbroadcast(float x, float y, float z) {
    VVector3 v8;
    v8.x = vbroadcast(x);
    v8.y = vbroadcast(y);
    v8.z = vbroadcast(z);
    return v8;
}

/*ICBC_FORCEINLINE VVector3 vload(const Vector3 * v) {
    // @@ See this for a 8x3 transpose: https://software.intel.com/content/www/us/en/develop/articles/3d-vector-normalization-using-256-bit-intel-advanced-vector-extensions-intel-avx.html
}*/
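// vload(const Vector4 *) below performs an AoS to SoA load: it reads VEC_SIZE
// consecutive Vector4 values and returns their x, y and z components gathered into
// separate SIMD registers, discarding w. For example (illustrated for VEC_SIZE = 4),
// the result is v.x = { p[0].x, p[1].x, p[2].x, p[3].x }, and likewise for y and z.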
ICBC_FORCEINLINE VVector3 vload(const Vector4 * ptr) {
#if ICBC_SIMD == ICBC_AVX512

    // @@ AVX512 transpose not implemented.
    __m512i vindex = _mm512_set_epi32(4 * 15, 4 * 14, 4 * 13, 4 * 12, 4 * 11, 4 * 10, 4 * 9, 4 * 8, 4 * 7, 4 * 6, 4 * 5, 4 * 4, 4 * 3, 4 * 2, 4 * 1, 0);

    VVector3 v;
    v.x = _mm512_i32gather_ps(vindex, &ptr->x, 4);
    v.y = _mm512_i32gather_ps(vindex, &ptr->y, 4);
    v.z = _mm512_i32gather_ps(vindex, &ptr->z, 4);
    return v;

#else

    VVector3 v;
    v.x = vload(&ptr->x + 0 * VEC_SIZE);
    v.y = vload(&ptr->x + 1 * VEC_SIZE);
    v.z = vload(&ptr->x + 2 * VEC_SIZE);
    VFloat tmp = vload(&ptr->x + 3 * VEC_SIZE);

    vtranspose4(v.x, v.y, v.z, tmp);

    return v;
#endif
}


ICBC_FORCEINLINE VVector3 operator+(VVector3 a, VVector3 b) {
    VVector3 v;
    v.x = (a.x + b.x);
    v.y = (a.y + b.y);
    v.z = (a.z + b.z);
    return v;
}

ICBC_FORCEINLINE VVector3 operator-(VVector3 a, VVector3 b) {
    VVector3 v8;
    v8.x = (a.x - b.x);
    v8.y = (a.y - b.y);
    v8.z = (a.z - b.z);
    return v8;
}

ICBC_FORCEINLINE VVector3 operator*(VVector3 a, VVector3 b) {
    VVector3 v8;
    v8.x = (a.x * b.x);
    v8.y = (a.y * b.y);
    v8.z = (a.z * b.z);
    return v8;
}

ICBC_FORCEINLINE VVector3 operator*(VVector3 a, VFloat b) {
    VVector3 v8;
    v8.x = (a.x * b);
    v8.y = (a.y * b);
    v8.z = (a.z * b);
    return v8;
}

ICBC_FORCEINLINE VVector3 vmadd(VVector3 a, VVector3 b, VVector3 c) {
    VVector3 v8;
    v8.x = vmadd(a.x, b.x, c.x);
    v8.y = vmadd(a.y, b.y, c.y);
    v8.z = vmadd(a.z, b.z, c.z);
    return v8;
}

ICBC_FORCEINLINE VVector3 vmadd(VVector3 a, VFloat b, VVector3 c) {
    VVector3 v8;
    v8.x = vmadd(a.x, b, c.x);
    v8.y = vmadd(a.y, b, c.y);
    v8.z = vmadd(a.z, b, c.z);
    return v8;
}

#if ICBC_SIMD != ICBC_SCALAR
ICBC_FORCEINLINE VVector3 vmadd(VVector3 a, float b, VVector3 c) {
    VVector3 v8;
    VFloat vb = vbroadcast(b);
    v8.x = vmadd(a.x, vb, c.x);
    v8.y = vmadd(a.y, vb, c.y);
    v8.z = vmadd(a.z, vb, c.z);
    return v8;
}
#endif

ICBC_FORCEINLINE VVector3 vmsub(VVector3 a, VFloat b, VFloat c) {
    VVector3 v8;
    v8.x = vmsub(a.x, b, c);
    v8.y = vmsub(a.y, b, c);
    v8.z = vmsub(a.z, b, c);
    return v8;
}

ICBC_FORCEINLINE VVector3 vmsub(VVector3 a, VFloat b, VVector3 c) {
    VVector3 v8;
    v8.x = vmsub(a.x, b, c.x);
    v8.y = vmsub(a.y, b, c.y);
    v8.z = vmsub(a.z, b, c.z);
    return v8;
}

ICBC_FORCEINLINE VVector3 vm2sub(VVector3 a, VFloat b, VVector3 c, VFloat d) {
    VVector3 v;
    v.x = vm2sub(a.x, b, c.x, d);
    v.y = vm2sub(a.y, b, c.y, d);
    v.z = vm2sub(a.z, b, c.z, d);
    return v;
}

ICBC_FORCEINLINE VVector3 vm2sub(VVector3 a, VVector3 b, VVector3 c, VFloat d) {
    VVector3 v;
    v.x = vm2sub(a.x, b.x, c.x, d);
    v.y = vm2sub(a.y, b.y, c.y, d);
    v.z = vm2sub(a.z, b.z, c.z, d);
    return v;
}

ICBC_FORCEINLINE VVector3 vm2sub(VVector3 a, VVector3 b, VVector3 c, VVector3 d) {
    VVector3 v;
    v.x = vm2sub(a.x, b.x, c.x, d.x);
    v.y = vm2sub(a.y, b.y, c.y, d.y);
    v.z = vm2sub(a.z, b.z, c.z, d.z);
    return v;
}

ICBC_FORCEINLINE VFloat vdot(VVector3 a, VVector3 b) {
    VFloat r;
    r = a.x * b.x + a.y * b.y + a.z * b.z;
    return r;
}

// Length squared.
ICBC_FORCEINLINE VFloat vlen2(VVector3 v) {
    return vdot(v, v);
}

// mask ? b : a
ICBC_FORCEINLINE VVector3 vselect(VMask mask, VVector3 a, VVector3 b) {
    VVector3 r;
    r.x = vselect(mask, a.x, b.x);
    r.y = vselect(mask, a.y, b.y);
    r.z = vselect(mask, a.z, b.z);
    return r;
}




///////////////////////////////////////////////////////////////////////////////////////////////////
// Color conversion functions.

static const float midpoints5[32] = {
    0.015686f, 0.047059f, 0.078431f, 0.111765f, 0.145098f, 0.176471f, 0.207843f, 0.241176f, 0.274510f, 0.305882f, 0.337255f, 0.370588f, 0.403922f, 0.435294f, 0.466667f, 0.5f,
    0.533333f, 0.564706f, 0.596078f, 0.629412f, 0.662745f, 0.694118f, 0.725490f, 0.758824f, 0.792157f, 0.823529f, 0.854902f, 0.888235f, 0.921569f, 0.952941f, 0.984314f, FLT_MAX
};

static const float midpoints6[64] = {
    0.007843f, 0.023529f, 0.039216f, 0.054902f, 0.070588f, 0.086275f, 0.101961f, 0.117647f, 0.133333f, 0.149020f, 0.164706f, 0.180392f, 0.196078f, 0.211765f, 0.227451f, 0.245098f,
    0.262745f, 0.278431f, 0.294118f, 0.309804f, 0.325490f, 0.341176f, 0.356863f, 0.372549f, 0.388235f, 0.403922f, 0.419608f, 0.435294f, 0.450980f, 0.466667f, 0.482353f, 0.500000f,
    0.517647f, 0.533333f, 0.549020f, 0.564706f, 0.580392f, 0.596078f, 0.611765f, 0.627451f, 0.643137f, 0.658824f, 0.674510f, 0.690196f, 0.705882f, 0.721569f, 0.737255f, 0.754902f,
    0.772549f, 0.788235f, 0.803922f, 0.819608f, 0.835294f, 0.850980f, 0.866667f, 0.882353f, 0.898039f, 0.913725f, 0.929412f, 0.945098f, 0.960784f, 0.976471f, 0.992157f, FLT_MAX
};

/*void init_tables() {
    for (int i = 0; i < 31; i++) {
        float f0 = float(((i+0) << 3) | ((i+0) >> 2)) / 255.0f;
        float f1 = float(((i+1) << 3) | ((i+1) >> 2)) / 255.0f;
        midpoints5[i] = (f0 + f1) * 0.5;
    }
    midpoints5[31] = FLT_MAX;

    for (int i = 0; i < 63; i++) {
        float f0 = float(((i+0) << 2) | ((i+0) >> 4)) / 255.0f;
        float f1 = float(((i+1) << 2) | ((i+1) >> 4)) / 255.0f;
        midpoints6[i] = (f0 + f1) * 0.5;
    }
    midpoints6[63] = FLT_MAX;
}*/

ICBC_FORCEINLINE VFloat vround5(VFloat x) {
    const VFloat rb_scale = vbroadcast(31.0f);
    const VFloat rb_inv_scale = vbroadcast(1.0f / 31.0f);

#if ICBC_PERFECT_ROUND
    VFloat q = vtruncate(x * rb_scale);
    VFloat mp = vgather(midpoints5, q);
    //return (q + (vbroadcast(1.0f) & (x > mp))) * rb_inv_scale;
    return (q + vselect(x > mp, vzero(), vbroadcast(1))) * rb_inv_scale;
#else
    return vround01(x * rb_scale) * rb_inv_scale;
#endif
}

ICBC_FORCEINLINE VFloat vround6(VFloat x) {
    const VFloat g_scale = vbroadcast(63.0f);
    const VFloat g_inv_scale = vbroadcast(1.0f / 63.0f);

#if ICBC_PERFECT_ROUND
    VFloat q = vtruncate(x * g_scale);
    VFloat mp = vgather(midpoints6, q);
    //return (q + (vbroadcast(1) & (x > mp))) * g_inv_scale;
    return (q + vselect(x > mp, vzero(), vbroadcast(1))) * g_inv_scale;
#else
    return vround01(x * g_scale) * g_inv_scale;
#endif
}

ICBC_FORCEINLINE VVector3 vround_ept(VVector3 v) {
    VVector3 r;
    r.x = vround5(vsaturate(v.x));
    r.y = vround6(vsaturate(v.y));
    r.z = vround5(vsaturate(v.z));
    return r;
}
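// Worked example of the midpoint tables (following the commented-out init_tables()
// above): the 5-bit values 0 and 1 bit-expand to 0/255 and 8/255, so midpoints5[0] is
// (0.0 + 8.0/255.0) / 2 = 0.015686. vector3_to_color16() below first truncates to a
// 5- or 6-bit index and then bumps it by one when the input lies above that midpoint,
// which rounds to the nearest endpoint as seen after 565 bit-expansion.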
static Color16 vector3_to_color16(const Vector3 & v) {

    // Truncate.
    uint r = uint(clamp(v.x * 31.0f, 0.0f, 31.0f));
    uint g = uint(clamp(v.y * 63.0f, 0.0f, 63.0f));
    uint b = uint(clamp(v.z * 31.0f, 0.0f, 31.0f));

    // Round exactly according to 565 bit-expansion.
    r += (v.x > midpoints5[r]);
    g += (v.y > midpoints6[g]);
    b += (v.z > midpoints5[b]);

    Color16 c;
    c.u = uint16((r << 11) | (g << 5) | b);
    return c;
}

static Color32 bitexpand_color16_to_color32(Color16 c16) {
    Color32 c32;
    //c32.b = (c16.b << 3) | (c16.b >> 2);
    //c32.g = (c16.g << 2) | (c16.g >> 4);
    //c32.r = (c16.r << 3) | (c16.r >> 2);
    //c32.a = 0xFF;

    c32.u = ((c16.u << 3) & 0xf8) | ((c16.u << 5) & 0xfc00) | ((c16.u << 8) & 0xf80000);
    c32.u |= (c32.u >> 5) & 0x070007;
    c32.u |= (c32.u >> 6) & 0x000300;
    c32.u |= 0xff000000;

    return c32;
}

inline Vector3 color_to_vector3(Color32 c) {
    return { c.r / 255.0f, c.g / 255.0f, c.b / 255.0f };
}

inline Color32 vector3_to_color32(Vector3 v) {
    Color32 color;
    color.r = uint8(saturate(v.x) * 255 + 0.5f);
    color.g = uint8(saturate(v.y) * 255 + 0.5f);
    color.b = uint8(saturate(v.z) * 255 + 0.5f);
    color.a = 255;
    return color;
}


///////////////////////////////////////////////////////////////////////////////////////////////////
// Input block processing.

inline bool is_black(Vector3 c) {
    // This large threshold seems to improve compression. This is not forcing these texels to be black, just
    // causes them to be ignored during PCA.
    // @@ We may want to adjust this based on the quality level, since this increases the number of blocks that try cluster-3 fitting.
    //return c.x < midpoints5[0] && c.y < midpoints6[0] && c.z < midpoints5[0];
    //return c.x < 1.0f / 32 && c.y < 1.0f / 32 && c.z < 1.0f / 32;
    return c.x < 1.0f / 8 && c.y < 1.0f / 8 && c.z < 1.0f / 8;
}

// Find similar colors and combine them together.
static int reduce_colors(const Vector4 * input_colors, const float * input_weights, int count, float threshold, Vector3 * colors, float * weights, bool * any_black)
{
#if 0
    for (int i = 0; i < 16; i++) {
        colors[i] = input_colors[i].xyz;
        weights[i] = input_weights[i];
    }
    return 16;
#else
    *any_black = false;

    int n = 0;
    for (int i = 0; i < count; i++)
    {
        Vector3 ci = input_colors[i].xyz;
        float wi = input_weights[i];

        if (wi > 0) {
            // Find matching color.
            int j;
            for (j = 0; j < n; j++) {
                if (equal(colors[j], ci, threshold)) {
                    colors[j] = (colors[j] * weights[j] + ci * wi) / (weights[j] + wi);
                    weights[j] += wi;
                    break;
                }
            }

            // No match found. Add new color.
            if (j == n) {
                colors[n] = ci;
                weights[n] = wi;
                n++;
            }

            if (is_black(ci)) {
                *any_black = true;
            }
        }
    }

    ICBC_ASSERT(n <= count);

    return n;
#endif
}

static int skip_blacks(const Vector3 * input_colors, const float * input_weights, int count, Vector3 * colors, float * weights)
{
    int n = 0;
    for (int i = 0; i < count; i++)
    {
        Vector3 ci = input_colors[i];
        float wi = input_weights[i];

        if (is_black(ci)) {
            continue;
        }

        colors[n] = ci;
        weights[n] = wi;
        n += 1;
    }

    return n;
}



///////////////////////////////////////////////////////////////////////////////////////////////////
// PCA

static Vector3 computeCentroid(int n, const Vector3 *__restrict points, const float *__restrict weights)
{
    Vector3 centroid = { 0 };
    float total = 0.0f;

    for (int i = 0; i < n; i++)
    {
        total += weights[i];
        centroid += weights[i] * points[i];
    }
    centroid *= (1.0f / total);

    return centroid;
}

static Vector3 computeCovariance(int n, const Vector3 *__restrict points, const float *__restrict weights, float *__restrict covariance)
{
    // compute the centroid
    Vector3 centroid = computeCentroid(n, points, weights);

    // compute covariance matrix
    for (int i = 0; i < 6; i++)
    {
        covariance[i] = 0.0f;
    }

    for (int i = 0; i < n; i++)
    {
        Vector3 a = (points[i] - centroid);  // @@ I think weight should be squared, but that seems to increase the error slightly.
        Vector3 b = weights[i] * a;

        covariance[0] += a.x * b.x;
        covariance[1] += a.x * b.y;
        covariance[2] += a.x * b.z;
        covariance[3] += a.y * b.y;
        covariance[4] += a.y * b.z;
        covariance[5] += a.z * b.z;
    }

    return centroid;
}
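// The six covariance entries above pack the symmetric 3x3 color covariance matrix:
//
//     | cov[0] cov[1] cov[2] |       xx xy xz
//     | cov[1] cov[3] cov[4] |   =   xy yy yz
//     | cov[2] cov[4] cov[5] |       xz yz zz
//
// estimatePrincipalComponent() below rebuilds the three rows from this packed form,
// and firstEigenVector_PowerMethod() iterates matrix-vector products to approximate
// the dominant eigenvector, i.e. the principal axis of the block's colors.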
// @@ We should be able to do something cheaper...
static Vector3 estimatePrincipalComponent(const float * __restrict matrix)
{
    const Vector3 row0 = { matrix[0], matrix[1], matrix[2] };
    const Vector3 row1 = { matrix[1], matrix[3], matrix[4] };
    const Vector3 row2 = { matrix[2], matrix[4], matrix[5] };

    float r0 = lengthSquared(row0);
    float r1 = lengthSquared(row1);
    float r2 = lengthSquared(row2);

    if (r0 > r1 && r0 > r2) return row0;
    if (r1 > r2) return row1;
    return row2;
}

static inline Vector3 firstEigenVector_PowerMethod(const float *__restrict matrix)
{
    if (matrix[0] == 0 && matrix[3] == 0 && matrix[5] == 0)
    {
        return {0};
    }

    Vector3 v = estimatePrincipalComponent(matrix);

    const int NUM = 6;
    for (int i = 0; i < NUM; i++)
    {
        float x = v.x * matrix[0] + v.y * matrix[1] + v.z * matrix[2];
        float y = v.x * matrix[1] + v.y * matrix[3] + v.z * matrix[4];
        float z = v.x * matrix[2] + v.y * matrix[4] + v.z * matrix[5];

        float norm = max(max(x, y), z);

        v = { x, y, z };
        v *= (1.0f / norm);
    }

    return v;
}

static Vector3 computePrincipalComponent_PowerMethod(int n, const Vector3 *__restrict points, const float *__restrict weights)
{
    float matrix[6];
    computeCovariance(n, points, weights, matrix);

    return firstEigenVector_PowerMethod(matrix);
}


///////////////////////////////////////////////////////////////////////////////////////////////////
// SAT

struct SummedAreaTable {
    ICBC_ALIGN float r[16];
    ICBC_ALIGN float g[16];
    ICBC_ALIGN float b[16];
    ICBC_ALIGN float w[16];
};

int compute_sat(const Vector3 * colors, const float * weights, int count, SummedAreaTable * sat)
{
    // I've tried using a lower quality approximation of the principal direction, but the best fit line seems to produce best results.
    Vector3 principal = computePrincipalComponent_PowerMethod(count, colors, weights);

    // build the list of values
    int order[16];
    float dps[16];
    for (int i = 0; i < count; ++i)
    {
        order[i] = i;
        dps[i] = dot(colors[i], principal);
    }

    // stable sort
    for (int i = 0; i < count; ++i)
    {
        for (int j = i; j > 0 && dps[j] < dps[j - 1]; --j)
        {
            swap(dps[j], dps[j - 1]);
            swap(order[j], order[j - 1]);
        }
    }

#if 0
    u8 byte_order[16*4];
    for (int i = 0; i < count; ++i)
    {
    }

    // @@ Load colors and weights.
    VFloat table_r[16/VEC_SIZE];
    VFloat table_g[16/VEC_SIZE];
    VFloat table_b[16/VEC_SIZE];
    VFloat table_w[16/VEC_SIZE];

    VFloat sat_r[16/VEC_SIZE];
    VFloat sat_g[16/VEC_SIZE];
    VFloat sat_b[16/VEC_SIZE];
    VFloat sat_w[16/VEC_SIZE];

    for (int i = 0; i < 16; i += VEC_SIZE) {
        VFloat w = table_lookup(table_w, order);
        sat_r[i] = table_lookup(table_r, order) * w;
        sat_g[i] = table_lookup(table_g, order) * w;
        sat_b[i] = table_lookup(table_b, order) * w;
        sat_w[i] = w;
    }

    // Somewhat brute force way to compute sum tables. Is it worth it?
    for (int i = 0; i < 16; i += 1) {
        float r = sat_r[i/VEC_SIZE][i%VEC_SIZE];

        if (i > lane_id)
            sat_r[i] += sat_r[lane_id];
    }
#endif

    float w = weights[order[0]];
    sat->r[0] = colors[order[0]].x * w;
    sat->g[0] = colors[order[0]].y * w;
    sat->b[0] = colors[order[0]].z * w;
    sat->w[0] = w;

    for (int i = 1; i < count; i++) {
        w = weights[order[i]];
        sat->r[i] = sat->r[i - 1] + colors[order[i]].x * w;
        sat->g[i] = sat->g[i - 1] + colors[order[i]].y * w;
        sat->b[i] = sat->b[i - 1] + colors[order[i]].z * w;
        sat->w[i] = sat->w[i - 1] + w;
    }

    for (int i = count; i < 16; i++) {
        sat->r[i] = FLT_MAX;
        sat->g[i] = FLT_MAX;
        sat->b[i] = FLT_MAX;
        sat->w[i] = FLT_MAX;
    }

    return count;
}
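// How the cluster fit below uses this table: the colors are sorted along the principal
// axis, and sat->r/g/b/w hold inclusive prefix sums of the weighted colors in that
// order. A candidate clustering then reduces to a pair (or triple) of split points, and
// the weighted sum of any contiguous cluster can be read off by differencing two table
// entries instead of re-summing the colors. For example (illustrative notation), the
// cluster covering sorted colors [c0, c1) has weight sum
//
//     sum_w = (c0 > 0) ? sat->w[c1 - 1] - sat->w[c0 - 1] : sat->w[c1 - 1]
//
// and likewise for the r, g and b accumulators.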
1907 for (int i = 0; i < 16; i += 1) { 1908 float r = sat_r[i/VEC_SIZE][i%VEC_SIZE]; 1909 1910 if (i > lane_id) 1911 sat_r[i] += sat_r[lane_id]; 1912 } 1913 #endif 1914 1915 float w = weights[order[0]]; 1916 sat->r[0] = colors[order[0]].x * w; 1917 sat->g[0] = colors[order[0]].y * w; 1918 sat->b[0] = colors[order[0]].z * w; 1919 sat->w[0] = w; 1920 1921 for (int i = 1; i < count; i++) { 1922 w = weights[order[i]]; 1923 sat->r[i] = sat->r[i - 1] + colors[order[i]].x * w; 1924 sat->g[i] = sat->g[i - 1] + colors[order[i]].y * w; 1925 sat->b[i] = sat->b[i - 1] + colors[order[i]].z * w; 1926 sat->w[i] = sat->w[i - 1] + w; 1927 } 1928 1929 for (int i = count; i < 16; i++) { 1930 sat->r[i] = FLT_MAX; 1931 sat->g[i] = FLT_MAX; 1932 sat->b[i] = FLT_MAX; 1933 sat->w[i] = FLT_MAX; 1934 } 1935 1936 return count; 1937 } 1938 1939 1940 /////////////////////////////////////////////////////////////////////////////////////////////////// 1941 // Cluster Fit 1942 1943 struct Combinations { 1944 uint8 c0, c1, c2, pad; 1945 }; 1946 1947 static ICBC_ALIGN int s_fourClusterTotal[16]; 1948 static ICBC_ALIGN int s_threeClusterTotal[16]; 1949 static ICBC_ALIGN Combinations s_fourCluster[968 + 8]; 1950 static ICBC_ALIGN Combinations s_threeCluster[152 + 8]; 1951 1952 #if ICBC_USE_NEON_VTL || ICBC_USE_SSSE3_SHUFFLEB 1953 static uint8 s_byte_vtl_index0_4[4 * 968]; 1954 static uint8 s_byte_vtl_index1_4[4 * 968]; 1955 static uint8 s_byte_vtl_index2_4[4 * 968]; 1956 1957 static uint8 s_byte_vtl_index0_3[4 * 152]; 1958 static uint8 s_byte_vtl_index1_3[4 * 152]; 1959 #endif 1960 1961 static float w1_3, w2_3; 1962 static float w1_9, w4_9, w2_9; 1963 1964 1965 static void init_cluster_tables() { 1966 1967 for (int t = 1, i = 0; t <= 16; t++) { 1968 for (int c0 = 0; c0 <= t; c0++) { 1969 for (int c1 = 0; c1 <= t - c0; c1++) { 1970 for (int c2 = 0; c2 <= t - c0 - c1; c2++) { 1971 1972 // Skip this cluster so that the total is a multiple of 8 1973 if (c0 == 0 && c1 == 0 && c2 == 0) continue; 1974 1975 bool found = false; 1976 if (t > 1) { 1977 for (int j = 0; j < s_fourClusterTotal[t-2]; j++) { 1978 if (s_fourCluster[j].c0 == c0 && s_fourCluster[j].c1 == c0+c1 && s_fourCluster[j].c2 == c0+c1+c2) { 1979 found = true; 1980 break; 1981 } 1982 } 1983 } 1984 1985 if (!found) { 1986 s_fourCluster[i].c0 = uint8(c0); 1987 s_fourCluster[i].c1 = uint8(c0+c1); 1988 s_fourCluster[i].c2 = uint8(c0+c1+c2); 1989 i++; 1990 } 1991 } 1992 } 1993 } 1994 1995 s_fourClusterTotal[t - 1] = i; 1996 } 1997 1998 // Replicate last entry. 1999 for (int i = 0; i < 8; i++) { 2000 s_fourCluster[968 + i] = s_fourCluster[968-1]; 2001 } 2002 2003 for (int t = 1, i = 0; t <= 16; t++) { 2004 for (int c0 = 0; c0 <= t; c0++) { 2005 for (int c1 = 0; c1 <= t - c0; c1++) { 2006 2007 // Skip this cluster so that the total is a multiple of 8 2008 if (c0 == 0 && c1 == 0) continue; 2009 2010 bool found = false; 2011 if (t > 1) { 2012 for (int j = 0; j < s_threeClusterTotal[t - 2]; j++) { 2013 if (s_threeCluster[j].c0 == c0 && s_threeCluster[j].c1 == c0+c1) { 2014 found = true; 2015 break; 2016 } 2017 } 2018 } 2019 2020 if (!found) { 2021 s_threeCluster[i].c0 = uint8(c0); 2022 s_threeCluster[i].c1 = uint8(c0 + c1); 2023 i++; 2024 } 2025 } 2026 } 2027 2028 s_threeClusterTotal[t - 1] = i; 2029 } 2030 2031 // Replicate last entry. 
2032 for (int i = 0; i < 8; i++) { 2033 s_threeCluster[152 + i] = s_threeCluster[152 - 1]; 2034 } 2035 2036 #if ICBC_USE_NEON_VTL || ICBC_USE_SSSE3_SHUFFLEB 2037 for (int i = 0; i < 968; i++) { 2038 int c0 = (s_fourCluster[i].c0) - 1; 2039 s_byte_vtl_index0_4[4 * i + 0] = c0 >= 0 ? uint8(c0 * 4 + 0) : 255; 2040 s_byte_vtl_index0_4[4 * i + 1] = c0 >= 0 ? uint8(c0 * 4 + 1) : 255; 2041 s_byte_vtl_index0_4[4 * i + 2] = c0 >= 0 ? uint8(c0 * 4 + 2) : 255; 2042 s_byte_vtl_index0_4[4 * i + 3] = c0 >= 0 ? uint8(c0 * 4 + 3) : 255; 2043 2044 int c1 = (s_fourCluster[i].c1) - 1; 2045 s_byte_vtl_index1_4[4 * i + 0] = c1 >= 0 ? uint8(c1 * 4 + 0) : 255; 2046 s_byte_vtl_index1_4[4 * i + 1] = c1 >= 0 ? uint8(c1 * 4 + 1) : 255; 2047 s_byte_vtl_index1_4[4 * i + 2] = c1 >= 0 ? uint8(c1 * 4 + 2) : 255; 2048 s_byte_vtl_index1_4[4 * i + 3] = c1 >= 0 ? uint8(c1 * 4 + 3) : 255; 2049 2050 int c2 = (s_fourCluster[i].c2) - 1; 2051 s_byte_vtl_index2_4[4 * i + 0] = c2 >= 0 ? uint8(c2 * 4 + 0) : 255; 2052 s_byte_vtl_index2_4[4 * i + 1] = c2 >= 0 ? uint8(c2 * 4 + 1) : 255; 2053 s_byte_vtl_index2_4[4 * i + 2] = c2 >= 0 ? uint8(c2 * 4 + 2) : 255; 2054 s_byte_vtl_index2_4[4 * i + 3] = c2 >= 0 ? uint8(c2 * 4 + 3) : 255; 2055 } 2056 2057 for (int i = 0; i < 152; i++) { 2058 int c0 = (s_threeCluster[i].c0) - 1; 2059 s_byte_vtl_index0_3[4 * i + 0] = c0 >= 0 ? uint8(c0 * 4 + 0) : 255; 2060 s_byte_vtl_index0_3[4 * i + 1] = c0 >= 0 ? uint8(c0 * 4 + 1) : 255; 2061 s_byte_vtl_index0_3[4 * i + 2] = c0 >= 0 ? uint8(c0 * 4 + 2) : 255; 2062 s_byte_vtl_index0_3[4 * i + 3] = c0 >= 0 ? uint8(c0 * 4 + 3) : 255; 2063 2064 int c1 = (s_threeCluster[i].c1) - 1; 2065 s_byte_vtl_index1_3[4 * i + 0] = c1 >= 0 ? uint8(c1 * 4 + 0) : 255; 2066 s_byte_vtl_index1_3[4 * i + 1] = c1 >= 0 ? uint8(c1 * 4 + 1) : 255; 2067 s_byte_vtl_index1_3[4 * i + 2] = c1 >= 0 ? uint8(c1 * 4 + 2) : 255; 2068 s_byte_vtl_index1_3[4 * i + 3] = c1 >= 0 ? uint8(c1 * 4 + 3) : 255; 2069 } 2070 #endif 2071 } 2072 2073 2074 2075 static void cluster_fit_three(const SummedAreaTable & sat, int count, Vector3 metric_sqr, Vector3 * start, Vector3 * end) 2076 { 2077 const float r_sum = sat.r[count-1]; 2078 const float g_sum = sat.g[count-1]; 2079 const float b_sum = sat.b[count-1]; 2080 const float w_sum = sat.w[count-1]; 2081 2082 VFloat vbesterror = vbroadcast(FLT_MAX); 2083 VVector3 vbeststart = { vzero(), vzero(), vzero() }; 2084 VVector3 vbestend = { vzero(), vzero(), vzero() }; 2085 2086 // check all possible clusters for this total order 2087 const int total_order_count = s_threeClusterTotal[count - 1]; 2088 2089 for (int i = 0; i < total_order_count; i += VEC_SIZE) 2090 { 2091 VVector3 x0, x1; 2092 VFloat w0, w1; 2093 2094 #if ICBC_USE_AVX512_PERMUTE 2095 2096 auto loadmask = lane_id() < vbroadcast(float(count)); 2097 2098 // Load sat in one register: 2099 VFloat vrsat = vload(loadmask, sat.r, FLT_MAX); 2100 VFloat vgsat = vload(loadmask, sat.g, FLT_MAX); 2101 VFloat vbsat = vload(loadmask, sat.b, FLT_MAX); 2102 VFloat vwsat = vload(loadmask, sat.w, FLT_MAX); 2103 2104 // Load 4 uint8 per lane. 
2105 VInt packedClusterIndex = vload((int *)&s_threeCluster[i]); 2106 2107 VInt c0 = (packedClusterIndex & 0xFF) - 1; 2108 VInt c1 = (packedClusterIndex >> 8) - 1; 2109 2110 x0.x = vpermuteif(c0 >= 0, vrsat, c0); 2111 x0.y = vpermuteif(c0 >= 0, vgsat, c0); 2112 x0.z = vpermuteif(c0 >= 0, vbsat, c0); 2113 w0 = vpermuteif(c0 >= 0, vwsat, c0); 2114 2115 x1.x = vpermuteif(c1 >= 0, vrsat, c1); 2116 x1.y = vpermuteif(c1 >= 0, vgsat, c1); 2117 x1.z = vpermuteif(c1 >= 0, vbsat, c1); 2118 w1 = vpermuteif(c1 >= 0, vwsat, c1); 2119 2120 #elif ICBC_USE_AVX2_PERMUTE2 2121 2122 // Load 4 uint8 per lane. @@ Ideally I should pack this better and load only 2. 2123 VInt packedClusterIndex = vload((int *)&s_threeCluster[i]); 2124 2125 VInt c0 = (packedClusterIndex & 0xFF) - 1; 2126 VInt c1 = (packedClusterIndex >> 8) - 1; // No need for & 0xFF 2127 2128 if (count <= 8) { 2129 // Load sat.r in one register: 2130 VFloat rLo = vload(sat.r); 2131 VFloat gLo = vload(sat.g); 2132 VFloat bLo = vload(sat.b); 2133 VFloat wLo = vload(sat.w); 2134 2135 x0.x = vpermuteif(c0 >= 0, rLo, c0); 2136 x0.y = vpermuteif(c0 >= 0, gLo, c0); 2137 x0.z = vpermuteif(c0 >= 0, bLo, c0); 2138 w0 = vpermuteif(c0 >= 0, wLo, c0); 2139 2140 x1.x = vpermuteif(c1 >= 0, rLo, c1); 2141 x1.y = vpermuteif(c1 >= 0, gLo, c1); 2142 x1.z = vpermuteif(c1 >= 0, bLo, c1); 2143 w1 = vpermuteif(c1 >= 0, wLo, c1); 2144 } 2145 else { 2146 // Load sat.r in two registers: 2147 VFloat rLo = vload(sat.r); VFloat rHi = vload(sat.r + 8); 2148 VFloat gLo = vload(sat.g); VFloat gHi = vload(sat.g + 8); 2149 VFloat bLo = vload(sat.b); VFloat bHi = vload(sat.b + 8); 2150 VFloat wLo = vload(sat.w); VFloat wHi = vload(sat.w + 8); 2151 2152 x0.x = vpermute2if(c0 >= 0, rLo, rHi, c0); 2153 x0.y = vpermute2if(c0 >= 0, gLo, gHi, c0); 2154 x0.z = vpermute2if(c0 >= 0, bLo, bHi, c0); 2155 w0 = vpermute2if(c0 >= 0, wLo, wHi, c0); 2156 2157 x1.x = vpermute2if(c1 >= 0, rLo, rHi, c1); 2158 x1.y = vpermute2if(c1 >= 0, gLo, gHi, c1); 2159 x1.z = vpermute2if(c1 >= 0, bLo, bHi, c1); 2160 w1 = vpermute2if(c1 >= 0, wLo, wHi, c1); 2161 } 2162 2163 #elif ICBC_USE_SSSE3_SHUFFLEB 2164 2165 if (count <= 8) { 2166 VFloat rsat[2] = { vload(sat.r), vload(sat.r+4) }; 2167 VFloat gsat[2] = { vload(sat.g), vload(sat.g+4) }; 2168 VFloat bsat[2] = { vload(sat.b), vload(sat.b+4) }; 2169 VFloat wsat[2] = { vload(sat.w), vload(sat.w+4) }; 2170 2171 __m128i idx0 = (__m128i &)s_byte_vtl_index0_3[4*i]; 2172 x0.x = vpermute2(rsat[0], rsat[1], idx0); 2173 x0.y = vpermute2(gsat[0], gsat[1], idx0); 2174 x0.z = vpermute2(bsat[0], bsat[1], idx0); 2175 w0 = vpermute2(wsat[0], wsat[1], idx0); 2176 2177 __m128i idx1 = (__m128i &)s_byte_vtl_index1_3[4*i]; 2178 x1.x = vpermute2(rsat[0], rsat[1], idx1); 2179 x1.y = vpermute2(gsat[0], gsat[1], idx1); 2180 x1.z = vpermute2(bsat[0], bsat[1], idx1); 2181 w1 = vpermute2(wsat[0], wsat[1], idx1); 2182 } 2183 else { 2184 VFloat rsat[4] = { vload(sat.r), vload(sat.r+4), vload(sat.r+8), vload(sat.r+12) }; 2185 VFloat gsat[4] = { vload(sat.g), vload(sat.g+4), vload(sat.g+8), vload(sat.g+12) }; 2186 VFloat bsat[4] = { vload(sat.b), vload(sat.b+4), vload(sat.b+8), vload(sat.b+12) }; 2187 VFloat wsat[4] = { vload(sat.w), vload(sat.w+4), vload(sat.w+8), vload(sat.w+12) }; 2188 2189 __m128i idx0 = (__m128i &)s_byte_vtl_index0_3[4*i]; 2190 x0.x = vpermute4(rsat[0], rsat[1], rsat[2], rsat[3], idx0); 2191 x0.y = vpermute4(gsat[0], gsat[1], gsat[2], gsat[3], idx0); 2192 x0.z = vpermute4(bsat[0], bsat[1], bsat[2], bsat[3], idx0); 2193 w0 = vpermute4(wsat[0], wsat[1], wsat[2], wsat[3], 
idx0); 2194 2195 __m128i idx1 = (__m128i &)s_byte_vtl_index1_3[4*i]; 2196 x1.x = vpermute4(rsat[0], rsat[1], rsat[2], rsat[3], idx1); 2197 x1.y = vpermute4(gsat[0], gsat[1], gsat[2], gsat[3], idx1); 2198 x1.z = vpermute4(bsat[0], bsat[1], bsat[2], bsat[3], idx1); 2199 w1 = vpermute4(wsat[0], wsat[1], wsat[2], wsat[3], idx1); 2200 } 2201 2202 #elif ICBC_USE_NEON_VTL 2203 2204 uint8x16_t idx0 = (uint8x16_t &)s_byte_vtl_index0_3[4*i]; 2205 uint8x16_t idx1 = (uint8x16_t &)s_byte_vtl_index1_3[4*i]; 2206 2207 /*if (count <= 4) { 2208 uint8x16_t rsat1 = (uint8x16_t&)sat.r; 2209 uint8x16_t gsat1 = (uint8x16_t&)sat.g; 2210 uint8x16_t bsat1 = (uint8x16_t&)sat.b; 2211 uint8x16_t wsat1 = (uint8x16_t&)sat.w; 2212 2213 x0.x = vreinterpretq_f32_u8(vqtbl1q_u8(rsat1, idx0)); 2214 x0.y = vreinterpretq_f32_u8(vqtbl1q_u8(gsat1, idx0)); 2215 x0.z = vreinterpretq_f32_u8(vqtbl1q_u8(bsat1, idx0)); 2216 w0 = vreinterpretq_f32_u8(vqtbl1q_u8(wsat1, idx0)); 2217 2218 x1.x = vreinterpretq_f32_u8(vqtbl1q_u8(rsat1, idx1)); 2219 x1.y = vreinterpretq_f32_u8(vqtbl1q_u8(gsat1, idx1)); 2220 x1.z = vreinterpretq_f32_u8(vqtbl1q_u8(bsat1, idx1)); 2221 w1 = vreinterpretq_f32_u8(vqtbl1q_u8(wsat1, idx1)); 2222 } 2223 else*/ 2224 if (count <= 8) { 2225 uint8x16x2_t rsat2 = (uint8x16x2_t&)sat.r; 2226 uint8x16x2_t gsat2 = (uint8x16x2_t&)sat.g; 2227 uint8x16x2_t bsat2 = (uint8x16x2_t&)sat.b; 2228 uint8x16x2_t wsat2 = (uint8x16x2_t&)sat.w; 2229 2230 x0.x = vreinterpretq_f32_u8(vqtbl2q_u8(rsat2, idx0)); 2231 x0.y = vreinterpretq_f32_u8(vqtbl2q_u8(gsat2, idx0)); 2232 x0.z = vreinterpretq_f32_u8(vqtbl2q_u8(bsat2, idx0)); 2233 w0 = vreinterpretq_f32_u8(vqtbl2q_u8(wsat2, idx0)); 2234 2235 x1.x = vreinterpretq_f32_u8(vqtbl2q_u8(rsat2, idx1)); 2236 x1.y = vreinterpretq_f32_u8(vqtbl2q_u8(gsat2, idx1)); 2237 x1.z = vreinterpretq_f32_u8(vqtbl2q_u8(bsat2, idx1)); 2238 w1 = vreinterpretq_f32_u8(vqtbl2q_u8(wsat2, idx1)); 2239 } 2240 else if (count <= 12) { 2241 uint8x16x3_t rsat3 = (uint8x16x3_t&)sat.r; 2242 uint8x16x3_t gsat3 = (uint8x16x3_t&)sat.g; 2243 uint8x16x3_t bsat3 = (uint8x16x3_t&)sat.b; 2244 uint8x16x3_t wsat3 = (uint8x16x3_t&)sat.w; 2245 2246 x0.x = vreinterpretq_f32_u8(vqtbl3q_u8(rsat3, idx0)); 2247 x0.y = vreinterpretq_f32_u8(vqtbl3q_u8(gsat3, idx0)); 2248 x0.z = vreinterpretq_f32_u8(vqtbl3q_u8(bsat3, idx0)); 2249 w0 = vreinterpretq_f32_u8(vqtbl3q_u8(wsat3, idx0)); 2250 2251 x1.x = vreinterpretq_f32_u8(vqtbl3q_u8(rsat3, idx1)); 2252 x1.y = vreinterpretq_f32_u8(vqtbl3q_u8(gsat3, idx1)); 2253 x1.z = vreinterpretq_f32_u8(vqtbl3q_u8(bsat3, idx1)); 2254 w1 = vreinterpretq_f32_u8(vqtbl3q_u8(wsat3, idx1)); 2255 } 2256 else { 2257 uint8x16x4_t rsat4 = (uint8x16x4_t&)sat.r; 2258 uint8x16x4_t gsat4 = (uint8x16x4_t&)sat.g; 2259 uint8x16x4_t bsat4 = (uint8x16x4_t&)sat.b; 2260 uint8x16x4_t wsat4 = (uint8x16x4_t&)sat.w; 2261 2262 x0.x = vreinterpretq_f32_u8(vqtbl4q_u8(rsat4, idx0)); 2263 x0.y = vreinterpretq_f32_u8(vqtbl4q_u8(gsat4, idx0)); 2264 x0.z = vreinterpretq_f32_u8(vqtbl4q_u8(bsat4, idx0)); 2265 w0 = vreinterpretq_f32_u8(vqtbl4q_u8(wsat4, idx0)); 2266 2267 x1.x = vreinterpretq_f32_u8(vqtbl4q_u8(rsat4, idx1)); 2268 x1.y = vreinterpretq_f32_u8(vqtbl4q_u8(gsat4, idx1)); 2269 x1.z = vreinterpretq_f32_u8(vqtbl4q_u8(bsat4, idx1)); 2270 w1 = vreinterpretq_f32_u8(vqtbl4q_u8(wsat4, idx1)); 2271 } 2272 2273 #else 2274 2275 // Scalar path 2276 x0.x = vzero(); x0.y = vzero(); x0.z = vzero(); w0 = vzero(); 2277 x1.x = vzero(); x1.y = vzero(); x1.z = vzero(); w1 = vzero(); 2278 2279 for (int l = 0; l < VEC_SIZE; l++) { 2280 int c0 = s_threeCluster[i + 
l].c0 - 1; 2281 if (c0 >= 0) { 2282 lane(x0.x, l) = sat.r[c0]; 2283 lane(x0.y, l) = sat.g[c0]; 2284 lane(x0.z, l) = sat.b[c0]; 2285 lane(w0, l) = sat.w[c0]; 2286 } 2287 2288 int c1 = s_threeCluster[i + l].c1 - 1; 2289 if (c1 >= 0) { 2290 lane(x1.x, l) = sat.r[c1]; 2291 lane(x1.y, l) = sat.g[c1]; 2292 lane(x1.z, l) = sat.b[c1]; 2293 lane(w1, l) = sat.w[c1]; 2294 } 2295 } 2296 2297 #endif 2298 2299 VFloat w2 = vbroadcast(w_sum) - w1; 2300 x1 = x1 - x0; 2301 w1 = w1 - w0; 2302 2303 VFloat alphabeta_sum = w1 * vbroadcast(0.25f); 2304 VFloat alpha2_sum = w0 + alphabeta_sum; 2305 VFloat beta2_sum = w2 + alphabeta_sum; 2306 VFloat factor = vrcp(vm2sub(alpha2_sum, beta2_sum, alphabeta_sum, alphabeta_sum)); 2307 2308 VVector3 alphax_sum = x0 + x1 * vbroadcast(0.5f); 2309 VVector3 betax_sum = vbroadcast(r_sum, g_sum, b_sum) - alphax_sum; 2310 2311 VVector3 a = vm2sub(alphax_sum, beta2_sum, betax_sum, alphabeta_sum) * factor; 2312 VVector3 b = vm2sub(betax_sum, alpha2_sum, alphax_sum, alphabeta_sum) * factor; 2313 2314 // snap to the grid 2315 a = vround_ept(a); 2316 b = vround_ept(b); 2317 2318 // compute the error 2319 VVector3 e2 = vm2sub(a, vmsub(b, alphabeta_sum, alphax_sum), b, betax_sum) * vbroadcast(2.0f); 2320 VVector3 e1 = vmadd(a * a, alpha2_sum, vmadd(b * b, beta2_sum, e2)); 2321 2322 // apply the metric to the error term 2323 VFloat error = vdot(e1, vbroadcast(metric_sqr)); 2324 2325 2326 // keep the solution if it wins 2327 auto mask = (error < vbesterror); 2328 2329 // I could mask the unused lanes here, but instead I set the invalid SAT entries to FLT_MAX. 2330 //mask = (mask & (vbroadcast(total_order_count) >= tid8(i))); // This doesn't seem to help. Is it OK to consider elements out of bounds? 2331 2332 vbesterror = vselect(mask, vbesterror, error); 2333 vbeststart = vselect(mask, vbeststart, a); 2334 vbestend = vselect(mask, vbestend, b); 2335 } 2336 2337 int bestindex = reduce_min_index(vbesterror); 2338 2339 start->x = lane(vbeststart.x, bestindex); 2340 start->y = lane(vbeststart.y, bestindex); 2341 start->z = lane(vbeststart.z, bestindex); 2342 end->x = lane(vbestend.x, bestindex); 2343 end->y = lane(vbestend.y, bestindex); 2344 end->z = lane(vbestend.z, bestindex); 2345 } 2346 2347 2348 static void cluster_fit_four(const SummedAreaTable & sat, int count, Vector3 metric_sqr, Vector3 * start, Vector3 * end) 2349 { 2350 const float r_sum = sat.r[count-1]; 2351 const float g_sum = sat.g[count-1]; 2352 const float b_sum = sat.b[count-1]; 2353 const float w_sum = sat.w[count-1]; 2354 2355 VFloat vbesterror = vbroadcast(FLT_MAX); 2356 VVector3 vbeststart = { vzero(), vzero(), vzero() }; 2357 VVector3 vbestend = { vzero(), vzero(), vzero() }; 2358 2359 // check all possible clusters for this total order 2360 const int total_order_count = s_fourClusterTotal[count - 1]; 2361 2362 for (int i = 0; i < total_order_count; i += VEC_SIZE) 2363 { 2364 VVector3 x0, x1, x2; 2365 VFloat w0, w1, w2; 2366 2367 /* 2368 // Another approach would be to load and broadcast one color at a time like I do in my old CUDA implementation. 2369 uint akku = 0; 2370 2371 // Compute alpha & beta for this permutation. 
2372 #pragma unroll 2373 for (int i = 0; i < 16; i++) 2374 { 2375 const uint bits = permutation >> (2*i); 2376 2377 alphax_sum += alphaTable4[bits & 3] * colors[i]; 2378 akku += prods4[bits & 3]; 2379 } 2380 2381 float alpha2_sum = float(akku >> 16); 2382 float beta2_sum = float((akku >> 8) & 0xff); 2383 float alphabeta_sum = float(akku & 0xff); 2384 float3 betax_sum = 9.0f * color_sum - alphax_sum; 2385 */ 2386 2387 #if ICBC_USE_AVX512_PERMUTE 2388 2389 auto loadmask = lane_id() < vbroadcast(float(count)); 2390 2391 // Load sat in one register: 2392 VFloat vrsat = vload(loadmask, sat.r, FLT_MAX); 2393 VFloat vgsat = vload(loadmask, sat.g, FLT_MAX); 2394 VFloat vbsat = vload(loadmask, sat.b, FLT_MAX); 2395 VFloat vwsat = vload(loadmask, sat.w, FLT_MAX); 2396 2397 // Load 4 uint8 per lane. 2398 VInt packedClusterIndex = vload((int *)&s_fourCluster[i]); 2399 2400 VInt c0 = (packedClusterIndex & 0xFF) - 1; 2401 VInt c1 = ((packedClusterIndex >> 8) & 0xFF) - 1; 2402 VInt c2 = ((packedClusterIndex >> 16)) - 1; // No need for & 2403 2404 x0.x = vpermuteif(c0 >= 0, vrsat, c0); 2405 x0.y = vpermuteif(c0 >= 0, vgsat, c0); 2406 x0.z = vpermuteif(c0 >= 0, vbsat, c0); 2407 w0 = vpermuteif(c0 >= 0, vwsat, c0); 2408 2409 x1.x = vpermuteif(c1 >= 0, vrsat, c1); 2410 x1.y = vpermuteif(c1 >= 0, vgsat, c1); 2411 x1.z = vpermuteif(c1 >= 0, vbsat, c1); 2412 w1 = vpermuteif(c1 >= 0, vwsat, c1); 2413 2414 x2.x = vpermuteif(c2 >= 0, vrsat, c2); 2415 x2.y = vpermuteif(c2 >= 0, vgsat, c2); 2416 x2.z = vpermuteif(c2 >= 0, vbsat, c2); 2417 w2 = vpermuteif(c2 >= 0, vwsat, c2); 2418 2419 #elif ICBC_USE_AVX2_PERMUTE2 2420 2421 // Load 4 uint8 per lane. 2422 VInt packedClusterIndex = vload((int *)&s_fourCluster[i]); 2423 2424 VInt c0 = (packedClusterIndex & 0xFF) - 1; 2425 VInt c1 = ((packedClusterIndex >> 8) & 0xFF) - 1; 2426 VInt c2 = ((packedClusterIndex >> 16)) - 1; // No need for & 2427 2428 if (count <= 8) { 2429 // Load sat.r in one register: 2430 VFloat rLo = vload(sat.r); 2431 VFloat gLo = vload(sat.g); 2432 VFloat bLo = vload(sat.b); 2433 VFloat wLo = vload(sat.w); 2434 2435 x0.x = vpermuteif(c0 >= 0, rLo, c0); 2436 x0.y = vpermuteif(c0 >= 0, gLo, c0); 2437 x0.z = vpermuteif(c0 >= 0, bLo, c0); 2438 w0 = vpermuteif(c0 >= 0, wLo, c0); 2439 2440 x1.x = vpermuteif(c1 >= 0, rLo, c1); 2441 x1.y = vpermuteif(c1 >= 0, gLo, c1); 2442 x1.z = vpermuteif(c1 >= 0, bLo, c1); 2443 w1 = vpermuteif(c1 >= 0, wLo, c1); 2444 2445 x2.x = vpermuteif(c2 >= 0, rLo, c2); 2446 x2.y = vpermuteif(c2 >= 0, gLo, c2); 2447 x2.z = vpermuteif(c2 >= 0, bLo, c2); 2448 w2 = vpermuteif(c2 >= 0, wLo, c2); 2449 } 2450 else { 2451 // Load sat.r in two registers: 2452 VFloat rLo = vload(sat.r); VFloat rHi = vload(sat.r + 8); 2453 VFloat gLo = vload(sat.g); VFloat gHi = vload(sat.g + 8); 2454 VFloat bLo = vload(sat.b); VFloat bHi = vload(sat.b + 8); 2455 VFloat wLo = vload(sat.w); VFloat wHi = vload(sat.w + 8); 2456 2457 x0.x = vpermute2if(c0 >= 0, rLo, rHi, c0); 2458 x0.y = vpermute2if(c0 >= 0, gLo, gHi, c0); 2459 x0.z = vpermute2if(c0 >= 0, bLo, bHi, c0); 2460 w0 = vpermute2if(c0 >= 0, wLo, wHi, c0); 2461 2462 x1.x = vpermute2if(c1 >= 0, rLo, rHi, c1); 2463 x1.y = vpermute2if(c1 >= 0, gLo, gHi, c1); 2464 x1.z = vpermute2if(c1 >= 0, bLo, bHi, c1); 2465 w1 = vpermute2if(c1 >= 0, wLo, wHi, c1); 2466 2467 x2.x = vpermute2if(c2 >= 0, rLo, rHi, c2); 2468 x2.y = vpermute2if(c2 >= 0, gLo, gHi, c2); 2469 x2.z = vpermute2if(c2 >= 0, bLo, bHi, c2); 2470 w2 = vpermute2if(c2 >= 0, wLo, wHi, c2); 2471 } 2472 2473 #elif ICBC_USE_SSSE3_SHUFFLEB 2474 2475 if (count 
<= 8) { 2476 VFloat rsat[2] = { vload(sat.r), vload(sat.r+4) }; 2477 VFloat gsat[2] = { vload(sat.g), vload(sat.g+4) }; 2478 VFloat bsat[2] = { vload(sat.b), vload(sat.b+4) }; 2479 VFloat wsat[2] = { vload(sat.w), vload(sat.w+4) }; 2480 2481 __m128i idx0 = (__m128i &)s_byte_vtl_index0_4[4*i]; 2482 x0.x = vpermute2(rsat[0], rsat[1], idx0); 2483 x0.y = vpermute2(gsat[0], gsat[1], idx0); 2484 x0.z = vpermute2(bsat[0], bsat[1], idx0); 2485 w0 = vpermute2(wsat[0], wsat[1], idx0); 2486 2487 __m128i idx1 = (__m128i &)s_byte_vtl_index1_4[4*i]; 2488 x1.x = vpermute2(rsat[0], rsat[1], idx1); 2489 x1.y = vpermute2(gsat[0], gsat[1], idx1); 2490 x1.z = vpermute2(bsat[0], bsat[1], idx1); 2491 w1 = vpermute2(wsat[0], wsat[1], idx1); 2492 2493 __m128i idx2 = (__m128i &)s_byte_vtl_index2_4[4*i]; 2494 x2.x = vpermute2(rsat[0], rsat[1], idx2); 2495 x2.y = vpermute2(gsat[0], gsat[1], idx2); 2496 x2.z = vpermute2(bsat[0], bsat[1], idx2); 2497 w2 = vpermute2(wsat[0], wsat[1], idx2); 2498 } 2499 else { 2500 VFloat rsat[4] = { vload(sat.r), vload(sat.r+4), vload(sat.r+8), vload(sat.r+12) }; 2501 VFloat gsat[4] = { vload(sat.g), vload(sat.g+4), vload(sat.g+8), vload(sat.g+12) }; 2502 VFloat bsat[4] = { vload(sat.b), vload(sat.b+4), vload(sat.b+8), vload(sat.b+12) }; 2503 VFloat wsat[4] = { vload(sat.w), vload(sat.w+4), vload(sat.w+8), vload(sat.w+12) }; 2504 2505 __m128i idx0 = (__m128i &)s_byte_vtl_index0_4[4*i]; 2506 x0.x = vpermute4(rsat[0], rsat[1], rsat[2], rsat[3], idx0); 2507 x0.y = vpermute4(gsat[0], gsat[1], gsat[2], gsat[3], idx0); 2508 x0.z = vpermute4(bsat[0], bsat[1], bsat[2], bsat[3], idx0); 2509 w0 = vpermute4(wsat[0], wsat[1], wsat[2], wsat[3], idx0); 2510 2511 __m128i idx1 = (__m128i &)s_byte_vtl_index1_4[4*i]; 2512 x1.x = vpermute4(rsat[0], rsat[1], rsat[2], rsat[3], idx1); 2513 x1.y = vpermute4(gsat[0], gsat[1], gsat[2], gsat[3], idx1); 2514 x1.z = vpermute4(bsat[0], bsat[1], bsat[2], bsat[3], idx1); 2515 w1 = vpermute4(wsat[0], wsat[1], wsat[2], wsat[3], idx1); 2516 2517 __m128i idx2 = (__m128i &)s_byte_vtl_index2_4[4*i]; 2518 x2.x = vpermute4(rsat[0], rsat[1], rsat[2], rsat[3], idx2); 2519 x2.y = vpermute4(gsat[0], gsat[1], gsat[2], gsat[3], idx2); 2520 x2.z = vpermute4(bsat[0], bsat[1], bsat[2], bsat[3], idx2); 2521 w2 = vpermute4(wsat[0], wsat[1], wsat[2], wsat[3], idx2); 2522 } 2523 2524 #elif ICBC_USE_NEON_VTL 2525 2526 /* 2527 // c0 = { pq[12],pq[12],pq[12],pq[12], pq[8],pq[8],pq[8],pq[8], pq[4],pq[4],pq[4],pq[4], pq[0],pq[0],pq[0],pq[0] } 2528 // idx0 = (c0 - 1) * 4 + byte_offset 2529 // idx0 = c0 == 0 ? 255 : idx0 2530 2531 // c1 = { pq[13],pq[13],pq[13],pq[13], pq[9],pq[9],pq[9],pq[9], pq[5],pq[5],pq[5],pq[5], pq[1],pq[1],pq[1],pq[1] } 2532 // idx1 = (c1 - 1) * 4 + byte_offset 2533 // idx1 = c1 == 0 ? 255 : idx1 2534 2535 // c2 = { pq[14],pq[14],pq[14],pq[14], pq[10],pq[10],pq[10],pq[10], pq[6],pq[6],pq[6],pq[6], pq[2],pq[2],pq[2],pq[2] } 2536 // idx2 = (c2 - 1) * 4 + byte_offset 2537 // idx2 = c2 == 0 ? 255 : idx2 2538 2539 uint8x16_t packedClusterIndex = (uint8x16_t &)s_fourCluster[i]; 2540 2541 const uint8x16_t byte_offsets = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3}; 2542 uint8x16_t indices = {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}; 2543 2544 uint8x16_t c0 = vqtbl1q_u8(packedClusterIndex, indices); 2545 uint8x16_t idx0 = vaddq_u8(vshlq_n_u8(vsubq_u8(c0, vdupq_n_u8(1)), 2), byte_offsets); // (c0 - 1) * 4 + byte_offsets 2546 idx0 = vorrq_u8(idx0, vceqzq_u8(c0)); // idx0 = c0 == 0 ? 
255 : idx0 2547 2548 indices = vaddq_u8(indices, vdupq_n_u8(1)); 2549 uint8x16_t c1 = vqtbl1q_u8(packedClusterIndex, indices); 2550 uint8x16_t idx1 = vaddq_u8(vshlq_n_u8(vsubq_u8(c1, vdupq_n_u8(1)), 2), byte_offsets); // (c1 - 1) * 4 + byte_offsets 2551 idx1 = vorrq_u8(idx1, vceqzq_u8(c1)); // idx1 = c1 == 0 ? 255 : idx1 2552 2553 indices = vaddq_u8(indices, vdupq_n_u8(1)); 2554 uint8x16_t c2 = vqtbl1q_u8(packedClusterIndex, indices); 2555 uint8x16_t idx2 = vaddq_u8(vshlq_n_u8(vsubq_u8(c2, vdupq_n_u8(1)), 2), byte_offsets); // (c2 - 1) * 4 + byte_offsets 2556 idx2 = vorrq_u8(idx2, vceqzq_u8(c2)); // idx2 = c2 == 0 ? 255 : bc0 2557 */ 2558 2559 // Loading the precomputed byte indices is faster than computing them on the fly. 2560 uint8x16_t idx0 = (uint8x16_t &)s_byte_vtl_index0_4[4*i]; 2561 uint8x16_t idx1 = (uint8x16_t &)s_byte_vtl_index1_4[4*i]; 2562 uint8x16_t idx2 = (uint8x16_t &)s_byte_vtl_index2_4[4*i]; 2563 2564 /*if (count <= 4) { 2565 uint8x16_t rsat1 = (uint8x16_t&)sat.r; 2566 uint8x16_t gsat1 = (uint8x16_t&)sat.g; 2567 uint8x16_t bsat1 = (uint8x16_t&)sat.b; 2568 uint8x16_t wsat1 = (uint8x16_t&)sat.w; 2569 2570 x0.x = vreinterpretq_f32_u8(vqtbl1q_u8(rsat1, idx0)); 2571 x0.y = vreinterpretq_f32_u8(vqtbl1q_u8(gsat1, idx0)); 2572 x0.z = vreinterpretq_f32_u8(vqtbl1q_u8(bsat1, idx0)); 2573 w0 = vreinterpretq_f32_u8(vqtbl1q_u8(wsat1, idx0)); 2574 2575 x1.x = vreinterpretq_f32_u8(vqtbl1q_u8(rsat1, idx1)); 2576 x1.y = vreinterpretq_f32_u8(vqtbl1q_u8(gsat1, idx1)); 2577 x1.z = vreinterpretq_f32_u8(vqtbl1q_u8(bsat1, idx1)); 2578 w1 = vreinterpretq_f32_u8(vqtbl1q_u8(wsat1, idx1)); 2579 2580 x2.x = vreinterpretq_f32_u8(vqtbl1q_u8(rsat1, idx2)); 2581 x2.y = vreinterpretq_f32_u8(vqtbl1q_u8(gsat1, idx2)); 2582 x2.z = vreinterpretq_f32_u8(vqtbl1q_u8(bsat1, idx2)); 2583 w2 = vreinterpretq_f32_u8(vqtbl1q_u8(wsat1, idx2)); 2584 } 2585 else*/ 2586 if (count <= 8) { 2587 uint8x16x2_t rsat2 = (uint8x16x2_t&)sat.r; 2588 uint8x16x2_t gsat2 = (uint8x16x2_t&)sat.g; 2589 uint8x16x2_t bsat2 = (uint8x16x2_t&)sat.b; 2590 uint8x16x2_t wsat2 = (uint8x16x2_t&)sat.w; 2591 2592 x0.x = vreinterpretq_f32_u8(vqtbl2q_u8(rsat2, idx0)); 2593 x0.y = vreinterpretq_f32_u8(vqtbl2q_u8(gsat2, idx0)); 2594 x0.z = vreinterpretq_f32_u8(vqtbl2q_u8(bsat2, idx0)); 2595 w0 = vreinterpretq_f32_u8(vqtbl2q_u8(wsat2, idx0)); 2596 2597 x1.x = vreinterpretq_f32_u8(vqtbl2q_u8(rsat2, idx1)); 2598 x1.y = vreinterpretq_f32_u8(vqtbl2q_u8(gsat2, idx1)); 2599 x1.z = vreinterpretq_f32_u8(vqtbl2q_u8(bsat2, idx1)); 2600 w1 = vreinterpretq_f32_u8(vqtbl2q_u8(wsat2, idx1)); 2601 2602 x2.x = vreinterpretq_f32_u8(vqtbl2q_u8(rsat2, idx2)); 2603 x2.y = vreinterpretq_f32_u8(vqtbl2q_u8(gsat2, idx2)); 2604 x2.z = vreinterpretq_f32_u8(vqtbl2q_u8(bsat2, idx2)); 2605 w2 = vreinterpretq_f32_u8(vqtbl2q_u8(wsat2, idx2)); 2606 } 2607 else if (count <= 12) { 2608 uint8x16x3_t rsat3 = (uint8x16x3_t&)sat.r; 2609 uint8x16x3_t gsat3 = (uint8x16x3_t&)sat.g; 2610 uint8x16x3_t bsat3 = (uint8x16x3_t&)sat.b; 2611 uint8x16x3_t wsat3 = (uint8x16x3_t&)sat.w; 2612 2613 x0.x = vreinterpretq_f32_u8(vqtbl3q_u8(rsat3, idx0)); 2614 x0.y = vreinterpretq_f32_u8(vqtbl3q_u8(gsat3, idx0)); 2615 x0.z = vreinterpretq_f32_u8(vqtbl3q_u8(bsat3, idx0)); 2616 w0 = vreinterpretq_f32_u8(vqtbl3q_u8(wsat3, idx0)); 2617 2618 x1.x = vreinterpretq_f32_u8(vqtbl3q_u8(rsat3, idx1)); 2619 x1.y = vreinterpretq_f32_u8(vqtbl3q_u8(gsat3, idx1)); 2620 x1.z = vreinterpretq_f32_u8(vqtbl3q_u8(bsat3, idx1)); 2621 w1 = vreinterpretq_f32_u8(vqtbl3q_u8(wsat3, idx1)); 2622 2623 x2.x = 
vreinterpretq_f32_u8(vqtbl3q_u8(rsat3, idx2)); 2624 x2.y = vreinterpretq_f32_u8(vqtbl3q_u8(gsat3, idx2)); 2625 x2.z = vreinterpretq_f32_u8(vqtbl3q_u8(bsat3, idx2)); 2626 w2 = vreinterpretq_f32_u8(vqtbl3q_u8(wsat3, idx2)); 2627 } 2628 else { 2629 uint8x16x4_t rsat4 = (uint8x16x4_t&)sat.r; 2630 uint8x16x4_t gsat4 = (uint8x16x4_t&)sat.g; 2631 uint8x16x4_t bsat4 = (uint8x16x4_t&)sat.b; 2632 uint8x16x4_t wsat4 = (uint8x16x4_t&)sat.w; 2633 2634 x0.x = vreinterpretq_f32_u8(vqtbl4q_u8(rsat4, idx0)); 2635 x0.y = vreinterpretq_f32_u8(vqtbl4q_u8(gsat4, idx0)); 2636 x0.z = vreinterpretq_f32_u8(vqtbl4q_u8(bsat4, idx0)); 2637 w0 = vreinterpretq_f32_u8(vqtbl4q_u8(wsat4, idx0)); 2638 2639 x1.x = vreinterpretq_f32_u8(vqtbl4q_u8(rsat4, idx1)); 2640 x1.y = vreinterpretq_f32_u8(vqtbl4q_u8(gsat4, idx1)); 2641 x1.z = vreinterpretq_f32_u8(vqtbl4q_u8(bsat4, idx1)); 2642 w1 = vreinterpretq_f32_u8(vqtbl4q_u8(wsat4, idx1)); 2643 2644 x2.x = vreinterpretq_f32_u8(vqtbl4q_u8(rsat4, idx2)); 2645 x2.y = vreinterpretq_f32_u8(vqtbl4q_u8(gsat4, idx2)); 2646 x2.z = vreinterpretq_f32_u8(vqtbl4q_u8(bsat4, idx2)); 2647 w2 = vreinterpretq_f32_u8(vqtbl4q_u8(wsat4, idx2)); 2648 } 2649 2650 #else 2651 2652 // Scalar path 2653 x0.x = vzero(); x0.y = vzero(); x0.z = vzero(); w0 = vzero(); 2654 x1.x = vzero(); x1.y = vzero(); x1.z = vzero(); w1 = vzero(); 2655 x2.x = vzero(); x2.y = vzero(); x2.z = vzero(); w2 = vzero(); 2656 2657 for (int l = 0; l < VEC_SIZE; l++) { 2658 int c0 = s_fourCluster[i + l].c0 - 1; 2659 if (c0 >= 0) { 2660 lane(x0.x, l) = sat.r[c0]; 2661 lane(x0.y, l) = sat.g[c0]; 2662 lane(x0.z, l) = sat.b[c0]; 2663 lane(w0, l) = sat.w[c0]; 2664 } 2665 2666 int c1 = s_fourCluster[i + l].c1 - 1; 2667 if (c1 >= 0) { 2668 lane(x1.x, l) = sat.r[c1]; 2669 lane(x1.y, l) = sat.g[c1]; 2670 lane(x1.z, l) = sat.b[c1]; 2671 lane(w1, l) = sat.w[c1]; 2672 } 2673 2674 int c2 = s_fourCluster[i + l].c2 - 1; 2675 if (c2 >= 0) { 2676 lane(x2.x, l) = sat.r[c2]; 2677 lane(x2.y, l) = sat.g[c2]; 2678 lane(x2.z, l) = sat.b[c2]; 2679 lane(w2, l) = sat.w[c2]; 2680 } 2681 } 2682 2683 #endif 2684 2685 VFloat w3 = vbroadcast(w_sum) - w2; 2686 x2 = x2 - x1; 2687 x1 = x1 - x0; 2688 w2 = w2 - w1; 2689 w1 = w1 - w0; 2690 2691 #if 1 2692 VFloat alpha2_sum = vmadd(w2, w1_9, vmadd(w1, w4_9, w0)); 2693 VFloat beta2_sum = vmadd(w1, w1_9, vmadd(w2, w4_9, w3)); 2694 2695 VFloat alphabeta_sum = (w1 + w2) * vbroadcast(w2_9); 2696 VFloat factor = vrcp(vm2sub(alpha2_sum, beta2_sum, alphabeta_sum, alphabeta_sum)); 2697 2698 VVector3 alphax_sum = vmadd(x2, w1_3, vmadd(x1, w2_3, x0)); 2699 #else 2700 VFloat alpha2_sum = vmadd(w2, (1.0f / 9.0f), vmadd(w1, (4.0f / 9.0f), w0)); 2701 VFloat beta2_sum = vmadd(w1, (1.0f / 9.0f), vmadd(w2, (4.0f / 9.0f), w3)); 2702 2703 VFloat alphabeta_sum = (w1 + w2) * vbroadcast(2.0f / 9.0f); 2704 VFloat factor = vrcp(vm2sub(alpha2_sum, beta2_sum, alphabeta_sum, alphabeta_sum)); 2705 2706 VVector3 alphax_sum = vmadd(x2, (1.0f / 3.0f), vmadd(x1, (2.0f / 3.0f), x0)); 2707 #endif 2708 VVector3 betax_sum = vbroadcast(r_sum, g_sum, b_sum) - alphax_sum; 2709 2710 VVector3 a = vm2sub(alphax_sum, beta2_sum, betax_sum, alphabeta_sum) * factor; 2711 VVector3 b = vm2sub(betax_sum, alpha2_sum, alphax_sum, alphabeta_sum) * factor; 2712 2713 // snap to the grid 2714 a = vround_ept(a); 2715 b = vround_ept(b); 2716 2717 // compute the error 2718 VVector3 e2 = vm2sub(a, vmsub(b, alphabeta_sum, alphax_sum), b, betax_sum) * vbroadcast(2.0f); 2719 VVector3 e1 = vmadd(a * a, alpha2_sum, vmadd(b * b, beta2_sum, e2)); 2720 2721 // apply the metric to the error 
term 2722 VFloat error = vdot(e1, vbroadcast(metric_sqr)); 2723 2724 // keep the solution if it wins 2725 auto mask = (error < vbesterror); 2726 2727 // We could mask the unused lanes here, but instead set the invalid SAT entries to FLT_MAX. 2728 //mask = (mask & (vbroadcast(total_order_count) >= tid8(i))); // This doesn't seem to help. Is it OK to consider elements out of bounds? 2729 2730 vbesterror = vselect(mask, vbesterror, error); 2731 vbeststart = vselect(mask, vbeststart, a); 2732 vbestend = vselect(mask, vbestend, b); 2733 } 2734 2735 int bestindex = reduce_min_index(vbesterror); 2736 2737 start->x = lane(vbeststart.x, bestindex); 2738 start->y = lane(vbeststart.y, bestindex); 2739 start->z = lane(vbeststart.z, bestindex); 2740 end->x = lane(vbestend.x, bestindex); 2741 end->y = lane(vbestend.y, bestindex); 2742 end->z = lane(vbestend.z, bestindex); 2743 } 2744 2745 2746 /////////////////////////////////////////////////////////////////////////////////////////////////// 2747 // Palette evaluation. 2748 2749 Decoder s_decoder = Decoder_D3D10; 2750 2751 // D3D10 2752 inline void evaluate_palette4_d3d10(Color32 palette[4]) { 2753 palette[2].r = (2 * palette[0].r + palette[1].r) / 3; 2754 palette[2].g = (2 * palette[0].g + palette[1].g) / 3; 2755 palette[2].b = (2 * palette[0].b + palette[1].b) / 3; 2756 palette[2].a = 0xFF; 2757 2758 palette[3].r = (2 * palette[1].r + palette[0].r) / 3; 2759 palette[3].g = (2 * palette[1].g + palette[0].g) / 3; 2760 palette[3].b = (2 * palette[1].b + palette[0].b) / 3; 2761 palette[3].a = 0xFF; 2762 } 2763 inline void evaluate_palette3_d3d10(Color32 palette[4]) { 2764 palette[2].r = (palette[0].r + palette[1].r) / 2; 2765 palette[2].g = (palette[0].g + palette[1].g) / 2; 2766 palette[2].b = (palette[0].b + palette[1].b) / 2; 2767 palette[2].a = 0xFF; 2768 palette[3].u = 0; 2769 } 2770 static void evaluate_palette_d3d10(Color16 c0, Color16 c1, Color32 palette[4]) { 2771 palette[0] = bitexpand_color16_to_color32(c0); 2772 palette[1] = bitexpand_color16_to_color32(c1); 2773 if (c0.u > c1.u) { 2774 evaluate_palette4_d3d10(palette); 2775 } 2776 else { 2777 evaluate_palette3_d3d10(palette); 2778 } 2779 } 2780 2781 // NV 2782 inline void evaluate_palette4_nv(Color16 c0, Color16 c1, Color32 palette[4]) { 2783 int gdiff = palette[1].g - palette[0].g; 2784 palette[2].r = uint8(((2 * c0.r + c1.r) * 22) / 8); 2785 palette[2].g = uint8((256 * palette[0].g + gdiff / 4 + 128 + gdiff * 80) / 256); 2786 palette[2].b = uint8(((2 * c0.b + c1.b) * 22) / 8); 2787 palette[2].a = 0xFF; 2788 2789 palette[3].r = uint8(((2 * c1.r + c0.r) * 22) / 8); 2790 palette[3].g = uint8((256 * palette[1].g - gdiff / 4 + 128 - gdiff * 80) / 256); 2791 palette[3].b = uint8(((2 * c1.b + c0.b) * 22) / 8); 2792 palette[3].a = 0xFF; 2793 } 2794 inline void evaluate_palette3_nv(Color16 c0, Color16 c1, Color32 palette[4]) { 2795 int gdiff = palette[1].g - palette[0].g; 2796 palette[2].r = uint8(((c0.r + c1.r) * 33) / 8); 2797 palette[2].g = uint8((256 * palette[0].g + gdiff / 4 + 128 + gdiff * 128) / 256); 2798 palette[2].b = uint8(((c0.b + c1.b) * 33) / 8); 2799 palette[2].a = 0xFF; 2800 palette[3].u = 0; 2801 } 2802 static void evaluate_palette_nv(Color16 c0, Color16 c1, Color32 palette[4]) { 2803 palette[0] = bitexpand_color16_to_color32(c0); 2804 palette[1] = bitexpand_color16_to_color32(c1); 2805 2806 if (c0.u > c1.u) { 2807 evaluate_palette4_nv(c0, c1, palette); 2808 } 2809 else { 2810 evaluate_palette3_nv(c0, c1, palette); 2811 } 2812 } 2813 2814 // AMD 2815 inline void 
evaluate_palette4_amd(Color32 palette[4]) { 2816 palette[2].r = uint8((43 * palette[0].r + 21 * palette[1].r + 32) >> 6); 2817 palette[2].g = uint8((43 * palette[0].g + 21 * palette[1].g + 32) >> 6); 2818 palette[2].b = uint8((43 * palette[0].b + 21 * palette[1].b + 32) >> 6); 2819 palette[2].a = 0xFF; 2820 2821 palette[3].r = uint8((43 * palette[1].r + 21 * palette[0].r + 32) >> 6); 2822 palette[3].g = uint8((43 * palette[1].g + 21 * palette[0].g + 32) >> 6); 2823 palette[3].b = uint8((43 * palette[1].b + 21 * palette[0].b + 32) >> 6); 2824 palette[3].a = 0xFF; 2825 } 2826 inline void evaluate_palette3_amd(Color32 palette[4]) { 2827 palette[2].r = uint8((palette[0].r + palette[1].r + 1) / 2); 2828 palette[2].g = uint8((palette[0].g + palette[1].g + 1) / 2); 2829 palette[2].b = uint8((palette[0].b + palette[1].b + 1) / 2); 2830 palette[2].a = 0xFF; 2831 palette[3].u = 0; 2832 } 2833 static void evaluate_palette_amd(Color16 c0, Color16 c1, Color32 palette[4]) { 2834 palette[0] = bitexpand_color16_to_color32(c0); 2835 palette[1] = bitexpand_color16_to_color32(c1); 2836 2837 if (c0.u > c1.u) { 2838 evaluate_palette4_amd(palette); 2839 } 2840 else { 2841 evaluate_palette3_amd(palette); 2842 } 2843 } 2844 2845 // Intel 2846 inline void evaluate_palette4_intel(Color32 palette[4]) { 2847 palette[2].r = uint8((171 * palette[0].r + 85 * palette[1].r + 128) >> 8); 2848 palette[2].g = uint8((171 * palette[0].g + 85 * palette[1].g + 128) >> 8); 2849 palette[2].b = uint8((171 * palette[0].b + 85 * palette[1].b + 128) >> 8); 2850 palette[2].a = 0xFF; 2851 2852 palette[3].r = uint8((171 * palette[1].r + 85 * palette[0].r + 128) >> 8); 2853 palette[3].g = uint8((171 * palette[1].g + 85 * palette[0].g + 128) >> 8); 2854 palette[3].b = uint8((171 * palette[1].b + 85 * palette[0].b + 128) >> 8); 2855 palette[3].a = 0xFF; 2856 } 2857 inline void evaluate_palette3_intel(Color32 palette[4]) { 2858 palette[2].r = uint8((128 * palette[0].r + 128 * palette[1].r + 128) >> 8); 2859 palette[2].g = uint8((128 * palette[0].g + 128 * palette[1].g + 128) >> 8); 2860 palette[2].b = uint8((128 * palette[0].b + 128 * palette[1].b + 128) >> 8); 2861 palette[2].a = 0xFF; 2862 palette[3].u = 0; 2863 } 2864 static void evaluate_palette_intel(Color16 c0, Color16 c1, Color32 palette[4]) { 2865 palette[0] = bitexpand_color16_to_color32(c0); 2866 palette[1] = bitexpand_color16_to_color32(c1); 2867 2868 if (c0.u > c1.u) { 2869 evaluate_palette4_intel(palette); 2870 } 2871 else { 2872 evaluate_palette3_intel(palette); 2873 } 2874 } 2875 2876 inline void evaluate_palette(Color16 c0, Color16 c1, Color32 palette[4], Decoder decoder = s_decoder) { 2877 if (decoder == Decoder_D3D10) evaluate_palette_d3d10(c0, c1, palette); 2878 else if (decoder == Decoder_NVIDIA) evaluate_palette_nv(c0, c1, palette); 2879 else if (decoder == Decoder_AMD) evaluate_palette_amd(c0, c1, palette); 2880 else if (decoder == Decoder_Intel) evaluate_palette_intel(c0, c1, palette); 2881 } 2882 2883 inline void evaluate_palette4(Color16 c0, Color16 c1, Color32 palette[4], Decoder decoder = s_decoder) { 2884 palette[0] = bitexpand_color16_to_color32(c0); 2885 palette[1] = bitexpand_color16_to_color32(c1); 2886 2887 if (decoder == Decoder_D3D10) evaluate_palette4_d3d10(palette); 2888 else if (decoder == Decoder_NVIDIA) evaluate_palette4_nv(c0, c1, palette); 2889 else if (decoder == Decoder_AMD) evaluate_palette4_amd(palette); 2890 else if (decoder == Decoder_Intel) evaluate_palette4_intel(palette); 2891 } 2892 2893 static void evaluate_palette(Color16 c0, Color16 c1, 
Vector3 palette[4]) { 2894 Color32 palette32[4]; 2895 evaluate_palette(c0, c1, palette32); 2896 2897 for (int i = 0; i < 4; i++) { 2898 palette[i] = color_to_vector3(palette32[i]); 2899 } 2900 } 2901 2902 static void evaluate_palette(uint8 alpha0, uint8 alpha1, uint8 alpha_palette[8], Decoder decoder) { 2903 // Ideal (truncating) BC3 alpha interpolation, a minimal completion of the @@ stub; decoder-specific rounding is not modeled and the decoder argument is ignored. 2904 alpha_palette[0] = alpha0; 2905 alpha_palette[1] = alpha1; 2906 if (alpha0 > alpha1) { 2907 for (int i = 1; i < 7; i++) alpha_palette[1 + i] = uint8(((7 - i) * alpha0 + i * alpha1) / 7); 2908 } 2909 else { 2910 for (int i = 1; i < 5; i++) alpha_palette[1 + i] = uint8(((5 - i) * alpha0 + i * alpha1) / 5); alpha_palette[6] = 0; alpha_palette[7] = 255; 2911 } 2912 } 2913 2914 2915 static void decode_bc1(const BlockBC1 * block, unsigned char rgba_block[16 * 4], Decoder decoder) { 2916 Color32 palette[4]; 2917 evaluate_palette(block->col0, block->col1, palette, decoder); 2918 2919 for (int i = 0; i < 16; i++) { 2920 int index = (block->indices >> (2 * i)) & 3; 2921 Color32 c = palette[index]; 2922 rgba_block[4 * i + 0] = c.r; 2923 rgba_block[4 * i + 1] = c.g; 2924 rgba_block[4 * i + 2] = c.b; 2925 rgba_block[4 * i + 3] = c.a; 2926 } 2927 } 2928 2929 static void decode_bc3(const BlockBC3 * block, unsigned char rgba_block[16 * 4], Decoder decoder) { 2930 Color32 rgb_palette[4]; 2931 evaluate_palette4(block->rgb.col0, block->rgb.col1, rgb_palette, decoder); 2932 2933 uint8 alpha_palette[8]; 2934 evaluate_palette(block->alpha.alpha0, block->alpha.alpha1, alpha_palette, decoder); 2935 2936 for (int i = 0; i < 16; i++) { 2937 int rgb_index = (block->rgb.indices >> (2 * i)) & 3; 2938 Color32 c = rgb_palette[rgb_index]; 2939 rgba_block[4 * i + 0] = c.r; 2940 rgba_block[4 * i + 1] = c.g; 2941 rgba_block[4 * i + 2] = c.b; 2942 2943 int alpha_index = 0; // @@ 2944 uint8 alpha = alpha_palette[alpha_index]; 2945 rgba_block[4 * i + 3] = alpha; 2946 } 2947 } 2948 2949 2950 /////////////////////////////////////////////////////////////////////////////////////////////////// 2951 // Error evaluation. 2952 2953 // Different ways of estimating the error. 2954 2955 static float evaluate_mse(const Vector3 & p, const Vector3 & c, const Vector3 & w) { 2956 Vector3 d = (p - c) * w * 255; 2957 return dot(d, d); 2958 } 2959 2960 static float evaluate_mse(const Color32 & p, const Vector3 & c, const Vector3 & w) { 2961 Vector3 d = (color_to_vector3(p) - c) * w * 255; 2962 return dot(d, d); 2963 } 2964 2965 static int evaluate_mse(const Color32 & p, const Color32 & c) { 2966 return (square(int(p.r)-c.r) + square(int(p.g)-c.g) + square(int(p.b)-c.b)); 2967 } 2968 2969 static int evaluate_mse(const Color32 palette[4], const Color32 & c) { 2970 int e0 = evaluate_mse(palette[0], c); 2971 int e1 = evaluate_mse(palette[1], c); 2972 int e2 = evaluate_mse(palette[2], c); 2973 int e3 = evaluate_mse(palette[3], c); 2974 return min(min(e0, e1), min(e2, e3)); 2975 } 2976 2977 // Returns MSE error in [0-255] range. 2978 static int evaluate_mse(const BlockBC1 * output, Color32 color, int index) { 2979 Color32 palette[4]; 2980 evaluate_palette(output->col0, output->col1, palette); 2981 2982 return evaluate_mse(palette[index], color); 2983 } 2984 2985 // Returns weighted MSE error in [0-255] range.
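// Editorial note (not from the original source): the "[0-255] range" comments refer to the channel
// scale, not to the magnitude of the result. These error helpers accumulate squared per-channel
// error in 8-bit units over the whole block, so a single pixel that is off by 2 in red and 1 in
// green contributes weight * (2*2 + 1*1) = 5 * weight, and block totals can far exceed 255.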
2986 static float evaluate_palette_error(Color32 palette[4], const Color32 * colors, const float * weights, int count) { 2987 2988 float total = 0.0f; 2989 for (int i = 0; i < count; i++) { 2990 total += weights[i] * float(evaluate_mse(palette, colors[i])); 2991 } 2992 2993 return total; 2994 } 2995 2996 static float evaluate_palette_error(Color32 palette[4], const Color32 * colors, int count) { 2997 2998 float total = 0.0f; 2999 for (int i = 0; i < count; i++) { 3000 total += float(evaluate_mse(palette, colors[i])); 3001 } 3002 3003 return total; 3004 } 3005 3006 static float evaluate_mse(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, const BlockBC1 * output) { 3007 Color32 palette[4]; 3008 evaluate_palette(output->col0, output->col1, palette); 3009 3010 // evaluate error for each index. 3011 float error = 0.0f; 3012 for (int i = 0; i < 16; i++) { 3013 int index = (output->indices >> (2 * i)) & 3; 3014 error += input_weights[i] * evaluate_mse(palette[index], input_colors[i].xyz, color_weights); 3015 } 3016 return error; 3017 } 3018 3019 static float evaluate_mse(const Vector4 input_colors[16], const float input_weights[16], const Vector3& color_weights, Vector3 palette[4], uint32 indices) { 3020 3021 // evaluate error for each index. 3022 float error = 0.0f; 3023 for (int i = 0; i < 16; i++) { 3024 int index = (indices >> (2 * i)) & 3; 3025 error += input_weights[i] * evaluate_mse(palette[index], input_colors[i].xyz, color_weights); 3026 } 3027 return error; 3028 } 3029 3030 float evaluate_bc1_error(const uint8 rgba_block[16*4], const BlockBC1 * block, Decoder decoder) { 3031 Color32 palette[4]; 3032 evaluate_palette(block->col0, block->col1, palette, decoder); 3033 3034 // evaluate error for each index. 3035 float error = 0.0f; 3036 for (int i = 0; i < 16; i++) { 3037 int index = (block->indices >> (2 * i)) & 3; 3038 Color32 c; 3039 c.r = rgba_block[4 * i + 0]; 3040 c.g = rgba_block[4 * i + 1]; 3041 c.b = rgba_block[4 * i + 2]; 3042 c.a = 255; 3043 error += float(evaluate_mse(palette[index], c)); 3044 } 3045 return error; 3046 } 3047 3048 float evaluate_bc3_error(const uint8 rgba_block[16*4], const BlockBC3 * block, bool alpha_blend, Decoder decoder) { 3049 Color32 rgb_palette[4]; 3050 evaluate_palette4(block->rgb.col0, block->rgb.col1, rgb_palette, decoder); 3051 3052 uint8 alpha_palette[8]; 3053 evaluate_palette(block->alpha.alpha0, block->alpha.alpha1, alpha_palette, decoder); 3054 3055 // evaluate error for each index. 3056 float error = 0.0f; 3057 for (int i = 0; i < 16; i++) { 3058 Color32 c; 3059 c.r = rgba_block[4 * i + 0]; 3060 c.g = rgba_block[4 * i + 1]; 3061 c.b = rgba_block[4 * i + 2]; 3062 c.a = rgba_block[4 * i + 3]; 3063 3064 int rgb_index = (block->rgb.indices >> (2 * i)) & 3; 3065 int alpha_index = 0; // @@ 3066 3067 error += (alpha_blend ? (c.a * 1.0f/255.0f) : 1.0f) * float(evaluate_mse(rgb_palette[rgb_index], c)); 3068 error += float(square(int(alpha_palette[alpha_index]) - c.a)); 3069 } 3070 return error; 3071 } 3072 3073 3074 3075 /////////////////////////////////////////////////////////////////////////////////////////////////// 3076 // Index selection 3077 3078 // @@ Can we interleave the two uint16 at once? 
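// Worked example of the bit interleave used by the index selectors (editorial illustration): each
// selector builds two 16-bit masks, one holding the low bit and one the high bit of every pixel's
// 2-bit palette index, and interleave(low_bits, high_bits) merges them into the packed 32-bit BC1
// index field with the low bits landing in the even positions. For instance:
//   interleave_uint16_with_zeros(0b1011) == 0b01000101   // input bits spread to even positions
//   interleave(0b1011, 0b0110)           == 0b01101101   // pixels 0..3 get indices 1, 3, 2, 1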
3079 inline uint32 interleave_uint16_with_zeros(uint32 input) { 3080 uint32 word = input; 3081 word = (word ^ (word << 8 )) & 0x00ff00ff; 3082 word = (word ^ (word << 4 )) & 0x0f0f0f0f; 3083 word = (word ^ (word << 2 )) & 0x33333333; 3084 word = (word ^ (word << 1 )) & 0x55555555; 3085 return word; 3086 } 3087 3088 // Interleave the bits. https://lemire.me/blog/2018/01/08/how-fast-can-you-bit-interleave-32-bit-integers/ 3089 ICBC_FORCEINLINE uint32 interleave(uint32 a, uint32 b) { 3090 #if ICBC_BMI2 3091 return _pdep_u32(a, 0x55555555) | _pdep_u32(b, 0xaaaaaaaa); 3092 #else 3093 return interleave_uint16_with_zeros(a) | (interleave_uint16_with_zeros(b) << 1); 3094 #endif 3095 } 3096 3097 static uint compute_indices4(const Vector4 input_colors[16], const Vector3 & color_weights, const Vector3 palette[4]) { 3098 uint indices0 = 0; 3099 uint indices1 = 0; 3100 3101 VVector3 vw = vbroadcast(color_weights); 3102 VVector3 vp0 = vbroadcast(palette[0]) * vw; 3103 VVector3 vp1 = vbroadcast(palette[1]) * vw; 3104 VVector3 vp2 = vbroadcast(palette[2]) * vw; 3105 VVector3 vp3 = vbroadcast(palette[3]) * vw; 3106 3107 for (int i = 0; i < 16; i += VEC_SIZE) { 3108 VVector3 vc = vload(&input_colors[i]) * vw; 3109 3110 VFloat d0 = vlen2(vc - vp0); 3111 VFloat d1 = vlen2(vc - vp1); 3112 VFloat d2 = vlen2(vc - vp2); 3113 VFloat d3 = vlen2(vc - vp3); 3114 3115 VMask b1 = d1 > d2; 3116 VMask b2 = d0 > d2; 3117 VMask x0 = b1 & b2; 3118 3119 VMask b0 = d0 > d3; 3120 VMask b3 = d1 > d3; 3121 x0 = x0 | (b0 & b3); 3122 3123 VMask b4 = d2 > d3; 3124 VMask x1 = b0 & b4; 3125 3126 indices0 |= mask(x0) << i; 3127 indices1 |= mask(x1) << i; 3128 } 3129 3130 return interleave(indices1, indices0); 3131 } 3132 3133 static uint compute_indices3(const Vector4 input_colors[16], const Vector3 & color_weights, bool allow_transparent_black, const Vector3 palette[4]) { 3134 uint indices0 = 0; 3135 uint indices1 = 0; 3136 3137 VVector3 vw = vbroadcast(color_weights); 3138 VVector3 vp0 = vbroadcast(palette[0]) * vw; 3139 VVector3 vp1 = vbroadcast(palette[1]) * vw; 3140 VVector3 vp2 = vbroadcast(palette[2]) * vw; 3141 3142 if (allow_transparent_black) { 3143 for (int i = 0; i < 16; i += VEC_SIZE) { 3144 VVector3 vc = vload(&input_colors[i]) * vw; 3145 3146 VFloat d0 = vlen2(vp0 - vc); 3147 VFloat d1 = vlen2(vp1 - vc); 3148 VFloat d2 = vlen2(vp2 - vc); 3149 VFloat d3 = vdot(vc, vc); 3150 3151 VMask i1 = (d1 < d2); 3152 VMask i2 = (d2 <= d0) & (d2 <= d1); 3153 VMask i3 = (d3 <= d0) & (d3 <= d1) & (d3 <= d2); 3154 3155 indices0 |= mask(i2 | i3) << i; 3156 indices1 |= mask(i1 | i3) << i; 3157 } 3158 } 3159 else { 3160 for (int i = 0; i < 16; i += VEC_SIZE) { 3161 VVector3 vc = vload(&input_colors[i]) * vw; 3162 3163 VFloat d0 = vlen2(vc - vp0); 3164 VFloat d1 = vlen2(vc - vp1); 3165 VFloat d2 = vlen2(vc - vp2); 3166 3167 VMask i1 = (d1 < d2); 3168 VMask i2 = (d2 <= d0) & (d2 <= d1); 3169 3170 indices0 |= mask(i2) << i; 3171 indices1 |= mask(i1) << i; 3172 } 3173 } 3174 3175 return interleave(indices1, indices0); 3176 } 3177 3178 3179 static uint compute_indices(const Vector4 input_colors[16], const Vector3 & color_weights, const Vector3 palette[4]) { 3180 #if 0 3181 Vector3 p0 = palette[0] * color_weights; 3182 Vector3 p1 = palette[1] * color_weights; 3183 Vector3 p2 = palette[2] * color_weights; 3184 Vector3 p3 = palette[3] * color_weights; 3185 3186 uint indices = 0; 3187 for (int i = 0; i < 16; i++) { 3188 Vector3 ci = input_colors[i].xyz * color_weights; 3189 float d0 = lengthSquared(p0 - ci); 3190 float d1 = lengthSquared(p1 - ci); 
3191 float d2 = lengthSquared(p2 - ci); 3192 float d3 = lengthSquared(p3 - ci); 3193 3194 uint index; 3195 if (d0 < d1 && d0 < d2 && d0 < d3) index = 0; 3196 else if (d1 < d2 && d1 < d3) index = 1; 3197 else if (d2 < d3) index = 2; 3198 else index = 3; 3199 3200 indices |= index << (2 * i); 3201 } 3202 3203 return indices; 3204 #else 3205 uint indices0 = 0; 3206 uint indices1 = 0; 3207 3208 VVector3 vw = vbroadcast(color_weights); 3209 VVector3 vp0 = vbroadcast(palette[0]) * vw; 3210 VVector3 vp1 = vbroadcast(palette[1]) * vw; 3211 VVector3 vp2 = vbroadcast(palette[2]) * vw; 3212 VVector3 vp3 = vbroadcast(palette[3]) * vw; 3213 3214 for (int i = 0; i < 16; i += VEC_SIZE) { 3215 VVector3 vc = vload(&input_colors[i]) * vw; 3216 3217 VFloat d0 = vlen2(vc - vp0); 3218 VFloat d1 = vlen2(vc - vp1); 3219 VFloat d2 = vlen2(vc - vp2); 3220 VFloat d3 = vlen2(vc - vp3); 3221 3222 //VMask i0 = (d0 < d1) & (d0 < d2) & (d0 < d3); 3223 VMask i1 = (d1 <= d0) & (d1 < d2) & (d1 < d3); 3224 VMask i2 = (d2 <= d0) & (d2 <= d1) & (d2 < d3); 3225 VMask i3 = (d3 <= d0) & (d3 <= d1) & (d3 <= d2); 3226 //VFloat vindex = vselect(i0, vselect(i1, vselect(i2, vbroadcast(3), vbroadcast(2)), vbroadcast(1)), vbroadcast(0)); 3227 3228 indices0 |= mask(i2 | i3) << i; 3229 indices1 |= mask(i1 | i3) << i; 3230 } 3231 3232 uint indices = interleave(indices1, indices0); 3233 return indices; 3234 #endif 3235 } 3236 3237 3238 static float output_block3(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, bool allow_transparent_black, const Vector3 & v0, const Vector3 & v1, BlockBC1 * block) 3239 { 3240 Color16 color0 = vector3_to_color16(v0); 3241 Color16 color1 = vector3_to_color16(v1); 3242 3243 if (color0.u > color1.u) { 3244 swap(color0, color1); 3245 } 3246 3247 Vector3 palette[4]; 3248 evaluate_palette(color0, color1, palette); 3249 3250 block->col0 = color0; 3251 block->col1 = color1; 3252 block->indices = compute_indices3(input_colors, color_weights, allow_transparent_black, palette); 3253 3254 return evaluate_mse(input_colors, input_weights, color_weights, palette, block->indices); 3255 } 3256 3257 static float output_block4(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, const Vector3 & v0, const Vector3 & v1, BlockBC1 * block) 3258 { 3259 Color16 color0 = vector3_to_color16(v0); 3260 Color16 color1 = vector3_to_color16(v1); 3261 3262 if (color0.u < color1.u) { 3263 swap(color0, color1); 3264 } 3265 3266 Vector3 palette[4]; 3267 evaluate_palette(color0, color1, palette); 3268 3269 block->col0 = color0; 3270 block->col1 = color1; 3271 block->indices = compute_indices4(input_colors, color_weights, palette); 3272 3273 return evaluate_mse(input_colors, input_weights, color_weights, palette, block->indices); 3274 } 3275 3276 3277 // Least squares fitting of color end points for the given indices. @@ Take weights into account. 
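// Derivation sketch for the least squares solver below (editorial note): each pixel is modeled as
// alpha_i * a + beta_i * b with (alpha_i, beta_i) in {(1,0), (2/3,1/3), (1/3,2/3), (0,1)} chosen by
// its 2-bit index. Minimizing sum_i |alpha_i*a + beta_i*b - x_i|^2 over the endpoints a and b gives
// the 2x2 normal equations
//   [ alpha2_sum     alphabeta_sum ] [a]   [ alphax_sum ]
//   [ alphabeta_sum  beta2_sum     ] [b] = [ betax_sum  ]
// and the expressions computed with 'factor' below are Cramer's rule applied to this system.
// cluster_fit_three/cluster_fit_four solve the same system for every candidate clustering, with the
// sums taken from the summed area table instead of per-pixel loops.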
3278 static bool optimize_end_points4(uint indices, const Vector4 * colors, /*const float * weights,*/ int count, Vector3 * a, Vector3 * b) 3279 { 3280 float alpha2_sum = 0.0f; 3281 float beta2_sum = 0.0f; 3282 float alphabeta_sum = 0.0f; 3283 Vector3 alphax_sum = { 0,0,0 }; 3284 Vector3 betax_sum = { 0,0,0 }; 3285 3286 for (int i = 0; i < count; i++) 3287 { 3288 const uint bits = indices >> (2 * i); 3289 3290 float beta = float(bits & 1); 3291 if (bits & 2) beta = (1 + beta) / 3.0f; 3292 float alpha = 1.0f - beta; 3293 3294 alpha2_sum += alpha * alpha; 3295 beta2_sum += beta * beta; 3296 alphabeta_sum += alpha * beta; 3297 alphax_sum += alpha * colors[i].xyz; 3298 betax_sum += beta * colors[i].xyz; 3299 } 3300 3301 float denom = alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum; 3302 if (equal(denom, 0.0f)) return false; 3303 3304 float factor = 1.0f / denom; 3305 3306 *a = saturate((alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor); 3307 *b = saturate((betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor); 3308 3309 return true; 3310 } 3311 3312 // find minimum and maximum colors based on bounding box in color space 3313 inline static void fit_colors_bbox(const Vector3 * colors, int count, Vector3 * __restrict c0, Vector3 * __restrict c1) 3314 { 3315 *c0 = { 0,0,0 }; 3316 *c1 = { 1,1,1 }; 3317 3318 for (int i = 0; i < count; i++) { 3319 *c0 = max(*c0, colors[i]); 3320 *c1 = min(*c1, colors[i]); 3321 } 3322 } 3323 3324 inline static void select_diagonal(const Vector3 * colors, int count, Vector3 * __restrict c0, Vector3 * __restrict c1) 3325 { 3326 Vector3 center = (*c0 + *c1) * 0.5f; 3327 3328 /*Vector3 center = colors[0]; 3329 for (int i = 1; i < count; i++) { 3330 center = center * float(i-1) / i + colors[i] / i; 3331 }*/ 3332 /*Vector3 center = colors[0]; 3333 for (int i = 1; i < count; i++) { 3334 center += colors[i]; 3335 } 3336 center /= count;*/ 3337 3338 float cov_xz = 0.0f; 3339 float cov_yz = 0.0f; 3340 for (int i = 0; i < count; i++) { 3341 Vector3 t = colors[i] - center; 3342 cov_xz += t.x * t.z; 3343 cov_yz += t.y * t.z; 3344 } 3345 3346 float x0 = c0->x; 3347 float y0 = c0->y; 3348 float x1 = c1->x; 3349 float y1 = c1->y; 3350 3351 if (cov_xz < 0) { 3352 swap(x0, x1); 3353 } 3354 if (cov_yz < 0) { 3355 swap(y0, y1); 3356 } 3357 3358 *c0 = { x0, y0, c0->z }; 3359 *c1 = { x1, y1, c1->z }; 3360 } 3361 3362 inline static void inset_bbox(Vector3 * __restrict c0, Vector3 * __restrict c1) 3363 { 3364 float bias = (8.0f / 255.0f) / 16.0f; 3365 Vector3 inset = saturate((*c0 - *c1) / 16.0f - scalar_to_vector3(bias)); 3366 *c0 = saturate(*c0 - inset); 3367 *c1 = saturate(*c1 + inset); 3368 } 3369 3370 3371 3372 // Single color lookup tables from: 3373 // https://github.com/nothings/stb/blob/master/stb_dxt.h 3374 static uint8 s_match5[256][2]; 3375 static uint8 s_match6[256][2]; 3376 3377 static void PrepareOptTable5(uint8 * table, Decoder decoder) 3378 { 3379 uint8 expand[32]; 3380 for (int i = 0; i < 32; i++) expand[i] = uint8((i << 3) | (i >> 2)); 3381 3382 for (int i = 0; i < 256; i++) { 3383 int bestErr = 256 * 100; 3384 3385 for (int mn = 0; mn < 32; mn++) { 3386 for (int mx = 0; mx < 32; mx++) { 3387 int mine = expand[mn]; 3388 int maxe = expand[mx]; 3389 3390 int err; 3391 3392 int amd_r = (43 * maxe + 21 * mine + 32) >> 6; 3393 int amd_err = abs(amd_r - i); 3394 3395 int nv_r = ((2 * mx + mn) * 22) / 8; 3396 int nv_err = abs(nv_r - i); 3397 3398 int intel_r = (171 * maxe + 85 * mine + 128) >> 8; 3399 int intel_err = abs(intel_r - i); 3400 3401 if 
(decoder == Decoder_D3D10) { 3402 // DX10 spec says that interpolation must be within 3% of "correct" result, 3403 // add this as error term. (normally we'd expect a random distribution of 3404 // +-1.5% error, but nowhere in the spec does it say that the error has to be 3405 // unbiased - better safe than sorry). 3406 int r = (maxe * 2 + mine) / 3; 3407 err = abs(r - i) * 100 + abs(mx - mn) * 3; 3408 3409 // Another approach is to consider the worst of AMD and NVIDIA errors. 3410 err = max(max(amd_err, nv_err), intel_err); 3411 } 3412 else if (decoder == Decoder_NVIDIA) { 3413 err = nv_err; 3414 } 3415 else if (decoder == Decoder_Intel) { 3416 err = intel_err; 3417 } 3418 else /*if (decoder == Decoder_AMD)*/ { 3419 err = amd_err; 3420 } 3421 3422 if (err < bestErr) { 3423 bestErr = err; 3424 table[i * 2 + 0] = uint8(mx); 3425 table[i * 2 + 1] = uint8(mn); 3426 } 3427 } 3428 } 3429 } 3430 } 3431 3432 static void PrepareOptTable6(uint8 * table, Decoder decoder) 3433 { 3434 uint8 expand[64]; 3435 for (int i = 0; i < 64; i++) expand[i] = uint8((i << 2) | (i >> 4)); 3436 3437 for (int i = 0; i < 256; i++) { 3438 int bestErr = 256 * 100; 3439 3440 for (int mn = 0; mn < 64; mn++) { 3441 for (int mx = 0; mx < 64; mx++) { 3442 int mine = expand[mn]; 3443 int maxe = expand[mx]; 3444 3445 int err; 3446 3447 int amd_g = (43 * maxe + 21 * mine + 32) >> 6; 3448 int amd_err = abs(amd_g - i); 3449 3450 int nv_g = (256 * mine + (maxe - mine) / 4 + 128 + (maxe - mine) * 80) / 256; 3451 int nv_err = abs(nv_g - i); 3452 3453 int intel_g = (171 * maxe + 85 * mine + 128) >> 8; 3454 int intel_err = abs(intel_g - i); 3455 3456 if (decoder == Decoder_D3D10) { 3457 // DX10 spec says that interpolation must be within 3% of "correct" result, 3458 // add this as error term. (normally we'd expect a random distribution of 3459 // +-1.5% error, but nowhere in the spec does it say that the error has to be 3460 // unbiased - better safe than sorry). 3461 int g = (maxe * 2 + mine) / 3; 3462 err = abs(g - i) * 100 + abs(mx - mn) * 3; 3463 3464 // Another approach is to consider the worst of AMD and NVIDIA errors. 3465 err = max(max(amd_err, nv_err), intel_err); 3466 } 3467 else if (decoder == Decoder_NVIDIA) { 3468 err = nv_err; 3469 } 3470 else if (decoder == Decoder_Intel) { 3471 err = intel_err; 3472 } 3473 else /*if (decoder == Decoder_AMD)*/ { 3474 err = amd_err; 3475 } 3476 3477 if (err < bestErr) { 3478 bestErr = err; 3479 table[i * 2 + 0] = uint8(mx); 3480 table[i * 2 + 1] = uint8(mn); 3481 } 3482 } 3483 } 3484 } 3485 } 3486 3487 3488 static void init_single_color_tables(Decoder decoder) 3489 { 3490 // Prepare single color lookup tables. 3491 PrepareOptTable5(&s_match5[0][0], decoder); 3492 PrepareOptTable6(&s_match6[0][0], decoder); 3493 } 3494 3495 static void init_decoder_weights(Decoder decoder) 3496 { 3497 w1_3 = 1.0f / 3; 3498 w2_3 = 2.0f / 3; 3499 3500 if (decoder == Decoder_NVIDIA) { 3501 // @@ Choose weights that approximate better nvidia's decoder. 
3502 } 3503 else if (decoder == Decoder_AMD) { 3504 w1_3 = 21.0f / 64; 3505 w2_3 = 43.0f / 64; 3506 } 3507 else if (decoder == Decoder_Intel) { 3508 w1_3 = 85.0f / 256; 3509 w2_3 = 171.0f / 256; 3510 } 3511 3512 w1_9 = w1_3 * w1_3; 3513 w4_9 = w2_3 * w2_3; 3514 w2_9 = w1_3 * w2_3; 3515 } 3516 3517 // Single color compressor, based on: 3518 // https://mollyrocket.com/forums/viewtopic.php?t=392 3519 static void compress_dxt1_single_color_optimal(Color32 c, BlockBC1 * output) 3520 { 3521 output->col0.r = s_match5[c.r][0]; 3522 output->col0.g = s_match6[c.g][0]; 3523 output->col0.b = s_match5[c.b][0]; 3524 output->col1.r = s_match5[c.r][1]; 3525 output->col1.g = s_match6[c.g][1]; 3526 output->col1.b = s_match5[c.b][1]; 3527 output->indices = 0xaaaaaaaa; 3528 3529 if (output->col0.u < output->col1.u) 3530 { 3531 swap(output->col0.u, output->col1.u); 3532 output->indices ^= 0x55555555; 3533 } 3534 } 3535 3536 3537 static float compress_dxt1_cluster_fit(const Vector4 input_colors[16], const float input_weights[16], const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, bool three_color_mode, bool try_transparent_black, bool allow_transparent_black, BlockBC1 * output) 3538 { 3539 Vector3 metric_sqr = color_weights * color_weights; 3540 3541 SummedAreaTable sat; 3542 int sat_count = compute_sat(colors, weights, count, &sat); 3543 3544 Vector3 start, end; 3545 cluster_fit_four(sat, sat_count, metric_sqr, &start, &end); 3546 3547 float best_error = output_block4(input_colors, input_weights, color_weights, start, end, output); 3548 3549 if (three_color_mode) { 3550 if (try_transparent_black) { 3551 Vector3 tmp_colors[16]; 3552 float tmp_weights[16]; 3553 int tmp_count = skip_blacks(colors, weights, count, tmp_colors, tmp_weights); 3554 if (!tmp_count) return best_error; 3555 3556 sat_count = compute_sat(tmp_colors, tmp_weights, tmp_count, &sat); 3557 } 3558 3559 cluster_fit_three(sat, sat_count, metric_sqr, &start, &end); 3560 3561 BlockBC1 three_color_block; 3562 float three_color_error = output_block3(input_colors, input_weights, color_weights, allow_transparent_black, start, end, &three_color_block); 3563 3564 if (three_color_error < best_error) { 3565 best_error = three_color_error; 3566 *output = three_color_block; 3567 } 3568 } 3569 3570 return best_error; 3571 } 3572 3573 3574 static float refine_endpoints(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, bool three_color_mode, float input_error, BlockBC1 * output) { 3575 // TODO: 3576 // - Optimize palette evaluation when updating only one channel. 3577 // - try all diagonals. 3578 3579 // Things that don't help: 3580 // - Alternate endpoint updates. 3581 // - Randomize order. 3582 // - If one direction does not improve, test opposite direction next. 
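    // The loop below performs a greedy hill climb: each iteration perturbs one of the two
    // endpoints (alternating every 16 iterations) by one of the 16 deltas below in 5:6:5
    // space, recomputes the indices for the perturbed endpoints, and keeps the change only
    // if the weighted MSE improves. The search gives up after 32 iterations without
    // improvement.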
3583 
3584     static const int8 deltas[16][3] = {
3585         {1,0,0},
3586         {0,1,0},
3587         {0,0,1},
3588 
3589         {-1,0,0},
3590         {0,-1,0},
3591         {0,0,-1},
3592 
3593         {1,1,0},
3594         {1,0,1},
3595         {0,1,1},
3596 
3597         {-1,-1,0},
3598         {-1,0,-1},
3599         {0,-1,-1},
3600 
3601         {-1,1,0},
3602         //{-1,0,1},
3603 
3604         {1,-1,0},
3605         {0,-1,1},
3606 
3607         //{1,0,-1},
3608         {0,1,-1},
3609     };
3610 
3611     float best_error = input_error;
3612 
3613     int lastImprovement = 0;
3614     for (int i = 0; i < 256; i++) {
3615         BlockBC1 refined = *output;
3616         int8 delta[3] = { deltas[i % 16][0], deltas[i % 16][1], deltas[i % 16][2] };
3617 
3618         if ((i / 16) & 1) {
3619             refined.col0.r += delta[0];
3620             refined.col0.g += delta[1];
3621             refined.col0.b += delta[2];
3622         }
3623         else {
3624             refined.col1.r += delta[0];
3625             refined.col1.g += delta[1];
3626             refined.col1.b += delta[2];
3627         }
3628 
3629         if (!three_color_mode) {
3630             if (refined.col0.u == refined.col1.u) refined.col1.g += 1;
3631             if (refined.col0.u < refined.col1.u) swap(refined.col0.u, refined.col1.u);
3632         }
3633 
3634         Vector3 palette[4];
3635         evaluate_palette(refined.col0, refined.col1, palette);  // Build the palette from the perturbed endpoints so the index search matches them.
3636 
3637         refined.indices = compute_indices(input_colors, color_weights, palette);
3638 
3639         float refined_error = evaluate_mse(input_colors, input_weights, color_weights, &refined);
3640         if (refined_error < best_error) {
3641             best_error = refined_error;
3642             *output = refined;
3643             lastImprovement = i;
3644         }
3645 
3646         // Early out if the last 32 steps didn't improve error.
3647         if (i - lastImprovement > 32) break;
3648     }
3649 
3650     return best_error;
3651 }
3652 
3653 struct Options {
3654     float threshold = 0.0f;
3655     bool box_fit = false;
3656     bool least_squares_fit = false;
3657     bool cluster_fit = false;
3658     bool cluster_fit_3 = false;
3659     bool cluster_fit_3_black_only = false;
3660     bool endpoint_refinement = false;
3661 };
3662 
3663 static Options setup_options(Quality level, bool enable_three_color_mode, bool enable_transparent_black) {
3664     Options opt;
3665 
3666     switch (level) {
3667         case Quality_Level0: // Box fit only.
3668             opt.box_fit = true;
3669             opt.least_squares_fit = false;
3670             break;
3671 
3672         case Quality_Level1: // Box fit + least squares fit.
3673             opt.box_fit = true;
3674             opt.least_squares_fit = true;
3675             break;
3676 
3677         case Quality_Level2: // Cluster fit 4, threshold = 24.
3678             opt.box_fit = true;
3679             opt.least_squares_fit = true;
3680             opt.cluster_fit = true;
3681             opt.cluster_fit_3_black_only = enable_three_color_mode && enable_transparent_black;
3682             opt.threshold = 1.0f / 24;
3683             break;
3684 
3685         case Quality_Level3: // Cluster fit 4, threshold = 32.
3686             opt.box_fit = true;
3687             opt.cluster_fit = true;
3688             opt.cluster_fit_3_black_only = enable_three_color_mode && enable_transparent_black;
3689             opt.threshold = 1.0f / 32;
3690             break;
3691 
3692         case Quality_Level4: // Cluster fit 3+4, threshold = 48.
3693             opt.cluster_fit = true;
3694             opt.cluster_fit_3_black_only = enable_three_color_mode && enable_transparent_black;
3695             opt.threshold = 1.0f / 48;
3696             break;
3697 
3698         case Quality_Level5: // Cluster fit 3+4, threshold = 64.
3699             opt.cluster_fit = true;
3700             opt.cluster_fit_3_black_only = enable_three_color_mode && enable_transparent_black;
3701             opt.threshold = 1.0f / 64;
3702             break;
3703 
3704         case Quality_Level6: // Cluster fit 3+4, threshold = 96.
3705 opt.cluster_fit = true; 3706 opt.cluster_fit_3_black_only = enable_three_color_mode && enable_transparent_black; 3707 opt.threshold = 1.0f / 96; 3708 break; 3709 3710 case Quality_Level7: // Cluster fit 3+4, threshold = 128. 3711 opt.cluster_fit = true; 3712 opt.cluster_fit_3_black_only = enable_three_color_mode && enable_transparent_black; 3713 opt.threshold = 1.0f / 128; 3714 break; 3715 3716 case Quality_Level8: // Cluster fit 3+4, threshold = 256. 3717 opt.cluster_fit = true; 3718 opt.cluster_fit_3 = enable_three_color_mode; 3719 opt.threshold = 1.0f / 256; 3720 break; 3721 3722 case Quality_Level9: // Cluster fit 3+4, threshold = 256 + Refinement. 3723 opt.cluster_fit = true; 3724 opt.cluster_fit_3 = enable_three_color_mode; 3725 opt.threshold = 1.0f / 256; 3726 opt.endpoint_refinement = true; 3727 break; 3728 } 3729 3730 return opt; 3731 } 3732 3733 3734 static float compress_bc1(Quality level, const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, bool three_color_mode, bool three_color_black, BlockBC1 * output) 3735 { 3736 Options opt = setup_options(level, three_color_mode, three_color_black); 3737 3738 Vector3 colors[16]; 3739 float weights[16]; 3740 bool any_black = false; 3741 int count; 3742 if (opt.cluster_fit) { 3743 count = reduce_colors(input_colors, input_weights, 16, opt.threshold, colors, weights, &any_black); 3744 } 3745 else { 3746 for (int i = 0; i < 16; i++) { 3747 colors[i] = input_colors[i].xyz; 3748 } 3749 count = 16; 3750 } 3751 3752 if (count == 0) { 3753 // Output trivial block. 3754 output->col0.u = 0; 3755 output->col1.u = 0; 3756 output->indices = 0; 3757 return 0; 3758 } 3759 3760 // Cluster fit cannot handle single color blocks, so encode them optimally. 3761 if (count == 1) { 3762 compress_dxt1_single_color_optimal(vector3_to_color32(colors[0]), output); 3763 return evaluate_mse(input_colors, input_weights, color_weights, output); 3764 } 3765 3766 float error = FLT_MAX; 3767 3768 // Quick end point selection. 3769 if (opt.box_fit) { 3770 Vector3 c0, c1; 3771 fit_colors_bbox(colors, count, &c0, &c1); 3772 inset_bbox(&c0, &c1); 3773 select_diagonal(colors, count, &c0, &c1); 3774 error = output_block4(input_colors, input_weights, color_weights, c0, c1, output); 3775 3776 // Refine color for the selected indices. 3777 if (opt.least_squares_fit && optimize_end_points4(output->indices, input_colors, 16, &c0, &c1)) { 3778 BlockBC1 optimized_block; 3779 float optimized_error = output_block4(input_colors, input_weights, color_weights, c0, c1, &optimized_block); 3780 3781 if (optimized_error < error) { 3782 error = optimized_error; 3783 *output = optimized_block; 3784 } 3785 } 3786 } 3787 3788 if (opt.cluster_fit) { 3789 // @@ Use current endpoints as input for initial PCA approximation? 3790 3791 bool use_three_color_black = any_black && three_color_black; 3792 bool use_three_color_mode = opt.cluster_fit_3 || (use_three_color_black && opt.cluster_fit_3_black_only); 3793 3794 // Try cluster fit. 
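        // compress_dxt1_cluster_fit builds a summed-area table over the reduced colors,
        // runs the 4-color cluster fit, and, when three-color mode is enabled, also tries
        // a 3-color fit (optionally skipping near-black texels so they can map to the
        // transparent black index), keeping whichever candidate block has the lower error.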
3795 BlockBC1 cluster_fit_output; 3796 float cluster_fit_error = compress_dxt1_cluster_fit(input_colors, input_weights, colors, weights, count, color_weights, use_three_color_mode, use_three_color_black, three_color_black, &cluster_fit_output); 3797 if (cluster_fit_error < error) { 3798 *output = cluster_fit_output; 3799 error = cluster_fit_error; 3800 } 3801 } 3802 3803 if (opt.endpoint_refinement) { 3804 error = refine_endpoints(input_colors, input_weights, color_weights, three_color_mode, error, output); 3805 } 3806 3807 return error; 3808 } 3809 3810 3811 // Public API 3812 3813 void init(Decoder decoder) { 3814 s_decoder = decoder; 3815 init_single_color_tables(decoder); 3816 init_cluster_tables(); 3817 init_decoder_weights(decoder); 3818 } 3819 3820 void decode_bc1(const void * block, unsigned char rgba_block[16 * 4], Decoder decoder/*=Decoder_D3D10*/) { 3821 decode_bc1((const BlockBC1 *)block, rgba_block, decoder); 3822 } 3823 3824 void decode_bc3(const void * block, unsigned char rgba_block[16 * 4], Decoder decoder/*=Decoder_D3D10*/) { 3825 decode_bc3((const BlockBC3 *)block, rgba_block, decoder); 3826 } 3827 3828 // void decode_bc4(const void * block, bool snorm, float rgba_block[16 * 4], Decoder decoder/*=Decoder_D3D10*/) { 3829 // decode_bc4((const BlockBC4 *)block, snorm, rgba_block, decoder); 3830 // } 3831 3832 // void decode_bc5(const void * block, bool snorm, float rgba_block[16 * 4], Decoder decoder/*=Decoder_D3D10*/) { 3833 // decode_bc5((const BlockBC5 *)block, snorm, rgba_block, decoder); 3834 // } 3835 3836 3837 float evaluate_bc1_error(const unsigned char rgba_block[16 * 4], const void * dxt_block, Decoder decoder/*=Decoder_D3D10*/) { 3838 return evaluate_bc1_error(rgba_block, (const BlockBC1 *)dxt_block, decoder); 3839 } 3840 3841 float evaluate_bc3_error(const unsigned char rgba_block[16 * 4], const void * dxt_block, bool alpha_blend, Decoder decoder/*=Decoder_D3D10*/) { 3842 return evaluate_bc3_error(rgba_block, (const BlockBC3 *)dxt_block, alpha_blend, decoder); 3843 } 3844 3845 3846 float compress_bc1(Quality level, const float * input_colors, const float * input_weights, const float rgb[3], bool three_color_mode, bool three_color_black, void * output) { 3847 return compress_bc1(level, (Vector4*)input_colors, input_weights, { rgb[0], rgb[1], rgb[2] }, three_color_mode, three_color_black, (BlockBC1*)output); 3848 } 3849 3850 float compress_bc3(Quality level, const float * input_colors, const float * input_weights, const float rgb[3], bool six_alpha_mode, void * output) { 3851 //return compress_bc3(level, (Vector4*)input_colors, input_weights, { rgb[0], rgb[1], rgb[2] }, six_alpha_mode, (BlockBC3*)output); 3852 return 0.0f; // @@ 3853 } 3854 3855 } // icbc 3856 3857 // // Do not polute preprocessor definitions. 3858 // #undef ICBC_SIMD 3859 // #undef ICBC_ASSERT 3860 3861 // #undef ICBC_SCALAR 3862 // #undef ICBC_SSE2 3863 // #undef ICBC_SSE41 3864 // #undef ICBC_AVX1 3865 // #undef ICBC_AVX2 3866 // #undef ICBC_AVX512 3867 // #undef ICBC_NEON 3868 // #undef ICBC_VMX 3869 3870 // #undef ICBC_USE_FMA 3871 // #undef ICBC_USE_AVX2_PERMUTE2 3872 // #undef ICBC_USE_AVX512_PERMUTE 3873 // #undef ICBC_USE_NEON_VTL 3874 3875 // #undef ICBC_PERFECT_ROUND 3876 3877 #endif // ICBC_IMPLEMENTATION 3878 3879 // Version History: 3880 // v1.00 - Initial release. 3881 // v1.01 - Added SPMD code path with AVX support. 3882 // v1.02 - Removed SIMD code path. 3883 // v1.03 - Quality levels. AVX512, Neon, Altivec, vectorized reduction and index selection. 
3884 // v1.04 - Automatic compile-time SIMD selection. Specify hw decoder at runtime. More optimizations. 3885 // v1.05 - Bug fixes. Small optimizations. 3886 3887 // Copyright (c) 2020 Ignacio Castano <castano@gmail.com> 3888 // 3889 // Permission is hereby granted, free of charge, to any person obtaining 3890 // a copy of this software and associated documentation files (the 3891 // "Software"), to deal in the Software without restriction, including 3892 // without limitation the rights to use, copy, modify, merge, publish, 3893 // distribute, sublicense, and/or sell copies of the Software, and to 3894 // permit persons to whom the Software is furnished to do so, subject to 3895 // the following conditions: 3896 // 3897 // The above copyright notice and this permission notice shall be included 3898 // in all copies or substantial portions of the Software. 3899 // 3900 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 3901 // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 3902 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 3903 // IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 3904 // CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 3905 // TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 3906 // SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
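
// Example usage (illustrative sketch only; it assumes the input texels are RGBA floats
// normalized to [0,1] and that ICBC_IMPLEMENTATION is defined in exactly one translation
// unit before including this header):
//
//     #define ICBC_IMPLEMENTATION
//     #include "icbc.h"
//
//     void example() {
//         icbc::init(icbc::Decoder_D3D10);   // build the decoder-specific tables once
//
//         float rgba[16 * 4] = { /* one 4x4 block of RGBA texels, assumed in [0,1] */ };
//         float weights[16];
//         for (int i = 0; i < 16; i++) weights[i] = 1.0f;          // equal per-texel weights
//         const float color_weights[3] = { 1.0f, 1.0f, 1.0f };     // per-channel error weights
//
//         unsigned char block[8];                                  // one BC1 block is 8 bytes
//         float mse = icbc::compress_bc1(icbc::Quality_Default, rgba, weights, color_weights,
//                                        /*three_color_mode=*/true, /*three_color_black=*/true, block);
//
//         unsigned char decoded[16 * 4];
//         icbc::decode_bc1(block, decoded);                        // expand back to RGBA bytes
//         (void)mse; (void)decoded;
//     }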