skinpeel

Change the Vita's lockscreen foreground and background randomly
git clone https://git.neptards.moe/neptards/skinpeel.git

icbc.h (127319B)


      1 // icbc.h v1.05
      2 // A High Quality SIMD BC1 Encoder by Ignacio Castano <castano@gmail.com>.
      3 //
      4 // LICENSE:
      5 //  MIT license at the end of this file.
      6 
      7 #ifndef ICBC_H
      8 #define ICBC_H
      9 
     10 namespace icbc {
     11 
     12     enum Decoder {
     13         Decoder_D3D10 = 0,
     14         Decoder_NVIDIA = 1,
     15         Decoder_AMD = 2,    // Apple's M1 decoder appears to match AMD's
     16         Decoder_Intel = 3
     17     };
     18 
     19     void init(Decoder decoder = Decoder_D3D10);
     20 
     21     enum Quality {
     22         Quality_Level0,  // Box fit only.
     23         Quality_Level1,  // Box fit + least squares fit.
     24         Quality_Level2,  // Cluster fit 4, threshold = 24.
     25         Quality_Level3,  // Cluster fit 4, threshold = 32.
     26         Quality_Level4,  // Cluster fit 4, threshold = 48.
     27         Quality_Level5,  // Cluster fit 4, threshold = 64.
     28         Quality_Level6,  // Cluster fit 4, threshold = 96.
     29         Quality_Level7,  // Cluster fit 4, threshold = 128.
     30         Quality_Level8,  // Cluster fit 4+3, threshold = 256.
     31         Quality_Level9,  // Cluster fit 4+3, threshold = 256 + Refinement.
     32 
     33         Quality_Fast = Quality_Level1,
     34         Quality_Default = Quality_Level8,
     35         Quality_Max = Quality_Level9,
     36     };
     37 
     38     void decode_bc1(const void * block, unsigned char rgba_block[16 * 4], Decoder decoder = Decoder_D3D10);
     39     void decode_bc3(const void * block, unsigned char rgba_block[16 * 4], Decoder decoder = Decoder_D3D10);
     40     //void decode_bc4(const void * block, bool snorm, unsigned char rgba_block[16 * 4], Decoder decoder = Decoder_D3D10);
     41     //void decode_bc5(const void * block, bool snorm, unsigned char rgba_block[16 * 4], Decoder decoder = Decoder_D3D10);
     42 
     43     float evaluate_bc1_error(const unsigned char rgba_block[16 * 4], const void * block, Decoder decoder = Decoder_D3D10);
     44     float evaluate_bc3_error(const unsigned char rgba_block[16 * 4], const void * block, bool alpha_blend, Decoder decoder = Decoder_D3D10);
     45     //float evaluate_bc4_error(const float rgba_block[16 * 4], const void * block, bool snorm, Decoder decoder = Decoder_D3D10);
     46     //float evaluate_bc5_error(const float rgba_block[16 * 4], const void * block, bool snorm, Decoder decoder = Decoder_D3D10);
     47 
     48     float compress_bc1(Quality level, const float * input_colors, const float * input_weights, const float color_weights[3], bool three_color_mode, bool three_color_black, void * output);
     49     float compress_bc3(Quality level, const float * input_colors, const float * input_weights, const float color_weights[3], bool six_alpha_mode, void * output);
     50     //float compress_bc4(Quality level, const float * input_colors, const float * input_weights, bool snorm, bool six_alpha_mode, void * output);
     51     //float compress_bc5(Quality level, const float * input_colors, const float * input_weights, bool snorm, bool six_alpha_mode, void * output);
     52 }
     53 
     54 #endif // ICBC_H
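
        // Editor's usage sketch for the API above (not part of the original header).
        // The [0,1] float range and the interleaved-RGBA layout of 'pixels' are
        // assumptions inferred from the declarations; 'pixels' and 'weights' are
        // hypothetical caller-side names.
        //
        //     icbc::init(icbc::Decoder_D3D10);
        //
        //     float pixels[16 * 4];       // One 4x4 block: 16 RGBA pixels, filled by the caller.
        //     float weights[16];          // Per-pixel weights; 1.0f everywhere for a plain fit.
        //     for (int i = 0; i < 16; i++) weights[i] = 1.0f;
        //
        //     const float color_weights[3] = { 1.0f, 1.0f, 1.0f };    // Uniform channel importance.
        //     unsigned char block[8];     // A BC1 block is 8 bytes.
        //     float error = icbc::compress_bc1(icbc::Quality_Default, pixels, weights, color_weights,
        //                                      /*three_color_mode=*/true, /*three_color_black=*/true, block);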
     55 
     56 #ifdef ICBC_IMPLEMENTATION
     57 
     58 // Instruction-level support must be chosen at compile time by setting ICBC_SIMD to one of these values:
     59 #define ICBC_SCALAR 0
     60 #define ICBC_SSE2   1
     61 #define ICBC_SSE41  2
     62 #define ICBC_AVX1   3
     63 #define ICBC_AVX2   4
     64 #define ICBC_AVX512 5
     65 #define ICBC_NEON   -1
     66 #define ICBC_VMX    -2
     67 
     68 #if defined(__i386__) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)
     69     #define ICBC_X86 1
     70 #endif
     71 
     72 #if defined(__x86_64__) || defined(_M_X64)
     73     #define ICBC_X64 1
     74 #endif
     75 
     76 #if defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) || defined(_M_ARM64)
     77     #define ICBC_ARM 1
     78 #endif
     79 
     80 #if defined(__aarch64__) || defined(_M_ARM64)
     81     #define ICBC_ARM64 1
     82 #endif
     83 
     84 #if (defined(__PPC__) || defined(_M_PPC))
     85     #define ICBC_PPC 1
     86 #endif
     87 
     88 // SIMD version.
     89 #ifndef ICBC_SIMD
     90     #if ICBC_X86
     91         #if __AVX512F__
     92             #define ICBC_SIMD ICBC_AVX512
     93         #elif __AVX2__
     94             #define ICBC_SIMD ICBC_AVX2
     95         #elif __AVX__
     96             #define ICBC_SIMD ICBC_AVX1
     97         #elif __SSE4_1__
     98             #define ICBC_SIMD ICBC_SSE41
     99         #elif __SSE2__ || ICBC_X64
    100             #define ICBC_SIMD ICBC_SSE2
    101         #else
    102             #define ICBC_SIMD ICBC_SCALAR
    103         #endif
    104     #endif
    105 
    106     #if ICBC_ARM
    107         #if __ARM_NEON__
    108             #define ICBC_SIMD ICBC_NEON
    109         #else
    110             #define ICBC_SIMD ICBC_SCALAR
    111         #endif
    112     #endif
    113 
    114     #if ICBC_PPC
    115         #define ICBC_SIMD ICBC_VMX
    116     #endif
    117 #endif
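
        // A build can also pin the SIMD level instead of relying on the detection
        // above; a minimal sketch, using one of the ICBC_* values defined at the
        // top of this section:
        //
        //     #define ICBC_SIMD ICBC_SSE41
        //     #define ICBC_IMPLEMENTATION
        //     #include "icbc.h"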
    118 
    119 // AVX1 does not require FMA, and depending on whether it's Intel or AMD you may have FMA3 or FMA4. What a mess.
    120 #ifndef ICBC_USE_FMA
    121 //#define ICBC_USE_FMA 3
    122 //#define ICBC_USE_FMA 4
    123 #endif
    124 
    125 #if ICBC_SIMD >= ICBC_AVX2
    126 #define ICBC_BMI2 1
    127 #endif
    128 
    130 // Apparently rcp is not deterministic (different precision on Intel and AMD). Enable this for a small performance boost if you don't care about that.
    130 //#define ICBC_USE_RCP 1
    131 
    132 #if ICBC_SIMD == ICBC_AVX2
    133 #define ICBC_USE_AVX2_PERMUTE2 1    // Using permutevar8x32 and bitops.
    134 #endif
    135 
    136 #if ICBC_SIMD == ICBC_AVX512
    137 #define ICBC_USE_AVX512_PERMUTE 1
    138 #endif
    139 
    140 #if ICBC_SIMD == ICBC_NEON
    141 #define ICBC_USE_NEON_VTL 1         // Enable this to cut encode time to about 0.89x of the baseline.
    142 #endif
    143 
    144 #if ICBC_SIMD == ICBC_SSE41
    145 #define ICBC_USE_SSSE3_SHUFFLEB 1   // Enable this to cut encode time to about 0.88x of the baseline.
    146 #endif
    147 
    148 
    149 // Some experimental knobs:
    150 #define ICBC_PERFECT_ROUND 0        // Enable perfect rounding to compute cluster fit residual.
    151 
    152 
    153 #if ICBC_SIMD >= ICBC_SSE2
    154 #include <emmintrin.h>
    155 #endif
    156 
    157 #if ICBC_SIMD >= ICBC_SSE41
    158 #include <smmintrin.h>
    159 #endif
    160 
    161 #if ICBC_SIMD >= ICBC_AVX1
    162 #include <immintrin.h>
    163 #endif
    164 
    165 #if ICBC_SIMD >= ICBC_AVX512 && _MSC_VER
    166 #include <zmmintrin.h>
    167 #endif
    168 
    169 #if ICBC_SIMD == ICBC_NEON
    170 #include <arm_neon.h>
    171 #endif
    172 
    173 #if ICBC_SIMD == ICBC_VMX
    174 #include <altivec.h>
    175 #endif
    176 
    177 #if _MSC_VER
    178 #include <intrin.h> // _BitScanForward
    179 #endif
    180 
    181 #include <stdint.h>
    182 #include <stdlib.h> // abs
    183 #include <string.h> // memset
    184 #include <math.h>   // fabsf
    185 #include <float.h>  // FLT_MAX
    186 
    187 #ifndef ICBC_ASSERT
    188 #if _DEBUG
    189 #define ICBC_ASSERT assert
    190 #include <assert.h>
    191 #else
    192 #define ICBC_ASSERT(x)
    193 #endif
    194 #endif
    195 
    196 namespace icbc {
    197 
    198 ///////////////////////////////////////////////////////////////////////////////////////////////////
    199 // Basic Templates
    200 
    201 template <typename T> inline void swap(T & a, T & b) {
    202     T temp(a);
    203     a = b;
    204     b = temp;
    205 }
    206 
    207 template <typename T> inline T max(const T & a, const T & b) {
    208     return (b < a) ? a : b;
    209 }
    210 
    211 template <typename T> inline T min(const T & a, const T & b) {
    212     return (a < b) ? a : b;
    213 }
    214 
    215 template <typename T> inline T clamp(const T & x, const T & a, const T & b) {
    216     return min(max(x, a), b);
    217 }
    218 
    219 template <typename T> inline T square(const T & a) {
    220     return a * a;
    221 }
    222 
    223 
    224 ///////////////////////////////////////////////////////////////////////////////////////////////////
    225 // Basic Types
    226 
    227 typedef uint8_t uint8;
    228 typedef int8_t int8;
    229 typedef uint16_t uint16;
    230 typedef uint32_t uint32;
    231 typedef uint32_t uint;
    232 
    233 
    234 struct Color16 {
    235     union {
    236         struct {
    237             uint16 b : 5;
    238             uint16 g : 6;
    239             uint16 r : 5;
    240         };
    241         uint16 u;
    242     };
    243 };
    244 
    245 struct Color32 {
    246     union {
    247         struct {
    248             uint8 b, g, r, a;
    249         };
    250         uint32 u;
    251     };
    252 };
    253 
    254 struct BlockBC1 {
    255     Color16 col0;
    256     Color16 col1;
    257     uint32 indices;
    258 };
    259 
    260 struct BlockBC4 {
    261     uint8 alpha0;
    262     uint8 alpha1;
    263     uint8 indices[6];
    264 };
    265 
    266 struct BlockBC3 {
    267     BlockBC4 alpha;
    268     BlockBC1 rgb;
    269 };
    270 
    271 struct BlockBC5 {
    272     BlockBC4 x;
    273     BlockBC1 y;
    274 };
    275 
    276 
    277 
    278 struct Vector3 {
    279     float x;
    280     float y;
    281     float z;
    282 
    283     inline void operator+=(Vector3 v) {
    284         x += v.x; y += v.y; z += v.z;
    285     }
    286     inline void operator*=(Vector3 v) {
    287         x *= v.x; y *= v.y; z *= v.z;
    288     }
    289     inline void operator*=(float s) {
    290         x *= s; y *= s; z *= s;
    291     }
    292 };
    293 
    294 struct Vector4 {
    295     union {
    296         struct {
    297             float x, y, z, w;
    298         };
    299         Vector3 xyz;
    300     };
    301 };
    302 
    303 
    304 inline Vector3 operator*(Vector3 v, float s) {
    305     return { v.x * s, v.y * s, v.z * s };
    306 }
    307 
    308 inline Vector3 operator*(float s, Vector3 v) {
    309     return { v.x * s, v.y * s, v.z * s };
    310 }
    311 
    312 inline Vector3 operator*(Vector3 a, Vector3 b) {
    313     return { a.x * b.x, a.y * b.y, a.z * b.z };
    314 }
    315 
    316 inline float dot(Vector3 a, Vector3 b) {
    317     return a.x * b.x + a.y * b.y + a.z * b.z;
    318 }
    319 
    320 inline Vector3 operator+(Vector3 a, Vector3 b) {
    321     return { a.x + b.x, a.y + b.y, a.z + b.z };
    322 }
    323 
    324 inline Vector3 operator-(Vector3 a, Vector3 b) {
    325     return { a.x - b.x, a.y - b.y, a.z - b.z };
    326 }
    327 
    328 inline Vector3 operator/(Vector3 v, float s) {
    329     return { v.x / s, v.y / s, v.z / s };
    330 }
    331 
    332 inline float saturate(float x) {
    333     return clamp(x, 0.0f, 1.0f);
    334 }
    335 
    336 inline Vector3 saturate(Vector3 v) {
    337     return { saturate(v.x), saturate(v.y), saturate(v.z) };
    338 }
    339 
    340 inline Vector3 min(Vector3 a, Vector3 b) {
    341     return { min(a.x, b.x), min(a.y, b.y), min(a.z, b.z) };
    342 }
    343 
    344 inline Vector3 max(Vector3 a, Vector3 b) {
    345     return { max(a.x, b.x), max(a.y, b.y), max(a.z, b.z) };
    346 }
    347 
    348 inline bool operator==(const Vector3 & a, const Vector3 & b) {
    349     return a.x == b.x && a.y == b.y && a.z == b.z;
    350 }
    351 
    352 inline Vector3 scalar_to_vector3(float f) {
    353     return {f, f, f};
    354 }
    355 
    356 inline float lengthSquared(Vector3 v) {
    357     return dot(v, v);
    358 }
    359 
    360 inline bool equal(float a, float b, float epsilon = 0.0001f) {
    361     // http://realtimecollisiondetection.net/blog/?p=89
    362     //return fabsf(a - b) < epsilon * max(1.0f, max(fabsf(a), fabsf(b)));
    363     return fabsf(a - b) < epsilon;
    364 }
    365 
    366 inline bool equal(Vector3 a, Vector3 b, float epsilon) {
    367     return equal(a.x, b.x, epsilon) && equal(a.y, b.y, epsilon) && equal(a.z, b.z, epsilon);
    368 }
    369 
    370 
    371 ///////////////////////////////////////////////////////////////////////////////////////////////////
    372 // SIMD
    373 
    374 #ifndef ICBC_ALIGN
    375 #if __GNUC__
    376 #   define ICBC_ALIGN __attribute__ ((__aligned__ (icbc::VEC_SIZE*4)))
    377 #else // _MSC_VER
    378 #   define ICBC_ALIGN __declspec(align(icbc::VEC_SIZE*4))
    379 #endif
    380 #endif
    381 
    382 #if __GNUC__
    383 #define ICBC_FORCEINLINE inline __attribute__((always_inline))
    384 #else
    385 #define ICBC_FORCEINLINE __forceinline
    386 #endif
    387 
    388 
    389 // Count trailing zeros (BSF).
    390 ICBC_FORCEINLINE int ctz(uint mask) {
    391 #if __GNUC__
    392     return __builtin_ctz(mask);
    393 #else
    394     unsigned long index;
    395     _BitScanForward(&index, mask);  // BSF: index of the lowest set bit, matching __builtin_ctz above.
    396     return (int)index;
    397 #endif
    398 }
    399 
    400 
    401 #if ICBC_SIMD == ICBC_SCALAR  // Purely scalar version.
    402 
    403 constexpr int VEC_SIZE = 1;
    404 
    405 using VFloat = float;
    406 using VMask = bool;
    407 
    408 ICBC_FORCEINLINE float & lane(VFloat & v, int i) { return v; }
    409 ICBC_FORCEINLINE VFloat vzero() { return 0.0f; }
    410 ICBC_FORCEINLINE VFloat vbroadcast(float x) { return x; }
    411 ICBC_FORCEINLINE VFloat vload(const float * ptr) { return *ptr; }
    412 ICBC_FORCEINLINE VFloat vrcp(VFloat a) { return 1.0f / a; }
    413 ICBC_FORCEINLINE VFloat vmadd(VFloat a, VFloat b, VFloat c) { return a * b + c; }
    414 ICBC_FORCEINLINE VFloat vmsub(VFloat a, VFloat b, VFloat c) { return a * b - c; }
    415 ICBC_FORCEINLINE VFloat vm2sub(VFloat a, VFloat b, VFloat c, VFloat d) { return a * b - c * d; }
    416 ICBC_FORCEINLINE VFloat vsaturate(VFloat a) { return min(max(a, 0.0f), 1.0f); }
    417 ICBC_FORCEINLINE VFloat vround01(VFloat a) { return float(int(a + 0.5f)); }
    418 ICBC_FORCEINLINE VFloat lane_id() { return 0; }
    419 ICBC_FORCEINLINE VFloat vselect(VMask mask, VFloat a, VFloat b) { return mask ? b : a; }
    420 ICBC_FORCEINLINE VMask vbroadcast(bool b) { return b; }
    421 ICBC_FORCEINLINE bool all(VMask m) { return m; }
    422 ICBC_FORCEINLINE bool any(VMask m) { return m; }
    423 ICBC_FORCEINLINE uint mask(VMask m) { return (uint)m; }
    424 ICBC_FORCEINLINE int reduce_min_index(VFloat v) { return 0; }
    425 ICBC_FORCEINLINE void vtranspose4(VFloat & a, VFloat & b, VFloat & c, VFloat & d) {}
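
        // These scalar definitions keep the rest of the file width-agnostic: code
        // written against VFloat/VMask/VEC_SIZE compiles unchanged against each of
        // the vector branches below. A sketch of the pattern (hypothetical helper,
        // not part of the library; assumes 'src' is suitably aligned and 'count' is
        // a multiple of VEC_SIZE):
        //
        //     inline void round01_array(float * dst, const float * src, int count) {
        //         for (int i = 0; i < count; i += VEC_SIZE) {
        //             VFloat v = vround01(vsaturate(vload(src + i)));
        //             for (int j = 0; j < VEC_SIZE; j++) dst[i + j] = lane(v, j);
        //         }
        //     }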
    426 
    427 #elif ICBC_SIMD == ICBC_SSE2 || ICBC_SIMD == ICBC_SSE41
    428 
    429 constexpr int VEC_SIZE = 4;
    430 
    431 #if __GNUC__
    432 // GCC needs a struct so that we can overload operators.
    433 union VFloat {
    434     __m128 v;
    435     float m128_f32[VEC_SIZE];
    436 
    437     VFloat() {}
    438     VFloat(__m128 v) : v(v) {}
    439     operator __m128 & () { return v; }
    440 };
    441 union VMask {
    442     __m128 m;
    443 
    444     VMask() {}
    445     VMask(__m128 m) : m(m) {}
    446     operator __m128 & () { return m; }
    447 };
    448 #else
    449 using VFloat = __m128;
    450 using VMask = __m128;
    451 #endif
    452 
    453 ICBC_FORCEINLINE float & lane(VFloat & v, int i) {
    454     return v.m128_f32[i];
    455 }
    456 
    457 ICBC_FORCEINLINE VFloat vzero() {
    458     return _mm_setzero_ps();
    459 }
    460 
    461 ICBC_FORCEINLINE VFloat vbroadcast(float x) {
    462     return _mm_set1_ps(x);
    463 }
    464 
    465 ICBC_FORCEINLINE VFloat vload(const float * ptr) {
    466     return _mm_load_ps(ptr);
    467 }
    468 
    469 ICBC_FORCEINLINE VFloat vgather(const float * base, VFloat index) {
    470     VFloat v;
    471     for (int i = 0; i < VEC_SIZE; i++) {
    472         lane(v, i) = base[int(lane(index, i))];
    473     }
    474     return v;
    475 }
    476 
    477 ICBC_FORCEINLINE VFloat operator+(VFloat a, VFloat b) {
    478     return _mm_add_ps(a, b);
    479 }
    480 
    481 ICBC_FORCEINLINE VFloat operator-(VFloat a, VFloat b) {
    482     return _mm_sub_ps(a, b);
    483 }
    484 
    485 ICBC_FORCEINLINE VFloat operator*(VFloat a, VFloat b) {
    486     return _mm_mul_ps(a, b);
    487 }
    488 
    489 ICBC_FORCEINLINE VFloat vrcp(VFloat a) {
    490 #if ICBC_USE_RCP
    491     VFloat r = _mm_rcp_ps(a);
    492     return _mm_mul_ps(r, _mm_sub_ps(vbroadcast(2.0f), _mm_mul_ps(r, a)));   // r * (2 - r * a)
    493 #else
    494     return _mm_div_ps(vbroadcast(1.0f), a);
    495 #endif
    496 }
    497 
    498 // a*b+c
    499 ICBC_FORCEINLINE VFloat vmadd(VFloat a, VFloat b, VFloat c) {
    500     return a * b + c;
    501 }
    502 
    503 ICBC_FORCEINLINE VFloat vmsub(VFloat a, VFloat b, VFloat c) {
    504     return a * b - c;
    505 }
    506 
    507 ICBC_FORCEINLINE VFloat vm2sub(VFloat a, VFloat b, VFloat c, VFloat d) {
    508     return a * b - c * d;
    509 }
    510 
    511 ICBC_FORCEINLINE VFloat vsaturate(VFloat a) {
    512     auto zero = _mm_setzero_ps();
    513     auto one = _mm_set1_ps(1.0f);
    514     return _mm_min_ps(_mm_max_ps(a, zero), one);
    515 }
    516 
    517 // Assumes a is in [0, 1] range.
    518 ICBC_FORCEINLINE VFloat vround01(VFloat a) {
    519 #if ICBC_SIMD == ICBC_SSE41
    520     return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
    521 #else
    522     return _mm_cvtepi32_ps(_mm_cvttps_epi32(a + vbroadcast(0.5f)));
    523 #endif
    524 }
    525 
    526 ICBC_FORCEINLINE VFloat vtruncate(VFloat a) {
    527 #if ICBC_SIMD == ICBC_SSE41
    528     return _mm_round_ps(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
    529 #else
    530     return _mm_cvtepi32_ps(_mm_cvttps_epi32(a));
    531 #endif
    532 }
    533 
    534 ICBC_FORCEINLINE VFloat lane_id() {
    535     return _mm_set_ps(3, 2, 1, 0);
    536 }
    537 
    538 ICBC_FORCEINLINE VMask operator> (VFloat A, VFloat B) { return _mm_cmpgt_ps(A, B); }
    539 ICBC_FORCEINLINE VMask operator>=(VFloat A, VFloat B) { return _mm_cmpge_ps(A, B); }
    540 ICBC_FORCEINLINE VMask operator< (VFloat A, VFloat B) { return _mm_cmplt_ps(A, B); }
    541 ICBC_FORCEINLINE VMask operator<=(VFloat A, VFloat B) { return _mm_cmple_ps(A, B); }
    542 
    543 ICBC_FORCEINLINE VMask operator| (VMask A, VMask B) { return _mm_or_ps(A, B); }
    544 ICBC_FORCEINLINE VMask operator& (VMask A, VMask B) { return _mm_and_ps(A, B); }
    545 ICBC_FORCEINLINE VMask operator^ (VMask A, VMask B) { return _mm_xor_ps(A, B); }
    546 
    547 // mask ? b : a
    548 ICBC_FORCEINLINE VFloat vselect(VMask mask, VFloat a, VFloat b) {
    549 #if ICBC_SIMD == ICBC_SSE41
    550     return _mm_blendv_ps(a, b, mask);
    551 #else
    552     return _mm_or_ps(_mm_andnot_ps(mask, a), _mm_and_ps(mask, b));
    553 #endif
    554 }
    555 
    556 ICBC_FORCEINLINE VMask vbroadcast(bool b) { 
    557     return _mm_castsi128_ps(_mm_set1_epi32(-int32_t(b)));
    558 }
    559 
    560 ICBC_FORCEINLINE bool all(VMask m) {
    561     int value = _mm_movemask_ps(m);
    562     return value == 0xF;    // _mm_movemask_ps produces 4 bits; all lanes set is 0xF.
    563 }
    564 
    565 ICBC_FORCEINLINE bool any(VMask m) {
    566     int value = _mm_movemask_ps(m);
    567     return value != 0;
    568 }
    569 
    570 ICBC_FORCEINLINE uint mask(VMask m) {
    571     return (uint)_mm_movemask_ps(m);
    572 }
    573 
    574 ICBC_FORCEINLINE int reduce_min_index(VFloat v) {
    575 
    576     // First do a horizontal reduction.                             // v = [ D C | B A ]
    577     VFloat shuf = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 3, 0, 1));    //     [ C D | A B ]
    578     VFloat mins = _mm_min_ps(v, shuf);                              // mins = [ min(C,D) min(C,D) | min(A,B) min(A,B) ]
    579     shuf        = _mm_movehl_ps(shuf, mins);                        //        [   C   D | min(C,D) min(C,D) ]  // let the compiler avoid a mov by reusing shuf
    580     mins        = _mm_min_ss(mins, shuf);                           // lane 0 = min(A, B, C, D)
    581     mins        = _mm_shuffle_ps(mins, mins, _MM_SHUFFLE(0, 0, 0, 0));
    582 
    583     // Then find the index.
    584     uint mask = _mm_movemask_ps(v <= mins);
    585     return ctz(mask);
    586 }
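
        // Worked example (editor's note): for v = [ 3 1 | 4 2 ], i.e. lane 0 = 2,
        // lane 1 = 4, lane 2 = 1, lane 3 = 3, the reduction broadcasts the minimum 1
        // into 'mins'; v <= mins is then true only in lane 2, the movemask is 0b0100,
        // and ctz returns index 2.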
    587 
    588 // https://gcc.gnu.org/legacy-ml/gcc-patches/2005-10/msg00324.html
    589 ICBC_FORCEINLINE void vtranspose4(VFloat & r0, VFloat & r1, VFloat & r2, VFloat & r3) {
    590     VFloat t0 = _mm_unpacklo_ps(r0, r1);
    591     VFloat t1 = _mm_unpacklo_ps(r2, r3);
    592     VFloat t2 = _mm_unpackhi_ps(r0, r1);
    593     VFloat t3 = _mm_unpackhi_ps(r2, r3);
    594     r0 = _mm_movelh_ps(t0, t1);
    595     r1 = _mm_movehl_ps(t1, t0);
    596     r2 = _mm_movelh_ps(t2, t3);
    597     r3 = _mm_movehl_ps(t3, t2);
    598 }
    599 
    600 
    601 #if ICBC_SIMD == ICBC_SSE41
    602 
    603 ICBC_FORCEINLINE VFloat vpermute2(VFloat tab0, VFloat tab1, __m128i idx) {
    604 
    605     // @@ Precompute this:
    606     tab1 = _mm_xor_ps(tab1, tab0);
    607 
    608     __m128i result = _mm_shuffle_epi8(_mm_castps_si128(tab0), idx);
    609     idx = _mm_sub_epi8(idx, _mm_set1_epi8(16));
    610 
    611     result = _mm_xor_si128(result, _mm_shuffle_epi8(_mm_castps_si128(tab1), idx));
    612 
    613     return _mm_castsi128_ps(result);
    614 }
    615 
    616 ICBC_FORCEINLINE VFloat vpermute4(VFloat tab0, VFloat tab1, VFloat tab2, VFloat tab3, __m128i idx) {
    617 
    618     // @@ Precompute this:
    619     tab3 = _mm_xor_ps(tab3, tab2);
    620     tab2 = _mm_xor_ps(tab2, tab1);
    621     tab1 = _mm_xor_ps(tab1, tab0);
    622 
    623     __m128i result = _mm_shuffle_epi8(_mm_castps_si128(tab0), idx);
    624     idx = _mm_sub_epi8(idx, _mm_set1_epi8(16));
    625 
    626     result = _mm_xor_si128(result, _mm_shuffle_epi8(_mm_castps_si128(tab1), idx));
    627     idx = _mm_sub_epi8(idx, _mm_set1_epi8(16));
    628 
    629     result = _mm_xor_si128(result, _mm_shuffle_epi8(_mm_castps_si128(tab2), idx));
    630     idx = _mm_sub_epi8(idx, _mm_set1_epi8(16));
    631 
    632     result = _mm_xor_si128(result, _mm_shuffle_epi8(_mm_castps_si128(tab3), idx));
    633 
    634     return _mm_castsi128_ps(result);
    635 }
    636 
    637 #endif
    638 
    639 
    640 #elif ICBC_SIMD == ICBC_AVX1 || ICBC_SIMD == ICBC_AVX2
    641 
    642 constexpr int VEC_SIZE = 8;
    643 
    644 #if __GNUC__
    645 union VFloat {
    646     __m256 v;
    647     float m256_f32[VEC_SIZE];
    648 
    649     VFloat() {}
    650     VFloat(__m256 v) : v(v) {}
    651     operator __m256 & () { return v; }
    652 };
    653 union VInt {
    654     __m256i v;
    655     int m256_i32[VEC_SIZE];
    656 
    657     VInt() {}
    658     VInt(__m256i v) : v(v) {}
    659     operator __m256i & () { return v; }
    660 };
    661 union VMask {
    662     __m256 m;
    663 
    664     VMask() {}
    665     VMask(__m256 m) : m(m) {}
    666     operator __m256 & () { return m; }
    667 };
    668 #else
    669 using VFloat = __m256;
    670 using VInt = __m256i;
    671 using VMask = __m256;   // Emulate mask vector using packed float.
    672 #endif
    673 
    674 ICBC_FORCEINLINE float & lane(VFloat & v, int i) {
    675     return v.m256_f32[i];
    676 }
    677 
    678 ICBC_FORCEINLINE VFloat vzero() {
    679     return _mm256_setzero_ps();
    680 }
    681 
    682 ICBC_FORCEINLINE VFloat vbroadcast(float a) {
    683     return _mm256_set1_ps(a);
    684 }
    685 
    686 ICBC_FORCEINLINE VFloat vload(const float * ptr) {
    687     return _mm256_load_ps(ptr);
    688 }
    689 
    690 ICBC_FORCEINLINE VFloat vgather(const float * base, VFloat index) {
    691 #if ICBC_SIMD == ICBC_AVX2
    692     return _mm256_i32gather_ps(base, _mm256_cvtps_epi32(index), 4);
    693 #else
    694     VFloat v;
    695     for (int i = 0; i < VEC_SIZE; i++) {
    696         lane(v, i) = base[int(lane(index, i))];
    697     }
    698     return v;
    699 #endif
    700 }
    701 
    702 ICBC_FORCEINLINE VFloat operator+(VFloat a, VFloat b) {
    703     return _mm256_add_ps(a, b);
    704 }
    705 
    706 ICBC_FORCEINLINE VFloat operator-(VFloat a, VFloat b) {
    707     return _mm256_sub_ps(a, b);
    708 }
    709 
    710 ICBC_FORCEINLINE VFloat operator*(VFloat a, VFloat b) {
    711     return _mm256_mul_ps(a, b);
    712 }
    713 
    714 ICBC_FORCEINLINE VFloat vrcp(VFloat a) {
    715 #if ICBC_USE_RCP
    716     #if ICBC_SIMD == ICBC_AVX512
    717         VFloat r = _mm256_rcp14_ps(a);
    718     #else
    719         VFloat r = _mm256_rcp_ps(a);
    720     #endif
    721 
    722     // r = r * (2 - r * a)
    723     #if ICBC_USE_FMA == 3 || ICBC_SIMD == ICBC_AVX2
    724         return _mm256_mul_ps(r, _mm256_fnmadd_ps(r, a, vbroadcast(2.0f)));
    725     #else
    726         return _mm256_mul_ps(r, _mm256_sub_ps(vbroadcast(2.0f), _mm256_mul_ps(r, a)));
    727     #endif
    728 #else
    729     return _mm256_div_ps(vbroadcast(1.0f), a);
    730 #endif
    731 }
    732 
    733 // a*b+c
    734 ICBC_FORCEINLINE VFloat vmadd(VFloat a, VFloat b, VFloat c) {
    735 #if ICBC_USE_FMA == 3 || ICBC_SIMD == ICBC_AVX2
    736     return _mm256_fmadd_ps(a, b, c);
    737 #elif ICBC_USE_FMA == 4
    738     return _mm256_macc_ps(a, b, c);
    739 #else
    740     return ((a * b) + c);
    741 #endif
    742 }
    743 
    744 ICBC_FORCEINLINE VFloat vmsub(VFloat a, VFloat b, VFloat c) {
    745 #if ICBC_USE_FMA == 3 || ICBC_SIMD == ICBC_AVX2
    746     return _mm256_fmsub_ps(a, b, c);
    747 #elif ICBC_USE_FMA == 4
    748     return _mm256_msub_ps(a, b, c);
    749 #else
    750     return ((a * b) - c);
    751 #endif
    752 }
    753 
    754 ICBC_FORCEINLINE VFloat vm2sub(VFloat a, VFloat b, VFloat c, VFloat d) {
    755     return vmsub(a, b, c * d);
    756 }
    757 
    758 ICBC_FORCEINLINE VFloat vsaturate(VFloat a) {
    759     __m256 zero = _mm256_setzero_ps();
    760     __m256 one = _mm256_set1_ps(1.0f);
    761     return _mm256_min_ps(_mm256_max_ps(a, zero), one);
    762 }
    763 
    764 ICBC_FORCEINLINE VFloat vround01(VFloat a) {
    765     return _mm256_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
    766 }
    767 
    768 ICBC_FORCEINLINE VFloat vtruncate(VFloat a) {
    769     return _mm256_round_ps(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
    770 }
    771 
    772 ICBC_FORCEINLINE VFloat lane_id() {
    773     return _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0);
    774 }
    775 
    776 ICBC_FORCEINLINE VMask operator> (VFloat A, VFloat B) { return _mm256_cmp_ps(A, B, _CMP_GT_OQ); }
    777 ICBC_FORCEINLINE VMask operator>=(VFloat A, VFloat B) { return _mm256_cmp_ps(A, B, _CMP_GE_OQ); }
    778 ICBC_FORCEINLINE VMask operator< (VFloat A, VFloat B) { return _mm256_cmp_ps(A, B, _CMP_LT_OQ); }
    779 ICBC_FORCEINLINE VMask operator<=(VFloat A, VFloat B) { return _mm256_cmp_ps(A, B, _CMP_LE_OQ); }
    780 
    781 ICBC_FORCEINLINE VMask operator| (VMask A, VMask B) { return _mm256_or_ps(A, B); }
    782 ICBC_FORCEINLINE VMask operator& (VMask A, VMask B) { return _mm256_and_ps(A, B); }
    783 ICBC_FORCEINLINE VMask operator^ (VMask A, VMask B) { return _mm256_xor_ps(A, B); }
    784 
    785 // mask ? b : a
    786 ICBC_FORCEINLINE VFloat vselect(VMask mask, VFloat a, VFloat b) {
    787     return _mm256_blendv_ps(a, b, mask);
    788 }
    789 
    790 ICBC_FORCEINLINE VMask vbroadcast(bool b) { 
    791     return _mm256_castsi256_ps(_mm256_set1_epi32(-int32_t(b)));
    792 }
    793 
    794 ICBC_FORCEINLINE bool all(VMask m) {
    795     __m256 ones = _mm256_cmp_ps(_mm256_setzero_ps(), _mm256_setzero_ps(), _CMP_EQ_UQ);
    796     return _mm256_testc_ps(m, ones) != 0;   // CF = ((NOT m) AND ones) == 0, i.e. every sign bit of m is set.
    797 }
    798 
    799 ICBC_FORCEINLINE bool any(VMask m) {
    800     return _mm256_testz_ps(m, m) == 0;
    801 }
    802 
    803 ICBC_FORCEINLINE uint mask(VMask m) {
    804     return (uint)_mm256_movemask_ps(m);
    805 }
    806 
    807 // This is missing on some GCC versions.
    808 #if !defined _mm256_set_m128
    809 #define _mm256_set_m128(hi, lo) _mm256_insertf128_ps(_mm256_castps128_ps256(lo), (hi), 0x1)
    810 #endif
    811 
    812 ICBC_FORCEINLINE int reduce_min_index(VFloat v) {
    813 
    814     __m128 vlow  = _mm256_castps256_ps128(v);
    815     __m128 vhigh = _mm256_extractf128_ps(v, 1);
    816            vlow  = _mm_min_ps(vlow, vhigh);
    817 
    818     // First do a horizontal reduction.                                 // v = [ D C | B A ]
    819     __m128 shuf = _mm_shuffle_ps(vlow, vlow, _MM_SHUFFLE(2, 3, 0, 1));  //     [ C D | A B ]
    820     __m128 mins = _mm_min_ps(vlow, shuf);                            // mins = [ min(C,D) min(C,D) | min(A,B) min(A,B) ]
    821     shuf        = _mm_movehl_ps(shuf, mins);                         //        [   C   D | min(C,D) min(C,D) ]
    822     mins        = _mm_min_ss(mins, shuf);
    823 
    824     VFloat vmin = _mm256_permute_ps(_mm256_set_m128(mins, mins), 0); // _MM256_PERMUTE(0, 0, 0, 0, 0, 0, 0, 0)
    825 
    826     // Then find the index.
    827     uint mask = _mm256_movemask_ps(v <= vmin);
    828     return ctz(mask);
    829 }
    830 
    831 // AoS to SoA
    832 ICBC_FORCEINLINE void vtranspose4(VFloat & a, VFloat & b, VFloat & c, VFloat & d) {
    833     VFloat r0 = _mm256_unpacklo_ps(a, b);
    834     VFloat r1 = _mm256_unpacklo_ps(c, d);
    835     VFloat r2 = _mm256_permute2f128_ps(r0, r1, 0x20);
    836     VFloat r3 = _mm256_permute2f128_ps(r0, r1, 0x31);
    837     r0 = _mm256_unpackhi_ps(a, b);
    838     r1 = _mm256_unpackhi_ps(c, d);
    839     a = _mm256_unpacklo_ps(r2, r3);
    840     b = _mm256_unpackhi_ps(r2, r3);
    841     r2 = _mm256_permute2f128_ps(r0, r1, 0x20);
    842     r3 = _mm256_permute2f128_ps(r0, r1, 0x31);
    843     c = _mm256_unpacklo_ps(r2, r3);
    844     d = _mm256_unpackhi_ps(r2, r3);
    845 }
    846 
    847 ICBC_FORCEINLINE VInt vzeroi() {
    848     return _mm256_setzero_si256();
    849 }
    850 
    851 ICBC_FORCEINLINE VInt vbroadcast(int a) {
    852     return _mm256_set1_epi32(a);
    853 }
    854 
    855 ICBC_FORCEINLINE VInt vload(const int * ptr) {
    856     return _mm256_load_si256((const __m256i*)ptr);
    857 }
    858 
    859 ICBC_FORCEINLINE VInt operator- (VInt A, int b) { return _mm256_sub_epi32(A, _mm256_set1_epi32(b)); }
    860 ICBC_FORCEINLINE VInt operator& (VInt A, int b) { return _mm256_and_si256(A, _mm256_set1_epi32(b)); }
    861 ICBC_FORCEINLINE VInt operator>> (VInt A, int b) { return _mm256_srli_epi32(A, b); }
    862 
    863 ICBC_FORCEINLINE VMask operator> (VInt A, int b) { return _mm256_castsi256_ps(_mm256_cmpgt_epi32(A, _mm256_set1_epi32(b))); }
    864 ICBC_FORCEINLINE VMask operator>= (VInt A, int b) { return _mm256_castsi256_ps(_mm256_cmpgt_epi32(A, _mm256_set1_epi32(b-1))); }
    865 ICBC_FORCEINLINE VMask operator== (VInt A, int b) { return _mm256_castsi256_ps(_mm256_cmpeq_epi32(A, _mm256_set1_epi32(b))); }
    866 
    867 // mask ? v[idx] : 0
    868 ICBC_FORCEINLINE VFloat vpermuteif(VMask mask, VFloat v, VInt idx) {
    869     return _mm256_and_ps(_mm256_permutevar8x32_ps(v, idx), mask);
    870 }
    871 
    872 // mask ? (idx > 7 ? vhi[idx & 7] : vlo[idx]) : 0
    873 ICBC_FORCEINLINE VFloat vpermute2if(VMask mask, VFloat vlo, VFloat vhi, VInt idx) {
    874 #if 0
    875     VMask mhi = idx > 7;
    876     vlo = _mm256_permutevar8x32_ps(vlo, idx);
    877     vhi = _mm256_permutevar8x32_ps(vhi, idx);
    878     VFloat v = _mm256_blendv_ps(vlo, vhi, mhi);
    879     return _mm256_and_ps(v, mask);
    880 #else
    881     // Fabian Giesen says not to mix _mm256_blendv_ps and _mm256_permutevar8x32_ps since they contend for the same gates and instead suggests the following:
    882     vhi = _mm256_xor_ps(vhi, vlo);
    883     VFloat v = _mm256_permutevar8x32_ps(vlo, idx);
    884     VMask mhi = idx > 7;
    885     v = _mm256_xor_ps(v, _mm256_and_ps(_mm256_permutevar8x32_ps(vhi, idx), mhi));
    886     return _mm256_and_ps(v, mask);
    887 #endif
    888 }
    889 
    890 
    891 #elif ICBC_SIMD == ICBC_AVX512
    892 
    893 constexpr int VEC_SIZE = 16;
    894 
    895 #if __GNUC__
    896 union VFloat {
    897     __m512 v;
    898     float m512_f32[VEC_SIZE];
    899 
    900     VFloat() {}
    901     VFloat(__m512 v) : v(v) {}
    902     operator __m512 & () { return v; }
    903 };
    904 union VInt {
    905     __m512i v;
    906     int m512i_i32[VEC_SIZE];
    907 
    908     VInt() {}
    909     VInt(__m512i v) : v(v) {}
    910     operator __m512i & () { return v; }
    911 };
    912 #else
    913 using VFloat = __m512;
    914 using VInt = __m512i;
    915 #endif
    916 struct VMask { __mmask16 m; };
    917 
    918 ICBC_FORCEINLINE float & lane(VFloat & v, int i) {
    919     return v.m512_f32[i];
    920 }
    921 
    922 ICBC_FORCEINLINE VFloat vzero() {
    923     return _mm512_setzero_ps();
    924 }
    925 
    926 ICBC_FORCEINLINE VFloat vbroadcast(float a) {
    927     return _mm512_set1_ps(a);
    928 }
    929 
    930 ICBC_FORCEINLINE VFloat vload(const float * ptr) {
    931     return _mm512_load_ps(ptr);
    932 }
    933 
    934 ICBC_FORCEINLINE VFloat vload(VMask mask, const float * ptr) {
    935     return _mm512_mask_load_ps(_mm512_undefined(), mask.m, ptr);
    936 }
    937 
    938 ICBC_FORCEINLINE VFloat vload(VMask mask, const float * ptr, float fallback) {
    939     return _mm512_mask_load_ps(_mm512_set1_ps(fallback), mask.m, ptr);
    940 }
    941 
    942 ICBC_FORCEINLINE VFloat vgather(const float * base, VFloat index) {
    943     return _mm512_i32gather_ps(_mm512_cvtps_epi32(index), base, 4);
    944 }
    945 
    946 ICBC_FORCEINLINE VFloat operator+(VFloat a, VFloat b) {
    947     return _mm512_add_ps(a, b);
    948 }
    949 
    950 ICBC_FORCEINLINE VFloat operator-(VFloat a, VFloat b) {
    951     return _mm512_sub_ps(a, b);
    952 }
    953 
    954 ICBC_FORCEINLINE VFloat operator*(VFloat a, VFloat b) {
    955     return _mm512_mul_ps(a, b);
    956 }
    957 
    958 ICBC_FORCEINLINE VFloat vrcp(VFloat a) {
    959 #if ICBC_USE_RCP
    960     VFloat r = _mm512_rcp14_ps(a);
    961 
    962     // r = r * (2 - r * a)
    963     return _mm512_mul_ps(r, _mm512_fnmadd_ps(r, a, vbroadcast(2.0f)));
    964 #else
    965     return _mm512_div_ps(vbroadcast(1.0f), a);
    966 #endif
    967 }
    968 
    969 // a*b+c
    970 ICBC_FORCEINLINE VFloat vmadd(VFloat a, VFloat b, VFloat c) {
    971     return _mm512_fmadd_ps(a, b, c);
    972 }
    973 
    974 ICBC_FORCEINLINE VFloat vmsub(VFloat a, VFloat b, VFloat c) {
    975     return _mm512_fmsub_ps(a, b, c);
    976 }
    977 
    978 ICBC_FORCEINLINE VFloat vm2sub(VFloat a, VFloat b, VFloat c, VFloat d) {
    979     return vmsub(a, b, c * d);
    980 }
    981 
    982 ICBC_FORCEINLINE VFloat vsaturate(VFloat a) {
    983     auto zero = _mm512_setzero_ps();
    984     auto one = _mm512_set1_ps(1.0f);
    985     return _mm512_min_ps(_mm512_max_ps(a, zero), one);
    986 }
    987 
    988 ICBC_FORCEINLINE VFloat vround01(VFloat a) {
    989     return _mm512_roundscale_ps(a, _MM_FROUND_TO_NEAREST_INT);
    990 }
    991 
    992 ICBC_FORCEINLINE VFloat vtruncate(VFloat a) {
    993     return _mm512_roundscale_ps(a, _MM_FROUND_TO_ZERO);
    994 }
    995 
    996 ICBC_FORCEINLINE VFloat lane_id() {
    997     return _mm512_set_ps(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    998 }
    999 
   1000 ICBC_FORCEINLINE VMask operator> (VFloat A, VFloat B) { return { _mm512_cmp_ps_mask(A, B, _CMP_GT_OQ) }; }
   1001 ICBC_FORCEINLINE VMask operator>=(VFloat A, VFloat B) { return { _mm512_cmp_ps_mask(A, B, _CMP_GE_OQ) }; }
   1002 ICBC_FORCEINLINE VMask operator< (VFloat A, VFloat B) { return { _mm512_cmp_ps_mask(A, B, _CMP_LT_OQ) }; }
   1003 ICBC_FORCEINLINE VMask operator<=(VFloat A, VFloat B) { return { _mm512_cmp_ps_mask(A, B, _CMP_LE_OQ) }; }
   1004 
   1005 ICBC_FORCEINLINE VMask operator! (VMask A) { return { _mm512_knot(A.m) }; }
   1006 ICBC_FORCEINLINE VMask operator| (VMask A, VMask B) { return { _mm512_kor(A.m, B.m) }; }
   1007 ICBC_FORCEINLINE VMask operator& (VMask A, VMask B) { return { _mm512_kand(A.m, B.m) }; }
   1008 ICBC_FORCEINLINE VMask operator^ (VMask A, VMask B) { return { _mm512_kxor(A.m, B.m) }; }
   1009 
   1010 // mask ? b : a
   1011 ICBC_FORCEINLINE VFloat vselect(VMask mask, VFloat a, VFloat b) {
   1012     return _mm512_mask_blend_ps(mask.m, a, b);
   1013 }
   1014 
   1015 ICBC_FORCEINLINE VMask vbroadcast(bool b) { 
   1016     return { __mmask16(-int16_t(b)) };
   1017 }
   1018 
   1019 ICBC_FORCEINLINE bool all(VMask mask) {
   1020     return mask.m == 0xFFFF;    // __mmask16 holds 16 lane bits; comparing against 0xFFFFFFFF could never match.
   1021 }
   1022 
   1023 ICBC_FORCEINLINE bool any(VMask mask) {
   1024     return mask.m != 0;
   1025 }
   1026 
   1027 ICBC_FORCEINLINE uint mask(VMask mask) {
   1028     return mask.m;
   1029 }
   1030 
   1031 ICBC_FORCEINLINE int reduce_min_index(VFloat v) {
   1032 
   1033     // First do a horizontal reduction.
   1034     VFloat vmin = vbroadcast(_mm512_reduce_min_ps(v));
   1035 
   1036     // Then find the index.
   1037     VMask mask = (v <= vmin);
   1038     return ctz(mask.m);
   1039 }
   1040 
   1041 //ICBC_FORCEINLINE void vtranspose4(VFloat & a, VFloat & b, VFloat & c, VFloat & d); // @@
   1042 
   1043 ICBC_FORCEINLINE int lane(VInt v, int i) {
   1044     //return _mm256_extract_epi32(v, i);
   1045     return v.m512i_i32[i];
   1046 }
   1047 
   1048 ICBC_FORCEINLINE VInt vzeroi() {
   1049     return _mm512_setzero_epi32();
   1050 }
   1051 
   1052 ICBC_FORCEINLINE VInt vbroadcast(int a) {
   1053     return _mm512_set1_epi32(a);
   1054 }
   1055 
   1056 ICBC_FORCEINLINE VInt vload(const int * ptr) {
   1057     return _mm512_load_epi32(ptr);
   1058 }
   1059 
   1060 ICBC_FORCEINLINE VInt operator- (VInt A, int b) { return _mm512_sub_epi32(A, vbroadcast(b)); }
   1061 ICBC_FORCEINLINE VInt operator& (VInt A, int b) { return _mm512_and_epi32(A, vbroadcast(b)); }
   1062 ICBC_FORCEINLINE VInt operator>> (VInt A, int b) { return _mm512_srli_epi32(A, b); }
   1063 
   1064 ICBC_FORCEINLINE VMask operator> (VInt A, int b) { return { _mm512_cmpgt_epi32_mask(A, vbroadcast(b)) }; }
   1065 ICBC_FORCEINLINE VMask operator>=(VInt A, int b) { return { _mm512_cmpge_epi32_mask(A, vbroadcast(b)) }; }
   1066 ICBC_FORCEINLINE VMask operator== (VInt A, int b) { return { _mm512_cmpeq_epi32_mask(A, vbroadcast(b)) }; }
   1067 
   1068 // mask ? v[idx] : 0
   1069 ICBC_FORCEINLINE VFloat vpermuteif(VMask mask, VFloat v, VInt idx) {
   1070     return _mm512_maskz_permutexvar_ps(mask.m, idx, v);
   1071 }
   1072 
   1073 
   1074 #elif ICBC_SIMD == ICBC_NEON
   1075 
   1076 constexpr int VEC_SIZE = 4;
   1077 
   1078 #if __GNUC__
   1079 union VFloat {
   1080     float32x4_t v;
   1081     float e[4];
   1082     VFloat() {}
   1083     VFloat(float32x4_t v) : v(v) {}
   1084     operator float32x4_t & () { return v; }
   1085 };
   1086 struct VMask {
   1087     uint32x4_t v;
   1088     VMask() {}
   1089     VMask(uint32x4_t v) : v(v) {}
   1090     operator uint32x4_t & () { return v; }
   1091 };
   1092 #else
   1093 using VFloat = float32x4_t;
   1094 using VMask = uint32x4_t;
   1095 #endif
   1096 
   1097 ICBC_FORCEINLINE float & lane(VFloat & v, int i) {
   1098 #if defined(__clang__)
   1099     return v.e[i];
   1100 #elif defined(_MSC_VER)
   1101     return v.n128_f32[i];
   1102 #else
   1103     return v.v[i];
   1104 #endif
   1105 }
   1106 
   1107 ICBC_FORCEINLINE VFloat vzero() {
   1108     return vdupq_n_f32(0.0f);
   1109 }
   1110 
   1111 ICBC_FORCEINLINE VFloat vbroadcast(float a) {
   1112     return vdupq_n_f32(a);
   1113 }
   1114 
   1115 ICBC_FORCEINLINE VFloat vload(const float * ptr) {
   1116     return vld1q_f32(ptr);
   1117 }
   1118 
   1119 ICBC_FORCEINLINE VFloat operator+(VFloat a, VFloat b) {
   1120     return vaddq_f32(a, b);
   1121 }
   1122 
   1123 ICBC_FORCEINLINE VFloat operator-(VFloat a, VFloat b) {
   1124     return vsubq_f32(a, b);
   1125 }
   1126 
   1127 ICBC_FORCEINLINE VFloat operator*(VFloat a, VFloat b) {
   1128     return vmulq_f32(a, b);
   1129 }
   1130 
   1131 ICBC_FORCEINLINE VFloat vrcp(VFloat a) {
   1132 #if ICBC_USE_RCP
   1133     return vrecpeq_f32(a);
   1134 #else
   1135     //VFloat rcp = vrecpeq_f32(a);
   1136     //return vmulq_f32(vrecpsq_f32(a, rcp), rcp);
   1137     return vdivq_f32(vbroadcast(1.0f), a);
   1138 #endif
   1139 }
   1140 
   1141 // a*b+c
   1142 ICBC_FORCEINLINE VFloat vmadd(VFloat a, VFloat b, VFloat c) {
   1143     return vmlaq_f32(c, a, b);
   1144 }
   1145 
   1146 ICBC_FORCEINLINE VFloat vmsub(VFloat a, VFloat b, VFloat c) {
   1147     return a * b - c;
   1148 }
   1149 
   1150 ICBC_FORCEINLINE VFloat vm2sub(VFloat a, VFloat b, VFloat c, VFloat d) {
   1151     // vmlsq_f32(a, b, c) == a - (b * c)
   1152     return vmlsq_f32(a * b, c, d);
   1153 }
   1154 
   1155 ICBC_FORCEINLINE VFloat vsaturate(VFloat a) {
   1156     return vminq_f32(vmaxq_f32(a, vzero()), vbroadcast(1));
   1157 }
   1158 
   1159 ICBC_FORCEINLINE VFloat vround01(VFloat a) {
   1160     return vrndnq_f32(a);   // Round to integral (to nearest, ties to even)
   1161     //return vcvtq_f32_s32(vcvtq_s32_f32(a + vbroadcast(0.5)));
   1162 }
   1163 
   1164 ICBC_FORCEINLINE VFloat vtruncate(VFloat a) {
   1165     return vrndq_f32(a);
   1166     //return vcvtq_f32_s32(vcvtq_s32_f32(a));
   1167 }
   1168 
   1169 ICBC_FORCEINLINE VFloat lane_id() {
   1170     ICBC_ALIGN float data[4] = { 0, 1, 2, 3 };
   1171     return vld1q_f32(data);
   1172 }
   1173 
   1174 ICBC_FORCEINLINE VMask operator> (VFloat A, VFloat B) { return { vcgtq_f32(A, B) }; }
   1175 ICBC_FORCEINLINE VMask operator>=(VFloat A, VFloat B) { return { vcgeq_f32(A, B) }; }
   1176 ICBC_FORCEINLINE VMask operator< (VFloat A, VFloat B) { return { vcltq_f32(A, B) }; }
   1177 ICBC_FORCEINLINE VMask operator<=(VFloat A, VFloat B) { return { vcleq_f32(A, B) }; }
   1178 
   1179 ICBC_FORCEINLINE VMask operator| (VMask A, VMask B) { return { vorrq_u32(A, B) }; }
   1180 ICBC_FORCEINLINE VMask operator& (VMask A, VMask B) { return { vandq_u32(A, B) }; }
   1181 ICBC_FORCEINLINE VMask operator^ (VMask A, VMask B) { return { veorq_u32(A, B) }; }
   1182 
   1183 // mask ? b : a
   1184 ICBC_FORCEINLINE VFloat vselect(VMask mask, VFloat a, VFloat b) {
   1185     return vbslq_f32(mask, b, a);
   1186 }
   1187 
   1188 ICBC_FORCEINLINE bool all(VMask mask) {
   1189     return vminvq_u32(mask) != 0;
   1190     // uint32x2_t m2 = vpmin_u32(vget_low_u32(mask), vget_high_u32(mask));
   1191     // uint32x2_t m1 = vpmin_u32(m2, m2);
   1192     // return vget_lane_u32(m1, 0) != 0;
   1193 }
   1194 
   1195 ICBC_FORCEINLINE bool any(VMask mask) {
   1196     return vmaxvq_u32(mask) != 0;
   1197     // uint32x2_t m2 = vpmax_u32(vget_low_u32(mask), vget_high_u32(mask));
   1198     // uint32x2_t m1 = vpmax_u32(m2, m2);
   1199     // return vget_lane_u32(m1, 0) != 0;
   1200 }
   1201 
   1202 ICBC_FORCEINLINE uint mask(VMask b) {
   1203     const uint32x4_t movemask = { 1, 2, 4, 8 };
   1204     return vaddvq_u32(vandq_u32(b, movemask));
   1205 }
   1206 
   1207 ICBC_FORCEINLINE int reduce_min_index(VFloat v) {
   1208     // float32x2_t m2 = vpmin_f32(vget_low_u32(v), vget_high_u32(v));
   1209     // float32x2_t m1 = vpmin_f32(m2, m2);
   1210     // VFloat vmin = vdupq_lane_f32(m1, 0);
   1211     VFloat vmin = vdupq_n_f32(vminvq_f32(v));
   1212     return ctz(mask(v <= vmin));
   1213 }
   1214 
   1215 // https://github.com/Maratyszcza/NNPACK/blob/master/src/neon/transpose.h
   1216 ICBC_FORCEINLINE void vtranspose4(VFloat & a, VFloat & b, VFloat & c, VFloat & d) {
   1217     // row0 = ( x00 x01 x02 x03 )
   1218     // row1 = ( x10 x11 x12 x13 )
   1219     // row2 = ( x20 x21 x22 x23 )
   1220     // row3 = ( x30 x31 x32 x33 )
   1221 
   1222     // row01 = ( x00 x10 x02 x12 ), ( x01 x11 x03, x13 )
   1223     // row23 = ( x20 x30 x22 x32 ), ( x21 x31 x23, x33 )
   1224     float32x4x2_t row01 = vtrnq_f32(a, b);
   1225     float32x4x2_t row23 = vtrnq_f32(c, d);
   1226 
   1227     // row0 = ( x00 x10 x20 x30 )
   1228     // row1 = ( x01 x11 x21 x31 )
   1229     // row2 = ( x02 x12 x22 x32 )
   1230     // row3 = ( x03 x13 x23 x33 )
   1231     a = vcombine_f32(vget_low_f32(row01.val[0]), vget_low_f32(row23.val[0]));
   1232     b = vcombine_f32(vget_low_f32(row01.val[1]), vget_low_f32(row23.val[1]));
   1233     c = vcombine_f32(vget_high_f32(row01.val[0]), vget_high_f32(row23.val[0]));
   1234     d = vcombine_f32(vget_high_f32(row01.val[1]), vget_high_f32(row23.val[1]));
   1235 }
   1236 
   1237 #elif ICBC_SIMD == ICBC_VMX
   1238 
   1239 constexpr int VEC_SIZE = 4;
   1240 
   1241 union VFloat {
   1242     vector float v;
   1243     float e[4];
   1244     VFloat() {}
   1245     VFloat(vector float v) : v(v) {}
   1246     operator vector float & () { return v; }
   1247 };
   1248 struct VMask {
   1249     vector unsigned int v;
   1250     VMask() {}
   1251     VMask(vector unsigned int v) : v(v) {}
   1252     operator vector unsigned int & () { return v; }
   1253 };
   1254 
   1255 ICBC_FORCEINLINE float & lane(VFloat & v, int i) {
   1256     return v.e[i];
   1257 }
   1258 
   1259 ICBC_FORCEINLINE VFloat vzero() {
   1260     return vec_splats(0.0f);
   1261 }
   1262 
   1263 ICBC_FORCEINLINE VFloat vbroadcast(float a) {
   1264     return vec_splats(a);
   1265 }
   1266 
   1267 ICBC_FORCEINLINE VFloat vload(const float * ptr) {
   1268     return vec_ld(0, ptr);  // vec_ld takes a byte offset and an address.
   1269 }
   1270 
   1271 ICBC_FORCEINLINE VFloat operator+(VFloat a, VFloat b) {
   1272     return vec_add(a, b);
   1273 }
   1274 
   1275 ICBC_FORCEINLINE VFloat operator-(VFloat a, VFloat b) {
   1276     return vec_sub(a, b);
   1277 }
   1278 
   1279 ICBC_FORCEINLINE VFloat operator*(VFloat a, VFloat b) {
   1280     return vec_madd(a, b, vec_splats(-0.0f));
   1281 }
   1282 
   1283 ICBC_FORCEINLINE VFloat vrcp(VFloat a) {
   1284 #if ICBC_USE_RCP
   1285     // Get the reciprocal estimate.
   1286     vector float estimate = vec_re(a);
   1287 
   1288     // One round of Newton-Raphson refinement.
   1289     vector float diff = vec_nmsub(estimate, a, vec_splats(1.0f));
   1290     return vec_madd(diff, estimate, estimate);
   1291 #else
   1292     return vec_div(vec_splats(1.0f), a);
   1293 #endif
   1294 }
   1295 
   1296 // a*b+c
   1297 ICBC_FORCEINLINE VFloat vmadd(VFloat a, VFloat b, VFloat c) {
   1298     return vec_madd(a, b, c);
   1299 }
   1300 
   1301 ICBC_FORCEINLINE VFloat vmsub(VFloat a, VFloat b, VFloat c) {
   1302     return vec_msub(a, b, c);   // vec_msub(a, b, c) computes a*b - c.
   1303 }
   1304 
   1305 ICBC_FORCEINLINE VFloat vm2sub(VFloat a, VFloat b, VFloat c, VFloat d) {
   1306     return vmsub(a, b, c * d);
   1307 }
   1308 
   1309 ICBC_FORCEINLINE VFloat vsaturate(VFloat a) {
   1310     return vec_min(vec_max(a, vzero()), vbroadcast(1));
   1311 }
   1312 
   1313 ICBC_FORCEINLINE VFloat vround01(VFloat a) {
   1314     // @@ Assumes a is positive and ~small
   1315     return vec_trunc(a + vbroadcast(0.5));
   1316 }
   1317 
   1318 ICBC_FORCEINLINE VFloat vtruncate(VFloat a) {
   1319     return vec_trunc(a);
   1320 }
   1321 
   1322 ICBC_FORCEINLINE VFloat lane_id() {
   1323     return (VFloat){ 0, 1, 2, 3 };
   1324 }
   1325 
   1326 ICBC_FORCEINLINE VMask operator> (VFloat A, VFloat B) { return { vec_cmpgt(A, B) }; }
   1327 ICBC_FORCEINLINE VMask operator>=(VFloat A, VFloat B) { return { vec_cmpge(A, B) }; }
   1328 ICBC_FORCEINLINE VMask operator< (VFloat A, VFloat B) { return { vec_cmplt(A, B) }; }
   1329 ICBC_FORCEINLINE VMask operator<=(VFloat A, VFloat B) { return { vec_cmple(A, B) }; }
   1330 
   1331 ICBC_FORCEINLINE VMask operator| (VMask A, VMask B) { return { vec_or(A, B) }; }
   1332 ICBC_FORCEINLINE VMask operator& (VMask A, VMask B) { return { vec_and(A, B) }; }
   1333 ICBC_FORCEINLINE VMask operator^ (VMask A, VMask B) { return { vec_xor(A, B) }; }
   1334 
   1335 // mask ? b : a
   1336 ICBC_FORCEINLINE VFloat vselect(VMask mask, VFloat a, VFloat b) {
   1337     return vec_sel(a, b, mask);
   1338 }
   1339 
   1340 ICBC_FORCEINLINE int reduce_min_index(VFloat v) {
   1341 
   1342     //VFloat vmin =  //@@ Horizontal min?
   1343     //return vec_cmpeq_idx(v, vmin);
   1344 
   1345     // @@ Is there a better way to do this reduction?
   1346     int min_idx = 0;
   1347     float min_value = lane(v, 0);
   1348 
   1349     for (int i = 1; i < VEC_SIZE; i++) {
   1350         float value = lane(v, i);
   1351         if (value < min_value) {
   1352             min_value = value;
   1353             min_idx = i;
   1354         }
   1355     }
   1356 
   1357     return min_idx;
   1358 }
   1359 
   1360 ICBC_FORCEINLINE void vtranspose4(VFloat & a, VFloat & b, VFloat & c, VFloat & d) {
   1361     VFloat t1 = vec_mergeh(a, c);
   1362     VFloat t2 = vec_mergel(a, c);
   1363     VFloat t3 = vec_mergeh(b, d);
   1364     VFloat t4 = vec_mergel(b, d);
   1365     a = vec_mergeh(t1, t3);
   1366     b = vec_mergel(t1, t3);
   1367     c = vec_mergeh(t2, t4);
   1368     d = vec_mergel(t2, t4);
   1369 }
   1370 
   1371 #endif // ICBC_SIMD == *
   1372 
   1373 #if ICBC_SIMD != ICBC_SCALAR
   1374 ICBC_FORCEINLINE VFloat vmadd(VFloat a, float b, VFloat c) {
   1375     VFloat vb = vbroadcast(b);
   1376     return vmadd(a, vb, c);
   1377 }
   1378 #endif
   1379 
   1380 struct VVector3 {
   1381     VFloat x;
   1382     VFloat y;
   1383     VFloat z;
   1384 };
   1385 
   1386 ICBC_FORCEINLINE VVector3 vbroadcast(Vector3 v) {
   1387     VVector3 v8;
   1388 
   1389     v8.x = vbroadcast(v.x);
   1390     v8.y = vbroadcast(v.y);
   1391     v8.z = vbroadcast(v.z);
   1392     return v8;
   1393 }
   1394 
   1395 ICBC_FORCEINLINE VVector3 vbroadcast(float x, float y, float z) {
   1396     VVector3 v8;
   1397     v8.x = vbroadcast(x);
   1398     v8.y = vbroadcast(y);
   1399     v8.z = vbroadcast(z);
   1400     return v8;
   1401 }
   1402 
   1403 /*ICBC_FORCEINLINE VVector3 vload(const Vector3 * v) {
   1404     // @@ See this for a 8x3 transpose: https://software.intel.com/content/www/us/en/develop/articles/3d-vector-normalization-using-256-bit-intel-advanced-vector-extensions-intel-avx.html
   1405 }*/
   1406 
   1407 ICBC_FORCEINLINE VVector3 vload(const Vector4 * ptr) {
   1408 #if ICBC_SIMD == ICBC_AVX512
   1409 
   1410     // @@ AVX512 transpose not implemented.
   1411     __m512i vindex = _mm512_set_epi32(4 * 15, 4 * 14, 4 * 13, 4 * 12, 4 * 11, 4 * 10, 4 * 9, 4 * 8, 4 * 7, 4 * 6, 4 * 5, 4 * 4, 4 * 3, 4 * 2, 4 * 1, 0);
   1412 
   1413     VVector3 v;
   1414     v.x = _mm512_i32gather_ps(vindex, &ptr->x, 4);
   1415     v.y = _mm512_i32gather_ps(vindex, &ptr->y, 4);
   1416     v.z = _mm512_i32gather_ps(vindex, &ptr->z, 4);
   1417     return v;
   1418 
   1419 #else
   1420 
   1421     VVector3 v;
   1422     v.x = vload(&ptr->x + 0 * VEC_SIZE);
   1423     v.y = vload(&ptr->x + 1 * VEC_SIZE);
   1424     v.z = vload(&ptr->x + 2 * VEC_SIZE);
   1425     VFloat tmp = vload(&ptr->x + 3 * VEC_SIZE);
   1426 
   1427     vtranspose4(v.x, v.y, v.z, tmp);
   1428 
   1429     return v;
   1430 #endif
   1431 }
   1432 
   1433 
   1434 ICBC_FORCEINLINE VVector3 operator+(VVector3 a, VVector3 b) {
   1435     VVector3 v;
   1436     v.x = (a.x + b.x);
   1437     v.y = (a.y + b.y);
   1438     v.z = (a.z + b.z);
   1439     return v;
   1440 }
   1441 
   1442 ICBC_FORCEINLINE VVector3 operator-(VVector3 a, VVector3 b) {
   1443     VVector3 v8;
   1444     v8.x = (a.x - b.x);
   1445     v8.y = (a.y - b.y);
   1446     v8.z = (a.z - b.z);
   1447     return v8;
   1448 }
   1449 
   1450 ICBC_FORCEINLINE VVector3 operator*(VVector3 a, VVector3 b) {
   1451     VVector3 v8;
   1452     v8.x = (a.x * b.x);
   1453     v8.y = (a.y * b.y);
   1454     v8.z = (a.z * b.z);
   1455     return v8;
   1456 }
   1457 
   1458 ICBC_FORCEINLINE VVector3 operator*(VVector3 a, VFloat b) {
   1459     VVector3 v8;
   1460     v8.x = (a.x * b);
   1461     v8.y = (a.y * b);
   1462     v8.z = (a.z * b);
   1463     return v8;
   1464 }
   1465 
   1466 ICBC_FORCEINLINE VVector3 vmadd(VVector3 a, VVector3 b, VVector3 c) {
   1467     VVector3 v8;
   1468     v8.x = vmadd(a.x, b.x, c.x);
   1469     v8.y = vmadd(a.y, b.y, c.y);
   1470     v8.z = vmadd(a.z, b.z, c.z);
   1471     return v8;
   1472 }
   1473 
   1474 ICBC_FORCEINLINE VVector3 vmadd(VVector3 a, VFloat b, VVector3 c) {
   1475     VVector3 v8;
   1476     v8.x = vmadd(a.x, b, c.x);
   1477     v8.y = vmadd(a.y, b, c.y);
   1478     v8.z = vmadd(a.z, b, c.z);
   1479     return v8;
   1480 }
   1481 
   1482 #if ICBC_SIMD != ICBC_SCALAR
   1483 ICBC_FORCEINLINE VVector3 vmadd(VVector3 a, float b, VVector3 c) {
   1484     VVector3 v8;
   1485     VFloat vb = vbroadcast(b);
   1486     v8.x = vmadd(a.x, vb, c.x);
   1487     v8.y = vmadd(a.y, vb, c.y);
   1488     v8.z = vmadd(a.z, vb, c.z);
   1489     return v8;
   1490 }
   1491 #endif
   1492 
   1493 ICBC_FORCEINLINE VVector3 vmsub(VVector3 a, VFloat b, VFloat c) {
   1494     VVector3 v8;
   1495     v8.x = vmsub(a.x, b, c);
   1496     v8.y = vmsub(a.y, b, c);
   1497     v8.z = vmsub(a.z, b, c);
   1498     return v8;
   1499 }
   1500 
   1501 ICBC_FORCEINLINE VVector3 vmsub(VVector3 a, VFloat b, VVector3 c) {
   1502     VVector3 v8;
   1503     v8.x = vmsub(a.x, b, c.x);
   1504     v8.y = vmsub(a.y, b, c.y);
   1505     v8.z = vmsub(a.z, b, c.z);
   1506     return v8;
   1507 }
   1508 
   1509 ICBC_FORCEINLINE VVector3 vm2sub(VVector3 a, VFloat b, VVector3 c, VFloat d) {
   1510     VVector3 v;
   1511     v.x = vm2sub(a.x, b, c.x, d);
   1512     v.y = vm2sub(a.y, b, c.y, d);
   1513     v.z = vm2sub(a.z, b, c.z, d);
   1514     return v;
   1515 }
   1516 
   1517 ICBC_FORCEINLINE VVector3 vm2sub(VVector3 a, VVector3 b, VVector3 c, VFloat d) {
   1518     VVector3 v;
   1519     v.x = vm2sub(a.x, b.x, c.x, d);
   1520     v.y = vm2sub(a.y, b.y, c.y, d);
   1521     v.z = vm2sub(a.z, b.z, c.z, d);
   1522     return v;
   1523 }
   1524 
   1525 ICBC_FORCEINLINE VVector3 vm2sub(VVector3 a, VVector3 b, VVector3 c, VVector3 d) {
   1526     VVector3 v;
   1527     v.x = vm2sub(a.x, b.x, c.x, d.x);
   1528     v.y = vm2sub(a.y, b.y, c.y, d.y);
   1529     v.z = vm2sub(a.z, b.z, c.z, d.z);
   1530     return v;
   1531 }
   1532 
   1533 ICBC_FORCEINLINE VFloat vdot(VVector3 a, VVector3 b) {
   1534     VFloat r;
   1535     r = a.x * b.x + a.y * b.y + a.z * b.z;
   1536     return r;
   1537 }
   1538 
   1539 // Length squared.
   1540 ICBC_FORCEINLINE VFloat vlen2(VVector3 v) {
   1541     return vdot(v, v);
   1542 }
   1543 
   1544 
   1545 // mask ? b : a
   1546 ICBC_FORCEINLINE VVector3 vselect(VMask mask, VVector3 a, VVector3 b) {
   1547     VVector3 r;
   1548     r.x = vselect(mask, a.x, b.x);
   1549     r.y = vselect(mask, a.y, b.y);
   1550     r.z = vselect(mask, a.z, b.z);
   1551     return r;
   1552 }
   1553 
   1554 
   1555 
   1556 
   1557 ///////////////////////////////////////////////////////////////////////////////////////////////////
   1558 // Color conversion functions.
   1559 
   1560 static const float midpoints5[32] = {
   1561     0.015686f, 0.047059f, 0.078431f, 0.111765f, 0.145098f, 0.176471f, 0.207843f, 0.241176f, 0.274510f, 0.305882f, 0.337255f, 0.370588f, 0.403922f, 0.435294f, 0.466667f, 0.5f,
   1562     0.533333f, 0.564706f, 0.596078f, 0.629412f, 0.662745f, 0.694118f, 0.725490f, 0.758824f, 0.792157f, 0.823529f, 0.854902f, 0.888235f, 0.921569f, 0.952941f, 0.984314f, FLT_MAX
   1563 };
   1564 
   1565 static const float midpoints6[64] = {
   1566     0.007843f, 0.023529f, 0.039216f, 0.054902f, 0.070588f, 0.086275f, 0.101961f, 0.117647f, 0.133333f, 0.149020f, 0.164706f, 0.180392f, 0.196078f, 0.211765f, 0.227451f, 0.245098f,
   1567     0.262745f, 0.278431f, 0.294118f, 0.309804f, 0.325490f, 0.341176f, 0.356863f, 0.372549f, 0.388235f, 0.403922f, 0.419608f, 0.435294f, 0.450980f, 0.466667f, 0.482353f, 0.500000f,
   1568     0.517647f, 0.533333f, 0.549020f, 0.564706f, 0.580392f, 0.596078f, 0.611765f, 0.627451f, 0.643137f, 0.658824f, 0.674510f, 0.690196f, 0.705882f, 0.721569f, 0.737255f, 0.754902f,
   1569     0.772549f, 0.788235f, 0.803922f, 0.819608f, 0.835294f, 0.850980f, 0.866667f, 0.882353f, 0.898039f, 0.913725f, 0.929412f, 0.945098f, 0.960784f, 0.976471f, 0.992157f, FLT_MAX
   1570 };
   1571 
   1572 /*void init_tables() {
   1573     for (int i = 0; i < 31; i++) {
   1574         float f0 = float(((i+0) << 3) | ((i+0) >> 2)) / 255.0f;
   1575         float f1 = float(((i+1) << 3) | ((i+1) >> 2)) / 255.0f;
   1576         midpoints5[i] = (f0 + f1) * 0.5;
   1577     }
   1578     midpoints5[31] = FLT_MAX;
   1579 
   1580     for (int i = 0; i < 63; i++) {
   1581         float f0 = float(((i+0) << 2) | ((i+0) >> 4)) / 255.0f;
   1582         float f1 = float(((i+1) << 2) | ((i+1) >> 4)) / 255.0f;
   1583         midpoints6[i] = (f0 + f1) * 0.5;
   1584     }
   1585     midpoints6[63] = FLT_MAX;
   1586 }*/
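        // Worked example: for i = 0 the decoded 5-bit values are expand5(0) = 0 and
        // expand5(1) = (1 << 3) | (1 >> 2) = 8, so the midpoint is (0/255 + 8/255) / 2
        // = 0.015686, the first entry of midpoints5 above. Bumping the truncated
        // quantizer output q whenever x > midpoints[q] thus rounds with respect to the
        // decoded (bit-expanded) values rather than the ideal q/31 or q/63 grid.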
   1587 
   1588 ICBC_FORCEINLINE VFloat vround5(VFloat x) {
   1589     const VFloat rb_scale = vbroadcast(31.0f);
   1590     const VFloat rb_inv_scale = vbroadcast(1.0f / 31.0f);
   1591 
   1592 #if ICBC_PERFECT_ROUND
   1593     VFloat q = vtruncate(x * rb_scale);
   1594     VFloat mp = vgather(midpoints5, q);
   1595     //return (q + (vbroadcast(1.0f) & (x > mp))) * rb_inv_scale;
   1596     return (q + vselect(x > mp, vzero(), vbroadcast(1))) * rb_inv_scale;
   1597 #else
   1598     return vround01(x * rb_scale) * rb_inv_scale;
   1599 #endif
   1600 }
   1601 
   1602 ICBC_FORCEINLINE VFloat vround6(VFloat x) {
   1603     const VFloat g_scale = vbroadcast(63.0f);
   1604     const VFloat g_inv_scale = vbroadcast(1.0f / 63.0f);
   1605 
   1606 #if ICBC_PERFECT_ROUND
   1607     VFloat q = vtruncate(x * g_scale);
   1608     VFloat mp = vgather(midpoints6, q);
   1609     //return (q + (vbroadcast(1) & (x > mp))) * g_inv_scale;
   1610     return (q + vselect(x > mp, vzero(), vbroadcast(1))) * g_inv_scale;
   1611 #else
   1612     return vround01(x * g_scale) * g_inv_scale;
   1613 #endif
   1614 }
   1615 
   1616 ICBC_FORCEINLINE VVector3 vround_ept(VVector3 v) {
   1617     VVector3 r;
   1618     r.x = vround5(vsaturate(v.x));
   1619     r.y = vround6(vsaturate(v.y));
   1620     r.z = vround5(vsaturate(v.z));
   1621     return r;
   1622 }
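        // For illustration only: a scalar sketch of the ICBC_PERFECT_ROUND path above.
        // scalar_round5 is a hypothetical helper for exposition; the encoder itself only
        // uses the SIMD versions. midpoints5[31] == FLT_MAX keeps the index in range at x == 1.
        inline float scalar_round5(float x) {
            x = saturate(x);
            int q = int(x * 31.0f);            // truncate to the 5-bit grid
            if (x > midpoints5[q]) q += 1;     // round against the decoded midpoints
            return float(q) * (1.0f / 31.0f);
        }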
   1623 
   1624 static Color16 vector3_to_color16(const Vector3 & v) {
   1625 
   1626     // Truncate.
   1627     uint r = uint(clamp(v.x * 31.0f, 0.0f, 31.0f));
   1628     uint g = uint(clamp(v.y * 63.0f, 0.0f, 63.0f));
   1629     uint b = uint(clamp(v.z * 31.0f, 0.0f, 31.0f));
   1630 
   1631     // Round exactly according to 565 bit-expansion.
   1632     r += (v.x > midpoints5[r]);
   1633     g += (v.y > midpoints6[g]);
   1634     b += (v.z > midpoints5[b]);
   1635 
   1636     Color16 c;
   1637     c.u = uint16((r << 11) | (g << 5) | b);
   1638     return c;
   1639 }
   1640 
   1641 static Color32 bitexpand_color16_to_color32(Color16 c16) {
   1642     Color32 c32;
   1643     //c32.b = (c16.b << 3) | (c16.b >> 2);
   1644     //c32.g = (c16.g << 2) | (c16.g >> 4);
   1645     //c32.r = (c16.r << 3) | (c16.r >> 2);
   1646     //c32.a = 0xFF;
   1647 
   1648     c32.u = ((c16.u << 3) & 0xf8) | ((c16.u << 5) & 0xfc00) | ((c16.u << 8) & 0xf80000);
   1649     c32.u |= (c32.u >> 5) & 0x070007;
   1650     c32.u |= (c32.u >> 6) & 0x000300;
   1651     c32.u |= 0xff000000;
   1652 
   1653     return c32;
   1654 }
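        // Worked example: c16.u == 0x0841 encodes (r, g, b) = (1, 2, 1) in 5:6:5. Every
        // field bit-expands to 8 ((1 << 3) | (1 >> 2) = 8 and (2 << 2) | (2 >> 4) = 8),
        // so the packed expression yields c32.u == 0xFF080808, the same result as the
        // commented-out per-channel expansion.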
   1655 
   1656 inline Vector3 color_to_vector3(Color32 c) {
   1657     return { c.r / 255.0f, c.g / 255.0f, c.b / 255.0f };
   1658 }
   1659 
   1660 inline Color32 vector3_to_color32(Vector3 v) {
   1661     Color32 color;
   1662     color.r = uint8(saturate(v.x) * 255 + 0.5f);
   1663     color.g = uint8(saturate(v.y) * 255 + 0.5f);
   1664     color.b = uint8(saturate(v.z) * 255 + 0.5f);
   1665     color.a = 255;
   1666     return color;
   1667 }
   1668 
   1669 
   1670 ///////////////////////////////////////////////////////////////////////////////////////////////////
   1671 // Input block processing.
   1672 
   1673 inline bool is_black(Vector3 c) {
   1674     // This large threshold seems to improve compression. It does not force these texels to be black;
   1675     // it just causes them to be ignored during PCA.
   1676     // @@ We may want to adjust this based on the quality level, since this increases the number of blocks that try cluster-3 fitting.
   1677     //return c.x < midpoints5[0] && c.y < midpoints6[0] && c.z < midpoints5[0];
   1678     //return c.x < 1.0f / 32 && c.y < 1.0f / 32 && c.z < 1.0f / 32;
   1679     return c.x < 1.0f / 8 && c.y < 1.0f / 8 && c.z < 1.0f / 8;
   1680 }
   1681 
   1682 // Find similar colors and combine them together.
   1683 static int reduce_colors(const Vector4 * input_colors, const float * input_weights, int count, float threshold, Vector3 * colors, float * weights, bool * any_black)
   1684 {
   1685 #if 0
   1686     for (int i = 0; i < 16; i++) {
   1687         colors[i] = input_colors[i].xyz;
   1688         weights[i] = input_weights[i];
   1689     }
   1690     return 16;
   1691 #else
   1692     *any_black = false;
   1693 
   1694     int n = 0;
   1695     for (int i = 0; i < count; i++)
   1696     {
   1697         Vector3 ci = input_colors[i].xyz;
   1698         float wi = input_weights[i];
   1699 
   1700         if (wi > 0) {
   1701             // Find matching color.
   1702             int j;
   1703             for (j = 0; j < n; j++) {
   1704                 if (equal(colors[j], ci, threshold)) {
   1705                     colors[j] = (colors[j] * weights[j] + ci * wi) / (weights[j] + wi);
   1706                     weights[j] += wi;
   1707                     break;
   1708                 }
   1709             }
   1710 
   1711             // No match found. Add new color.
   1712             if (j == n) {
   1713                 colors[n] = ci;
   1714                 weights[n] = wi;
   1715                 n++;
   1716             }
   1717 
   1718             if (is_black(ci)) {
   1719                 *any_black = true;
   1720             }
   1721         }
   1722     }
   1723 
   1724     ICBC_ASSERT(n <= count);
   1725 
   1726     return n;
   1727 #endif
   1728 }
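        // For example, two texels whose colors match within `threshold` collapse into a
        // single entry holding their weight-blended average color and the summed weight,
        // so blocks with repeated colors hand the cluster-fit loops fewer than 16 entries.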
   1729 
   1730 static int skip_blacks(const Vector3 * input_colors, const float * input_weights, int count, Vector3 * colors, float * weights)
   1731 {
   1732     int n = 0;
   1733     for (int i = 0; i < count; i++)
   1734     {
   1735         Vector3 ci = input_colors[i];
   1736         float wi = input_weights[i];
   1737 
   1738         if (is_black(ci)) {
   1739             continue;
   1740         }
   1741 
   1742         colors[n] = ci;
   1743         weights[n] = wi;
   1744         n += 1;
   1745     }
   1746 
   1747     return n;
   1748 }
   1749 
   1750 
   1751 
   1752 ///////////////////////////////////////////////////////////////////////////////////////////////////
   1753 // PCA
   1754 
   1755 static Vector3 computeCentroid(int n, const Vector3 *__restrict points, const float *__restrict weights)
   1756 {
   1757     Vector3 centroid = { 0 };
   1758     float total = 0.0f;
   1759 
   1760     for (int i = 0; i < n; i++)
   1761     {
   1762         total += weights[i];
   1763         centroid += weights[i] * points[i];
   1764     }
   1765     centroid *= (1.0f / total);
   1766 
   1767     return centroid;
   1768 }
   1769 
   1770 static Vector3 computeCovariance(int n, const Vector3 *__restrict points, const float *__restrict weights, float *__restrict covariance)
   1771 {
   1772     // compute the centroid
   1773     Vector3 centroid = computeCentroid(n, points, weights);
   1774 
   1775     // compute covariance matrix
   1776     for (int i = 0; i < 6; i++)
   1777     {
   1778         covariance[i] = 0.0f;
   1779     }
   1780 
   1781     for (int i = 0; i < n; i++)
   1782     {
   1783         Vector3 a = (points[i] - centroid);    // @@ I think weight should be squared, but that seems to increase the error slightly.
   1784         Vector3 b = weights[i] * a;
   1785 
   1786         covariance[0] += a.x * b.x;
   1787         covariance[1] += a.x * b.y;
   1788         covariance[2] += a.x * b.z;
   1789         covariance[3] += a.y * b.y;
   1790         covariance[4] += a.y * b.z;
   1791         covariance[5] += a.z * b.z;
   1792     }
   1793 
   1794     return centroid;
   1795 }
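        // The six accumulators store the symmetric 3x3 covariance matrix:
        //
        //   | covariance[0]  covariance[1]  covariance[2] |
        //   | covariance[1]  covariance[3]  covariance[4] |
        //   | covariance[2]  covariance[4]  covariance[5] |
        //
        // which is how estimatePrincipalComponent below reconstructs its three rows.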
   1796 
   1797 // @@ We should be able to do something cheaper...
   1798 static Vector3 estimatePrincipalComponent(const float * __restrict matrix)
   1799 {
   1800     const Vector3 row0 = { matrix[0], matrix[1], matrix[2] };
   1801     const Vector3 row1 = { matrix[1], matrix[3], matrix[4] };
   1802     const Vector3 row2 = { matrix[2], matrix[4], matrix[5] };
   1803 
   1804     float r0 = lengthSquared(row0);
   1805     float r1 = lengthSquared(row1);
   1806     float r2 = lengthSquared(row2);
   1807 
   1808     if (r0 > r1 && r0 > r2) return row0;
   1809     if (r1 > r2) return row1;
   1810     return row2;
   1811 }
   1812 
   1813 static inline Vector3 firstEigenVector_PowerMethod(const float *__restrict matrix)
   1814 {
   1815     if (matrix[0] == 0 && matrix[3] == 0 && matrix[5] == 0)
   1816     {
   1817         return {0};
   1818     }
   1819 
   1820     Vector3 v = estimatePrincipalComponent(matrix);
   1821 
   1822     const int NUM = 6;
   1823     for (int i = 0; i < NUM; i++)
   1824     {
   1825         float x = v.x * matrix[0] + v.y * matrix[1] + v.z * matrix[2];
   1826         float y = v.x * matrix[1] + v.y * matrix[3] + v.z * matrix[4];
   1827         float z = v.x * matrix[2] + v.y * matrix[4] + v.z * matrix[5];
   1828 
   1829         float norm = max(max(x, y), z);
   1830 
   1831         v = { x, y, z };
   1832         v *= (1.0f / norm);
   1833     }
   1834 
   1835     return v;
   1836 }
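        // Power iteration: each step multiplies the iterate by the covariance matrix and
        // renormalizes, converging toward the eigenvector with the largest eigenvalue,
        // i.e. the direction of maximum variance. Normalizing by the largest component
        // rather than the length avoids a sqrt per step; six iterations appear to be
        // enough for these 16-point sets.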
   1837 
   1838 static Vector3 computePrincipalComponent_PowerMethod(int n, const Vector3 *__restrict points, const float *__restrict weights)
   1839 {
   1840     float matrix[6];
   1841     computeCovariance(n, points, weights, matrix);
   1842 
   1843     return firstEigenVector_PowerMethod(matrix);
   1844 }
   1845 
   1846 
   1847 ///////////////////////////////////////////////////////////////////////////////////////////////////
   1848 // SAT
   1849 
   1850 struct SummedAreaTable {
   1851     ICBC_ALIGN float r[16];
   1852     ICBC_ALIGN float g[16];
   1853     ICBC_ALIGN float b[16];
   1854     ICBC_ALIGN float w[16];
   1855 };
   1856 
   1857 int compute_sat(const Vector3 * colors, const float * weights, int count, SummedAreaTable * sat)
   1858 {
   1859     // I've tried using a lower quality approximation of the principal direction, but the best-fit line seems to produce the best results.
   1860     Vector3 principal = computePrincipalComponent_PowerMethod(count, colors, weights);
   1861 
   1862     // build the list of values
   1863     int order[16];
   1864     float dps[16];
   1865     for (int i = 0; i < count; ++i)
   1866     {
   1867         order[i] = i;
   1868         dps[i] = dot(colors[i], principal);
   1869     }
   1870 
   1871     // stable sort
   1872     for (int i = 0; i < count; ++i)
   1873     {
   1874         for (int j = i; j > 0 && dps[j] < dps[j - 1]; --j)
   1875         {
   1876             swap(dps[j], dps[j - 1]);
   1877             swap(order[j], order[j - 1]);
   1878         }
   1879     }
   1880 
   1881 #if 0
   1882     u8 byte_order[16*4];
   1883     for (int i = 0; i < count; ++i)
   1884     {
   1885     }
   1886     
   1887     // @@ Load colors and weights.
   1888     VFloat table_r[16/VEC_SIZE];
   1889     VFloat table_g[16/VEC_SIZE];
   1890     VFloat table_b[16/VEC_SIZE];
   1891     VFloat table_w[16/VEC_SIZE];
   1892 
   1893     VFloat sat_r[16/VEC_SIZE];
   1894     VFloat sat_g[16/VEC_SIZE];
   1895     VFloat sat_b[16/VEC_SIZE];
   1896     VFloat sat_w[16/VEC_SIZE];
   1897 
   1898     for (int i = 0; i < 16; i += VEC_SIZE) {
   1899         VFloat w = table_lookup(table_w, order);
   1900         sat_r[i] = table_lookup(table_r, order) * w;
   1901         sat_g[i] = table_lookup(table_g, order) * w;
   1902         sat_b[i] = table_lookup(table_b, order) * w;
   1903         sat_w[i] = w;
   1904     }
   1905 
   1906     // Somewhat brute force way to compute sum tables. Is it worth it?
   1907     for (int i = 0; i < 16; i += 1) {
   1908         float r = sat_r[i/VEC_SIZE][i%VEC_SIZE];
   1909 
   1910         if (i > lane_id)
   1911             sat_r[i] += sat_r[lane_id];
   1912     }
   1913 #endif
   1914 
   1915     float w = weights[order[0]];
   1916     sat->r[0] = colors[order[0]].x * w;
   1917     sat->g[0] = colors[order[0]].y * w;
   1918     sat->b[0] = colors[order[0]].z * w;
   1919     sat->w[0] = w;
   1920 
   1921     for (int i = 1; i < count; i++) {
   1922         w = weights[order[i]];
   1923         sat->r[i] = sat->r[i - 1] + colors[order[i]].x * w;
   1924         sat->g[i] = sat->g[i - 1] + colors[order[i]].y * w;
   1925         sat->b[i] = sat->b[i - 1] + colors[order[i]].z * w;
   1926         sat->w[i] = sat->w[i - 1] + w;
   1927     }
   1928 
   1929     for (int i = count; i < 16; i++) {
   1930         sat->r[i] = FLT_MAX;
   1931         sat->g[i] = FLT_MAX;
   1932         sat->b[i] = FLT_MAX;
   1933         sat->w[i] = FLT_MAX;
   1934     }
   1935 
   1936     return count;
   1937 }
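        // With the colors sorted along the principal direction, sat->r[i] holds the
        // weighted prefix sum r[order[0]]*w[0] + ... + r[order[i]]*w[i] (likewise g, b, w).
        // The cluster-fit loops below only read prefix sums at cluster boundaries, so each
        // candidate partition costs a few table lookups and subtractions (x1 - x0, w1 - w0,
        // ...) instead of a fresh 16-element reduction. Entries at and beyond `count` are
        // FLT_MAX so out-of-range lanes produce enormous errors and can never win.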
   1938 
   1939 
   1940 ///////////////////////////////////////////////////////////////////////////////////////////////////
   1941 // Cluster Fit
   1942 
   1943 struct Combinations {
   1944     uint8 c0, c1, c2, pad;
   1945 };
   1946 
   1947 static ICBC_ALIGN int s_fourClusterTotal[16];
   1948 static ICBC_ALIGN int s_threeClusterTotal[16];
   1949 static ICBC_ALIGN Combinations s_fourCluster[968 + 8];
   1950 static ICBC_ALIGN Combinations s_threeCluster[152 + 8];
   1951 
   1952 #if ICBC_USE_NEON_VTL || ICBC_USE_SSSE3_SHUFFLEB
   1953 static uint8 s_byte_vtl_index0_4[4 * 968];
   1954 static uint8 s_byte_vtl_index1_4[4 * 968];
   1955 static uint8 s_byte_vtl_index2_4[4 * 968];
   1956 
   1957 static uint8 s_byte_vtl_index0_3[4 * 152];
   1958 static uint8 s_byte_vtl_index1_3[4 * 152];
   1959 #endif
   1960 
   1961 static float w1_3, w2_3;
   1962 static float w1_9, w4_9, w2_9;
   1963 
   1964 
   1965 static void init_cluster_tables() {
   1966 
   1967     for (int t = 1, i = 0; t <= 16; t++) {
   1968         for (int c0 = 0; c0 <= t; c0++) {
   1969             for (int c1 = 0; c1 <= t - c0; c1++) {
   1970                 for (int c2 = 0; c2 <= t - c0 - c1; c2++) {
   1971 
   1972                     // Skip this cluster so that the total is a multiple of 8
   1973                     if (c0 == 0 && c1 == 0 && c2 == 0) continue;
   1974 
   1975                     bool found = false;
   1976                     if (t > 1) {
   1977                         for (int j = 0; j < s_fourClusterTotal[t-2]; j++) {
   1978                             if (s_fourCluster[j].c0 == c0 && s_fourCluster[j].c1 == c0+c1 && s_fourCluster[j].c2 == c0+c1+c2) {
   1979                                 found = true;
   1980                                 break;
   1981                             }
   1982                         }
   1983                     }
   1984 
   1985                     if (!found) {
   1986                         s_fourCluster[i].c0 = uint8(c0);
   1987                         s_fourCluster[i].c1 = uint8(c0+c1);
   1988                         s_fourCluster[i].c2 = uint8(c0+c1+c2);
   1989                         i++;
   1990                     }
   1991                 }
   1992             }
   1993         }
   1994 
   1995         s_fourClusterTotal[t - 1] = i;
   1996     }
   1997 
   1998     // Replicate last entry.
   1999     for (int i = 0; i < 8; i++) {
   2000         s_fourCluster[968 + i] = s_fourCluster[968-1];
   2001     }
   2002 
   2003     for (int t = 1, i = 0; t <= 16; t++) {
   2004         for (int c0 = 0; c0 <= t; c0++) {
   2005             for (int c1 = 0; c1 <= t - c0; c1++) {
   2006 
   2007                 // Skip this cluster so that the total is a multiple of 8
   2008                 if (c0 == 0 && c1 == 0) continue;
   2009 
   2010                 bool found = false;
   2011                 if (t > 1) {
   2012                     for (int j = 0; j < s_threeClusterTotal[t - 2]; j++) {
   2013                         if (s_threeCluster[j].c0 == c0 && s_threeCluster[j].c1 == c0+c1) {
   2014                             found = true;
   2015                             break;
   2016                         }
   2017                     }
   2018                 }
   2019 
   2020                 if (!found) {
   2021                     s_threeCluster[i].c0 = uint8(c0);
   2022                     s_threeCluster[i].c1 = uint8(c0 + c1);
   2023                     i++;
   2024                 }
   2025             }
   2026         }
   2027 
   2028         s_threeClusterTotal[t - 1] = i;
   2029     }
   2030 
   2031     // Replicate last entry.
   2032     for (int i = 0; i < 8; i++) {
   2033         s_threeCluster[152 + i] = s_threeCluster[152 - 1];
   2034     }
   2035 
   2036 #if ICBC_USE_NEON_VTL || ICBC_USE_SSSE3_SHUFFLEB
   2037     for (int i = 0; i < 968; i++) {
   2038         int c0 = (s_fourCluster[i].c0) - 1;
   2039         s_byte_vtl_index0_4[4 * i + 0] = c0 >= 0 ? uint8(c0 * 4 + 0) : 255;
   2040         s_byte_vtl_index0_4[4 * i + 1] = c0 >= 0 ? uint8(c0 * 4 + 1) : 255;
   2041         s_byte_vtl_index0_4[4 * i + 2] = c0 >= 0 ? uint8(c0 * 4 + 2) : 255;
   2042         s_byte_vtl_index0_4[4 * i + 3] = c0 >= 0 ? uint8(c0 * 4 + 3) : 255;
   2043 
   2044         int c1 = (s_fourCluster[i].c1) - 1;
   2045         s_byte_vtl_index1_4[4 * i + 0] = c1 >= 0 ? uint8(c1 * 4 + 0) : 255;
   2046         s_byte_vtl_index1_4[4 * i + 1] = c1 >= 0 ? uint8(c1 * 4 + 1) : 255;
   2047         s_byte_vtl_index1_4[4 * i + 2] = c1 >= 0 ? uint8(c1 * 4 + 2) : 255;
   2048         s_byte_vtl_index1_4[4 * i + 3] = c1 >= 0 ? uint8(c1 * 4 + 3) : 255;
   2049 
   2050         int c2 = (s_fourCluster[i].c2) - 1;
   2051         s_byte_vtl_index2_4[4 * i + 0] = c2 >= 0 ? uint8(c2 * 4 + 0) : 255;
   2052         s_byte_vtl_index2_4[4 * i + 1] = c2 >= 0 ? uint8(c2 * 4 + 1) : 255;
   2053         s_byte_vtl_index2_4[4 * i + 2] = c2 >= 0 ? uint8(c2 * 4 + 2) : 255;
   2054         s_byte_vtl_index2_4[4 * i + 3] = c2 >= 0 ? uint8(c2 * 4 + 3) : 255;
   2055     }
   2056 
   2057     for (int i = 0; i < 152; i++) {
   2058         int c0 = (s_threeCluster[i].c0) - 1;
   2059         s_byte_vtl_index0_3[4 * i + 0] = c0 >= 0 ? uint8(c0 * 4 + 0) : 255;
   2060         s_byte_vtl_index0_3[4 * i + 1] = c0 >= 0 ? uint8(c0 * 4 + 1) : 255;
   2061         s_byte_vtl_index0_3[4 * i + 2] = c0 >= 0 ? uint8(c0 * 4 + 2) : 255;
   2062         s_byte_vtl_index0_3[4 * i + 3] = c0 >= 0 ? uint8(c0 * 4 + 3) : 255;
   2063 
   2064         int c1 = (s_threeCluster[i].c1) - 1;
   2065         s_byte_vtl_index1_3[4 * i + 0] = c1 >= 0 ? uint8(c1 * 4 + 0) : 255;
   2066         s_byte_vtl_index1_3[4 * i + 1] = c1 >= 0 ? uint8(c1 * 4 + 1) : 255;
   2067         s_byte_vtl_index1_3[4 * i + 2] = c1 >= 0 ? uint8(c1 * 4 + 2) : 255;
   2068         s_byte_vtl_index1_3[4 * i + 3] = c1 >= 0 ? uint8(c1 * 4 + 3) : 255;
   2069     }
   2070 #endif
   2071 }
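        // For each total t = 1..16 the loops above enumerate every way of splitting t
        // ordered colors into up to four (or three) consecutive clusters, skipping
        // combinations already emitted for a smaller total. The enumeration produces
        // exactly 968 four-cluster and 152 three-cluster entries, matching the array
        // sizes; replicating the last entry 8 times lets the vectorized cluster_fit
        // loops read one full vector past total_order_count without going out of bounds.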
   2072 
   2073 
   2074 
   2075 static void cluster_fit_three(const SummedAreaTable & sat, int count, Vector3 metric_sqr, Vector3 * start, Vector3 * end)
   2076 {
   2077     const float r_sum = sat.r[count-1];
   2078     const float g_sum = sat.g[count-1];
   2079     const float b_sum = sat.b[count-1];
   2080     const float w_sum = sat.w[count-1];
   2081 
   2082     VFloat vbesterror = vbroadcast(FLT_MAX);
   2083     VVector3 vbeststart = { vzero(), vzero(), vzero() };
   2084     VVector3 vbestend = { vzero(), vzero(), vzero() };
   2085 
   2086     // check all possible clusters for this total order
   2087     const int total_order_count = s_threeClusterTotal[count - 1];
   2088 
   2089     for (int i = 0; i < total_order_count; i += VEC_SIZE)
   2090     {
   2091         VVector3 x0, x1;
   2092         VFloat w0, w1;
   2093 
   2094 #if ICBC_USE_AVX512_PERMUTE
   2095 
   2096         auto loadmask = lane_id() < vbroadcast(float(count));
   2097 
   2098         // Load sat in one register:
   2099         VFloat vrsat = vload(loadmask, sat.r, FLT_MAX);
   2100         VFloat vgsat = vload(loadmask, sat.g, FLT_MAX);
   2101         VFloat vbsat = vload(loadmask, sat.b, FLT_MAX);
   2102         VFloat vwsat = vload(loadmask, sat.w, FLT_MAX);
   2103 
   2104         // Load 4 uint8 per lane.
   2105         VInt packedClusterIndex = vload((int *)&s_threeCluster[i]);
   2106 
   2107         VInt c0 = (packedClusterIndex & 0xFF) - 1;
   2108         VInt c1 = (packedClusterIndex >> 8) - 1;
   2109 
   2110         x0.x = vpermuteif(c0 >= 0, vrsat, c0);
   2111         x0.y = vpermuteif(c0 >= 0, vgsat, c0);
   2112         x0.z = vpermuteif(c0 >= 0, vbsat, c0);
   2113         w0   = vpermuteif(c0 >= 0, vwsat, c0);
   2114 
   2115         x1.x = vpermuteif(c1 >= 0, vrsat, c1);
   2116         x1.y = vpermuteif(c1 >= 0, vgsat, c1);
   2117         x1.z = vpermuteif(c1 >= 0, vbsat, c1);
   2118         w1   = vpermuteif(c1 >= 0, vwsat, c1);
   2119 
   2120 #elif ICBC_USE_AVX2_PERMUTE2
   2121 
   2122         // Load 4 uint8 per lane. @@ Ideally I should pack this better and load only 2.
   2123         VInt packedClusterIndex = vload((int *)&s_threeCluster[i]);
   2124 
   2125         VInt c0 = (packedClusterIndex & 0xFF) - 1;
   2126         VInt c1 = (packedClusterIndex >> 8) - 1; // No need for & 0xFF
   2127 
   2128         if (count <= 8) {
   2129             // Load sat.r in one register:
   2130             VFloat rLo = vload(sat.r);
   2131             VFloat gLo = vload(sat.g);
   2132             VFloat bLo = vload(sat.b);
   2133             VFloat wLo = vload(sat.w);
   2134 
   2135             x0.x = vpermuteif(c0 >= 0, rLo, c0);
   2136             x0.y = vpermuteif(c0 >= 0, gLo, c0);
   2137             x0.z = vpermuteif(c0 >= 0, bLo, c0);
   2138             w0   = vpermuteif(c0 >= 0, wLo, c0);
   2139 
   2140             x1.x = vpermuteif(c1 >= 0, rLo, c1);
   2141             x1.y = vpermuteif(c1 >= 0, gLo, c1);
   2142             x1.z = vpermuteif(c1 >= 0, bLo, c1);
   2143             w1   = vpermuteif(c1 >= 0, wLo, c1);
   2144         }
   2145         else {
   2146             // Load sat.r in two registers:
   2147             VFloat rLo = vload(sat.r); VFloat rHi = vload(sat.r + 8);
   2148             VFloat gLo = vload(sat.g); VFloat gHi = vload(sat.g + 8);
   2149             VFloat bLo = vload(sat.b); VFloat bHi = vload(sat.b + 8);
   2150             VFloat wLo = vload(sat.w); VFloat wHi = vload(sat.w + 8);
   2151 
   2152             x0.x = vpermute2if(c0 >= 0, rLo, rHi, c0);
   2153             x0.y = vpermute2if(c0 >= 0, gLo, gHi, c0);
   2154             x0.z = vpermute2if(c0 >= 0, bLo, bHi, c0);
   2155             w0   = vpermute2if(c0 >= 0, wLo, wHi, c0);
   2156 
   2157             x1.x = vpermute2if(c1 >= 0, rLo, rHi, c1);
   2158             x1.y = vpermute2if(c1 >= 0, gLo, gHi, c1);
   2159             x1.z = vpermute2if(c1 >= 0, bLo, bHi, c1);
   2160             w1   = vpermute2if(c1 >= 0, wLo, wHi, c1);
   2161         }
   2162 
   2163 #elif ICBC_USE_SSSE3_SHUFFLEB
   2164 
   2165         if (count <= 8) {
   2166             VFloat rsat[2] = { vload(sat.r), vload(sat.r+4) };
   2167             VFloat gsat[2] = { vload(sat.g), vload(sat.g+4) };
   2168             VFloat bsat[2] = { vload(sat.b), vload(sat.b+4) };
   2169             VFloat wsat[2] = { vload(sat.w), vload(sat.w+4) };
   2170 
   2171             __m128i idx0 = (__m128i &)s_byte_vtl_index0_3[4*i];
   2172             x0.x = vpermute2(rsat[0], rsat[1], idx0);
   2173             x0.y = vpermute2(gsat[0], gsat[1], idx0);
   2174             x0.z = vpermute2(bsat[0], bsat[1], idx0);
   2175             w0   = vpermute2(wsat[0], wsat[1], idx0);
   2176 
   2177             __m128i idx1 = (__m128i &)s_byte_vtl_index1_3[4*i];
   2178             x1.x = vpermute2(rsat[0], rsat[1], idx1);
   2179             x1.y = vpermute2(gsat[0], gsat[1], idx1);
   2180             x1.z = vpermute2(bsat[0], bsat[1], idx1);
   2181             w1   = vpermute2(wsat[0], wsat[1], idx1);
   2182         }
   2183         else {
   2184             VFloat rsat[4] = { vload(sat.r), vload(sat.r+4), vload(sat.r+8), vload(sat.r+12) };
   2185             VFloat gsat[4] = { vload(sat.g), vload(sat.g+4), vload(sat.g+8), vload(sat.g+12) };
   2186             VFloat bsat[4] = { vload(sat.b), vload(sat.b+4), vload(sat.b+8), vload(sat.b+12) };
   2187             VFloat wsat[4] = { vload(sat.w), vload(sat.w+4), vload(sat.w+8), vload(sat.w+12) };
   2188 
   2189             __m128i idx0 = (__m128i &)s_byte_vtl_index0_3[4*i];
   2190             x0.x = vpermute4(rsat[0], rsat[1], rsat[2], rsat[3], idx0);
   2191             x0.y = vpermute4(gsat[0], gsat[1], gsat[2], gsat[3], idx0);
   2192             x0.z = vpermute4(bsat[0], bsat[1], bsat[2], bsat[3], idx0);
   2193             w0   = vpermute4(wsat[0], wsat[1], wsat[2], wsat[3], idx0);
   2194 
   2195             __m128i idx1 = (__m128i &)s_byte_vtl_index1_3[4*i];
   2196             x1.x = vpermute4(rsat[0], rsat[1], rsat[2], rsat[3], idx1);
   2197             x1.y = vpermute4(gsat[0], gsat[1], gsat[2], gsat[3], idx1);
   2198             x1.z = vpermute4(bsat[0], bsat[1], bsat[2], bsat[3], idx1);
   2199             w1   = vpermute4(wsat[0], wsat[1], wsat[2], wsat[3], idx1);
   2200         }
   2201 
   2202 #elif ICBC_USE_NEON_VTL
   2203 
   2204         uint8x16_t idx0 = (uint8x16_t &)s_byte_vtl_index0_3[4*i];
   2205         uint8x16_t idx1 = (uint8x16_t &)s_byte_vtl_index1_3[4*i];
   2206 
   2207         /*if (count <= 4) {
   2208             uint8x16_t rsat1 = (uint8x16_t&)sat.r;
   2209             uint8x16_t gsat1 = (uint8x16_t&)sat.g;
   2210             uint8x16_t bsat1 = (uint8x16_t&)sat.b;
   2211             uint8x16_t wsat1 = (uint8x16_t&)sat.w;
   2212 
   2213             x0.x = vreinterpretq_f32_u8(vqtbl1q_u8(rsat1, idx0));
   2214             x0.y = vreinterpretq_f32_u8(vqtbl1q_u8(gsat1, idx0));
   2215             x0.z = vreinterpretq_f32_u8(vqtbl1q_u8(bsat1, idx0));
   2216             w0   = vreinterpretq_f32_u8(vqtbl1q_u8(wsat1, idx0));
   2217 
   2218             x1.x = vreinterpretq_f32_u8(vqtbl1q_u8(rsat1, idx1));
   2219             x1.y = vreinterpretq_f32_u8(vqtbl1q_u8(gsat1, idx1));
   2220             x1.z = vreinterpretq_f32_u8(vqtbl1q_u8(bsat1, idx1));
   2221             w1   = vreinterpretq_f32_u8(vqtbl1q_u8(wsat1, idx1));
   2222         }
   2223         else*/
   2224         if (count <= 8) {
   2225             uint8x16x2_t rsat2 = (uint8x16x2_t&)sat.r;
   2226             uint8x16x2_t gsat2 = (uint8x16x2_t&)sat.g;
   2227             uint8x16x2_t bsat2 = (uint8x16x2_t&)sat.b;
   2228             uint8x16x2_t wsat2 = (uint8x16x2_t&)sat.w;
   2229 
   2230             x0.x = vreinterpretq_f32_u8(vqtbl2q_u8(rsat2, idx0));
   2231             x0.y = vreinterpretq_f32_u8(vqtbl2q_u8(gsat2, idx0));
   2232             x0.z = vreinterpretq_f32_u8(vqtbl2q_u8(bsat2, idx0));
   2233             w0   = vreinterpretq_f32_u8(vqtbl2q_u8(wsat2, idx0));
   2234 
   2235             x1.x = vreinterpretq_f32_u8(vqtbl2q_u8(rsat2, idx1));
   2236             x1.y = vreinterpretq_f32_u8(vqtbl2q_u8(gsat2, idx1));
   2237             x1.z = vreinterpretq_f32_u8(vqtbl2q_u8(bsat2, idx1));
   2238             w1   = vreinterpretq_f32_u8(vqtbl2q_u8(wsat2, idx1));
   2239         }
   2240         else if (count <= 12) {
   2241             uint8x16x3_t rsat3 = (uint8x16x3_t&)sat.r;
   2242             uint8x16x3_t gsat3 = (uint8x16x3_t&)sat.g;
   2243             uint8x16x3_t bsat3 = (uint8x16x3_t&)sat.b;
   2244             uint8x16x3_t wsat3 = (uint8x16x3_t&)sat.w;
   2245 
   2246             x0.x = vreinterpretq_f32_u8(vqtbl3q_u8(rsat3, idx0));
   2247             x0.y = vreinterpretq_f32_u8(vqtbl3q_u8(gsat3, idx0));
   2248             x0.z = vreinterpretq_f32_u8(vqtbl3q_u8(bsat3, idx0));
   2249             w0   = vreinterpretq_f32_u8(vqtbl3q_u8(wsat3, idx0));
   2250 
   2251             x1.x = vreinterpretq_f32_u8(vqtbl3q_u8(rsat3, idx1));
   2252             x1.y = vreinterpretq_f32_u8(vqtbl3q_u8(gsat3, idx1));
   2253             x1.z = vreinterpretq_f32_u8(vqtbl3q_u8(bsat3, idx1));
   2254             w1   = vreinterpretq_f32_u8(vqtbl3q_u8(wsat3, idx1));
   2255         }
   2256         else {
   2257             uint8x16x4_t rsat4 = (uint8x16x4_t&)sat.r;
   2258             uint8x16x4_t gsat4 = (uint8x16x4_t&)sat.g;
   2259             uint8x16x4_t bsat4 = (uint8x16x4_t&)sat.b;
   2260             uint8x16x4_t wsat4 = (uint8x16x4_t&)sat.w;
   2261 
   2262             x0.x = vreinterpretq_f32_u8(vqtbl4q_u8(rsat4, idx0));
   2263             x0.y = vreinterpretq_f32_u8(vqtbl4q_u8(gsat4, idx0));
   2264             x0.z = vreinterpretq_f32_u8(vqtbl4q_u8(bsat4, idx0));
   2265             w0   = vreinterpretq_f32_u8(vqtbl4q_u8(wsat4, idx0));
   2266 
   2267             x1.x = vreinterpretq_f32_u8(vqtbl4q_u8(rsat4, idx1));
   2268             x1.y = vreinterpretq_f32_u8(vqtbl4q_u8(gsat4, idx1));
   2269             x1.z = vreinterpretq_f32_u8(vqtbl4q_u8(bsat4, idx1));
   2270             w1   = vreinterpretq_f32_u8(vqtbl4q_u8(wsat4, idx1));
   2271         }
   2272 
   2273 #else
   2274 
   2275         // Scalar path
   2276         x0.x = vzero(); x0.y = vzero(); x0.z = vzero(); w0 = vzero();
   2277         x1.x = vzero(); x1.y = vzero(); x1.z = vzero(); w1 = vzero();
   2278 
   2279         for (int l = 0; l < VEC_SIZE; l++) {
   2280             int c0 = s_threeCluster[i + l].c0 - 1;
   2281             if (c0 >= 0) {
   2282                 lane(x0.x, l) = sat.r[c0];
   2283                 lane(x0.y, l) = sat.g[c0];
   2284                 lane(x0.z, l) = sat.b[c0];
   2285                 lane(w0, l) = sat.w[c0];
   2286             }
   2287 
   2288             int c1 = s_threeCluster[i + l].c1 - 1;
   2289             if (c1 >= 0) {
   2290                 lane(x1.x, l) = sat.r[c1];
   2291                 lane(x1.y, l) = sat.g[c1];
   2292                 lane(x1.z, l) = sat.b[c1];
   2293                 lane(w1, l) = sat.w[c1];
   2294             }
   2295         }
   2296 
   2297 #endif
   2298 
   2299         VFloat w2 = vbroadcast(w_sum) - w1;
   2300         x1 = x1 - x0;
   2301         w1 = w1 - w0;
   2302 
   2303         VFloat alphabeta_sum = w1 * vbroadcast(0.25f);
   2304         VFloat alpha2_sum = w0 + alphabeta_sum;
   2305         VFloat beta2_sum = w2 + alphabeta_sum;
   2306         VFloat factor = vrcp(vm2sub(alpha2_sum, beta2_sum, alphabeta_sum, alphabeta_sum));
   2307 
   2308         VVector3 alphax_sum = x0 + x1 * vbroadcast(0.5f);
   2309         VVector3 betax_sum = vbroadcast(r_sum, g_sum, b_sum) - alphax_sum;
   2310 
   2311         VVector3 a = vm2sub(alphax_sum, beta2_sum, betax_sum, alphabeta_sum) * factor;
   2312         VVector3 b = vm2sub(betax_sum, alpha2_sum, alphax_sum, alphabeta_sum) * factor;
   2313 
   2314         // snap to the grid
   2315         a = vround_ept(a);
   2316         b = vround_ept(b);
   2317 
   2318         // compute the error
   2319         VVector3 e2 = vm2sub(a, vmsub(b, alphabeta_sum, alphax_sum), b, betax_sum) * vbroadcast(2.0f);
   2320         VVector3 e1 = vmadd(a * a, alpha2_sum, vmadd(b * b, beta2_sum, e2));
   2321 
   2322         // apply the metric to the error term
   2323         VFloat error = vdot(e1, vbroadcast(metric_sqr));
   2324 
   2325 
   2326         // keep the solution if it wins
   2327         auto mask = (error < vbesterror);
   2328 
   2329         // I could mask the unused lanes here, but instead I set the invalid SAT entries to FLT_MAX.
   2330         //mask = (mask & (vbroadcast(total_order_count) >= tid8(i))); // This doesn't seem to help. Is it OK to consider elements out of bounds?
   2331 
   2332         vbesterror = vselect(mask, vbesterror, error);
   2333         vbeststart = vselect(mask, vbeststart, a);
   2334         vbestend = vselect(mask, vbestend, b);
   2335     }
   2336 
   2337     int bestindex = reduce_min_index(vbesterror);
   2338 
   2339     start->x = lane(vbeststart.x, bestindex);
   2340     start->y = lane(vbeststart.y, bestindex);
   2341     start->z = lane(vbeststart.z, bestindex);
   2342     end->x = lane(vbestend.x, bestindex);
   2343     end->y = lane(vbestend.y, bestindex);
   2344     end->z = lane(vbestend.z, bestindex);
   2345 }
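        // The body above is a weighted least-squares fit. In 3-color mode the palette
        // weights along the segment are alpha in {1, 1/2, 0} with beta = 1 - alpha, and
        // minimizing sum_i w_i * (a*alpha_i + b*beta_i - x_i)^2 yields the 2x2 normal
        // equations
        //
        //   | alpha2_sum     alphabeta_sum | |a|   | alphax_sum |
        //   | alphabeta_sum  beta2_sum     | |b| = | betax_sum  |
        //
        // `factor` is the inverse determinant and a/b the closed-form solution before
        // snapping to the 565 grid. The tracked error omits the constant sum of w_i*x_i^2,
        // which does not change which candidate wins.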
   2346 
   2347 
   2348 static void cluster_fit_four(const SummedAreaTable & sat, int count, Vector3 metric_sqr, Vector3 * start, Vector3 * end)
   2349 {
   2350     const float r_sum = sat.r[count-1];
   2351     const float g_sum = sat.g[count-1];
   2352     const float b_sum = sat.b[count-1];
   2353     const float w_sum = sat.w[count-1];
   2354 
   2355     VFloat vbesterror = vbroadcast(FLT_MAX);
   2356     VVector3 vbeststart = { vzero(), vzero(), vzero() };
   2357     VVector3 vbestend = { vzero(), vzero(), vzero() };
   2358 
   2359     // check all possible clusters for this total order
   2360     const int total_order_count = s_fourClusterTotal[count - 1];
   2361 
   2362     for (int i = 0; i < total_order_count; i += VEC_SIZE)
   2363     {
   2364         VVector3 x0, x1, x2;
   2365         VFloat w0, w1, w2;
   2366 
   2367         /*
   2368         // Another approach would be to load and broadcast one color at a time like I do in my old CUDA implementation.
   2369         uint akku = 0;
   2370 
   2371         // Compute alpha & beta for this permutation.
   2372         #pragma unroll
   2373         for (int i = 0; i < 16; i++)
   2374         {
   2375             const uint bits = permutation >> (2*i);
   2376 
   2377             alphax_sum += alphaTable4[bits & 3] * colors[i];
   2378             akku += prods4[bits & 3];
   2379         }
   2380 
   2381         float alpha2_sum = float(akku >> 16);
   2382         float beta2_sum = float((akku >> 8) & 0xff);
   2383         float alphabeta_sum = float(akku & 0xff);
   2384         float3 betax_sum = 9.0f * color_sum - alphax_sum;
   2385         */
   2386 
   2387 #if ICBC_USE_AVX512_PERMUTE
   2388 
   2389         auto loadmask = lane_id() < vbroadcast(float(count));
   2390 
   2391         // Load sat in one register:
   2392         VFloat vrsat = vload(loadmask, sat.r, FLT_MAX);
   2393         VFloat vgsat = vload(loadmask, sat.g, FLT_MAX);
   2394         VFloat vbsat = vload(loadmask, sat.b, FLT_MAX);
   2395         VFloat vwsat = vload(loadmask, sat.w, FLT_MAX);
   2396 
   2397         // Load 4 uint8 per lane.
   2398         VInt packedClusterIndex = vload((int *)&s_fourCluster[i]);
   2399 
   2400         VInt c0 = (packedClusterIndex & 0xFF) - 1;
   2401         VInt c1 = ((packedClusterIndex >> 8) & 0xFF) - 1;
   2402         VInt c2 = ((packedClusterIndex >> 16)) - 1; // No need for &
   2403 
   2404         x0.x = vpermuteif(c0 >= 0, vrsat, c0);
   2405         x0.y = vpermuteif(c0 >= 0, vgsat, c0);
   2406         x0.z = vpermuteif(c0 >= 0, vbsat, c0);
   2407         w0   = vpermuteif(c0 >= 0, vwsat, c0);
   2408 
   2409         x1.x = vpermuteif(c1 >= 0, vrsat, c1);
   2410         x1.y = vpermuteif(c1 >= 0, vgsat, c1);
   2411         x1.z = vpermuteif(c1 >= 0, vbsat, c1);
   2412         w1   = vpermuteif(c1 >= 0, vwsat, c1);
   2413 
   2414         x2.x = vpermuteif(c2 >= 0, vrsat, c2);
   2415         x2.y = vpermuteif(c2 >= 0, vgsat, c2);
   2416         x2.z = vpermuteif(c2 >= 0, vbsat, c2);
   2417         w2   = vpermuteif(c2 >= 0, vwsat, c2);
   2418 
   2419 #elif ICBC_USE_AVX2_PERMUTE2
   2420 
   2421         // Load 4 uint8 per lane.
   2422         VInt packedClusterIndex = vload((int *)&s_fourCluster[i]);
   2423 
   2424         VInt c0 = (packedClusterIndex & 0xFF) - 1;
   2425         VInt c1 = ((packedClusterIndex >> 8) & 0xFF) - 1;
   2426         VInt c2 = ((packedClusterIndex >> 16)) - 1; // No need for &
   2427 
   2428         if (count <= 8) {
   2429             // Load sat.r in one register:
   2430             VFloat rLo = vload(sat.r);
   2431             VFloat gLo = vload(sat.g);
   2432             VFloat bLo = vload(sat.b);
   2433             VFloat wLo = vload(sat.w);
   2434 
   2435             x0.x = vpermuteif(c0 >= 0, rLo, c0);
   2436             x0.y = vpermuteif(c0 >= 0, gLo, c0);
   2437             x0.z = vpermuteif(c0 >= 0, bLo, c0);
   2438             w0   = vpermuteif(c0 >= 0, wLo, c0);
   2439 
   2440             x1.x = vpermuteif(c1 >= 0, rLo, c1);
   2441             x1.y = vpermuteif(c1 >= 0, gLo, c1);
   2442             x1.z = vpermuteif(c1 >= 0, bLo, c1);
   2443             w1   = vpermuteif(c1 >= 0, wLo, c1);
   2444 
   2445             x2.x = vpermuteif(c2 >= 0, rLo, c2);
   2446             x2.y = vpermuteif(c2 >= 0, gLo, c2);
   2447             x2.z = vpermuteif(c2 >= 0, bLo, c2);
   2448             w2   = vpermuteif(c2 >= 0, wLo, c2);
   2449         }
   2450         else {
   2451             // Load sat.r in two registers:
   2452             VFloat rLo = vload(sat.r); VFloat rHi = vload(sat.r + 8);
   2453             VFloat gLo = vload(sat.g); VFloat gHi = vload(sat.g + 8);
   2454             VFloat bLo = vload(sat.b); VFloat bHi = vload(sat.b + 8);
   2455             VFloat wLo = vload(sat.w); VFloat wHi = vload(sat.w + 8);
   2456 
   2457             x0.x = vpermute2if(c0 >= 0, rLo, rHi, c0);
   2458             x0.y = vpermute2if(c0 >= 0, gLo, gHi, c0);
   2459             x0.z = vpermute2if(c0 >= 0, bLo, bHi, c0);
   2460             w0   = vpermute2if(c0 >= 0, wLo, wHi, c0);
   2461 
   2462             x1.x = vpermute2if(c1 >= 0, rLo, rHi, c1);
   2463             x1.y = vpermute2if(c1 >= 0, gLo, gHi, c1);
   2464             x1.z = vpermute2if(c1 >= 0, bLo, bHi, c1);
   2465             w1   = vpermute2if(c1 >= 0, wLo, wHi, c1);
   2466 
   2467             x2.x = vpermute2if(c2 >= 0, rLo, rHi, c2);
   2468             x2.y = vpermute2if(c2 >= 0, gLo, gHi, c2);
   2469             x2.z = vpermute2if(c2 >= 0, bLo, bHi, c2);
   2470             w2   = vpermute2if(c2 >= 0, wLo, wHi, c2);
   2471         }
   2472 
   2473 #elif ICBC_USE_SSSE3_SHUFFLEB
   2474 
   2475         if (count <= 8) {
   2476             VFloat rsat[2] = { vload(sat.r), vload(sat.r+4) };
   2477             VFloat gsat[2] = { vload(sat.g), vload(sat.g+4) };
   2478             VFloat bsat[2] = { vload(sat.b), vload(sat.b+4) };
   2479             VFloat wsat[2] = { vload(sat.w), vload(sat.w+4) };
   2480 
   2481             __m128i idx0 = (__m128i &)s_byte_vtl_index0_4[4*i];
   2482             x0.x = vpermute2(rsat[0], rsat[1], idx0);
   2483             x0.y = vpermute2(gsat[0], gsat[1], idx0);
   2484             x0.z = vpermute2(bsat[0], bsat[1], idx0);
   2485             w0   = vpermute2(wsat[0], wsat[1], idx0);
   2486 
   2487             __m128i idx1 = (__m128i &)s_byte_vtl_index1_4[4*i];
   2488             x1.x = vpermute2(rsat[0], rsat[1], idx1);
   2489             x1.y = vpermute2(gsat[0], gsat[1], idx1);
   2490             x1.z = vpermute2(bsat[0], bsat[1], idx1);
   2491             w1   = vpermute2(wsat[0], wsat[1], idx1);
   2492 
   2493             __m128i idx2 = (__m128i &)s_byte_vtl_index2_4[4*i];
   2494             x2.x = vpermute2(rsat[0], rsat[1], idx2);
   2495             x2.y = vpermute2(gsat[0], gsat[1], idx2);
   2496             x2.z = vpermute2(bsat[0], bsat[1], idx2);
   2497             w2   = vpermute2(wsat[0], wsat[1], idx2);
   2498         }
   2499         else {
   2500             VFloat rsat[4] = { vload(sat.r), vload(sat.r+4), vload(sat.r+8), vload(sat.r+12) };
   2501             VFloat gsat[4] = { vload(sat.g), vload(sat.g+4), vload(sat.g+8), vload(sat.g+12) };
   2502             VFloat bsat[4] = { vload(sat.b), vload(sat.b+4), vload(sat.b+8), vload(sat.b+12) };
   2503             VFloat wsat[4] = { vload(sat.w), vload(sat.w+4), vload(sat.w+8), vload(sat.w+12) };
   2504 
   2505             __m128i idx0 = (__m128i &)s_byte_vtl_index0_4[4*i];
   2506             x0.x = vpermute4(rsat[0], rsat[1], rsat[2], rsat[3], idx0);
   2507             x0.y = vpermute4(gsat[0], gsat[1], gsat[2], gsat[3], idx0);
   2508             x0.z = vpermute4(bsat[0], bsat[1], bsat[2], bsat[3], idx0);
   2509             w0   = vpermute4(wsat[0], wsat[1], wsat[2], wsat[3], idx0);
   2510 
   2511             __m128i idx1 = (__m128i &)s_byte_vtl_index1_4[4*i];
   2512             x1.x = vpermute4(rsat[0], rsat[1], rsat[2], rsat[3], idx1);
   2513             x1.y = vpermute4(gsat[0], gsat[1], gsat[2], gsat[3], idx1);
   2514             x1.z = vpermute4(bsat[0], bsat[1], bsat[2], bsat[3], idx1);
   2515             w1   = vpermute4(wsat[0], wsat[1], wsat[2], wsat[3], idx1);
   2516 
   2517             __m128i idx2 = (__m128i &)s_byte_vtl_index2_4[4*i];
   2518             x2.x = vpermute4(rsat[0], rsat[1], rsat[2], rsat[3], idx2);
   2519             x2.y = vpermute4(gsat[0], gsat[1], gsat[2], gsat[3], idx2);
   2520             x2.z = vpermute4(bsat[0], bsat[1], bsat[2], bsat[3], idx2);
   2521             w2   = vpermute4(wsat[0], wsat[1], wsat[2], wsat[3], idx2);
   2522         }
   2523 
   2524 #elif ICBC_USE_NEON_VTL
   2525 
   2526         /*
   2527         // c0 = { pq[12],pq[12],pq[12],pq[12], pq[8],pq[8],pq[8],pq[8], pq[4],pq[4],pq[4],pq[4], pq[0],pq[0],pq[0],pq[0] }
   2528         // idx0 = (c0 - 1) * 4 + byte_offset
   2529         // idx0 = c0 == 0 ? 255 : idx0
   2530 
   2531         // c1 = { pq[13],pq[13],pq[13],pq[13], pq[9],pq[9],pq[9],pq[9], pq[5],pq[5],pq[5],pq[5], pq[1],pq[1],pq[1],pq[1] }
   2532         // idx1 = (c1 - 1) * 4 + byte_offset
   2533         // idx1 = c1 == 0 ? 255 : idx1
   2534 
   2535         // c2 = { pq[14],pq[14],pq[14],pq[14], pq[10],pq[10],pq[10],pq[10], pq[6],pq[6],pq[6],pq[6], pq[2],pq[2],pq[2],pq[2] }
   2536         // idx2 = (c2 - 1) * 4 + byte_offset
   2537         // idx2 = c2 == 0 ? 255 : idx2
   2538 
   2539         uint8x16_t packedClusterIndex = (uint8x16_t &)s_fourCluster[i];
   2540 
   2541         const uint8x16_t byte_offsets = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
   2542         uint8x16_t indices = {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12};
   2543 
   2544         uint8x16_t c0 = vqtbl1q_u8(packedClusterIndex, indices);
   2545         uint8x16_t idx0 = vaddq_u8(vshlq_n_u8(vsubq_u8(c0, vdupq_n_u8(1)), 2), byte_offsets); // (c0 - 1) * 4 + byte_offsets
   2546         idx0 = vorrq_u8(idx0, vceqzq_u8(c0));  // idx0 = c0 == 0 ? 255 : idx0
   2547 
   2548         indices = vaddq_u8(indices, vdupq_n_u8(1));
   2549         uint8x16_t c1 = vqtbl1q_u8(packedClusterIndex, indices);
   2550         uint8x16_t idx1 = vaddq_u8(vshlq_n_u8(vsubq_u8(c1, vdupq_n_u8(1)), 2), byte_offsets); // (c1 - 1) * 4 + byte_offsets
   2551         idx1 = vorrq_u8(idx1, vceqzq_u8(c1));  // idx1 = c1 == 0 ? 255 : idx1
   2552 
   2553         indices = vaddq_u8(indices, vdupq_n_u8(1));
   2554         uint8x16_t c2 = vqtbl1q_u8(packedClusterIndex, indices);
   2555         uint8x16_t idx2 = vaddq_u8(vshlq_n_u8(vsubq_u8(c2, vdupq_n_u8(1)), 2), byte_offsets); // (c2 - 1) * 4 + byte_offsets
   2556         idx2 = vorrq_u8(idx2, vceqzq_u8(c2));  // idx2 = c2 == 0 ? 255 : idx2
   2557         */
   2558 
   2559         // Loading the precomputed byte indices is faster than computing them on the fly.
   2560         uint8x16_t idx0 = (uint8x16_t &)s_byte_vtl_index0_4[4*i];
   2561         uint8x16_t idx1 = (uint8x16_t &)s_byte_vtl_index1_4[4*i];
   2562         uint8x16_t idx2 = (uint8x16_t &)s_byte_vtl_index2_4[4*i];
   2563 
   2564         /*if (count <= 4) {
   2565             uint8x16_t rsat1 = (uint8x16_t&)sat.r;
   2566             uint8x16_t gsat1 = (uint8x16_t&)sat.g;
   2567             uint8x16_t bsat1 = (uint8x16_t&)sat.b;
   2568             uint8x16_t wsat1 = (uint8x16_t&)sat.w;
   2569 
   2570             x0.x = vreinterpretq_f32_u8(vqtbl1q_u8(rsat1, idx0));
   2571             x0.y = vreinterpretq_f32_u8(vqtbl1q_u8(gsat1, idx0));
   2572             x0.z = vreinterpretq_f32_u8(vqtbl1q_u8(bsat1, idx0));
   2573             w0   = vreinterpretq_f32_u8(vqtbl1q_u8(wsat1, idx0));
   2574 
   2575             x1.x = vreinterpretq_f32_u8(vqtbl1q_u8(rsat1, idx1));
   2576             x1.y = vreinterpretq_f32_u8(vqtbl1q_u8(gsat1, idx1));
   2577             x1.z = vreinterpretq_f32_u8(vqtbl1q_u8(bsat1, idx1));
   2578             w1   = vreinterpretq_f32_u8(vqtbl1q_u8(wsat1, idx1));
   2579 
   2580             x2.x = vreinterpretq_f32_u8(vqtbl1q_u8(rsat1, idx2));
   2581             x2.y = vreinterpretq_f32_u8(vqtbl1q_u8(gsat1, idx2));
   2582             x2.z = vreinterpretq_f32_u8(vqtbl1q_u8(bsat1, idx2));
   2583             w2   = vreinterpretq_f32_u8(vqtbl1q_u8(wsat1, idx2));
   2584         }
   2585         else*/ 
   2586         if (count <= 8) {
   2587             uint8x16x2_t rsat2 = (uint8x16x2_t&)sat.r;
   2588             uint8x16x2_t gsat2 = (uint8x16x2_t&)sat.g;
   2589             uint8x16x2_t bsat2 = (uint8x16x2_t&)sat.b;
   2590             uint8x16x2_t wsat2 = (uint8x16x2_t&)sat.w;
   2591 
   2592             x0.x = vreinterpretq_f32_u8(vqtbl2q_u8(rsat2, idx0));
   2593             x0.y = vreinterpretq_f32_u8(vqtbl2q_u8(gsat2, idx0));
   2594             x0.z = vreinterpretq_f32_u8(vqtbl2q_u8(bsat2, idx0));
   2595             w0   = vreinterpretq_f32_u8(vqtbl2q_u8(wsat2, idx0));
   2596 
   2597             x1.x = vreinterpretq_f32_u8(vqtbl2q_u8(rsat2, idx1));
   2598             x1.y = vreinterpretq_f32_u8(vqtbl2q_u8(gsat2, idx1));
   2599             x1.z = vreinterpretq_f32_u8(vqtbl2q_u8(bsat2, idx1));
   2600             w1   = vreinterpretq_f32_u8(vqtbl2q_u8(wsat2, idx1));
   2601 
   2602             x2.x = vreinterpretq_f32_u8(vqtbl2q_u8(rsat2, idx2));
   2603             x2.y = vreinterpretq_f32_u8(vqtbl2q_u8(gsat2, idx2));
   2604             x2.z = vreinterpretq_f32_u8(vqtbl2q_u8(bsat2, idx2));
   2605             w2   = vreinterpretq_f32_u8(vqtbl2q_u8(wsat2, idx2));
   2606         }
   2607         else if (count <= 12) {
   2608             uint8x16x3_t rsat3 = (uint8x16x3_t&)sat.r;
   2609             uint8x16x3_t gsat3 = (uint8x16x3_t&)sat.g;
   2610             uint8x16x3_t bsat3 = (uint8x16x3_t&)sat.b;
   2611             uint8x16x3_t wsat3 = (uint8x16x3_t&)sat.w;
   2612 
   2613             x0.x = vreinterpretq_f32_u8(vqtbl3q_u8(rsat3, idx0));
   2614             x0.y = vreinterpretq_f32_u8(vqtbl3q_u8(gsat3, idx0));
   2615             x0.z = vreinterpretq_f32_u8(vqtbl3q_u8(bsat3, idx0));
   2616             w0   = vreinterpretq_f32_u8(vqtbl3q_u8(wsat3, idx0));
   2617 
   2618             x1.x = vreinterpretq_f32_u8(vqtbl3q_u8(rsat3, idx1));
   2619             x1.y = vreinterpretq_f32_u8(vqtbl3q_u8(gsat3, idx1));
   2620             x1.z = vreinterpretq_f32_u8(vqtbl3q_u8(bsat3, idx1));
   2621             w1   = vreinterpretq_f32_u8(vqtbl3q_u8(wsat3, idx1));
   2622 
   2623             x2.x = vreinterpretq_f32_u8(vqtbl3q_u8(rsat3, idx2));
   2624             x2.y = vreinterpretq_f32_u8(vqtbl3q_u8(gsat3, idx2));
   2625             x2.z = vreinterpretq_f32_u8(vqtbl3q_u8(bsat3, idx2));
   2626             w2   = vreinterpretq_f32_u8(vqtbl3q_u8(wsat3, idx2));
   2627         }
   2628         else {
   2629             uint8x16x4_t rsat4 = (uint8x16x4_t&)sat.r;
   2630             uint8x16x4_t gsat4 = (uint8x16x4_t&)sat.g;
   2631             uint8x16x4_t bsat4 = (uint8x16x4_t&)sat.b;
   2632             uint8x16x4_t wsat4 = (uint8x16x4_t&)sat.w;
   2633 
   2634             x0.x = vreinterpretq_f32_u8(vqtbl4q_u8(rsat4, idx0));
   2635             x0.y = vreinterpretq_f32_u8(vqtbl4q_u8(gsat4, idx0));
   2636             x0.z = vreinterpretq_f32_u8(vqtbl4q_u8(bsat4, idx0));
   2637             w0   = vreinterpretq_f32_u8(vqtbl4q_u8(wsat4, idx0));
   2638 
   2639             x1.x = vreinterpretq_f32_u8(vqtbl4q_u8(rsat4, idx1));
   2640             x1.y = vreinterpretq_f32_u8(vqtbl4q_u8(gsat4, idx1));
   2641             x1.z = vreinterpretq_f32_u8(vqtbl4q_u8(bsat4, idx1));
   2642             w1   = vreinterpretq_f32_u8(vqtbl4q_u8(wsat4, idx1));
   2643 
   2644             x2.x = vreinterpretq_f32_u8(vqtbl4q_u8(rsat4, idx2));
   2645             x2.y = vreinterpretq_f32_u8(vqtbl4q_u8(gsat4, idx2));
   2646             x2.z = vreinterpretq_f32_u8(vqtbl4q_u8(bsat4, idx2));
   2647             w2   = vreinterpretq_f32_u8(vqtbl4q_u8(wsat4, idx2));
   2648         }
   2649 
   2650 #else
   2651 
   2652         // Scalar path
   2653         x0.x = vzero(); x0.y = vzero(); x0.z = vzero(); w0 = vzero();
   2654         x1.x = vzero(); x1.y = vzero(); x1.z = vzero(); w1 = vzero();
   2655         x2.x = vzero(); x2.y = vzero(); x2.z = vzero(); w2 = vzero();
   2656 
   2657         for (int l = 0; l < VEC_SIZE; l++) {
   2658             int c0 = s_fourCluster[i + l].c0 - 1;
   2659             if (c0 >= 0) {
   2660                 lane(x0.x, l) = sat.r[c0];
   2661                 lane(x0.y, l) = sat.g[c0];
   2662                 lane(x0.z, l) = sat.b[c0];
   2663                 lane(w0, l) = sat.w[c0];
   2664             }
   2665 
   2666             int c1 = s_fourCluster[i + l].c1 - 1;
   2667             if (c1 >= 0) {
   2668                 lane(x1.x, l) = sat.r[c1];
   2669                 lane(x1.y, l) = sat.g[c1];
   2670                 lane(x1.z, l) = sat.b[c1];
   2671                 lane(w1, l) = sat.w[c1];
   2672             }
   2673 
   2674             int c2 = s_fourCluster[i + l].c2 - 1;
   2675             if (c2 >= 0) {
   2676                 lane(x2.x, l) = sat.r[c2];
   2677                 lane(x2.y, l) = sat.g[c2];
   2678                 lane(x2.z, l) = sat.b[c2];
   2679                 lane(w2, l) = sat.w[c2];
   2680             }
   2681         }
   2682 
   2683 #endif
   2684 
   2685         VFloat w3 = vbroadcast(w_sum) - w2;
   2686         x2 = x2 - x1;
   2687         x1 = x1 - x0;
   2688         w2 = w2 - w1;
   2689         w1 = w1 - w0;
   2690 
   2691 #if 1
   2692         VFloat alpha2_sum = vmadd(w2, w1_9, vmadd(w1, w4_9, w0));
   2693         VFloat beta2_sum  = vmadd(w1, w1_9, vmadd(w2, w4_9, w3));
   2694 
   2695         VFloat alphabeta_sum = (w1 + w2) * vbroadcast(w2_9);
   2696         VFloat factor = vrcp(vm2sub(alpha2_sum, beta2_sum, alphabeta_sum, alphabeta_sum));
   2697 
   2698         VVector3 alphax_sum = vmadd(x2, w1_3, vmadd(x1, w2_3, x0));
   2699 #else
   2700         VFloat alpha2_sum = vmadd(w2, (1.0f / 9.0f), vmadd(w1, (4.0f / 9.0f), w0));
   2701         VFloat beta2_sum  = vmadd(w1, (1.0f / 9.0f), vmadd(w2, (4.0f / 9.0f), w3));
   2702 
   2703         VFloat alphabeta_sum = (w1 + w2) * vbroadcast(2.0f / 9.0f);
   2704         VFloat factor = vrcp(vm2sub(alpha2_sum, beta2_sum, alphabeta_sum, alphabeta_sum));
   2705 
   2706         VVector3 alphax_sum = vmadd(x2, (1.0f / 3.0f), vmadd(x1, (2.0f / 3.0f), x0));
   2707 #endif
   2708         VVector3 betax_sum = vbroadcast(r_sum, g_sum, b_sum) - alphax_sum;
   2709 
   2710         VVector3 a = vm2sub(alphax_sum, beta2_sum, betax_sum, alphabeta_sum) * factor;
   2711         VVector3 b = vm2sub(betax_sum, alpha2_sum, alphax_sum, alphabeta_sum) * factor;
   2712 
   2713         // snap to the grid
   2714         a = vround_ept(a);
   2715         b = vround_ept(b);
   2716 
   2717         // compute the error
   2718         VVector3 e2 = vm2sub(a, vmsub(b, alphabeta_sum, alphax_sum), b, betax_sum) * vbroadcast(2.0f);
   2719         VVector3 e1 = vmadd(a * a, alpha2_sum, vmadd(b * b, beta2_sum, e2));
   2720 
   2721         // apply the metric to the error term
   2722         VFloat error = vdot(e1, vbroadcast(metric_sqr));
   2723 
   2724         // keep the solution if it wins
   2725         auto mask = (error < vbesterror);
   2726 
   2727         // We could mask the unused lanes here, but instead set the invalid SAT entries to FLT_MAX.
   2728         //mask = (mask & (vbroadcast(total_order_count) >= tid8(i))); // This doesn't seem to help. Is it OK to consider elements out of bounds?
   2729 
   2730         vbesterror = vselect(mask, vbesterror, error);
   2731         vbeststart = vselect(mask, vbeststart, a);
   2732         vbestend = vselect(mask, vbestend, b);
   2733     }
   2734 
   2735     int bestindex = reduce_min_index(vbesterror);
   2736 
   2737     start->x = lane(vbeststart.x, bestindex);
   2738     start->y = lane(vbeststart.y, bestindex);
   2739     start->z = lane(vbeststart.z, bestindex);
   2740     end->x = lane(vbestend.x, bestindex);
   2741     end->y = lane(vbestend.y, bestindex);
   2742     end->z = lane(vbestend.z, bestindex);
   2743 }
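        // Same normal-equations solve as cluster_fit_three, but with the 4-color palette
        // weights alpha in {1, 2/3, 1/3, 0}. The statics w1_3 = 1/3, w2_3 = 2/3, w1_9 = 1/9,
        // w4_9 = 4/9 and w2_9 = 2/9 (set up during initialization) hold the linear, squared
        // and cross terms, so alpha2_sum = w0 + (4/9)*w1 + (1/9)*w2, alphabeta_sum =
        // (2/9)*(w1 + w2), and alphax_sum = x0 + (2/3)*x1 + (1/3)*x2.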
   2744 
   2745 
   2746 ///////////////////////////////////////////////////////////////////////////////////////////////////
   2747 // Palette evaluation.
   2748 
   2749 Decoder s_decoder = Decoder_D3D10;
   2750 
   2751 // D3D10
   2752 inline void evaluate_palette4_d3d10(Color32 palette[4]) {
   2753     palette[2].r = (2 * palette[0].r + palette[1].r) / 3;
   2754     palette[2].g = (2 * palette[0].g + palette[1].g) / 3;
   2755     palette[2].b = (2 * palette[0].b + palette[1].b) / 3;
   2756     palette[2].a = 0xFF;
   2757 
   2758     palette[3].r = (2 * palette[1].r + palette[0].r) / 3;
   2759     palette[3].g = (2 * palette[1].g + palette[0].g) / 3;
   2760     palette[3].b = (2 * palette[1].b + palette[0].b) / 3;
   2761     palette[3].a = 0xFF;
   2762 }
   2763 inline void evaluate_palette3_d3d10(Color32 palette[4]) {
   2764     palette[2].r = (palette[0].r + palette[1].r) / 2;
   2765     palette[2].g = (palette[0].g + palette[1].g) / 2;
   2766     palette[2].b = (palette[0].b + palette[1].b) / 2;
   2767     palette[2].a = 0xFF;
   2768     palette[3].u = 0;
   2769 }
   2770 static void evaluate_palette_d3d10(Color16 c0, Color16 c1, Color32 palette[4]) {
   2771     palette[0] = bitexpand_color16_to_color32(c0);
   2772     palette[1] = bitexpand_color16_to_color32(c1);
   2773     if (c0.u > c1.u) {
   2774         evaluate_palette4_d3d10(palette);
   2775     }
   2776     else {
   2777         evaluate_palette3_d3d10(palette);
   2778     }
   2779 }
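        // Worked example: with expanded endpoint reds 96 and 33 (and c0.u > c1.u, so
        // 4-color mode), palette[2].r = (2*96 + 33) / 3 = 75 and palette[3].r =
        // (2*33 + 96) / 3 = 54. The integer division truncates; the decoders below differ
        // only in how they round this 1/3 - 2/3 interpolation.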
   2780 
   2781 // NV
   2782 inline void evaluate_palette4_nv(Color16 c0, Color16 c1, Color32 palette[4]) {
   2783     int gdiff = palette[1].g - palette[0].g;
   2784     palette[2].r = uint8(((2 * c0.r + c1.r) * 22) / 8);
   2785     palette[2].g = uint8((256 * palette[0].g + gdiff / 4 + 128 + gdiff * 80) / 256);
   2786     palette[2].b = uint8(((2 * c0.b + c1.b) * 22) / 8);
   2787     palette[2].a = 0xFF;
   2788 
   2789     palette[3].r = uint8(((2 * c1.r + c0.r) * 22) / 8);
   2790     palette[3].g = uint8((256 * palette[1].g - gdiff / 4 + 128 - gdiff * 80) / 256);
   2791     palette[3].b = uint8(((2 * c1.b + c0.b) * 22) / 8);
   2792     palette[3].a = 0xFF;
   2793 }
   2794 inline void evaluate_palette3_nv(Color16 c0, Color16 c1, Color32 palette[4]) {
   2795     int gdiff = palette[1].g - palette[0].g;
   2796     palette[2].r = uint8(((c0.r + c1.r) * 33) / 8);
   2797     palette[2].g = uint8((256 * palette[0].g + gdiff / 4 + 128 + gdiff * 128) / 256);
   2798     palette[2].b = uint8(((c0.b + c1.b) * 33) / 8);
   2799     palette[2].a = 0xFF;
   2800     palette[3].u = 0;
   2801 }
   2802 static void evaluate_palette_nv(Color16 c0, Color16 c1, Color32 palette[4]) {
   2803     palette[0] = bitexpand_color16_to_color32(c0);
   2804     palette[1] = bitexpand_color16_to_color32(c1);
   2805 
   2806     if (c0.u > c1.u) {
   2807         evaluate_palette4_nv(c0, c1, palette);
   2808     }
   2809     else {
   2810         evaluate_palette3_nv(c0, c1, palette);
   2811     }
   2812 }
   2813 
   2814 // AMD
   2815 inline void evaluate_palette4_amd(Color32 palette[4]) {
   2816     palette[2].r = uint8((43 * palette[0].r + 21 * palette[1].r + 32) >> 6);
   2817     palette[2].g = uint8((43 * palette[0].g + 21 * palette[1].g + 32) >> 6);
   2818     palette[2].b = uint8((43 * palette[0].b + 21 * palette[1].b + 32) >> 6);
   2819     palette[2].a = 0xFF;
   2820 
   2821     palette[3].r = uint8((43 * palette[1].r + 21 * palette[0].r + 32) >> 6);
   2822     palette[3].g = uint8((43 * palette[1].g + 21 * palette[0].g + 32) >> 6);
   2823     palette[3].b = uint8((43 * palette[1].b + 21 * palette[0].b + 32) >> 6);
   2824     palette[3].a = 0xFF;
   2825 }
   2826 inline void evaluate_palette3_amd(Color32 palette[4]) {
   2827     palette[2].r = uint8((palette[0].r + palette[1].r + 1) / 2);
   2828     palette[2].g = uint8((palette[0].g + palette[1].g + 1) / 2);
   2829     palette[2].b = uint8((palette[0].b + palette[1].b + 1) / 2);
   2830     palette[2].a = 0xFF;
   2831     palette[3].u = 0;
   2832 }
   2833 static void evaluate_palette_amd(Color16 c0, Color16 c1, Color32 palette[4]) {
   2834     palette[0] = bitexpand_color16_to_color32(c0);
   2835     palette[1] = bitexpand_color16_to_color32(c1);
   2836 
   2837     if (c0.u > c1.u) {
   2838         evaluate_palette4_amd(palette);
   2839     }
   2840     else {
   2841         evaluate_palette3_amd(palette);
   2842     }
   2843 }
   2844 
   2845 // Intel
   2846 inline void evaluate_palette4_intel(Color32 palette[4]) {
   2847     palette[2].r = uint8((171 * palette[0].r + 85 * palette[1].r + 128) >> 8);
   2848     palette[2].g = uint8((171 * palette[0].g + 85 * palette[1].g + 128) >> 8);
   2849     palette[2].b = uint8((171 * palette[0].b + 85 * palette[1].b + 128) >> 8);
   2850     palette[2].a = 0xFF;
   2851 
   2852     palette[3].r = uint8((171 * palette[1].r + 85 * palette[0].r + 128) >> 8);
   2853     palette[3].g = uint8((171 * palette[1].g + 85 * palette[0].g + 128) >> 8);
   2854     palette[3].b = uint8((171 * palette[1].b + 85 * palette[0].b + 128) >> 8);
   2855     palette[3].a = 0xFF;
   2856 }
   2857 inline void evaluate_palette3_intel(Color32 palette[4]) {
   2858     palette[2].r = uint8((128 * palette[0].r + 128 * palette[1].r + 128) >> 8);
   2859     palette[2].g = uint8((128 * palette[0].g + 128 * palette[1].g + 128) >> 8);
   2860     palette[2].b = uint8((128 * palette[0].b + 128 * palette[1].b + 128) >> 8);
   2861     palette[2].a = 0xFF;
   2862     palette[3].u = 0;
   2863 }
   2864 static void evaluate_palette_intel(Color16 c0, Color16 c1, Color32 palette[4]) {
   2865     palette[0] = bitexpand_color16_to_color32(c0);
   2866     palette[1] = bitexpand_color16_to_color32(c1);
   2867 
   2868     if (c0.u > c1.u) {
   2869         evaluate_palette4_intel(palette);
   2870     }
   2871     else {
   2872         evaluate_palette3_intel(palette);
   2873     }
   2874 }
   2875 
   2876 inline void evaluate_palette(Color16 c0, Color16 c1, Color32 palette[4], Decoder decoder = s_decoder) {
   2877     if (decoder == Decoder_D3D10)         evaluate_palette_d3d10(c0, c1, palette);
   2878     else if (decoder == Decoder_NVIDIA)   evaluate_palette_nv(c0, c1, palette);
   2879     else if (decoder == Decoder_AMD)      evaluate_palette_amd(c0, c1, palette);
   2880     else /*if (decoder == Decoder_Intel)*/ evaluate_palette_intel(c0, c1, palette);
   2881 }
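
        // Worked example of why the decoder matters: with col0.r = 255 and col1.r = 0
        // (c0.r = 31, c1.r = 0 in 5-bit form), palette[2].r comes out as:
        //   D3D10:  (2*255 + 0) / 3             = 170
        //   NVIDIA: ((2*31 + 0) * 22) / 8       = 170
        //   AMD:    (43*255 + 21*0 + 32) >> 6   = 171
        //   Intel:  (171*255 + 85*0 + 128) >> 8 = 170
        // Off-by-one differences like AMD's 171 are exactly what the error evaluation
        // below measures when targeting a specific hardware decoder.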
   2882 
   2883 inline void evaluate_palette4(Color16 c0, Color16 c1, Color32 palette[4], Decoder decoder = s_decoder) {
   2884     palette[0] = bitexpand_color16_to_color32(c0);
   2885     palette[1] = bitexpand_color16_to_color32(c1);
   2886 
   2887     if (decoder == Decoder_D3D10)         evaluate_palette4_d3d10(palette);
   2888     else if (decoder == Decoder_NVIDIA)   evaluate_palette4_nv(c0, c1, palette);
   2889     else if (decoder == Decoder_AMD)      evaluate_palette4_amd(palette);
   2890     else if (decoder == Decoder_Intel)    evaluate_palette4_intel(palette);
   2891 }
   2892 
   2893 static void evaluate_palette(Color16 c0, Color16 c1, Vector3 palette[4]) {
   2894     Color32 palette32[4];
   2895     evaluate_palette(c0, c1, palette32);
   2896 
   2897     for (int i = 0; i < 4; i++) {
   2898         palette[i] = color_to_vector3(palette32[i]);
   2899     }
   2900 }
   2901 
   2902 static void evaluate_palette(uint8 alpha0, uint8 alpha1, uint8 alpha_palette[8], Decoder decoder) {
   2903     // Standard BC3 alpha palette (integer form of the D3D10 rules). Decoder-specific
            // alpha variants are not modeled here, so `decoder` is currently unused.
   2904     alpha_palette[0] = alpha0;
   2905     alpha_palette[1] = alpha1;
   2906     if (alpha0 > alpha1) {
   2907         // 8-alpha mode: 6 interpolated values between alpha0 and alpha1.
                for (int i = 1; i < 7; i++) {
                    alpha_palette[1 + i] = uint8(((7 - i) * alpha0 + i * alpha1) / 7);
                }
   2908     }
   2909     else {
   2910         // 6-alpha mode: 4 interpolated values, plus explicit 0 and 255.
                for (int i = 1; i < 5; i++) {
                    alpha_palette[1 + i] = uint8(((5 - i) * alpha0 + i * alpha1) / 5);
                }
                alpha_palette[6] = 0;
                alpha_palette[7] = 255;
   2911     }
   2912 }
   2913 
   2914 
   2915 static void decode_bc1(const BlockBC1 * block, unsigned char rgba_block[16 * 4], Decoder decoder) {
   2916     Color32 palette[4];
   2917     evaluate_palette(block->col0, block->col1, palette, decoder);
   2918 
   2919     for (int i = 0; i < 16; i++) {
   2920         int index = (block->indices >> (2 * i)) & 3;
   2921         Color32 c = palette[index];
   2922         rgba_block[4 * i + 0] = c.r;
   2923         rgba_block[4 * i + 1] = c.g;
   2924         rgba_block[4 * i + 2] = c.b;
   2925         rgba_block[4 * i + 3] = c.a;
   2926     }
   2927 }
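
        // Note on index packing: `indices` stores texel i's 2-bit palette index in bits
        // [2*i, 2*i+1], so e.g. indices == 0xaaaaaaaa (binary 10 repeated) decodes every
        // texel to palette[2].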
   2928  
   2929 static void decode_bc3(const BlockBC3 * block, unsigned char rgba_block[16 * 4], Decoder decoder) {
   2930     Color32 rgb_palette[4];
   2931     evaluate_palette4(block->rgb.col0, block->rgb.col1, rgb_palette, decoder);
   2932 
   2933     uint8 alpha_palette[8];
   2934     evaluate_palette(block->alpha.alpha0, block->alpha.alpha1, alpha_palette, decoder);
   2935 
   2936     for (int i = 0; i < 16; i++) {
   2937         int rgb_index = (block->rgb.indices >> (2 * i)) & 3;
   2938         Color32 c = rgb_palette[rgb_index];
   2939         rgba_block[4 * i + 0] = c.r;
   2940         rgba_block[4 * i + 1] = c.g;
   2941         rgba_block[4 * i + 2] = c.b;
   2942 
            // Extract texel i's 3-bit alpha index. This assumes the alpha block stores
            // its 48 index bits in a little-endian byte array member named `bits`;
            // adjust the member name if BlockBC3 is laid out differently.
            int bit = 3 * i;
            int alpha_index = (block->alpha.bits[bit >> 3] >> (bit & 7)) & 7;
            if ((bit & 7) > 5) {
                alpha_index |= (block->alpha.bits[(bit >> 3) + 1] << (8 - (bit & 7))) & 7;
            }
   2944         uint8 alpha = alpha_palette[alpha_index];
   2945         rgba_block[4 * i + 3] = alpha;
   2946     }
   2947 }
   2948 
   2949 
   2950 ///////////////////////////////////////////////////////////////////////////////////////////////////
   2951 // Error evaluation.
   2952 
   2953 // Different ways of estimating the error.
   2954 
   2955 static float evaluate_mse(const Vector3 & p, const Vector3 & c, const Vector3 & w) {
   2956     Vector3 d = (p - c) * w * 255;
   2957     return dot(d, d);
   2958 }
   2959 
   2960 static float evaluate_mse(const Color32 & p, const Vector3 & c, const Vector3 & w) {
   2961     Vector3 d = (color_to_vector3(p) - c) * w * 255;
   2962     return dot(d, d);
   2963 }
   2964 
   2965 static int evaluate_mse(const Color32 & p, const Color32 & c) {
   2966     return (square(int(p.r)-c.r) + square(int(p.g)-c.g) + square(int(p.b)-c.b));
   2967 }
   2968 
   2969 static int evaluate_mse(const Color32 palette[4], const Color32 & c) {
   2970     int e0 = evaluate_mse(palette[0], c);
   2971     int e1 = evaluate_mse(palette[1], c);
   2972     int e2 = evaluate_mse(palette[2], c);
   2973     int e3 = evaluate_mse(palette[3], c);
   2974     return min(min(e0, e1), min(e2, e3));
   2975 }
   2976 
   2977 // Returns the squared error between palette entry 'index' and the given color, measured in [0-255] space.
   2978 static int evaluate_mse(const BlockBC1 * output, Color32 color, int index) {
   2979     Color32 palette[4];
   2980     evaluate_palette(output->col0, output->col1, palette);
   2981 
   2982     return evaluate_mse(palette[index], color);
   2983 }
   2984 
   2985 // Returns the weighted sum of squared errors between each color and its closest palette entry, in [0-255] space.
   2986 static float evaluate_palette_error(Color32 palette[4], const Color32 * colors, const float * weights, int count) {
   2987 
   2988     float total = 0.0f;
   2989     for (int i = 0; i < count; i++) {
   2990         total += weights[i] * float(evaluate_mse(palette, colors[i]));
   2991     }
   2992 
   2993     return total;
   2994 }
   2995 
   2996 static float evaluate_palette_error(Color32 palette[4], const Color32 * colors, int count) {
   2997 
   2998     float total = 0.0f;
   2999     for (int i = 0; i < count; i++) {
   3000         total += float(evaluate_mse(palette, colors[i]));
   3001     }
   3002 
   3003     return total;
   3004 }
   3005 
   3006 static float evaluate_mse(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, const BlockBC1 * output) {
   3007     Color32 palette[4];
   3008     evaluate_palette(output->col0, output->col1, palette);
   3009 
   3010     // evaluate error for each index.
   3011     float error = 0.0f;
   3012     for (int i = 0; i < 16; i++) {
   3013         int index = (output->indices >> (2 * i)) & 3;
   3014         error += input_weights[i] * evaluate_mse(palette[index], input_colors[i].xyz, color_weights);
   3015     }
   3016     return error;
   3017 }
   3018 
   3019 static float evaluate_mse(const Vector4 input_colors[16], const float input_weights[16], const Vector3& color_weights, Vector3 palette[4], uint32 indices) {
   3020 
   3021     // evaluate error for each index.
   3022     float error = 0.0f;
   3023     for (int i = 0; i < 16; i++) {
   3024         int index = (indices >> (2 * i)) & 3;
   3025         error += input_weights[i] * evaluate_mse(palette[index], input_colors[i].xyz, color_weights);
   3026     }
   3027     return error;
   3028 }
   3029 
   3030 float evaluate_bc1_error(const uint8 rgba_block[16*4], const BlockBC1 * block, Decoder decoder) {
   3031     Color32 palette[4];
   3032     evaluate_palette(block->col0, block->col1, palette, decoder);
   3033 
   3034     // evaluate error for each index.
   3035     float error = 0.0f;
   3036     for (int i = 0; i < 16; i++) {
   3037         int index = (block->indices >> (2 * i)) & 3;
   3038         Color32 c;
   3039         c.r = rgba_block[4 * i + 0];
   3040         c.g = rgba_block[4 * i + 1];
   3041         c.b = rgba_block[4 * i + 2];
   3042         c.a = 255;
   3043         error += float(evaluate_mse(palette[index], c));
   3044     }
   3045     return error;
   3046 }
   3047 
   3048 float evaluate_bc3_error(const uint8 rgba_block[16*4], const BlockBC3 * block, bool alpha_blend, Decoder decoder) {
   3049     Color32 rgb_palette[4];
   3050     evaluate_palette4(block->rgb.col0, block->rgb.col1, rgb_palette, decoder);
   3051 
   3052     uint8 alpha_palette[8];
   3053     evaluate_palette(block->alpha.alpha0, block->alpha.alpha1, alpha_palette, decoder);
   3054 
   3055     // evaluate error for each index.
   3056     float error = 0.0f;
   3057     for (int i = 0; i < 16; i++) {
   3058         Color32 c;
   3059         c.r = rgba_block[4 * i + 0];
   3060         c.g = rgba_block[4 * i + 1];
   3061         c.b = rgba_block[4 * i + 2];
   3062         c.a = rgba_block[4 * i + 3];
   3063 
   3064         int rgb_index = (block->rgb.indices >> (2 * i)) & 3;
                // Same hedged 3-bit alpha index extraction as in decode_bc3 above.
                int bit = 3 * i;
                int alpha_index = (block->alpha.bits[bit >> 3] >> (bit & 7)) & 7;
                if ((bit & 7) > 5) alpha_index |= (block->alpha.bits[(bit >> 3) + 1] << (8 - (bit & 7))) & 7;
   3066 
   3067         error += (alpha_blend ? (c.a * 1.0f/255.0f) : 1.0f) * float(evaluate_mse(rgb_palette[rgb_index], c));
   3068         error += float(square(int(alpha_palette[alpha_index]) - c.a));
   3069     }
   3070     return error;
   3071 }
   3072 
   3073 
   3074 
   3075 ///////////////////////////////////////////////////////////////////////////////////////////////////
   3076 // Index selection
   3077 
   3078 // @@ Can we interleave the two uint16 at once?
   3079 inline uint32 interleave_uint16_with_zeros(uint32 input)  {
   3080     uint32 word = input;
   3081     word = (word ^ (word << 8 )) & 0x00ff00ff;
   3082     word = (word ^ (word << 4 )) & 0x0f0f0f0f;
   3083     word = (word ^ (word << 2 )) & 0x33333333;
   3084     word = (word ^ (word << 1 )) & 0x55555555;
   3085     return word;
   3086 }
   3087 
   3088 // Interleave the bits. https://lemire.me/blog/2018/01/08/how-fast-can-you-bit-interleave-32-bit-integers/
   3089 ICBC_FORCEINLINE uint32 interleave(uint32 a, uint32 b) {
   3090 #if ICBC_BMI2
   3091     return _pdep_u32(a, 0x55555555) | _pdep_u32(b, 0xaaaaaaaa);
   3092 #else
   3093     return interleave_uint16_with_zeros(a) | (interleave_uint16_with_zeros(b) << 1);
   3094 #endif
   3095 }
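
        // Worked example: spreading a = 0b0011 into the even bits gives 0b00000101,
        // spreading b = 0b0101 into the odd bits gives 0b00100010, so
        // interleave(0b0011, 0b0101) == 0b00100111. The index selection below builds
        // one bit-plane per index bit and merges the planes this way.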
   3096 
   3097 static uint compute_indices4(const Vector4 input_colors[16], const Vector3 & color_weights, const Vector3 palette[4]) {
   3098     uint indices0 = 0;
   3099     uint indices1 = 0;
   3100 
   3101     VVector3 vw = vbroadcast(color_weights);
   3102     VVector3 vp0 = vbroadcast(palette[0]) * vw;
   3103     VVector3 vp1 = vbroadcast(palette[1]) * vw;
   3104     VVector3 vp2 = vbroadcast(palette[2]) * vw;
   3105     VVector3 vp3 = vbroadcast(palette[3]) * vw;
   3106 
   3107     for (int i = 0; i < 16; i += VEC_SIZE) {
   3108         VVector3 vc = vload(&input_colors[i]) * vw;
   3109 
   3110         VFloat d0 = vlen2(vc - vp0);
   3111         VFloat d1 = vlen2(vc - vp1);
   3112         VFloat d2 = vlen2(vc - vp2);
   3113         VFloat d3 = vlen2(vc - vp3);
   3114 
   3115         VMask b1 = d1 > d2;
   3116         VMask b2 = d0 > d2;
   3117         VMask x0 = b1 & b2;
   3118 
   3119         VMask b0 = d0 > d3;
   3120         VMask b3 = d1 > d3;
   3121         x0 = x0 | (b0 & b3);
   3122 
   3123         VMask b4 = d2 > d3;
   3124         VMask x1 = b0 & b4;
   3125 
   3126         indices0 |= mask(x0) << i;
   3127         indices1 |= mask(x1) << i;
   3128     }
   3129 
   3130     return interleave(indices1, indices0);
   3131 }
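
        // The loop above computes all four palette distances for VEC_SIZE texels at a
        // time and emits each 2-bit index as two bit-planes: indices0 holds the high
        // bit and indices1 the low bit of every index. The comparison trick relies on
        // the palette entries being (nearly) collinear interpolants of the endpoints.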
   3132 
   3133 static uint compute_indices3(const Vector4 input_colors[16], const Vector3 & color_weights, bool allow_transparent_black, const Vector3 palette[4]) {
   3134     uint indices0 = 0;
   3135     uint indices1 = 0;
   3136 
   3137     VVector3 vw = vbroadcast(color_weights);
   3138     VVector3 vp0 = vbroadcast(palette[0]) * vw;
   3139     VVector3 vp1 = vbroadcast(palette[1]) * vw;
   3140     VVector3 vp2 = vbroadcast(palette[2]) * vw;
   3141 
   3142     if (allow_transparent_black) {
   3143         for (int i = 0; i < 16; i += VEC_SIZE) {
   3144             VVector3 vc = vload(&input_colors[i]) * vw;
   3145 
   3146             VFloat d0 = vlen2(vp0 - vc);
   3147             VFloat d1 = vlen2(vp1 - vc);
   3148             VFloat d2 = vlen2(vp2 - vc);
   3149             VFloat d3 = vdot(vc, vc);
   3150 
   3151             VMask i1 = (d1 < d2);
   3152             VMask i2 = (d2 <= d0) & (d2 <= d1);
   3153             VMask i3 = (d3 <= d0) & (d3 <= d1) & (d3 <= d2);
   3154 
   3155             indices0 |= mask(i2 | i3) << i;
   3156             indices1 |= mask(i1 | i3) << i;
   3157         }
   3158     }
   3159     else {
   3160         for (int i = 0; i < 16; i += VEC_SIZE) {
   3161             VVector3 vc = vload(&input_colors[i]) * vw;
   3162 
   3163             VFloat d0 = vlen2(vc - vp0);
   3164             VFloat d1 = vlen2(vc - vp1);
   3165             VFloat d2 = vlen2(vc - vp2);
   3166 
   3167             VMask i1 = (d1 < d2);
   3168             VMask i2 = (d2 <= d0) & (d2 <= d1);
   3169 
   3170             indices0 |= mask(i2) << i;
   3171             indices1 |= mask(i1) << i;
   3172         }
   3173     }
   3174 
   3175     return interleave(indices1, indices0);
   3176 }
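
        // In three-color mode index 3 decodes to transparent black, so when
        // allow_transparent_black is set the fourth candidate distance is simply
        // d3 = |c|^2: the weighted distance from the texel to black.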
   3177 
   3178 
   3179 static uint compute_indices(const Vector4 input_colors[16], const Vector3 & color_weights, const Vector3 palette[4]) {
   3180 #if 0
   3181     Vector3 p0 = palette[0] * color_weights;
   3182     Vector3 p1 = palette[1] * color_weights;
   3183     Vector3 p2 = palette[2] * color_weights;
   3184     Vector3 p3 = palette[3] * color_weights;
   3185 
   3186     uint indices = 0;
   3187     for (int i = 0; i < 16; i++) {
   3188         Vector3 ci = input_colors[i].xyz * color_weights;
   3189         float d0 = lengthSquared(p0 - ci);
   3190         float d1 = lengthSquared(p1 - ci);
   3191         float d2 = lengthSquared(p2 - ci);
   3192         float d3 = lengthSquared(p3 - ci);
   3193 
   3194         uint index;
   3195         if (d0 < d1 && d0 < d2 && d0 < d3) index = 0;
   3196         else if (d1 < d2 && d1 < d3) index = 1;
   3197         else if (d2 < d3) index = 2;
   3198         else index = 3;
   3199 
   3200         indices |= index << (2 * i);
   3201     }
   3202 
   3203     return indices;
   3204 #else
   3205     uint indices0 = 0;
   3206     uint indices1 = 0;
   3207 
   3208     VVector3 vw = vbroadcast(color_weights);
   3209     VVector3 vp0 = vbroadcast(palette[0]) * vw;
   3210     VVector3 vp1 = vbroadcast(palette[1]) * vw;
   3211     VVector3 vp2 = vbroadcast(palette[2]) * vw;
   3212     VVector3 vp3 = vbroadcast(palette[3]) * vw;
   3213 
   3214     for (int i = 0; i < 16; i += VEC_SIZE) {
   3215         VVector3 vc = vload(&input_colors[i]) * vw;
   3216 
   3217         VFloat d0 = vlen2(vc - vp0);
   3218         VFloat d1 = vlen2(vc - vp1);
   3219         VFloat d2 = vlen2(vc - vp2);
   3220         VFloat d3 = vlen2(vc - vp3);
   3221 
   3222         //VMask i0 = (d0 < d1) & (d0 < d2) & (d0 < d3);
   3223         VMask i1 = (d1 <= d0) & (d1 < d2) & (d1 < d3);
   3224         VMask i2 = (d2 <= d0) & (d2 <= d1) & (d2 < d3);
   3225         VMask i3 = (d3 <= d0) & (d3 <= d1) & (d3 <= d2);
   3226         //VFloat vindex = vselect(i0, vselect(i1, vselect(i2, vbroadcast(3), vbroadcast(2)), vbroadcast(1)), vbroadcast(0));
   3227 
   3228         indices0 |= mask(i2 | i3) << i;
   3229         indices1 |= mask(i1 | i3) << i;
   3230     }
   3231 
   3232     uint indices = interleave(indices1, indices0);
   3233     return indices;
   3234 #endif
   3235 }
   3236 
   3237 
   3238 static float output_block3(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, bool allow_transparent_black, const Vector3 & v0, const Vector3 & v1, BlockBC1 * block)
   3239 {
   3240     Color16 color0 = vector3_to_color16(v0);
   3241     Color16 color1 = vector3_to_color16(v1);
   3242 
   3243     if (color0.u > color1.u) {
   3244         swap(color0, color1);
   3245     }
   3246 
   3247     Vector3 palette[4];
   3248     evaluate_palette(color0, color1, palette);
   3249 
   3250     block->col0 = color0;
   3251     block->col1 = color1;
   3252     block->indices = compute_indices3(input_colors, color_weights, allow_transparent_black, palette);
   3253 
   3254     return evaluate_mse(input_colors, input_weights, color_weights, palette, block->indices);
   3255 }
   3256 
   3257 static float output_block4(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, const Vector3 & v0, const Vector3 & v1, BlockBC1 * block)
   3258 {
   3259     Color16 color0 = vector3_to_color16(v0);
   3260     Color16 color1 = vector3_to_color16(v1);
   3261 
   3262     if (color0.u < color1.u) {
   3263         swap(color0, color1);
   3264     }
   3265 
   3266     Vector3 palette[4];
   3267     evaluate_palette(color0, color1, palette);
   3268 
   3269     block->col0 = color0;
   3270     block->col1 = color1;
   3271     block->indices = compute_indices4(input_colors, color_weights, palette);
   3272 
   3273     return evaluate_mse(input_colors, input_weights, color_weights, palette, block->indices);
   3274 }
   3275 
   3276 
   3277 // Least squares fitting of color end points for the given indices. @@ Take weights into account.
   3278 static bool optimize_end_points4(uint indices, const Vector4 * colors, /*const float * weights,*/ int count, Vector3 * a, Vector3 * b)
   3279 {
   3280     float alpha2_sum = 0.0f;
   3281     float beta2_sum = 0.0f;
   3282     float alphabeta_sum = 0.0f;
   3283     Vector3 alphax_sum = { 0,0,0 };
   3284     Vector3 betax_sum = { 0,0,0 };
   3285 
   3286     for (int i = 0; i < count; i++)
   3287     {
   3288         const uint bits = indices >> (2 * i);
   3289 
   3290         float beta = float(bits & 1);
   3291         if (bits & 2) beta = (1 + beta) / 3.0f;
   3292         float alpha = 1.0f - beta;
   3293 
   3294         alpha2_sum += alpha * alpha;
   3295         beta2_sum += beta * beta;
   3296         alphabeta_sum += alpha * beta;
   3297         alphax_sum += alpha * colors[i].xyz;
   3298         betax_sum += beta * colors[i].xyz;
   3299     }
   3300 
   3301     float denom = alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum;
   3302     if (equal(denom, 0.0f)) return false;
   3303 
   3304     float factor = 1.0f / denom;
   3305 
   3306     *a = saturate((alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor);
   3307     *b = saturate((betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor);
   3308 
   3309     return true;
   3310 }
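
        // The loop above accumulates the 2x2 normal equations of
        //   min_{a,b} sum_i (alpha_i*a + beta_i*b - x_i)^2
        // which are solved per channel by Cramer's rule:
        //   | alpha2_sum     alphabeta_sum | |a|   | alphax_sum |
        //   | alphabeta_sum  beta2_sum     | |b| = | betax_sum  |
        // `denom` is the determinant; a degenerate system (e.g. all texels assigned to
        // a single endpoint) is rejected instead of dividing by zero.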
   3311 
   3312 // find minimum and maximum colors based on bounding box in color space
   3313 inline static void fit_colors_bbox(const Vector3 * colors, int count, Vector3 * __restrict c0, Vector3 * __restrict c1)
   3314 {
   3315     *c0 = { 0,0,0 };
   3316     *c1 = { 1,1,1 };
   3317 
   3318     for (int i = 0; i < count; i++) {
   3319         *c0 = max(*c0, colors[i]);
   3320         *c1 = min(*c1, colors[i]);
   3321     }
   3322 }
   3323 
   3324 inline static void select_diagonal(const Vector3 * colors, int count, Vector3 * __restrict c0, Vector3 * __restrict c1)
   3325 {
   3326     Vector3 center = (*c0 + *c1) * 0.5f;
   3327 
   3328     /*Vector3 center = colors[0];
   3329     for (int i = 1; i < count; i++) {
   3330         center = center * float(i-1) / i + colors[i] / i;
   3331     }*/
   3332     /*Vector3 center = colors[0];
   3333     for (int i = 1; i < count; i++) {
   3334         center += colors[i];
   3335     }
   3336     center /= count;*/
   3337 
   3338     float cov_xz = 0.0f;
   3339     float cov_yz = 0.0f;
   3340     for (int i = 0; i < count; i++) {
   3341         Vector3 t = colors[i] - center;
   3342         cov_xz += t.x * t.z;
   3343         cov_yz += t.y * t.z;
   3344     }
   3345 
   3346     float x0 = c0->x;
   3347     float y0 = c0->y;
   3348     float x1 = c1->x;
   3349     float y1 = c1->y;
   3350 
   3351     if (cov_xz < 0) {
   3352         swap(x0, x1);
   3353     }
   3354     if (cov_yz < 0) {
   3355         swap(y0, y1);
   3356     }
   3357 
   3358     *c0 = { x0, y0, c0->z };
   3359     *c1 = { x1, y1, c1->z };
   3360 }
   3361 
   3362 inline static void inset_bbox(Vector3 * __restrict c0, Vector3 * __restrict c1)
   3363 {
   3364     float bias = (8.0f / 255.0f) / 16.0f;
   3365     Vector3 inset = saturate((*c0 - *c1) / 16.0f - scalar_to_vector3(bias));
   3366     *c0 = saturate(*c0 - inset);
   3367     *c1 = saturate(*c1 + inset);
   3368 }
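
        // A minimal sketch of the box-fit path using the three helpers above (this
        // mirrors the Quality_Level0 portion of compress_bc1 below; `colors` and
        // `count` are assumed to come from the caller):
        //
        //   Vector3 c0, c1;
        //   fit_colors_bbox(colors, count, &c0, &c1);  // bounding box in color space
        //   inset_bbox(&c0, &c1);                      // shrink to reduce quantization error
        //   select_diagonal(colors, count, &c0, &c1);  // pick the best-correlated diagonal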
   3369 
   3370 
   3371 
   3372 // Single color lookup tables from:
   3373 // https://github.com/nothings/stb/blob/master/stb_dxt.h
   3374 static uint8 s_match5[256][2];
   3375 static uint8 s_match6[256][2];
   3376 
   3377 static void PrepareOptTable5(uint8 * table, Decoder decoder)
   3378 {
   3379     uint8 expand[32];
   3380     for (int i = 0; i < 32; i++) expand[i] = uint8((i << 3) | (i >> 2));
   3381 
   3382     for (int i = 0; i < 256; i++) {
   3383         int bestErr = 256 * 100;
   3384 
   3385         for (int mn = 0; mn < 32; mn++) {
   3386             for (int mx = 0; mx < 32; mx++) {
   3387                 int mine = expand[mn];
   3388                 int maxe = expand[mx];
   3389 
   3390                 int err;
   3391 
   3392                 int amd_r = (43 * maxe + 21 * mine + 32) >> 6;
   3393                 int amd_err = abs(amd_r - i);
   3394 
   3395                 int nv_r = ((2 * mx + mn) * 22) / 8;
   3396                 int nv_err = abs(nv_r - i);
   3397 
   3398                 int intel_r = (171 * maxe + 85 * mine + 128) >> 8;
   3399                 int intel_err = abs(intel_r - i);
   3400 
   3401                 if (decoder == Decoder_D3D10) {
   3402                     // DX10 spec says that interpolation must be within 3% of "correct" result,
   3403                     // add this as error term. (normally we'd expect a random distribution of
   3404                     // +-1.5% error, but nowhere in the spec does it say that the error has to be
   3405                     // unbiased - better safe than sorry).
   3406                     int r = (maxe * 2 + mine) / 3;
   3407                     err = abs(r - i) * 100 + abs(mx - mn) * 3;
   3408 
   3409                     // Instead, take the worst of the AMD, NVIDIA and Intel errors;
                            // this overrides the spec-based estimate computed above.
   3410                     err = max(max(amd_err, nv_err), intel_err);
   3411                 }
   3412                 else if (decoder == Decoder_NVIDIA) {
   3413                     err = nv_err;
   3414                 }
   3415                 else if (decoder == Decoder_Intel) {
   3416                     err = intel_err;
   3417                 }
   3418                 else /*if (decoder == Decoder_AMD)*/ {
   3419                     err = amd_err;
   3420                 }
   3421 
   3422                 if (err < bestErr) {
   3423                     bestErr = err;
   3424                     table[i * 2 + 0] = uint8(mx);
   3425                     table[i * 2 + 1] = uint8(mn);
   3426                 }
   3427             }
   3428         }
   3429     }
   3430 }
   3431 
   3432 static void PrepareOptTable6(uint8 * table, Decoder decoder)
   3433 {
   3434     uint8 expand[64];
   3435     for (int i = 0; i < 64; i++) expand[i] = uint8((i << 2) | (i >> 4));
   3436 
   3437     for (int i = 0; i < 256; i++) {
   3438         int bestErr = 256 * 100;
   3439 
   3440         for (int mn = 0; mn < 64; mn++) {
   3441             for (int mx = 0; mx < 64; mx++) {
   3442                 int mine = expand[mn];
   3443                 int maxe = expand[mx];
   3444 
   3445                 int err;
   3446 
   3447                 int amd_g = (43 * maxe + 21 * mine + 32) >> 6;
   3448                 int amd_err = abs(amd_g - i);
   3449 
   3450                 int nv_g = (256 * mine + (maxe - mine) / 4 + 128 + (maxe - mine) * 80) / 256;
   3451                 int nv_err = abs(nv_g - i);
   3452 
   3453                 int intel_g = (171 * maxe + 85 * mine + 128) >> 8;
   3454                 int intel_err = abs(intel_g - i);
   3455 
   3456                 if (decoder == Decoder_D3D10) {
   3457                     // DX10 spec says that interpolation must be within 3% of "correct" result,
   3458                     // add this as error term. (normally we'd expect a random distribution of
   3459                     // +-1.5% error, but nowhere in the spec does it say that the error has to be
   3460                     // unbiased - better safe than sorry).
   3461                     int g = (maxe * 2 + mine) / 3;
   3462                     err = abs(g - i) * 100 + abs(mx - mn) * 3;
   3463 
   3464                     // Instead, take the worst of the AMD, NVIDIA and Intel errors;
                            // this overrides the spec-based estimate computed above.
   3465                     err = max(max(amd_err, nv_err), intel_err);
   3466                 }
   3467                 else if (decoder == Decoder_NVIDIA) {
   3468                     err = nv_err;
   3469                 }
   3470                 else if (decoder == Decoder_Intel) {
   3471                     err = intel_err;
   3472                 }
   3473                 else /*if (decoder == Decoder_AMD)*/ {
   3474                     err = amd_err;
   3475                 }
   3476 
   3477                 if (err < bestErr) {
   3478                     bestErr = err;
   3479                     table[i * 2 + 0] = uint8(mx);
   3480                     table[i * 2 + 1] = uint8(mn);
   3481                 }
   3482             }
   3483         }
   3484     }
   3485 }
   3486 
   3487 
   3488 static void init_single_color_tables(Decoder decoder)
   3489 {
   3490     // Prepare single color lookup tables.
   3491     PrepareOptTable5(&s_match5[0][0], decoder);
   3492     PrepareOptTable6(&s_match6[0][0], decoder);
   3493 }
   3494 
   3495 static void init_decoder_weights(Decoder decoder)
   3496 {
   3497     w1_3 = 1.0f / 3; 
   3498     w2_3 = 2.0f / 3;
   3499 
   3500     if (decoder == Decoder_NVIDIA) {
   3501         // @@ Choose weights that approximate better nvidia's decoder.
   3502     }
   3503     else if (decoder == Decoder_AMD) {
   3504         w1_3 = 21.0f / 64;
   3505         w2_3 = 43.0f / 64;
   3506     }
   3507     else if (decoder == Decoder_Intel) {
   3508         w1_3 = 85.0f / 256;
   3509         w2_3 = 171.0f / 256;
   3510     }
   3511 
   3512     w1_9 = w1_3 * w1_3;
   3513     w4_9 = w2_3 * w2_3;
   3514     w2_9 = w1_3 * w2_3;
   3515 }
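
        // Example: the AMD decoder's 1/3 weight is 21/64 = 0.328125 rather than the
        // ideal 0.3333..., so the squared and cross terms above (w1_9, w2_9, w4_9) are
        // derived from the decoder-specific weights instead of being hardcoded.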
   3516 
   3517 // Single color compressor, based on:
   3518 // https://mollyrocket.com/forums/viewtopic.php?t=392
   3519 static void compress_dxt1_single_color_optimal(Color32 c, BlockBC1 * output)
   3520 {
   3521     output->col0.r = s_match5[c.r][0];
   3522     output->col0.g = s_match6[c.g][0];
   3523     output->col0.b = s_match5[c.b][0];
   3524     output->col1.r = s_match5[c.r][1];
   3525     output->col1.g = s_match6[c.g][1];
   3526     output->col1.b = s_match5[c.b][1];
   3527     output->indices = 0xaaaaaaaa;
   3528 
   3529     if (output->col0.u < output->col1.u)
   3530     {
   3531         swap(output->col0.u, output->col1.u);
   3532         output->indices ^= 0x55555555;
   3533     }
   3534 }
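
        // The tables store, for each 8-bit target value, the endpoint pair whose
        // 2/3:1/3 interpolation lands closest to it. indices = 0xaaaaaaaa selects
        // palette[2] (the 2/3*col0 + 1/3*col1 interpolant) for all 16 texels; when the
        // endpoints must be swapped to stay in four-color mode, XOR with 0x55555555
        // turns every index 2 into index 3, which is the same interpolant with the
        // endpoints reversed.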
   3535 
   3536 
   3537 static float compress_dxt1_cluster_fit(const Vector4 input_colors[16], const float input_weights[16], const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, bool three_color_mode, bool try_transparent_black, bool allow_transparent_black, BlockBC1 * output)
   3538 {
   3539     Vector3 metric_sqr = color_weights * color_weights;
   3540 
   3541     SummedAreaTable sat;
   3542     int sat_count = compute_sat(colors, weights, count, &sat);
   3543 
   3544     Vector3 start, end;
   3545     cluster_fit_four(sat, sat_count, metric_sqr, &start, &end);
   3546 
   3547     float best_error = output_block4(input_colors, input_weights, color_weights, start, end, output);
   3548 
   3549     if (three_color_mode) {
   3550         if (try_transparent_black) {
   3551             Vector3 tmp_colors[16];
   3552             float tmp_weights[16];
   3553             int tmp_count = skip_blacks(colors, weights, count, tmp_colors, tmp_weights);
   3554             if (!tmp_count) return best_error;
   3555 
   3556             sat_count = compute_sat(tmp_colors, tmp_weights, tmp_count, &sat);
   3557         }
   3558 
   3559         cluster_fit_three(sat, sat_count, metric_sqr, &start, &end);
   3560 
   3561         BlockBC1 three_color_block;
   3562         float three_color_error = output_block3(input_colors, input_weights, color_weights, allow_transparent_black, start, end, &three_color_block);
   3563 
   3564         if (three_color_error < best_error) {
   3565             best_error = three_color_error;
   3566             *output = three_color_block;
   3567         }
   3568     }
   3569 
   3570     return best_error;
   3571 }
   3572 
   3573 
   3574 static float refine_endpoints(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, bool three_color_mode, float input_error, BlockBC1 * output) {
   3575     // TODO:
   3576     // - Optimize palette evaluation when updating only one channel.
   3577     // - try all diagonals.
   3578 
   3579     // Things that don't help:
   3580     // - Alternate endpoint updates.
   3581     // - Randomize order.
   3582     // - If one direction does not improve, test opposite direction next.
   3583 
   3584     static const int8 deltas[16][3] = {
   3585         {1,0,0},
   3586         {0,1,0},
   3587         {0,0,1},
   3588 
   3589         {-1,0,0},
   3590         {0,-1,0},
   3591         {0,0,-1},
   3592 
   3593         {1,1,0},
   3594         {1,0,1},
   3595         {0,1,1},
   3596 
   3597         {-1,-1,0},
   3598         {-1,0,-1},
   3599         {0,-1,-1},
   3600 
   3601         {-1,1,0},
   3602         //{-1,0,1},
   3603 
   3604         {1,-1,0},
   3605         {0,-1,1},
   3606 
   3607         //{1,0,-1},
   3608         {0,1,-1},
   3609     };
   3610 
   3611     float best_error = input_error;
   3612 
   3613     int lastImprovement = 0;
   3614     for (int i = 0; i < 256; i++) {
   3615         BlockBC1 refined = *output;
   3616         int8 delta[3] = { deltas[i % 16][0], deltas[i % 16][1], deltas[i % 16][2] };
   3617 
   3618         if ((i / 16) & 1) {
   3619             refined.col0.r += delta[0];
   3620             refined.col0.g += delta[1];
   3621             refined.col0.b += delta[2];
   3622         }
   3623         else {
   3624             refined.col1.r += delta[0];
   3625             refined.col1.g += delta[1];
   3626             refined.col1.b += delta[2];
   3627         }
   3628 
   3629         if (!three_color_mode) {
   3630             if (refined.col0.u == refined.col1.u) refined.col1.g += 1;
   3631             if (refined.col0.u < refined.col1.u) swap(refined.col0.u, refined.col1.u);
   3632         }
   3633 
   3634         Vector3 palette[4];
                // Use the perturbed endpoints here (not output's), so the indices and
                // error below are computed for the candidate block being tested.
   3635         evaluate_palette(refined.col0, refined.col1, palette);
   3636 
   3637         refined.indices = compute_indices(input_colors, color_weights, palette);
   3638 
   3639         float refined_error = evaluate_mse(input_colors, input_weights, color_weights, &refined);
   3640         if (refined_error < best_error) {
   3641             best_error = refined_error;
   3642             *output = refined;
   3643             lastImprovement = i;
   3644         }
   3645 
   3646         // Early out if the last 32 steps didn't improve error.
   3647         if (i - lastImprovement > 32) break;
   3648     }
   3649 
   3650     return best_error;
   3651 }
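
        // The refinement above is a greedy hill climb: it cycles through the 16 deltas,
        // alternates which endpoint is perturbed every 16 iterations ((i / 16) & 1),
        // keeps any change that lowers the weighted MSE, and gives up after 32
        // consecutive steps without improvement.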
   3652 
   3653 struct Options {
   3654     float threshold = 0.0f;
   3655     bool box_fit = false;
   3656     bool least_squares_fit = false;
   3657     bool cluster_fit = false;
   3658     bool cluster_fit_3 = false;
   3659     bool cluster_fit_3_black_only = false;
   3660     bool endpoint_refinement = false;
   3661 };
   3662 
   3663 static Options setup_options(Quality level, bool enable_three_color_mode, bool enable_transparent_black) {
   3664     Options opt;
   3665 
   3666     switch (level) {
   3667         case Quality_Level0:            // Box fit only.
   3668             opt.box_fit = true;
   3669             opt.least_squares_fit = false;
   3670             break;
   3671 
   3672         case Quality_Level1:            // Box fit + least squares fit.
   3673             opt.box_fit = true;
   3674             opt.least_squares_fit = true;
   3675             break;
   3676 
   3677         case Quality_Level2:            // Cluster fit 4, threshold = 24.
   3678             opt.box_fit = true;
   3679             opt.least_squares_fit = true;
   3680             opt.cluster_fit = true;
   3681             opt.cluster_fit_3_black_only = enable_three_color_mode && enable_transparent_black;
   3682             opt.threshold = 1.0f / 24;
   3683             break;
   3684 
   3685         case Quality_Level3:            // Cluster fit 4, threshold = 32.
   3686             opt.box_fit = true;
   3687             opt.cluster_fit = true;
   3688             opt.cluster_fit_3_black_only = enable_three_color_mode && enable_transparent_black;
   3689             opt.threshold = 1.0f / 32;
   3690             break;
   3691 
   3692         case Quality_Level4:            // Cluster fit 4, threshold = 48.
   3693             opt.cluster_fit = true;
   3694             opt.cluster_fit_3_black_only = enable_three_color_mode && enable_transparent_black;
   3695             opt.threshold = 1.0f / 48;
   3696             break;
   3697 
   3698         case Quality_Level5:            // Cluster fit 4, threshold = 64.
   3699             opt.cluster_fit = true;
   3700             opt.cluster_fit_3_black_only = enable_three_color_mode && enable_transparent_black;
   3701             opt.threshold = 1.0f / 64;
   3702             break;
   3703 
   3704         case Quality_Level6:            // Cluster fit 4, threshold = 96.
   3705             opt.cluster_fit = true;
   3706             opt.cluster_fit_3_black_only = enable_three_color_mode && enable_transparent_black;
   3707             opt.threshold = 1.0f / 96;
   3708             break;
   3709 
   3710         case Quality_Level7:            // Cluster fit 4, threshold = 128.
   3711             opt.cluster_fit = true;
   3712             opt.cluster_fit_3_black_only = enable_three_color_mode && enable_transparent_black;
   3713             opt.threshold = 1.0f / 128;
   3714             break;
   3715 
   3716         case Quality_Level8:            // Cluster fit 4+3, threshold = 256.
   3717             opt.cluster_fit = true;
   3718             opt.cluster_fit_3 = enable_three_color_mode;
   3719             opt.threshold = 1.0f / 256;
   3720             break;
   3721 
   3722         case Quality_Level9:           // Cluster fit 4+3, threshold = 256 + Refinement.
   3723             opt.cluster_fit = true;
   3724             opt.cluster_fit_3 = enable_three_color_mode;
   3725             opt.threshold = 1.0f / 256;
   3726             opt.endpoint_refinement = true;
   3727             break;
   3728     }
   3729 
   3730     return opt;
   3731 }
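
        // The threshold is forwarded to reduce_colors in compress_bc1 below; higher
        // quality levels use a smaller value (1/256 vs 1/24), which presumably merges
        // fewer similar colors and so feeds more distinct colors into the cluster fit.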
   3732 
   3733 
   3734 static float compress_bc1(Quality level, const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, bool three_color_mode, bool three_color_black, BlockBC1 * output)
   3735 {
   3736     Options opt = setup_options(level, three_color_mode, three_color_black);
   3737 
   3738     Vector3 colors[16];
   3739     float weights[16];
   3740     bool any_black = false;
   3741     int count;
   3742     if (opt.cluster_fit) {
   3743         count = reduce_colors(input_colors, input_weights, 16, opt.threshold, colors, weights, &any_black);
   3744     }
   3745     else {
   3746         for (int i = 0; i < 16; i++) {
   3747             colors[i] = input_colors[i].xyz;
   3748         }
   3749         count = 16;
   3750     }
   3751 
   3752     if (count == 0) {
   3753         // Output trivial block.
   3754         output->col0.u = 0;
   3755         output->col1.u = 0;
   3756         output->indices = 0;
   3757         return 0;
   3758     }
   3759 
   3760     // Cluster fit cannot handle single color blocks, so encode them optimally.
   3761     if (count == 1) {
   3762         compress_dxt1_single_color_optimal(vector3_to_color32(colors[0]), output);
   3763         return evaluate_mse(input_colors, input_weights, color_weights, output);
   3764     }
   3765 
   3766     float error = FLT_MAX;
   3767 
   3768     // Quick end point selection.
   3769     if (opt.box_fit) {
   3770         Vector3 c0, c1;
   3771         fit_colors_bbox(colors, count, &c0, &c1);
   3772         inset_bbox(&c0, &c1);
   3773         select_diagonal(colors, count, &c0, &c1);
   3774         error = output_block4(input_colors, input_weights, color_weights, c0, c1, output);
   3775 
   3776         // Refine color for the selected indices.
   3777         if (opt.least_squares_fit && optimize_end_points4(output->indices, input_colors, 16, &c0, &c1)) {
   3778             BlockBC1 optimized_block;
   3779             float optimized_error = output_block4(input_colors, input_weights, color_weights, c0, c1, &optimized_block);
   3780 
   3781             if (optimized_error < error) {
   3782                 error = optimized_error;
   3783                 *output = optimized_block;
   3784             }
   3785         }
   3786     }
   3787 
   3788     if (opt.cluster_fit) {
   3789         // @@ Use current endpoints as input for initial PCA approximation?
   3790 
   3791         bool use_three_color_black = any_black && three_color_black;
   3792         bool use_three_color_mode = opt.cluster_fit_3 || (use_three_color_black && opt.cluster_fit_3_black_only);
   3793 
   3794         // Try cluster fit.
   3795         BlockBC1 cluster_fit_output;
   3796         float cluster_fit_error = compress_dxt1_cluster_fit(input_colors, input_weights, colors, weights, count, color_weights, use_three_color_mode, use_three_color_black, three_color_black, &cluster_fit_output);
   3797         if (cluster_fit_error < error) {
   3798             *output = cluster_fit_output;
   3799             error = cluster_fit_error;
   3800         }
   3801     }
   3802 
   3803     if (opt.endpoint_refinement) {
   3804         error = refine_endpoints(input_colors, input_weights, color_weights, three_color_mode, error, output);
   3805     }
   3806 
   3807     return error;
   3808 }
   3809 
   3810 
   3811 // Public API
   3812 
   3813 void init(Decoder decoder) {
   3814     s_decoder = decoder;
   3815     init_single_color_tables(decoder);
   3816     init_cluster_tables();
   3817     init_decoder_weights(decoder);
   3818 }
   3819 
   3820 void decode_bc1(const void * block, unsigned char rgba_block[16 * 4], Decoder decoder/*=Decoder_D3D10*/) {
   3821     decode_bc1((const BlockBC1 *)block, rgba_block, decoder);
   3822 }
   3823 
   3824 void decode_bc3(const void * block, unsigned char rgba_block[16 * 4], Decoder decoder/*=Decoder_D3D10*/) {
   3825     decode_bc3((const BlockBC3 *)block, rgba_block, decoder);
   3826 }
   3827 
   3828 // void decode_bc4(const void * block, bool snorm, float rgba_block[16 * 4], Decoder decoder/*=Decoder_D3D10*/) {
   3829 //     decode_bc4((const BlockBC4 *)block, snorm, rgba_block, decoder);
   3830 // }
   3831 
   3832 // void decode_bc5(const void * block, bool snorm, float rgba_block[16 * 4], Decoder decoder/*=Decoder_D3D10*/) {
   3833 //     decode_bc5((const BlockBC5 *)block, snorm, rgba_block, decoder);
   3834 // }
   3835 
   3836 
   3837 float evaluate_bc1_error(const unsigned char rgba_block[16 * 4], const void * dxt_block, Decoder decoder/*=Decoder_D3D10*/) {
   3838     return evaluate_bc1_error(rgba_block, (const BlockBC1 *)dxt_block, decoder);
   3839 }
   3840 
   3841 float evaluate_bc3_error(const unsigned char rgba_block[16 * 4], const void * dxt_block, bool alpha_blend, Decoder decoder/*=Decoder_D3D10*/) {
   3842     return evaluate_bc3_error(rgba_block, (const BlockBC3 *)dxt_block, alpha_blend, decoder);
   3843 }
   3844 
   3845 
   3846 float compress_bc1(Quality level, const float * input_colors, const float * input_weights, const float rgb[3], bool three_color_mode, bool three_color_black, void * output) {
   3847     return compress_bc1(level, (Vector4*)input_colors, input_weights, { rgb[0], rgb[1], rgb[2] }, three_color_mode, three_color_black, (BlockBC1*)output);
   3848 }
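
        // Example usage of the public BC1 API (a sketch; buffer contents are
        // illustrative, and channel weights of 1.0 are a neutral choice):
        //
        //   icbc::init(icbc::Decoder_D3D10);
        //
        //   float colors[16 * 4];   // 16 RGBA texels in [0, 1], row by row
        //   float weights[16];      // per-texel weights, e.g. all 1.0f
        //   const float color_weights[3] = { 1.0f, 1.0f, 1.0f };
        //
        //   unsigned char block[8]; // one 64-bit BC1 block
        //   float mse = icbc::compress_bc1(icbc::Quality_Default, colors, weights,
        //                                  color_weights, /*three_color_mode=*/true,
        //                                  /*three_color_black=*/false, block);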
   3849 
   3850 float compress_bc3(Quality level, const float * input_colors, const float * input_weights, const float rgb[3], bool six_alpha_mode, void * output) {
   3851     //return compress_bc3(level, (Vector4*)input_colors, input_weights, { rgb[0], rgb[1], rgb[2] }, six_alpha_mode, (BlockBC3*)output);
   3852     return 0.0f; // @@ BC3 compression not implemented yet; the forwarding call above is left disabled.
   3853 }
   3854 
   3855 } // icbc
   3856 
   3857 // // Do not pollute preprocessor definitions.
   3858 // #undef ICBC_SIMD
   3859 // #undef ICBC_ASSERT
   3860 
   3861 // #undef ICBC_SCALAR
   3862 // #undef ICBC_SSE2
   3863 // #undef ICBC_SSE41
   3864 // #undef ICBC_AVX1
   3865 // #undef ICBC_AVX2
   3866 // #undef ICBC_AVX512
   3867 // #undef ICBC_NEON
   3868 // #undef ICBC_VMX
   3869 
   3870 // #undef ICBC_USE_FMA
   3871 // #undef ICBC_USE_AVX2_PERMUTE2
   3872 // #undef ICBC_USE_AVX512_PERMUTE
   3873 // #undef ICBC_USE_NEON_VTL
   3874 
   3875 // #undef ICBC_PERFECT_ROUND
   3876 
   3877 #endif // ICBC_IMPLEMENTATION
   3878 
   3879 // Version History:
   3880 // v1.00 - Initial release.
   3881 // v1.01 - Added SPMD code path with AVX support.
   3882 // v1.02 - Removed SIMD code path.
   3883 // v1.03 - Quality levels. AVX512, Neon, Altivec, vectorized reduction and index selection.
   3884 // v1.04 - Automatic compile-time SIMD selection. Specify hw decoder at runtime. More optimizations.
   3885 // v1.05 - Bug fixes. Small optimizations.
   3886 
   3887 // Copyright (c) 2020 Ignacio Castano <castano@gmail.com>
   3888 //
   3889 // Permission is hereby granted, free of charge, to any person obtaining
   3890 // a copy of this software and associated documentation files (the
   3891 // "Software"), to	deal in the Software without restriction, including
   3892 // without limitation the rights to use, copy, modify, merge, publish,
   3893 // distribute, sublicense, and/or sell copies of the Software, and to
   3894 // permit persons to whom the Software is furnished to do so, subject to
   3895 // the following conditions:
   3896 //
   3897 // The above copyright notice and this permission notice shall be included
   3898 // in all copies or substantial portions of the Software.
   3899 //
   3900 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
   3901 // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   3902 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
   3903 // IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
   3904 // CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
   3905 // TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
   3906 // SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.