duckstation

duckstation, archived from the revision just before upstream changed it to a proprietary software project; this version is the libre one.
git clone https://git.neptards.moe/u3shit/duckstation.git
Log | Files | Refs | README | LICENSE

gsvector_nosimd.h (78262B)


      1 // SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <stenzek@gmail.com>
      2 // SPDX-License-Identifier: LGPL-3.0+
      3 
      4 // Implementation of GSVector4/GSVector4i when the host does not support any form of SIMD.
      5 
      6 #pragma once
      7 
      8 #include "common/types.h"
      9 
     10 #include <algorithm>
     11 #include <cmath>
     12 #include <cstring>
     13 
     14 #define GSVECTOR_HAS_UNSIGNED 1
     15 #define GSVECTOR_HAS_SRLV 1
     16 
     17 class GSVector2;
     18 class GSVector2i;
     19 class GSVector4;
     20 class GSVector4i;
     21 
     22 #define SSATURATE8(expr) static_cast<s8>(std::clamp<decltype(expr)>(expr, -128, 127))
     23 #define USATURATE8(expr) static_cast<u8>(std::clamp<decltype(expr)>(expr, 0, 255))
     24 #define SSATURATE16(expr) static_cast<s16>(std::clamp<decltype(expr)>(expr, -32768, 32767))
     25 #define USATURATE16(expr) static_cast<u16>(std::clamp<decltype(expr)>(expr, 0, 65535))
     26 
     27 #define ALL_LANES_8(expr)                                                                                              \
     28   GSVector2i ret;                                                                                                      \
     29   for (size_t i = 0; i < 8; i++)                                                                                       \
     30     expr;                                                                                                              \
     31   return ret;
     32 #define ALL_LANES_16(expr)                                                                                             \
     33   GSVector2i ret;                                                                                                      \
     34   for (size_t i = 0; i < 4; i++)                                                                                       \
     35     expr;                                                                                                              \
     36   return ret;
     37 #define ALL_LANES_32(expr)                                                                                             \
     38   GSVector2i ret;                                                                                                      \
     39   for (size_t i = 0; i < 2; i++)                                                                                       \
     40     expr;                                                                                                              \
     41   return ret;
     42 
     43 class alignas(16) GSVector2i
     44 {
     45   struct cxpr_init_tag
     46   {
     47   };
     48   static constexpr cxpr_init_tag cxpr_init{};
     49 
     50   constexpr GSVector2i(cxpr_init_tag, s32 x, s32 y) : S32{x, y} {}
     51 
     52   constexpr GSVector2i(cxpr_init_tag, s16 s0, s16 s1, s16 s2, s16 s3) : S16{s0, s1, s2, s3} {}
     53 
     54   constexpr GSVector2i(cxpr_init_tag, s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7)
     55     : S8{b0, b1, b2, b3, b4, b5, b6, b7}
     56   {
     57   }
     58 
     59 public:
     60   union
     61   {
     62     struct
     63     {
     64       s32 x, y;
     65     };
     66     struct
     67     {
     68       s32 r, g;
     69     };
     70     float F32[2];
     71     s8 S8[8];
     72     s16 S16[4];
     73     s32 S32[2];
     74     s64 S64[1];
     75     u8 U8[8];
     76     u16 U16[4];
     77     u32 U32[2];
     78     u64 U64[1];
     79   };
     80 
     81   GSVector2i() = default;
     82 
     83   ALWAYS_INLINE constexpr static GSVector2i cxpr(s32 x, s32 y) { return GSVector2i(cxpr_init, x, y); }
     84 
     85   ALWAYS_INLINE constexpr static GSVector2i cxpr(s32 x) { return GSVector2i(cxpr_init, x, x); }
     86 
     87   ALWAYS_INLINE constexpr static GSVector2i cxpr16(s16 x) { return GSVector2i(cxpr_init, x, x, x, x); }
     88 
     89   ALWAYS_INLINE constexpr static GSVector2i cxpr16(s16 s0, s16 s1, s16 s2, s16 s3)
     90   {
     91     return GSVector2i(cxpr_init, s0, s1, s2, s3);
     92   }
     93 
     94   ALWAYS_INLINE constexpr static GSVector2i cxpr8(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7)
     95   {
     96     return GSVector2i(cxpr_init, b0, b1, b2, b3, b4, b5, b6, b7);
     97   }
     98 
     99   ALWAYS_INLINE GSVector2i(s32 x, s32 y)
    100   {
    101     this->x = x;
    102     this->y = y;
    103   }
    104 
    105   ALWAYS_INLINE GSVector2i(s16 s0, s16 s1, s16 s2, s16 s3)
    106   {
    107     S16[0] = s0;
    108     S16[1] = s1;
    109     S16[2] = s2;
    110     S16[3] = s3;
    111   }
    112 
    113   ALWAYS_INLINE constexpr GSVector2i(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7)
    114     : S8{b0, b1, b2, b3, b4, b5, b6, b7}
    115   {
    116   }
    117 
    118   ALWAYS_INLINE GSVector2i(const GSVector2i& v) { std::memcpy(S32, v.S32, sizeof(S32)); }
    119 
    120   // MSVC has bad codegen for the constexpr version when applied to non-constexpr things (https://godbolt.org/z/h8qbn7),
    121   // so leave the non-constexpr version default
    122   ALWAYS_INLINE explicit GSVector2i(s32 i) { *this = i; }
    123 
    124   ALWAYS_INLINE explicit GSVector2i(const GSVector2& v);
    125 
    126   ALWAYS_INLINE static GSVector2i cast(const GSVector2& v);
    127 
    128   ALWAYS_INLINE void operator=(const GSVector2i& v) { std::memcpy(S32, v.S32, sizeof(S32)); }
    129   ALWAYS_INLINE void operator=(s32 i)
    130   {
    131     x = i;
    132     y = i;
    133   }
    134 
    135   ALWAYS_INLINE GSVector2i sat_i8(const GSVector2i& min, const GSVector2i& max) const
    136   {
    137     return max_i8(min).min_i8(max);
    138   }
    139   ALWAYS_INLINE GSVector2i sat_i16(const GSVector2i& min, const GSVector2i& max) const
    140   {
    141     return max_i16(min).min_i16(max);
    142   }
    143   ALWAYS_INLINE GSVector2i sat_i32(const GSVector2i& min, const GSVector2i& max) const
    144   {
    145     return max_i32(min).min_i32(max);
    146   }
    147 
    148   ALWAYS_INLINE GSVector2i sat_u8(const GSVector2i& min, const GSVector2i& max) const
    149   {
    150     return max_u8(min).min_u8(max);
    151   }
    152   ALWAYS_INLINE GSVector2i sat_u16(const GSVector2i& min, const GSVector2i& max) const
    153   {
    154     return max_u16(min).min_u16(max);
    155   }
    156   ALWAYS_INLINE GSVector2i sat_u32(const GSVector2i& min, const GSVector2i& max) const
    157   {
    158     return max_u32(min).min_u32(max);
    159   }
    160 
    161   GSVector2i min_i8(const GSVector2i& v) const { ALL_LANES_8(ret.S8[i] = std::min(S8[i], v.S8[i])); }
    162   GSVector2i max_i8(const GSVector2i& v) const { ALL_LANES_8(ret.S8[i] = std::max(S8[i], v.S8[i])); }
    163   GSVector2i min_i16(const GSVector2i& v) const { ALL_LANES_16(ret.S16[i] = std::min(S16[i], v.S16[i])); }
    164   GSVector2i max_i16(const GSVector2i& v) const { ALL_LANES_16(ret.S16[i] = std::max(S16[i], v.S16[i])); }
    165   GSVector2i min_i32(const GSVector2i& v) const { ALL_LANES_32(ret.S32[i] = std::min(S32[i], v.S32[i])); }
    166   GSVector2i max_i32(const GSVector2i& v) const { ALL_LANES_32(ret.S32[i] = std::max(S32[i], v.S32[i])); }
    167 
    168   GSVector2i min_u8(const GSVector2i& v) const { ALL_LANES_8(ret.U8[i] = std::min(U8[i], v.U8[i])); }
    169   GSVector2i max_u8(const GSVector2i& v) const { ALL_LANES_8(ret.U8[i] = std::max(U8[i], v.U8[i])); }
    170   GSVector2i min_u16(const GSVector2i& v) const { ALL_LANES_16(ret.U16[i] = std::min(U16[i], v.U16[i])); }
    171   GSVector2i max_u16(const GSVector2i& v) const { ALL_LANES_16(ret.U16[i] = std::max(U16[i], v.U16[i])); }
    172   GSVector2i min_u32(const GSVector2i& v) const { ALL_LANES_32(ret.U32[i] = std::min(U32[i], v.U32[i])); }
    173   GSVector2i max_u32(const GSVector2i& v) const { ALL_LANES_32(ret.U32[i] = std::max(U32[i], v.U32[i])); }
    174 
    175   s32 addv_s32() const { return (S32[0] + S32[1]); }
    176 
    177   u8 minv_u8() const
    178   {
    179     return std::min(
    180       U8[0],
    181       std::min(U8[1], std::min(U8[2], std::min(U8[3], std::min(U8[4], std::min(U8[5], std::min(U8[6], U8[7])))))));
    182   }
    183 
    184   u16 maxv_u8() const
    185   {
    186     return std::max(
    187       U8[0],
    188       std::max(U8[1], std::max(U8[2], std::max(U8[3], std::max(U8[4], std::max(U8[5], std::max(U8[6], U8[7])))))));
    189   }
    190 
    191   u16 minv_u16() const { return std::min(U16[0], std::min(U16[1], std::min(U16[2], U16[3]))); }
    192 
    193   u16 maxv_u16() const { return std::max(U16[0], std::max(U16[1], std::max(U16[2], U16[3]))); }
    194 
    195   s32 minv_s32() const { return std::min(x, y); }
    196 
    197   u32 minv_u32() const { return std::min(U32[0], U32[1]); }
    198 
    199   s32 maxv_s32() const { return std::max(x, y); }
    200 
    201   u32 maxv_u32() const { return std::max(U32[0], U32[1]); }
    202 
    203   ALWAYS_INLINE GSVector2i clamp8() const { return pu16().upl8(); }
    204 
    205   GSVector2i blend8(const GSVector2i& v, const GSVector2i& mask) const
    206   {
    207     GSVector2i ret;
    208     for (size_t i = 0; i < 8; i++)
    209       ret.U8[i] = (mask.U8[i] & 0x80) ? v.U8[i] : U8[i];
    210     return ret;
    211   }
    212 
    213   template<s32 mask>
    214   GSVector2i blend16(const GSVector2i& v) const
    215   {
    216     GSVector2i ret;
    217     for (size_t i = 0; i < 4; i++)
    218       ret.U16[i] = ((mask & (1 << i)) != 0) ? v.U16[i] : U16[i];
    219     return ret;
    220   }
    221 
    222   template<s32 mask>
    223   GSVector2i blend32(const GSVector2i& v) const
    224   {
    225     GSVector2i ret;
    226     for (size_t i = 0; i < 2; i++)
    227       ret.U32[i] = ((mask & (1 << i)) != 0) ? v.U32[i] : U32[i];
    228     return ret;
    229   }
    230 
    231   GSVector2i blend(const GSVector2i& v, const GSVector2i& mask) const
    232   {
    233     GSVector2i ret;
    234     ret.U64[0] = (v.U64[0] & mask.U64[0]);
    235     return ret;
    236   }
    237 
    238   ALWAYS_INLINE GSVector2i mix16(const GSVector2i& v) const { return blend16<0xa>(v); }
    239 
    240   GSVector2i shuffle8(const GSVector2i& mask) const
    241   {
    242     ALL_LANES_8(ret.S8[i] = (mask.S8[i] & 0x80) ? 0 : (S8[mask.S8[i] & 0xf]));
    243   }
    244 
    245   GSVector2i ps16() const { ALL_LANES_8(ret.S8[i] = SSATURATE8(S16[(i < 4) ? i : (i - 4)])); }
    246   GSVector2i pu16() const { ALL_LANES_8(ret.U8[i] = USATURATE8(U16[(i < 4) ? i : (i - 4)])); }
    247   GSVector2i ps32() const { ALL_LANES_16(ret.S16[i] = SSATURATE16(S32[(i < 2) ? i : (i - 2)])); }
    248   GSVector2i pu32() const { ALL_LANES_16(ret.U16[i] = USATURATE16(U32[(i < 2) ? i : (i - 2)])); }
    249 
    250   GSVector2i upl8() const { return GSVector2i(S8[0], 0, S8[1], 0, S8[2], 0, S8[3], 0); }
    251 
    252   GSVector2i upl16() const { return GSVector2i(S16[0], 0, S16[1], 0); }
    253 
    254   GSVector2i upl32() const { return GSVector2i(S32[0], 0); }
    255 
    256   GSVector2i i8to16() const { ALL_LANES_16(ret.S16[i] = S8[i]); }
    257 
    258   template<s32 v>
    259   GSVector2i srl() const
    260   {
    261     GSVector2i ret = {};
    262     if constexpr (v < 8)
    263     {
    264       for (s32 i = 0; i < (8 - v); i++)
    265         ret.U8[i] = U8[v + i];
    266     }
    267     return ret;
    268   }
    269 
    270   template<s32 v>
    271   GSVector2i sll() const
    272   {
    273     GSVector2i ret = {};
    274     if constexpr (v < 8)
    275     {
    276       for (s32 i = 0; i < (8 - v); i++)
    277         ret.U8[v + i] = U8[i];
    278     }
    279     return ret;
    280   }
    281 
    282   template<s32 v>
    283   GSVector2i sll16() const
    284   {
    285     ALL_LANES_16(ret.U16[i] = U16[i] << v);
    286   }
    287 
    288   GSVector2i sll16(s32 v) const { ALL_LANES_16(ret.U16[i] = U16[i] << v); }
    289 
    290   GSVector2i sllv16(const GSVector2i& v) const { ALL_LANES_16(ret.U16[i] = U16[i] << v.U16[i]); }
    291 
    292   template<s32 v>
    293   GSVector2i srl16() const
    294   {
    295     ALL_LANES_16(ret.U16[i] = U16[i] >> v);
    296   }
    297 
    298   GSVector2i srl16(s32 v) const { ALL_LANES_16(ret.U16[i] = U16[i] >> v); }
    299 
    300   GSVector2i srlv16(const GSVector2i& v) const { ALL_LANES_16(ret.U16[i] = U16[i] >> v.U16[i]); }
    301 
    302   template<s32 v>
    303   GSVector2i sra16() const
    304   {
    305     ALL_LANES_16(ret.S16[i] = S16[i] >> v);
    306   }
    307 
    308   GSVector2i sra16(s32 v) const { ALL_LANES_16(ret.S16[i] = S16[i] >> v); }
    309 
    310   GSVector2i srav16(const GSVector2i& v) const { ALL_LANES_16(ret.S16[i] = S16[i] >> v.S16[i]); }
    311 
    312   template<s32 v>
    313   GSVector2i sll32() const
    314   {
    315     ALL_LANES_32(ret.U32[i] = U32[i] << v);
    316   }
    317 
    318   GSVector2i sll32(s32 v) const { ALL_LANES_32(ret.U32[i] = U32[i] << v); }
    319 
    320   GSVector2i sllv32(const GSVector2i& v) const { ALL_LANES_32(ret.U32[i] = U32[i] << v.U32[i]); }
    321 
    322   template<s32 v>
    323   GSVector2i srl32() const
    324   {
    325     ALL_LANES_32(ret.U32[i] = U32[i] >> v);
    326   }
    327 
    328   GSVector2i srl32(s32 v) const { ALL_LANES_32(ret.U32[i] = U32[i] >> v); }
    329 
    330   GSVector2i srlv32(const GSVector2i& v) const { ALL_LANES_32(ret.U32[i] = U32[i] >> v.U32[i]); }
    331 
    332   template<s32 v>
    333   GSVector2i sra32() const
    334   {
    335     ALL_LANES_32(ret.S32[i] = S32[i] >> v);
    336   }
    337 
    338   GSVector2i sra32(s32 v) const { ALL_LANES_32(ret.S32[i] = S32[i] >> v); }
    339 
    340   GSVector2i srav32(const GSVector2i& v) const { ALL_LANES_32(ret.S32[i] = S32[i] >> v.S32[i]); }
    341 
    342   GSVector2i add8(const GSVector2i& v) const { ALL_LANES_8(ret.S8[i] = S8[i] + v.S8[i]); }
    343 
    344   GSVector2i add16(const GSVector2i& v) const { ALL_LANES_16(ret.S16[i] = S16[i] + v.S16[i]); }
    345 
    346   GSVector2i add32(const GSVector2i& v) const { ALL_LANES_32(ret.S32[i] = S32[i] + v.S32[i]); }
    347 
    348   GSVector2i adds8(const GSVector2i& v) const { ALL_LANES_8(ret.S8[i] = SSATURATE8(S8[i] + v.S8[i])); }
    349 
    350   GSVector2i adds16(const GSVector2i& v) const { ALL_LANES_16(ret.S16[i] = SSATURATE16(S16[i] + v.S16[i])); }
    351 
    352   GSVector2i addus8(const GSVector2i& v) const { ALL_LANES_8(ret.U8[i] = USATURATE8(U8[i] + v.U8[i])); }
    353 
    354   GSVector2i addus16(const GSVector2i& v) const { ALL_LANES_16(ret.U16[i] = USATURATE16(U16[i] + v.U16[i])); }
    355 
    356   GSVector2i sub8(const GSVector2i& v) const { ALL_LANES_8(ret.S8[i] = S8[i] - v.S8[i]); }
    357 
    358   GSVector2i sub16(const GSVector2i& v) const { ALL_LANES_16(ret.S16[i] = S16[i] - v.S16[i]); }
    359 
    360   GSVector2i sub32(const GSVector2i& v) const { ALL_LANES_32(ret.S32[i] = S32[i] - v.S32[i]); }
    361 
    362   GSVector2i subs8(const GSVector2i& v) const { ALL_LANES_8(ret.S8[i] = SSATURATE8(S8[i] - v.S8[i])); }
    363 
    364   GSVector2i subs16(const GSVector2i& v) const { ALL_LANES_16(ret.S16[i] = SSATURATE16(S16[i] - v.S16[i])); }
    365 
    366   GSVector2i subus8(const GSVector2i& v) const { ALL_LANES_8(ret.U8[i] = USATURATE8(U8[i] - v.U8[i])); }
    367 
    368   GSVector2i subus16(const GSVector2i& v) const { ALL_LANES_16(ret.U16[i] = USATURATE16(U16[i] - v.U16[i])); }
    369 
    370   GSVector2i avg8(const GSVector2i& v) const { ALL_LANES_8(ret.U8[i] = (U8[i] + v.U8[i]) >> 1); }
    371 
    372   GSVector2i avg16(const GSVector2i& v) const { ALL_LANES_16(ret.U16[i] = (U16[i] + v.U16[i]) >> 1); }
    373 
    374   GSVector2i mul16l(const GSVector2i& v) const { ALL_LANES_16(ret.S16[i] = S16[i] * v.S16[i]); }
    375 
    376   GSVector2i mul32l(const GSVector2i& v) const { ALL_LANES_32(ret.S32[i] = S32[i] * v.S32[i]); }
    377 
    378   ALWAYS_INLINE bool eq(const GSVector2i& v) const { return (std::memcmp(S32, v.S32, sizeof(S32))) == 0; }
    379 
    380   GSVector2i eq8(const GSVector2i& v) const { ALL_LANES_8(ret.S8[i] = (S8[i] == v.S8[i]) ? -1 : 0); }
    381   GSVector2i eq16(const GSVector2i& v) const { ALL_LANES_16(ret.S16[i] = (S16[i] == v.S16[i]) ? -1 : 0); }
    382   GSVector2i eq32(const GSVector2i& v) const { ALL_LANES_32(ret.S32[i] = (S32[i] == v.S32[i]) ? -1 : 0); }
    383 
    384   GSVector2i neq8(const GSVector2i& v) const { ALL_LANES_8(ret.S8[i] = (S8[i] != v.S8[i]) ? -1 : 0); }
    385   GSVector2i neq16(const GSVector2i& v) const { ALL_LANES_16(ret.S16[i] = (S16[i] != v.S16[i]) ? -1 : 0); }
    386   GSVector2i neq32(const GSVector2i& v) const { ALL_LANES_32(ret.S32[i] = (S32[i] != v.S32[i]) ? -1 : 0); }
    387 
    388   GSVector2i gt8(const GSVector2i& v) const { ALL_LANES_8(ret.S8[i] = (S8[i] > v.S8[i]) ? -1 : 0); }
    389   GSVector2i gt16(const GSVector2i& v) const { ALL_LANES_16(ret.S16[i] = (S16[i] > v.S16[i]) ? -1 : 0); }
    390   GSVector2i gt32(const GSVector2i& v) const { ALL_LANES_32(ret.S32[i] = (S32[i] > v.S32[i]) ? -1 : 0); }
    391 
    392   GSVector2i ge8(const GSVector2i& v) const { ALL_LANES_8(ret.S8[i] = (S8[i] >= v.S8[i]) ? -1 : 0); }
    393   GSVector2i ge16(const GSVector2i& v) const { ALL_LANES_16(ret.S16[i] = (S16[i] >= v.S16[i]) ? -1 : 0); }
    394   GSVector2i ge32(const GSVector2i& v) const { ALL_LANES_32(ret.S32[i] = (S32[i] >= v.S32[i]) ? -1 : 0); }
    395 
    396   GSVector2i lt8(const GSVector2i& v) const { ALL_LANES_8(ret.S8[i] = (S8[i] < v.S8[i]) ? -1 : 0); }
    397   GSVector2i lt16(const GSVector2i& v) const { ALL_LANES_16(ret.S16[i] = (S16[i] < v.S16[i]) ? -1 : 0); }
    398   GSVector2i lt32(const GSVector2i& v) const { ALL_LANES_32(ret.S32[i] = (S32[i] < v.S32[i]) ? -1 : 0); }
    399 
    400   GSVector2i le8(const GSVector2i& v) const { ALL_LANES_8(ret.S8[i] = (S8[i] <= v.S8[i]) ? -1 : 0); }
    401   GSVector2i le16(const GSVector2i& v) const { ALL_LANES_16(ret.S16[i] = (S16[i] <= v.S16[i]) ? -1 : 0); }
    402   GSVector2i le32(const GSVector2i& v) const { ALL_LANES_32(ret.S32[i] = (S32[i] <= v.S32[i]) ? -1 : 0); }
    403 
    404   ALWAYS_INLINE GSVector2i andnot(const GSVector2i& v) const
    405   {
    406     GSVector2i ret;
    407     ret.U64[0] = (~v.U64[0]) & U64[0];
    408     return ret;
    409   }
    410 
    411   s32 mask() const
    412   {
    413     return static_cast<s32>((static_cast<u32>(U8[0] >> 7) << 0) | (static_cast<u32>(U8[1] >> 7) << 1) |
    414                             (static_cast<u32>(U8[2] >> 7) << 2) | (static_cast<u32>(U8[3] >> 7) << 3) |
    415                             (static_cast<u32>(U8[4] >> 7) << 4) | (static_cast<u32>(U8[5] >> 7) << 5) |
    416                             (static_cast<u32>(U8[6] >> 7) << 6) | (static_cast<u32>(U8[7] >> 7) << 7));
    417   }
    418 
    419   ALWAYS_INLINE bool alltrue() const { return (U64[0] == 0xFFFFFFFFFFFFFFFFULL); }
    420 
    421   ALWAYS_INLINE bool allfalse() const { return (U64[0] == 0); }
    422 
    423   template<s32 i>
    424   ALWAYS_INLINE GSVector2i insert8(s32 a) const
    425   {
    426     GSVector2i ret = *this;
    427     ret.S8[i] = static_cast<s8>(a);
    428     return ret;
    429   }
    430 
    431   template<s32 i>
    432   ALWAYS_INLINE s32 extract8() const
    433   {
    434     return S8[i];
    435   }
    436 
    437   template<s32 i>
    438   ALWAYS_INLINE GSVector2i insert16(s32 a) const
    439   {
    440     GSVector2i ret = *this;
    441     ret.S16[i] = static_cast<s16>(a);
    442     return ret;
    443   }
    444 
    445   template<s32 i>
    446   ALWAYS_INLINE s32 extract16() const
    447   {
    448     return S16[i];
    449   }
    450 
    451   template<s32 i>
    452   ALWAYS_INLINE GSVector2i insert32(s32 a) const
    453   {
    454     GSVector2i ret = *this;
    455     ret.S32[i] = a;
    456     return ret;
    457   }
    458 
    459   template<s32 i>
    460   ALWAYS_INLINE s32 extract32() const
    461   {
    462     return S32[i];
    463   }
    464 
    465   ALWAYS_INLINE static GSVector2i load32(const void* p)
    466   {
    467     GSVector2i ret;
    468     std::memcpy(&ret.x, p, sizeof(s32));
    469     ret.y = 0;
    470     return ret;
    471   }
    472 
    473   ALWAYS_INLINE static GSVector2i load(const void* p)
    474   {
    475     GSVector2i ret;
    476     std::memcpy(ret.S32, p, sizeof(ret.S32));
    477     return ret;
    478   }
    479 
    480   ALWAYS_INLINE static GSVector2i load(s32 i)
    481   {
    482     GSVector2i ret;
    483     ret.x = i;
    484     return ret;
    485   }
    486 
    487   ALWAYS_INLINE static void store(void* p, const GSVector2i& v) { std::memcpy(p, v.S32, sizeof(S32)); }
    488 
    489   ALWAYS_INLINE static void store32(void* p, const GSVector2i& v) { std::memcpy(p, &v.x, sizeof(s32)); }
    490 
    491   ALWAYS_INLINE static s32 store(const GSVector2i& v) { return v.x; }
    492 
    493   ALWAYS_INLINE void operator&=(const GSVector2i& v) { U64[0] &= v.U64[0]; }
    494   ALWAYS_INLINE void operator|=(const GSVector2i& v) { U64[0] |= v.U64[0]; }
    495   ALWAYS_INLINE void operator^=(const GSVector2i& v) { U64[0] ^= v.U64[0]; }
    496 
    497   ALWAYS_INLINE friend GSVector2i operator&(const GSVector2i& v1, const GSVector2i& v2)
    498   {
    499     GSVector2i ret;
    500     ret.U64[0] = v1.U64[0] & v2.U64[0];
    501     return ret;
    502   }
    503 
    504   ALWAYS_INLINE friend GSVector2i operator|(const GSVector2i& v1, const GSVector2i& v2)
    505   {
    506     GSVector2i ret;
    507     ret.U64[0] = v1.U64[0] | v2.U64[0];
    508     return ret;
    509   }
    510 
    511   ALWAYS_INLINE friend GSVector2i operator^(const GSVector2i& v1, const GSVector2i& v2)
    512   {
    513     GSVector2i ret;
    514     ret.U64[0] = v1.U64[0] ^ v2.U64[0];
    515     return ret;
    516   }
    517 
    518   ALWAYS_INLINE friend GSVector2i operator&(const GSVector2i& v, s32 i) { return v & GSVector2i(i); }
    519 
    520   ALWAYS_INLINE friend GSVector2i operator|(const GSVector2i& v, s32 i) { return v | GSVector2i(i); }
    521 
    522   ALWAYS_INLINE friend GSVector2i operator^(const GSVector2i& v, s32 i) { return v ^ GSVector2i(i); }
    523 
    524   ALWAYS_INLINE friend GSVector2i operator~(const GSVector2i& v) { return v ^ v.eq32(v); }
    525 
    526   ALWAYS_INLINE static constexpr GSVector2i zero() { return GSVector2i::cxpr(0, 0); }
    527 
    528   ALWAYS_INLINE GSVector2i xy() const { return *this; }
    529   ALWAYS_INLINE GSVector2i xx() const { return GSVector2i(x, x); }
    530   ALWAYS_INLINE GSVector2i yx() const { return GSVector2i(y, x); }
    531   ALWAYS_INLINE GSVector2i yy() const { return GSVector2i(y, y); }
    532 };
    533 
    534 class alignas(16) GSVector2
    535 {
    536   struct cxpr_init_tag
    537   {
    538   };
    539   static constexpr cxpr_init_tag cxpr_init{};
    540 
    541   constexpr GSVector2(cxpr_init_tag, float x, float y) : F32{x, y} {}
    542 
    543   constexpr GSVector2(cxpr_init_tag, int x, int y) : I32{x, y} {}
    544 
    545 public:
    546   union
    547   {
    548     struct
    549     {
    550       float x, y;
    551     };
    552     struct
    553     {
    554       float r, g;
    555     };
    556     float F32[4];
    557     double F64[2];
    558     s8 I8[16];
    559     s16 I16[8];
    560     s32 I32[4];
    561     s64 I64[2];
    562     u8 U8[16];
    563     u16 U16[8];
    564     u32 U32[4];
    565     u64 U64[2];
    566   };
    567 
    568   GSVector2() = default;
    569 
    570   constexpr static GSVector2 cxpr(float x, float y) { return GSVector2(cxpr_init, x, y); }
    571 
    572   constexpr static GSVector2 cxpr(float x) { return GSVector2(cxpr_init, x, x); }
    573 
    574   constexpr static GSVector2 cxpr(int x, int y) { return GSVector2(cxpr_init, x, y); }
    575 
    576   constexpr static GSVector2 cxpr(int x) { return GSVector2(cxpr_init, x, x); }
    577 
    578   ALWAYS_INLINE GSVector2(float x, float y)
    579   {
    580     this->x = x;
    581     this->y = y;
    582   }
    583 
    584   ALWAYS_INLINE GSVector2(int x, int y)
    585   {
    586     this->x = static_cast<float>(x);
    587     this->y = static_cast<float>(y);
    588   }
    589 
    590   ALWAYS_INLINE explicit GSVector2(float f) { x = y = f; }
    591 
    592   ALWAYS_INLINE explicit GSVector2(int i) { x = y = static_cast<float>(i); }
    593 
    594   ALWAYS_INLINE explicit GSVector2(const GSVector2i& v);
    595 
    596   ALWAYS_INLINE static GSVector2 cast(const GSVector2i& v);
    597 
    598   ALWAYS_INLINE void operator=(float f) { x = y = f; }
    599 
    600   GSVector2 abs() const { return GSVector2(std::fabs(x), std::fabs(y)); }
    601 
    602   GSVector2 neg() const { return GSVector2(-x, -y); }
    603 
    604   GSVector2 rcp() const { return GSVector2(1.0f / x, 1.0f / y); }
    605 
    606   GSVector2 floor() const { return GSVector2(std::floor(x), std::floor(y)); }
    607 
    608   GSVector2 ceil() const { return GSVector2(std::ceil(x), std::ceil(y)); }
    609 
    610   GSVector2 sat(const GSVector2& min, const GSVector2& max) const
    611   {
    612     return GSVector2(std::clamp(x, min.x, max.x), std::clamp(y, min.y, max.y));
    613   }
    614 
    615   GSVector2 sat(const float scale = 255) const { return sat(zero(), GSVector2(scale)); }
    616 
    617   GSVector2 clamp(const float scale = 255) const { return min(GSVector2(scale)); }
    618 
    619   GSVector2 min(const GSVector2& v) const { return GSVector2(std::min(x, v.x), std::min(y, v.y)); }
    620 
    621   GSVector2 max(const GSVector2& v) const { return GSVector2(std::max(x, v.x), std::max(y, v.y)); }
    622 
    623   template<int mask>
    624   GSVector2 blend32(const GSVector2& v) const
    625   {
    626     return GSVector2(v.F32[mask & 1], v.F32[(mask >> 1) & 1]);
    627   }
    628 
    629   ALWAYS_INLINE GSVector2 blend32(const GSVector2& v, const GSVector2& mask) const
    630   {
    631     return GSVector2((mask.U32[0] & 0x80000000u) ? v.x : x, (mask.U32[1] & 0x80000000u) ? v.y : y);
    632   }
    633 
    634   ALWAYS_INLINE GSVector2 andnot(const GSVector2& v) const
    635   {
    636     GSVector2 ret;
    637     ret.U32[0] = ((~v.U32[0]) & U32[0]);
    638     ret.U32[1] = ((~v.U32[1]) & U32[1]);
    639     return ret;
    640   }
    641 
    642   ALWAYS_INLINE int mask() const { return (U32[0] >> 31) | ((U32[1] >> 30) & 2); }
    643 
    644   ALWAYS_INLINE bool alltrue() const { return (U64[0] == 0xFFFFFFFFFFFFFFFFULL); }
    645 
    646   ALWAYS_INLINE bool allfalse() const { return (U64[0] == 0); }
    647 
    648   ALWAYS_INLINE GSVector2 replace_nan(const GSVector2& v) const { return v.blend32(*this, *this == *this); }
    649 
    650   template<int src, int dst>
    651   ALWAYS_INLINE GSVector2 insert32(const GSVector2& v) const
    652   {
    653     GSVector2 ret = *this;
    654     ret.F32[dst] = v.F32[src];
    655     return ret;
    656   }
    657 
    658   template<int i>
    659   ALWAYS_INLINE int extract32() const
    660   {
    661     return I32[i];
    662   }
    663 
    664   ALWAYS_INLINE float dot(const GSVector2& v) const { return (x * v.x + y * v.y); }
    665 
    666   ALWAYS_INLINE static constexpr GSVector2 zero() { return GSVector2::cxpr(0.0f, 0.0f); }
    667 
    668   ALWAYS_INLINE static constexpr GSVector2 xffffffff()
    669   {
    670     GSVector2 ret = zero();
    671     ret.U64[0] = ~ret.U64[0];
    672     return ret;
    673   }
    674 
    675   ALWAYS_INLINE static GSVector2 load(float f) { return GSVector2(f, f); }
    676 
    677   ALWAYS_INLINE static GSVector2 load(const void* p)
    678   {
    679     GSVector2 ret;
    680     std::memcpy(ret.F32, p, sizeof(F32));
    681     return ret;
    682   }
    683 
    684   ALWAYS_INLINE static void store(void* p, const GSVector2& v) { std::memcpy(p, &v.F32, sizeof(F32)); }
    685 
    686   ALWAYS_INLINE GSVector2 operator-() const { return neg(); }
    687 
    688   void operator+=(const GSVector2& v_)
    689   {
    690     x = x + v_.x;
    691     y = y + v_.y;
    692   }
    693   void operator-=(const GSVector2& v_)
    694   {
    695     x = x - v_.x;
    696     y = y - v_.y;
    697   }
    698   void operator*=(const GSVector2& v_)
    699   {
    700     x = x * v_.x;
    701     y = y * v_.y;
    702   }
    703   void operator/=(const GSVector2& v_)
    704   {
    705     x = x / v_.x;
    706     y = y / v_.y;
    707   }
    708 
    709   void operator+=(const float v_)
    710   {
    711     x = x + v_;
    712     y = y + v_;
    713   }
    714   void operator-=(const float v_)
    715   {
    716     x = x - v_;
    717     y = y - v_;
    718   }
    719   void operator*=(const float v_)
    720   {
    721     x = x * v_;
    722     y = y * v_;
    723   }
    724   void operator/=(const float v_)
    725   {
    726     x = x / v_;
    727     y = y / v_;
    728   }
    729 
    730   void operator&=(const GSVector2& v_) { U64[0] &= v_.U64[0]; }
    731   void operator|=(const GSVector2& v_) { U64[0] |= v_.U64[0]; }
    732   void operator^=(const GSVector2& v_) { U64[0] ^= v_.U64[0]; }
    733 
    734   friend GSVector2 operator+(const GSVector2& v1, const GSVector2& v2) { return GSVector2(v1.x + v2.x, v1.y + v2.y); }
    735 
    736   friend GSVector2 operator-(const GSVector2& v1, const GSVector2& v2) { return GSVector2(v1.x - v2.x, v1.y - v2.y); }
    737 
    738   friend GSVector2 operator*(const GSVector2& v1, const GSVector2& v2) { return GSVector2(v1.x * v2.x, v1.y * v2.y); }
    739 
    740   friend GSVector2 operator/(const GSVector2& v1, const GSVector2& v2) { return GSVector2(v1.x / v2.x, v1.y / v2.y); }
    741 
    742   friend GSVector2 operator+(const GSVector2& v, float f) { return GSVector2(v.x + f, v.y + f); }
    743 
    744   friend GSVector2 operator-(const GSVector2& v, float f) { return GSVector2(v.x - f, v.y - f); }
    745 
    746   friend GSVector2 operator*(const GSVector2& v, float f) { return GSVector2(v.x * f, v.y * f); }
    747 
    748   friend GSVector2 operator/(const GSVector2& v, float f) { return GSVector2(v.x / f, v.y / f); }
    749 
    750   friend GSVector2 operator&(const GSVector2& v1, const GSVector2& v2)
    751   {
    752     GSVector2 ret;
    753     ret.U64[0] = v1.U64[0] & v2.U64[0];
    754     return ret;
    755   }
    756 
    757   ALWAYS_INLINE friend GSVector2 operator|(const GSVector2& v1, const GSVector2& v2)
    758   {
    759     GSVector2 ret;
    760     ret.U64[0] = v1.U64[0] | v2.U64[0];
    761     return ret;
    762   }
    763 
    764   ALWAYS_INLINE friend GSVector2 operator^(const GSVector2& v1, const GSVector2& v2)
    765   {
    766     GSVector2 ret;
    767     ret.U64[0] = v1.U64[0] ^ v2.U64[0];
    768     return ret;
    769   }
    770 
    771   ALWAYS_INLINE friend GSVector2 operator==(const GSVector2& v1, const GSVector2& v2)
    772   {
    773     GSVector2 ret;
    774     ret.I32[0] = (v1.x == v2.x) ? -1 : 0;
    775     ret.I32[1] = (v1.y == v2.y) ? -1 : 0;
    776     return ret;
    777   }
    778 
    779   ALWAYS_INLINE friend GSVector2 operator!=(const GSVector2& v1, const GSVector2& v2)
    780   {
    781     GSVector2 ret;
    782     ret.I32[0] = (v1.x != v2.x) ? -1 : 0;
    783     ret.I32[1] = (v1.y != v2.y) ? -1 : 0;
    784     return ret;
    785   }
    786 
    787   ALWAYS_INLINE friend GSVector2 operator>(const GSVector2& v1, const GSVector2& v2)
    788   {
    789     GSVector2 ret;
    790     ret.I32[0] = (v1.x > v2.x) ? -1 : 0;
    791     ret.I32[1] = (v1.y > v2.y) ? -1 : 0;
    792     return ret;
    793   }
    794 
    795   ALWAYS_INLINE friend GSVector2 operator<(const GSVector2& v1, const GSVector2& v2)
    796   {
    797     GSVector2 ret;
    798     ret.I32[0] = (v1.x < v2.x) ? -1 : 0;
    799     ret.I32[1] = (v1.y < v2.y) ? -1 : 0;
    800     return ret;
    801   }
    802 
    803   ALWAYS_INLINE friend GSVector2 operator>=(const GSVector2& v1, const GSVector2& v2)
    804   {
    805     GSVector2 ret;
    806     ret.I32[0] = (v1.x >= v2.x) ? -1 : 0;
    807     ret.I32[1] = (v1.y >= v2.y) ? -1 : 0;
    808     return ret;
    809   }
    810 
    811   ALWAYS_INLINE friend GSVector2 operator<=(const GSVector2& v1, const GSVector2& v2)
    812   {
    813     GSVector2 ret;
    814     ret.I32[0] = (v1.x <= v2.x) ? -1 : 0;
    815     ret.I32[1] = (v1.y <= v2.y) ? -1 : 0;
    816     return ret;
    817   }
    818 
    819   ALWAYS_INLINE GSVector2 xy() const { return *this; }
    820   ALWAYS_INLINE GSVector2 xx() const { return GSVector2(x, x); }
    821   ALWAYS_INLINE GSVector2 yx() const { return GSVector2(y, x); }
    822   ALWAYS_INLINE GSVector2 yy() const { return GSVector2(y, y); }
    823 };
    824 
    #undef ALL_LANES_8
    #undef ALL_LANES_16
    #undef ALL_LANES_32

    // Lane helpers for GSVector4i. Each macro forms a whole function body: it declares the
    // result `ret`, evaluates `expr` once per lane with the lane index in `i`, and returns `ret`.
    // The names `ret` and `i` are part of the contract, since `expr` refers to them.

    // 16 lanes of 8 bits.
    #define ALL_LANES_8(expr)            \
      GSVector4i ret;                    \
      for (size_t i = 0; i < 16; ++i)    \
      {                                  \
        expr;                            \
      }                                  \
      return ret;

    // 8 lanes of 16 bits.
    #define ALL_LANES_16(expr)           \
      GSVector4i ret;                    \
      for (size_t i = 0; i < 8; ++i)     \
      {                                  \
        expr;                            \
      }                                  \
      return ret;

    // 4 lanes of 32 bits.
    #define ALL_LANES_32(expr)           \
      GSVector4i ret;                    \
      for (size_t i = 0; i < 4; ++i)     \
      {                                  \
        expr;                            \
      }                                  \
      return ret;

    // 2 lanes of 64 bits.
    #define ALL_LANES_64(expr)           \
      GSVector4i ret;                    \
      for (size_t i = 0; i < 2; ++i)     \
      {                                  \
        expr;                            \
      }                                  \
      return ret;
    849 
    850 class alignas(16) GSVector4i
    851 {
    852   struct cxpr_init_tag
    853   {
    854   };
    855   static constexpr cxpr_init_tag cxpr_init{};
    856 
    857   constexpr GSVector4i(cxpr_init_tag, s32 x, s32 y, s32 z, s32 w) : S32{x, y, z, w} {}
    858 
    859   constexpr GSVector4i(cxpr_init_tag, s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7)
    860     : S16{s0, s1, s2, s3, s4, s5, s6, s7}
    861   {
    862   }
    863 
    864   constexpr GSVector4i(cxpr_init_tag, s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9, s8 b10,
    865                        s8 b11, s8 b12, s8 b13, s8 b14, s8 b15)
    866     : S8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15}
    867   {
    868   }
    869 
    870 public:
    871   union
    872   {
    873     struct
    874     {
    875       s32 x, y, z, w;
    876     };
    877     struct
    878     {
    879       s32 r, g, b, a;
    880     };
    881     struct
    882     {
    883       s32 left, top, right, bottom;
    884     };
    885     float F32[4];
    886     s8 S8[16];
    887     s16 S16[8];
    888     s32 S32[4];
    889     s64 S64[2];
    890     u8 U8[16];
    891     u16 U16[8];
    892     u32 U32[4];
    893     u64 U64[2];
    894   };
    895 
    896   GSVector4i() = default;
    897 
    898   ALWAYS_INLINE constexpr static GSVector4i cxpr(s32 x, s32 y, s32 z, s32 w)
    899   {
    900     return GSVector4i(cxpr_init, x, y, z, w);
    901   }
    902 
    903   ALWAYS_INLINE constexpr static GSVector4i cxpr(s32 x) { return GSVector4i(cxpr_init, x, x, x, x); }
    904 
    905   ALWAYS_INLINE constexpr static GSVector4i cxpr16(s16 x) { return GSVector4i(cxpr_init, x, x, x, x, x, x, x, x); }
    906 
    907   ALWAYS_INLINE constexpr static GSVector4i cxpr16(s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7)
    908   {
    909     return GSVector4i(cxpr_init, s0, s1, s2, s3, s4, s5, s6, s7);
    910   }
    911 
    912   ALWAYS_INLINE constexpr static GSVector4i cxpr8(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9,
    913                                                   s8 b10, s8 b11, s8 b12, s8 b13, s8 b14, s8 b15)
    914   {
    915     return GSVector4i(cxpr_init, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15);
    916   }
    917 
    918   ALWAYS_INLINE GSVector4i(s32 x, s32 y, s32 z, s32 w)
    919   {
    920     this->x = x;
    921     this->y = y;
    922     this->z = z;
    923     this->w = w;
    924   }
    925 
    926   ALWAYS_INLINE GSVector4i(s32 x, s32 y) { *this = load(x).upl32(load(y)); }
    927 
    928   ALWAYS_INLINE GSVector4i(s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7)
    929   {
    930     S16[0] = s0;
    931     S16[1] = s1;
    932     S16[2] = s2;
    933     S16[3] = s3;
    934     S16[4] = s4;
    935     S16[5] = s5;
    936     S16[6] = s6;
    937     S16[7] = s7;
    938   }
    939 
    940   ALWAYS_INLINE constexpr GSVector4i(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9, s8 b10,
    941                                      s8 b11, s8 b12, s8 b13, s8 b14, s8 b15)
    942     : S8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15}
    943   {
    944   }
    945 
    946   ALWAYS_INLINE GSVector4i(const GSVector4i& v) { std::memcpy(S32, v.S32, sizeof(S32)); }
    947   ALWAYS_INLINE explicit GSVector4i(const GSVector2i& v) : S32{v.S32[0], v.S32[1], 0, 0} {}
    948 
    949   // MSVC has bad codegen for the constexpr version when applied to non-constexpr things (https://godbolt.org/z/h8qbn7),
    950   // so leave the non-constexpr version default
    951   ALWAYS_INLINE explicit GSVector4i(s32 i) { *this = i; }
    952 
    953   ALWAYS_INLINE explicit GSVector4i(const GSVector4& v);
    954 
    955   ALWAYS_INLINE static GSVector4i cast(const GSVector4& v);
    956 
    957   ALWAYS_INLINE void operator=(const GSVector4i& v) { std::memcpy(S32, v.S32, sizeof(S32)); }
    958   ALWAYS_INLINE void operator=(s32 i)
    959   {
    960     x = i;
    961     y = i;
    962     z = i;
    963     w = i;
    964   }
    965 
    966   // rect
    967 
    968   ALWAYS_INLINE s32 width() const { return right - left; }
    969 
    970   ALWAYS_INLINE s32 height() const { return bottom - top; }
    971 
    972   ALWAYS_INLINE GSVector4i rsize() const
    973   {
    974     return sub32(xyxy()); // same as GSVector4i(0, 0, width(), height());
    975   }
    976 
    977   ALWAYS_INLINE s32 rarea() const { return width() * height(); }
    978 
    979   ALWAYS_INLINE bool rempty() const { return lt32(zwzw()).mask() != 0x00ff; }
    980 
    981   // TODO: Optimize for no-simd, this generates crap code.
    982   ALWAYS_INLINE GSVector4i runion(const GSVector4i& v) const { return min_i32(v).upl64(max_i32(v).srl<8>()); }
    983 
    984   ALWAYS_INLINE GSVector4i rintersect(const GSVector4i& v) const { return sat_i32(v); }
    985   ALWAYS_INLINE bool rintersects(const GSVector4i& v) const { return !rintersect(v).rempty(); }
    986   ALWAYS_INLINE bool rcontains(const GSVector4i& v) const { return rintersect(v).eq(v); }
    987 
    988   ALWAYS_INLINE u32 rgba32() const
    989   {
    990     GSVector4i v = *this;
    991 
    992     v = v.ps32(v);
    993     v = v.pu16(v);
    994 
    995     return (u32)store(v);
    996   }
    997 
    998   ALWAYS_INLINE GSVector4i sat_i8(const GSVector4i& min, const GSVector4i& max) const
    999   {
   1000     return max_i8(min).min_i8(max);
   1001   }
   1002   ALWAYS_INLINE GSVector4i sat_i8(const GSVector4i& minmax) const
   1003   {
   1004     return max_i8(minmax.xyxy()).min_i8(minmax.zwzw());
   1005   }
   1006   ALWAYS_INLINE GSVector4i sat_i16(const GSVector4i& min, const GSVector4i& max) const
   1007   {
   1008     return max_i16(min).min_i16(max);
   1009   }
   1010   ALWAYS_INLINE GSVector4i sat_i16(const GSVector4i& minmax) const
   1011   {
   1012     return max_i16(minmax.xyxy()).min_i16(minmax.zwzw());
   1013   }
   1014   ALWAYS_INLINE GSVector4i sat_i32(const GSVector4i& min, const GSVector4i& max) const
   1015   {
   1016     return max_i32(min).min_i32(max);
   1017   }
   1018   ALWAYS_INLINE GSVector4i sat_i32(const GSVector4i& minmax) const
   1019   {
   1020     return max_i32(minmax.xyxy()).min_i32(minmax.zwzw());
   1021   }
   1022 
   1023   ALWAYS_INLINE GSVector4i sat_u8(const GSVector4i& min, const GSVector4i& max) const
   1024   {
   1025     return max_u8(min).min_u8(max);
   1026   }
   1027   ALWAYS_INLINE GSVector4i sat_u8(const GSVector4i& minmax) const
   1028   {
   1029     return max_u8(minmax.xyxy()).min_u8(minmax.zwzw());
   1030   }
   1031   ALWAYS_INLINE GSVector4i sat_u16(const GSVector4i& min, const GSVector4i& max) const
   1032   {
   1033     return max_u16(min).min_u16(max);
   1034   }
   1035   ALWAYS_INLINE GSVector4i sat_u16(const GSVector4i& minmax) const
   1036   {
   1037     return max_u16(minmax.xyxy()).min_u16(minmax.zwzw());
   1038   }
   1039   ALWAYS_INLINE GSVector4i sat_u32(const GSVector4i& min, const GSVector4i& max) const
   1040   {
   1041     return max_u32(min).min_u32(max);
   1042   }
   1043   ALWAYS_INLINE GSVector4i sat_u32(const GSVector4i& minmax) const
   1044   {
   1045     return max_u32(minmax.xyxy()).min_u32(minmax.zwzw());
   1046   }
   1047 
   1048   GSVector4i min_i8(const GSVector4i& v) const { ALL_LANES_8(ret.S8[i] = std::min(S8[i], v.S8[i])); }
   1049   GSVector4i max_i8(const GSVector4i& v) const { ALL_LANES_8(ret.S8[i] = std::max(S8[i], v.S8[i])); }
   1050   GSVector4i min_i16(const GSVector4i& v) const { ALL_LANES_16(ret.S16[i] = std::min(S16[i], v.S16[i])); }
   1051   GSVector4i max_i16(const GSVector4i& v) const { ALL_LANES_16(ret.S16[i] = std::max(S16[i], v.S16[i])); }
   1052   GSVector4i min_i32(const GSVector4i& v) const { ALL_LANES_32(ret.S32[i] = std::min(S32[i], v.S32[i])); }
   1053   GSVector4i max_i32(const GSVector4i& v) const { ALL_LANES_32(ret.S32[i] = std::max(S32[i], v.S32[i])); }
   1054 
   1055   GSVector4i min_u8(const GSVector4i& v) const { ALL_LANES_8(ret.U8[i] = std::min(U8[i], v.U8[i])); }
   1056   GSVector4i max_u8(const GSVector4i& v) const { ALL_LANES_8(ret.U8[i] = std::max(U8[i], v.U8[i])); }
   1057   GSVector4i min_u16(const GSVector4i& v) const { ALL_LANES_16(ret.U16[i] = std::min(U16[i], v.U16[i])); }
   1058   GSVector4i max_u16(const GSVector4i& v) const { ALL_LANES_16(ret.U16[i] = std::max(U16[i], v.U16[i])); }
   1059   GSVector4i min_u32(const GSVector4i& v) const { ALL_LANES_32(ret.U32[i] = std::min(U32[i], v.U32[i])); }
   1060   GSVector4i max_u32(const GSVector4i& v) const { ALL_LANES_32(ret.U32[i] = std::max(U32[i], v.U32[i])); }
   1061 
   1062   GSVector4i madd_s16(const GSVector4i& v) const
   1063   {
   1064     ALL_LANES_32(ret.S32[i] = (S16[i * 2] * v.S16[i * 2]) + (S16[i * 2 + 1] * v.S16[i * 2 + 1]));
   1065   }
   1066 
   1067   GSVector4i addp_s32() const { return GSVector4i(x + y, z + w, 0, 0); }
   1068 
   1069   s32 addv_s32() const { return (S32[0] + S32[1] + S32[2] + S32[3]); }
   1070 
   1071   u8 minv_u8() const
   1072   {
   1073     return std::min(
   1074       U8[0],
   1075       std::min(
   1076         U8[1],
   1077         std::min(
   1078           U8[2],
   1079           std::min(
   1080             U8[3],
   1081             std::min(
   1082               U8[4],
   1083               std::min(
   1084                 U8[5],
   1085                 std::min(
   1086                   U8[6],
   1087                   std::min(
   1088                     U8[7],
   1089                     std::min(
   1090                       U8[9],
   1091                       std::min(U8[10],
   1092                                std::min(U8[11], std::min(U8[12], std::min(U8[13], std::min(U8[14], U8[15]))))))))))))));
   1093   }
   1094 
   1095   u16 maxv_u8() const
   1096   {
   1097     return std::max(
   1098       U8[0],
   1099       std::max(
   1100         U8[1],
   1101         std::max(
   1102           U8[2],
   1103           std::max(
   1104             U8[3],
   1105             std::max(
   1106               U8[4],
   1107               std::max(
   1108                 U8[5],
   1109                 std::max(
   1110                   U8[6],
   1111                   std::max(
   1112                     U8[7],
   1113                     std::max(
   1114                       U8[9],
   1115                       std::max(U8[10],
   1116                                std::max(U8[11], std::max(U8[12], std::max(U8[13], std::max(U8[14], U8[15]))))))))))))));
   1117   }
   1118 
   1119   u16 minv_u16() const
   1120   {
   1121     return std::min(
   1122       U16[0],
   1123       std::min(U16[1],
   1124                std::min(U16[2], std::min(U16[3], std::min(U16[4], std::min(U16[5], std::min(U16[6], U16[7])))))));
   1125   }
   1126 
   1127   u16 maxv_u16() const
   1128   {
   1129     return std::max(
   1130       U16[0],
   1131       std::max(U16[1],
   1132                std::max(U16[2], std::max(U16[3], std::max(U16[4], std::max(U16[5], std::max(U16[6], U16[7])))))));
   1133   }
   1134 
   1135   s32 minv_s32() const { return std::min(x, std::min(y, std::min(z, w))); }
   1136 
   1137   u32 minv_u32() const { return std::min(U32[0], std::min(U32[1], std::min(U32[2], U32[3]))); }
   1138 
   1139   s32 maxv_s32() const { return std::max(x, std::max(y, std::max(z, w))); }
   1140 
   1141   u32 maxv_u32() const { return std::max(U32[0], std::max(U32[1], std::max(U32[2], U32[3]))); }
   1142 
   1143   static s32 min_i16(s32 a, s32 b) { return store(load(a).min_i16(load(b))); }
   1144 
   1145   ALWAYS_INLINE GSVector4i clamp8() const { return pu16().upl8(); }
   1146 
   1147   GSVector4i blend8(const GSVector4i& v, const GSVector4i& mask) const
   1148   {
   1149     GSVector4i ret;
   1150     for (size_t i = 0; i < 16; i++)
   1151       ret.U8[i] = (mask.U8[i] & 0x80) ? v.U8[i] : U8[i];
   1152     return ret;
   1153   }
   1154 
   1155   template<s32 mask>
   1156   GSVector4i blend16(const GSVector4i& v) const
   1157   {
   1158     GSVector4i ret;
   1159     for (size_t i = 0; i < 8; i++)
   1160       ret.U16[i] = ((mask & (1 << i)) != 0) ? v.U16[i] : U16[i];
   1161     return ret;
   1162   }
   1163 
   1164   template<s32 mask>
   1165   GSVector4i blend32(const GSVector4i& v) const
   1166   {
   1167     GSVector4i ret;
   1168     for (size_t i = 0; i < 4; i++)
   1169       ret.U32[i] = ((mask & (1 << i)) != 0) ? v.U32[i] : U32[i];
   1170     return ret;
   1171   }
   1172 
   1173   GSVector4i blend(const GSVector4i& v, const GSVector4i& mask) const
   1174   {
   1175     GSVector4i ret;
   1176     for (size_t i = 0; i < 2; i++)
   1177       ret.U64[i] = (v.U64[i] & mask.U64[i]) | (U64[i] & ~mask.U64[i]);
   1178     return ret;
   1179   }
   1180 
   1181   ALWAYS_INLINE GSVector4i mix16(const GSVector4i& v) const { return blend16<0xaa>(v); }
   1182 
   1183   GSVector4i shuffle8(const GSVector4i& mask) const
   1184   {
   1185     ALL_LANES_8(ret.S8[i] = (mask.S8[i] & 0x80) ? 0 : (S8[mask.S8[i] & 0xf]));
   1186   }
   1187 
   1188   GSVector4i ps16(const GSVector4i& v) const { ALL_LANES_8(ret.S8[i] = SSATURATE8((i < 8) ? S16[i] : v.S16[i - 8])); }
   1189   GSVector4i ps16() const { ALL_LANES_8(ret.S8[i] = SSATURATE8(S16[(i < 8) ? i : (i - 8)])); }
   1190   GSVector4i pu16(const GSVector4i& v) const { ALL_LANES_8(ret.U8[i] = USATURATE8((i < 8) ? U16[i] : v.U16[i - 8])); }
   1191   GSVector4i pu16() const { ALL_LANES_8(ret.U8[i] = USATURATE8(U16[(i < 8) ? i : (i - 8)])); }
   1192   GSVector4i ps32(const GSVector4i& v) const
   1193   {
   1194     ALL_LANES_16(ret.U16[i] = SSATURATE16((i < 4) ? S32[i] : v.S32[i - 4]));
   1195   }
   1196   GSVector4i ps32() const { ALL_LANES_16(ret.S16[i] = SSATURATE16(S32[(i < 4) ? i : (i - 4)])); }
   1197   GSVector4i pu32(const GSVector4i& v) const
   1198   {
   1199     ALL_LANES_16(ret.U16[i] = USATURATE16((i < 4) ? U32[i] : v.U32[i - 4]));
   1200   }
   1201   GSVector4i pu32() const { ALL_LANES_16(ret.U16[i] = USATURATE16(U32[(i < 4) ? i : (i - 4)])); }
   1202 
   1203   GSVector4i upl8(const GSVector4i& v) const
   1204   {
   1205     return GSVector4i(S8[0], v.S8[0], S8[1], v.S8[1], S8[2], v.S8[2], S8[3], v.S8[3], S8[4], v.S8[4], S8[5], v.S8[5],
   1206                       S8[6], v.S8[6], S8[7], v.S8[7]);
   1207   }
   1208   GSVector4i uph8(const GSVector4i& v) const
   1209   {
   1210     return GSVector4i(S8[8], v.S8[8], S8[9], v.S8[9], S8[10], v.S8[10], S8[11], v.S8[11], S8[12], v.S8[12], S8[13],
   1211                       v.S8[13], S8[14], v.S8[14], S8[15], v.S8[15]);
   1212   }
   1213   GSVector4i upl16(const GSVector4i& v) const
   1214   {
   1215     return GSVector4i(S16[0], v.S16[0], S16[1], v.S16[1], S16[2], v.S16[2], S16[3], v.S16[3]);
   1216   }
   1217   GSVector4i uph16(const GSVector4i& v) const
   1218   {
   1219     return GSVector4i(S16[4], v.S16[4], S16[5], v.S16[5], S16[6], v.S16[6], S16[7], v.S16[7]);
   1220   }
   1221   GSVector4i upl32(const GSVector4i& v) const { return GSVector4i(S32[0], v.S32[0], S32[1], v.S32[1]); }
   1222   GSVector4i uph32(const GSVector4i& v) const { return GSVector4i(S32[2], v.S32[2], S32[3], v.S32[3]); }
   1223   GSVector4i upl64(const GSVector4i& v) const
   1224   {
   1225     GSVector4i ret;
   1226     ret.S64[0] = S64[0];
   1227     ret.S64[1] = v.S64[0];
   1228     return ret;
   1229   }
   1230   GSVector4i uph64(const GSVector4i& v) const
   1231   {
   1232     GSVector4i ret;
   1233     ret.S64[0] = S64[1];
   1234     ret.S64[1] = v.S64[1];
   1235     return ret;
   1236   }
   1237 
   1238   GSVector4i upl8() const
   1239   {
   1240     return GSVector4i(S8[0], 0, S8[1], 0, S8[2], 0, S8[3], 0, S8[4], 0, S8[5], 0, S8[6], 0, S8[7], 0);
   1241   }
   1242   GSVector4i uph8() const
   1243   {
   1244     return GSVector4i(S8[8], 0, S8[9], 0, S8[10], 0, S8[11], 0, S8[12], 0, S8[13], 0, S8[14], 0, S8[15], 0);
   1245   }
   1246 
   1247   GSVector4i upl16() const { return GSVector4i(S16[0], 0, S16[1], 0, S16[2], 0, S16[3], 0); }
   1248   GSVector4i uph16() const { return GSVector4i(S16[4], 0, S16[5], 0, S16[6], 0, S16[7], 0); }
   1249 
   1250   GSVector4i upl32() const { return GSVector4i(S32[0], 0, S32[1], 0); }
   1251   GSVector4i uph32() const { return GSVector4i(S32[2], 0, S32[3], 0); }
   1252   GSVector4i upl64() const
   1253   {
   1254     GSVector4i ret;
   1255     ret.S64[0] = S64[0];
   1256     ret.S64[1] = 0;
   1257     return ret;
   1258   }
   1259   GSVector4i uph64() const
   1260   {
   1261     GSVector4i ret;
   1262     ret.S64[0] = S64[1];
   1263     ret.S64[1] = 0;
   1264     return ret;
   1265   }
   1266 
   1267   GSVector4i s8to16() const { ALL_LANES_16(ret.S16[i] = S8[i]); }
   1268   GSVector4i s8to32() const { ALL_LANES_32(ret.S32[i] = S8[i]); }
   1269   GSVector4i s8to64() const { ALL_LANES_64(ret.S64[i] = S8[i]); }
   1270 
   1271   GSVector4i s16to32() const { ALL_LANES_32(ret.S32[i] = S16[i]); }
   1272   GSVector4i s16to64() const { ALL_LANES_64(ret.S64[i] = S16[i]); }
   1273   GSVector4i s32to64() const { ALL_LANES_64(ret.S64[i] = S32[i]); }
   1274   GSVector4i u8to16() const { ALL_LANES_64(ret.U16[i] = U8[i]); }
   1275   GSVector4i u8to32() const { ALL_LANES_32(ret.U32[i] = U8[i]); }
   1276   GSVector4i u8to64() const { ALL_LANES_64(ret.U64[i] = U8[i]); }
   1277   GSVector4i u16to32() const { ALL_LANES_32(ret.U32[i] = U16[i]); }
   1278   GSVector4i u16to64() const { ALL_LANES_64(ret.U64[i] = U16[i]); }
   1279   GSVector4i u32to64() const { ALL_LANES_64(ret.U64[i] = U32[i]); }
   1280 
   1281   template<s32 v>
   1282   GSVector4i srl() const
   1283   {
   1284     GSVector4i ret = {};
   1285     if constexpr (v < 16)
   1286     {
   1287       for (s32 i = 0; i < (16 - v); i++)
   1288         ret.U8[i] = U8[v + i];
   1289     }
   1290     return ret;
   1291   }
   1292 
   1293   template<s32 v>
   1294   GSVector4i srl(const GSVector4i& r)
   1295   {
   1296     // This sucks. Hopefully it's never used.
   1297     u8 concat[32];
   1298     std::memcpy(concat, U8, sizeof(u8) * 16);
   1299     std::memcpy(concat + 16, r.U8, sizeof(u8) * 16);
   1300 
   1301     GSVector4i ret;
   1302     std::memcpy(ret.U8, &concat[v], sizeof(u8) * 16);
   1303     return ret;
   1304   }
   1305 
   1306   template<s32 v>
   1307   GSVector4i sll() const
   1308   {
   1309     GSVector4i ret = {};
   1310     if constexpr (v < 16)
   1311     {
   1312       for (s32 i = 0; i < (16 - v); i++)
   1313         ret.U8[v + i] = U8[i];
   1314     }
   1315     return ret;
   1316   }
   1317 
   1318   template<s32 v>
   1319   GSVector4i sll16() const
   1320   {
   1321     ALL_LANES_16(ret.U16[i] = U16[i] << v);
   1322   }
   1323 
   1324   GSVector4i sll16(s32 v) const { ALL_LANES_16(ret.U16[i] = U16[i] << v); }
   1325 
   1326   GSVector4i sllv16(const GSVector4i& v) const { ALL_LANES_16(ret.U16[i] = U16[i] << v.U16[i]); }
   1327 
   1328   template<s32 v>
   1329   GSVector4i srl16() const
   1330   {
   1331     ALL_LANES_16(ret.U16[i] = U16[i] >> v);
   1332   }
   1333 
   1334   GSVector4i srl16(s32 v) const { ALL_LANES_16(ret.U16[i] = U16[i] >> v); }
   1335 
   1336   GSVector4i srlv16(const GSVector4i& v) const { ALL_LANES_16(ret.U16[i] = U16[i] >> v.U16[i]); }
   1337 
   1338   template<s32 v>
   1339   GSVector4i sra16() const
   1340   {
   1341     ALL_LANES_16(ret.S16[i] = S16[i] >> v);
   1342   }
   1343 
   1344   GSVector4i sra16(s32 v) const { ALL_LANES_16(ret.S16[i] = S16[i] >> v); }
   1345 
   1346   GSVector4i srav16(const GSVector4i& v) const { ALL_LANES_16(ret.S16[i] = S16[i] >> v.S16[i]); }
   1347 
   1348   template<s32 v>
   1349   GSVector4i sll32() const
   1350   {
   1351     ALL_LANES_32(ret.U32[i] = U32[i] << v);
   1352   }
   1353 
   1354   GSVector4i sll32(s32 v) const { ALL_LANES_32(ret.U32[i] = U32[i] << v); }
   1355 
   1356   GSVector4i sllv32(const GSVector4i& v) const { ALL_LANES_32(ret.U32[i] = U32[i] << v.U32[i]); }
   1357 
   1358   template<s32 v>
   1359   GSVector4i srl32() const
   1360   {
   1361     ALL_LANES_32(ret.U32[i] = U32[i] >> v);
   1362   }
   1363 
   1364   GSVector4i srl32(s32 v) const { ALL_LANES_32(ret.U32[i] = U32[i] >> v); }
   1365 
   1366   GSVector4i srlv32(const GSVector4i& v) const { ALL_LANES_32(ret.U32[i] = U32[i] >> v.U32[i]); }
   1367 
   1368   template<s32 v>
   1369   GSVector4i sra32() const
   1370   {
   1371     ALL_LANES_32(ret.S32[i] = S32[i] >> v);
   1372   }
   1373 
   1374   GSVector4i sra32(s32 v) const { ALL_LANES_32(ret.S32[i] = S32[i] >> v); }
   1375 
   1376   GSVector4i srav32(const GSVector4i& v) const { ALL_LANES_32(ret.S32[i] = S32[i] >> v.S32[i]); }
   1377 
   1378   template<s64 v>
   1379   GSVector4i sll64() const
   1380   {
   1381     ALL_LANES_64(ret.U64[i] = U64[i] << v);
   1382   }
   1383 
   1384   GSVector4i sll64(s32 v) const { ALL_LANES_64(ret.U64[i] = U64[i] << v); }
   1385 
   1386   GSVector4i sllv64(const GSVector4i& v) const { ALL_LANES_64(ret.U64[i] = U64[i] << v.U64[i]); }
   1387 
   1388   template<s64 v>
   1389   GSVector4i srl64() const
   1390   {
   1391     ALL_LANES_64(ret.U64[i] = U64[i] >> v);
   1392   }
   1393 
   1394   GSVector4i srl64(s32 v) const { ALL_LANES_64(ret.U64[i] = U64[i] >> v); }
   1395 
   1396   GSVector4i srlv64(const GSVector4i& v) const { ALL_LANES_64(ret.U64[i] = U64[i] >> v.U64[i]); }
   1397 
   1398   template<s64 v>
   1399   GSVector4i sra64() const
   1400   {
   1401     ALL_LANES_64(ret.S64[i] = S64[i] >> v);
   1402   }
   1403 
   1404   GSVector4i sra64(s32 v) const { ALL_LANES_64(ret.S64[i] = S64[i] >> v); }
   1405 
   1406   GSVector4i srav64(const GSVector4i& v) const { ALL_LANES_64(ret.S64[i] = S64[i] >> v.S64[i]); }
   1407 
   1408   GSVector4i add8(const GSVector4i& v) const { ALL_LANES_8(ret.S8[i] = S8[i] + v.S8[i]); }
   1409 
   1410   GSVector4i add16(const GSVector4i& v) const { ALL_LANES_16(ret.S16[i] = S16[i] + v.S16[i]); }
   1411 
   1412   GSVector4i add32(const GSVector4i& v) const { ALL_LANES_32(ret.S32[i] = S32[i] + v.S32[i]); }
   1413 
   1414   GSVector4i adds8(const GSVector4i& v) const { ALL_LANES_8(ret.S8[i] = SSATURATE8(S8[i] + v.S8[i])); }
   1415 
   1416   GSVector4i adds16(const GSVector4i& v) const { ALL_LANES_16(ret.S16[i] = SSATURATE16(S16[i] + v.S16[i])); }
   1417 
   1418   GSVector4i hadds16(const GSVector4i& v) const
   1419   {
   1420     return GSVector4i(SSATURATE16(S16[0] + S16[1]), SSATURATE16(S16[2] + S16[3]), SSATURATE16(S16[4] + S16[5]),
   1421                       SSATURATE16(S16[6] + S16[7]), SSATURATE16(v.S16[0] + v.S16[1]), SSATURATE16(v.S16[2] + v.S16[3]),
   1422                       SSATURATE16(v.S16[4] + v.S16[5]), SSATURATE16(v.S16[6] + v.S16[7]));
   1423   }
   1424 
   1425   GSVector4i addus8(const GSVector4i& v) const { ALL_LANES_8(ret.U8[i] = USATURATE8(U8[i] + v.U8[i])); }
   1426 
   1427   GSVector4i addus16(const GSVector4i& v) const { ALL_LANES_16(ret.U16[i] = USATURATE16(U16[i] + v.U16[i])); }
   1428 
   1429   GSVector4i sub8(const GSVector4i& v) const { ALL_LANES_8(ret.S8[i] = S8[i] - v.S8[i]); }
   1430 
   1431   GSVector4i sub16(const GSVector4i& v) const { ALL_LANES_16(ret.S16[i] = S16[i] - v.S16[i]); }
   1432 
   1433   GSVector4i sub32(const GSVector4i& v) const { ALL_LANES_32(ret.S32[i] = S32[i] - v.S32[i]); }
   1434 
   1435   GSVector4i subs8(const GSVector4i& v) const { ALL_LANES_8(ret.S8[i] = SSATURATE8(S8[i] - v.S8[i])); }
   1436 
   1437   GSVector4i subs16(const GSVector4i& v) const { ALL_LANES_16(ret.S16[i] = SSATURATE16(S16[i] - v.S16[i])); }
   1438 
   1439   GSVector4i subus8(const GSVector4i& v) const { ALL_LANES_8(ret.U8[i] = USATURATE8(U8[i] - v.U8[i])); }
   1440 
   1441   GSVector4i subus16(const GSVector4i& v) const { ALL_LANES_16(ret.U16[i] = USATURATE16(U16[i] - v.U16[i])); }
   1442 
   1443   GSVector4i avg8(const GSVector4i& v) const { ALL_LANES_8(ret.U8[i] = (U8[i] + v.U8[i]) >> 1); }
   1444 
   1445   GSVector4i avg16(const GSVector4i& v) const { ALL_LANES_16(ret.U16[i] = (U16[i] + v.U16[i]) >> 1); }
   1446 
   1447   GSVector4i mul16hs(const GSVector4i& v) const { ALL_LANES_16(ret.S16[i] = (S16[i] * v.S16[i]) >> 16); }
   1448 
   1449   GSVector4i mul16hu(const GSVector4i& v) const { ALL_LANES_16(ret.U16[i] = (U16[i] * v.U16[i]) >> 16); }
   1450 
   1451   GSVector4i mul16l(const GSVector4i& v) const { ALL_LANES_16(ret.S16[i] = S16[i] * v.S16[i]); }
   1452 
   1453   GSVector4i mul16hrs(const GSVector4i& v) const { ALL_LANES_16(ret.S16[i] = ((S16[i] * v.S16[i]) >> 14) + 1); }
   1454 
   1455   GSVector4i mul32l(const GSVector4i& v) const { ALL_LANES_32(ret.S32[i] = S32[i] * v.S32[i]); }
   1456 
   1457   ALWAYS_INLINE bool eq(const GSVector4i& v) const { return (std::memcmp(S32, v.S32, sizeof(S32))) == 0; }
   1458 
   1459   GSVector4i eq8(const GSVector4i& v) const { ALL_LANES_8(ret.S8[i] = (S8[i] == v.S8[i]) ? -1 : 0); }
   1460   GSVector4i eq16(const GSVector4i& v) const { ALL_LANES_16(ret.S16[i] = (S16[i] == v.S16[i]) ? -1 : 0); }
   1461   GSVector4i eq32(const GSVector4i& v) const { ALL_LANES_32(ret.S32[i] = (S32[i] == v.S32[i]) ? -1 : 0); }
   1462   GSVector4i eq64(const GSVector4i& v) const { ALL_LANES_64(ret.S64[i] = (S64[i] == v.S64[i]) ? -1 : 0); }
   1463 
   1464   GSVector4i neq8(const GSVector4i& v) const { ALL_LANES_8(ret.S8[i] = (S8[i] != v.S8[i]) ? -1 : 0); }
   1465   GSVector4i neq16(const GSVector4i& v) const { ALL_LANES_16(ret.S16[i] = (S16[i] != v.S16[i]) ? -1 : 0); }
   1466   GSVector4i neq32(const GSVector4i& v) const { ALL_LANES_32(ret.S32[i] = (S32[i] != v.S32[i]) ? -1 : 0); }
   1467 
   1468   GSVector4i gt8(const GSVector4i& v) const { ALL_LANES_8(ret.S8[i] = (S8[i] > v.S8[i]) ? -1 : 0); }
   1469   GSVector4i gt16(const GSVector4i& v) const { ALL_LANES_16(ret.S16[i] = (S16[i] > v.S16[i]) ? -1 : 0); }
   1470   GSVector4i gt32(const GSVector4i& v) const { ALL_LANES_32(ret.S32[i] = (S32[i] > v.S32[i]) ? -1 : 0); }
   1471 
   1472   GSVector4i ge8(const GSVector4i& v) const { ALL_LANES_8(ret.S8[i] = (S8[i] >= v.S8[i]) ? -1 : 0); }
   1473   GSVector4i ge16(const GSVector4i& v) const { ALL_LANES_16(ret.S16[i] = (S16[i] >= v.S16[i]) ? -1 : 0); }
   1474   GSVector4i ge32(const GSVector4i& v) const { ALL_LANES_32(ret.S32[i] = (S32[i] >= v.S32[i]) ? -1 : 0); }
   1475 
   1476   GSVector4i lt8(const GSVector4i& v) const { ALL_LANES_8(ret.S8[i] = (S8[i] < v.S8[i]) ? -1 : 0); }
   1477   GSVector4i lt16(const GSVector4i& v) const { ALL_LANES_16(ret.S16[i] = (S16[i] < v.S16[i]) ? -1 : 0); }
   1478   GSVector4i lt32(const GSVector4i& v) const { ALL_LANES_32(ret.S32[i] = (S32[i] < v.S32[i]) ? -1 : 0); }
   1479 
   1480   GSVector4i le8(const GSVector4i& v) const { ALL_LANES_8(ret.S8[i] = (S8[i] <= v.S8[i]) ? -1 : 0); }
   1481   GSVector4i le16(const GSVector4i& v) const { ALL_LANES_16(ret.S16[i] = (S16[i] <= v.S16[i]) ? -1 : 0); }
   1482   GSVector4i le32(const GSVector4i& v) const { ALL_LANES_32(ret.S32[i] = (S32[i] <= v.S32[i]) ? -1 : 0); }
   1483 
   1484   ALWAYS_INLINE GSVector4i andnot(const GSVector4i& v) const { ALL_LANES_64(ret.U64[i] = (~v.U64[i]) & U64[i]); }
   1485 
   1486   s32 mask() const
   1487   {
   1488     return static_cast<s32>((static_cast<u32>(U8[0] >> 7) << 0) | (static_cast<u32>(U8[1] >> 7) << 1) |
   1489                             (static_cast<u32>(U8[2] >> 7) << 2) | (static_cast<u32>(U8[3] >> 7) << 3) |
   1490                             (static_cast<u32>(U8[4] >> 7) << 4) | (static_cast<u32>(U8[5] >> 7) << 5) |
   1491                             (static_cast<u32>(U8[6] >> 7) << 6) | (static_cast<u32>(U8[7] >> 7) << 7) |
   1492                             (static_cast<u32>(U8[8] >> 7) << 8) | (static_cast<u32>(U8[9] >> 7) << 9) |
   1493                             (static_cast<u32>(U8[10] >> 7) << 10) | (static_cast<u32>(U8[11] >> 7) << 11) |
   1494                             (static_cast<u32>(U8[12] >> 7) << 12) | (static_cast<u32>(U8[13] >> 7) << 13) |
   1495                             (static_cast<u32>(U8[14] >> 7) << 14) | (static_cast<u32>(U8[15] >> 7) << 15));
   1496   }
   1497 
   1498   ALWAYS_INLINE bool alltrue() const { return ((U64[0] & U64[1]) == 0xFFFFFFFFFFFFFFFFULL); }
   1499 
   1500   ALWAYS_INLINE bool allfalse() const { return ((U64[0] | U64[1]) == 0); }
   1501 
   1502   template<s32 i>
   1503   ALWAYS_INLINE GSVector4i insert8(s32 a) const
   1504   {
   1505     GSVector4i ret = *this;
   1506     ret.S8[i] = static_cast<s8>(a);
   1507     return ret;
   1508   }
   1509 
   1510   template<s32 i>
   1511   ALWAYS_INLINE s32 extract8() const
   1512   {
   1513     return S8[i];
   1514   }
   1515 
   1516   template<s32 i>
   1517   ALWAYS_INLINE GSVector4i insert16(s32 a) const
   1518   {
   1519     GSVector4i ret = *this;
   1520     ret.S16[i] = static_cast<s16>(a);
   1521     return ret;
   1522   }
   1523 
   1524   template<s32 i>
   1525   ALWAYS_INLINE s32 extract16() const
   1526   {
   1527     return S16[i];
   1528   }
   1529 
   1530   template<s32 i>
   1531   ALWAYS_INLINE GSVector4i insert32(s32 a) const
   1532   {
   1533     GSVector4i ret = *this;
   1534     ret.S32[i] = a;
   1535     return ret;
   1536   }
   1537 
   1538   template<s32 i>
   1539   ALWAYS_INLINE s32 extract32() const
   1540   {
   1541     return S32[i];
   1542   }
   1543 
   1544   template<s32 i>
   1545   ALWAYS_INLINE GSVector4i insert64(s64 a) const
   1546   {
   1547     GSVector4i ret = *this;
   1548     ret.S64[i] = a;
   1549     return ret;
   1550   }
   1551 
   1552   template<s32 i>
   1553   ALWAYS_INLINE s64 extract64() const
   1554   {
   1555     return S64[i];
   1556   }
   1557 
   1558   ALWAYS_INLINE static GSVector4i loadnt(const void* p)
   1559   {
   1560     GSVector4i ret;
   1561     std::memcpy(&ret, p, sizeof(ret.S32));
   1562     return ret;
   1563   }
   1564 
   1565   ALWAYS_INLINE static GSVector4i load32(const void* p)
   1566   {
   1567     GSVector4i ret;
   1568     std::memcpy(&ret.x, p, sizeof(s32));
   1569     ret.y = 0;
   1570     ret.z = 0;
   1571     ret.w = 0;
   1572     return ret;
   1573   }
   1574 
   1575   ALWAYS_INLINE static GSVector4i loadl(const void* p)
   1576   {
   1577     GSVector4i ret;
   1578     std::memcpy(&ret.U64[0], p, sizeof(ret.U64[0]));
   1579     ret.U64[1] = 0;
   1580     return ret;
   1581   }
   1582 
   1583   ALWAYS_INLINE static GSVector4i loadh(const void* p)
   1584   {
   1585     GSVector4i ret;
   1586     ret.U64[0] = 0;
   1587     std::memcpy(&ret.U64[1], p, sizeof(ret.U64[1]));
   1588     return ret;
   1589   }
   1590 
   1591   ALWAYS_INLINE static GSVector4i loadh(const GSVector2i& v) { return loadh(&v); }
   1592 
   1593   template<bool aligned>
   1594   ALWAYS_INLINE static GSVector4i load(const void* p)
   1595   {
   1596     GSVector4i ret;
   1597     std::memcpy(ret.S32, p, sizeof(ret.S32));
   1598     return ret;
   1599   }
   1600 
   1601   ALWAYS_INLINE static GSVector4i load(s32 i)
   1602   {
   1603     GSVector4i ret;
   1604     ret.x = i;
   1605     ret.y = 0;
   1606     ret.z = 0;
   1607     ret.w = 0;
   1608     return ret;
   1609   }
   1610 
   1611   ALWAYS_INLINE static GSVector4i loadq(s64 i)
   1612   {
   1613     GSVector4i ret;
   1614     ret.S64[0] = i;
   1615     ret.S64[1] = 0;
   1616     return ret;
   1617   }
   1618 
   1619   ALWAYS_INLINE static void storent(void* p, const GSVector4i& v) { std::memcpy(p, v.S32, sizeof(v.S32)); }
   1620 
   1621   ALWAYS_INLINE static void storel(void* p, const GSVector4i& v) { std::memcpy(p, &v.S32[0], sizeof(s32) * 2); }
   1622 
   1623   ALWAYS_INLINE static void storeh(void* p, const GSVector4i& v) { std::memcpy(p, &v.S32[2], sizeof(s32) * 2); }
   1624 
   1625   ALWAYS_INLINE static void store(void* pl, void* ph, const GSVector4i& v)
   1626   {
   1627     GSVector4i::storel(pl, v);
   1628     GSVector4i::storeh(ph, v);
   1629   }
   1630 
   1631   template<bool aligned>
   1632   ALWAYS_INLINE static void store(void* p, const GSVector4i& v)
   1633   {
   1634     std::memcpy(p, v.S32, sizeof(S32));
   1635   }
   1636 
   1637   ALWAYS_INLINE static void store32(void* p, const GSVector4i& v) { std::memcpy(p, &v.x, sizeof(s32)); }
   1638 
   1639   ALWAYS_INLINE static s32 store(const GSVector4i& v) { return v.x; }
   1640 
   1641   ALWAYS_INLINE static s64 storeq(const GSVector4i& v) { return v.S64[0]; }
   1642 
   1643   ALWAYS_INLINE void operator&=(const GSVector4i& v)
   1644   {
   1645     U64[0] &= v.U64[0];
   1646     U64[1] &= v.U64[1];
   1647   }
   1648   ALWAYS_INLINE void operator|=(const GSVector4i& v)
   1649   {
   1650     U64[0] |= v.U64[0];
   1651     U64[1] |= v.U64[1];
   1652   }
   1653   ALWAYS_INLINE void operator^=(const GSVector4i& v)
   1654   {
   1655     U64[0] ^= v.U64[0];
   1656     U64[1] ^= v.U64[1];
   1657   }
   1658 
   1659   ALWAYS_INLINE friend GSVector4i operator&(const GSVector4i& v1, const GSVector4i& v2)
   1660   {
   1661     GSVector4i ret;
   1662     ret.U64[0] = v1.U64[0] & v2.U64[0];
   1663     ret.U64[1] = v1.U64[1] & v2.U64[1];
   1664     return ret;
   1665   }
   1666 
   1667   ALWAYS_INLINE friend GSVector4i operator|(const GSVector4i& v1, const GSVector4i& v2)
   1668   {
   1669     GSVector4i ret;
   1670     ret.U64[0] = v1.U64[0] | v2.U64[0];
   1671     ret.U64[1] = v1.U64[1] | v2.U64[1];
   1672     return ret;
   1673   }
   1674 
   1675   ALWAYS_INLINE friend GSVector4i operator^(const GSVector4i& v1, const GSVector4i& v2)
   1676   {
   1677     GSVector4i ret;
   1678     ret.U64[0] = v1.U64[0] ^ v2.U64[0];
   1679     ret.U64[1] = v1.U64[1] ^ v2.U64[1];
   1680     return ret;
   1681   }
   1682 
   1683   ALWAYS_INLINE friend GSVector4i operator&(const GSVector4i& v, s32 i) { return v & GSVector4i(i); }
   1684 
   1685   ALWAYS_INLINE friend GSVector4i operator|(const GSVector4i& v, s32 i) { return v | GSVector4i(i); }
   1686 
   1687   ALWAYS_INLINE friend GSVector4i operator^(const GSVector4i& v, s32 i) { return v ^ GSVector4i(i); }
   1688 
   1689   ALWAYS_INLINE friend GSVector4i operator~(const GSVector4i& v) { return v ^ v.eq32(v); }
   1690 
   1691   ALWAYS_INLINE static constexpr GSVector4i zero() { return GSVector4i::cxpr(0, 0, 0, 0); }
   1692 
   1693   ALWAYS_INLINE GSVector4i xyxy(const GSVector4i& v) const { return upl64(v); }
   1694 
   1695   ALWAYS_INLINE GSVector2i xy() const { return GSVector2i(x, y); }
   1696   ALWAYS_INLINE GSVector2i zw() const { return GSVector2i(z, w); }
   1697 
   1698   // clang-format off
   1699   // l/h/lh not implemented until needed
   1700 
   1701 #define VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
   1702     ALWAYS_INLINE GSVector4i xs##ys##zs##ws() const {return GSVector4i(S32[xn], S32[yn], S32[zn], S32[wn]);}
   1703 
   1704 #define VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \
   1705     VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \
   1706     VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \
   1707     VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2) \
   1708     VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3) \
   1709 
   1710 #define VECTOR4i_SHUFFLE_2(xs, xn, ys, yn) \
   1711     VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, x, 0) \
   1712     VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, y, 1) \
   1713     VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, z, 2) \
   1714     VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, w, 3) \
   1715 
   1716 #define VECTOR4i_SHUFFLE_1(xs, xn) \
   1717     VECTOR4i_SHUFFLE_2(xs, xn, x, 0) \
   1718     VECTOR4i_SHUFFLE_2(xs, xn, y, 1) \
   1719     VECTOR4i_SHUFFLE_2(xs, xn, z, 2) \
   1720     VECTOR4i_SHUFFLE_2(xs, xn, w, 3) \
   1721 
   1722   VECTOR4i_SHUFFLE_1(x, 0)
   1723     VECTOR4i_SHUFFLE_1(y, 1)
   1724     VECTOR4i_SHUFFLE_1(z, 2)
   1725     VECTOR4i_SHUFFLE_1(w, 3)
   1726 
   1727   // clang-format on
   1728 };
   1729 
   1730 class alignas(16) GSVector4
   1731 {
   1732   struct cxpr_init_tag
   1733   {
   1734   };
   1735   static constexpr cxpr_init_tag cxpr_init{};
   1736 
   1737   constexpr GSVector4(cxpr_init_tag, float x, float y, float z, float w) : F32{x, y, z, w} {}
   1738 
   1739   constexpr GSVector4(cxpr_init_tag, int x, int y, int z, int w) : I32{x, y, z, w} {}
   1740 
   1741   constexpr GSVector4(cxpr_init_tag, u64 x, u64 y) : U64{x, y} {}
   1742 
   1743   constexpr GSVector4(cxpr_init_tag, double x, double y) : F64{x, y} {}
   1744 
   1745 public:
   1746   union
   1747   {
   1748     struct
   1749     {
   1750       float x, y, z, w;
   1751     };
   1752     struct
   1753     {
   1754       float r, g, b, a;
   1755     };
   1756     struct
   1757     {
   1758       float left, top, right, bottom;
   1759     };
   1760     float F32[4];
   1761     double F64[2];
   1762     s8 I8[16];
   1763     s16 I16[8];
   1764     s32 I32[4];
   1765     s64 I64[2];
   1766     u8 U8[16];
   1767     u16 U16[8];
   1768     u32 U32[4];
   1769     u64 U64[2];
   1770   };
   1771 
   1772   GSVector4() = default;
   1773 
   1774   constexpr static GSVector4 cxpr(float x, float y, float z, float w) { return GSVector4(cxpr_init, x, y, z, w); }
   1775 
   1776   constexpr static GSVector4 cxpr(float x) { return GSVector4(cxpr_init, x, x, x, x); }
   1777 
   1778   constexpr static GSVector4 cxpr(int x, int y, int z, int w) { return GSVector4(cxpr_init, x, y, z, w); }
   1779 
   1780   constexpr static GSVector4 cxpr(int x) { return GSVector4(cxpr_init, x, x, x, x); }
   1781 
   1782   constexpr static GSVector4 cxpr64(u64 x, u64 y) { return GSVector4(cxpr_init, x, y); }
   1783 
   1784   constexpr static GSVector4 cxpr64(u64 x) { return GSVector4(cxpr_init, x, x); }
   1785 
   1786   constexpr static GSVector4 cxpr64(double x, double y) { return GSVector4(cxpr_init, x, y); }
   1787 
   1788   constexpr static GSVector4 cxpr64(double x) { return GSVector4(cxpr_init, x, x); }
   1789 
   1790   ALWAYS_INLINE GSVector4(float x, float y, float z, float w)
   1791   {
   1792     this->x = x;
   1793     this->y = y;
   1794     this->z = z;
   1795     this->w = w;
   1796   }
   1797 
   1798   ALWAYS_INLINE GSVector4(float x, float y)
   1799   {
   1800     this->x = x;
   1801     this->y = y;
   1802     this->z = 0.0f;
   1803     this->w = 0.0f;
   1804   }
   1805 
   1806   ALWAYS_INLINE GSVector4(int x, int y, int z, int w)
   1807   {
   1808     this->x = static_cast<float>(x);
   1809     this->y = static_cast<float>(y);
   1810     this->z = static_cast<float>(z);
   1811     this->w = static_cast<float>(w);
   1812   }
   1813 
   1814   ALWAYS_INLINE GSVector4(int x, int y)
   1815   {
   1816     this->x = static_cast<float>(x);
   1817     this->y = static_cast<float>(y);
   1818     this->z = 0.0f;
   1819     this->w = 0.0f;
   1820   }
   1821 
   1822   ALWAYS_INLINE explicit GSVector4(float f) { x = y = z = w = f; }
   1823 
   1824   ALWAYS_INLINE explicit GSVector4(int i) { x = y = z = w = static_cast<float>(i); }
   1825 
   1826   ALWAYS_INLINE explicit GSVector4(const GSVector2& v) : x(v.x), y(v.y), z(0.0f), w(0.0f) {}
   1827   ALWAYS_INLINE explicit GSVector4(const GSVector4i& v);
   1828 
   1829   ALWAYS_INLINE static GSVector4 cast(const GSVector4i& v);
   1830 
   1831   ALWAYS_INLINE static GSVector4 f64(double x, double y)
   1832   {
   1833     GSVector4 ret;
   1834     ret.F64[0] = x;
   1835     ret.F64[1] = y;
   1836     return ret;
   1837   }
   1838 
   1839   ALWAYS_INLINE static GSVector4 f64(double x)
   1840   {
   1841     GSVector4 ret;
   1842     ret.F64[0] = ret.F64[1] = x;
   1843     return ret;
   1844   }
   1845 
   1846   ALWAYS_INLINE void operator=(float f) { x = y = z = w = f; }
   1847 
   1848   u32 rgba32() const { return GSVector4i(*this).rgba32(); }
   1849 
   1850   ALWAYS_INLINE static GSVector4 rgba32(u32 rgba) { return GSVector4(GSVector4i::load((int)rgba).u8to32()); }
   1851 
   1852   ALWAYS_INLINE static GSVector4 unorm8(u32 rgba) { return rgba32(rgba) * GSVector4::cxpr(1.0f / 255.0f); }
   1853 
   1854   GSVector4 abs() const { return GSVector4(std::fabs(x), std::fabs(y), std::fabs(z), std::fabs(w)); }
   1855 
   1856   GSVector4 neg() const { return GSVector4(-x, -y, -z, -w); }
   1857 
   1858   GSVector4 rcp() const { return GSVector4(1.0f / x, 1.0f / y, 1.0f / z, 1.0f / w); }
   1859 
   1860   GSVector4 rcpnr() const
   1861   {
   1862     GSVector4 v_ = rcp();
   1863 
   1864     return (v_ + v_) - (v_ * v_) * *this;
   1865   }
   1866 
   1867   GSVector4 floor() const { return GSVector4(std::floor(x), std::floor(y), std::floor(z), std::floor(w)); }
   1868 
   1869   GSVector4 ceil() const { return GSVector4(std::ceil(x), std::ceil(y), std::ceil(z), std::ceil(w)); }
   1870 
   1871   GSVector4 madd(const GSVector4& a_, const GSVector4& b_) const { return *this * a_ + b_; }
   1872 
   1873   GSVector4 msub(const GSVector4& a_, const GSVector4& b_) const { return *this * a_ - b_; }
   1874 
   1875   GSVector4 nmadd(const GSVector4& a_, const GSVector4& b_) const { return b_ - *this * a_; }
   1876 
   1877   GSVector4 nmsub(const GSVector4& a_, const GSVector4& b_) const { return -b_ - *this * a_; }
   1878 
   1879   GSVector4 addm(const GSVector4& a_, const GSVector4& b_) const
   1880   {
   1881     return a_.madd(b_, *this); // *this + a * b
   1882   }
   1883 
   1884   GSVector4 subm(const GSVector4& a_, const GSVector4& b_) const
   1885   {
   1886     return a_.nmadd(b_, *this); // *this - a * b
   1887   }
   1888 
   1889   GSVector4 hadd() const { return GSVector4(x + y, z + w, x + y, z + w); }
   1890 
   1891   GSVector4 hadd(const GSVector4& v) const { return GSVector4(x + y, z + w, v.x + v.y, v.z + v.w); }
   1892 
   1893   GSVector4 hsub() const { return GSVector4(x - y, z - w, x - y, z - w); }
   1894 
   1895   GSVector4 hsub(const GSVector4& v) const { return GSVector4(x - y, z - w, v.x - v.y, v.z - v.w); }
   1896 
   1897   template<int i>
   1898   GSVector4 dp(const GSVector4& v) const
   1899   {
   1900     float res = 0.0f;
   1901     if constexpr (i & 0x10)
   1902       res += x * v.x;
   1903     if constexpr (i & 0x20)
   1904       res += y * v.y;
   1905     if constexpr (i & 0x40)
   1906       res += z * v.z;
   1907     if constexpr (i & 0x80)
   1908       res += w * v.w;
   1909     return GSVector4((i & 0x01) ? res : 0.0f, (i & 0x02) ? res : 0.0f, (i & 0x04) ? res : 0.0f,
   1910                      (i & 0x08) ? res : 0.0f);
   1911   }
   1912 
   1913   GSVector4 sat(const GSVector4& min, const GSVector4& max) const
   1914   {
   1915     return GSVector4(std::clamp(x, min.x, max.x), std::clamp(y, min.y, max.y), std::clamp(z, min.z, max.z),
   1916                      std::clamp(w, min.w, max.w));
   1917   }
   1918 
   1919   GSVector4 sat(const GSVector4& v) const
   1920   {
   1921     return GSVector4(std::clamp(x, v.x, v.z), std::clamp(y, v.y, v.w), std::clamp(z, v.x, v.z),
   1922                      std::clamp(w, v.y, v.w));
   1923   }
   1924 
   1925   GSVector4 sat(const float scale = 255) const { return sat(zero(), GSVector4(scale)); }
   1926 
   1927   GSVector4 clamp(const float scale = 255) const { return min(GSVector4(scale)); }
   1928 
   1929   GSVector4 min(const GSVector4& v) const
   1930   {
   1931     return GSVector4(std::min(x, v.x), std::min(y, v.y), std::min(z, v.z), std::min(w, v.w));
   1932   }
   1933 
   1934   GSVector4 max(const GSVector4& v) const
   1935   {
   1936     return GSVector4(std::max(x, v.x), std::max(y, v.y), std::max(z, v.z), std::max(w, v.w));
   1937   }
   1938 
   1939   template<int mask>
   1940   GSVector4 blend32(const GSVector4& v) const
   1941   {
   1942     return GSVector4(v.F32[mask & 1], v.F32[(mask >> 1) & 1], v.F32[(mask >> 2) & 1], v.F32[(mask >> 3) & 1]);
   1943   }
   1944 
   1945   ALWAYS_INLINE GSVector4 blend32(const GSVector4& v, const GSVector4& mask) const
   1946   {
   1947     return GSVector4((mask.U32[0] & 0x80000000u) ? v.x : x, (mask.U32[1] & 0x80000000u) ? v.y : y,
   1948                      (mask.U32[2] & 0x80000000u) ? v.z : z, (mask.U32[3] & 0x80000000u) ? v.w : w);
   1949   }
   1950 
   1951   GSVector4 upl(const GSVector4& v) const { return GSVector4(x, y, v.x, v.y); }
   1952 
   1953   GSVector4 uph(const GSVector4& v) const { return GSVector4(z, w, v.z, v.w); }
   1954 
   1955   GSVector4 upld(const GSVector4& v) const
   1956   {
   1957     GSVector4 ret;
   1958     ret.U64[0] = U64[0];
   1959     ret.U64[1] = v.U64[0];
   1960     return ret;
   1961   }
   1962 
   1963   GSVector4 uphd(const GSVector4& v) const
   1964   {
   1965     GSVector4 ret;
   1966     ret.U64[0] = U64[1];
   1967     ret.U64[1] = v.U64[1];
   1968     return ret;
   1969   }
   1970 
   1971   ALWAYS_INLINE GSVector4 l2h(const GSVector4& v) const { return GSVector4(x, y, v.x, v.y); }
   1972 
   1973   ALWAYS_INLINE GSVector4 h2l(const GSVector4& v) const { return GSVector4(v.z, v.w, z, w); }
   1974 
   1975   ALWAYS_INLINE GSVector4 andnot(const GSVector4& v) const
   1976   {
   1977     GSVector4 ret;
   1978     ret.U32[0] = ((~v.U32[0]) & U32[0]);
   1979     ret.U32[1] = ((~v.U32[1]) & U32[1]);
   1980     ret.U32[2] = ((~v.U32[2]) & U32[2]);
   1981     ret.U32[3] = ((~v.U32[3]) & U32[3]);
   1982     return ret;
   1983   }
   1984 
   1985   ALWAYS_INLINE int mask() const
   1986   {
   1987     return (U32[0] >> 31) | ((U32[1] >> 30) & 2) | ((U32[2] >> 29) & 4) | ((U32[3] >> 28) & 8);
   1988   }
   1989 
   1990   ALWAYS_INLINE bool alltrue() const { return ((U64[0] & U64[1]) == 0xFFFFFFFFFFFFFFFFULL); }
   1991 
   1992   ALWAYS_INLINE bool allfalse() const { return ((U64[0] | U64[1]) == 0); }
   1993 
   1994   ALWAYS_INLINE GSVector4 replace_nan(const GSVector4& v) const { return v.blend32(*this, *this == *this); }
   1995 
   1996   template<int src, int dst>
   1997   ALWAYS_INLINE GSVector4 insert32(const GSVector4& v) const
   1998   {
   1999     GSVector4 ret = *this;
   2000     ret.F32[dst] = v.F32[src];
   2001     return ret;
   2002   }
   2003 
   2004   template<int i>
   2005   ALWAYS_INLINE int extract32() const
   2006   {
   2007     return I32[i];
   2008   }
   2009 
   2010   template<int dst>
   2011   ALWAYS_INLINE GSVector4 insert64(double v) const
   2012   {
   2013     GSVector4 ret;
   2014     ret.F64[dst] = v;
   2015     return ret;
   2016   }
   2017 
   2018   template<int src>
   2019   ALWAYS_INLINE double extract64() const
   2020   {
   2021     return F64[src];
   2022   }
   2023 
   2024   ALWAYS_INLINE static constexpr GSVector4 zero() { return GSVector4::cxpr(0.0f, 0.0f, 0.0f, 0.0f); }
   2025 
   2026   ALWAYS_INLINE static constexpr GSVector4 xffffffff()
   2027   {
   2028     GSVector4 ret = zero();
   2029     ret.U64[0] = ~ret.U64[0];
   2030     ret.U64[1] = ~ret.U64[1];
   2031     return ret;
   2032   }
   2033 
   2034   ALWAYS_INLINE static GSVector4 loadl(const void* p)
   2035   {
   2036     GSVector4 ret;
   2037     std::memcpy(&ret.x, p, sizeof(float) * 2);
   2038     ret.z = 0.0f;
   2039     ret.w = 0.0f;
   2040     return ret;
   2041   }
   2042 
   2043   ALWAYS_INLINE static GSVector4 load(float f) { return GSVector4(f, f, f, f); }
   2044 
   2045   template<bool aligned>
   2046   ALWAYS_INLINE static GSVector4 load(const void* p)
   2047   {
   2048     GSVector4 ret;
   2049     std::memcpy(&ret.x, p, sizeof(float) * 4);
   2050     return ret;
   2051   }
   2052 
   2053   ALWAYS_INLINE static void storent(void* p, const GSVector4& v) { std::memcpy(p, &v, sizeof(v)); }
   2054 
   2055   ALWAYS_INLINE static void storel(void* p, const GSVector4& v) { std::memcpy(p, &v.x, sizeof(float) * 2); }
   2056 
   2057   ALWAYS_INLINE static void storeh(void* p, const GSVector4& v) { std::memcpy(p, &v.z, sizeof(float) * 2); }
   2058 
   2059   template<bool aligned>
   2060   ALWAYS_INLINE static void store(void* p, const GSVector4& v)
   2061   {
   2062     std::memcpy(p, v.F32, sizeof(F32));
   2063   }
   2064 
   2065   ALWAYS_INLINE static void store(float* p, const GSVector4& v) { *p = v.x; }
   2066 
   2067   ALWAYS_INLINE GSVector4 operator-() const { return neg(); }
   2068 
   2069   void operator+=(const GSVector4& v_)
   2070   {
   2071     x = x + v_.x;
   2072     y = y + v_.y;
   2073     z = z + v_.z;
   2074     w = w + v_.w;
   2075   }
   2076   void operator-=(const GSVector4& v_)
   2077   {
   2078     x = x - v_.x;
   2079     y = y - v_.y;
   2080     z = z - v_.z;
   2081     w = w - v_.w;
   2082   }
   2083   void operator*=(const GSVector4& v_)
   2084   {
   2085     x = x * v_.x;
   2086     y = y * v_.y;
   2087     z = z * v_.z;
   2088     w = w * v_.w;
   2089   }
   2090   void operator/=(const GSVector4& v_)
   2091   {
   2092     x = x / v_.x;
   2093     y = y / v_.y;
   2094     z = z / v_.z;
   2095     w = w / v_.w;
   2096   }
   2097 
   2098   void operator+=(const float v_)
   2099   {
   2100     x = x + v_;
   2101     y = y + v_;
   2102     z = z + v_;
   2103     w = w + v_;
   2104   }
   2105   void operator-=(const float v_)
   2106   {
   2107     x = x - v_;
   2108     y = y - v_;
   2109     z = z - v_;
   2110     w = w - v_;
   2111   }
   2112   void operator*=(const float v_)
   2113   {
   2114     x = x * v_;
   2115     y = y * v_;
   2116     z = z * v_;
   2117     w = w * v_;
   2118   }
   2119   void operator/=(const float v_)
   2120   {
   2121     x = x / v_;
   2122     y = y / v_;
   2123     z = z / v_;
   2124     w = w / v_;
   2125   }
   2126 
   2127   void operator&=(const GSVector4& v_)
   2128   {
   2129     U64[0] &= v_.U64[0];
   2130     U64[1] &= v_.U64[1];
   2131   }
   2132   void operator|=(const GSVector4& v_)
   2133   {
   2134     U64[0] |= v_.U64[0];
   2135     U64[1] |= v_.U64[1];
   2136   }
   2137   void operator^=(const GSVector4& v_)
   2138   {
   2139     U64[0] ^= v_.U64[0];
   2140     U64[1] ^= v_.U64[1];
   2141   }
   2142 
   2143   friend GSVector4 operator+(const GSVector4& v1, const GSVector4& v2)
   2144   {
   2145     return GSVector4(v1.x + v2.x, v1.y + v2.y, v1.z + v2.z, v1.w + v2.w);
   2146   }
   2147 
   2148   friend GSVector4 operator-(const GSVector4& v1, const GSVector4& v2)
   2149   {
   2150     return GSVector4(v1.x - v2.x, v1.y - v2.y, v1.z - v2.z, v1.w - v2.w);
   2151   }
   2152 
   2153   friend GSVector4 operator*(const GSVector4& v1, const GSVector4& v2)
   2154   {
   2155     return GSVector4(v1.x * v2.x, v1.y * v2.y, v1.z * v2.z, v1.w * v2.w);
   2156   }
   2157 
   2158   friend GSVector4 operator/(const GSVector4& v1, const GSVector4& v2)
   2159   {
   2160     return GSVector4(v1.x / v2.x, v1.y / v2.y, v1.z / v2.z, v1.w / v2.w);
   2161   }
   2162 
   2163   friend GSVector4 operator+(const GSVector4& v, float f) { return GSVector4(v.x + f, v.y + f, v.z + f, v.w + f); }
   2164 
   2165   friend GSVector4 operator-(const GSVector4& v, float f) { return GSVector4(v.x - f, v.y - f, v.z - f, v.w - f); }
   2166 
   2167   friend GSVector4 operator*(const GSVector4& v, float f) { return GSVector4(v.x * f, v.y * f, v.z * f, v.w * f); }
   2168 
   2169   friend GSVector4 operator/(const GSVector4& v, float f) { return GSVector4(v.x / f, v.y / f, v.z / f, v.w / f); }
   2170 
   2171   friend GSVector4 operator&(const GSVector4& v1, const GSVector4& v2)
   2172   {
   2173     GSVector4 ret;
   2174     ret.U64[0] = v1.U64[0] & v2.U64[0];
   2175     ret.U64[1] = v1.U64[1] & v2.U64[1];
   2176     return ret;
   2177   }
   2178 
   2179   ALWAYS_INLINE friend GSVector4 operator|(const GSVector4& v1, const GSVector4& v2)
   2180   {
   2181     GSVector4 ret;
   2182     ret.U64[0] = v1.U64[0] | v2.U64[0];
   2183     ret.U64[1] = v1.U64[1] | v2.U64[1];
   2184     return ret;
   2185   }
   2186 
   2187   ALWAYS_INLINE friend GSVector4 operator^(const GSVector4& v1, const GSVector4& v2)
   2188   {
   2189     GSVector4 ret;
   2190     ret.U64[0] = v1.U64[0] ^ v2.U64[0];
   2191     ret.U64[1] = v1.U64[1] ^ v2.U64[1];
   2192     return ret;
   2193   }
   2194 
   2195   ALWAYS_INLINE friend GSVector4 operator==(const GSVector4& v1, const GSVector4& v2)
   2196   {
   2197     GSVector4 ret;
   2198     ret.I32[0] = (v1.x == v2.x) ? -1 : 0;
   2199     ret.I32[1] = (v1.y == v2.y) ? -1 : 0;
   2200     ret.I32[2] = (v1.z == v2.z) ? -1 : 0;
   2201     ret.I32[3] = (v1.w == v2.w) ? -1 : 0;
   2202     return ret;
   2203   }
   2204 
   2205   ALWAYS_INLINE friend GSVector4 operator!=(const GSVector4& v1, const GSVector4& v2)
   2206   {
   2207     GSVector4 ret;
   2208     ret.I32[0] = (v1.x != v2.x) ? -1 : 0;
   2209     ret.I32[1] = (v1.y != v2.y) ? -1 : 0;
   2210     ret.I32[2] = (v1.z != v2.z) ? -1 : 0;
   2211     ret.I32[3] = (v1.w != v2.w) ? -1 : 0;
   2212     return ret;
   2213   }
   2214 
   2215   ALWAYS_INLINE friend GSVector4 operator>(const GSVector4& v1, const GSVector4& v2)
   2216   {
   2217     GSVector4 ret;
   2218     ret.I32[0] = (v1.x > v2.x) ? -1 : 0;
   2219     ret.I32[1] = (v1.y > v2.y) ? -1 : 0;
   2220     ret.I32[2] = (v1.z > v2.z) ? -1 : 0;
   2221     ret.I32[3] = (v1.w > v2.w) ? -1 : 0;
   2222     return ret;
   2223   }
   2224 
   2225   ALWAYS_INLINE friend GSVector4 operator<(const GSVector4& v1, const GSVector4& v2)
   2226   {
   2227     GSVector4 ret;
   2228     ret.I32[0] = (v1.x < v2.x) ? -1 : 0;
   2229     ret.I32[1] = (v1.y < v2.y) ? -1 : 0;
   2230     ret.I32[2] = (v1.z < v2.z) ? -1 : 0;
   2231     ret.I32[3] = (v1.w < v2.w) ? -1 : 0;
   2232     return ret;
   2233   }
   2234 
   2235   ALWAYS_INLINE friend GSVector4 operator>=(const GSVector4& v1, const GSVector4& v2)
   2236   {
   2237     GSVector4 ret;
   2238     ret.I32[0] = (v1.x >= v2.x) ? -1 : 0;
   2239     ret.I32[1] = (v1.y >= v2.y) ? -1 : 0;
   2240     ret.I32[2] = (v1.z >= v2.z) ? -1 : 0;
   2241     ret.I32[3] = (v1.w >= v2.w) ? -1 : 0;
   2242     return ret;
   2243   }
   2244 
   2245   ALWAYS_INLINE friend GSVector4 operator<=(const GSVector4& v1, const GSVector4& v2)
   2246   {
   2247     GSVector4 ret;
   2248     ret.I32[0] = (v1.x <= v2.x) ? -1 : 0;
   2249     ret.I32[1] = (v1.y <= v2.y) ? -1 : 0;
   2250     ret.I32[2] = (v1.z <= v2.z) ? -1 : 0;
   2251     ret.I32[3] = (v1.w <= v2.w) ? -1 : 0;
   2252     return ret;
   2253   }
   2254 
   2255   ALWAYS_INLINE GSVector4 mul64(const GSVector4& v_) const
   2256   {
   2257     GSVector4 ret;
   2258     ret.F64[0] = F64[0] * v_.F64[0];
   2259     ret.F64[1] = F64[1] * v_.F64[1];
   2260     return ret;
   2261   }
   2262 
   2263   ALWAYS_INLINE GSVector4 add64(const GSVector4& v_) const
   2264   {
   2265     GSVector4 ret;
   2266     ret.F64[0] = F64[0] + v_.F64[0];
   2267     ret.F64[1] = F64[1] + v_.F64[1];
   2268     return ret;
   2269   }
   2270 
   2271   ALWAYS_INLINE GSVector4 sub64(const GSVector4& v_) const
   2272   {
   2273     GSVector4 ret;
   2274     ret.F64[0] = F64[0] - v_.F64[0];
   2275     ret.F64[1] = F64[1] - v_.F64[1];
   2276     return ret;
   2277   }
   2278 
   2279   ALWAYS_INLINE GSVector4 div64(const GSVector4& v) const
   2280   {
   2281     return GSVector4::f64(F64[0] / v.F64[0], F64[1] / v.F64[1]);
   2282   }
   2283 
   2284   ALWAYS_INLINE GSVector4 gt64(const GSVector4& v) const
   2285   {
   2286     GSVector4 ret;
   2287     ret.U64[0] = (F64[0] > v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
   2288     ret.U64[1] = (F64[1] > v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
   2289     return ret;
   2290   }
   2291 
   2292   ALWAYS_INLINE GSVector4 eq64(const GSVector4& v) const
   2293   {
   2294     GSVector4 ret;
   2295     ret.U64[0] = (F64[0] == v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
   2296     ret.U64[1] = (F64[1] == v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
   2297     return ret;
   2298   }
   2299 
   2300   ALWAYS_INLINE GSVector4 lt64(const GSVector4& v) const
   2301   {
   2302     GSVector4 ret;
   2303     ret.U64[0] = (F64[0] < v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
   2304     ret.U64[1] = (F64[1] < v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
   2305     return ret;
   2306   }
   2307 
   2308   ALWAYS_INLINE GSVector4 ge64(const GSVector4& v) const
   2309   {
   2310     GSVector4 ret;
   2311     ret.U64[0] = (F64[0] >= v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
   2312     ret.U64[1] = (F64[1] >= v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
   2313     return ret;
   2314   }
   2315 
   2316   ALWAYS_INLINE GSVector4 le64(const GSVector4& v) const
   2317   {
   2318     GSVector4 ret;
   2319     ret.U64[0] = (F64[0] <= v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
   2320     ret.U64[1] = (F64[1] <= v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
   2321     return ret;
   2322   }
   2323 
   2324   ALWAYS_INLINE GSVector4 min64(const GSVector4& v) const
   2325   {
   2326     return GSVector4::f64(std::min(F64[0], v.F64[0]), std::min(F64[1], v.F64[1]));
   2327   }
   2328 
   2329   ALWAYS_INLINE GSVector4 max64(const GSVector4& v) const
   2330   {
   2331     return GSVector4::f64(std::max(F64[0], v.F64[0]), std::max(F64[1], v.F64[1]));
   2332   }
   2333 
   2334   ALWAYS_INLINE GSVector4 abs64() const { return *this & GSVector4::cxpr64(static_cast<u64>(0x7FFFFFFFFFFFFFFFULL)); }
   2335 
   2336   ALWAYS_INLINE GSVector4 neg64() const { return *this ^ GSVector4::cxpr64(static_cast<u64>(0x8000000000000000ULL(); }
   2337 
   2338   ALWAYS_INLINE GSVector4 sqrt64() const { return GSVector4::f64(std::sqrt(F64[0]), std::sqrt(F64[1])); }
   2339 
   2340   ALWAYS_INLINE GSVector4 sqr64() const { return GSVector4::f64(F64[0] * F64[0], F64[1] * F64[1]); }
   2341 
   2342   ALWAYS_INLINE GSVector4 floor64() const { return GSVector4::f64(std::floor(F64[0]), std::floor(F64[1])); }
   2343 
   2344   ALWAYS_INLINE static GSVector4 f32to64(const GSVector4& v_)
   2345   {
   2346     GSVector4 ret;
   2347     ret.F64[0] = v_.x;
   2348     ret.F64[1] = v_.y;
   2349     return ret;
   2350   }
   2351 
   2352   ALWAYS_INLINE static GSVector4 f32to64(const void* p)
   2353   {
   2354     float f[2];
   2355     std::memcpy(f, p, sizeof(f));
   2356     GSVector4 ret;
   2357     ret.F64[0] = f[0];
   2358     ret.F64[1] = f[1];
   2359     return ret;
   2360   }
   2361 
   2362   ALWAYS_INLINE GSVector4i f64toi32() const
   2363   {
   2364     return GSVector4i(static_cast<s32>(F64[0]), static_cast<s32>(F64[1]), 0, 0);
   2365   }
   2366 
   2367   // clang-format off
   2368 
   2369 #define VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
   2370     ALWAYS_INLINE GSVector4 xs##ys##zs##ws() const { return GSVector4(F32[xn], F32[yn], F32[zn], F32[wn]); } \
   2371     ALWAYS_INLINE GSVector4 xs##ys##zs##ws(const GSVector4& v_) const { return GSVector4(F32[xn], F32[yn], v_.F32[zn], v_.F32[wn]); }
   2372 
   2373 #define VECTOR4_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \
   2374     VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \
   2375     VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \
   2376     VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2) \
   2377     VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3) \
   2378 
   2379 #define VECTOR4_SHUFFLE_2(xs, xn, ys, yn) \
   2380     VECTOR4_SHUFFLE_3(xs, xn, ys, yn, x, 0) \
   2381     VECTOR4_SHUFFLE_3(xs, xn, ys, yn, y, 1) \
   2382     VECTOR4_SHUFFLE_3(xs, xn, ys, yn, z, 2) \
   2383     VECTOR4_SHUFFLE_3(xs, xn, ys, yn, w, 3) \
   2384 
   2385 #define VECTOR4_SHUFFLE_1(xs, xn) \
   2386     VECTOR4_SHUFFLE_2(xs, xn, x, 0) \
   2387     VECTOR4_SHUFFLE_2(xs, xn, y, 1) \
   2388     VECTOR4_SHUFFLE_2(xs, xn, z, 2) \
   2389     VECTOR4_SHUFFLE_2(xs, xn, w, 3) \
   2390 
   2391   VECTOR4_SHUFFLE_1(x, 0)
   2392     VECTOR4_SHUFFLE_1(y, 1)
   2393     VECTOR4_SHUFFLE_1(z, 2)
   2394     VECTOR4_SHUFFLE_1(w, 3)
   2395 
   2396   // clang-format on
   2397 
   2398   ALWAYS_INLINE GSVector4 broadcast32() const { return GSVector4(x, x, x, x); }
   2399 
   2400   ALWAYS_INLINE static GSVector4 broadcast32(const GSVector4& v) { return GSVector4(v.x, v.x, v.x, v.x); }
   2401 
   2402   ALWAYS_INLINE static GSVector4 broadcast32(const void* f)
   2403   {
   2404     float ff;
   2405     std::memcpy(&ff, f, sizeof(ff));
   2406     return GSVector4(ff, ff, ff, ff);
   2407   }
   2408 
   2409   ALWAYS_INLINE static GSVector4 broadcast64(const void* d)
   2410   {
   2411     GSVector4 ret;
   2412     std::memcpy(&ret.F64[0], d, sizeof(ret.F64[0]));
   2413     ret.F64[1] = ret.F64[0];
   2414     return ret;
   2415   }
   2416 };
   2417 
   2418 ALWAYS_INLINE GSVector2i::GSVector2i(const GSVector2& v)
   2419 {
   2420   x = static_cast<s32>(v.x);
   2421   y = static_cast<s32>(v.y);
   2422 }
   2423 
   2424 ALWAYS_INLINE GSVector2::GSVector2(const GSVector2i& v)
   2425 {
   2426   x = static_cast<float>(v.x);
   2427   y = static_cast<float>(v.y);
   2428 }
   2429 
   2430 ALWAYS_INLINE GSVector2i GSVector2i::cast(const GSVector2& v)
   2431 {
   2432   GSVector2i ret;
   2433   std::memcpy(&ret, &v, sizeof(ret));
   2434   return ret;
   2435 }
   2436 
   2437 ALWAYS_INLINE GSVector2 GSVector2::cast(const GSVector2i& v)
   2438 {
   2439   GSVector2 ret;
   2440   std::memcpy(&ret, &v, sizeof(ret));
   2441   return ret;
   2442 }
   2443 
   2444 ALWAYS_INLINE GSVector4i::GSVector4i(const GSVector4& v)
   2445 {
   2446   x = static_cast<s32>(v.x);
   2447   y = static_cast<s32>(v.y);
   2448   z = static_cast<s32>(v.z);
   2449   w = static_cast<s32>(v.w);
   2450 }
   2451 
   2452 ALWAYS_INLINE GSVector4::GSVector4(const GSVector4i& v)
   2453 {
   2454   x = static_cast<float>(v.x);
   2455   y = static_cast<float>(v.y);
   2456   z = static_cast<float>(v.z);
   2457   w = static_cast<float>(v.w);
   2458 }
   2459 
   2460 ALWAYS_INLINE GSVector4i GSVector4i::cast(const GSVector4& v)
   2461 {
   2462   GSVector4i ret;
   2463   std::memcpy(&ret, &v, sizeof(ret));
   2464   return ret;
   2465 }
   2466 
   2467 ALWAYS_INLINE GSVector4 GSVector4::cast(const GSVector4i& v)
   2468 {
   2469   GSVector4 ret;
   2470   std::memcpy(&ret, &v, sizeof(ret));
   2471   return ret;
   2472 }
   2473 
   2474 #undef SSATURATE8
   2475 #undef USATURATE8
   2476 #undef SSATURATE16
   2477 #undef USATURATE16
   2478 #undef ALL_LANES_8
   2479 #undef ALL_LANES_16
   2480 #undef ALL_LANES_32
   2481 #undef ALL_LANES_64