duckstation

duckstation, archived from the last revision before upstream relicensed it as proprietary software; this version is the libre one
git clone https://git.neptards.moe/u3shit/duckstation.git

gsvector_neon.h (107684B)


// SPDX-FileCopyrightText: 2021-2024 Connor McLaughlin <stenzek@gmail.com>
// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)

#include "common/intrin.h"
#include "common/types.h"

#include <algorithm>
#include <cmath>

#define GSVECTOR_HAS_UNSIGNED 1
#define GSVECTOR_HAS_SRLV 1

class GSVector2;
class GSVector2i;
class GSVector4;
class GSVector4i;

class alignas(16) GSVector2i
{
  struct cxpr_init_tag
  {
  };
  static constexpr cxpr_init_tag cxpr_init{};
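  // Tag dispatch: the cxpr() factories route through these constructors, which initialize the union arrays directly,
  // since the NEON intrinsics used by the runtime constructors are not constexpr.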

  constexpr GSVector2i(cxpr_init_tag, s32 x, s32 y) : S32{x, y} {}

  constexpr GSVector2i(cxpr_init_tag, s16 s0, s16 s1, s16 s2, s16 s3) : S16{s0, s1, s2, s3} {}

  constexpr GSVector2i(cxpr_init_tag, s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7)
    : S8{b0, b1, b2, b3, b4, b5, b6, b7}
  {
  }

public:
  union
  {
    struct
    {
      s32 x, y;
    };
    struct
    {
      s32 r, g;
    };
    float F32[2];
    s8 S8[8];
    s16 S16[4];
    s32 S32[2];
    s64 S64[1];
    u8 U8[8];
    u16 U16[4];
    u32 U32[2];
    u64 U64[1];
    int32x2_t v2s;
  };

  GSVector2i() = default;

  ALWAYS_INLINE constexpr static GSVector2i cxpr(s32 x, s32 y) { return GSVector2i(cxpr_init, x, y); }

  ALWAYS_INLINE constexpr static GSVector2i cxpr(s32 x) { return GSVector2i(cxpr_init, x, x); }

  ALWAYS_INLINE constexpr static GSVector2i cxpr16(s16 x) { return GSVector2i(cxpr_init, x, x, x, x); }

  ALWAYS_INLINE constexpr static GSVector2i cxpr16(s16 s0, s16 s1, s16 s2, s16 s3)
  {
    return GSVector2i(cxpr_init, s0, s1, s2, s3);
  }

  ALWAYS_INLINE constexpr static GSVector2i cxpr8(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7)
  {
    return GSVector2i(cxpr_init, b0, b1, b2, b3, b4, b5, b6, b7);
  }

  ALWAYS_INLINE GSVector2i(s32 x, s32 y) { v2s = vset_lane_s32(y, vdup_n_s32(x), 1); }

  ALWAYS_INLINE GSVector2i(s16 s0, s16 s1, s16 s2, s16 s3) : S16{s0, s1, s2, s3} {}

  ALWAYS_INLINE constexpr GSVector2i(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7)
    : S8{b0, b1, b2, b3, b4, b5, b6, b7}
  {
  }

  // MSVC has bad codegen for the constexpr version when applied to non-constexpr things (https://godbolt.org/z/h8qbn7),
  // so leave the non-constexpr version default
  ALWAYS_INLINE explicit GSVector2i(int i) { *this = i; }

  ALWAYS_INLINE constexpr explicit GSVector2i(int32x2_t m) : v2s(m) {}

  ALWAYS_INLINE explicit GSVector2i(const GSVector2& v);

  ALWAYS_INLINE static GSVector2i cast(const GSVector2& v);

  ALWAYS_INLINE void operator=(int i) { v2s = vdup_n_s32(i); }

  ALWAYS_INLINE operator int32x2_t() const { return v2s; }

  ALWAYS_INLINE GSVector2i sat_i8(const GSVector2i& min, const GSVector2i& max) const
  {
    return max_i8(min).min_i8(max);
  }
  ALWAYS_INLINE GSVector2i sat_i16(const GSVector2i& min, const GSVector2i& max) const
  {
    return max_i16(min).min_i16(max);
  }
  ALWAYS_INLINE GSVector2i sat_i32(const GSVector2i& min, const GSVector2i& max) const
  {
    return max_i32(min).min_i32(max);
  }

  ALWAYS_INLINE GSVector2i sat_u8(const GSVector2i& min, const GSVector2i& max) const
  {
    return max_u8(min).min_u8(max);
  }
  ALWAYS_INLINE GSVector2i sat_u16(const GSVector2i& min, const GSVector2i& max) const
  {
    return max_u16(min).min_u16(max);
  }
  ALWAYS_INLINE GSVector2i sat_u32(const GSVector2i& min, const GSVector2i& max) const
  {
    return max_u32(min).min_u32(max);
  }

  ALWAYS_INLINE GSVector2i min_i8(const GSVector2i& v) const
  {
    return GSVector2i(vreinterpret_s32_s8(vmin_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
  }

  ALWAYS_INLINE GSVector2i max_i8(const GSVector2i& v) const
  {
    return GSVector2i(vreinterpret_s32_s8(vmax_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
  }

  ALWAYS_INLINE GSVector2i min_i16(const GSVector2i& v) const
  {
    return GSVector2i(vreinterpret_s32_s16(vmin_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
  }

  ALWAYS_INLINE GSVector2i max_i16(const GSVector2i& v) const
  {
    return GSVector2i(vreinterpret_s32_s16(vmax_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
  }

  ALWAYS_INLINE GSVector2i min_i32(const GSVector2i& v) const { return GSVector2i(vmin_s32(v2s, v.v2s)); }

  ALWAYS_INLINE GSVector2i max_i32(const GSVector2i& v) const { return GSVector2i(vmax_s32(v2s, v.v2s)); }

  ALWAYS_INLINE GSVector2i min_u8(const GSVector2i& v) const
  {
    return GSVector2i(vreinterpret_s32_u8(vmin_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(v.v2s))));
  }

  ALWAYS_INLINE GSVector2i max_u8(const GSVector2i& v) const
  {
    return GSVector2i(vreinterpret_s32_u8(vmax_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(v.v2s))));
  }

  ALWAYS_INLINE GSVector2i min_u16(const GSVector2i& v) const
  {
    return GSVector2i(vreinterpret_s32_u16(vmin_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(v.v2s))));
  }

  ALWAYS_INLINE GSVector2i max_u16(const GSVector2i& v) const
  {
    return GSVector2i(vreinterpret_s32_u16(vmax_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(v.v2s))));
  }

  ALWAYS_INLINE GSVector2i min_u32(const GSVector2i& v) const
  {
    return GSVector2i(vreinterpret_s32_u32(vmin_u32(vreinterpret_u32_s32(v2s), vreinterpret_u32_s32(v.v2s))));
  }

  ALWAYS_INLINE GSVector2i max_u32(const GSVector2i& v) const
  {
    return GSVector2i(vreinterpret_s32_u32(vmax_u32(vreinterpret_u32_s32(v2s), vreinterpret_u32_s32(v.v2s))));
  }

  ALWAYS_INLINE s32 addv_s32() const
  {
#ifdef CPU_ARCH_ARM64
    return vaddv_s32(v2s);
#else
    return vget_lane_s32(v2s, 0) + vget_lane_s32(v2s, 1);
#endif
  }

#ifdef CPU_ARCH_ARM64

  ALWAYS_INLINE u8 minv_u8() const { return vminv_u8(vreinterpret_u8_s32(v2s)); }

  ALWAYS_INLINE u16 maxv_u8() const { return vmaxv_u8(vreinterpret_u8_s32(v2s)); }

  ALWAYS_INLINE u16 minv_u16() const { return vminv_u16(vreinterpret_u16_s32(v2s)); }

  ALWAYS_INLINE u16 maxv_u16() const { return vmaxv_u16(vreinterpret_u16_s32(v2s)); }

  ALWAYS_INLINE s32 minv_s32() const { return vminv_s32(v2s); }

  ALWAYS_INLINE u32 minv_u32() const { return vminv_u32(vreinterpret_u32_s32(v2s)); }

  ALWAYS_INLINE s32 maxv_s32() const { return vmaxv_s32(v2s); }

  ALWAYS_INLINE u32 maxv_u32() const { return vmaxv_u32(vreinterpret_u32_s32(v2s)); }

#else

  ALWAYS_INLINE u8 minv_u8() const
  {
    uint8x8_t vmin = vmin_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(vdup_lane_s32(v2s, 1)));
    return static_cast<u8>(
      std::min(static_cast<u32>(vget_lane_u8(vmin, 0)),
               std::min(static_cast<u32>(vget_lane_u8(vmin, 1)),
                        std::min(static_cast<u32>(vget_lane_u8(vmin, 2)), static_cast<u32>(vget_lane_u8(vmin, 3))))));
  }

  ALWAYS_INLINE u16 maxv_u8() const
  {
    uint8x8_t vmax = vmax_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(vdup_lane_s32(v2s, 1)));
    return static_cast<u8>(
      std::max(static_cast<u32>(vget_lane_u8(vmax, 0)),
               std::max(static_cast<u32>(vget_lane_u8(vmax, 1)),
                        std::max(static_cast<u32>(vget_lane_u8(vmax, 2)), static_cast<u32>(vget_lane_u8(vmax, 3))))));
  }

  ALWAYS_INLINE u16 minv_u16() const
  {
    uint16x4_t vmin = vmin_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(vdup_lane_s32(v2s, 1)));
    return static_cast<u16>(
      std::min(static_cast<u32>(vget_lane_u16(vmin, 0)), static_cast<u32>(vget_lane_u16(vmin, 1))));
  }

  ALWAYS_INLINE u16 maxv_u16() const
  {
    uint16x4_t vmax = vmax_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(vdup_lane_s32(v2s, 1)));
    return static_cast<u16>(
      std::max<u32>(static_cast<u32>(vget_lane_u16(vmax, 0)), static_cast<u32>(vget_lane_u16(vmax, 1))));
  }

  ALWAYS_INLINE s32 minv_s32() const { return std::min<s32>(vget_lane_s32(v2s, 0), vget_lane_s32(v2s, 1)); }

  ALWAYS_INLINE u32 minv_u32() const
  {
    return std::min<u32>(vget_lane_u32(vreinterpret_u32_s32(v2s), 0), vget_lane_u32(vreinterpret_u32_s32(v2s), 1));
  }

  ALWAYS_INLINE s32 maxv_s32() const { return std::max<s32>(vget_lane_s32(v2s, 0), vget_lane_s32(v2s, 1)); }

  ALWAYS_INLINE u32 maxv_u32() const
  {
    return std::max<u32>(vget_lane_u32(vreinterpret_u32_s32(v2s), 0), vget_lane_u32(vreinterpret_u32_s32(v2s), 1));
  }

#endif

  ALWAYS_INLINE GSVector2i clamp8() const { return pu16().upl8(); }

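  // Per-byte select: bytes of a are taken where the corresponding mask byte has its sign bit set, mirroring the
  // semantics of SSE PBLENDVB (the arithmetic shift broadcasts each sign bit across its byte).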
  ALWAYS_INLINE GSVector2i blend8(const GSVector2i& a, const GSVector2i& mask) const
  {
    uint8x8_t mask2 = vreinterpret_u8_s8(vshr_n_s8(vreinterpret_s8_s32(mask.v2s), 7));
    return GSVector2i(vreinterpret_s32_u8(vbsl_u8(mask2, vreinterpret_u8_s32(a.v2s), vreinterpret_u8_s32(v2s))));
  }

  template<int mask>
  ALWAYS_INLINE GSVector2i blend16(const GSVector2i& a) const
  {
    static constexpr const uint16_t _mask[4] = {
      ((mask) & (1 << 0)) ? (uint16_t)-1 : 0x0, ((mask) & (1 << 1)) ? (uint16_t)-1 : 0x0,
      ((mask) & (1 << 2)) ? (uint16_t)-1 : 0x0, ((mask) & (1 << 3)) ? (uint16_t)-1 : 0x0};
    return GSVector2i(
      vreinterpret_s32_u16(vbsl_u16(vld1_u16(_mask), vreinterpret_u16_s32(a.v2s), vreinterpret_u16_s32(v2s))));
  }

  template<int mask>
  ALWAYS_INLINE GSVector2i blend32(const GSVector2i& v) const
  {
    constexpr int bit1 = ((mask & 2) * 3) << 1;
    constexpr int bit0 = (mask & 1) * 3;
    return blend16<bit1 | bit0>(v);
  }

  ALWAYS_INLINE GSVector2i blend(const GSVector2i& v, const GSVector2i& mask) const
  {
    return GSVector2i(vreinterpret_s32_s8(vorr_s8(vbic_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(mask.v2s)),
                                                  vand_s8(vreinterpret_s8_s32(mask.v2s), vreinterpret_s8_s32(v.v2s)))));
  }

  ALWAYS_INLINE GSVector2i mix16(const GSVector2i& v) const { return blend16<0xa>(v); }

  ALWAYS_INLINE GSVector2i shuffle8(const GSVector2i& mask) const
  {
    return GSVector2i(vreinterpret_s32_s8(vtbl1_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(mask.v2s))));
  }

  ALWAYS_INLINE GSVector2i ps16() const
  {
    return GSVector2i(vreinterpret_s32_s8(vqmovn_s16(vcombine_s16(vreinterpret_s16_s32(v2s), vcreate_s16(0)))));
  }

  ALWAYS_INLINE GSVector2i pu16() const
  {
    return GSVector2i(vreinterpret_s32_u8(vqmovn_u16(vcombine_u16(vreinterpret_u16_s32(v2s), vcreate_u16(0)))));
  }

  ALWAYS_INLINE GSVector2i ps32() const
  {
    return GSVector2i(vreinterpret_s32_s16(vqmovn_s32(vcombine_s32(v2s, vcreate_s32(0)))));
  }

  ALWAYS_INLINE GSVector2i pu32() const
  {
    return GSVector2i(vreinterpret_s32_u16(vqmovn_u32(vcombine_u32(vreinterpret_u32_s32(v2s), vcreate_u32(0)))));
  }

#ifdef CPU_ARCH_ARM64

  ALWAYS_INLINE GSVector2i upl8(const GSVector2i& v) const
  {
    return GSVector2i(vreinterpret_s32_s8(vzip1_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
  }

  ALWAYS_INLINE GSVector2i upl16(const GSVector2i& v) const
  {
    return GSVector2i(vreinterpret_s32_s16(vzip1_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
  }
  ALWAYS_INLINE GSVector2i upl32(const GSVector2i& v) const { return GSVector2i(vzip1_s32(v2s, v.v2s)); }

  ALWAYS_INLINE GSVector2i upl8() const
  {
    return GSVector2i(vreinterpret_s32_s8(vzip1_s8(vreinterpret_s8_s32(v2s), vdup_n_s8(0))));
  }

  ALWAYS_INLINE GSVector2i upl16() const
  {
    return GSVector2i(vreinterpret_s32_s16(vzip1_s16(vreinterpret_s16_s32(v2s), vdup_n_s16(0))));
  }

  ALWAYS_INLINE GSVector2i upl32() const { return GSVector2i(vzip1_s32(v2s, vdup_n_s32(0))); }

#else

  ALWAYS_INLINE GSVector2i upl8(const GSVector2i& v) const
  {
    return GSVector2i(vreinterpret_s32_s8(vzip_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s)).val[0]));
  }

  ALWAYS_INLINE GSVector2i upl16(const GSVector2i& v) const
  {
    return GSVector2i(vreinterpret_s32_s16(vzip_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s)).val[0]));
  }
  ALWAYS_INLINE GSVector2i upl32(const GSVector2i& v) const { return GSVector2i(vzip_s32(v2s, v.v2s).val[0]); }

  ALWAYS_INLINE GSVector2i upl8() const
  {
    return GSVector2i(vreinterpret_s32_s8(vzip_s8(vreinterpret_s8_s32(v2s), vdup_n_s8(0)).val[0]));
  }

  ALWAYS_INLINE GSVector2i upl16() const
  {
    return GSVector2i(vreinterpret_s32_s16(vzip_s16(vreinterpret_s16_s32(v2s), vdup_n_s16(0)).val[0]));
  }

  ALWAYS_INLINE GSVector2i upl32() const { return GSVector2i(vzip_s32(v2s, vdup_n_s32(0)).val[0]); }

#endif

  ALWAYS_INLINE GSVector2i i8to16() const
  {
    return GSVector2i(vreinterpret_s32_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_s32(v2s)))));
  }

  ALWAYS_INLINE GSVector2i u8to16() const
  {
    return GSVector2i(vreinterpret_s32_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_s32(v2s)))));
  }

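  // srl/sll are whole-register byte shifts built from vext, the NEON analogue of SSE PSRLDQ/PSLLDQ; the template
  // parameter is a shift amount in bytes.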
  template<int i>
  ALWAYS_INLINE GSVector2i srl() const
  {
    return GSVector2i(vreinterpret_s32_s8(vext_s8(vreinterpret_s8_s32(v2s), vdup_n_s8(0), i)));
  }

  template<int i>
  ALWAYS_INLINE GSVector2i sll() const
  {
    // 64-bit vector: the vext offset is 8 - i (the 128-bit variant uses 16 - i).
    return GSVector2i(vreinterpret_s32_s8(vext_s8(vdup_n_s8(0), vreinterpret_s8_s32(v2s), 8 - i)));
  }

  template<int i>
  ALWAYS_INLINE GSVector2i sll16() const
  {
    return GSVector2i(vreinterpret_s32_s16(vshl_n_s16(vreinterpret_s16_s32(v2s), i)));
  }

  ALWAYS_INLINE GSVector2i sll16(s32 i) const
  {
    return GSVector2i(vreinterpret_s32_s16(vshl_s16(vreinterpret_s16_s32(v2s), vdup_n_s16(i))));
  }

  ALWAYS_INLINE GSVector2i sllv16(const GSVector2i& v) const
  {
    return GSVector2i(vreinterpret_s32_s16(vshl_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
  }

  template<int i>
  ALWAYS_INLINE GSVector2i srl16() const
  {
    return GSVector2i(vreinterpret_s32_u16(vshr_n_u16(vreinterpret_u16_s32(v2s), i)));
  }

  ALWAYS_INLINE GSVector2i srl16(s32 i) const
  {
    return GSVector2i(vreinterpret_s32_u16(vshl_u16(vreinterpret_u16_s32(v2s), vdup_n_s16(-i))));
  }

  ALWAYS_INLINE GSVector2i srlv16(const GSVector2i& v) const
  {
    return GSVector2i(vreinterpret_s32_u16(vshl_u16(vreinterpret_u16_s32(v2s), vneg_s16(vreinterpret_s16_s32(v.v2s)))));
  }

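  // x86 PSRAW treats counts above 15 as 15 (all sign bits); clamp the immediate so vshr_n_s16 stays in range while
  // matching that behaviour.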
  template<int i>
  ALWAYS_INLINE GSVector2i sra16() const
  {
    constexpr int count = (i & ~15) ? 15 : i;
    return GSVector2i(vreinterpret_s32_s16(vshr_n_s16(vreinterpret_s16_s32(v2s), count)));
  }

  ALWAYS_INLINE GSVector2i sra16(s32 i) const
  {
    return GSVector2i(vreinterpret_s32_s16(vshl_s16(vreinterpret_s16_s32(v2s), vdup_n_s16(-i))));
  }

  ALWAYS_INLINE GSVector2i srav16(const GSVector2i& v) const
  {
    return GSVector2i(vreinterpret_s32_s16(vshl_s16(vreinterpret_s16_s32(v2s), vneg_s16(vreinterpret_s16_s32(v.v2s)))));
  }

  template<int i>
  ALWAYS_INLINE GSVector2i sll32() const
  {
    return GSVector2i(vshl_n_s32(v2s, i));
  }

  ALWAYS_INLINE GSVector2i sll32(s32 i) const { return GSVector2i(vshl_s32(v2s, vdup_n_s32(i))); }

  ALWAYS_INLINE GSVector2i sllv32(const GSVector2i& v) const { return GSVector2i(vshl_s32(v2s, v.v2s)); }

  template<int i>
  ALWAYS_INLINE GSVector2i srl32() const
  {
    return GSVector2i(vreinterpret_s32_u32(vshr_n_u32(vreinterpret_u32_s32(v2s), i)));
  }

  ALWAYS_INLINE GSVector2i srl32(s32 i) const
  {
    return GSVector2i(vreinterpret_s32_u32(vshl_u32(vreinterpret_u32_s32(v2s), vdup_n_s32(-i))));
  }

  ALWAYS_INLINE GSVector2i srlv32(const GSVector2i& v) const
  {
    return GSVector2i(vreinterpret_s32_u32(vshl_u32(vreinterpret_u32_s32(v2s), vneg_s32(v.v2s))));
  }

  template<int i>
  ALWAYS_INLINE GSVector2i sra32() const
  {
    return GSVector2i(vshr_n_s32(v2s, i));
  }

  ALWAYS_INLINE GSVector2i sra32(s32 i) const { return GSVector2i(vshl_s32(v2s, vdup_n_s32(-i))); }

  ALWAYS_INLINE GSVector2i srav32(const GSVector2i& v) const
  {
    return GSVector2i(vshl_s32(v2s, vneg_s32(v.v2s)));
  }

  ALWAYS_INLINE GSVector2i add8(const GSVector2i& v) const
  {
    return GSVector2i(vreinterpret_s32_s8(vadd_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
  }

  ALWAYS_INLINE GSVector2i add16(const GSVector2i& v) const
  {
    return GSVector2i(vreinterpret_s32_s16(vadd_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
  }

  ALWAYS_INLINE GSVector2i add32(const GSVector2i& v) const { return GSVector2i(vadd_s32(v2s, v.v2s)); }

  ALWAYS_INLINE GSVector2i adds8(const GSVector2i& v) const
  {
    return GSVector2i(vreinterpret_s32_s8(vqadd_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
  }

  ALWAYS_INLINE GSVector2i adds16(const GSVector2i& v) const
  {
    return GSVector2i(vreinterpret_s32_s16(vqadd_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
  }

  ALWAYS_INLINE GSVector2i addus8(const GSVector2i& v) const
  {
    return GSVector2i(vreinterpret_s32_u8(vqadd_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(v.v2s))));
  }

  ALWAYS_INLINE GSVector2i addus16(const GSVector2i& v) const
  {
    return GSVector2i(vreinterpret_s32_u16(vqadd_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(v.v2s))));
  }

  ALWAYS_INLINE GSVector2i sub8(const GSVector2i& v) const
  {
    return GSVector2i(vreinterpret_s32_s8(vsub_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
  }

  ALWAYS_INLINE GSVector2i sub16(const GSVector2i& v) const
  {
    return GSVector2i(vreinterpret_s32_s16(vsub_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
  }

  ALWAYS_INLINE GSVector2i sub32(const GSVector2i& v) const { return GSVector2i(vsub_s32(v2s, v.v2s)); }

  ALWAYS_INLINE GSVector2i subs8(const GSVector2i& v) const
  {
    return GSVector2i(vreinterpret_s32_s8(vqsub_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
  }

  ALWAYS_INLINE GSVector2i subs16(const GSVector2i& v) const
  {
    return GSVector2i(vreinterpret_s32_s16(vqsub_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
  }

  ALWAYS_INLINE GSVector2i subus8(const GSVector2i& v) const
  {
    return GSVector2i(vreinterpret_s32_u8(vqsub_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(v.v2s))));
  }

  ALWAYS_INLINE GSVector2i subus16(const GSVector2i& v) const
  {
    return GSVector2i(vreinterpret_s32_u16(vqsub_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(v.v2s))));
  }

  ALWAYS_INLINE GSVector2i avg8(const GSVector2i& v) const
  {
    return GSVector2i(vreinterpret_s32_u8(vrhadd_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(v.v2s))));
  }

  ALWAYS_INLINE GSVector2i avg16(const GSVector2i& v) const
  {
    return GSVector2i(vreinterpret_s32_u16(vrhadd_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(v.v2s))));
  }

  ALWAYS_INLINE GSVector2i mul16l(const GSVector2i& v) const
  {
    return GSVector2i(vreinterpret_s32_s16(vmul_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
  }

  ALWAYS_INLINE GSVector2i mul32l(const GSVector2i& v) const { return GSVector2i(vmul_s32(v2s, v.v2s)); }

  ALWAYS_INLINE bool eq(const GSVector2i& v) const
  {
    return (vget_lane_u64(vreinterpret_u64_s32(veor_s32(v2s, v.v2s)), 0) == 0);
  }

  ALWAYS_INLINE GSVector2i eq8(const GSVector2i& v) const
  {
    return GSVector2i(vreinterpret_s32_u8(vceq_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
  }

  ALWAYS_INLINE GSVector2i eq16(const GSVector2i& v) const
  {
    return GSVector2i(vreinterpret_s32_u16(vceq_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
  }

  ALWAYS_INLINE GSVector2i eq32(const GSVector2i& v) const
  {
    return GSVector2i(vreinterpret_s32_u32(vceq_s32(v2s, v.v2s)));
  }

  ALWAYS_INLINE GSVector2i neq8(const GSVector2i& v) const { return ~eq8(v); }

  ALWAYS_INLINE GSVector2i neq16(const GSVector2i& v) const { return ~eq16(v); }

  ALWAYS_INLINE GSVector2i neq32(const GSVector2i& v) const { return ~eq32(v); }

  ALWAYS_INLINE GSVector2i gt8(const GSVector2i& v) const
  {
    return GSVector2i(vreinterpret_s32_u8(vcgt_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
  }

  ALWAYS_INLINE GSVector2i gt16(const GSVector2i& v) const
  {
    return GSVector2i(vreinterpret_s32_u16(vcgt_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
  }

  ALWAYS_INLINE GSVector2i gt32(const GSVector2i& v) const { return GSVector2i(vreinterpret_s32_u32(vcgt_s32(v2s, v.v2s))); }

  ALWAYS_INLINE GSVector2i ge8(const GSVector2i& v) const
  {
    return GSVector2i(vreinterpret_s32_u8(vcge_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
  }
  ALWAYS_INLINE GSVector2i ge16(const GSVector2i& v) const
  {
    return GSVector2i(vreinterpret_s32_u16(vcge_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
  }
  ALWAYS_INLINE GSVector2i ge32(const GSVector2i& v) const { return GSVector2i(vreinterpret_s32_u32(vcge_s32(v2s, v.v2s))); }

  ALWAYS_INLINE GSVector2i lt8(const GSVector2i& v) const
  {
    return GSVector2i(vreinterpret_s32_u8(vclt_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
  }

  ALWAYS_INLINE GSVector2i lt16(const GSVector2i& v) const
  {
    return GSVector2i(vreinterpret_s32_u16(vclt_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
  }

  ALWAYS_INLINE GSVector2i lt32(const GSVector2i& v) const { return GSVector2i(vreinterpret_s32_u32(vclt_s32(v2s, v.v2s))); }

  ALWAYS_INLINE GSVector2i le8(const GSVector2i& v) const
  {
    return GSVector2i(vreinterpret_s32_u8(vcle_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
  }
  ALWAYS_INLINE GSVector2i le16(const GSVector2i& v) const
  {
    return GSVector2i(vreinterpret_s32_u16(vcle_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
  }
  ALWAYS_INLINE GSVector2i le32(const GSVector2i& v) const { return GSVector2i(vreinterpret_s32_u32(vcle_s32(v2s, v.v2s))); }

  ALWAYS_INLINE GSVector2i andnot(const GSVector2i& v) const { return GSVector2i(vbic_s32(v2s, v.v2s)); }

  ALWAYS_INLINE int mask() const
  {
    // borrowed from sse2neon
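    // Emulates _mm_movemask_epi8 for eight lanes: each vsra_n step shifts the partial mask onto itself, folding the
    // per-byte sign bits pairwise (16 -> 32 -> 64 bit groups) until all eight bits land in the low byte.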
    const uint16x4_t high_bits = vreinterpret_u16_u8(vshr_n_u8(vreinterpret_u8_s32(v2s), 7));
    const uint32x2_t paired16 = vreinterpret_u32_u16(vsra_n_u16(high_bits, high_bits, 7));
    const uint64x1_t paired32 = vreinterpret_u64_u32(vsra_n_u32(paired16, paired16, 14));
    const uint8x8_t paired64 = vreinterpret_u8_u64(vsra_n_u64(paired32, paired32, 28));
    return static_cast<int>(vget_lane_u8(paired64, 0));
  }

  ALWAYS_INLINE bool alltrue() const
  {
    // MSB should be set in all 8-bit lanes.
#ifdef CPU_ARCH_ARM64
    return (vminv_u8(vreinterpret_u8_s32(v2s)) & 0x80) == 0x80;
#else
    return ((vget_lane_u32(vreinterpret_u32_s32(v2s), 0) & vget_lane_u32(vreinterpret_u32_s32(v2s), 1) & 0x80808080u) ==
            0x80808080u);
#endif
  }

  ALWAYS_INLINE bool allfalse() const
  {
    // MSB should be clear in all 8-bit lanes.
#ifdef CPU_ARCH_ARM64
    return (vmaxv_u8(vreinterpret_u8_s32(v2s)) & 0x80) != 0x80;
#else
    return ((vget_lane_u32(vreinterpret_u32_s32(v2s), 0) & vget_lane_u32(vreinterpret_u32_s32(v2s), 1) & 0x80808080u) ==
            0);
#endif
  }

  template<int i>
  ALWAYS_INLINE GSVector2i insert8(int a) const
  {
    return GSVector2i(vreinterpret_s32_u8(vset_lane_u8(static_cast<u8>(a), vreinterpret_u8_s32(v2s), i)));
  }

  template<int i>
  ALWAYS_INLINE int extract8() const
  {
    return vget_lane_u8(vreinterpret_u8_s32(v2s), i);
  }

  template<int i>
  ALWAYS_INLINE GSVector2i insert16(int a) const
  {
    return GSVector2i(vreinterpret_s32_u16(vset_lane_u16(static_cast<u16>(a), vreinterpret_u16_s32(v2s), i)));
  }

  template<int i>
  ALWAYS_INLINE int extract16() const
  {
    return vget_lane_u16(vreinterpret_u16_s32(v2s), i);
  }

  template<int i>
  ALWAYS_INLINE GSVector2i insert32(int a) const
  {
    return GSVector2i(vset_lane_s32(a, v2s, i));
  }

  template<int i>
  ALWAYS_INLINE int extract32() const
  {
    return vget_lane_s32(v2s, i);
  }

  ALWAYS_INLINE static GSVector2i load32(const void* p)
  {
    // should be ldr s0, [x0]
    u32 val;
    std::memcpy(&val, p, sizeof(u32));
    return GSVector2i(vreinterpret_s32_u32(vset_lane_u32(val, vdup_n_u32(0), 0)));
  }

  ALWAYS_INLINE static GSVector2i load(const void* p) { return GSVector2i(vld1_s32((const int32_t*)p)); }

  ALWAYS_INLINE static GSVector2i load(int i) { return GSVector2i(vset_lane_s32(i, vdup_n_s32(0), 0)); }

  ALWAYS_INLINE static void store32(void* p, const GSVector2i& v)
  {
    s32 val = vget_lane_s32(v, 0);
    std::memcpy(p, &val, sizeof(s32));
  }

  ALWAYS_INLINE static void store(void* p, const GSVector2i& v) { vst1_s32((int32_t*)p, v.v2s); }

  ALWAYS_INLINE static int store(const GSVector2i& v) { return vget_lane_s32(v.v2s, 0); }

  ALWAYS_INLINE void operator&=(const GSVector2i& v)
  {
    v2s = vreinterpret_s32_s8(vand_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s)));
  }

  ALWAYS_INLINE void operator|=(const GSVector2i& v)
  {
    v2s = vreinterpret_s32_s8(vorr_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s)));
  }

  ALWAYS_INLINE void operator^=(const GSVector2i& v)
  {
    v2s = vreinterpret_s32_s8(veor_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s)));
  }

  ALWAYS_INLINE friend GSVector2i operator&(const GSVector2i& v1, const GSVector2i& v2)
  {
    return GSVector2i(vreinterpret_s32_s8(vand_s8(vreinterpret_s8_s32(v1.v2s), vreinterpret_s8_s32(v2.v2s))));
  }

  ALWAYS_INLINE friend GSVector2i operator|(const GSVector2i& v1, const GSVector2i& v2)
  {
    return GSVector2i(vreinterpret_s32_s8(vorr_s8(vreinterpret_s8_s32(v1.v2s), vreinterpret_s8_s32(v2.v2s))));
  }

  ALWAYS_INLINE friend GSVector2i operator^(const GSVector2i& v1, const GSVector2i& v2)
  {
    return GSVector2i(vreinterpret_s32_s8(veor_s8(vreinterpret_s8_s32(v1.v2s), vreinterpret_s8_s32(v2.v2s))));
  }

  ALWAYS_INLINE friend GSVector2i operator&(const GSVector2i& v, int i) { return v & GSVector2i(i); }

  ALWAYS_INLINE friend GSVector2i operator|(const GSVector2i& v, int i) { return v | GSVector2i(i); }

  ALWAYS_INLINE friend GSVector2i operator^(const GSVector2i& v, int i) { return v ^ GSVector2i(i); }

  ALWAYS_INLINE friend GSVector2i operator~(const GSVector2i& v) { return GSVector2i(vmvn_s32(v.v2s)); }

  ALWAYS_INLINE static GSVector2i zero() { return GSVector2i(0); }

  ALWAYS_INLINE GSVector2i xy() const { return *this; }
  ALWAYS_INLINE GSVector2i xx() const { return GSVector2i(__builtin_shufflevector(v2s, v2s, 0, 0)); }
  ALWAYS_INLINE GSVector2i yx() const { return GSVector2i(__builtin_shufflevector(v2s, v2s, 1, 0)); }
  ALWAYS_INLINE GSVector2i yy() const { return GSVector2i(__builtin_shufflevector(v2s, v2s, 1, 1)); }
};

class alignas(16) GSVector2
{
  struct cxpr_init_tag
  {
  };
  static constexpr cxpr_init_tag cxpr_init{};

  constexpr GSVector2(cxpr_init_tag, float x, float y) : F32{x, y} {}

  constexpr GSVector2(cxpr_init_tag, int x, int y) : I32{x, y} {}

public:
  union
  {
    struct
    {
      float x, y;
    };
    struct
    {
      float r, g;
    };
    float F32[2];
    double F64[1];
    s8 I8[8];
    s16 I16[4];
    s32 I32[2];
    s64 I64[1];
    u8 U8[8];
    u16 U16[4];
    u32 U32[2];
    u64 U64[1];
    float32x2_t v2s;
  };

  GSVector2() = default;

  constexpr static GSVector2 cxpr(float x, float y) { return GSVector2(cxpr_init, x, y); }

  constexpr static GSVector2 cxpr(float x) { return GSVector2(cxpr_init, x, x); }

  constexpr static GSVector2 cxpr(int x, int y) { return GSVector2(cxpr_init, x, y); }

  constexpr static GSVector2 cxpr(int x) { return GSVector2(cxpr_init, x, x); }

  ALWAYS_INLINE GSVector2(float x, float y) : v2s(vset_lane_f32(y, vdup_n_f32(x), 1)) {}

  ALWAYS_INLINE GSVector2(int x, int y) : v2s(vcvt_f32_s32(vset_lane_s32(y, vdup_n_s32(x), 1))) {}

  ALWAYS_INLINE constexpr explicit GSVector2(float32x2_t m) : v2s(m) {}

  ALWAYS_INLINE explicit GSVector2(float f) { v2s = vdup_n_f32(f); }

  ALWAYS_INLINE explicit GSVector2(int i) { v2s = vcvt_f32_s32(vdup_n_s32(i)); }

  ALWAYS_INLINE explicit GSVector2(const GSVector2i& v);

  ALWAYS_INLINE static GSVector2 cast(const GSVector2i& v);

  ALWAYS_INLINE void operator=(float f) { v2s = vdup_n_f32(f); }

  ALWAYS_INLINE void operator=(float32x2_t m) { v2s = m; }

  ALWAYS_INLINE operator float32x2_t() const { return v2s; }

  ALWAYS_INLINE GSVector2 abs() const { return GSVector2(vabs_f32(v2s)); }
  ALWAYS_INLINE GSVector2 neg() const { return GSVector2(vneg_f32(v2s)); }
  ALWAYS_INLINE GSVector2 rcp() const
  {
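    // vrecpe only yields a rough estimate (~8 bits); one Newton-Raphson step via vrecps refines it.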
    float32x2_t recip = vrecpe_f32(v2s);
    recip = vmul_f32(recip, vrecps_f32(recip, v2s));
    return GSVector2(recip);
  }

#ifdef CPU_ARCH_ARM64

  ALWAYS_INLINE GSVector2 floor() const { return GSVector2(vrndm_f32(v2s)); }
  ALWAYS_INLINE GSVector2 ceil() const { return GSVector2(vrndp_f32(v2s)); }

#else

  ALWAYS_INLINE GSVector2 floor() const
  {
    return GSVector2(std::floor(vget_lane_f32(v2s, 0)), std::floor(vget_lane_f32(v2s, 1)));
  }

  ALWAYS_INLINE GSVector2 ceil() const
  {
    return GSVector2(std::ceil(vget_lane_f32(v2s, 0)), std::ceil(vget_lane_f32(v2s, 1)));
  }

#endif

  ALWAYS_INLINE GSVector2 sat(const GSVector2& a, const GSVector2& b) const { return max(a).min(b); }

  ALWAYS_INLINE GSVector2 sat(const float scale = 255) const { return sat(zero(), GSVector2(scale)); }

  ALWAYS_INLINE GSVector2 clamp(const float scale = 255) const { return min(GSVector2(scale)); }

  ALWAYS_INLINE GSVector2 min(const GSVector2& a) const { return GSVector2(vmin_f32(v2s, a.v2s)); }

  ALWAYS_INLINE GSVector2 max(const GSVector2& a) const { return GSVector2(vmax_f32(v2s, a.v2s)); }

  template<int mask>
  ALWAYS_INLINE GSVector2 blend32(const GSVector2& a) const
  {
    return GSVector2(__builtin_shufflevector(v2s, a.v2s, (mask & 1) ? 2 : 0, (mask & 2) ? 3 : 1));
  }

  ALWAYS_INLINE GSVector2 blend32(const GSVector2& a, const GSVector2& mask) const
  {
    // duplicate sign bit across and bit select
    const uint32x2_t bitmask = vreinterpret_u32_s32(vshr_n_s32(vreinterpret_s32_f32(mask.v2s), 31));
    return GSVector2(vbsl_f32(bitmask, a.v2s, v2s));
  }

  ALWAYS_INLINE GSVector2 andnot(const GSVector2& v) const
  {
    return GSVector2(vreinterpret_f32_s32(vbic_s32(vreinterpret_s32_f32(v2s), vreinterpret_s32_f32(v.v2s))));
  }

  ALWAYS_INLINE int mask() const
  {
    const uint32x2_t masks = vshr_n_u32(vreinterpret_u32_f32(v2s), 31);
    return (vget_lane_u32(masks, 0) | (vget_lane_u32(masks, 1) << 1));
  }

  ALWAYS_INLINE bool alltrue() const { return (vget_lane_u64(vreinterpret_u64_f32(v2s), 0) == 0xFFFFFFFFFFFFFFFFULL); }

  ALWAYS_INLINE bool allfalse() const { return (vget_lane_u64(vreinterpret_u64_f32(v2s), 0) == 0); }

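  // x == x is false only for NaN lanes, so the mask keeps this vector where it is non-NaN and takes v elsewhere.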
  ALWAYS_INLINE GSVector2 replace_nan(const GSVector2& v) const { return v.blend32(*this, *this == *this); }

  template<int src, int dst>
  ALWAYS_INLINE GSVector2 insert32(const GSVector2& v) const
  {
#ifdef CPU_ARCH_ARM64
    return GSVector2(vcopy_lane_f32(v2s, dst, v.v2s, src));
#else
    return GSVector2(vset_lane_f32(vget_lane_f32(v.v2s, src), v2s, dst));
#endif
  }

  template<int i>
  ALWAYS_INLINE int extract32() const
  {
    return vget_lane_s32(vreinterpret_s32_f32(v2s), i);
  }

  ALWAYS_INLINE float dot(const GSVector2& v) const
  {
#ifdef CPU_ARCH_ARM64
    return vaddv_f32(vmul_f32(v2s, v.v2s));
#else
    const float32x2_t dp = vmul_f32(v2s, v.v2s);
    return vget_lane_f32(vadd_f32(dp, vdup_lane_f32(dp, 1)), 0);
#endif
  }

  ALWAYS_INLINE static GSVector2 zero() { return GSVector2(vdup_n_f32(0.0f)); }

  ALWAYS_INLINE static GSVector2 xffffffff() { return GSVector2(vreinterpret_f32_u32(vdup_n_u32(0xFFFFFFFFu))); }

  ALWAYS_INLINE static GSVector2 load(float f) { return GSVector2(vset_lane_f32(f, vmov_n_f32(0.0f), 0)); }

  ALWAYS_INLINE static GSVector2 load(const void* p) { return GSVector2(vld1_f32((const float*)p)); }

  ALWAYS_INLINE static void store(void* p, const GSVector2& v) { vst1_f32((float*)p, v.v2s); }

  ALWAYS_INLINE GSVector2 operator-() const { return neg(); }

  ALWAYS_INLINE void operator+=(const GSVector2& v) { v2s = vadd_f32(v2s, v.v2s); }
  ALWAYS_INLINE void operator-=(const GSVector2& v) { v2s = vsub_f32(v2s, v.v2s); }
  ALWAYS_INLINE void operator*=(const GSVector2& v) { v2s = vmul_f32(v2s, v.v2s); }
  ALWAYS_INLINE void operator/=(const GSVector2& v)
  {
#ifdef CPU_ARCH_ARM64
    v2s = vdiv_f32(v2s, v.v2s);
#else
    *this = GSVector2(vget_lane_f32(v2s, 0) / vget_lane_f32(v.v2s, 0), vget_lane_f32(v2s, 1) / vget_lane_f32(v.v2s, 1));
#endif
  }

  ALWAYS_INLINE void operator+=(float f) { *this += GSVector2(f); }
  ALWAYS_INLINE void operator-=(float f) { *this -= GSVector2(f); }
  ALWAYS_INLINE void operator*=(float f) { *this *= GSVector2(f); }
  ALWAYS_INLINE void operator/=(float f) { *this /= GSVector2(f); }

  ALWAYS_INLINE void operator&=(const GSVector2& v)
  {
    v2s = vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(v2s), vreinterpret_u32_f32(v.v2s)));
  }

  ALWAYS_INLINE void operator|=(const GSVector2& v)
  {
    v2s = vreinterpret_f32_u32(vorr_u32(vreinterpret_u32_f32(v2s), vreinterpret_u32_f32(v.v2s)));
  }

  ALWAYS_INLINE void operator^=(const GSVector2& v)
  {
    v2s = vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(v2s), vreinterpret_u32_f32(v.v2s)));
  }

  ALWAYS_INLINE friend GSVector2 operator+(const GSVector2& v1, const GSVector2& v2)
  {
    return GSVector2(vadd_f32(v1.v2s, v2.v2s));
  }

  ALWAYS_INLINE friend GSVector2 operator-(const GSVector2& v1, const GSVector2& v2)
  {
    return GSVector2(vsub_f32(v1.v2s, v2.v2s));
  }

  ALWAYS_INLINE friend GSVector2 operator*(const GSVector2& v1, const GSVector2& v2)
  {
    return GSVector2(vmul_f32(v1.v2s, v2.v2s));
  }

  ALWAYS_INLINE friend GSVector2 operator/(const GSVector2& v1, const GSVector2& v2)
  {
#ifdef CPU_ARCH_ARM64
    return GSVector2(vdiv_f32(v1.v2s, v2.v2s));
#else
    return GSVector2(vget_lane_f32(v1.v2s, 0) / vget_lane_f32(v2.v2s, 0),
                     vget_lane_f32(v1.v2s, 1) / vget_lane_f32(v2.v2s, 1));
#endif
  }

  ALWAYS_INLINE friend GSVector2 operator+(const GSVector2& v, float f) { return v + GSVector2(f); }
  ALWAYS_INLINE friend GSVector2 operator-(const GSVector2& v, float f) { return v - GSVector2(f); }
  ALWAYS_INLINE friend GSVector2 operator*(const GSVector2& v, float f) { return v * GSVector2(f); }
  ALWAYS_INLINE friend GSVector2 operator/(const GSVector2& v, float f) { return v / GSVector2(f); }

  ALWAYS_INLINE friend GSVector2 operator&(const GSVector2& v1, const GSVector2& v2)
  {
    return GSVector2(vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(v1.v2s), vreinterpret_u32_f32(v2.v2s))));
  }

  ALWAYS_INLINE friend GSVector2 operator|(const GSVector2& v1, const GSVector2& v2)
  {
    return GSVector2(vreinterpret_f32_u32(vorr_u32(vreinterpret_u32_f32(v1.v2s), vreinterpret_u32_f32(v2.v2s))));
  }

  ALWAYS_INLINE friend GSVector2 operator^(const GSVector2& v1, const GSVector2& v2)
  {
    return GSVector2(vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(v1.v2s), vreinterpret_u32_f32(v2.v2s))));
  }

  ALWAYS_INLINE friend GSVector2 operator==(const GSVector2& v1, const GSVector2& v2)
  {
    return GSVector2(vreinterpret_f32_u32(vceq_f32(v1.v2s, v2.v2s)));
  }

  ALWAYS_INLINE friend GSVector2 operator!=(const GSVector2& v1, const GSVector2& v2)
  {
    // NEON has no !=
    return GSVector2(vreinterpret_f32_u32(vmvn_u32(vceq_f32(v1.v2s, v2.v2s))));
  }

  ALWAYS_INLINE friend GSVector2 operator>(const GSVector2& v1, const GSVector2& v2)
  {
    return GSVector2(vreinterpret_f32_u32(vcgt_f32(v1.v2s, v2.v2s)));
  }

  ALWAYS_INLINE friend GSVector2 operator<(const GSVector2& v1, const GSVector2& v2)
  {
    return GSVector2(vreinterpret_f32_u32(vclt_f32(v1.v2s, v2.v2s)));
  }

  ALWAYS_INLINE friend GSVector2 operator>=(const GSVector2& v1, const GSVector2& v2)
  {
    return GSVector2(vreinterpret_f32_u32(vcge_f32(v1.v2s, v2.v2s)));
  }

  ALWAYS_INLINE friend GSVector2 operator<=(const GSVector2& v1, const GSVector2& v2)
  {
    return GSVector2(vreinterpret_f32_u32(vcle_f32(v1.v2s, v2.v2s)));
  }

  ALWAYS_INLINE GSVector2 xy() const { return *this; }
  ALWAYS_INLINE GSVector2 xx() const { return GSVector2(__builtin_shufflevector(v2s, v2s, 0, 0)); }
  ALWAYS_INLINE GSVector2 yx() const { return GSVector2(__builtin_shufflevector(v2s, v2s, 1, 0)); }
  ALWAYS_INLINE GSVector2 yy() const { return GSVector2(__builtin_shufflevector(v2s, v2s, 1, 1)); }
};

class alignas(16) GSVector4i
{
  struct cxpr_init_tag
  {
  };
  static constexpr cxpr_init_tag cxpr_init{};

  constexpr GSVector4i(cxpr_init_tag, s32 x, s32 y, s32 z, s32 w) : S32{x, y, z, w} {}

  constexpr GSVector4i(cxpr_init_tag, s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7)
    : S16{s0, s1, s2, s3, s4, s5, s6, s7}
  {
  }

  constexpr GSVector4i(cxpr_init_tag, s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9, s8 b10,
                       s8 b11, s8 b12, s8 b13, s8 b14, s8 b15)
    : S8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15}
  {
  }

public:
  union
  {
    struct
    {
      int x, y, z, w;
    };
    struct
    {
      int r, g, b, a;
    };
    struct
    {
      int left, top, right, bottom;
    };
    float F32[4];
    s8 S8[16];
    s16 S16[8];
    s32 S32[4];
    s64 S64[2];
    u8 U8[16];
    u16 U16[8];
    u32 U32[4];
    u64 U64[2];
    int32x4_t v4s;
  };

  GSVector4i() = default;

  ALWAYS_INLINE constexpr static GSVector4i cxpr(s32 x, s32 y, s32 z, s32 w)
  {
    return GSVector4i(cxpr_init, x, y, z, w);
  }

  ALWAYS_INLINE constexpr static GSVector4i cxpr(s32 x) { return GSVector4i(cxpr_init, x, x, x, x); }

  ALWAYS_INLINE constexpr static GSVector4i cxpr16(s16 x) { return GSVector4i(cxpr_init, x, x, x, x, x, x, x, x); }

  ALWAYS_INLINE constexpr static GSVector4i cxpr16(s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7)
  {
    return GSVector4i(cxpr_init, s0, s1, s2, s3, s4, s5, s6, s7);
  }

  ALWAYS_INLINE constexpr static GSVector4i cxpr8(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9,
                                                  s8 b10, s8 b11, s8 b12, s8 b13, s8 b14, s8 b15)
  {
    return GSVector4i(cxpr_init, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15);
  }

  ALWAYS_INLINE GSVector4i(s32 x, s32 y, s32 z, s32 w)
  {
    GSVector4i xz = load(x).upl32(load(z));
    GSVector4i yw = load(y).upl32(load(w));

    *this = xz.upl32(yw);
  }

  ALWAYS_INLINE GSVector4i(s32 x, s32 y) { *this = load(x).upl32(load(y)); }

  ALWAYS_INLINE GSVector4i(s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7)
    : S16{s0, s1, s2, s3, s4, s5, s6, s7}
  {
  }

  constexpr GSVector4i(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9, s8 b10, s8 b11, s8 b12,
                       s8 b13, s8 b14, s8 b15)
    : S8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15}
  {
  }

  // MSVC has bad codegen for the constexpr version when applied to non-constexpr things (https://godbolt.org/z/h8qbn7),
  // so leave the non-constexpr version default
  ALWAYS_INLINE explicit GSVector4i(int i) { *this = i; }

  ALWAYS_INLINE explicit GSVector4i(int32x2_t m) : v4s(vcombine_s32(m, vcreate_s32(0))) {}
  ALWAYS_INLINE constexpr explicit GSVector4i(int32x4_t m) : v4s(m) {}

  ALWAYS_INLINE explicit GSVector4i(const GSVector2& v);
  ALWAYS_INLINE explicit GSVector4i(const GSVector4& v);

  ALWAYS_INLINE static GSVector4i cast(const GSVector4& v);

  ALWAYS_INLINE void operator=(int i) { v4s = vdupq_n_s32(i); }

  ALWAYS_INLINE operator int32x4_t() const { return v4s; }

  // rect

  ALWAYS_INLINE int width() const { return right - left; }

  ALWAYS_INLINE int height() const { return bottom - top; }

  ALWAYS_INLINE GSVector4i rsize() const
  {
    return sub32(xyxy()); // same as GSVector4i(0, 0, width(), height());
  }

  ALWAYS_INLINE s32 rarea() const { return width() * height(); }

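  // A rect is empty when right <= left or bottom <= top; lt32 against zwzw() tests both inequalities at once in the
  // low two lanes.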
  ALWAYS_INLINE bool rempty() const
  {
#ifdef CPU_ARCH_ARM64
    return (vminv_u32(vreinterpret_u32_s32(vget_low_s32(lt32(zwzw())))) == 0);
#else
    return (vget_lane_u64(vreinterpret_u64_u32(vreinterpret_u32_s32(vget_low_s32(lt32(zwzw())))), 0) !=
            0xFFFFFFFFFFFFFFFFULL);
#endif
  }

  ALWAYS_INLINE GSVector4i runion(const GSVector4i& a) const { return min_i32(a).upl64(max_i32(a).srl<8>()); }

  ALWAYS_INLINE GSVector4i rintersect(const GSVector4i& a) const { return sat_i32(a); }
  ALWAYS_INLINE bool rintersects(const GSVector4i& v) const { return !rintersect(v).rempty(); }
  ALWAYS_INLINE bool rcontains(const GSVector4i& v) const { return rintersect(v).eq(v); }

  ALWAYS_INLINE u32 rgba32() const
  {
    GSVector4i v = *this;

    v = v.ps32(v);
    v = v.pu16(v);

    return (u32)store(v);
  }

  ALWAYS_INLINE GSVector4i sat_i8(const GSVector4i& min, const GSVector4i& max) const
  {
    return max_i8(min).min_i8(max);
  }
  ALWAYS_INLINE GSVector4i sat_i8(const GSVector4i& minmax) const
  {
    return max_i8(minmax.xyxy()).min_i8(minmax.zwzw());
  }
  ALWAYS_INLINE GSVector4i sat_i16(const GSVector4i& min, const GSVector4i& max) const
  {
    return max_i16(min).min_i16(max);
  }
  ALWAYS_INLINE GSVector4i sat_i16(const GSVector4i& minmax) const
  {
    return max_i16(minmax.xyxy()).min_i16(minmax.zwzw());
  }
  ALWAYS_INLINE GSVector4i sat_i32(const GSVector4i& min, const GSVector4i& max) const
  {
    return max_i32(min).min_i32(max);
  }
  ALWAYS_INLINE GSVector4i sat_i32(const GSVector4i& minmax) const
  {
    return max_i32(minmax.xyxy()).min_i32(minmax.zwzw());
  }

  ALWAYS_INLINE GSVector4i sat_u8(const GSVector4i& min, const GSVector4i& max) const
  {
    return max_u8(min).min_u8(max);
  }
  ALWAYS_INLINE GSVector4i sat_u8(const GSVector4i& minmax) const
  {
    return max_u8(minmax.xyxy()).min_u8(minmax.zwzw());
  }
  ALWAYS_INLINE GSVector4i sat_u16(const GSVector4i& min, const GSVector4i& max) const
  {
    return max_u16(min).min_u16(max);
  }
  ALWAYS_INLINE GSVector4i sat_u16(const GSVector4i& minmax) const
  {
    return max_u16(minmax.xyxy()).min_u16(minmax.zwzw());
  }
  ALWAYS_INLINE GSVector4i sat_u32(const GSVector4i& min, const GSVector4i& max) const
  {
    return max_u32(min).min_u32(max);
  }
  ALWAYS_INLINE GSVector4i sat_u32(const GSVector4i& minmax) const
  {
    return max_u32(minmax.xyxy()).min_u32(minmax.zwzw());
  }

  ALWAYS_INLINE GSVector4i min_i8(const GSVector4i& v) const
  {
    return GSVector4i(vreinterpretq_s32_s8(vminq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
  }

  ALWAYS_INLINE GSVector4i max_i8(const GSVector4i& v) const
  {
    return GSVector4i(vreinterpretq_s32_s8(vmaxq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
  }

  ALWAYS_INLINE GSVector4i min_i16(const GSVector4i& v) const
  {
    return GSVector4i(vreinterpretq_s32_s16(vminq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
  }

  ALWAYS_INLINE GSVector4i max_i16(const GSVector4i& v) const
  {
    return GSVector4i(vreinterpretq_s32_s16(vmaxq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
  }

  ALWAYS_INLINE GSVector4i min_i32(const GSVector4i& v) const { return GSVector4i(vminq_s32(v4s, v.v4s)); }

  ALWAYS_INLINE GSVector4i max_i32(const GSVector4i& v) const { return GSVector4i(vmaxq_s32(v4s, v.v4s)); }

  ALWAYS_INLINE GSVector4i min_u8(const GSVector4i& v) const
  {
    return GSVector4i(vreinterpretq_s32_u8(vminq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s))));
  }

  ALWAYS_INLINE GSVector4i max_u8(const GSVector4i& v) const
  {
    return GSVector4i(vreinterpretq_s32_u8(vmaxq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s))));
  }

  ALWAYS_INLINE GSVector4i min_u16(const GSVector4i& v) const
  {
    return GSVector4i(vreinterpretq_s32_u16(vminq_u16(vreinterpretq_u16_s32(v4s), vreinterpretq_u16_s32(v.v4s))));
  }

  ALWAYS_INLINE GSVector4i max_u16(const GSVector4i& v) const
  {
    return GSVector4i(vreinterpretq_s32_u16(vmaxq_u16(vreinterpretq_u16_s32(v4s), vreinterpretq_u16_s32(v.v4s))));
  }

  ALWAYS_INLINE GSVector4i min_u32(const GSVector4i& v) const
  {
    return GSVector4i(vreinterpretq_s32_u32(vminq_u32(vreinterpretq_u32_s32(v4s), vreinterpretq_u32_s32(v.v4s))));
  }

  ALWAYS_INLINE GSVector4i max_u32(const GSVector4i& v) const
  {
    return GSVector4i(vreinterpretq_s32_u32(vmaxq_u32(vreinterpretq_u32_s32(v4s), vreinterpretq_u32_s32(v.v4s))));
  }

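  // Emulates SSE PMADDWD (_mm_madd_epi16): widen-multiply corresponding s16 lanes and sum adjacent product pairs
  // into s32 lanes.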
  ALWAYS_INLINE GSVector4i madd_s16(const GSVector4i& v) const
  {
#ifdef CPU_ARCH_ARM64
    // vpaddq pairs the products up so the lane order matches PMADDWD.
    const int32x4_t low =
      vmull_s16(vget_low_s16(vreinterpretq_s16_s32(v4s)), vget_low_s16(vreinterpretq_s16_s32(v.v4s)));
    const int32x4_t high = vmull_high_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s));
    return GSVector4i(vpaddq_s32(low, high));
#else
    // borrowed from sse2neon
    const int32x4_t low =
      vmlal_s16(vdupq_n_s32(0), vget_low_s16(vreinterpretq_s16_s32(v4s)), vget_low_s16(vreinterpretq_s16_s32(v.v4s)));
    const int32x4_t high =
      vmlal_s16(vdupq_n_s32(0), vget_high_s16(vreinterpretq_s16_s32(v4s)), vget_high_s16(vreinterpretq_s16_s32(v.v4s)));
    return GSVector4i(vcombine_s32(vpadd_s32(vget_low_s32(low), vget_high_s32(low)),
                                   vpadd_s32(vget_low_s32(high), vget_high_s32(high))));
#endif
  }

  ALWAYS_INLINE GSVector4i addp_s32() const
  {
#ifdef CPU_ARCH_ARM64
    return GSVector4i(vpaddq_s32(v4s, v4s));
#else
    const int32x2_t res = vpadd_s32(vget_low_s32(v4s), vget_high_s32(v4s));
    return GSVector4i(vcombine_s32(res, res));
#endif
  }

  ALWAYS_INLINE s32 addv_s32() const
  {
#ifdef CPU_ARCH_ARM64
    return vaddvq_s32(v4s);
#else
    const int32x2_t res = vpadd_s32(vget_low_s32(v4s), vget_high_s32(v4s));
    return vget_lane_s32(res, 0) + vget_lane_s32(res, 1);
#endif
  }

#ifdef CPU_ARCH_ARM64

  ALWAYS_INLINE u8 minv_u8() const { return vminvq_u8(vreinterpretq_u8_s32(v4s)); }

  ALWAYS_INLINE u16 maxv_u8() const { return vmaxvq_u8(vreinterpretq_u8_s32(v4s)); }

  ALWAYS_INLINE u16 minv_u16() const { return vminvq_u16(vreinterpretq_u16_s32(v4s)); }

  ALWAYS_INLINE u16 maxv_u16() const { return vmaxvq_u16(vreinterpretq_u16_s32(v4s)); }

  ALWAYS_INLINE s32 minv_s32() const { return vminvq_s32(v4s); }

  ALWAYS_INLINE u32 minv_u32() const { return vminvq_u32(vreinterpretq_u32_s32(v4s)); }

  ALWAYS_INLINE s32 maxv_s32() const { return vmaxvq_s32(v4s); }

  ALWAYS_INLINE u32 maxv_u32() const { return vmaxvq_u32(vreinterpretq_u32_s32(v4s)); }

#else

  ALWAYS_INLINE u8 minv_u8() const
  {
    uint8x8_t vmin = vmin_u8(vget_low_u8(vreinterpretq_u8_s32(v4s)), vget_high_u8(vreinterpretq_u8_s32(v4s)));
    vmin = vmin_u8(vmin, vreinterpret_u8_s32(vdup_lane_s32(vreinterpret_s32_u8(vmin), 1)));
    return static_cast<u8>(
      std::min(static_cast<u32>(vget_lane_u8(vmin, 0)),
               std::min(static_cast<u32>(vget_lane_u8(vmin, 1)),
                        std::min(static_cast<u32>(vget_lane_u8(vmin, 2)), static_cast<u32>(vget_lane_u8(vmin, 3))))));
  }

  ALWAYS_INLINE u16 maxv_u8() const
  {
    uint8x8_t vmax = vmax_u8(vget_low_u8(vreinterpretq_u8_s32(v4s)), vget_high_u8(vreinterpretq_u8_s32(v4s)));
    vmax = vmax_u8(vmax, vreinterpret_u8_s32(vdup_lane_s32(vreinterpret_s32_u8(vmax), 1)));
    return static_cast<u8>(
      std::max(static_cast<u32>(vget_lane_u8(vmax, 0)),
               std::max(static_cast<u32>(vget_lane_u8(vmax, 1)),
                        std::max(static_cast<u32>(vget_lane_u8(vmax, 2)), static_cast<u32>(vget_lane_u8(vmax, 3))))));
  }

  ALWAYS_INLINE u16 minv_u16() const
  {
    uint16x4_t vmin = vmin_u16(vget_low_u16(vreinterpretq_u16_s32(v4s)), vget_high_u16(vreinterpretq_u16_s32(v4s)));
    vmin = vmin_u16(vmin, vreinterpret_u16_s32(vdup_lane_s32(vreinterpret_s32_u16(vmin), 1)));
    return static_cast<u16>(
      std::min(static_cast<u32>(vget_lane_u16(vmin, 0)), static_cast<u32>(vget_lane_u16(vmin, 1))));
  }

  ALWAYS_INLINE u16 maxv_u16() const
  {
    uint16x4_t vmax = vmax_u16(vget_low_u16(vreinterpretq_u16_s32(v4s)), vget_high_u16(vreinterpretq_u16_s32(v4s)));
    vmax = vmax_u16(vmax, vreinterpret_u16_s32(vdup_lane_s32(vreinterpret_s32_u16(vmax), 1)));
    return static_cast<u16>(
      std::max<u32>(static_cast<u32>(vget_lane_u16(vmax, 0)), static_cast<u32>(vget_lane_u16(vmax, 1))));
  }

  ALWAYS_INLINE s32 minv_s32() const
  {
    int32x2_t vmin = vmin_s32(vget_low_s32(v4s), vget_high_s32(v4s));
    return std::min<s32>(vget_lane_s32(vmin, 0), vget_lane_s32(vmin, 1));
  }

  ALWAYS_INLINE u32 minv_u32() const
  {
    uint32x2_t vmin = vmin_u32(vget_low_u32(vreinterpretq_u32_s32(v4s)), vget_high_u32(vreinterpretq_u32_s32(v4s)));
   1400     return std::min<u32>(vget_lane_u32(vmin, 0), vget_lane_u32(vmin, 1));
   1401   }
   1402 
   1403   ALWAYS_INLINE s32 maxv_s32() const
   1404   {
   1405     int32x2_t vmax = vmax_s32(vget_low_s32(v4s), vget_high_s32(v4s));
   1406     return std::max<s32>(vget_lane_s32(vmax, 0), vget_lane_s32(vmax, 1));
   1407   }
   1408 
   1409   ALWAYS_INLINE u32 maxv_u32() const
   1410   {
   1411     uint32x2_t vmax = vmax_u32(vget_low_u32(vreinterpretq_u32_s32(v4s)), vget_high_u32(vreinterpretq_u32_s32(v4s)));
   1412     return std::max<u32>(vget_lane_u32(vmax, 0), vget_lane_u32(vmax, 1));
   1413   }
   1414 
   1415 #endif
   1416 
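          // Clamps each signed 16-bit lane to [0, 255]: pu16() saturates to u8, upl8() widens back with zeros.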
   1417   ALWAYS_INLINE GSVector4i clamp8() const { return pu16().upl8(); }
   1418 
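          // Byte select: where the corresponding mask byte has its sign bit set, the byte comes from a, otherwise from this.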
   1419   ALWAYS_INLINE GSVector4i blend8(const GSVector4i& a, const GSVector4i& mask) const
   1420   {
   1421     uint8x16_t mask2 = vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_s32(mask.v4s), 7));
   1422     return GSVector4i(vreinterpretq_s32_u8(vbslq_u8(mask2, vreinterpretq_u8_s32(a.v4s), vreinterpretq_u8_s32(v4s))));
   1423   }
   1424 
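          // Immediate 16-bit blend: builds an all-ones/all-zeros lane mask from the template bits and bit-selects a over this.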
   1425   template<int mask>
   1426   ALWAYS_INLINE GSVector4i blend16(const GSVector4i& a) const
   1427   {
   1428     static constexpr const uint16_t _mask[8] = {
   1429       ((mask) & (1 << 0)) ? (uint16_t)-1 : 0x0, ((mask) & (1 << 1)) ? (uint16_t)-1 : 0x0,
   1430       ((mask) & (1 << 2)) ? (uint16_t)-1 : 0x0, ((mask) & (1 << 3)) ? (uint16_t)-1 : 0x0,
   1431       ((mask) & (1 << 4)) ? (uint16_t)-1 : 0x0, ((mask) & (1 << 5)) ? (uint16_t)-1 : 0x0,
   1432       ((mask) & (1 << 6)) ? (uint16_t)-1 : 0x0, ((mask) & (1 << 7)) ? (uint16_t)-1 : 0x0};
   1433     return GSVector4i(
   1434       vreinterpretq_s32_u16(vbslq_u16(vld1q_u16(_mask), vreinterpretq_u16_s32(a.v4s), vreinterpretq_u16_s32(v4s))));
   1435   }
   1436 
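          // 32-bit immediate blend via blend16: (mask & bit) * 3 duplicates each selection bit across a pair of 16-bit lanes.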
   1437   template<int mask>
   1438   ALWAYS_INLINE GSVector4i blend32(const GSVector4i& v) const
   1439   {
   1440     constexpr int bit3 = ((mask & 8) * 3) << 3;
   1441     constexpr int bit2 = ((mask & 4) * 3) << 2;
   1442     constexpr int bit1 = ((mask & 2) * 3) << 1;
   1443     constexpr int bit0 = (mask & 1) * 3;
   1444     return blend16<bit3 | bit2 | bit1 | bit0>(v);
   1445   }
   1446 
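          // (this & ~mask) | (v & mask): bits come from v where mask is set, from this elsewhere.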
   1447   ALWAYS_INLINE GSVector4i blend(const GSVector4i& v, const GSVector4i& mask) const
   1448   {
   1449     return GSVector4i(
   1450       vreinterpretq_s32_s8(vorrq_s8(vbicq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(mask.v4s)),
   1451                                     vandq_s8(vreinterpretq_s8_s32(mask.v4s), vreinterpretq_s8_s32(v.v4s)))));
   1452   }
   1453 
   1454   ALWAYS_INLINE GSVector4i mix16(const GSVector4i& v) const { return blend16<0xaa>(v); }
   1455 
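          // Per-byte table lookup: result byte i is this[mask[i]]; out-of-range indices produce zero.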
   1456   ALWAYS_INLINE GSVector4i shuffle8(const GSVector4i& mask) const
   1457   {
   1458 #ifdef CPU_ARCH_ARM64
   1459     return GSVector4i(vreinterpretq_s32_s8(vqtbl1q_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_u8_s32(mask.v4s))));
   1460 #else
   1461     int8x8x2_t split = {vget_low_s8(vreinterpretq_s8_s32(v4s)), vget_high_s8(vreinterpretq_s8_s32(v4s))};
   1462     return GSVector4i(vreinterpretq_s32_s8(vcombine_s8(vtbl2_s8(split, vget_low_s8(vreinterpretq_s8_s32(mask.v4s))),
   1463                                                        vtbl2_s8(split, vget_high_s8(vreinterpretq_s8_s32(mask.v4s))))));
   1464 #endif
   1465   }
   1466 
   1467   ALWAYS_INLINE GSVector4i ps16(const GSVector4i& v) const
   1468   {
   1469     return GSVector4i(vreinterpretq_s32_s8(
   1470       vcombine_s8(vqmovn_s16(vreinterpretq_s16_s32(v4s)), vqmovn_s16(vreinterpretq_s16_s32(v.v4s)))));
   1471   }
   1472 
   1473   ALWAYS_INLINE GSVector4i ps16() const
   1474   {
   1475     return GSVector4i(vreinterpretq_s32_s8(
   1476       vcombine_s8(vqmovn_s16(vreinterpretq_s16_s32(v4s)), vqmovn_s16(vreinterpretq_s16_s32(v4s)))));
   1477   }
   1478 
   1479   ALWAYS_INLINE GSVector4i pu16(const GSVector4i& v) const
   1480   {
   1481     return GSVector4i(vreinterpretq_s32_u8(
   1482       vcombine_u8(vqmovun_s16(vreinterpretq_s16_s32(v4s)), vqmovun_s16(vreinterpretq_s16_s32(v.v4s)))));
   1483   }
   1484 
   1485   ALWAYS_INLINE GSVector4i pu16() const
   1486   {
   1487     return GSVector4i(vreinterpretq_s32_u8(
   1488       vcombine_u8(vqmovun_s16(vreinterpretq_s16_s32(v4s)), vqmovun_s16(vreinterpretq_s16_s32(v4s)))));
   1489   }
   1490 
   1491   ALWAYS_INLINE GSVector4i ps32(const GSVector4i& v) const
   1492   {
   1493     return GSVector4i(vreinterpretq_s32_s16(vcombine_s16(vqmovn_s32(v4s), vqmovn_s32(v.v4s))));
   1494   }
   1495 
   1496   ALWAYS_INLINE GSVector4i ps32() const
   1497   {
   1498     return GSVector4i(vreinterpretq_s32_s16(vcombine_s16(vqmovn_s32(v4s), vqmovn_s32(v4s))));
   1499   }
   1500 
   1501   ALWAYS_INLINE GSVector4i pu32(const GSVector4i& v) const
   1502   {
   1503     return GSVector4i(vreinterpretq_s32_u16(vcombine_u16(vqmovun_s32(v4s), vqmovun_s32(v.v4s))));
   1504   }
   1505 
   1506   ALWAYS_INLINE GSVector4i pu32() const
   1507   {
   1508     return GSVector4i(vreinterpretq_s32_u16(vcombine_u16(vqmovun_s32(v4s), vqmovun_s32(v4s))));
   1509   }
   1510 
   1511 #ifdef CPU_ARCH_ARM64
   1512 
   1513   ALWAYS_INLINE GSVector4i upl8(const GSVector4i& v) const
   1514   {
   1515     return GSVector4i(vreinterpretq_s32_s8(vzip1q_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
   1516   }
   1517 
   1518   ALWAYS_INLINE GSVector4i uph8(const GSVector4i& v) const
   1519   {
   1520     return GSVector4i(vreinterpretq_s32_s8(vzip2q_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
   1521   }
   1522 
   1523   ALWAYS_INLINE GSVector4i upl16(const GSVector4i& v) const
   1524   {
   1525     return GSVector4i(vreinterpretq_s32_s16(vzip1q_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
   1526   }
   1527 
   1528   ALWAYS_INLINE GSVector4i uph16(const GSVector4i& v) const
   1529   {
   1530     return GSVector4i(vreinterpretq_s32_s16(vzip2q_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
   1531   }
   1532 
   1533   ALWAYS_INLINE GSVector4i upl32(const GSVector4i& v) const { return GSVector4i(vzip1q_s32(v4s, v.v4s)); }
   1534 
   1535   ALWAYS_INLINE GSVector4i uph32(const GSVector4i& v) const { return GSVector4i(vzip2q_s32(v4s, v.v4s)); }
   1536 
   1537   ALWAYS_INLINE GSVector4i upl64(const GSVector4i& v) const
   1538   {
   1539     return GSVector4i(vreinterpretq_s32_s64(
   1540       vcombine_s64(vget_low_s64(vreinterpretq_s64_s32(v4s)), vget_low_s64(vreinterpretq_s64_s32(v.v4s)))));
   1541   }
   1542 
   1543   ALWAYS_INLINE GSVector4i uph64(const GSVector4i& v) const
   1544   {
   1545     return GSVector4i(vreinterpretq_s32_s64(
   1546       vcombine_s64(vget_high_s64(vreinterpretq_s64_s32(v4s)), vget_high_s64(vreinterpretq_s64_s32(v.v4s)))));
   1547   }
   1548 
   1549   ALWAYS_INLINE GSVector4i upl8() const
   1550   {
   1551     return GSVector4i(vreinterpretq_s32_s8(vzip1q_s8(vreinterpretq_s8_s32(v4s), vdupq_n_s8(0))));
   1552   }
   1553 
   1554   ALWAYS_INLINE GSVector4i uph8() const
   1555   {
   1556     return GSVector4i(vreinterpretq_s32_s8(vzip2q_s8(vreinterpretq_s8_s32(v4s), vdupq_n_s8(0))));
   1557   }
   1558 
   1559   ALWAYS_INLINE GSVector4i upl16() const
   1560   {
   1561     return GSVector4i(vreinterpretq_s32_s16(vzip1q_s16(vreinterpretq_s16_s32(v4s), vdupq_n_s16(0))));
   1562   }
   1563 
   1564   ALWAYS_INLINE GSVector4i uph16() const
   1565   {
   1566     return GSVector4i(vreinterpretq_s32_s16(vzip2q_s16(vreinterpretq_s16_s32(v4s), vdupq_n_s16(0))));
   1567   }
   1568 
   1569   ALWAYS_INLINE GSVector4i upl32() const { return GSVector4i(vzip1q_s32(v4s, vdupq_n_s32(0))); }
   1570 
   1571   ALWAYS_INLINE GSVector4i uph32() const { return GSVector4i(vzip2q_s32(v4s, vdupq_n_s32(0))); }
   1572 
   1573   ALWAYS_INLINE GSVector4i upl64() const
   1574   {
   1575     return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vget_low_s64(vreinterpretq_s64_s32(v4s)), vdup_n_s64(0))));
   1576   }
   1577 
   1578   ALWAYS_INLINE GSVector4i uph64() const
   1579   {
   1580     return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vget_high_s64(vreinterpretq_s64_s32(v4s)), vdup_n_s64(0))));
   1581   }
   1582 
   1583 #else
   1584 
   1585   ALWAYS_INLINE GSVector4i upl8(const GSVector4i& v) const
   1586   {
   1587     const int8x8x2_t res = vzip_s8(vget_low_s8(vreinterpretq_s8_s32(v4s)), vget_low_s8(vreinterpretq_s8_s32(v.v4s)));
   1588     return GSVector4i(vreinterpretq_s32_s8(vcombine_s8(res.val[0], res.val[1])));
   1589   }
   1590 
   1591   ALWAYS_INLINE GSVector4i uph8(const GSVector4i& v) const
   1592   {
   1593     const int8x8x2_t res = vzip_s8(vget_high_s8(vreinterpretq_s8_s32(v4s)), vget_high_s8(vreinterpretq_s8_s32(v.v4s)));
   1594     return GSVector4i(vreinterpretq_s32_s8(vcombine_s8(res.val[0], res.val[1])));
   1595   }
   1596 
   1597   ALWAYS_INLINE GSVector4i upl16(const GSVector4i& v) const
   1598   {
   1599     const int16x4x2_t res =
   1600       vzip_s16(vget_low_s16(vreinterpretq_s16_s32(v4s)), vget_low_s16(vreinterpretq_s16_s32(v.v4s)));
   1601     return GSVector4i(vreinterpretq_s32_s16(vcombine_s16(res.val[0], res.val[1])));
   1602   }
   1603 
   1604   ALWAYS_INLINE GSVector4i uph16(const GSVector4i& v) const
   1605   {
   1606     const int16x4x2_t res =
   1607       vzip_s16(vget_high_s16(vreinterpretq_s16_s32(v4s)), vget_high_s16(vreinterpretq_s16_s32(v.v4s)));
   1608     return GSVector4i(vreinterpretq_s32_s16(vcombine_s16(res.val[0], res.val[1])));
   1609   }
   1610 
   1611   ALWAYS_INLINE GSVector4i upl32(const GSVector4i& v) const
   1612   {
   1613     const int32x2x2_t res = vzip_s32(vget_low_s32(v4s), vget_low_s32(v.v4s));
   1614     return GSVector4i(vcombine_s32(res.val[0], res.val[1]));
   1615   }
   1616 
   1617   ALWAYS_INLINE GSVector4i uph32(const GSVector4i& v) const
   1618   {
   1619     const int32x2x2_t res = vzip_s32(vget_high_s32(v4s), vget_high_s32(v.v4s));
   1620     return GSVector4i(vcombine_s32(res.val[0], res.val[1]));
   1621   }
   1622 
   1623   ALWAYS_INLINE GSVector4i upl64(const GSVector4i& v) const
   1624   {
   1625     return GSVector4i(vreinterpretq_s32_s64(
   1626       vcombine_s64(vget_low_s64(vreinterpretq_s64_s32(v4s)), vget_low_s64(vreinterpretq_s64_s32(v.v4s)))));
   1627   }
   1628 
   1629   ALWAYS_INLINE GSVector4i uph64(const GSVector4i& v) const
   1630   {
   1631     return GSVector4i(vreinterpretq_s32_s64(
   1632       vcombine_s64(vget_high_s64(vreinterpretq_s64_s32(v4s)), vget_high_s64(vreinterpretq_s64_s32(v.v4s)))));
   1633   }
   1634 
   1635   ALWAYS_INLINE GSVector4i upl8() const { return upl8(GSVector4i(vdupq_n_s32(0))); }
   1636 
   1637   ALWAYS_INLINE GSVector4i uph8() const { return uph8(GSVector4i(vdupq_n_s32(0))); }
   1638 
   1639   ALWAYS_INLINE GSVector4i upl16() const { return upl16(GSVector4i(vdupq_n_s32(0))); }
   1640 
   1641   ALWAYS_INLINE GSVector4i uph16() const { return uph16(GSVector4i(vdupq_n_s32(0))); }
   1642 
   1643   ALWAYS_INLINE GSVector4i upl32() const { return upl32(GSVector4i(vdupq_n_s32(0))); }
   1644 
   1645   ALWAYS_INLINE GSVector4i uph32() const { return uph32(GSVector4i(vdupq_n_s32(0))); }
   1646 
   1647   ALWAYS_INLINE GSVector4i upl64() const
   1648   {
   1649     return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vget_low_s64(vreinterpretq_s64_s32(v4s)), vdup_n_s64(0))));
   1650   }
   1651 
   1652   ALWAYS_INLINE GSVector4i uph64() const
   1653   {
   1654     return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vget_high_s64(vreinterpretq_s64_s32(v4s)), vdup_n_s64(0))));
   1655   }
   1656 #endif
   1657 
   1658   ALWAYS_INLINE GSVector4i s8to16() const
   1659   {
   1660     return GSVector4i(vreinterpretq_s32_s16(vmovl_s8(vget_low_s8(vreinterpretq_s8_s32(v4s)))));
   1661   }
   1662 
   1663   ALWAYS_INLINE GSVector4i u8to16() const
   1664   {
   1665     return GSVector4i(vreinterpretq_s32_u16(vmovl_u8(vget_low_u8(vreinterpretq_u8_s32(v4s)))));
   1666   }
   1667 
   1668   ALWAYS_INLINE GSVector4i s8to32() const
   1669   {
   1670     return GSVector4i(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(vreinterpretq_s8_s32(v4s))))));
   1671   }
   1672 
   1673   ALWAYS_INLINE GSVector4i u8to32() const
   1674   {
   1675     return GSVector4i(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vreinterpretq_u8_s32(v4s)))))));
   1676   }
   1677 
   1678   ALWAYS_INLINE GSVector4i s8to64() const
   1679   {
   1680     return GSVector4i(vreinterpretq_s32_s64(
   1681       vmovl_s32(vget_low_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(vreinterpretq_s8_s32(v4s)))))))));
   1682   }
   1683 
   1684   ALWAYS_INLINE GSVector4i u8to64() const
   1685   {
   1686     return GSVector4i(vreinterpretq_s32_u64(
   1687       vmovl_u32(vget_low_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vreinterpretq_u8_s32(v4s)))))))));
   1688   }
   1689 
   1690   ALWAYS_INLINE GSVector4i s16to32() const { return GSVector4i(vmovl_s16(vget_low_s16(vreinterpretq_s16_s32(v4s)))); }
   1691 
   1692   ALWAYS_INLINE GSVector4i u16to32() const
   1693   {
   1694     return GSVector4i(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vreinterpretq_u16_s32(v4s)))));
   1695   }
   1696 
   1697   ALWAYS_INLINE GSVector4i s16to64() const
   1698   {
   1699     return GSVector4i(
   1700       vreinterpretq_s32_s64(vmovl_s32(vget_low_s32(vmovl_s16(vget_low_s16(vreinterpretq_s16_s32(v4s)))))));
   1701   }
   1702 
   1703   ALWAYS_INLINE GSVector4i u16to64() const
   1704   {
   1705     return GSVector4i(
   1706       vreinterpretq_s32_u64(vmovl_u32(vget_low_u32(vmovl_u16(vget_low_u16(vreinterpretq_u16_s32(v4s)))))));
   1707   }
   1708 
   1709   ALWAYS_INLINE GSVector4i s32to64() const { return GSVector4i(vreinterpretq_s32_s64(vmovl_s32(vget_low_s32(v4s)))); }
   1710 
   1711   ALWAYS_INLINE GSVector4i u32to64() const
   1712   {
   1713     return GSVector4i(vreinterpretq_s32_u64(vmovl_u32(vget_low_u32(vreinterpretq_u32_s32(v4s)))));
   1714   }
   1715 
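          // Whole-register byte shifts built from vext: srl<i>/sll<i> move the 128-bit value right/left by i bytes, filling with zeros.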
   1716   template<int i>
   1717   ALWAYS_INLINE GSVector4i srl() const
   1718   {
   1719     return GSVector4i(vreinterpretq_s32_s8(vextq_s8(vreinterpretq_s8_s32(v4s), vdupq_n_s8(0), i)));
   1720   }
   1721 
   1722   template<int i>
   1723   ALWAYS_INLINE GSVector4i srl(const GSVector4i& v)
   1724   {
   1725     if constexpr (i >= 16)
   1726       return GSVector4i(vreinterpretq_s32_u8(vextq_u8(vreinterpretq_u8_s32(v.v4s), vdupq_n_u8(0), i - 16)));
   1727     else
   1728       return GSVector4i(vreinterpretq_s32_u8(vextq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s), i)));
   1729   }
   1730 
   1731   template<int i>
   1732   ALWAYS_INLINE GSVector4i sll() const
   1733   {
   1734     return GSVector4i(vreinterpretq_s32_s8(vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_s32(v4s), 16 - i)));
   1735   }
   1736 
   1737   template<int i>
   1738   ALWAYS_INLINE GSVector4i sll16() const
   1739   {
   1740     return GSVector4i(vreinterpretq_s32_s16(vshlq_n_s16(vreinterpretq_s16_s32(v4s), i)));
   1741   }
   1742 
   1743   ALWAYS_INLINE GSVector4i sll16(s32 i) const
   1744   {
   1745     return GSVector4i(vreinterpretq_s32_s16(vshlq_s16(vreinterpretq_s16_s32(v4s), vdupq_n_s16(i))));
   1746   }
   1747 
   1748   ALWAYS_INLINE GSVector4i sllv16(const GSVector4i& v) const
   1749   {
   1750     return GSVector4i(vreinterpretq_s32_s16(vshlq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
   1751   }
   1752 
   1753   template<int i>
   1754   ALWAYS_INLINE GSVector4i srl16() const
   1755   {
   1756     return GSVector4i(vreinterpretq_s32_u16(vshrq_n_u16(vreinterpretq_u16_s32(v4s), i)));
   1757   }
   1758 
   1759   ALWAYS_INLINE GSVector4i srl16(s32 i) const
   1760   {
   1761     return GSVector4i(vreinterpretq_s32_u16(vshlq_u16(vreinterpretq_u16_s32(v4s), vdupq_n_s16(-i))));
   1762   }
   1763 
   1764   ALWAYS_INLINE GSVector4i srlv16(const GSVector4i& v) const
   1765   {
   1766     return GSVector4i(
   1767       vreinterpretq_s32_u16(vshlq_u16(vreinterpretq_u16_s32(v4s), vnegq_s16(vreinterpretq_s16_s32(v.v4s)))));
   1768   }
   1769 
   1770   template<int i>
   1771   ALWAYS_INLINE GSVector4i sra16() const
   1772   {
   1773     constexpr int count = (i & ~15) ? 15 : i;
   1774     return GSVector4i(vreinterpretq_s32_s16(vshrq_n_s16(vreinterpretq_s16_s32(v4s), count)));
   1775   }
   1776 
   1777   ALWAYS_INLINE GSVector4i sra16(s32 i) const
   1778   {
   1779     return GSVector4i(vreinterpretq_s32_s16(vshlq_s16(vreinterpretq_s16_s32(v4s), vdupq_n_s16(-i))));
   1780   }
   1781 
   1782   ALWAYS_INLINE GSVector4i srav16(const GSVector4i& v) const
   1783   {
   1784     return GSVector4i(
   1785       vreinterpretq_s32_s16(vshlq_s16(vreinterpretq_s16_s32(v4s), vnegq_s16(vreinterpretq_s16_s32(v.v4s)))));
   1786   }
   1787 
   1788   template<int i>
   1789   ALWAYS_INLINE GSVector4i sll32() const
   1790   {
   1791     return GSVector4i(vshlq_n_s32(v4s, i));
   1792   }
   1793 
   1794   ALWAYS_INLINE GSVector4i sll32(s32 i) const { return GSVector4i(vshlq_s32(v4s, vdupq_n_s32(i))); }
   1795 
   1796   ALWAYS_INLINE GSVector4i sllv32(const GSVector4i& v) const { return GSVector4i(vshlq_s32(v4s, v.v4s)); }
   1797 
   1798   template<int i>
   1799   ALWAYS_INLINE GSVector4i srl32() const
   1800   {
   1801     return GSVector4i(vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(v4s), i)));
   1802   }
   1803 
   1804   ALWAYS_INLINE GSVector4i srl32(s32 i) const
   1805   {
   1806     return GSVector4i(vreinterpretq_s32_u32(vshlq_u32(vreinterpretq_u32_s32(v4s), vdupq_n_s32(-i))));
   1807   }
   1808 
   1809   ALWAYS_INLINE GSVector4i srlv32(const GSVector4i& v) const
   1810   {
   1811     return GSVector4i(vreinterpretq_s32_u32(vshlq_u32(vreinterpretq_u32_s32(v4s), vnegq_s32(v.v4s))));
   1812   }
   1813 
   1814   template<int i>
   1815   ALWAYS_INLINE GSVector4i sra32() const
   1816   {
   1817     return GSVector4i(vshrq_n_s32(v4s, i));
   1818   }
   1819 
   1820   ALWAYS_INLINE GSVector4i sra32(s32 i) const { return GSVector4i(vshlq_s32(v4s, vdupq_n_s32(-i))); }
   1821 
   1822   ALWAYS_INLINE GSVector4i srav32(const GSVector4i& v) const
   1823   {
   1824     return GSVector4i(vshlq_s32(v4s, vnegq_s32(v.v4s)));
   1825   }
   1826 
   1827   template<int i>
   1828   ALWAYS_INLINE GSVector4i sll64() const
   1829   {
   1830     return GSVector4i(vreinterpretq_s32_s64(vshlq_n_s64(vreinterpretq_s64_s32(v4s), i)));
   1831   }
   1832 
   1833   ALWAYS_INLINE GSVector4i sll64(s32 i) const
   1834   {
   1835     return GSVector4i(vreinterpretq_s32_s64(vshlq_s64(vreinterpretq_s64_s32(v4s), vdupq_n_s64(i))));
   1836   }
   1837 
   1838   ALWAYS_INLINE GSVector4i sllv64(const GSVector4i& v) const
   1839   {
   1840     return GSVector4i(vreinterpretq_s32_s64(vshlq_s64(vreinterpretq_s64_s32(v4s), vreinterpretq_s64_s32(v.v4s))));
   1841   }
   1842 
   1843   template<int i>
   1844   ALWAYS_INLINE GSVector4i sra64() const
   1845   {
   1846     return GSVector4i(vreinterpretq_s32_s64(vshrq_n_s64(vreinterpretq_s64_s32(v4s), i)));
   1847   }
   1848 
   1849   ALWAYS_INLINE GSVector4i sra64(s32 i) const
   1850   {
   1851     return GSVector4i(vreinterpretq_s32_s64(vshlq_s64(vreinterpretq_s64_s32(v4s), vdupq_n_s64(-i))));
   1852   }
   1853 
   1854 #ifdef CPU_ARCH_ARM64
   1855   // not on arm32, hopefully we can do without
   1856   ALWAYS_INLINE GSVector4i srav64(const GSVector4i& v) const
   1857   {
   1858     return GSVector4i(
   1859       vreinterpretq_s32_s64(vshlq_s64(vreinterpretq_s64_s32(v4s), vnegq_s64(vreinterpretq_s64_s32(v.v4s)))));
   1860   }
   1861 #endif
   1862 
   1863   template<int i>
   1864   ALWAYS_INLINE GSVector4i srl64() const
   1865   {
   1866     return GSVector4i(vreinterpretq_s32_u64(vshrq_n_u64(vreinterpretq_u64_s32(v4s), i)));
   1867   }
   1868 
   1869   ALWAYS_INLINE GSVector4i srl64(s32 i) const
   1870   {
   1871     return GSVector4i(vreinterpretq_s32_u64(vshlq_u64(vreinterpretq_u64_s32(v4s), vdupq_n_s64(-i))));
   1872   }
   1873 
   1874 #ifdef CPU_ARCH_ARM64
   1875   ALWAYS_INLINE GSVector4i srlv64(const GSVector4i& v) const
   1876   {
   1877     return GSVector4i(
   1878       vreinterpretq_s32_u64(vshlq_u64(vreinterpretq_u64_s32(v4s), vnegq_s64(vreinterpretq_s64_s32(v.v4s)))));
   1879   }
   1880 #endif
   1881 
   1882   ALWAYS_INLINE GSVector4i add8(const GSVector4i& v) const
   1883   {
   1884     return GSVector4i(vreinterpretq_s32_s8(vaddq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
   1885   }
   1886 
   1887   ALWAYS_INLINE GSVector4i add16(const GSVector4i& v) const
   1888   {
   1889     return GSVector4i(vreinterpretq_s32_s16(vaddq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
   1890   }
   1891 
   1892   ALWAYS_INLINE GSVector4i add32(const GSVector4i& v) const { return GSVector4i(vaddq_s32(v4s, v.v4s)); }
   1893 
   1894   ALWAYS_INLINE GSVector4i adds8(const GSVector4i& v) const
   1895   {
   1896     return GSVector4i(vreinterpretq_s32_s8(vqaddq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
   1897   }
   1898 
   1899   ALWAYS_INLINE GSVector4i adds16(const GSVector4i& v) const
   1900   {
   1901     return GSVector4i(vreinterpretq_s32_s16(vqaddq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
   1902   }
   1903 
   1904   ALWAYS_INLINE GSVector4i hadds16(const GSVector4i& v) const
   1905   {
   1906     // can't use vpaddq_s16() here, because we need saturation.
   1907     // return GSVector4i(vreinterpretq_s32_s16(vpaddq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
   1908     const int16x8_t a = vreinterpretq_s16_s32(v4s);
   1909     const int16x8_t b = vreinterpretq_s16_s32(v.v4s);
   1910 #ifdef CPU_ARCH_ARM64
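            // vuzp1q/vuzp2q gather the even/odd 16-bit lanes of {a, b}, so the saturating add sums each adjacent pair.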
   1911     return GSVector4i(vreinterpretq_s32_s16(vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))));
   1912 #else
   1913     // sse2neon again
   1914     int16x8_t ab0246 = vcombine_s16(vmovn_s32(vreinterpretq_s32_s16(a)), vmovn_s32(vreinterpretq_s32_s16(b)));
   1915     int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(vreinterpretq_s32_s16(a), 16), vshrn_n_s32(vreinterpretq_s32_s16(b), 16));
   1916     return GSVector4i(vreinterpretq_s32_s16(vqaddq_s16(ab0246, ab1357)));
   1917 #endif
   1918   }
   1919 
   1920   ALWAYS_INLINE GSVector4i addus8(const GSVector4i& v) const
   1921   {
   1922     return GSVector4i(vreinterpretq_s32_u8(vqaddq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s))));
   1923   }
   1924 
   1925   ALWAYS_INLINE GSVector4i addus16(const GSVector4i& v) const
   1926   {
   1927     return GSVector4i(vreinterpretq_s32_u16(vqaddq_u16(vreinterpretq_u16_s32(v4s), vreinterpretq_u16_s32(v.v4s))));
   1928   }
   1929 
   1930   ALWAYS_INLINE GSVector4i sub8(const GSVector4i& v) const
   1931   {
   1932     return GSVector4i(vreinterpretq_s32_s8(vsubq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
   1933   }
   1934 
   1935   ALWAYS_INLINE GSVector4i sub16(const GSVector4i& v) const
   1936   {
   1937     return GSVector4i(vreinterpretq_s32_s16(vsubq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
   1938   }
   1939 
   1940   ALWAYS_INLINE GSVector4i sub32(const GSVector4i& v) const { return GSVector4i(vsubq_s32(v4s, v.v4s)); }
   1941 
   1942   ALWAYS_INLINE GSVector4i subs8(const GSVector4i& v) const
   1943   {
   1944     return GSVector4i(vreinterpretq_s32_s8(vqsubq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
   1945   }
   1946 
   1947   ALWAYS_INLINE GSVector4i subs16(const GSVector4i& v) const
   1948   {
   1949     return GSVector4i(vreinterpretq_s32_s16(vqsubq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
   1950   }
   1951 
   1952   ALWAYS_INLINE GSVector4i subus8(const GSVector4i& v) const
   1953   {
   1954     return GSVector4i(vreinterpretq_s32_u8(vqsubq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s))));
   1955   }
   1956 
   1957   ALWAYS_INLINE GSVector4i subus16(const GSVector4i& v) const
   1958   {
   1959     return GSVector4i(vreinterpretq_s32_u16(vqsubq_u16(vreinterpretq_u16_s32(v4s), vreinterpretq_u16_s32(v.v4s))));
   1960   }
   1961 
   1962   ALWAYS_INLINE GSVector4i avg8(const GSVector4i& v) const
   1963   {
   1964     return GSVector4i(vreinterpretq_s32_u8(vrhaddq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s))));
   1965   }
   1966 
   1967   ALWAYS_INLINE GSVector4i avg16(const GSVector4i& v) const
   1968   {
   1969     return GSVector4i(vreinterpretq_s32_u16(vrhaddq_u16(vreinterpretq_u16_s32(v4s), vreinterpretq_u16_s32(v.v4s))));
   1970   }
   1971 
   1972   ALWAYS_INLINE GSVector4i mul16hs(const GSVector4i& v) const
   1973   {
   1974     // from sse2neon
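            // widening multiplies, then vuzpq keeps the upper 16 bits of each 32-bit product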
   1975     int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_s32(v4s));
   1976     int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_s32(v.v4s));
   1977     int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
   1978     int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_s32(v4s));
   1979     int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_s32(v.v4s));
   1980     int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
   1981     uint16x8x2_t r = vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
   1982     return GSVector4i(vreinterpretq_s32_u16(r.val[1]));
   1983   }
   1984 
   1985   ALWAYS_INLINE GSVector4i mul16l(const GSVector4i& v) const
   1986   {
   1987     return GSVector4i(vreinterpretq_s32_s16(vmulq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
   1988   }
   1989 
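          // Rounding high-half multiply: vrshrn_n_s32(..., 15) rounds and keeps bits 15..30 of each 32-bit product.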
   1990   ALWAYS_INLINE GSVector4i mul16hrs(const GSVector4i& v) const
   1991   {
   1992     int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_s32(v4s)), vget_low_s16(vreinterpretq_s16_s32(v.v4s)));
   1993     int32x4_t mul_hi =
   1994       vmull_s16(vget_high_s16(vreinterpretq_s16_s32(v4s)), vget_high_s16(vreinterpretq_s16_s32(v.v4s)));
   1995     int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
   1996     int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);
   1997     return GSVector4i(vreinterpretq_s32_s16(vcombine_s16(narrow_lo, narrow_hi)));
   1998   }
   1999 
   2000   ALWAYS_INLINE GSVector4i mul32l(const GSVector4i& v) const { return GSVector4i(vmulq_s32(v4s, v.v4s)); }
   2001 
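          // Full-vector equality: the XOR of the two vectors is all-zero only when every lane matches.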
   2002   ALWAYS_INLINE bool eq(const GSVector4i& v) const
   2003   {
   2004     const int32x4_t res = veorq_s32(v4s, v.v4s);
   2005 #ifdef CPU_ARCH_ARM64
   2006     return (vmaxvq_u32(vreinterpretq_u32_s32(res)) == 0);
   2007 #else
   2008     const int32x2_t paired = vorr_s32(vget_low_s32(res), vget_high_s32(res));
   2009     return (vget_lane_u64(vreinterpret_u64_s32(paired), 0) == 0);
   2010 #endif
   2011   }
   2012 
   2013   ALWAYS_INLINE GSVector4i eq8(const GSVector4i& v) const
   2014   {
   2015     return GSVector4i(vreinterpretq_s32_u8(vceqq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
   2016   }
   2017 
   2018   ALWAYS_INLINE GSVector4i eq16(const GSVector4i& v) const
   2019   {
   2020     return GSVector4i(vreinterpretq_s32_u16(vceqq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
   2021   }
   2022 
   2023   ALWAYS_INLINE GSVector4i eq32(const GSVector4i& v) const
   2024   {
   2025     return GSVector4i(vreinterpretq_s32_u32(vceqq_s32(v4s, v.v4s)));
   2026   }
   2027 
   2028 #ifdef CPU_ARCH_ARM64
   2029   ALWAYS_INLINE GSVector4i eq64(const GSVector4i& v) const
   2030   {
   2031     return GSVector4i(vreinterpretq_s32_u64(vceqq_s64(vreinterpretq_s64_s32(v4s), vreinterpretq_s64_s32(v.v4s))));
   2032   }
   2033 #endif
   2034 
   2035   ALWAYS_INLINE GSVector4i neq8(const GSVector4i& v) const { return ~eq8(v); }
   2036 
   2037   ALWAYS_INLINE GSVector4i neq16(const GSVector4i& v) const { return ~eq16(v); }
   2038 
   2039   ALWAYS_INLINE GSVector4i neq32(const GSVector4i& v) const { return ~eq32(v); }
   2040 
   2041   ALWAYS_INLINE GSVector4i gt8(const GSVector4i& v) const
   2042   {
   2043     return GSVector4i(vreinterpretq_s32_u8(vcgtq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
   2044   }
   2045 
   2046   ALWAYS_INLINE GSVector4i gt16(const GSVector4i& v) const
   2047   {
   2048     return GSVector4i(vreinterpretq_s32_u16(vcgtq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
   2049   }
   2050 
   2051   ALWAYS_INLINE GSVector4i gt32(const GSVector4i& v) const { return GSVector4i(vcgtq_s32(v4s, v.v4s)); }
   2052 
   2053   ALWAYS_INLINE GSVector4i ge8(const GSVector4i& v) const
   2054   {
   2055     return GSVector4i(vreinterpretq_s32_u8(vcgeq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
   2056   }
   2057   ALWAYS_INLINE GSVector4i ge16(const GSVector4i& v) const
   2058   {
   2059     return GSVector4i(vreinterpretq_s32_u16(vcgeq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
   2060   }
   2061   ALWAYS_INLINE GSVector4i ge32(const GSVector4i& v) const { return GSVector4i(vcgeq_s32(v4s, v.v4s)); }
   2062 
   2063   ALWAYS_INLINE GSVector4i lt8(const GSVector4i& v) const
   2064   {
   2065     return GSVector4i(vreinterpretq_s32_u8(vcltq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
   2066   }
   2067 
   2068   ALWAYS_INLINE GSVector4i lt16(const GSVector4i& v) const
   2069   {
   2070     return GSVector4i(vreinterpretq_s32_u16(vcltq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
   2071   }
   2072 
   2073   ALWAYS_INLINE GSVector4i lt32(const GSVector4i& v) const { return GSVector4i(vcltq_s32(v4s, v.v4s)); }
   2074 
   2075   ALWAYS_INLINE GSVector4i le8(const GSVector4i& v) const
   2076   {
   2077     return GSVector4i(vreinterpretq_s32_u8(vcleq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
   2078   }
   2079   ALWAYS_INLINE GSVector4i le16(const GSVector4i& v) const
   2080   {
   2081     return GSVector4i(vreinterpretq_s32_u16(vcleq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
   2082   }
   2083   ALWAYS_INLINE GSVector4i le32(const GSVector4i& v) const { return GSVector4i(vcleq_s32(v4s, v.v4s)); }
   2084 
   2085   ALWAYS_INLINE GSVector4i andnot(const GSVector4i& v) const { return GSVector4i(vbicq_s32(v4s, v.v4s)); }
   2086 
   2087   ALWAYS_INLINE int mask() const
   2088   {
   2089     // borrowed from sse2neon
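            // successive shift-right-and-accumulate steps gather each byte's sign bit into two packed bytes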
   2090     const uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(vreinterpretq_u8_s32(v4s), 7));
   2091     const uint32x4_t paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
   2092     const uint64x2_t paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
   2093     const uint8x16_t paired64 = vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
   2094     return static_cast<int>(vgetq_lane_u8(paired64, 0) | ((int)vgetq_lane_u8(paired64, 8) << 8));
   2095   }
   2096 
   2097   ALWAYS_INLINE bool alltrue() const
   2098   {
   2099     // MSB should be set in all 8-bit lanes.
   2100 #ifdef CPU_ARCH_ARM64
   2101     return (vminvq_u8(vreinterpretq_u8_s32(v4s)) & 0x80) == 0x80;
   2102 #else
   2103     const uint32x2_t res = vreinterpret_u32_s32(vand_s32(vget_low_s32(v4s), vget_high_s32(v4s)));
   2104     return ((vget_lane_u32(res, 0) & vget_lane_u32(res, 1) & 0x80808080u) == 0x80808080u);
   2105 #endif
   2106   }
   2107 
   2108   ALWAYS_INLINE bool allfalse() const
   2109   {
   2110     // MSB should be clear in all 8-bit lanes.
   2111 #ifdef CPU_ARCH_ARM64
   2112     return (vmaxvq_u8(vreinterpretq_u8_s32(v4s)) & 0x80) != 0x80;
   2113 #else
   2114     const uint32x2_t res = vreinterpret_u32_s32(vand_s32(vget_low_s32(v4s), vget_high_s32(v4s)));
   2115     return ((vget_lane_u32(res, 0) & vget_lane_u32(res, 1) & 0x80808080u) == 0);
   2116 #endif
   2117   }
   2118 
   2119   template<int i>
   2120   ALWAYS_INLINE GSVector4i insert8(int a) const
   2121   {
   2122     return GSVector4i(vreinterpretq_s32_u8(vsetq_lane_u8(a, vreinterpretq_u8_s32(v4s), static_cast<uint8_t>(i))));
   2123   }
   2124 
   2125   template<int i>
   2126   ALWAYS_INLINE int extract8() const
   2127   {
   2128     return vgetq_lane_u8(vreinterpretq_u8_s32(v4s), i);
   2129   }
   2130 
   2131   template<int i>
   2132   ALWAYS_INLINE GSVector4i insert16(int a) const
   2133   {
   2134     return GSVector4i(vreinterpretq_s32_u16(vsetq_lane_u16(a, vreinterpretq_u16_s32(v4s), static_cast<uint16_t>(i))));
   2135   }
   2136 
   2137   template<int i>
   2138   ALWAYS_INLINE int extract16() const
   2139   {
   2140     return vgetq_lane_u16(vreinterpretq_u16_s32(v4s), i);
   2141   }
   2142 
   2143   template<int i>
   2144   ALWAYS_INLINE GSVector4i insert32(int a) const
   2145   {
   2146     return GSVector4i(vsetq_lane_s32(a, v4s, i));
   2147   }
   2148 
   2149   template<int i>
   2150   ALWAYS_INLINE int extract32() const
   2151   {
   2152     return vgetq_lane_s32(v4s, i);
   2153   }
   2154 
   2155   template<int i>
   2156   ALWAYS_INLINE GSVector4i insert64(s64 a) const
   2157   {
   2158     return GSVector4i(vreinterpretq_s32_s64(vsetq_lane_s64(a, vreinterpretq_s64_s32(v4s), i)));
   2159   }
   2160 
   2161   template<int i>
   2162   ALWAYS_INLINE s64 extract64() const
   2163   {
   2164     return vgetq_lane_s64(vreinterpretq_s64_s32(v4s), i);
   2165   }
   2166 
   2167   ALWAYS_INLINE static GSVector4i loadnt(const void* p)
   2168   {
   2169 #if __has_builtin(__builtin_nontemporal_load)
   2170     return GSVector4i(__builtin_nontemporal_load((int32x4_t*)p));
   2171 #else
   2172     return GSVector4i(vreinterpretq_s32_s64(vld1q_s64((int64_t*)p)));
   2173 #endif
   2174   }
   2175 
   2176   ALWAYS_INLINE static GSVector4i load32(const void* p)
   2177   {
   2178     // should be ldr s0, [x0]
   2179     u32 val;
   2180     std::memcpy(&val, p, sizeof(u32));
   2181     return GSVector4i(vsetq_lane_u32(val, vdupq_n_u32(0), 0));
   2182   }
   2183 
   2184   ALWAYS_INLINE static GSVector4i loadl(const void* p)
   2185   {
   2186     return GSVector4i(vcombine_s32(vld1_s32((const int32_t*)p), vcreate_s32(0)));
   2187   }
   2188 
   2189   ALWAYS_INLINE static GSVector4i loadh(const void* p)
   2190   {
   2191     return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vdup_n_s64(0), vld1_s64((int64_t*)p))));
   2192   }
   2193 
   2194   ALWAYS_INLINE static GSVector4i loadh(const void* p, const GSVector4i& v)
   2195   {
   2196     return GSVector4i(
   2197       vreinterpretq_s32_s64(vcombine_s64(vget_low_s64(vreinterpretq_s64_s32(v.v4s)), vld1_s64((int64_t*)p))));
   2198   }
   2199 
   2200   ALWAYS_INLINE static GSVector4i loadh(const GSVector2i& v) { return GSVector4i(vcombine_s32(vcreate_s32(0), v.v2s)); }
   2201 
   2202   ALWAYS_INLINE static GSVector4i load(const void* pl, const void* ph)
   2203   {
   2204     return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vld1_s64((int64_t*)pl), vld1_s64((int64_t*)ph))));
   2205   }
   2206 
   2207   template<bool aligned>
   2208   ALWAYS_INLINE static GSVector4i load(const void* p)
   2209   {
   2210     return GSVector4i(vreinterpretq_s32_s64(vld1q_s64((int64_t*)p)));
   2211   }
   2212 
   2213   ALWAYS_INLINE static GSVector4i load(int i) { return GSVector4i(vsetq_lane_s32(i, vdupq_n_s32(0), 0)); }
   2214 
   2215   ALWAYS_INLINE static GSVector4i loadq(s64 i)
   2216   {
   2217     return GSVector4i(vreinterpretq_s32_s64(vsetq_lane_s64(i, vdupq_n_s64(0), 0)));
   2218   }
   2219 
   2220   ALWAYS_INLINE static void storent(void* p, const GSVector4i& v)
   2221   {
   2222 #if __has_builtin(__builtin_nontemporal_store)
   2223     __builtin_nontemporal_store(v.v4s, ((int32x4_t*)p));
   2224 #else
   2225     vst1q_s64((int64_t*)p, vreinterpretq_s64_s32(v.v4s));
   2226 #endif
   2227   }
   2228 
   2229   ALWAYS_INLINE static void store32(void* p, const GSVector4i& v)
   2230   {
   2231     u32 val = vgetq_lane_s32(v, 0);
   2232     std::memcpy(p, &val, sizeof(u32));
   2233   }
   2234 
   2235   ALWAYS_INLINE static void storel(void* p, const GSVector4i& v)
   2236   {
   2237     vst1_s64((int64_t*)p, vget_low_s64(vreinterpretq_s64_s32(v.v4s)));
   2238   }
   2239 
   2240   ALWAYS_INLINE static void storeh(void* p, const GSVector4i& v)
   2241   {
   2242     vst1_s64((int64_t*)p, vget_high_s64(vreinterpretq_s64_s32(v.v4s)));
   2243   }
   2244 
   2245   ALWAYS_INLINE static void store(void* pl, void* ph, const GSVector4i& v)
   2246   {
   2247     GSVector4i::storel(pl, v);
   2248     GSVector4i::storeh(ph, v);
   2249   }
   2250 
   2251   template<bool aligned>
   2252   ALWAYS_INLINE static void store(void* p, const GSVector4i& v)
   2253   {
   2254     vst1q_s64((int64_t*)p, vreinterpretq_s64_s32(v.v4s));
   2255   }
   2256 
   2257   ALWAYS_INLINE static int store(const GSVector4i& v) { return vgetq_lane_s32(v.v4s, 0); }
   2258 
   2259   ALWAYS_INLINE static s64 storeq(const GSVector4i& v) { return vgetq_lane_s64(vreinterpretq_s64_s32(v.v4s), 0); }
   2260 
   2261   ALWAYS_INLINE void operator&=(const GSVector4i& v)
   2262   {
   2263     v4s = vreinterpretq_s32_s8(vandq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)));
   2264   }
   2265 
   2266   ALWAYS_INLINE void operator|=(const GSVector4i& v)
   2267   {
   2268     v4s = vreinterpretq_s32_s8(vorrq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)));
   2269   }
   2270 
   2271   ALWAYS_INLINE void operator^=(const GSVector4i& v)
   2272   {
   2273     v4s = vreinterpretq_s32_s8(veorq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)));
   2274   }
   2275 
   2276   ALWAYS_INLINE friend GSVector4i operator&(const GSVector4i& v1, const GSVector4i& v2)
   2277   {
   2278     return GSVector4i(vreinterpretq_s32_s8(vandq_s8(vreinterpretq_s8_s32(v1.v4s), vreinterpretq_s8_s32(v2.v4s))));
   2279   }
   2280 
   2281   ALWAYS_INLINE friend GSVector4i operator|(const GSVector4i& v1, const GSVector4i& v2)
   2282   {
   2283     return GSVector4i(vreinterpretq_s32_s8(vorrq_s8(vreinterpretq_s8_s32(v1.v4s), vreinterpretq_s8_s32(v2.v4s))));
   2284   }
   2285 
   2286   ALWAYS_INLINE friend GSVector4i operator^(const GSVector4i& v1, const GSVector4i& v2)
   2287   {
   2288     return GSVector4i(vreinterpretq_s32_s8(veorq_s8(vreinterpretq_s8_s32(v1.v4s), vreinterpretq_s8_s32(v2.v4s))));
   2289   }
   2290 
   2291   ALWAYS_INLINE friend GSVector4i operator&(const GSVector4i& v, int i) { return v & GSVector4i(i); }
   2292 
   2293   ALWAYS_INLINE friend GSVector4i operator|(const GSVector4i& v, int i) { return v | GSVector4i(i); }
   2294 
   2295   ALWAYS_INLINE friend GSVector4i operator^(const GSVector4i& v, int i) { return v ^ GSVector4i(i); }
   2296 
   2297   ALWAYS_INLINE friend GSVector4i operator~(const GSVector4i& v) { return GSVector4i(vmvnq_s32(v.v4s)); }
   2298 
   2299   ALWAYS_INLINE static GSVector4i zero() { return GSVector4i(0); }
   2300 
   2301   ALWAYS_INLINE static GSVector4i xffffffff() { return GSVector4i(0xFFFFFFFF); }
   2302 
   2303   ALWAYS_INLINE GSVector4i xyxy(const GSVector4i& v) const { return upl64(v); }
   2304 
   2305   ALWAYS_INLINE GSVector2i xy() const { return GSVector2i(vget_low_s32(v4s)); }
   2306 
   2307   ALWAYS_INLINE GSVector2i zw() const { return GSVector2i(vget_high_s32(v4s)); }
   2308 
   2309   // clang-format off
   2310 
   2311 
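          // Generates every xyzw swizzle accessor (xxxx() ... wwww()) as a compile-time shuffle.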
   2312 #define VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
   2313     ALWAYS_INLINE GSVector4i xs##ys##zs##ws() const { return GSVector4i(__builtin_shufflevector(v4s, v4s, xn, yn, zn, wn)); }
   2314 
   2315 #define VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \
   2316     VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \
   2317     VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \
   2318     VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2) \
   2319     VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3) \
   2320 
   2321 #define VECTOR4i_SHUFFLE_2(xs, xn, ys, yn) \
   2322     VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, x, 0) \
   2323     VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, y, 1) \
   2324     VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, z, 2) \
   2325     VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, w, 3) \
   2326 
   2327 #define VECTOR4i_SHUFFLE_1(xs, xn) \
   2328     VECTOR4i_SHUFFLE_2(xs, xn, x, 0) \
   2329     VECTOR4i_SHUFFLE_2(xs, xn, y, 1) \
   2330     VECTOR4i_SHUFFLE_2(xs, xn, z, 2) \
   2331     VECTOR4i_SHUFFLE_2(xs, xn, w, 3) \
   2332 
   2333   VECTOR4i_SHUFFLE_1(x, 0)
   2334     VECTOR4i_SHUFFLE_1(y, 1)
   2335     VECTOR4i_SHUFFLE_1(z, 2)
   2336     VECTOR4i_SHUFFLE_1(w, 3)
   2337 
   2338   // clang-format on
   2339 };
   2340 
   2341 class alignas(16) GSVector4
   2342 {
   2343   struct cxpr_init_tag
   2344   {
   2345   };
   2346   static constexpr cxpr_init_tag cxpr_init{};
   2347 
   2348   constexpr GSVector4(cxpr_init_tag, float x, float y, float z, float w) : F32{x, y, z, w} {}
   2349 
   2350   constexpr GSVector4(cxpr_init_tag, int x, int y, int z, int w) : I32{x, y, z, w} {}
   2351 
   2352   constexpr GSVector4(cxpr_init_tag, u64 x, u64 y) : U64{x, y} {}
   2353 
   2354   constexpr GSVector4(cxpr_init_tag, double x, double y) : F64{x, y} {}
   2355 
   2356 public:
   2357   union
   2358   {
   2359     struct
   2360     {
   2361       float x, y, z, w;
   2362     };
   2363     struct
   2364     {
   2365       float r, g, b, a;
   2366     };
   2367     struct
   2368     {
   2369       float left, top, right, bottom;
   2370     };
   2371     float F32[4];
   2372     double F64[2];
   2373     s8 I8[16];
   2374     s16 I16[8];
   2375     s32 I32[4];
   2376     s64 I64[2];
   2377     u8 U8[16];
   2378     u16 U16[8];
   2379     u32 U32[4];
   2380     u64 U64[2];
   2381     float32x4_t v4s;
   2382   };
   2383 
   2384   GSVector4() = default;
   2385 
   2386   constexpr static GSVector4 cxpr(float x, float y, float z, float w) { return GSVector4(cxpr_init, x, y, z, w); }
   2387 
   2388   constexpr static GSVector4 cxpr(float x) { return GSVector4(cxpr_init, x, x, x, x); }
   2389 
   2390   constexpr static GSVector4 cxpr(int x, int y, int z, int w) { return GSVector4(cxpr_init, x, y, z, w); }
   2391 
   2392   constexpr static GSVector4 cxpr(int x) { return GSVector4(cxpr_init, x, x, x, x); }
   2393 
   2394   constexpr static GSVector4 cxpr64(u64 x, u64 y) { return GSVector4(cxpr_init, x, y); }
   2395 
   2396   constexpr static GSVector4 cxpr64(u64 x) { return GSVector4(cxpr_init, x, x); }
   2397 
   2398   constexpr static GSVector4 cxpr64(double x, double y) { return GSVector4(cxpr_init, x, y); }
   2399 
   2400   constexpr static GSVector4 cxpr64(double x) { return GSVector4(cxpr_init, x, x); }
   2401 
   2402   ALWAYS_INLINE GSVector4(float x, float y, float z, float w)
   2403   {
   2404     const float arr[4] = {x, y, z, w};
   2405     v4s = vld1q_f32(arr);
   2406   }
   2407 
   2408   ALWAYS_INLINE GSVector4(float x, float y) { v4s = vsetq_lane_f32(x, vsetq_lane_f32(y, vdupq_n_f32(0.0f), 1), 0); }
   2409 
   2410   ALWAYS_INLINE GSVector4(int x, int y, int z, int w)
   2411   {
   2412     const int arr[4] = {x, y, z, w};
   2413     v4s = vcvtq_f32_s32(vld1q_s32(arr));
   2414   }
   2415 
   2416   ALWAYS_INLINE GSVector4(int x, int y)
   2417   {
   2418     v4s = vcvtq_f32_s32(vsetq_lane_s32(x, vsetq_lane_s32(y, vdupq_n_s32(0), 1), 0));
   2419   }
   2420 
   2421   ALWAYS_INLINE explicit GSVector4(const GSVector2& v) { v4s = vcombine_f32(v.v2s, vcreate_f32(0)); }
   2422 
   2423   ALWAYS_INLINE explicit GSVector4(const GSVector2i& v) { v4s = vcombine_f32(vcvt_f32_s32(v.v2s), vcreate_f32(0)); }
   2424 
   2425   ALWAYS_INLINE constexpr explicit GSVector4(float32x4_t m) : v4s(m) {}
   2426 
   2427   ALWAYS_INLINE explicit GSVector4(float f) { v4s = vdupq_n_f32(f); }
   2428 
   2429   ALWAYS_INLINE explicit GSVector4(int i) { v4s = vcvtq_f32_s32(vdupq_n_s32(i)); }
   2430 
   2431   ALWAYS_INLINE explicit GSVector4(const GSVector4i& v);
   2432 
   2433   ALWAYS_INLINE static GSVector4 cast(const GSVector4i& v);
   2434 
   2435   ALWAYS_INLINE static GSVector4 f64(double x, double y)
   2436   {
   2437 #ifdef CPU_ARCH_ARM64
   2438     return GSVector4(vreinterpretq_f32_f64(vsetq_lane_f64(y, vdupq_n_f64(x), 1)));
   2439 #else
   2440     GSVector4 ret;
   2441     ret.F64[0] = x;
   2442     ret.F64[1] = y;
   2443     return ret;
   2444 #endif
   2445   }
   2446 
   2447   ALWAYS_INLINE static GSVector4 f64(double x)
   2448   {
   2449 #ifdef CPU_ARCH_ARM64
   2450     return GSVector4(vreinterpretq_f32_f64(vdupq_n_f64(x)));
   2451 #else
   2452     GSVector4 ret;
   2453     ret.F64[0] = ret.F64[1] = x;
   2454     return ret;
   2455 #endif
   2456   }
   2457 
   2458   ALWAYS_INLINE void operator=(float f) { v4s = vdupq_n_f32(f); }
   2459 
   2460   ALWAYS_INLINE void operator=(float32x4_t m) { v4s = m; }
   2461 
   2462   ALWAYS_INLINE operator float32x4_t() const { return v4s; }
   2463 
   2464   ALWAYS_INLINE u32 rgba32() const { return GSVector4i(*this).rgba32(); }
   2465 
   2466   ALWAYS_INLINE static GSVector4 rgba32(u32 rgba) { return GSVector4(GSVector4i::load((int)rgba).u8to32()); }
   2467 
   2468   ALWAYS_INLINE static GSVector4 unorm8(u32 rgba) { return rgba32(rgba) * GSVector4::cxpr(1.0f / 255.0f); }
   2469 
   2470   ALWAYS_INLINE GSVector4 abs() const { return GSVector4(vabsq_f32(v4s)); }
   2471 
   2472   ALWAYS_INLINE GSVector4 neg() const { return GSVector4(vnegq_f32(v4s)); }
   2473 
   2474   ALWAYS_INLINE GSVector4 rcp() const { return GSVector4(vrecpeq_f32(v4s)); }
   2475 
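          // Reciprocal estimate refined with one Newton-Raphson step (vrecpsq_f32 yields 2 - estimate * x).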
   2476   ALWAYS_INLINE GSVector4 rcpnr() const
   2477   {
   2478     float32x4_t recip = vrecpeq_f32(v4s);
   2479     recip = vmulq_f32(recip, vrecpsq_f32(recip, v4s));
   2480     return GSVector4(recip);
   2481   }
   2482 
   2483 #ifdef _M_ARM64
   2484 
   2485   ALWAYS_INLINE GSVector4 floor() const { return GSVector4(vrndmq_f32(v4s)); }
   2486 
   2487   ALWAYS_INLINE GSVector4 ceil() const { return GSVector4(vrndpq_f32(v4s)); }
   2488 
   2489 #else
   2490 
   2491   ALWAYS_INLINE GSVector4 floor() const
   2492   {
   2493     return GSVector4(std::floor(vgetq_lane_f32(v4s, 0)), std::floor(vgetq_lane_f32(v4s, 1)),
   2494                      std::floor(vgetq_lane_f32(v4s, 2)), std::floor(vgetq_lane_f32(v4s, 3)));
   2495   }
   2496 
   2497   ALWAYS_INLINE GSVector4 ceil() const
   2498   {
   2499     return GSVector4(std::ceil(vgetq_lane_f32(v4s, 0)), std::ceil(vgetq_lane_f32(v4s, 1)),
   2500                      std::ceil(vgetq_lane_f32(v4s, 2)), std::ceil(vgetq_lane_f32(v4s, 3)));
   2501   }
   2502 
   2503 #endif
   2504 
   2505   ALWAYS_INLINE GSVector4 madd(const GSVector4& a, const GSVector4& b) const
   2506   {
   2507     return GSVector4(vfmaq_f32(b.v4s, v4s, a.v4s));
   2508   }
   2509   ALWAYS_INLINE GSVector4 msub(const GSVector4& a, const GSVector4& b) const
   2510   {
   2511     return GSVector4(vfmsq_f32(b.v4s, v4s, a.v4s));
   2512   }
   2513   ALWAYS_INLINE GSVector4 nmadd(const GSVector4& a, const GSVector4& b) const { return b - *this * a; }
   2514   ALWAYS_INLINE GSVector4 nmsub(const GSVector4& a, const GSVector4& b) const { return -b - *this * a; }
   2515 
   2516   ALWAYS_INLINE GSVector4 addm(const GSVector4& a, const GSVector4& b) const
   2517   {
   2518     return a.madd(b, *this); // *this + a * b
   2519   }
   2520 
   2521   ALWAYS_INLINE GSVector4 subm(const GSVector4& a, const GSVector4& b) const
   2522   {
   2523     return a.nmadd(b, *this); // *this - a * b
   2524   }
   2525 
   2526 #ifdef CPU_ARCH_ARM64
   2527 
   2528   ALWAYS_INLINE GSVector4 hadd() const { return GSVector4(vpaddq_f32(v4s, v4s)); }
   2529 
   2530   ALWAYS_INLINE GSVector4 hadd(const GSVector4& v) const { return GSVector4(vpaddq_f32(v4s, v.v4s)); }
   2531 
   2532   ALWAYS_INLINE GSVector4 hsub() const { return GSVector4(vsubq_f32(vuzp1q_f32(v4s, v4s), vuzp2q_f32(v4s, v4s))); }
   2533 
   2534   ALWAYS_INLINE GSVector4 hsub(const GSVector4& v) const
   2535   {
   2536     return GSVector4(vsubq_f32(vuzp1q_f32(v4s, v.v4s), vuzp2q_f32(v4s, v.v4s)));
   2537   }
   2538 
   2539 #else
   2540 
   2541   ALWAYS_INLINE GSVector4 hadd() const
   2542   {
   2543     const float32x2_t res = vpadd_f32(vget_low_f32(v4s), vget_high_f32(v4s));
   2544     return GSVector4(vcombine_f32(res, res));
   2545   }
   2546 
   2547   ALWAYS_INLINE GSVector4 hadd(const GSVector4& v) const
   2548   {
   2549     const float32x2_t res1 = vpadd_f32(vget_low_f32(v4s), vget_high_f32(v4s));
   2550     const float32x2_t res2 = vpadd_f32(vget_low_f32(v.v4s), vget_high_f32(v.v4s));
   2551     return GSVector4(vcombine_f32(res1, res2));
   2552   }
   2553 
   2554   ALWAYS_INLINE GSVector4 hsub() const
   2555   {
   2556     const float32x4x2_t res = vuzpq_f32(v4s, v4s);
   2557     return GSVector4(vsubq_f32(res.val[0], res.val[1]));
   2558   }
   2559 
   2560   ALWAYS_INLINE GSVector4 hsub(const GSVector4& v) const
   2561   {
   2562     const float32x4x2_t res = vuzpq_f32(v4s, v.v4s);
   2563     return GSVector4(vsubq_f32(res.val[0], res.val[1]));
   2564   }
   2565 
   2566 #endif
   2567 
   2568   ALWAYS_INLINE GSVector4 sat(const GSVector4& a, const GSVector4& b) const { return max(a).min(b); }
   2569 
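          // Clamps against a packed range: a.xy is the lower bound, a.zw the upper bound.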
   2570   ALWAYS_INLINE GSVector4 sat(const GSVector4& a) const
   2571   {
   2572 #ifdef CPU_ARCH_ARM64
   2573     const GSVector4 minv(vreinterpretq_f32_f64(vdupq_laneq_f64(vreinterpretq_f64_f32(a.v4s), 0)));
   2574     const GSVector4 maxv(vreinterpretq_f32_f64(vdupq_laneq_f64(vreinterpretq_f64_f32(a.v4s), 1)));
   2575 #else
   2576     const GSVector4 minv(a.xyxy());
   2577     const GSVector4 maxv(a.zwzw());
   2578 #endif
   2579     return sat(minv, maxv);
   2580   }
   2581 
   2582   ALWAYS_INLINE GSVector4 sat(const float scale = 255) const { return sat(zero(), GSVector4(scale)); }
   2583 
   2584   ALWAYS_INLINE GSVector4 clamp(const float scale = 255) const { return min(GSVector4(scale)); }
   2585 
   2586   ALWAYS_INLINE GSVector4 min(const GSVector4& a) const { return GSVector4(vminq_f32(v4s, a.v4s)); }
   2587 
   2588   ALWAYS_INLINE GSVector4 max(const GSVector4& a) const { return GSVector4(vmaxq_f32(v4s, a.v4s)); }
   2589 
   2590   template<int mask>
   2591   ALWAYS_INLINE GSVector4 blend32(const GSVector4& a) const
   2592   {
   2593     return GSVector4(__builtin_shufflevector(v4s, a.v4s, (mask & 1) ? 4 : 0, (mask & 2) ? 5 : 1, (mask & 4) ? 6 : 2,
   2594                                              (mask & 8) ? 7 : 3));
   2595   }
   2596 
   2597   ALWAYS_INLINE GSVector4 blend32(const GSVector4& a, const GSVector4& mask) const
   2598   {
   2599     // duplicate sign bit across and bit select
   2600     const uint32x4_t bitmask = vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_f32(mask.v4s), 31));
   2601     return GSVector4(vbslq_f32(bitmask, a.v4s, v4s));
   2602   }
   2603 
   2604 #ifdef CPU_ARCH_ARM64
   2605 
   2606   ALWAYS_INLINE GSVector4 upl(const GSVector4& a) const { return GSVector4(vzip1q_f32(v4s, a.v4s)); }
   2607 
   2608   ALWAYS_INLINE GSVector4 uph(const GSVector4& a) const { return GSVector4(vzip2q_f32(v4s, a.v4s)); }
   2609 
   2610   ALWAYS_INLINE GSVector4 upld(const GSVector4& a) const
   2611   {
   2612     return GSVector4(vreinterpretq_f32_f64(vzip1q_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(a.v4s))));
   2613   }
   2614 
   2615   ALWAYS_INLINE GSVector4 uphd(const GSVector4& a) const
   2616   {
   2617     return GSVector4(vreinterpretq_f32_f64(vzip2q_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(a.v4s))));
   2618   }
   2619 
   2620 #else
   2621 
   2622   ALWAYS_INLINE GSVector4 upl(const GSVector4& a) const
   2623   {
   2624     const float32x2x2_t res = vzip_f32(vget_low_f32(v4s), vget_low_f32(a.v4s));
   2625     return GSVector4(vcombine_f32(res.val[0], res.val[1]));
   2626   }
   2627 
   2628   ALWAYS_INLINE GSVector4 uph(const GSVector4& a) const
   2629   {
   2630     const float32x2x2_t res = vzip_f32(vget_high_f32(v4s), vget_high_f32(a.v4s));
   2631     return GSVector4(vcombine_f32(res.val[0], res.val[1]));
   2632   }
   2633 
   2634   ALWAYS_INLINE GSVector4 upld(const GSVector4& a) const
   2635   {
   2636     return GSVector4(vreinterpretq_f32_s64(
   2637       vcombine_s64(vget_low_s64(vreinterpretq_s64_f32(v4s)), vget_low_s64(vreinterpretq_s64_f32(a.v4s)))));
   2638   }
   2639 
   2640   ALWAYS_INLINE GSVector4 uphd(const GSVector4& a) const
   2641   {
   2642     return GSVector4(vreinterpretq_f32_s64(
   2643       vcombine_s64(vget_high_s64(vreinterpretq_s64_f32(v4s)), vget_high_s64(vreinterpretq_s64_f32(a.v4s)))));
   2644   }
   2645 
   2646 #endif
   2647 
   2648   ALWAYS_INLINE GSVector4 l2h(const GSVector4& a) const
   2649   {
   2650     return GSVector4(vcombine_f32(vget_low_f32(v4s), vget_low_f32(a.v4s)));
   2651   }
   2652 
   2653   ALWAYS_INLINE GSVector4 h2l(const GSVector4& a) const
   2654   {
   2655     return GSVector4(vcombine_f32(vget_high_f32(v4s), vget_high_f32(a.v4s)));
   2656   }
   2657 
   2658   ALWAYS_INLINE GSVector4 andnot(const GSVector4& v) const
   2659   {
   2660     return GSVector4(vreinterpretq_f32_s32(vbicq_s32(vreinterpretq_s32_f32(v4s), vreinterpretq_s32_f32(v.v4s))));
   2661   }
   2662 
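          // Collects the sign bit of each float lane into a 4-bit integer (movmskps equivalent).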
   2663   ALWAYS_INLINE int mask() const
   2664   {
   2665 #ifdef CPU_ARCH_ARM64
   2666     static constexpr const int32_t shifts[] = {0, 1, 2, 3};
   2667     return static_cast<int>(vaddvq_u32(vshlq_u32(vshrq_n_u32(vreinterpretq_u32_f32(v4s), 31), vld1q_s32(shifts))));
   2668 #else
   2669     // sse2neon again
   2670     uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(vreinterpretq_u32_f32(v4s), 31));
   2671     uint8x16_t paired = vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
   2672     return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
   2673 #endif
   2674   }
   2675 
   2676   ALWAYS_INLINE bool alltrue() const
   2677   {
   2678     // return mask() == 0xf;
   2679     return ~(vgetq_lane_u64(vreinterpretq_u64_f32(v4s), 0) & vgetq_lane_u64(vreinterpretq_u64_f32(v4s), 1)) == 0;
   2680   }
   2681 
   2682   ALWAYS_INLINE bool allfalse() const
   2683   {
   2684     return (vgetq_lane_u64(vreinterpretq_u64_f32(v4s), 0) | vgetq_lane_u64(vreinterpretq_u64_f32(v4s), 1)) == 0;
   2685   }
   2686 
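          // Note: *this == *this is false only for NaN lanes, so NaN lanes of *this pick up the
          // corresponding lane of v while all other lanes keep their original value.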
   2687   ALWAYS_INLINE GSVector4 replace_nan(const GSVector4& v) const { return v.blend32(*this, *this == *this); }
   2688 
   2689   template<int src, int dst>
   2690   ALWAYS_INLINE GSVector4 insert32(const GSVector4& v) const
   2691   {
   2692 #ifdef CPU_ARCH_ARM64
   2693     return GSVector4(vcopyq_laneq_f32(v4s, dst, v.v4s, src));
   2694 #else
   2695     return GSVector4(vsetq_lane_f32(vgetq_lane_f32(v.v4s, src), v4s, dst));
   2696 #endif
   2697   }
   2698 
   2699   template<int i>
   2700   ALWAYS_INLINE int extract32() const
   2701   {
   2702     return vgetq_lane_s32(vreinterpretq_s32_f32(v4s), i);
   2703   }
   2704 
   2705   template<int dst>
   2706   ALWAYS_INLINE GSVector4 insert64(double v) const
   2707   {
   2708 #ifdef CPU_ARCH_ARM64
   2709     return GSVector4(vreinterpretq_f32_f64(vsetq_lane_f64(v, vreinterpretq_f64_f32(v4s), dst)));
   2710 #else
   2711     GSVector4 ret;
   2712     ret.F64[dst] = v;
   2713     return ret;
   2714 #endif
   2715   }
   2716 
   2717   template<int src>
   2718   ALWAYS_INLINE double extract64() const
   2719   {
   2720 #ifdef CPU_ARCH_ARM64
   2721     return vgetq_lane_f64(vreinterpretq_f64_f32(v4s), src);
   2722 #else
   2723     return F64[src];
   2724 #endif
   2725   }
   2726 
   2727   ALWAYS_INLINE static GSVector4 zero() { return GSVector4(vdupq_n_f32(0.0f)); }
   2728 
   2729   ALWAYS_INLINE static GSVector4 xffffffff() { return GSVector4(vreinterpretq_f32_u32(vdupq_n_u32(0xFFFFFFFFu))); }
   2730 
   2731   ALWAYS_INLINE static GSVector4 loadl(const void* p)
   2732   {
   2733     return GSVector4(vcombine_f32(vld1_f32((const float*)p), vcreate_f32(0)));
   2734   }
   2735 
   2736   ALWAYS_INLINE static GSVector4 load(float f) { return GSVector4(vsetq_lane_f32(f, vmovq_n_f32(0.0f), 0)); }
   2737 
   2738   template<bool aligned>
   2739   ALWAYS_INLINE static GSVector4 load(const void* p)
   2740   {
   2741     return GSVector4(vld1q_f32((const float*)p));
   2742   }
   2743 
   2744   ALWAYS_INLINE static void storent(void* p, const GSVector4& v) { vst1q_f32((float*)p, v.v4s); }
   2745 
   2746   ALWAYS_INLINE static void storel(void* p, const GSVector4& v)
   2747   {
   2748 #ifdef CPU_ARCH_ARM64
   2749     vst1_f64((double*)p, vget_low_f64(vreinterpretq_f64_f32(v.v4s)));
   2750 #else
   2751     vst1_s64((s64*)p, vget_low_s64(vreinterpretq_s64_f32(v.v4s)));
   2752 #endif
   2753   }
   2754 
   2755   ALWAYS_INLINE static void storeh(void* p, const GSVector4& v)
   2756   {
   2757 #ifdef CPU_ARCH_ARM64
   2758     vst1_f64((double*)p, vget_high_f64(vreinterpretq_f64_f32(v.v4s)));
   2759 #else
   2760     vst1_s64((s64*)p, vget_high_s64(vreinterpretq_s64_f32(v.v4s)));
   2761 #endif
   2762   }
   2763 
   2764   template<bool aligned>
   2765   ALWAYS_INLINE static void store(void* p, const GSVector4& v)
   2766   {
   2767     vst1q_f32((float*)p, v.v4s);
   2768   }
   2769 
   2770   ALWAYS_INLINE static void store(float* p, const GSVector4& v) { vst1q_lane_f32(p, v.v4s, 0); }
   2771 
   2772   ALWAYS_INLINE GSVector4 operator-() const { return neg(); }
   2773 
   2774   ALWAYS_INLINE void operator+=(const GSVector4& v) { v4s = vaddq_f32(v4s, v.v4s); }
   2775   ALWAYS_INLINE void operator-=(const GSVector4& v) { v4s = vsubq_f32(v4s, v.v4s); }
   2776   ALWAYS_INLINE void operator*=(const GSVector4& v) { v4s = vmulq_f32(v4s, v.v4s); }
   2777   ALWAYS_INLINE void operator/=(const GSVector4& v)
   2778   {
   2779 #ifdef CPU_ARCH_ARM64
   2780     v4s = vdivq_f32(v4s, v.v4s);
   2781 #else
   2782     *this =
   2783       GSVector4(vgetq_lane_f32(v4s, 0) / vgetq_lane_f32(v.v4s, 0), vgetq_lane_f32(v4s, 1) / vgetq_lane_f32(v.v4s, 1),
   2784                 vgetq_lane_f32(v4s, 2) / vgetq_lane_f32(v.v4s, 2), vgetq_lane_f32(v4s, 3) / vgetq_lane_f32(v.v4s, 3));
   2785 #endif
   2786   }
   2787 
   2788   ALWAYS_INLINE void operator+=(float f) { *this += GSVector4(f); }
   2789   ALWAYS_INLINE void operator-=(float f) { *this -= GSVector4(f); }
   2790   ALWAYS_INLINE void operator*=(float f) { *this *= GSVector4(f); }
   2791   ALWAYS_INLINE void operator/=(float f)
   2792   {
   2793 #ifdef CPU_ARCH_ARM64
   2794     *this /= GSVector4(f);
   2795 #else
   2796     *this = GSVector4(vgetq_lane_f32(v4s, 0) / f, vgetq_lane_f32(v4s, 1) / f, vgetq_lane_f32(v4s, 2) / f,
   2797                       vgetq_lane_f32(v4s, 3) / f);
   2798 #endif
   2799   }
   2800 
   2801   ALWAYS_INLINE void operator&=(const GSVector4& v)
   2802   {
   2803     v4s = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(v4s), vreinterpretq_u32_f32(v.v4s)));
   2804   }
   2805 
   2806   ALWAYS_INLINE void operator|=(const GSVector4& v)
   2807   {
   2808     v4s = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(v4s), vreinterpretq_u32_f32(v.v4s)));
   2809   }
   2810 
   2811   ALWAYS_INLINE void operator^=(const GSVector4& v)
   2812   {
   2813     v4s = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v4s), vreinterpretq_u32_f32(v.v4s)));
   2814   }
   2815 
   2816   ALWAYS_INLINE friend GSVector4 operator+(const GSVector4& v1, const GSVector4& v2)
   2817   {
   2818     return GSVector4(vaddq_f32(v1.v4s, v2.v4s));
   2819   }
   2820 
   2821   ALWAYS_INLINE friend GSVector4 operator-(const GSVector4& v1, const GSVector4& v2)
   2822   {
   2823     return GSVector4(vsubq_f32(v1.v4s, v2.v4s));
   2824   }
   2825 
   2826   ALWAYS_INLINE friend GSVector4 operator*(const GSVector4& v1, const GSVector4& v2)
   2827   {
   2828     return GSVector4(vmulq_f32(v1.v4s, v2.v4s));
   2829   }
   2830 
   2831   ALWAYS_INLINE friend GSVector4 operator/(const GSVector4& v1, const GSVector4& v2)
   2832   {
   2833 #ifdef CPU_ARCH_ARM64
   2834     return GSVector4(vdivq_f32(v1.v4s, v2.v4s));
   2835 #else
   2836     return GSVector4(
   2837       vgetq_lane_f32(v1.v4s, 0) / vgetq_lane_f32(v2.v4s, 0), vgetq_lane_f32(v1.v4s, 1) / vgetq_lane_f32(v2.v4s, 1),
   2838       vgetq_lane_f32(v1.v4s, 2) / vgetq_lane_f32(v2.v4s, 2), vgetq_lane_f32(v1.v4s, 3) / vgetq_lane_f32(v2.v4s, 3));
   2839 #endif
   2840   }
   2841 
   2842   ALWAYS_INLINE friend GSVector4 operator+(const GSVector4& v, float f) { return v + GSVector4(f); }
   2843   ALWAYS_INLINE friend GSVector4 operator-(const GSVector4& v, float f) { return v - GSVector4(f); }
   2844   ALWAYS_INLINE friend GSVector4 operator*(const GSVector4& v, float f) { return v * GSVector4(f); }
   2845   ALWAYS_INLINE friend GSVector4 operator/(const GSVector4& v, float f)
   2846   {
   2847 #ifdef CPU_ARCH_ARM64
   2848     return v / GSVector4(f);
   2849 #else
   2850     return GSVector4(vgetq_lane_f32(v.v4s, 0) / f, vgetq_lane_f32(v.v4s, 1) / f, vgetq_lane_f32(v.v4s, 2) / f,
   2851                      vgetq_lane_f32(v.v4s, 3) / f);
   2852 #endif
   2853   }
   2854 
   2855   ALWAYS_INLINE friend GSVector4 operator&(const GSVector4& v1, const GSVector4& v2)
   2856   {
   2857     return GSVector4(vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(v1.v4s), vreinterpretq_u32_f32(v2.v4s))));
   2858   }
   2859 
   2860   ALWAYS_INLINE friend GSVector4 operator|(const GSVector4& v1, const GSVector4& v2)
   2861   {
   2862     return GSVector4(vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(v1.v4s), vreinterpretq_u32_f32(v2.v4s))));
   2863   }
   2864 
   2865   ALWAYS_INLINE friend GSVector4 operator^(const GSVector4& v1, const GSVector4& v2)
   2866   {
   2867     return GSVector4(vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v1.v4s), vreinterpretq_u32_f32(v2.v4s))));
   2868   }
   2869 
   2870   ALWAYS_INLINE friend GSVector4 operator==(const GSVector4& v1, const GSVector4& v2)
   2871   {
   2872     return GSVector4(vreinterpretq_f32_u32(vceqq_f32(v1.v4s, v2.v4s)));
   2873   }
   2874 
   2875   ALWAYS_INLINE friend GSVector4 operator!=(const GSVector4& v1, const GSVector4& v2)
   2876   {
   2877     // NEON has no !=
   2878     return GSVector4(vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(v1.v4s, v2.v4s))));
   2879   }
   2880 
   2881   ALWAYS_INLINE friend GSVector4 operator>(const GSVector4& v1, const GSVector4& v2)
   2882   {
   2883     return GSVector4(vreinterpretq_f32_u32(vcgtq_f32(v1.v4s, v2.v4s)));
   2884   }
   2885 
   2886   ALWAYS_INLINE friend GSVector4 operator<(const GSVector4& v1, const GSVector4& v2)
   2887   {
   2888     return GSVector4(vreinterpretq_f32_u32(vcltq_f32(v1.v4s, v2.v4s)));
   2889   }
   2890 
   2891   ALWAYS_INLINE friend GSVector4 operator>=(const GSVector4& v1, const GSVector4& v2)
   2892   {
   2893     return GSVector4(vreinterpretq_f32_u32(vcgeq_f32(v1.v4s, v2.v4s)));
   2894   }
   2895 
   2896   ALWAYS_INLINE friend GSVector4 operator<=(const GSVector4& v1, const GSVector4& v2)
   2897   {
   2898     return GSVector4(vreinterpretq_f32_u32(vcleq_f32(v1.v4s, v2.v4s)));
   2899   }
   2900 
   2901   ALWAYS_INLINE GSVector4 mul64(const GSVector4& v) const
   2902   {
   2903 #ifdef CPU_ARCH_ARM64
   2904     return GSVector4(vreinterpretq_f32_f64(vmulq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
   2905 #else
   2906     return GSVector4::f64(F64[0] * v.F64[0], F64[1] * v.F64[1]);
   2907 #endif
   2908   }
   2909 
   2910   ALWAYS_INLINE GSVector4 add64(const GSVector4& v) const
   2911   {
   2912 #ifdef CPU_ARCH_ARM64
   2913     return GSVector4(vreinterpretq_f32_f64(vaddq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
   2914 #else
   2915     return GSVector4::f64(F64[0] + v.F64[0], F64[1] + v.F64[1]);
   2916 #endif
   2917   }
   2918 
   2919   ALWAYS_INLINE GSVector4 sub64(const GSVector4& v) const
   2920   {
   2921 #ifdef CPU_ARCH_ARM64
   2922     return GSVector4(vreinterpretq_f32_f64(vsubq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
   2923 #else
   2924     return GSVector4::f64(F64[0] - v.F64[0], F64[1] - v.F64[1]);
   2925 #endif
   2926   }
   2927 
   2928   ALWAYS_INLINE GSVector4 div64(const GSVector4& v) const
   2929   {
   2930 #ifdef CPU_ARCH_ARM64
   2931     return GSVector4(vreinterpretq_f32_f64(vdivq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
   2932 #else
   2933     return GSVector4::f64(F64[0] / v.F64[0], F64[1] / v.F64[1]);
   2934 #endif
   2935   }
   2936 
   2937   ALWAYS_INLINE GSVector4 gt64(const GSVector4& v) const
   2938   {
   2939 #ifdef CPU_ARCH_ARM64
   2940     return GSVector4(vreinterpretq_f32_f64(vcltq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
   2941 #else
   2942     GSVector4 ret;
   2943     ret.U64[0] = (F64[0] > v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
   2944     ret.U64[1] = (F64[1] > v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
   2945     return ret;
   2946 #endif
   2947   }
   2948 
   2949   ALWAYS_INLINE GSVector4 eq64(const GSVector4& v) const
   2950   {
   2951 #ifdef CPU_ARCH_ARM64
   2952     return GSVector4(vreinterpretq_f32_f64(vceqq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
   2953 #else
   2954     GSVector4 ret;
   2955     ret.U64[0] = (F64[0] == v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
   2956     ret.U64[1] = (F64[1] == v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
   2957     return ret;
   2958 #endif
   2959   }
   2960 
   2961   ALWAYS_INLINE GSVector4 lt64(const GSVector4& v) const
   2962   {
   2963 #ifdef CPU_ARCH_ARM64
   2964     return GSVector4(vreinterpretq_f32_f64(vcgtq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
   2965 #else
   2966     GSVector4 ret;
   2967     ret.U64[0] = (F64[0] < v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
   2968     ret.U64[1] = (F64[1] < v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
   2969     return ret;
   2970 #endif
   2971   }
   2972 
   2973   ALWAYS_INLINE GSVector4 ge64(const GSVector4& v) const
   2974   {
   2975 #ifdef CPU_ARCH_ARM64
   2976     return GSVector4(vreinterpretq_f32_f64(vcgeq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
   2977 #else
   2978     GSVector4 ret;
   2979     ret.U64[0] = (F64[0] >= v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
   2980     ret.U64[1] = (F64[1] >= v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
   2981     return ret;
   2982 #endif
   2983   }
   2984 
   2985   ALWAYS_INLINE GSVector4 le64(const GSVector4& v) const
   2986   {
   2987 #ifdef CPU_ARCH_ARM64
   2988     return GSVector4(vreinterpretq_f32_f64(vcleq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
   2989 #else
   2990     GSVector4 ret;
   2991     ret.U64[0] = (F64[0] <= v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
   2992     ret.U64[1] = (F64[1] <= v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
   2993     return ret;
   2994 #endif
   2995   }
   2996 
   2997   ALWAYS_INLINE GSVector4 min64(const GSVector4& v) const
   2998   {
   2999 #ifdef CPU_ARCH_ARM64
   3000     return GSVector4(vreinterpretq_f32_f64(vminq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
   3001 #else
   3002     return GSVector4::f64(std::min(F64[0], v.F64[0]), std::min(F64[1], v.F64[1]));
   3003 #endif
   3004   }
   3005 
   3006   ALWAYS_INLINE GSVector4 max64(const GSVector4& v) const
   3007   {
   3008 #ifdef CPU_ARCH_ARM64
   3009     return GSVector4(vreinterpretq_f32_f64(vmaxq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
   3010 #else
   3011     return GSVector4::f64(std::max(F64[0], v.F64[0]), std::max(F64[1], v.F64[1]));
   3012 #endif
   3013   }
   3014 
   3015   ALWAYS_INLINE GSVector4 abs64() const { return *this & GSVector4::cxpr64(static_cast<u64>(0x7FFFFFFFFFFFFFFFULL)); }
   3016 
   3017   ALWAYS_INLINE GSVector4 neg64() const { return *this ^ GSVector4::cxpr64(static_cast<u64>(0x8000000000000000ULL)); }
   3018 
   3019   ALWAYS_INLINE GSVector4 sqrt64() const
   3020   {
   3021 #ifdef CPU_ARCH_ARM64
   3022     return GSVector4(vreinterpretq_f32_f64(vsqrtq_f64(vreinterpretq_f64_f32(v4s))));
   3023 #else
   3024     return GSVector4::f64(std::sqrt(F64[0]), std::sqrt(F64[1]));
   3025 #endif
   3026   }
   3027 
   3028   ALWAYS_INLINE GSVector4 sqr64() const
   3029   {
   3030 #ifdef CPU_ARCH_ARM64
   3031     return GSVector4(vreinterpretq_f32_f64(vmulq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v4s))));
   3032 #else
   3033     return GSVector4::f64(F64[0] * F64[0], F64[1] * F64[1]);
   3034 #endif
   3035   }
   3036 
   3037   ALWAYS_INLINE GSVector4 floor64() const
   3038   {
   3039 #ifdef CPU_ARCH_ARM64
   3040     return GSVector4(vreinterpretq_f32_f64(vrndmq_f64(vreinterpretq_f64_f32(v4s))));
   3041 #else
   3042     return GSVector4::f64(std::floor(F64[0]), std::floor(F64[1]));
   3043 #endif
   3044   }
   3045 
   3046   ALWAYS_INLINE static GSVector4 f32to64(const GSVector4& v)
   3047   {
   3048 #ifdef CPU_ARCH_ARM64
   3049     return GSVector4(vreinterpretq_f32_f64(vcvt_f64_f32(vget_low_f32(v.v4s))));
   3050 #else
   3051     return GSVector4::f64(static_cast<double>(vgetq_lane_f32(v.v4s, 0)), static_cast<double>(vgetq_lane_f32(v.v4s, 1)));
   3052 #endif
   3053   }
   3054 
   3055   ALWAYS_INLINE static GSVector4 f32to64(const void* p)
   3056   {
   3057 #ifdef CPU_ARCH_ARM64
   3058     return GSVector4(vreinterpretq_f32_f64(vcvt_f64_f32(vld1_f32(static_cast<const float*>(p)))));
   3059 #else
   3060     const float* fp = static_cast<const float*>(p);
   3061     return GSVector4::f64(static_cast<double>(fp[0]), static_cast<double>(fp[1]));
   3062 #endif
   3063   }
   3064 
   3065   ALWAYS_INLINE GSVector4i f64toi32() const
   3066   {
   3067 #ifdef CPU_ARCH_ARM64
   3068     const s32 low = static_cast<s32>(vgetq_lane_f64(vreinterpretq_f64_f32(v4s), 0));
   3069     const s32 high = static_cast<s32>(vgetq_lane_f64(vreinterpretq_f64_f32(v4s), 1));
   3070 #else
   3071     const s32 low = static_cast<s32>(F64[0]);
   3072     const s32 high = static_cast<s32>(F64[1]);
   3073 #endif
   3074     return GSVector4i(vsetq_lane_s32(high, vsetq_lane_s32(low, vdupq_n_s32(0), 0), 1));
   3075   }
   3076 
   3077   // clang-format off
   3078 
   3079 #define VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
   3080     ALWAYS_INLINE GSVector4 xs##ys##zs##ws() const { return GSVector4(__builtin_shufflevector(v4s, v4s, xn, yn, zn, wn)); } \
   3081     ALWAYS_INLINE GSVector4 xs##ys##zs##ws(const GSVector4& v) const { return GSVector4(__builtin_shufflevector(v4s, v.v4s, xn, yn, 4 + zn, 4 + wn)); }
   3082 
   3083 #define VECTOR4_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \
   3084     VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \
   3085     VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \
   3086     VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2) \
   3087     VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3) \
   3088 
   3089 #define VECTOR4_SHUFFLE_2(xs, xn, ys, yn) \
   3090     VECTOR4_SHUFFLE_3(xs, xn, ys, yn, x, 0) \
   3091     VECTOR4_SHUFFLE_3(xs, xn, ys, yn, y, 1) \
   3092     VECTOR4_SHUFFLE_3(xs, xn, ys, yn, z, 2) \
   3093     VECTOR4_SHUFFLE_3(xs, xn, ys, yn, w, 3) \
   3094 
   3095 #define VECTOR4_SHUFFLE_1(xs, xn) \
   3096     VECTOR4_SHUFFLE_2(xs, xn, x, 0) \
   3097     VECTOR4_SHUFFLE_2(xs, xn, y, 1) \
   3098     VECTOR4_SHUFFLE_2(xs, xn, z, 2) \
   3099     VECTOR4_SHUFFLE_2(xs, xn, w, 3) \
   3100 
   3101   VECTOR4_SHUFFLE_1(x, 0)
   3102     VECTOR4_SHUFFLE_1(y, 1)
   3103     VECTOR4_SHUFFLE_1(z, 2)
   3104     VECTOR4_SHUFFLE_1(w, 3)
   3105 
   3106   // clang-format on
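
          // A minimal sketch of the shuffle accessors generated by the macros above (illustrative
          // values; lane names follow the x/y/z/w convention used throughout this header):
          //
          //   const GSVector4 v(1.0f, 2.0f, 3.0f, 4.0f);
          //   const GSVector4 r = v.yxwz();    // {2, 1, 4, 3}
          //   const GSVector4 m = v.xyxy(v);   // {1, 2, 1, 2}: x/y lanes from *this, z/w lanes from the argument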
   3107 
   3108   ALWAYS_INLINE GSVector4 broadcast32() const
   3109   {
   3110 #ifdef CPU_ARCH_ARM64
   3111     return GSVector4(vdupq_laneq_f32(v4s, 0));
   3112 #else
   3113     return xxxx();
   3114 #endif
   3115   }
   3116 
   3117   ALWAYS_INLINE static GSVector4 broadcast32(const GSVector4& v)
   3118   {
   3119 #ifdef CPU_ARCH_ARM64
   3120     return GSVector4(vdupq_laneq_f32(v.v4s, 0));
   3121 #else
   3122     return v.xxxx();
   3123 #endif
   3124   }
   3125 
   3126   ALWAYS_INLINE static GSVector4 broadcast32(const void* f) { return GSVector4(vld1q_dup_f32((const float*)f)); }
   3127 
   3128   ALWAYS_INLINE static GSVector4 broadcast64(const void* f)
   3129   {
   3130 #ifdef CPU_ARCH_ARM64
   3131     return GSVector4(vreinterpretq_f32_f64(vld1q_dup_f64((const double*)f)));
   3132 #else
   3133     return GSVector4(vreinterpretq_f32_s64(vld1q_dup_s64((const s64*)f)));
   3134 #endif
   3135   }
   3136 };
   3137 
   3138 ALWAYS_INLINE GSVector2i::GSVector2i(const GSVector2& v)
   3139 {
   3140   v2s = vcvt_s32_f32(v.v2s);
   3141 }
   3142 
   3143 ALWAYS_INLINE GSVector2::GSVector2(const GSVector2i& v)
   3144 {
   3145   v2s = vcvt_f32_s32(v.v2s);
   3146 }
   3147 
   3148 ALWAYS_INLINE GSVector2i GSVector2i::cast(const GSVector2& v)
   3149 {
   3150   return GSVector2i(vreinterpret_s32_f32(v.v2s));
   3151 }
   3152 
   3153 ALWAYS_INLINE GSVector2 GSVector2::cast(const GSVector2i& v)
   3154 {
   3155   return GSVector2(vreinterpret_f32_s32(v.v2s));
   3156 }
   3157 
   3158 ALWAYS_INLINE GSVector4i::GSVector4i(const GSVector4& v)
   3159 {
   3160   v4s = vcvtq_s32_f32(v.v4s);
   3161 }
   3162 
   3163 ALWAYS_INLINE GSVector4::GSVector4(const GSVector4i& v)
   3164 {
   3165   v4s = vcvtq_f32_s32(v.v4s);
   3166 }
   3167 
   3168 ALWAYS_INLINE GSVector4i GSVector4i::cast(const GSVector4& v)
   3169 {
   3170   return GSVector4i(vreinterpretq_s32_f32(v.v4s));
   3171 }
   3172 
   3173 ALWAYS_INLINE GSVector4 GSVector4::cast(const GSVector4i& v)
   3174 {
   3175   return GSVector4(vreinterpretq_f32_s32(v.v4s));
   3176 }
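
        // A minimal sketch of the difference between the converting constructors above (value
        // conversion through vcvt) and the cast() helpers (bit reinterpretation); values are
        // illustrative only:
        //
        //   const GSVector4 f(1.0f, 2.0f, 3.0f, 4.0f);
        //   const GSVector4i conv(f);                    // {1, 2, 3, 4} via vcvtq_s32_f32
        //   const GSVector4i bits = GSVector4i::cast(f); // raw IEEE bits: {0x3F800000, 0x40000000, 0x40400000, 0x40800000}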