duckstation

duckstation, archived from the revision just before upstream changed it to a proprietary software project; this version is the libre one
git clone https://git.neptards.moe/u3shit/duckstation.git
Log | Files | Refs | README | LICENSE

gsvector_sse.h (75332B)


      1 // SPDX-FileCopyrightText: 2002-2023 PCSX2 Dev Team, 2019-2024 Connor McLaughlin <stenzek@gmail.com>
      2 // SPDX-License-Identifier: LGPL-3.0+
      3 //
      4 // Lightweight wrapper over native SIMD types for cross-platform vector code.
      5 // Rewritten and NEON+No-SIMD variants added for DuckStation.
      6 //
      7 
      8 #pragma once
      9 
     10 #include "common/intrin.h"
     11 #include "common/types.h"
     12 
     13 #include <algorithm>
     14 
     15 #ifdef CPU_ARCH_AVX2
     16 #define GSVECTOR_HAS_UNSIGNED 1
     17 #define GSVECTOR_HAS_SRLV 1
     18 #endif
     19 
     20 class GSVector2;
     21 class GSVector2i;
     22 class GSVector4;
     23 class GSVector4i;
     24 
     25 class alignas(16) GSVector2i
     26 {
     27   struct cxpr_init_tag
     28   {
     29   };
     30   static constexpr cxpr_init_tag cxpr_init{};
     31 
     32   constexpr GSVector2i(cxpr_init_tag, s32 x, s32 y) : S32{x, y, 0, 0} {}
     33 
     34   constexpr GSVector2i(cxpr_init_tag, s16 s0, s16 s1, s16 s2, s16 s3) : S16{s0, s1, s2, s3, 0, 0, 0, 0} {}
     35 
     36   constexpr GSVector2i(cxpr_init_tag, s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7)
     37     : S8{b0, b1, b2, b3, b4, b5, b6, b7, 0, 0, 0, 0, 0, 0, 0, 0}
     38   {
     39   }
     40 
     41 public:
     42   union
     43   {
     44     struct
     45     {
     46       s32 x, y;
     47     };
     48     struct
     49     {
     50       s32 r, g;
     51     };
     52     float F32[4];
     53     s8 S8[16];
     54     s16 S16[8];
     55     s32 S32[4];
     56     s64 S64[2];
     57     u8 U8[16];
     58     u16 U16[8];
     59     u32 U32[4];
     60     u64 U64[2];
     61     __m128i m;
     62   };
     63 
     64   GSVector2i() = default;
     65 
     66   ALWAYS_INLINE constexpr static GSVector2i cxpr(s32 x, s32 y) { return GSVector2i(cxpr_init, x, y); }
     67   ALWAYS_INLINE constexpr static GSVector2i cxpr(s32 x) { return GSVector2i(cxpr_init, x, x); }
     68 
     69   ALWAYS_INLINE constexpr static GSVector2i cxpr16(s16 x) { return GSVector2i(cxpr_init, x, x, x, x); }
     70   ALWAYS_INLINE constexpr static GSVector2i cxpr16(s16 s0, s16 s1, s16 s2, s16 s3)
     71   {
     72     return GSVector2i(cxpr_init, s0, s1, s2, s3);
     73   }
     74 
     75   ALWAYS_INLINE constexpr static GSVector2i cxpr8(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7)
     76   {
     77     return GSVector2i(cxpr_init, b0, b1, b2, b3, b4, b5, b6, b7);
     78   }
     79 
     80   ALWAYS_INLINE GSVector2i(s32 x, s32 y) { m = _mm_set_epi32(0, 0, y, x); }
     81   ALWAYS_INLINE GSVector2i(s16 s0, s16 s1, s16 s2, s16 s3) { m = _mm_set_epi16(0, 0, 0, 0, s3, s2, s1, s0); }
     82   ALWAYS_INLINE constexpr GSVector2i(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7)
     83     : S8{b0, b1, b2, b3, b4, b5, b6, b7, 0, 0, 0, 0, 0, 0, 0, 0}
     84   {
     85   }
     86   ALWAYS_INLINE explicit GSVector2i(s32 i) { *this = i; }
     87   ALWAYS_INLINE explicit GSVector2i(const GSVector2& v);
     88   ALWAYS_INLINE constexpr explicit GSVector2i(__m128i m) : m(m) {}
     89 
     90   ALWAYS_INLINE GSVector2i& operator=(s32 i)
     91   {
     92     m = _mm_set1_epi32(i);
     93     return *this;
     94   }
     95 
     96   ALWAYS_INLINE GSVector2i& operator=(__m128i m_)
     97   {
     98     m = m_;
     99     return *this;
    100   }
    101 
    102   ALWAYS_INLINE operator __m128i() const { return m; }
    103 
    104   ALWAYS_INLINE GSVector2i sat_i8(const GSVector2i& min, const GSVector2i& max) const
    105   {
    106     return max_i8(min).min_i8(max);
    107   }
    108   ALWAYS_INLINE GSVector2i sat_i16(const GSVector2i& min, const GSVector2i& max) const
    109   {
    110     return max_i16(min).min_i16(max);
    111   }
    112   ALWAYS_INLINE GSVector2i sat_i32(const GSVector2i& min, const GSVector2i& max) const
    113   {
    114     return max_i32(min).min_i32(max);
    115   }
    116 
    117   ALWAYS_INLINE GSVector2i sat_u8(const GSVector2i& min, const GSVector2i& max) const
    118   {
    119     return max_u8(min).min_u8(max);
    120   }
    121   ALWAYS_INLINE GSVector2i sat_u16(const GSVector2i& min, const GSVector2i& max) const
    122   {
    123     return max_u16(min).min_u16(max);
    124   }
    125   ALWAYS_INLINE GSVector2i sat_u32(const GSVector2i& min, const GSVector2i& max) const
    126   {
    127     return max_u32(min).min_u32(max);
    128   }
    129 
    130   ALWAYS_INLINE GSVector2i min_i8(const GSVector2i& v) const { return GSVector2i(_mm_min_epi8(m, v)); }
    131   ALWAYS_INLINE GSVector2i max_i8(const GSVector2i& v) const { return GSVector2i(_mm_max_epi8(m, v)); }
    132   ALWAYS_INLINE GSVector2i min_i16(const GSVector2i& v) const { return GSVector2i(_mm_min_epi16(m, v)); }
    133   ALWAYS_INLINE GSVector2i max_i16(const GSVector2i& v) const { return GSVector2i(_mm_max_epi16(m, v)); }
    134   ALWAYS_INLINE GSVector2i min_i32(const GSVector2i& v) const { return GSVector2i(_mm_min_epi32(m, v)); }
    135   ALWAYS_INLINE GSVector2i max_i32(const GSVector2i& v) const { return GSVector2i(_mm_max_epi32(m, v)); }
    136 
    137   ALWAYS_INLINE GSVector2i min_u8(const GSVector2i& v) const { return GSVector2i(_mm_min_epu8(m, v)); }
    138   ALWAYS_INLINE GSVector2i max_u8(const GSVector2i& v) const { return GSVector2i(_mm_max_epu8(m, v)); }
    139   ALWAYS_INLINE GSVector2i min_u16(const GSVector2i& v) const { return GSVector2i(_mm_min_epu16(m, v)); }
    140   ALWAYS_INLINE GSVector2i max_u16(const GSVector2i& v) const { return GSVector2i(_mm_max_epu16(m, v)); }
    141   ALWAYS_INLINE GSVector2i min_u32(const GSVector2i& v) const { return GSVector2i(_mm_min_epu32(m, v)); }
    142   ALWAYS_INLINE GSVector2i max_u32(const GSVector2i& v) const { return GSVector2i(_mm_max_epu32(m, v)); }
    143 
    144   ALWAYS_INLINE s32 addv_s32() const { return _mm_cvtsi128_si32(_mm_hadd_epi32(m, m)); }
    145 
    146   ALWAYS_INLINE u8 minv_u8() const
    147   {
    148     __m128i vmin = _mm_min_epu8(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(1, 1, 1, 1)));
    149     return static_cast<u8>(std::min(
    150       static_cast<u32>(_mm_extract_epi8(vmin, 0)),
    151       std::min(static_cast<u32>(_mm_extract_epi8(vmin, 1)),
    152                std::min(static_cast<u32>(_mm_extract_epi8(vmin, 2)), static_cast<u32>(_mm_extract_epi8(vmin, 3))))));
    153   }
    154 
    155   ALWAYS_INLINE u16 maxv_u8() const
    156   {
    157     __m128i vmax = _mm_max_epu8(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(1, 1, 1, 1)));
    158     return static_cast<u8>(std::max(
    159       static_cast<u32>(_mm_extract_epi8(vmax, 0)),
    160       std::max(static_cast<u32>(_mm_extract_epi8(vmax, 1)),
    161                std::max(static_cast<u32>(_mm_extract_epi8(vmax, 2)), static_cast<u32>(_mm_extract_epi8(vmax, 3))))));
    162   }
    163 
    164   ALWAYS_INLINE u16 minv_u16() const
    165   {
    166     __m128i vmin = _mm_min_epu16(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(1, 1, 1, 1)));
    167     return static_cast<u16>(
    168       std::min(static_cast<u32>(_mm_extract_epi16(vmin, 0)), static_cast<u32>(_mm_extract_epi16(vmin, 1))));
    169   }
    170 
    171   ALWAYS_INLINE u16 maxv_u16() const
    172   {
    173     __m128i vmax = _mm_max_epu16(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(1, 1, 1, 1)));
    174     return static_cast<u16>(
    175       std::max<u32>(static_cast<u32>(_mm_extract_epi16(vmax, 0)), static_cast<u32>(_mm_extract_epi16(vmax, 1))));
    176   }
    177 
    178   ALWAYS_INLINE s32 minv_s32() const { return std::min<s32>(_mm_extract_epi32(m, 0), _mm_extract_epi32(m, 1)); }
    179   ALWAYS_INLINE u32 minv_u32() const { return std::min<u32>(_mm_extract_epi32(m, 0), _mm_extract_epi32(m, 1)); }
    180   ALWAYS_INLINE s32 maxv_s32() const { return std::max<s32>(_mm_extract_epi32(m, 0), _mm_extract_epi32(m, 1)); }
    181   ALWAYS_INLINE u32 maxv_u32() const { return std::max<u32>(_mm_extract_epi32(m, 0), _mm_extract_epi32(m, 1)); }
    182 
    183   ALWAYS_INLINE GSVector2i clamp8() const { return pu16().upl8(); }
    184 
    185   ALWAYS_INLINE GSVector2i blend8(const GSVector2i& v, const GSVector2i& mask) const
    186   {
    187     return GSVector2i(_mm_blendv_epi8(m, v, mask));
    188   }
    189 
    190   template<s32 mask>
    191   ALWAYS_INLINE GSVector2i blend16(const GSVector2i& v) const
    192   {
    193     return GSVector2i(_mm_blend_epi16(m, v, mask));
    194   }
    195 
    196   template<s32 mask>
    197   ALWAYS_INLINE GSVector2i blend32(const GSVector2i& v) const
    198   {
    199 #if defined(__AVX2__)
    200     return GSVector2i(_mm_blend_epi32(m, v.m, mask));
    201 #else
    202     constexpr s32 bit1 = ((mask & 2) * 3) << 1;
    203     constexpr s32 bit0 = (mask & 1) * 3;
    204     return blend16<bit1 | bit0>(v);
    205 #endif
    206   }
    207 
    208   ALWAYS_INLINE GSVector2i blend(const GSVector2i& v, const GSVector2i& mask) const
    209   {
    210     return GSVector2i(_mm_or_si128(_mm_andnot_si128(mask, m), _mm_and_si128(mask, v)));
    211   }
    212 
    213   ALWAYS_INLINE GSVector2i mix16(const GSVector2i& v) const { return blend16<0xa>(v); }
    214 
    215   ALWAYS_INLINE GSVector2i shuffle8(const GSVector2i& mask) const { return GSVector2i(_mm_shuffle_epi8(m, mask)); }
    216 
    217   ALWAYS_INLINE GSVector2i ps16() const { return GSVector2i(_mm_packs_epi16(m, m)); }
    218   ALWAYS_INLINE GSVector2i pu16() const { return GSVector2i(_mm_packus_epi16(m, m)); }
    219   ALWAYS_INLINE GSVector2i ps32() const { return GSVector2i(_mm_packs_epi32(m, m)); }
    220   ALWAYS_INLINE GSVector2i pu32() const { return GSVector2i(_mm_packus_epi32(m, m)); }
    221 
    222   ALWAYS_INLINE GSVector2i upl8(const GSVector2i& v) const { return GSVector2i(_mm_unpacklo_epi8(m, v)); }
    223   ALWAYS_INLINE GSVector2i uph8(const GSVector2i& v) const { return GSVector2i(_mm_unpackhi_epi8(m, v)); }
    224   ALWAYS_INLINE GSVector2i upl16(const GSVector2i& v) const { return GSVector2i(_mm_unpacklo_epi16(m, v)); }
    225   ALWAYS_INLINE GSVector2i uph16(const GSVector2i& v) const { return GSVector2i(_mm_unpackhi_epi16(m, v)); }
    226   ALWAYS_INLINE GSVector2i upl32(const GSVector2i& v) const { return GSVector2i(_mm_unpacklo_epi32(m, v)); }
    227   ALWAYS_INLINE GSVector2i uph32(const GSVector2i& v) const { return GSVector2i(_mm_unpackhi_epi32(m, v)); }
    228 
    229   ALWAYS_INLINE GSVector2i upl8() const { return GSVector2i(_mm_unpacklo_epi8(m, _mm_setzero_si128())); }
    230   ALWAYS_INLINE GSVector2i uph8() const { return GSVector2i(_mm_unpackhi_epi8(m, _mm_setzero_si128())); }
    231 
    232   ALWAYS_INLINE GSVector2i upl16() const { return GSVector2i(_mm_unpacklo_epi16(m, _mm_setzero_si128())); }
    233   ALWAYS_INLINE GSVector2i uph16() const { return GSVector2i(_mm_unpackhi_epi16(m, _mm_setzero_si128())); }
    234 
    235   ALWAYS_INLINE GSVector2i upl32() const { return GSVector2i(_mm_unpacklo_epi32(m, _mm_setzero_si128())); }
    236   ALWAYS_INLINE GSVector2i uph32() const { return GSVector2i(_mm_unpackhi_epi32(m, _mm_setzero_si128())); }
    237 
    238   ALWAYS_INLINE GSVector2i i8to16() const { return GSVector2i(_mm_cvtepi8_epi16(m)); }
    239 
    240 #ifdef CPU_ARCH_SSE41
    241   ALWAYS_INLINE GSVector2i u8to16() const { return GSVector2i(_mm_cvtepu8_epi16(m)); }
    242 #endif
    243 
    244   template<s32 i>
    245   ALWAYS_INLINE GSVector2i srl() const
    246   {
    247     return GSVector2i(_mm_srli_si128(m, i));
    248   }
    249 
    250   template<s32 i>
    251   ALWAYS_INLINE GSVector2i sll() const
    252   {
    253     return GSVector2i(_mm_slli_si128(m, i));
    254   }
    255 
    256   template<s32 i>
    257   ALWAYS_INLINE GSVector2i sll16() const
    258   {
    259     return GSVector2i(_mm_slli_epi16(m, i));
    260   }
    261 
    262   ALWAYS_INLINE GSVector2i sll16(s32 i) const { return GSVector2i(_mm_sll_epi16(m, _mm_cvtsi32_si128(i))); }
    263 
    264 #ifdef CPU_ARCH_AVX2
    265   ALWAYS_INLINE GSVector2i sllv16(const GSVector2i& v) const { return GSVector2i(_mm_sllv_epi16(m, v.m)); }
    266 #endif
    267 
    268   template<s32 i>
    269   ALWAYS_INLINE GSVector2i srl16() const
    270   {
    271     return GSVector2i(_mm_srli_epi16(m, i));
    272   }
    273 
    274   ALWAYS_INLINE GSVector2i srl16(s32 i) const { return GSVector2i(_mm_srl_epi16(m, _mm_cvtsi32_si128(i))); }
    275 
    276 #ifdef CPU_ARCH_AVX2
    277   ALWAYS_INLINE GSVector2i srlv16(const GSVector2i& v) const { return GSVector2i(_mm_srlv_epi16(m, v.m)); }
    278 #endif
    279 
    280   template<s32 i>
    281   ALWAYS_INLINE GSVector2i sra16() const
    282   {
    283     return GSVector2i(_mm_srai_epi16(m, i));
    284   }
    285 
    286   ALWAYS_INLINE GSVector2i sra16(s32 i) const { return GSVector2i(_mm_sra_epi16(m, _mm_cvtsi32_si128(i))); }
    287 
    288 #ifdef CPU_ARCH_AVX2
    289   ALWAYS_INLINE GSVector2i srav16(const GSVector2i& v) const { return GSVector2i(_mm_srav_epi16(m, v.m)); }
    290 #endif
    291 
    292   template<s32 i>
    293   ALWAYS_INLINE GSVector2i sll32() const
    294   {
    295     return GSVector2i(_mm_slli_epi32(m, i));
    296   }
    297 
    298   ALWAYS_INLINE GSVector2i sll32(s32 i) const { return GSVector2i(_mm_sll_epi32(m, _mm_cvtsi32_si128(i))); }
    299 
    300 #ifdef CPU_ARCH_AVX2
    301   ALWAYS_INLINE GSVector2i sllv32(const GSVector2i& v) const { return GSVector2i(_mm_sllv_epi32(m, v.m)); }
    302 #endif
    303 
    304   template<s32 i>
    305   ALWAYS_INLINE GSVector2i srl32() const
    306   {
    307     return GSVector2i(_mm_srli_epi32(m, i));
    308   }
    309 
    310   ALWAYS_INLINE GSVector2i srl32(s32 i) const { return GSVector2i(_mm_srl_epi32(m, _mm_cvtsi32_si128(i))); }
    311 
    312 #ifdef CPU_ARCH_AVX2
    313   ALWAYS_INLINE GSVector2i srlv32(const GSVector2i& v) const { return GSVector2i(_mm_srlv_epi32(m, v.m)); }
    314 #endif
    315 
    316   template<s32 i>
    317   ALWAYS_INLINE GSVector2i sra32() const
    318   {
    319     return GSVector2i(_mm_srai_epi32(m, i));
    320   }
    321 
    322   ALWAYS_INLINE GSVector2i sra32(s32 i) const { return GSVector2i(_mm_sra_epi32(m, _mm_cvtsi32_si128(i))); }
    323 
    324 #ifdef CPU_ARCH_AVX2
    325   ALWAYS_INLINE GSVector2i srav32(const GSVector2i& v) const { return GSVector2i(_mm_srav_epi32(m, v.m)); }
    326 #endif
    327 
    328   ALWAYS_INLINE GSVector2i add8(const GSVector2i& v) const { return GSVector2i(_mm_add_epi8(m, v.m)); }
    329   ALWAYS_INLINE GSVector2i add16(const GSVector2i& v) const { return GSVector2i(_mm_add_epi16(m, v.m)); }
    330   ALWAYS_INLINE GSVector2i add32(const GSVector2i& v) const { return GSVector2i(_mm_add_epi32(m, v.m)); }
    331   ALWAYS_INLINE GSVector2i adds8(const GSVector2i& v) const { return GSVector2i(_mm_adds_epi8(m, v.m)); }
    332   ALWAYS_INLINE GSVector2i adds16(const GSVector2i& v) const { return GSVector2i(_mm_adds_epi16(m, v.m)); }
    333   ALWAYS_INLINE GSVector2i addus8(const GSVector2i& v) const { return GSVector2i(_mm_adds_epu8(m, v.m)); }
    334   ALWAYS_INLINE GSVector2i addus16(const GSVector2i& v) const { return GSVector2i(_mm_adds_epu16(m, v.m)); }
    335 
    336   ALWAYS_INLINE GSVector2i sub8(const GSVector2i& v) const { return GSVector2i(_mm_sub_epi8(m, v.m)); }
    337   ALWAYS_INLINE GSVector2i sub16(const GSVector2i& v) const { return GSVector2i(_mm_sub_epi16(m, v.m)); }
    338   ALWAYS_INLINE GSVector2i sub32(const GSVector2i& v) const { return GSVector2i(_mm_sub_epi32(m, v.m)); }
    339   ALWAYS_INLINE GSVector2i subs8(const GSVector2i& v) const { return GSVector2i(_mm_subs_epi8(m, v.m)); }
    340   ALWAYS_INLINE GSVector2i subs16(const GSVector2i& v) const { return GSVector2i(_mm_subs_epi16(m, v.m)); }
    341   ALWAYS_INLINE GSVector2i subus8(const GSVector2i& v) const { return GSVector2i(_mm_subs_epu8(m, v.m)); }
    342   ALWAYS_INLINE GSVector2i subus16(const GSVector2i& v) const { return GSVector2i(_mm_subs_epu16(m, v.m)); }
    343 
    344   ALWAYS_INLINE GSVector2i avg8(const GSVector2i& v) const { return GSVector2i(_mm_avg_epu8(m, v.m)); }
    345   ALWAYS_INLINE GSVector2i avg16(const GSVector2i& v) const { return GSVector2i(_mm_avg_epu16(m, v.m)); }
    346 
    347   ALWAYS_INLINE GSVector2i mul16l(const GSVector2i& v) const { return GSVector2i(_mm_mullo_epi16(m, v.m)); }
    348   ALWAYS_INLINE GSVector2i mul32l(const GSVector2i& v) const { return GSVector2i(_mm_mullo_epi32(m, v.m)); }
    349 
    350   ALWAYS_INLINE bool eq(const GSVector2i& v) const { return eq8(v).alltrue(); }
    351 
    352   ALWAYS_INLINE GSVector2i eq8(const GSVector2i& v) const { return GSVector2i(_mm_cmpeq_epi8(m, v.m)); }
    353   ALWAYS_INLINE GSVector2i eq16(const GSVector2i& v) const { return GSVector2i(_mm_cmpeq_epi16(m, v.m)); }
    354   ALWAYS_INLINE GSVector2i eq32(const GSVector2i& v) const { return GSVector2i(_mm_cmpeq_epi32(m, v.m)); }
    355 
    356   ALWAYS_INLINE GSVector2i neq8(const GSVector2i& v) const { return ~eq8(v); }
    357   ALWAYS_INLINE GSVector2i neq16(const GSVector2i& v) const { return ~eq16(v); }
    358   ALWAYS_INLINE GSVector2i neq32(const GSVector2i& v) const { return ~eq32(v); }
    359 
    360   ALWAYS_INLINE GSVector2i gt8(const GSVector2i& v) const { return GSVector2i(_mm_cmpgt_epi8(m, v.m)); }
    361   ALWAYS_INLINE GSVector2i gt16(const GSVector2i& v) const { return GSVector2i(_mm_cmpgt_epi16(m, v.m)); }
    362   ALWAYS_INLINE GSVector2i gt32(const GSVector2i& v) const { return GSVector2i(_mm_cmpgt_epi32(m, v.m)); }
    363 
    364   ALWAYS_INLINE GSVector2i ge8(const GSVector2i& v) const { return ~GSVector2i(_mm_cmplt_epi8(m, v.m)); }
    365   ALWAYS_INLINE GSVector2i ge16(const GSVector2i& v) const { return ~GSVector2i(_mm_cmplt_epi16(m, v.m)); }
    366   ALWAYS_INLINE GSVector2i ge32(const GSVector2i& v) const { return ~GSVector2i(_mm_cmplt_epi32(m, v.m)); }
    367 
    368   ALWAYS_INLINE GSVector2i lt8(const GSVector2i& v) const { return GSVector2i(_mm_cmplt_epi8(m, v.m)); }
    369   ALWAYS_INLINE GSVector2i lt16(const GSVector2i& v) const { return GSVector2i(_mm_cmplt_epi16(m, v.m)); }
    370   ALWAYS_INLINE GSVector2i lt32(const GSVector2i& v) const { return GSVector2i(_mm_cmplt_epi32(m, v.m)); }
    371 
    372   ALWAYS_INLINE GSVector2i le8(const GSVector2i& v) const { return ~GSVector2i(_mm_cmpgt_epi8(m, v.m)); }
    373   ALWAYS_INLINE GSVector2i le16(const GSVector2i& v) const { return ~GSVector2i(_mm_cmpgt_epi16(m, v.m)); }
    374   ALWAYS_INLINE GSVector2i le32(const GSVector2i& v) const { return ~GSVector2i(_mm_cmpgt_epi32(m, v.m)); }
    375 
    376   ALWAYS_INLINE GSVector2i andnot(const GSVector2i& v) const { return GSVector2i(_mm_andnot_si128(v.m, m)); }
    377 
    378   ALWAYS_INLINE s32 mask() const { return (_mm_movemask_epi8(m) & 0xff); }
    379 
    380   ALWAYS_INLINE bool alltrue() const { return (mask() == 0xff); }
    381   ALWAYS_INLINE bool allfalse() const { return (mask() == 0x00); }
    382 
    383   template<s32 i>
    384   ALWAYS_INLINE GSVector2i insert8(s32 a) const
    385   {
    386     return GSVector2i(_mm_insert_epi8(m, a, i));
    387   }
    388 
    389   template<s32 i>
    390   ALWAYS_INLINE s32 extract8() const
    391   {
    392     return _mm_extract_epi8(m, i);
    393   }
    394 
    395   template<s32 i>
    396   ALWAYS_INLINE GSVector2i insert16(s32 a) const
    397   {
    398     return GSVector2i(_mm_insert_epi16(m, a, i));
    399   }
    400 
    401   template<s32 i>
    402   ALWAYS_INLINE s32 extract16() const
    403   {
    404     return _mm_extract_epi16(m, i);
    405   }
    406 
    407   template<s32 i>
    408   ALWAYS_INLINE GSVector2i insert32(s32 a) const
    409   {
    410     return GSVector2i(_mm_insert_epi32(m, a, i));
    411   }
    412 
    413   template<s32 i>
    414   ALWAYS_INLINE s32 extract32() const
    415   {
    416     if constexpr (i == 0)
    417       return GSVector2i::store(*this);
    418 
    419     return _mm_extract_epi32(m, i);
    420   }
    421 
    422   ALWAYS_INLINE static GSVector2i load32(const void* p) { return GSVector2i(_mm_loadu_si32(p)); }
    423   ALWAYS_INLINE static GSVector2i load(const void* p)
    424   {
    425     return GSVector2i(_mm_loadl_epi64(static_cast<const __m128i*>(p)));
    426   }
    427   ALWAYS_INLINE static GSVector2i load(s32 i) { return GSVector2i(_mm_cvtsi32_si128(i)); }
    428   ALWAYS_INLINE static GSVector2i loadq(s64 i) { return GSVector2i(_mm_cvtsi64_si128(i)); }
    429 
    430   ALWAYS_INLINE static void store(void* p, const GSVector2i& v) { _mm_storel_epi64(static_cast<__m128i*>(p), v.m); }
    431   ALWAYS_INLINE static void store32(void* p, const GSVector2i& v) { _mm_storeu_si32(p, v); }
    432   ALWAYS_INLINE static s32 store(const GSVector2i& v) { return _mm_cvtsi128_si32(v.m); }
    433   ALWAYS_INLINE static s64 storeq(const GSVector2i& v) { return _mm_cvtsi128_si64(v.m); }
    434 
    435   ALWAYS_INLINE GSVector2i& operator&=(const GSVector2i& v)
    436   {
    437     m = _mm_and_si128(m, v);
    438     return *this;
    439   }
    440 
    441   ALWAYS_INLINE GSVector2i& operator|=(const GSVector2i& v)
    442   {
    443     m = _mm_or_si128(m, v);
    444     return *this;
    445   }
    446 
    447   ALWAYS_INLINE GSVector2i& operator^=(const GSVector2i& v)
    448   {
    449     m = _mm_xor_si128(m, v);
    450     return *this;
    451   }
    452 
    453   ALWAYS_INLINE friend GSVector2i operator&(const GSVector2i& v1, const GSVector2i& v2)
    454   {
    455     return GSVector2i(_mm_and_si128(v1, v2));
    456   }
    457 
    458   ALWAYS_INLINE friend GSVector2i operator|(const GSVector2i& v1, const GSVector2i& v2)
    459   {
    460     return GSVector2i(_mm_or_si128(v1, v2));
    461   }
    462 
    463   ALWAYS_INLINE friend GSVector2i operator^(const GSVector2i& v1, const GSVector2i& v2)
    464   {
    465     return GSVector2i(_mm_xor_si128(v1, v2));
    466   }
    467 
    468   ALWAYS_INLINE friend GSVector2i operator&(const GSVector2i& v, s32 i) { return v & GSVector2i(i); }
    469 
    470   ALWAYS_INLINE friend GSVector2i operator|(const GSVector2i& v, s32 i) { return v | GSVector2i(i); }
    471 
    472   ALWAYS_INLINE friend GSVector2i operator^(const GSVector2i& v, s32 i) { return v ^ GSVector2i(i); }
    473 
    474   ALWAYS_INLINE friend GSVector2i operator~(const GSVector2i& v) { return v ^ v.eq32(v); }
    475 
    476   ALWAYS_INLINE static GSVector2i zero() { return GSVector2i(_mm_setzero_si128()); }
    477   ALWAYS_INLINE static GSVector2i cast(const GSVector2& v);
    478 
    479   ALWAYS_INLINE GSVector2i xy() const { return GSVector2i(m); }
    480   ALWAYS_INLINE GSVector2i xx() const { return GSVector2i(_mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 0, 0))); }
    481   ALWAYS_INLINE GSVector2i yx() const { return GSVector2i(_mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 0, 1))); }
    482   ALWAYS_INLINE GSVector2i yy() const { return GSVector2i(_mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 1, 1))); }
    483 };
    484 
    485 class alignas(16) GSVector2
    486 {
    487   struct cxpr_init_tag
    488   {
    489   };
    490   static constexpr cxpr_init_tag cxpr_init{};
    491 
    492   constexpr GSVector2(cxpr_init_tag, float x, float y) : F32{x, y} {}
    493   constexpr GSVector2(cxpr_init_tag, int x, int y) : I32{x, y} {}
    494 
    495 public:
    496   union
    497   {
    498     struct
    499     {
    500       float x, y;
    501     };
    502     struct
    503     {
    504       float r, g;
    505     };
    506     float F32[4];
    507     double F64[2];
    508     s8 I8[16];
    509     s16 I16[8];
    510     s32 I32[4];
    511     s64 I64[2];
    512     u8 U8[16];
    513     u16 U16[8];
    514     u32 U32[4];
    515     u64 U64[2];
    516     __m128 m;
    517   };
    518 
    519   GSVector2() = default;
    520 
    521   constexpr static GSVector2 cxpr(float x, float y) { return GSVector2(cxpr_init, x, y); }
    522   constexpr static GSVector2 cxpr(float x) { return GSVector2(cxpr_init, x, x); }
    523   constexpr static GSVector2 cxpr(int x, int y) { return GSVector2(cxpr_init, x, y); }
    524   constexpr static GSVector2 cxpr(int x) { return GSVector2(cxpr_init, x, x); }
    525 
    526   ALWAYS_INLINE GSVector2(float x, float y) { m = _mm_set_ps(0, 0, y, x); }
    527   ALWAYS_INLINE GSVector2(int x, int y)
    528   {
    529     GSVector2i v_(x, y);
    530     m = _mm_cvtepi32_ps(v_.m);
    531   }
    532 
    533   ALWAYS_INLINE constexpr explicit GSVector2(__m128 m) : m(m) {}
    534   ALWAYS_INLINE explicit GSVector2(__m128d m) : m(_mm_castpd_ps(m)) {}
    535   ALWAYS_INLINE explicit GSVector2(float f) { *this = f; }
    536   ALWAYS_INLINE explicit GSVector2(int i)
    537   {
    538 #ifdef CPU_ARCH_AVX2
    539     m = _mm_cvtepi32_ps(_mm_broadcastd_epi32(_mm_cvtsi32_si128(i)));
    540 #else
    541     *this = GSVector2(GSVector2i(i));
    542 #endif
    543   }
    544 
    545   ALWAYS_INLINE explicit GSVector2(const GSVector2i& v);
    546 
    547   ALWAYS_INLINE GSVector2& operator=(float f)
    548   {
    549     m = _mm_set1_ps(f);
    550     return *this;
    551   }
    552 
    553   ALWAYS_INLINE GSVector2& operator=(__m128 m_)
    554   {
    555     m = m_;
    556     return *this;
    557   }
    558 
    559   ALWAYS_INLINE operator __m128() const { return m; }
    560 
    561   ALWAYS_INLINE GSVector2 abs() const { return *this & cast(GSVector2i::cxpr(0x7fffffff)); }
    562   ALWAYS_INLINE GSVector2 neg() const { return *this ^ cast(GSVector2i::cxpr(0x80000000)); }
    563   ALWAYS_INLINE GSVector2 rcp() const { return GSVector2(_mm_rcp_ps(m)); }
    564   ALWAYS_INLINE GSVector2 floor() const
    565   {
    566     return GSVector2(_mm_round_ps(m, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC));
    567   }
    568 
    569   ALWAYS_INLINE GSVector2 ceil() const { return GSVector2(_mm_round_ps(m, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)); }
    570 
    571   ALWAYS_INLINE GSVector2 sat(const GSVector2& min, const GSVector2& max) const
    572   {
    573     return GSVector2(_mm_min_ps(_mm_max_ps(m, min), max));
    574   }
    575 
    576   ALWAYS_INLINE GSVector2 sat(const float scale = 255) const { return sat(zero(), GSVector2(scale)); }
    577 
    578   ALWAYS_INLINE GSVector2 clamp(const float scale = 255) const { return min(GSVector2(scale)); }
    579 
    580   ALWAYS_INLINE GSVector2 min(const GSVector2& v) const { return GSVector2(_mm_min_ps(m, v)); }
    581 
    582   ALWAYS_INLINE GSVector2 max(const GSVector2& v) const { return GSVector2(_mm_max_ps(m, v)); }
    583 
    584   template<int mask>
    585   ALWAYS_INLINE GSVector2 blend32(const GSVector2& v) const
    586   {
    587     return GSVector2(_mm_blend_ps(m, v, mask));
    588   }
    589 
    590   ALWAYS_INLINE GSVector2 blend32(const GSVector2& v, const GSVector2& mask) const
    591   {
    592     return GSVector2(_mm_blendv_ps(m, v, mask));
    593   }
    594 
    595   ALWAYS_INLINE GSVector2 andnot(const GSVector2& v) const { return GSVector2(_mm_andnot_ps(v.m, m)); }
    596 
    597   ALWAYS_INLINE int mask() const { return (_mm_movemask_ps(m) & 0x3); }
    598 
    599   ALWAYS_INLINE bool alltrue() const { return (mask() == 0x3); }
    600 
    601   ALWAYS_INLINE bool allfalse() const { return (mask() == 0x0); }
    602 
    603   ALWAYS_INLINE GSVector2 replace_nan(const GSVector2& v) const { return v.blend32(*this, *this == *this); }
    604 
    605   template<int src, int dst>
    606   ALWAYS_INLINE GSVector2 insert32(const GSVector2& v) const
    607   {
    608     if constexpr (src == dst)
    609       return GSVector2(_mm_blend_ps(m, v.m, 1 << src));
    610     else
    611       return GSVector2(_mm_insert_ps(m, v.m, _MM_MK_INSERTPS_NDX(src, dst, 0)));
    612   }
    613 
    614   template<int i>
    615   ALWAYS_INLINE int extract32() const
    616   {
    617     return _mm_extract_ps(m, i);
    618   }
    619 
    620   ALWAYS_INLINE float dot(const GSVector2& v) const { return _mm_cvtss_f32(_mm_dp_ps(m, v.m, 0x31)); }
    621 
    622   ALWAYS_INLINE static GSVector2 zero() { return GSVector2(_mm_setzero_ps()); }
    623 
    624   ALWAYS_INLINE static GSVector2 xffffffff() { return zero() == zero(); }
    625 
    626   ALWAYS_INLINE static GSVector2 load(const void* p)
    627   {
    628     return GSVector2(_mm_castpd_ps(_mm_load_sd(static_cast<const double*>(p))));
    629   }
    630 
    631   ALWAYS_INLINE static GSVector2 load(float f) { return GSVector2(_mm_load_ss(&f)); }
    632 
    633   ALWAYS_INLINE static void store(void* p, const GSVector2& v)
    634   {
    635     _mm_store_sd(static_cast<double*>(p), _mm_castps_pd(v.m));
    636   }
    637 
    638   ALWAYS_INLINE GSVector2 operator-() const { return neg(); }
    639 
    640   ALWAYS_INLINE GSVector2& operator+=(const GSVector2& v_)
    641   {
    642     m = _mm_add_ps(m, v_);
    643     return *this;
    644   }
    645   ALWAYS_INLINE GSVector2& operator-=(const GSVector2& v_)
    646   {
    647     m = _mm_sub_ps(m, v_);
    648     return *this;
    649   }
    650   ALWAYS_INLINE GSVector2& operator*=(const GSVector2& v_)
    651   {
    652     m = _mm_mul_ps(m, v_);
    653     return *this;
    654   }
    655   ALWAYS_INLINE GSVector2& operator/=(const GSVector2& v_)
    656   {
    657     m = _mm_div_ps(m, v_);
    658     return *this;
    659   }
    660 
    661   ALWAYS_INLINE GSVector2& operator+=(float f)
    662   {
    663     *this += GSVector2(f);
    664     return *this;
    665   }
    666   ALWAYS_INLINE GSVector2& operator-=(float f)
    667   {
    668     *this -= GSVector2(f);
    669     return *this;
    670   }
    671   ALWAYS_INLINE GSVector2& operator*=(float f)
    672   {
    673     *this *= GSVector2(f);
    674     return *this;
    675   }
    676   ALWAYS_INLINE GSVector2& operator/=(float f)
    677   {
    678     *this /= GSVector2(f);
    679     return *this;
    680   }
    681 
    682   ALWAYS_INLINE GSVector2& operator&=(const GSVector2& v_)
    683   {
    684     m = _mm_and_ps(m, v_);
    685     return *this;
    686   }
    687   ALWAYS_INLINE GSVector2& operator|=(const GSVector2& v_)
    688   {
    689     m = _mm_or_ps(m, v_);
    690     return *this;
    691   }
    692   ALWAYS_INLINE GSVector2& operator^=(const GSVector2& v_)
    693   {
    694     m = _mm_xor_ps(m, v_);
    695     return *this;
    696   }
    697 
  // Element-wise arithmetic. Scalar overloads broadcast the scalar first.
  ALWAYS_INLINE friend GSVector2 operator+(const GSVector2& v1, const GSVector2& v2)
  {
    return GSVector2(_mm_add_ps(v1, v2));
  }

  ALWAYS_INLINE friend GSVector2 operator-(const GSVector2& v1, const GSVector2& v2)
  {
    return GSVector2(_mm_sub_ps(v1, v2));
  }

  ALWAYS_INLINE friend GSVector2 operator*(const GSVector2& v1, const GSVector2& v2)
  {
    return GSVector2(_mm_mul_ps(v1, v2));
  }

  ALWAYS_INLINE friend GSVector2 operator/(const GSVector2& v1, const GSVector2& v2)
  {
    return GSVector2(_mm_div_ps(v1, v2));
  }

  ALWAYS_INLINE friend GSVector2 operator+(const GSVector2& v, float f) { return v + GSVector2(f); }

  ALWAYS_INLINE friend GSVector2 operator-(const GSVector2& v, float f) { return v - GSVector2(f); }

  ALWAYS_INLINE friend GSVector2 operator*(const GSVector2& v, float f) { return v * GSVector2(f); }

  ALWAYS_INLINE friend GSVector2 operator/(const GSVector2& v, float f) { return v / GSVector2(f); }
    725 
  // Bitwise operators over the raw float bit patterns.
  ALWAYS_INLINE friend GSVector2 operator&(const GSVector2& v1, const GSVector2& v2)
  {
    return GSVector2(_mm_and_ps(v1, v2));
  }

  ALWAYS_INLINE friend GSVector2 operator|(const GSVector2& v1, const GSVector2& v2)
  {
    return GSVector2(_mm_or_ps(v1, v2));
  }

  ALWAYS_INLINE friend GSVector2 operator^(const GSVector2& v1, const GSVector2& v2)
  {
    return GSVector2(_mm_xor_ps(v1, v2));
  }
    740 
  // Element-wise comparisons. Each lane of the result is an all-ones or
  // all-zeros bit mask (the SSE cmp*_ps convention), not a bool.
  ALWAYS_INLINE friend GSVector2 operator==(const GSVector2& v1, const GSVector2& v2)
  {
    return GSVector2(_mm_cmpeq_ps(v1, v2));
  }

  ALWAYS_INLINE friend GSVector2 operator!=(const GSVector2& v1, const GSVector2& v2)
  {
    return GSVector2(_mm_cmpneq_ps(v1, v2));
  }

  ALWAYS_INLINE friend GSVector2 operator>(const GSVector2& v1, const GSVector2& v2)
  {
    return GSVector2(_mm_cmpgt_ps(v1, v2));
  }

  ALWAYS_INLINE friend GSVector2 operator<(const GSVector2& v1, const GSVector2& v2)
  {
    return GSVector2(_mm_cmplt_ps(v1, v2));
  }

  ALWAYS_INLINE friend GSVector2 operator>=(const GSVector2& v1, const GSVector2& v2)
  {
    return GSVector2(_mm_cmpge_ps(v1, v2));
  }

  ALWAYS_INLINE friend GSVector2 operator<=(const GSVector2& v1, const GSVector2& v2)
  {
    return GSVector2(_mm_cmple_ps(v1, v2));
  }
    770 
  // Bit-level reinterpretation of the integer vector; declaration only,
  // defined once GSVector2i is complete.
  ALWAYS_INLINE static GSVector2 cast(const GSVector2i& v);

  // Two-component swizzles. Only the low two lanes are meaningful; the
  // shuffle leaves the upper two lanes as-is (don't-care for GSVector2).
  ALWAYS_INLINE GSVector2 xy() const { return *this; }
  ALWAYS_INLINE GSVector2 xx() const { return GSVector2(_mm_shuffle_ps(m, m, _MM_SHUFFLE(3, 2, 0, 0))); }
  ALWAYS_INLINE GSVector2 yx() const { return GSVector2(_mm_shuffle_ps(m, m, _MM_SHUFFLE(3, 2, 0, 1))); }
  ALWAYS_INLINE GSVector2 yy() const { return GSVector2(_mm_shuffle_ps(m, m, _MM_SHUFFLE(3, 2, 1, 1))); }
    777 };
    778 
    779 class alignas(16) GSVector4i
    780 {
  // Tag type that selects the constexpr constructors below so they do not
  // collide with the runtime (intrinsic-based) overloads.
  struct cxpr_init_tag
  {
  };
  static constexpr cxpr_init_tag cxpr_init{};

  constexpr GSVector4i(cxpr_init_tag, s32 x, s32 y, s32 z, s32 w) : S32{x, y, z, w} {}

  constexpr GSVector4i(cxpr_init_tag, s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7)
    : S16{s0, s1, s2, s3, s4, s5, s6, s7}
  {
  }

  constexpr GSVector4i(cxpr_init_tag, s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9, s8 b10,
                       s8 b11, s8 b12, s8 b13, s8 b14, s8 b15)
    : S8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15}
  {
  }

public:
  // All members alias the same 128-bit register: component names (x/y/z/w,
  // r/g/b/a), rectangle names (left/top/right/bottom), raw element arrays,
  // and the native SSE value.
  union
  {
    struct
    {
      s32 x, y, z, w;
    };
    struct
    {
      s32 r, g, b, a;
    };
    struct
    {
      s32 left, top, right, bottom;
    };
    float F32[4];
    s8 S8[16];
    s16 S16[8];
    s32 S32[4];
    s64 S64[2];
    u8 U8[16];
    u16 U16[8];
    u32 U32[4];
    u64 U64[2];
    __m128i m;
  };

  // Intentionally uninitialized (performance); use cxpr()/zero() variants
  // when a defined value is required.
  GSVector4i() = default;
    827 
  // Compile-time construction helpers (usable in constexpr contexts where
  // the _mm_set* intrinsics are not).
  ALWAYS_INLINE constexpr static GSVector4i cxpr(s32 x, s32 y, s32 z, s32 w)
  {
    return GSVector4i(cxpr_init, x, y, z, w);
  }
  ALWAYS_INLINE constexpr static GSVector4i cxpr(s32 x) { return GSVector4i(cxpr_init, x, x, x, x); }

  ALWAYS_INLINE constexpr static GSVector4i cxpr16(s16 x) { return GSVector4i(cxpr_init, x, x, x, x, x, x, x, x); }
  ALWAYS_INLINE constexpr static GSVector4i cxpr16(s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7)
  {
    return GSVector4i(cxpr_init, s0, s1, s2, s3, s4, s5, s6, s7);
  }

  ALWAYS_INLINE constexpr static GSVector4i cxpr8(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9,
                                                  s8 b10, s8 b11, s8 b12, s8 b13, s8 b14, s8 b15)
  {
    return GSVector4i(cxpr_init, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15);
  }
    845 
  // _mm_set_epi32 takes arguments high-to-low, hence the reversed order.
  ALWAYS_INLINE GSVector4i(s32 x, s32 y, s32 z, s32 w) { m = _mm_set_epi32(w, z, y, x); }
  // Builds (x, y, 0, 0) by interleaving two scalar loads.
  ALWAYS_INLINE GSVector4i(s32 x, s32 y) { *this = load(x).upl32(load(y)); }
  ALWAYS_INLINE GSVector4i(s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7)
  {
    m = _mm_set_epi16(s7, s6, s5, s4, s3, s2, s1, s0);
  }

  ALWAYS_INLINE constexpr GSVector4i(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9, s8 b10,
                                     s8 b11, s8 b12, s8 b13, s8 b14, s8 b15)
    : S8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15}
  {
  }

  // Reuses the 2-wide vector's underlying register as-is.
  ALWAYS_INLINE explicit GSVector4i(const GSVector2i& v) { m = v.m; }

  // Broadcast, via operator=(s32).
  ALWAYS_INLINE explicit GSVector4i(s32 i) { *this = i; }

  // Float -> int conversions; defined after GSVector2/GSVector4 are complete.
  ALWAYS_INLINE explicit GSVector4i(const GSVector2& v);

  ALWAYS_INLINE explicit GSVector4i(const GSVector4& v);

  ALWAYS_INLINE constexpr explicit GSVector4i(__m128i m) : m(m) {}
    868 
  // Broadcast-assign a scalar to all four 32-bit lanes.
  ALWAYS_INLINE GSVector4i& operator=(s32 i)
  {
    m = _mm_set1_epi32(i);
    return *this;
  }
  ALWAYS_INLINE GSVector4i& operator=(__m128i m_)
  {
    m = m_;
    return *this;
  }

  // Implicit conversion so vectors can be passed straight to intrinsics.
  ALWAYS_INLINE operator __m128i() const { return m; }
    881 
  // Rectangle helpers: the vector is interpreted as (left, top, right, bottom).
  ALWAYS_INLINE s32 width() const { return right - left; }

  ALWAYS_INLINE s32 height() const { return bottom - top; }

  ALWAYS_INLINE GSVector4i rsize() const
  {
    return sub32(xyxy()); // same as GSVector4i(0, 0, width(), height());
  }

  ALWAYS_INLINE s32 rarea() const { return width() * height(); }

  // lt32(zwzw()) compares (l,t,r,b) < (r,b,r,b) per lane; mask() yields a
  // per-byte movemask, so a non-empty rect produces exactly 0x00ff
  // (l<r and t<b true, r<r and b<b false).
  ALWAYS_INLINE bool rempty() const { return lt32(zwzw()).mask() != 0x00ff; }

  // Bounding rectangle: min of the top-left corners in the low half, max of
  // the bottom-right corners (shifted down 8 bytes) in the high half.
  ALWAYS_INLINE GSVector4i runion(const GSVector4i& v) const { return min_i32(v).upl64(max_i32(v).srl<8>()); }

  // Intersection clamps this rect to v; may produce an empty rect.
  ALWAYS_INLINE GSVector4i rintersect(const GSVector4i& v) const { return sat_i32(v); }
  ALWAYS_INLINE bool rintersects(const GSVector4i& v) const { return !rintersect(v).rempty(); }
  ALWAYS_INLINE bool rcontains(const GSVector4i& v) const { return rintersect(v).eq(v); }
    900 
  // Packs the four 32-bit channels into a single RGBA8 value:
  // s32 -> s16 with signed saturation, then s16 -> u8 with unsigned
  // saturation, returning the low 32 bits.
  ALWAYS_INLINE u32 rgba32() const
  {
    GSVector4i v = *this;

    v = v.ps32(v);
    v = v.pu16(v);

    return (u32)store(v);
  }
    910 
  // Per-lane clamp to [min, max] for each element width/signedness.
  // The single-argument overloads take a packed bound vector: min in the
  // low half (xy), max in the high half (zw).
  ALWAYS_INLINE GSVector4i sat_i8(const GSVector4i& min, const GSVector4i& max) const
  {
    return max_i8(min).min_i8(max);
  }
  ALWAYS_INLINE GSVector4i sat_i8(const GSVector4i& minmax) const
  {
    return max_i8(minmax.xyxy()).min_i8(minmax.zwzw());
  }
  ALWAYS_INLINE GSVector4i sat_i16(const GSVector4i& min, const GSVector4i& max) const
  {
    return max_i16(min).min_i16(max);
  }
  ALWAYS_INLINE GSVector4i sat_i16(const GSVector4i& minmax) const
  {
    return max_i16(minmax.xyxy()).min_i16(minmax.zwzw());
  }
  ALWAYS_INLINE GSVector4i sat_i32(const GSVector4i& min, const GSVector4i& max) const
  {
    return max_i32(min).min_i32(max);
  }
  ALWAYS_INLINE GSVector4i sat_i32(const GSVector4i& minmax) const
  {
    return max_i32(minmax.xyxy()).min_i32(minmax.zwzw());
  }

  ALWAYS_INLINE GSVector4i sat_u8(const GSVector4i& min, const GSVector4i& max) const
  {
    return max_u8(min).min_u8(max);
  }
  ALWAYS_INLINE GSVector4i sat_u8(const GSVector4i& minmax) const
  {
    return max_u8(minmax.xyxy()).min_u8(minmax.zwzw());
  }
  ALWAYS_INLINE GSVector4i sat_u16(const GSVector4i& min, const GSVector4i& max) const
  {
    return max_u16(min).min_u16(max);
  }
  ALWAYS_INLINE GSVector4i sat_u16(const GSVector4i& minmax) const
  {
    return max_u16(minmax.xyxy()).min_u16(minmax.zwzw());
  }
  ALWAYS_INLINE GSVector4i sat_u32(const GSVector4i& min, const GSVector4i& max) const
  {
    return max_u32(min).min_u32(max);
  }
  ALWAYS_INLINE GSVector4i sat_u32(const GSVector4i& minmax) const
  {
    return max_u32(minmax.xyxy()).min_u32(minmax.zwzw());
  }
    960 
  // Per-lane minimum/maximum for each element width and signedness.
  ALWAYS_INLINE GSVector4i min_i8(const GSVector4i& v) const { return GSVector4i(_mm_min_epi8(m, v)); }
  ALWAYS_INLINE GSVector4i max_i8(const GSVector4i& v) const { return GSVector4i(_mm_max_epi8(m, v)); }
  ALWAYS_INLINE GSVector4i min_i16(const GSVector4i& v) const { return GSVector4i(_mm_min_epi16(m, v)); }
  ALWAYS_INLINE GSVector4i max_i16(const GSVector4i& v) const { return GSVector4i(_mm_max_epi16(m, v)); }
  ALWAYS_INLINE GSVector4i min_i32(const GSVector4i& v) const { return GSVector4i(_mm_min_epi32(m, v)); }
  ALWAYS_INLINE GSVector4i max_i32(const GSVector4i& v) const { return GSVector4i(_mm_max_epi32(m, v)); }

  ALWAYS_INLINE GSVector4i min_u8(const GSVector4i& v) const { return GSVector4i(_mm_min_epu8(m, v)); }
  ALWAYS_INLINE GSVector4i max_u8(const GSVector4i& v) const { return GSVector4i(_mm_max_epu8(m, v)); }
  ALWAYS_INLINE GSVector4i min_u16(const GSVector4i& v) const { return GSVector4i(_mm_min_epu16(m, v)); }
  ALWAYS_INLINE GSVector4i max_u16(const GSVector4i& v) const { return GSVector4i(_mm_max_epu16(m, v)); }
  ALWAYS_INLINE GSVector4i min_u32(const GSVector4i& v) const { return GSVector4i(_mm_min_epu32(m, v)); }
  ALWAYS_INLINE GSVector4i max_u32(const GSVector4i& v) const { return GSVector4i(_mm_max_epu32(m, v)); }
    974 
  // Multiply adjacent s16 pairs and sum each pair into an s32 lane (pmaddwd).
  ALWAYS_INLINE GSVector4i madd_s16(const GSVector4i& v) const { return GSVector4i(_mm_madd_epi16(m, v.m)); }

  // Horizontal pairwise add of the s32 lanes.
  ALWAYS_INLINE GSVector4i addp_s32() const { return GSVector4i(_mm_hadd_epi32(m, m)); }

  // Horizontal sum of all four s32 lanes via two pairwise adds.
  ALWAYS_INLINE s32 addv_s32() const
  {
    const __m128i pairs = _mm_hadd_epi32(m, m);
    return _mm_cvtsi128_si32(_mm_hadd_epi32(pairs, pairs));
  }
    984 
  // Horizontal reductions: fold the upper half onto the lower half, then
  // fold lane 1 onto lane 0, then finish the remaining elements in scalar
  // code.
  ALWAYS_INLINE u8 minv_u8() const
  {
    __m128i vmin = _mm_min_epu8(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 3, 2)));
    vmin = _mm_min_epu8(vmin, _mm_shuffle_epi32(vmin, _MM_SHUFFLE(1, 1, 1, 1)));
    return static_cast<u8>(std::min(
      static_cast<u32>(_mm_extract_epi8(vmin, 0)),
      std::min(static_cast<u32>(_mm_extract_epi8(vmin, 1)),
               std::min(static_cast<u32>(_mm_extract_epi8(vmin, 2)), static_cast<u32>(_mm_extract_epi8(vmin, 3))))));
  }

  // NOTE(review): declared to return u16 but the value is truncated to u8 by
  // the cast; return type probably should be u8 like minv_u8() — confirm
  // callers before changing.
  ALWAYS_INLINE u16 maxv_u8() const
  {
    __m128i vmax = _mm_max_epu8(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 3, 2)));
    vmax = _mm_max_epu8(vmax, _mm_shuffle_epi32(vmax, _MM_SHUFFLE(1, 1, 1, 1)));
    return static_cast<u8>(std::max(
      static_cast<u32>(_mm_extract_epi8(vmax, 0)),
      std::max(static_cast<u32>(_mm_extract_epi8(vmax, 1)),
               std::max(static_cast<u32>(_mm_extract_epi8(vmax, 2)), static_cast<u32>(_mm_extract_epi8(vmax, 3))))));
  }

  ALWAYS_INLINE u16 minv_u16() const
  {
    __m128i vmin = _mm_min_epu16(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 3, 2)));
    vmin = _mm_min_epu16(vmin, _mm_shuffle_epi32(vmin, _MM_SHUFFLE(1, 1, 1, 1)));
    return static_cast<u16>(
      std::min(static_cast<u32>(_mm_extract_epi16(vmin, 0)), static_cast<u32>(_mm_extract_epi16(vmin, 1))));
  }

  ALWAYS_INLINE u16 maxv_u16() const
  {
    __m128i vmax = _mm_max_epu16(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 3, 2)));
    vmax = _mm_max_epu16(vmax, _mm_shuffle_epi32(vmax, _MM_SHUFFLE(1, 1, 1, 1)));
    return static_cast<u16>(
      std::max<u32>(static_cast<u32>(_mm_extract_epi16(vmax, 0)), static_cast<u32>(_mm_extract_epi16(vmax, 1))));
  }

  ALWAYS_INLINE s32 minv_s32() const
  {
    const __m128i vmin = _mm_min_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 3, 2)));
    return std::min<s32>(_mm_extract_epi32(vmin, 0), _mm_extract_epi32(vmin, 1));
  }

  ALWAYS_INLINE u32 minv_u32() const
  {
    const __m128i vmin = _mm_min_epu32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 3, 2)));
    return std::min<u32>(_mm_extract_epi32(vmin, 0), _mm_extract_epi32(vmin, 1));
  }

  ALWAYS_INLINE s32 maxv_s32() const
  {
    const __m128i vmax = _mm_max_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 3, 2)));
    return std::max<s32>(_mm_extract_epi32(vmax, 0), _mm_extract_epi32(vmax, 1));
  }

  ALWAYS_INLINE u32 maxv_u32() const
  {
    const __m128i vmax = _mm_max_epu32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 3, 2)));
    return std::max<u32>(_mm_extract_epi32(vmax, 0), _mm_extract_epi32(vmax, 1));
  }
   1044 
  // Clamp s16 lanes to [0, 255]: unsigned-saturating pack to u8, then
  // zero-extend back to 16-bit lanes.
  ALWAYS_INLINE GSVector4i clamp8() const { return pu16().upl8(); }

  // Per-byte variable blend: takes bytes from v where the high bit of the
  // corresponding mask byte is set.
  ALWAYS_INLINE GSVector4i blend8(const GSVector4i& v, const GSVector4i& mask) const
  {
    return GSVector4i(_mm_blendv_epi8(m, v, mask));
  }

  // Compile-time per-word blend; bit i of mask selects word i from v.
  template<s32 mask>
  ALWAYS_INLINE GSVector4i blend16(const GSVector4i& v) const
  {
    return GSVector4i(_mm_blend_epi16(m, v, mask));
  }

  // Compile-time per-dword blend. Without AVX2, each dword-select bit is
  // expanded into the two word-select bits covering that dword
  // ((b & k) * 3 duplicates the bit, << aligns it) and blend16 is used.
  template<s32 mask>
  ALWAYS_INLINE GSVector4i blend32(const GSVector4i& v) const
  {
#if defined(__AVX2__)
    return GSVector4i(_mm_blend_epi32(m, v.m, mask));
#else
    constexpr s32 bit3 = ((mask & 8) * 3) << 3;
    constexpr s32 bit2 = ((mask & 4) * 3) << 2;
    constexpr s32 bit1 = ((mask & 2) * 3) << 1;
    constexpr s32 bit0 = (mask & 1) * 3;
    return blend16<bit3 | bit2 | bit1 | bit0>(v);
#endif
  }
   1071 
   1072   ALWAYS_INLINE GSVector4i blend(const GSVector4i& v, const GSVector4i& mask) const
   1073   {
   1074     return GSVector4i(_mm_or_si128(_mm_andnot_si128(mask, m), _mm_and_si128(mask, v)));
   1075   }
   1076 
  // Interleave: odd 16-bit lanes from v, even lanes from *this (mask 0xaa).
  ALWAYS_INLINE GSVector4i mix16(const GSVector4i& v) const { return blend16<0xaa>(v); }

  // Per-byte shuffle/table lookup (pshufb semantics).
  ALWAYS_INLINE GSVector4i shuffle8(const GSVector4i& mask) const { return GSVector4i(_mm_shuffle_epi8(m, mask)); }
   1080 
  // Saturating packs: ps* pack with signed saturation, pu* with unsigned
  // saturation; the no-argument overloads pack the vector against itself.
  ALWAYS_INLINE GSVector4i ps16(const GSVector4i& v) const { return GSVector4i(_mm_packs_epi16(m, v)); }   // s16 -> s8
  ALWAYS_INLINE GSVector4i ps16() const { return GSVector4i(_mm_packs_epi16(m, m)); }
  ALWAYS_INLINE GSVector4i pu16(const GSVector4i& v) const { return GSVector4i(_mm_packus_epi16(m, v)); }  // s16 -> u8
  ALWAYS_INLINE GSVector4i pu16() const { return GSVector4i(_mm_packus_epi16(m, m)); }
  ALWAYS_INLINE GSVector4i ps32(const GSVector4i& v) const { return GSVector4i(_mm_packs_epi32(m, v)); }   // s32 -> s16
  ALWAYS_INLINE GSVector4i ps32() const { return GSVector4i(_mm_packs_epi32(m, m)); }
  ALWAYS_INLINE GSVector4i pu32(const GSVector4i& v) const { return GSVector4i(_mm_packus_epi32(m, v)); }  // s32 -> u16
  ALWAYS_INLINE GSVector4i pu32() const { return GSVector4i(_mm_packus_epi32(m, m)); }
   1089 
  // Interleave (unpack): upl* interleaves the low halves of *this and v,
  // uph* the high halves.
  ALWAYS_INLINE GSVector4i upl8(const GSVector4i& v) const { return GSVector4i(_mm_unpacklo_epi8(m, v)); }
  ALWAYS_INLINE GSVector4i uph8(const GSVector4i& v) const { return GSVector4i(_mm_unpackhi_epi8(m, v)); }
  ALWAYS_INLINE GSVector4i upl16(const GSVector4i& v) const { return GSVector4i(_mm_unpacklo_epi16(m, v)); }
  ALWAYS_INLINE GSVector4i uph16(const GSVector4i& v) const { return GSVector4i(_mm_unpackhi_epi16(m, v)); }
  ALWAYS_INLINE GSVector4i upl32(const GSVector4i& v) const { return GSVector4i(_mm_unpacklo_epi32(m, v)); }
  ALWAYS_INLINE GSVector4i uph32(const GSVector4i& v) const { return GSVector4i(_mm_unpackhi_epi32(m, v)); }
  ALWAYS_INLINE GSVector4i upl64(const GSVector4i& v) const { return GSVector4i(_mm_unpacklo_epi64(m, v)); }
  ALWAYS_INLINE GSVector4i uph64(const GSVector4i& v) const { return GSVector4i(_mm_unpackhi_epi64(m, v)); }

  // No-argument overloads interleave with zero, i.e. zero-extend each
  // element to the next width.
  ALWAYS_INLINE GSVector4i upl8() const { return GSVector4i(_mm_unpacklo_epi8(m, _mm_setzero_si128())); }
  ALWAYS_INLINE GSVector4i uph8() const { return GSVector4i(_mm_unpackhi_epi8(m, _mm_setzero_si128())); }

  ALWAYS_INLINE GSVector4i upl16() const { return GSVector4i(_mm_unpacklo_epi16(m, _mm_setzero_si128())); }
  ALWAYS_INLINE GSVector4i uph16() const { return GSVector4i(_mm_unpackhi_epi16(m, _mm_setzero_si128())); }

  ALWAYS_INLINE GSVector4i upl32() const { return GSVector4i(_mm_unpacklo_epi32(m, _mm_setzero_si128())); }

  ALWAYS_INLINE GSVector4i uph32() const { return GSVector4i(_mm_unpackhi_epi32(m, _mm_setzero_si128())); }
  ALWAYS_INLINE GSVector4i upl64() const { return GSVector4i(_mm_unpacklo_epi64(m, _mm_setzero_si128())); }
  ALWAYS_INLINE GSVector4i uph64() const { return GSVector4i(_mm_unpackhi_epi64(m, _mm_setzero_si128())); }
   1110 
  // Sign-extend packed s8 lanes from the low bytes of the register.
  // NOTE(review): these are SSE4.1 intrinsics but sit outside the
  // CPU_ARCH_SSE41 guard used for the group below — confirm intended.
  ALWAYS_INLINE GSVector4i s8to16() const { return GSVector4i(_mm_cvtepi8_epi16(m)); }
  ALWAYS_INLINE GSVector4i s8to32() const { return GSVector4i(_mm_cvtepi8_epi32(m)); }
  ALWAYS_INLINE GSVector4i s8to64() const { return GSVector4i(_mm_cvtepi8_epi64(m)); }
   1114 
   1115 #ifdef CPU_ARCH_SSE41
  // Sign/zero extension from the low lanes of the register (SSE4.1).
  ALWAYS_INLINE GSVector4i s16to32() const { return GSVector4i(_mm_cvtepi16_epi32(m)); }
  ALWAYS_INLINE GSVector4i s16to64() const { return GSVector4i(_mm_cvtepi16_epi64(m)); }
  ALWAYS_INLINE GSVector4i s32to64() const { return GSVector4i(_mm_cvtepi32_epi64(m)); }
  ALWAYS_INLINE GSVector4i u8to16() const { return GSVector4i(_mm_cvtepu8_epi16(m)); }
  ALWAYS_INLINE GSVector4i u8to32() const { return GSVector4i(_mm_cvtepu8_epi32(m)); }
   1121   ALWAYS_INLINE GSVector4i u8to64() const { return GSVector4i(_mm_cvtepu16_epi64(m)); }
  // Zero-extend packed u16/u32 lanes from the low lanes of the register.
  ALWAYS_INLINE GSVector4i u16to32() const { return GSVector4i(_mm_cvtepu16_epi32(m)); }
  ALWAYS_INLINE GSVector4i u16to64() const { return GSVector4i(_mm_cvtepu16_epi64(m)); }
  ALWAYS_INLINE GSVector4i u32to64() const { return GSVector4i(_mm_cvtepu32_epi64(m)); }
   1125 #endif
   1126 
  // Whole-register byte shift right (psrldq); i is in bytes.
  template<s32 i>
  ALWAYS_INLINE GSVector4i srl() const
  {
    return GSVector4i(_mm_srli_si128(m, i));
  }

  // Funnel shift: concatenates v:this and extracts 16 bytes starting at
  // byte offset i (palignr).
  template<s32 i>
  ALWAYS_INLINE GSVector4i srl(const GSVector4i& v)
  {
    return GSVector4i(_mm_alignr_epi8(v.m, m, i));
  }

  // Whole-register byte shift left (pslldq); i is in bytes.
  template<s32 i>
  ALWAYS_INLINE GSVector4i sll() const
  {
    return GSVector4i(_mm_slli_si128(m, i));
  }

  // Per-lane 16-bit shifts: immediate, dynamic-count, and (AVX2) per-lane
  // variable-count variants.
  template<s32 i>
  ALWAYS_INLINE GSVector4i sll16() const
  {
    return GSVector4i(_mm_slli_epi16(m, i));
  }

  ALWAYS_INLINE GSVector4i sll16(s32 i) const { return GSVector4i(_mm_sll_epi16(m, _mm_cvtsi32_si128(i))); }

#ifdef CPU_ARCH_AVX2
  ALWAYS_INLINE GSVector4i sllv16(const GSVector4i& v) const { return GSVector4i(_mm_sllv_epi16(m, v.m)); }
#endif

  template<s32 i>
  ALWAYS_INLINE GSVector4i srl16() const
  {
    return GSVector4i(_mm_srli_epi16(m, i));
  }

  ALWAYS_INLINE GSVector4i srl16(s32 i) const { return GSVector4i(_mm_srl_epi16(m, _mm_cvtsi32_si128(i))); }

#ifdef CPU_ARCH_AVX2
  ALWAYS_INLINE GSVector4i srlv16(const GSVector4i& v) const { return GSVector4i(_mm_srlv_epi16(m, v.m)); }
#endif

  template<s32 i>
  ALWAYS_INLINE GSVector4i sra16() const
  {
    return GSVector4i(_mm_srai_epi16(m, i));
  }

  ALWAYS_INLINE GSVector4i sra16(s32 i) const { return GSVector4i(_mm_sra_epi16(m, _mm_cvtsi32_si128(i))); }

#ifdef CPU_ARCH_AVX2
  ALWAYS_INLINE GSVector4i srav16(const GSVector4i& v) const { return GSVector4i(_mm_srav_epi16(m, v.m)); }
#endif
   1180 
  // Per-lane 32-bit shifts: immediate, dynamic-count, and (AVX2) per-lane
  // variable-count variants.
  template<s32 i>
  ALWAYS_INLINE GSVector4i sll32() const
  {
    return GSVector4i(_mm_slli_epi32(m, i));
  }

  ALWAYS_INLINE GSVector4i sll32(s32 i) const { return GSVector4i(_mm_sll_epi32(m, _mm_cvtsi32_si128(i))); }

#ifdef CPU_ARCH_AVX2
  ALWAYS_INLINE GSVector4i sllv32(const GSVector4i& v) const { return GSVector4i(_mm_sllv_epi32(m, v.m)); }
#endif

  template<s32 i>
  ALWAYS_INLINE GSVector4i srl32() const
  {
    return GSVector4i(_mm_srli_epi32(m, i));
  }

  ALWAYS_INLINE GSVector4i srl32(s32 i) const { return GSVector4i(_mm_srl_epi32(m, _mm_cvtsi32_si128(i))); }

#ifdef CPU_ARCH_AVX2
  ALWAYS_INLINE GSVector4i srlv32(const GSVector4i& v) const { return GSVector4i(_mm_srlv_epi32(m, v.m)); }
#endif

  template<s32 i>
  ALWAYS_INLINE GSVector4i sra32() const
  {
    return GSVector4i(_mm_srai_epi32(m, i));
  }

  ALWAYS_INLINE GSVector4i sra32(s32 i) const { return GSVector4i(_mm_sra_epi32(m, _mm_cvtsi32_si128(i))); }

#ifdef CPU_ARCH_AVX2
  ALWAYS_INLINE GSVector4i srav32(const GSVector4i& v) const { return GSVector4i(_mm_srav_epi32(m, v.m)); }
#endif
   1216 
  // Per-lane 64-bit shifts.
  // NOTE(review): per the Intel intrinsics guide, _mm_srai_epi64,
  // _mm_sra_epi64 and _mm_srav_epi64 require AVX-512VL (no SSE/AVX2 64-bit
  // arithmetic right shift exists); these members only work if never
  // instantiated without it — confirm on all supported targets.
  template<s64 i>
  ALWAYS_INLINE GSVector4i sll64() const
  {
    return GSVector4i(_mm_slli_epi64(m, i));
  }

  ALWAYS_INLINE GSVector4i sll64(s32 i) const { return GSVector4i(_mm_sll_epi64(m, _mm_cvtsi32_si128(i))); }

#ifdef CPU_ARCH_AVX2
  ALWAYS_INLINE GSVector4i sllv64(const GSVector4i& v) const { return GSVector4i(_mm_sllv_epi64(m, v.m)); }
#endif

  template<s64 i>
  ALWAYS_INLINE GSVector4i srl64() const
  {
    return GSVector4i(_mm_srli_epi64(m, i));
  }

  ALWAYS_INLINE GSVector4i srl64(s32 i) const { return GSVector4i(_mm_srl_epi64(m, _mm_cvtsi32_si128(i))); }

#ifdef CPU_ARCH_AVX2
  ALWAYS_INLINE GSVector4i srlv64(const GSVector4i& v) const { return GSVector4i(_mm_srlv_epi64(m, v.m)); }
#endif

  template<s64 i>
  ALWAYS_INLINE GSVector4i sra64() const
  {
    return GSVector4i(_mm_srai_epi64(m, i));
  }

  ALWAYS_INLINE GSVector4i sra64(s32 i) const { return GSVector4i(_mm_sra_epi64(m, _mm_cvtsi32_si128(i))); }

#ifdef CPU_ARCH_AVX2
  ALWAYS_INLINE GSVector4i srav64(const GSVector4i& v) const { return GSVector4i(_mm_srav_epi64(m, v.m)); }
#endif
   1252 
  // Per-lane arithmetic: plain variants wrap, adds/subs saturate (signed),
  // addus/subus saturate (unsigned).
  ALWAYS_INLINE GSVector4i add8(const GSVector4i& v) const { return GSVector4i(_mm_add_epi8(m, v.m)); }
  ALWAYS_INLINE GSVector4i add16(const GSVector4i& v) const { return GSVector4i(_mm_add_epi16(m, v.m)); }
  ALWAYS_INLINE GSVector4i add32(const GSVector4i& v) const { return GSVector4i(_mm_add_epi32(m, v.m)); }
  ALWAYS_INLINE GSVector4i adds8(const GSVector4i& v) const { return GSVector4i(_mm_adds_epi8(m, v.m)); }
  ALWAYS_INLINE GSVector4i adds16(const GSVector4i& v) const { return GSVector4i(_mm_adds_epi16(m, v.m)); }
  ALWAYS_INLINE GSVector4i hadds16(const GSVector4i& v) const { return GSVector4i(_mm_hadds_epi16(m, v.m)); }
  ALWAYS_INLINE GSVector4i addus8(const GSVector4i& v) const { return GSVector4i(_mm_adds_epu8(m, v.m)); }
  ALWAYS_INLINE GSVector4i addus16(const GSVector4i& v) const { return GSVector4i(_mm_adds_epu16(m, v.m)); }

  ALWAYS_INLINE GSVector4i sub8(const GSVector4i& v) const { return GSVector4i(_mm_sub_epi8(m, v.m)); }
  ALWAYS_INLINE GSVector4i sub16(const GSVector4i& v) const { return GSVector4i(_mm_sub_epi16(m, v.m)); }
  ALWAYS_INLINE GSVector4i sub32(const GSVector4i& v) const { return GSVector4i(_mm_sub_epi32(m, v.m)); }
  ALWAYS_INLINE GSVector4i subs8(const GSVector4i& v) const { return GSVector4i(_mm_subs_epi8(m, v.m)); }
  ALWAYS_INLINE GSVector4i subs16(const GSVector4i& v) const { return GSVector4i(_mm_subs_epi16(m, v.m)); }
  ALWAYS_INLINE GSVector4i subus8(const GSVector4i& v) const { return GSVector4i(_mm_subs_epu8(m, v.m)); }
  ALWAYS_INLINE GSVector4i subus16(const GSVector4i& v) const { return GSVector4i(_mm_subs_epu16(m, v.m)); }

  // Rounded unsigned averages.
  ALWAYS_INLINE GSVector4i avg8(const GSVector4i& v) const { return GSVector4i(_mm_avg_epu8(m, v.m)); }
  ALWAYS_INLINE GSVector4i avg16(const GSVector4i& v) const { return GSVector4i(_mm_avg_epu16(m, v.m)); }

  // 16-bit multiplies: high half (signed/unsigned), low half, and rounded
  // high half of the doubled product (pmulhrsw). mul32l keeps the low 32
  // bits of each 32-bit product.
  ALWAYS_INLINE GSVector4i mul16hs(const GSVector4i& v) const { return GSVector4i(_mm_mulhi_epi16(m, v.m)); }
  ALWAYS_INLINE GSVector4i mul16hu(const GSVector4i& v) const { return GSVector4i(_mm_mulhi_epu16(m, v.m)); }
  ALWAYS_INLINE GSVector4i mul16l(const GSVector4i& v) const { return GSVector4i(_mm_mullo_epi16(m, v.m)); }
  ALWAYS_INLINE GSVector4i mul16hrs(const GSVector4i& v) const { return GSVector4i(_mm_mulhrs_epi16(m, v.m)); }
  ALWAYS_INLINE GSVector4i mul32l(const GSVector4i& v) const { return GSVector4i(_mm_mullo_epi32(m, v.m)); }
   1278 
  // True when all 128 bits match: xor is zero iff the vectors are identical,
  // and _mm_testz_si128(t, t) sets ZF when t is all-zero (SSE4.1 ptest).
  ALWAYS_INLINE bool eq(const GSVector4i& v) const
  {
    const GSVector4i t = *this ^ v;
    return _mm_testz_si128(t, t) != 0;
  }
   1284 
  // Per-lane comparisons producing all-ones/all-zeros lane masks. ge*/le*
  // and neq* are derived by inverting the opposite comparison (~ is the
  // vector bitwise-not).
  ALWAYS_INLINE GSVector4i eq8(const GSVector4i& v) const { return GSVector4i(_mm_cmpeq_epi8(m, v.m)); }
  ALWAYS_INLINE GSVector4i eq16(const GSVector4i& v) const { return GSVector4i(_mm_cmpeq_epi16(m, v.m)); }
  ALWAYS_INLINE GSVector4i eq32(const GSVector4i& v) const { return GSVector4i(_mm_cmpeq_epi32(m, v.m)); }
  ALWAYS_INLINE GSVector4i eq64(const GSVector4i& v) const { return GSVector4i(_mm_cmpeq_epi64(m, v.m)); }

  ALWAYS_INLINE GSVector4i neq8(const GSVector4i& v) const { return ~eq8(v); }
  ALWAYS_INLINE GSVector4i neq16(const GSVector4i& v) const { return ~eq16(v); }
  ALWAYS_INLINE GSVector4i neq32(const GSVector4i& v) const { return ~eq32(v); }

  ALWAYS_INLINE GSVector4i gt8(const GSVector4i& v) const { return GSVector4i(_mm_cmpgt_epi8(m, v.m)); }
  ALWAYS_INLINE GSVector4i gt16(const GSVector4i& v) const { return GSVector4i(_mm_cmpgt_epi16(m, v.m)); }
  ALWAYS_INLINE GSVector4i gt32(const GSVector4i& v) const { return GSVector4i(_mm_cmpgt_epi32(m, v.m)); }

  ALWAYS_INLINE GSVector4i ge8(const GSVector4i& v) const { return ~GSVector4i(_mm_cmplt_epi8(m, v.m)); }
  ALWAYS_INLINE GSVector4i ge16(const GSVector4i& v) const { return ~GSVector4i(_mm_cmplt_epi16(m, v.m)); }
  ALWAYS_INLINE GSVector4i ge32(const GSVector4i& v) const { return ~GSVector4i(_mm_cmplt_epi32(m, v.m)); }

  ALWAYS_INLINE GSVector4i lt8(const GSVector4i& v) const { return GSVector4i(_mm_cmplt_epi8(m, v.m)); }
  ALWAYS_INLINE GSVector4i lt16(const GSVector4i& v) const { return GSVector4i(_mm_cmplt_epi16(m, v.m)); }
  ALWAYS_INLINE GSVector4i lt32(const GSVector4i& v) const { return GSVector4i(_mm_cmplt_epi32(m, v.m)); }

  ALWAYS_INLINE GSVector4i le8(const GSVector4i& v) const { return ~GSVector4i(_mm_cmpgt_epi8(m, v.m)); }
  ALWAYS_INLINE GSVector4i le16(const GSVector4i& v) const { return ~GSVector4i(_mm_cmpgt_epi16(m, v.m)); }
  ALWAYS_INLINE GSVector4i le32(const GSVector4i& v) const { return ~GSVector4i(_mm_cmpgt_epi32(m, v.m)); }
   1309 
  // Note operand order: returns (*this & ~v), since _mm_andnot_si128
  // negates its first argument.
  ALWAYS_INLINE GSVector4i andnot(const GSVector4i& v) const { return GSVector4i(_mm_andnot_si128(v.m, m)); }

  // One bit per byte, taken from each byte's sign bit (pmovmskb).
  ALWAYS_INLINE s32 mask() const { return _mm_movemask_epi8(m); }

  // All byte sign bits set (e.g. every comparison lane true).
  ALWAYS_INLINE bool alltrue() const { return mask() == 0xffff; }

  // Entire register is zero (ptest).
  ALWAYS_INLINE bool allfalse() const { return _mm_testz_si128(m, m) != 0; }
   1317 
   1318   template<s32 i>
   1319   ALWAYS_INLINE GSVector4i insert8(s32 a) const
   1320   {
   1321     return GSVector4i(_mm_insert_epi8(m, a, i));
   1322   }
   1323 
   1324   template<s32 i>
   1325   ALWAYS_INLINE s32 extract8() const
   1326   {
   1327     return _mm_extract_epi8(m, i);
   1328   }
   1329 
   1330   template<s32 i>
   1331   ALWAYS_INLINE GSVector4i insert16(s32 a) const
   1332   {
   1333     return GSVector4i(_mm_insert_epi16(m, a, i));
   1334   }
   1335 
   1336   template<s32 i>
   1337   ALWAYS_INLINE s32 extract16() const
   1338   {
   1339     return _mm_extract_epi16(m, i);
   1340   }
   1341 
   1342   template<s32 i>
   1343   ALWAYS_INLINE GSVector4i insert32(s32 a) const
   1344   {
   1345     return GSVector4i(_mm_insert_epi32(m, a, i));
   1346   }
   1347 
   1348   template<s32 i>
   1349   ALWAYS_INLINE s32 extract32() const
   1350   {
   1351     if constexpr (i == 0)
   1352       return GSVector4i::store(*this);
   1353 
   1354     return _mm_extract_epi32(m, i);
   1355   }
   1356 
   1357   template<s32 i>
   1358   ALWAYS_INLINE GSVector4i insert64(s64 a) const
   1359   {
   1360     return GSVector4i(_mm_insert_epi64(m, a, i));
   1361   }
   1362 
   1363   template<s32 i>
   1364   ALWAYS_INLINE s64 extract64() const
   1365   {
   1366     if (i == 0)
   1367       return GSVector4i::storeq(*this);
   1368 
   1369     return _mm_extract_epi64(m, i);
   1370   }
   1371 
  // Non-temporal (streaming) load; bypasses the cache, p must be aligned.
  ALWAYS_INLINE static GSVector4i loadnt(const void* p)
  {
    return GSVector4i(_mm_stream_load_si128(static_cast<const __m128i*>(p)));
  }

  // Loads 4 bytes into lane 0, upper lanes zeroed.
  ALWAYS_INLINE static GSVector4i load32(const void* p) { return GSVector4i(_mm_loadu_si32(p)); }

  // Loads 8 bytes into the low half, upper half zeroed.
  ALWAYS_INLINE static GSVector4i loadl(const void* p)
  {
    return GSVector4i(_mm_loadl_epi64(static_cast<const __m128i*>(p)));
  }

  // Loads 8 bytes into the high half, low half zeroed.
  ALWAYS_INLINE static GSVector4i loadh(const void* p)
  {
    return GSVector4i(_mm_castps_si128(_mm_loadh_pi(_mm_setzero_ps(), static_cast<const __m64*>(p))));
  }

  // Places a GSVector2i into the high half, low half zeroed.
  ALWAYS_INLINE static GSVector4i loadh(const GSVector2i& v)
  {
    return GSVector4i(_mm_unpacklo_epi64(_mm_setzero_si128(), v.m));
  }

  // Full 16-byte load; caller promises alignment via the template argument.
  template<bool aligned>
  ALWAYS_INLINE static GSVector4i load(const void* p)
  {
    return GSVector4i(aligned ? _mm_load_si128(static_cast<const __m128i*>(p)) :
                                _mm_loadu_si128(static_cast<const __m128i*>(p)));
  }

  // Scalar-to-vector moves: value in lane 0, upper lanes zeroed.
  ALWAYS_INLINE static GSVector4i load(s32 i) { return GSVector4i(_mm_cvtsi32_si128(i)); }
  ALWAYS_INLINE static GSVector4i loadq(s64 i) { return GSVector4i(_mm_cvtsi64_si128(i)); }
   1403 
  // Non-temporal 128-bit store; p must be 16-byte aligned.
  ALWAYS_INLINE static void storent(void* p, const GSVector4i& v) { _mm_stream_si128(static_cast<__m128i*>(p), v.m); }
  // Stores only the low 64 bits of v.
  ALWAYS_INLINE static void storel(void* p, const GSVector4i& v) { _mm_storel_epi64(static_cast<__m128i*>(p), v.m); }
  // Stores only the high 64 bits of v.
  ALWAYS_INLINE static void storeh(void* p, const GSVector4i& v)
  {
    _mm_storeh_pi(static_cast<__m64*>(p), _mm_castsi128_ps(v.m));
  }

  // Splits v across two destinations: low half to pl, high half to ph.
  ALWAYS_INLINE static void store(void* pl, void* ph, const GSVector4i& v)
  {
    GSVector4i::storel(pl, v);
    GSVector4i::storeh(ph, v);
  }

  // Full 128-bit store; pass aligned=true only when p is 16-byte aligned.
  template<bool aligned>
  ALWAYS_INLINE static void store(void* p, const GSVector4i& v)
  {
    if constexpr (aligned)
      _mm_store_si128(static_cast<__m128i*>(p), v.m);
    else
      _mm_storeu_si32(p, v);
  }

  // Stores lane x (32 bits), or returns the low 32/64 bits directly.
  ALWAYS_INLINE static void store32(void* p, const GSVector4i& v) { _mm_storeu_si32(p, v); }
  ALWAYS_INLINE static s32 store(const GSVector4i& v) { return _mm_cvtsi128_si32(v.m); }
  ALWAYS_INLINE static s64 storeq(const GSVector4i& v) { return _mm_cvtsi128_si64(v.m); }
   1429 
   1430   ALWAYS_INLINE GSVector4i& operator&=(const GSVector4i& v)
   1431   {
   1432     m = _mm_and_si128(m, v);
   1433     return *this;
   1434   }
   1435   ALWAYS_INLINE GSVector4i& operator|=(const GSVector4i& v)
   1436   {
   1437     m = _mm_or_si128(m, v);
   1438     return *this;
   1439   }
   1440   ALWAYS_INLINE GSVector4i& operator^=(const GSVector4i& v)
   1441   {
   1442     m = _mm_xor_si128(m, v);
   1443     return *this;
   1444   }
   1445 
   1446   ALWAYS_INLINE friend GSVector4i operator&(const GSVector4i& v1, const GSVector4i& v2)
   1447   {
   1448     return GSVector4i(_mm_and_si128(v1, v2));
   1449   }
   1450 
   1451   ALWAYS_INLINE friend GSVector4i operator|(const GSVector4i& v1, const GSVector4i& v2)
   1452   {
   1453     return GSVector4i(_mm_or_si128(v1, v2));
   1454   }
   1455 
   1456   ALWAYS_INLINE friend GSVector4i operator^(const GSVector4i& v1, const GSVector4i& v2)
   1457   {
   1458     return GSVector4i(_mm_xor_si128(v1, v2));
   1459   }
   1460 
   1461   ALWAYS_INLINE friend GSVector4i operator&(const GSVector4i& v, s32 i) { return v & GSVector4i(i); }
   1462   ALWAYS_INLINE friend GSVector4i operator|(const GSVector4i& v, s32 i) { return v | GSVector4i(i); }
   1463   ALWAYS_INLINE friend GSVector4i operator^(const GSVector4i& v, s32 i) { return v ^ GSVector4i(i); }
   1464   ALWAYS_INLINE friend GSVector4i operator~(const GSVector4i& v) { return v ^ v.eq32(v); }
   1465 
  // All-zero vector.
  ALWAYS_INLINE static GSVector4i zero() { return GSVector4i(_mm_setzero_si128()); }
  // Bit-for-bit reinterpretation of a float vector (no value conversion);
  // defined out-of-line after GSVector4.
  ALWAYS_INLINE static GSVector4i cast(const GSVector4& v);

  // Result is (x, y, v.x, v.y).
  ALWAYS_INLINE GSVector4i xyxy(const GSVector4i& v) const { return upl64(v); }

  // Low two lanes as a GSVector2i.
  ALWAYS_INLINE GSVector2i xy() const { return GSVector2i(m); }

  // High two lanes moved down into a GSVector2i.
  ALWAYS_INLINE GSVector2i zw() const { return GSVector2i(_mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 3, 2))); }
   1474 
  // clang-format off

  // Generates every 4-component swizzle accessor for the vector:
  //   xyzw()   - 32-bit lane shuffle (PSHUFD)
  //   xyzwl()  - 16-bit shuffle of the low four words (PSHUFLW)
  //   xyzwh()  - 16-bit shuffle of the high four words (PSHUFHW)
  //   xyzwlh() - 16-bit shuffle applied to both halves
  // (No comments on the continuation lines below: a // would swallow the
  // trailing backslash and truncate the macro.)
#define VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
    ALWAYS_INLINE GSVector4i xs##ys##zs##ws() const {return GSVector4i(_mm_shuffle_epi32(m, _MM_SHUFFLE(wn, zn, yn, xn)));} \
    ALWAYS_INLINE GSVector4i xs##ys##zs##ws##l() const {return GSVector4i(_mm_shufflelo_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn)));} \
    ALWAYS_INLINE GSVector4i xs##ys##zs##ws##h() const {return GSVector4i(_mm_shufflehi_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn)));} \
    ALWAYS_INLINE GSVector4i xs##ys##zs##ws##lh() const {return GSVector4i(_mm_shufflehi_epi16(_mm_shufflelo_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn)), _MM_SHUFFLE(wn, zn, yn, xn)));} \

#define VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \
    VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \
    VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \
    VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2) \
    VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3) \

#define VECTOR4i_SHUFFLE_2(xs, xn, ys, yn) \
    VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, x, 0) \
    VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, y, 1) \
    VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, z, 2) \
    VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, w, 3) \

#define VECTOR4i_SHUFFLE_1(xs, xn) \
    VECTOR4i_SHUFFLE_2(xs, xn, x, 0) \
    VECTOR4i_SHUFFLE_2(xs, xn, y, 1) \
    VECTOR4i_SHUFFLE_2(xs, xn, z, 2) \
    VECTOR4i_SHUFFLE_2(xs, xn, w, 3) \

  // Instantiates all 256 xyzw combinations (x4 variants each).
  VECTOR4i_SHUFFLE_1(x, 0)
    VECTOR4i_SHUFFLE_1(y, 1)
    VECTOR4i_SHUFFLE_1(z, 2)
    VECTOR4i_SHUFFLE_1(w, 3)

  // clang-format on
};
   1508 
   1509 class alignas(16) GSVector4
   1510 {
   1511   struct cxpr_init_tag
   1512   {
   1513   };
   1514   static constexpr cxpr_init_tag cxpr_init{};
   1515 
   1516   constexpr GSVector4(cxpr_init_tag, float x, float y, float z, float w) : F32{x, y, z, w} {}
   1517 
   1518   constexpr GSVector4(cxpr_init_tag, int x, int y, int z, int w) : I32{x, y, z, w} {}
   1519 
   1520   constexpr GSVector4(cxpr_init_tag, u64 x, u64 y) : U64{x, y} {}
   1521 
   1522   constexpr GSVector4(cxpr_init_tag, double x, double y) : F64{x, y} {}
   1523 
   1524 public:
   1525   union
   1526   {
   1527     struct
   1528     {
   1529       float x, y, z, w;
   1530     };
   1531     struct
   1532     {
   1533       float r, g, b, a;
   1534     };
   1535     struct
   1536     {
   1537       float left, top, right, bottom;
   1538     };
   1539     float F32[4];
   1540     double F64[2];
   1541     s8 I8[16];
   1542     s16 I16[8];
   1543     s32 I32[4];
   1544     s64 I64[2];
   1545     u8 U8[16];
   1546     u16 U16[8];
   1547     u32 U32[4];
   1548     u64 U64[2];
   1549     __m128 m;
   1550   };
   1551 
   1552   GSVector4() = default;
   1553 
   1554   constexpr static GSVector4 cxpr(float x, float y, float z, float w) { return GSVector4(cxpr_init, x, y, z, w); }
   1555   constexpr static GSVector4 cxpr(float x) { return GSVector4(cxpr_init, x, x, x, x); }
   1556   constexpr static GSVector4 cxpr(int x, int y, int z, int w) { return GSVector4(cxpr_init, x, y, z, w); }
   1557   constexpr static GSVector4 cxpr(int x) { return GSVector4(cxpr_init, x, x, x, x); }
   1558 
   1559   constexpr static GSVector4 cxpr64(u64 x, u64 y) { return GSVector4(cxpr_init, x, y); }
   1560   constexpr static GSVector4 cxpr64(u64 x) { return GSVector4(cxpr_init, x, x); }
   1561 
   1562   constexpr static GSVector4 cxpr64(double x, double y) { return GSVector4(cxpr_init, x, y); }
   1563   constexpr static GSVector4 cxpr64(double x) { return GSVector4(cxpr_init, x, x); }
   1564 
   1565   ALWAYS_INLINE GSVector4(float x, float y, float z, float w) { m = _mm_set_ps(w, z, y, x); }
   1566   ALWAYS_INLINE GSVector4(float x, float y) { m = _mm_unpacklo_ps(_mm_load_ss(&x), _mm_load_ss(&y)); }
   1567   ALWAYS_INLINE GSVector4(int x, int y, int z, int w)
   1568   {
   1569     GSVector4i v_(x, y, z, w);
   1570     m = _mm_cvtepi32_ps(v_.m);
   1571   }
   1572   ALWAYS_INLINE GSVector4(int x, int y)
   1573   {
   1574     m = _mm_cvtepi32_ps(_mm_unpacklo_epi32(_mm_cvtsi32_si128(x), _mm_cvtsi32_si128(y)));
   1575   }
   1576 
   1577   ALWAYS_INLINE explicit GSVector4(const GSVector2& v) : m(v.m) {}
   1578   ALWAYS_INLINE explicit GSVector4(const GSVector2i& v)
   1579     : m(_mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(_mm_cvtepi32_ps(v.m)), _mm_setzero_pd())))
   1580   {
   1581   }
   1582 
   1583   ALWAYS_INLINE constexpr explicit GSVector4(__m128 m) : m(m) {}
   1584 
   1585   ALWAYS_INLINE explicit GSVector4(__m128d m) : m(_mm_castpd_ps(m)) {}
   1586 
   1587   ALWAYS_INLINE explicit GSVector4(float f) { *this = f; }
   1588 
   1589   ALWAYS_INLINE explicit GSVector4(int i)
   1590   {
   1591 #ifdef CPU_ARCH_AVX2
   1592     m = _mm_cvtepi32_ps(_mm_broadcastd_epi32(_mm_cvtsi32_si128(i)));
   1593 #else
   1594     *this = GSVector4(GSVector4i(i));
   1595 #endif
   1596   }
   1597 
   1598   ALWAYS_INLINE explicit GSVector4(const GSVector4i& v);
   1599 
   1600   ALWAYS_INLINE static GSVector4 f64(double x, double y) { return GSVector4(_mm_castpd_ps(_mm_set_pd(y, x))); }
   1601   ALWAYS_INLINE static GSVector4 f64(double x) { return GSVector4(_mm_castpd_ps(_mm_set1_pd(x))); }
   1602 
   1603   ALWAYS_INLINE GSVector4& operator=(float f)
   1604   {
   1605     m = _mm_set1_ps(f);
   1606     return *this;
   1607   }
   1608 
   1609   ALWAYS_INLINE GSVector4& operator=(__m128 m_)
   1610   {
   1611     this->m = m_;
   1612     return *this;
   1613   }
   1614 
   1615   ALWAYS_INLINE operator __m128() const { return m; }
   1616 
   1617   u32 rgba32() const { return GSVector4i(*this).rgba32(); }
   1618 
   1619   ALWAYS_INLINE static GSVector4 rgba32(u32 rgba) { return GSVector4(GSVector4i::load((int)rgba).u8to32()); }
   1620 
   1621   ALWAYS_INLINE static GSVector4 unorm8(u32 rgba) { return rgba32(rgba) * GSVector4::cxpr(1.0f / 255.0f); }
   1622 
   1623   ALWAYS_INLINE GSVector4 abs() const { return *this & cast(GSVector4i::cxpr(0x7fffffff)); }
   1624 
   1625   ALWAYS_INLINE GSVector4 neg() const { return *this ^ cast(GSVector4i::cxpr(0x80000000)); }
   1626 
   1627   ALWAYS_INLINE GSVector4 rcp() const { return GSVector4(_mm_rcp_ps(m)); }
   1628 
   1629   ALWAYS_INLINE GSVector4 rcpnr() const
   1630   {
   1631     GSVector4 v_ = rcp();
   1632 
   1633     return (v_ + v_) - (v_ * v_) * *this;
   1634   }
   1635 
   1636   ALWAYS_INLINE GSVector4 floor() const
   1637   {
   1638     return GSVector4(_mm_round_ps(m, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC));
   1639   }
   1640 
   1641   ALWAYS_INLINE GSVector4 ceil() const { return GSVector4(_mm_round_ps(m, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)); }
   1642 
   1643   ALWAYS_INLINE GSVector4 madd(const GSVector4& a_, const GSVector4& b_) const
   1644   {
   1645 #ifdef CPU_ARCH_AVX2
   1646     return GSVector4(_mm_fmadd_ps(m, a_, b_));
   1647 #else
   1648     return *this * a_ + b_;
   1649 #endif
   1650   }
   1651 
   1652   ALWAYS_INLINE GSVector4 msub(const GSVector4& a_, const GSVector4& b_) const
   1653   {
   1654 #ifdef CPU_ARCH_AVX2
   1655     return GSVector4(_mm_fmsub_ps(m, a_, b_));
   1656 #else
   1657     return *this * a_ - b_;
   1658 #endif
   1659   }
   1660 
   1661   ALWAYS_INLINE GSVector4 nmadd(const GSVector4& a_, const GSVector4& b_) const
   1662   {
   1663 #ifdef CPU_ARCH_AVX2
   1664     return GSVector4(_mm_fnmadd_ps(m, a_, b_));
   1665 #else
   1666     return b_ - *this * a_;
   1667 #endif
   1668   }
   1669 
   1670   ALWAYS_INLINE GSVector4 nmsub(const GSVector4& a_, const GSVector4& b_) const
   1671   {
   1672 #ifdef CPU_ARCH_AVX2
   1673     return GSVector4(_mm_fnmsub_ps(m, a_, b_));
   1674 #else
   1675     return -b_ - *this * a_;
   1676 #endif
   1677   }
   1678 
   1679   ALWAYS_INLINE GSVector4 addm(const GSVector4& a_, const GSVector4& b_) const
   1680   {
   1681     return a_.madd(b_, *this); // *this + a * b
   1682   }
   1683 
   1684   ALWAYS_INLINE GSVector4 subm(const GSVector4& a_, const GSVector4& b_) const
   1685   {
   1686     return a_.nmadd(b_, *this); // *this - a * b
   1687   }
   1688 
   1689   ALWAYS_INLINE GSVector4 hadd() const { return GSVector4(_mm_hadd_ps(m, m)); }
   1690 
   1691   ALWAYS_INLINE GSVector4 hadd(const GSVector4& v) const { return GSVector4(_mm_hadd_ps(m, v.m)); }
   1692 
   1693   ALWAYS_INLINE GSVector4 hsub() const { return GSVector4(_mm_hsub_ps(m, m)); }
   1694 
   1695   ALWAYS_INLINE GSVector4 hsub(const GSVector4& v) const { return GSVector4(_mm_hsub_ps(m, v.m)); }
   1696 
   1697   template<int i>
   1698   ALWAYS_INLINE GSVector4 dp(const GSVector4& v) const
   1699   {
   1700     return GSVector4(_mm_dp_ps(m, v.m, i));
   1701   }
   1702 
   1703   ALWAYS_INLINE GSVector4 sat(const GSVector4& min, const GSVector4& max) const
   1704   {
   1705     return GSVector4(_mm_min_ps(_mm_max_ps(m, min), max));
   1706   }
   1707 
   1708   ALWAYS_INLINE GSVector4 sat(const GSVector4& v) const
   1709   {
   1710     return GSVector4(_mm_min_ps(_mm_max_ps(m, v.xyxy()), v.zwzw()));
   1711   }
   1712 
   1713   ALWAYS_INLINE GSVector4 sat(const float scale = 255) const { return sat(zero(), GSVector4(scale)); }
   1714 
   1715   ALWAYS_INLINE GSVector4 clamp(const float scale = 255) const { return min(GSVector4(scale)); }
   1716 
   1717   ALWAYS_INLINE GSVector4 min(const GSVector4& v) const { return GSVector4(_mm_min_ps(m, v)); }
   1718 
   1719   ALWAYS_INLINE GSVector4 max(const GSVector4& v) const { return GSVector4(_mm_max_ps(m, v)); }
   1720 
   1721   template<int mask>
   1722   ALWAYS_INLINE GSVector4 blend32(const GSVector4& v) const
   1723   {
   1724     return GSVector4(_mm_blend_ps(m, v, mask));
   1725   }
   1726 
   1727   ALWAYS_INLINE GSVector4 blend32(const GSVector4& v, const GSVector4& mask) const
   1728   {
   1729     return GSVector4(_mm_blendv_ps(m, v, mask));
   1730   }
   1731 
   1732   ALWAYS_INLINE GSVector4 upl(const GSVector4& v) const { return GSVector4(_mm_unpacklo_ps(m, v)); }
   1733 
   1734   ALWAYS_INLINE GSVector4 uph(const GSVector4& v) const { return GSVector4(_mm_unpackhi_ps(m, v)); }
   1735 
   1736   ALWAYS_INLINE GSVector4 upld(const GSVector4& v) const
   1737   {
   1738     return GSVector4(_mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(m), _mm_castps_pd(v.m))));
   1739   }
   1740 
   1741   ALWAYS_INLINE GSVector4 uphd(const GSVector4& v) const
   1742   {
   1743     return GSVector4(_mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(m), _mm_castps_pd(v.m))));
   1744   }
   1745 
   1746   ALWAYS_INLINE GSVector4 l2h(const GSVector4& v) const { return GSVector4(_mm_movelh_ps(m, v)); }
   1747 
   1748   ALWAYS_INLINE GSVector4 h2l(const GSVector4& v) const { return GSVector4(_mm_movehl_ps(m, v)); }
   1749 
   1750   ALWAYS_INLINE GSVector4 andnot(const GSVector4& v) const { return GSVector4(_mm_andnot_ps(v.m, m)); }
   1751 
   1752   ALWAYS_INLINE int mask() const { return _mm_movemask_ps(m); }
   1753 
   1754   ALWAYS_INLINE bool alltrue() const { return mask() == 0xf; }
   1755 
   1756   ALWAYS_INLINE bool allfalse() const
   1757   {
   1758 #ifdef CPU_ARCH_AVX2
   1759     return _mm_testz_ps(m, m) != 0;
   1760 #else
   1761     const __m128i ii = _mm_castps_si128(m);
   1762     return _mm_testz_si128(ii, ii) != 0;
   1763 #endif
   1764   }
   1765 
   1766   ALWAYS_INLINE GSVector4 replace_nan(const GSVector4& v) const { return v.blend32(*this, *this == *this); }
   1767 
   1768   template<int src, int dst>
   1769   ALWAYS_INLINE GSVector4 insert32(const GSVector4& v) const
   1770   {
   1771     if constexpr (src == dst)
   1772       return GSVector4(_mm_blend_ps(m, v.m, 1 << src));
   1773     else
   1774       return GSVector4(_mm_insert_ps(m, v.m, _MM_MK_INSERTPS_NDX(src, dst, 0)));
   1775   }
   1776 
   1777   template<int i>
   1778   ALWAYS_INLINE int extract32() const
   1779   {
   1780     return _mm_extract_ps(m, i);
   1781   }
   1782 
   1783   template<int dst>
   1784   ALWAYS_INLINE GSVector4 insert64(double v) const
   1785   {
   1786     if constexpr (dst == 0)
   1787       return GSVector4(_mm_move_sd(_mm_castps_pd(m), _mm_load_pd(&v)));
   1788     else
   1789       return GSVector4(_mm_shuffle_pd(_mm_castps_pd(m), _mm_load_pd(&v), 0));
   1790   }
   1791 
   1792   template<int src>
   1793   ALWAYS_INLINE double extract64() const
   1794   {
   1795     double ret;
   1796     if constexpr (src == 0)
   1797       _mm_storel_pd(&ret, _mm_castps_pd(m));
   1798     else
   1799       _mm_storeh_pd(&ret, _mm_castps_pd(m));
   1800     return ret;
   1801   }
   1802 
   1803   ALWAYS_INLINE static GSVector4 zero() { return GSVector4(_mm_setzero_ps()); }
   1804   ALWAYS_INLINE static GSVector4 cast(const GSVector4i& v);
   1805 
   1806   ALWAYS_INLINE static GSVector4 xffffffff() { return zero() == zero(); }
   1807 
   1808   ALWAYS_INLINE static GSVector4 loadl(const void* p)
   1809   {
   1810     return GSVector4(_mm_castpd_ps(_mm_load_sd(static_cast<const double*>(p))));
   1811   }
   1812 
   1813   ALWAYS_INLINE static GSVector4 load(float f) { return GSVector4(_mm_load_ss(&f)); }
   1814 
   1815   template<bool aligned>
   1816   ALWAYS_INLINE static GSVector4 load(const void* p)
   1817   {
   1818     return GSVector4(aligned ? _mm_load_ps(static_cast<const float*>(p)) : _mm_loadu_ps(static_cast<const float*>(p)));
   1819   }
   1820 
   1821   ALWAYS_INLINE static void storent(void* p, const GSVector4& v) { _mm_stream_ps(static_cast<float*>(p), v.m); }
   1822   ALWAYS_INLINE static void storel(void* p, const GSVector4& v)
   1823   {
   1824     _mm_store_sd(static_cast<double*>(p), _mm_castps_pd(v.m));
   1825   }
   1826   ALWAYS_INLINE static void storeh(void* p, const GSVector4& v)
   1827   {
   1828     _mm_storeh_pd(static_cast<double*>(p), _mm_castps_pd(v.m));
   1829   }
   1830 
   1831   template<bool aligned>
   1832   ALWAYS_INLINE static void store(void* p, const GSVector4& v)
   1833   {
   1834     if constexpr (aligned)
   1835       _mm_store_ps(static_cast<float*>(p), v.m);
   1836     else
   1837       _mm_storeu_ps(static_cast<float*>(p), v.m);
   1838   }
   1839 
   1840   ALWAYS_INLINE static void store(float* p, const GSVector4& v) { _mm_store_ss(p, v.m); }
   1841 
   1842   ALWAYS_INLINE GSVector4 operator-() const { return neg(); }
   1843 
   1844   ALWAYS_INLINE GSVector4& operator+=(const GSVector4& v_)
   1845   {
   1846     m = _mm_add_ps(m, v_);
   1847     return *this;
   1848   }
   1849 
   1850   ALWAYS_INLINE GSVector4& operator-=(const GSVector4& v_)
   1851   {
   1852     m = _mm_sub_ps(m, v_);
   1853     return *this;
   1854   }
   1855 
   1856   ALWAYS_INLINE GSVector4& operator*=(const GSVector4& v_)
   1857   {
   1858     m = _mm_mul_ps(m, v_);
   1859     return *this;
   1860   }
   1861 
   1862   ALWAYS_INLINE GSVector4& operator/=(const GSVector4& v_)
   1863   {
   1864     m = _mm_div_ps(m, v_);
   1865     return *this;
   1866   }
   1867 
   1868   ALWAYS_INLINE GSVector4& operator+=(float f)
   1869   {
   1870     *this += GSVector4(f);
   1871     return *this;
   1872   }
   1873 
   1874   ALWAYS_INLINE GSVector4& operator-=(float f)
   1875   {
   1876     *this -= GSVector4(f);
   1877     return *this;
   1878   }
   1879 
   1880   ALWAYS_INLINE GSVector4& operator*=(float f)
   1881   {
   1882     *this *= GSVector4(f);
   1883     return *this;
   1884   }
   1885 
   1886   ALWAYS_INLINE GSVector4& operator/=(float f)
   1887   {
   1888     *this /= GSVector4(f);
   1889     return *this;
   1890   }
   1891 
   1892   ALWAYS_INLINE GSVector4& operator&=(const GSVector4& v_)
   1893   {
   1894     m = _mm_and_ps(m, v_);
   1895     return *this;
   1896   }
   1897 
   1898   ALWAYS_INLINE GSVector4& operator|=(const GSVector4& v_)
   1899   {
   1900     m = _mm_or_ps(m, v_);
   1901     return *this;
   1902   }
   1903 
   1904   ALWAYS_INLINE GSVector4& operator^=(const GSVector4& v_)
   1905   {
   1906     m = _mm_xor_ps(m, v_);
   1907     return *this;
   1908   }
   1909 
   1910   ALWAYS_INLINE friend GSVector4 operator+(const GSVector4& v1, const GSVector4& v2)
   1911   {
   1912     return GSVector4(_mm_add_ps(v1, v2));
   1913   }
   1914 
   1915   ALWAYS_INLINE friend GSVector4 operator-(const GSVector4& v1, const GSVector4& v2)
   1916   {
   1917     return GSVector4(_mm_sub_ps(v1, v2));
   1918   }
   1919 
   1920   ALWAYS_INLINE friend GSVector4 operator*(const GSVector4& v1, const GSVector4& v2)
   1921   {
   1922     return GSVector4(_mm_mul_ps(v1, v2));
   1923   }
   1924 
   1925   ALWAYS_INLINE friend GSVector4 operator/(const GSVector4& v1, const GSVector4& v2)
   1926   {
   1927     return GSVector4(_mm_div_ps(v1, v2));
   1928   }
   1929 
   1930   ALWAYS_INLINE friend GSVector4 operator+(const GSVector4& v, float f) { return v + GSVector4(f); }
   1931 
   1932   ALWAYS_INLINE friend GSVector4 operator-(const GSVector4& v, float f) { return v - GSVector4(f); }
   1933 
   1934   ALWAYS_INLINE friend GSVector4 operator*(const GSVector4& v, float f) { return v * GSVector4(f); }
   1935 
   1936   ALWAYS_INLINE friend GSVector4 operator/(const GSVector4& v, float f) { return v / GSVector4(f); }
   1937 
   1938   ALWAYS_INLINE friend GSVector4 operator&(const GSVector4& v1, const GSVector4& v2)
   1939   {
   1940     return GSVector4(_mm_and_ps(v1, v2));
   1941   }
   1942 
   1943   ALWAYS_INLINE friend GSVector4 operator|(const GSVector4& v1, const GSVector4& v2)
   1944   {
   1945     return GSVector4(_mm_or_ps(v1, v2));
   1946   }
   1947 
   1948   ALWAYS_INLINE friend GSVector4 operator^(const GSVector4& v1, const GSVector4& v2)
   1949   {
   1950     return GSVector4(_mm_xor_ps(v1, v2));
   1951   }
   1952 
   1953   ALWAYS_INLINE friend GSVector4 operator==(const GSVector4& v1, const GSVector4& v2)
   1954   {
   1955     return GSVector4(_mm_cmpeq_ps(v1, v2));
   1956   }
   1957 
   1958   ALWAYS_INLINE friend GSVector4 operator!=(const GSVector4& v1, const GSVector4& v2)
   1959   {
   1960     return GSVector4(_mm_cmpneq_ps(v1, v2));
   1961   }
   1962 
   1963   ALWAYS_INLINE friend GSVector4 operator>(const GSVector4& v1, const GSVector4& v2)
   1964   {
   1965     return GSVector4(_mm_cmpgt_ps(v1, v2));
   1966   }
   1967 
   1968   ALWAYS_INLINE friend GSVector4 operator<(const GSVector4& v1, const GSVector4& v2)
   1969   {
   1970     return GSVector4(_mm_cmplt_ps(v1, v2));
   1971   }
   1972 
   1973   ALWAYS_INLINE friend GSVector4 operator>=(const GSVector4& v1, const GSVector4& v2)
   1974   {
   1975     return GSVector4(_mm_cmpge_ps(v1, v2));
   1976   }
   1977 
   1978   ALWAYS_INLINE friend GSVector4 operator<=(const GSVector4& v1, const GSVector4& v2)
   1979   {
   1980     return GSVector4(_mm_cmple_ps(v1, v2));
   1981   }
   1982 
   1983   ALWAYS_INLINE GSVector4 mul64(const GSVector4& v_) const
   1984   {
   1985     return GSVector4(_mm_mul_pd(_mm_castps_pd(m), _mm_castps_pd(v_.m)));
   1986   }
   1987 
   1988   ALWAYS_INLINE GSVector4 add64(const GSVector4& v_) const
   1989   {
   1990     return GSVector4(_mm_add_pd(_mm_castps_pd(m), _mm_castps_pd(v_.m)));
   1991   }
   1992 
   1993   ALWAYS_INLINE GSVector4 sub64(const GSVector4& v_) const
   1994   {
   1995     return GSVector4(_mm_sub_pd(_mm_castps_pd(m), _mm_castps_pd(v_.m)));
   1996   }
   1997 
   1998   ALWAYS_INLINE GSVector4 div64(const GSVector4& v_) const
   1999   {
   2000     return GSVector4(_mm_div_pd(_mm_castps_pd(m), _mm_castps_pd(v_.m)));
   2001   }
   2002 
   2003   ALWAYS_INLINE GSVector4 gt64(const GSVector4& v2) const
   2004   {
   2005     return GSVector4(_mm_cmpgt_pd(_mm_castps_pd(m), _mm_castps_pd(v2.m)));
   2006   }
   2007 
   2008   ALWAYS_INLINE GSVector4 eq64(const GSVector4& v2) const
   2009   {
   2010     return GSVector4(_mm_cmpeq_pd(_mm_castps_pd(m), _mm_castps_pd(v2.m)));
   2011   }
   2012 
   2013   ALWAYS_INLINE GSVector4 lt64(const GSVector4& v2) const
   2014   {
   2015     return GSVector4(_mm_cmplt_pd(_mm_castps_pd(m), _mm_castps_pd(v2.m)));
   2016   }
   2017 
   2018   ALWAYS_INLINE GSVector4 ge64(const GSVector4& v2) const
   2019   {
   2020     return GSVector4(_mm_cmpge_pd(_mm_castps_pd(m), _mm_castps_pd(v2.m)));
   2021   }
   2022 
   2023   ALWAYS_INLINE GSVector4 le64(const GSVector4& v2) const
   2024   {
   2025     return GSVector4(_mm_cmple_pd(_mm_castps_pd(m), _mm_castps_pd(v2.m)));
   2026   }
   2027 
   2028   ALWAYS_INLINE GSVector4 min64(const GSVector4& v2) const
   2029   {
   2030     return GSVector4(_mm_min_pd(_mm_castps_pd(m), _mm_castps_pd(v2.m)));
   2031   }
   2032 
   2033   ALWAYS_INLINE GSVector4 max64(const GSVector4& v2) const
   2034   {
   2035     return GSVector4(_mm_max_pd(_mm_castps_pd(m), _mm_castps_pd(v2.m)));
   2036   }
   2037 
   2038   ALWAYS_INLINE GSVector4 abs64() const { return *this & GSVector4::cxpr64(static_cast<u64>(0x7FFFFFFFFFFFFFFFULL)); }
   2039 
   2040   ALWAYS_INLINE GSVector4 neg64() const { return *this ^ GSVector4::cxpr64(static_cast<u64>(0x8000000000000000ULL)); }
   2041 
   2042   ALWAYS_INLINE GSVector4 sqrt64() const { return GSVector4(_mm_sqrt_pd(_mm_castps_pd(m))); }
   2043 
   2044   ALWAYS_INLINE GSVector4 sqr64() const { return GSVector4(_mm_mul_pd(_mm_castps_pd(m), _mm_castps_pd(m))); }
   2045 
   2046   ALWAYS_INLINE GSVector4 floor64() const
   2047   {
   2048     return GSVector4(_mm_round_pd(_mm_castps_pd(m), _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC));
   2049   }
   2050 
   2051   ALWAYS_INLINE static GSVector4 f32to64(const GSVector4& v_) { return GSVector4(_mm_cvtps_pd(v_.m)); }
   2052 
   2053   ALWAYS_INLINE static GSVector4 f32to64(const void* p)
   2054   {
   2055     return GSVector4(_mm_cvtps_pd(_mm_castpd_ps(_mm_load_sd(static_cast<const double*>(p)))));
   2056   }
   2057 
   2058   ALWAYS_INLINE GSVector4i f64toi32() const { return GSVector4i(_mm_cvttpd_epi32(_mm_castps_pd(m))); }
   2059 
   2060   // clang-format off
   2061 
   2062 #define VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
   2063     ALWAYS_INLINE GSVector4 xs##ys##zs##ws() const { return GSVector4(_mm_shuffle_ps(m, m, _MM_SHUFFLE(wn, zn, yn, xn))); } \
   2064     ALWAYS_INLINE GSVector4 xs##ys##zs##ws(const GSVector4& v_) const { return GSVector4(_mm_shuffle_ps(m, v_.m, _MM_SHUFFLE(wn, zn, yn, xn))); } \
   2065 
   2066 #define VECTOR4_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \
   2067     VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \
   2068     VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \
   2069     VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2) \
   2070     VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3) \
   2071 
   2072 #define VECTOR4_SHUFFLE_2(xs, xn, ys, yn) \
   2073     VECTOR4_SHUFFLE_3(xs, xn, ys, yn, x, 0) \
   2074     VECTOR4_SHUFFLE_3(xs, xn, ys, yn, y, 1) \
   2075     VECTOR4_SHUFFLE_3(xs, xn, ys, yn, z, 2) \
   2076     VECTOR4_SHUFFLE_3(xs, xn, ys, yn, w, 3) \
   2077 
   2078 #define VECTOR4_SHUFFLE_1(xs, xn) \
   2079     VECTOR4_SHUFFLE_2(xs, xn, x, 0) \
   2080     VECTOR4_SHUFFLE_2(xs, xn, y, 1) \
   2081     VECTOR4_SHUFFLE_2(xs, xn, z, 2) \
   2082     VECTOR4_SHUFFLE_2(xs, xn, w, 3) \
   2083 
   2084   VECTOR4_SHUFFLE_1(x, 0)
   2085     VECTOR4_SHUFFLE_1(y, 1)
   2086     VECTOR4_SHUFFLE_1(z, 2)
   2087     VECTOR4_SHUFFLE_1(w, 3)
   2088 
   2089   // clang-format on
   2090 
   2091 #if CPU_ARCH_AVX2
   2092 
   2093   ALWAYS_INLINE GSVector4 broadcast32() const { return GSVector4(_mm_broadcastss_ps(m)); }
   2094 
   2095   ALWAYS_INLINE static GSVector4 broadcast32(const GSVector4& v) { return GSVector4(_mm_broadcastss_ps(v.m)); }
   2096 
   2097   ALWAYS_INLINE static GSVector4 broadcast32(const void* f)
   2098   {
   2099     return GSVector4(_mm_broadcastss_ps(_mm_load_ss(static_cast<const float*>(f))));
   2100   }
   2101 
   2102 #endif
   2103 
   2104   ALWAYS_INLINE static GSVector4 broadcast64(const void* d)
   2105   {
   2106     return GSVector4(_mm_loaddup_pd(static_cast<const double*>(d)));
   2107   }
   2108 };
   2109 
   2110 ALWAYS_INLINE GSVector2i::GSVector2i(const GSVector2& v)
   2111 {
   2112   m = _mm_cvttps_epi32(v);
   2113 }
   2114 
   2115 ALWAYS_INLINE GSVector2::GSVector2(const GSVector2i& v)
   2116 {
   2117   m = _mm_cvtepi32_ps(v);
   2118 }
   2119 
   2120 ALWAYS_INLINE GSVector2i GSVector2i::cast(const GSVector2& v)
   2121 {
   2122   return GSVector2i(_mm_castps_si128(v.m));
   2123 }
   2124 
   2125 ALWAYS_INLINE GSVector2 GSVector2::cast(const GSVector2i& v)
   2126 {
   2127   return GSVector2(_mm_castsi128_ps(v.m));
   2128 }
   2129 
   2130 ALWAYS_INLINE GSVector4i::GSVector4i(const GSVector4& v)
   2131 {
   2132   m = _mm_cvttps_epi32(v);
   2133 }
   2134 
   2135 ALWAYS_INLINE GSVector4::GSVector4(const GSVector4i& v)
   2136 {
   2137   m = _mm_cvtepi32_ps(v);
   2138 }
   2139 
   2140 ALWAYS_INLINE GSVector4i GSVector4i::cast(const GSVector4& v)
   2141 {
   2142   return GSVector4i(_mm_castps_si128(v.m));
   2143 }
   2144 
   2145 ALWAYS_INLINE GSVector4 GSVector4::cast(const GSVector4i& v)
   2146 {
   2147   return GSVector4(_mm_castsi128_ps(v.m));
   2148 }