gsvector_sse.h (75332B)
1 // SPDX-FileCopyrightText: 2002-2023 PCSX2 Dev Team, 2019-2024 Connor McLaughlin <stenzek@gmail.com> 2 // SPDX-License-Identifier: LGPL-3.0+ 3 // 4 // Lightweight wrapper over native SIMD types for cross-platform vector code. 5 // Rewritten and NEON+No-SIMD variants added for DuckStation. 6 // 7 8 #pragma once 9 10 #include "common/intrin.h" 11 #include "common/types.h" 12 13 #include <algorithm> 14 15 #ifdef CPU_ARCH_AVX2 16 #define GSVECTOR_HAS_UNSIGNED 1 17 #define GSVECTOR_HAS_SRLV 1 18 #endif 19 20 class GSVector2; 21 class GSVector2i; 22 class GSVector4; 23 class GSVector4i; 24 25 class alignas(16) GSVector2i 26 { 27 struct cxpr_init_tag 28 { 29 }; 30 static constexpr cxpr_init_tag cxpr_init{}; 31 32 constexpr GSVector2i(cxpr_init_tag, s32 x, s32 y) : S32{x, y, 0, 0} {} 33 34 constexpr GSVector2i(cxpr_init_tag, s16 s0, s16 s1, s16 s2, s16 s3) : S16{s0, s1, s2, s3, 0, 0, 0, 0} {} 35 36 constexpr GSVector2i(cxpr_init_tag, s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7) 37 : S8{b0, b1, b2, b3, b4, b5, b6, b7, 0, 0, 0, 0, 0, 0, 0, 0} 38 { 39 } 40 41 public: 42 union 43 { 44 struct 45 { 46 s32 x, y; 47 }; 48 struct 49 { 50 s32 r, g; 51 }; 52 float F32[4]; 53 s8 S8[16]; 54 s16 S16[8]; 55 s32 S32[4]; 56 s64 S64[2]; 57 u8 U8[16]; 58 u16 U16[8]; 59 u32 U32[4]; 60 u64 U64[2]; 61 __m128i m; 62 }; 63 64 GSVector2i() = default; 65 66 ALWAYS_INLINE constexpr static GSVector2i cxpr(s32 x, s32 y) { return GSVector2i(cxpr_init, x, y); } 67 ALWAYS_INLINE constexpr static GSVector2i cxpr(s32 x) { return GSVector2i(cxpr_init, x, x); } 68 69 ALWAYS_INLINE constexpr static GSVector2i cxpr16(s16 x) { return GSVector2i(cxpr_init, x, x, x, x); } 70 ALWAYS_INLINE constexpr static GSVector2i cxpr16(s16 s0, s16 s1, s16 s2, s16 s3) 71 { 72 return GSVector2i(cxpr_init, s0, s1, s2, s3); 73 } 74 75 ALWAYS_INLINE constexpr static GSVector2i cxpr8(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7) 76 { 77 return GSVector2i(cxpr_init, b0, b1, b2, b3, b4, b5, b6, b7); 78 } 79 80 ALWAYS_INLINE GSVector2i(s32 x, s32 y) { m = _mm_set_epi32(0, 0, y, x); } 81 ALWAYS_INLINE GSVector2i(s16 s0, s16 s1, s16 s2, s16 s3) { m = _mm_set_epi16(0, 0, 0, 0, s3, s2, s1, s0); } 82 ALWAYS_INLINE constexpr GSVector2i(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7) 83 : S8{b0, b1, b2, b3, b4, b5, b6, b7, 0, 0, 0, 0, 0, 0, 0, 0} 84 { 85 } 86 ALWAYS_INLINE explicit GSVector2i(s32 i) { *this = i; } 87 ALWAYS_INLINE explicit GSVector2i(const GSVector2& v); 88 ALWAYS_INLINE constexpr explicit GSVector2i(__m128i m) : m(m) {} 89 90 ALWAYS_INLINE GSVector2i& operator=(s32 i) 91 { 92 m = _mm_set1_epi32(i); 93 return *this; 94 } 95 96 ALWAYS_INLINE GSVector2i& operator=(__m128i m_) 97 { 98 m = m_; 99 return *this; 100 } 101 102 ALWAYS_INLINE operator __m128i() const { return m; } 103 104 ALWAYS_INLINE GSVector2i sat_i8(const GSVector2i& min, const GSVector2i& max) const 105 { 106 return max_i8(min).min_i8(max); 107 } 108 ALWAYS_INLINE GSVector2i sat_i16(const GSVector2i& min, const GSVector2i& max) const 109 { 110 return max_i16(min).min_i16(max); 111 } 112 ALWAYS_INLINE GSVector2i sat_i32(const GSVector2i& min, const GSVector2i& max) const 113 { 114 return max_i32(min).min_i32(max); 115 } 116 117 ALWAYS_INLINE GSVector2i sat_u8(const GSVector2i& min, const GSVector2i& max) const 118 { 119 return max_u8(min).min_u8(max); 120 } 121 ALWAYS_INLINE GSVector2i sat_u16(const GSVector2i& min, const GSVector2i& max) const 122 { 123 return max_u16(min).min_u16(max); 124 } 125 ALWAYS_INLINE GSVector2i sat_u32(const GSVector2i& min, const GSVector2i& max) const 126 { 127 return max_u32(min).min_u32(max); 128 } 129 130 ALWAYS_INLINE GSVector2i min_i8(const GSVector2i& v) const { return GSVector2i(_mm_min_epi8(m, v)); } 131 ALWAYS_INLINE GSVector2i max_i8(const GSVector2i& v) const { return GSVector2i(_mm_max_epi8(m, v)); } 132 ALWAYS_INLINE GSVector2i min_i16(const GSVector2i& v) const { return GSVector2i(_mm_min_epi16(m, v)); } 133 ALWAYS_INLINE GSVector2i max_i16(const GSVector2i& v) const { return GSVector2i(_mm_max_epi16(m, v)); } 134 ALWAYS_INLINE GSVector2i min_i32(const GSVector2i& v) const { return GSVector2i(_mm_min_epi32(m, v)); } 135 ALWAYS_INLINE GSVector2i max_i32(const GSVector2i& v) const { return GSVector2i(_mm_max_epi32(m, v)); } 136 137 ALWAYS_INLINE GSVector2i min_u8(const GSVector2i& v) const { return GSVector2i(_mm_min_epu8(m, v)); } 138 ALWAYS_INLINE GSVector2i max_u8(const GSVector2i& v) const { return GSVector2i(_mm_max_epu8(m, v)); } 139 ALWAYS_INLINE GSVector2i min_u16(const GSVector2i& v) const { return GSVector2i(_mm_min_epu16(m, v)); } 140 ALWAYS_INLINE GSVector2i max_u16(const GSVector2i& v) const { return GSVector2i(_mm_max_epu16(m, v)); } 141 ALWAYS_INLINE GSVector2i min_u32(const GSVector2i& v) const { return GSVector2i(_mm_min_epu32(m, v)); } 142 ALWAYS_INLINE GSVector2i max_u32(const GSVector2i& v) const { return GSVector2i(_mm_max_epu32(m, v)); } 143 144 ALWAYS_INLINE s32 addv_s32() const { return _mm_cvtsi128_si32(_mm_hadd_epi32(m, m)); } 145 146 ALWAYS_INLINE u8 minv_u8() const 147 { 148 __m128i vmin = _mm_min_epu8(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(1, 1, 1, 1))); 149 return static_cast<u8>(std::min( 150 static_cast<u32>(_mm_extract_epi8(vmin, 0)), 151 std::min(static_cast<u32>(_mm_extract_epi8(vmin, 1)), 152 std::min(static_cast<u32>(_mm_extract_epi8(vmin, 2)), static_cast<u32>(_mm_extract_epi8(vmin, 3)))))); 153 } 154 155 ALWAYS_INLINE u16 maxv_u8() const 156 { 157 __m128i vmax = _mm_max_epu8(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(1, 1, 1, 1))); 158 return static_cast<u8>(std::max( 159 static_cast<u32>(_mm_extract_epi8(vmax, 0)), 160 std::max(static_cast<u32>(_mm_extract_epi8(vmax, 1)), 161 std::max(static_cast<u32>(_mm_extract_epi8(vmax, 2)), static_cast<u32>(_mm_extract_epi8(vmax, 3)))))); 162 } 163 164 ALWAYS_INLINE u16 minv_u16() const 165 { 166 __m128i vmin = _mm_min_epu16(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(1, 1, 1, 1))); 167 return static_cast<u16>( 168 std::min(static_cast<u32>(_mm_extract_epi16(vmin, 0)), static_cast<u32>(_mm_extract_epi16(vmin, 1)))); 169 } 170 171 ALWAYS_INLINE u16 maxv_u16() const 172 { 173 __m128i vmax = _mm_max_epu16(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(1, 1, 1, 1))); 174 return static_cast<u16>( 175 std::max<u32>(static_cast<u32>(_mm_extract_epi16(vmax, 0)), static_cast<u32>(_mm_extract_epi16(vmax, 1)))); 176 } 177 178 ALWAYS_INLINE s32 minv_s32() const { return std::min<s32>(_mm_extract_epi32(m, 0), _mm_extract_epi32(m, 1)); } 179 ALWAYS_INLINE u32 minv_u32() const { return std::min<u32>(_mm_extract_epi32(m, 0), _mm_extract_epi32(m, 1)); } 180 ALWAYS_INLINE s32 maxv_s32() const { return std::max<s32>(_mm_extract_epi32(m, 0), _mm_extract_epi32(m, 1)); } 181 ALWAYS_INLINE u32 maxv_u32() const { return std::max<u32>(_mm_extract_epi32(m, 0), _mm_extract_epi32(m, 1)); } 182 183 ALWAYS_INLINE GSVector2i clamp8() const { return pu16().upl8(); } 184 185 ALWAYS_INLINE GSVector2i blend8(const GSVector2i& v, const GSVector2i& mask) const 186 { 187 return GSVector2i(_mm_blendv_epi8(m, v, mask)); 188 } 189 190 template<s32 mask> 191 ALWAYS_INLINE GSVector2i blend16(const GSVector2i& v) const 192 { 193 return GSVector2i(_mm_blend_epi16(m, v, mask)); 194 } 195 196 template<s32 mask> 197 ALWAYS_INLINE GSVector2i blend32(const GSVector2i& v) const 198 { 199 #if defined(__AVX2__) 200 return GSVector2i(_mm_blend_epi32(m, v.m, mask)); 201 #else 202 constexpr s32 bit1 = ((mask & 2) * 3) << 1; 203 constexpr s32 bit0 = (mask & 1) * 3; 204 return blend16<bit1 | bit0>(v); 205 #endif 206 } 207 208 ALWAYS_INLINE GSVector2i blend(const GSVector2i& v, const GSVector2i& mask) const 209 { 210 return GSVector2i(_mm_or_si128(_mm_andnot_si128(mask, m), _mm_and_si128(mask, v))); 211 } 212 213 ALWAYS_INLINE GSVector2i mix16(const GSVector2i& v) const { return blend16<0xa>(v); } 214 215 ALWAYS_INLINE GSVector2i shuffle8(const GSVector2i& mask) const { return GSVector2i(_mm_shuffle_epi8(m, mask)); } 216 217 ALWAYS_INLINE GSVector2i ps16() const { return GSVector2i(_mm_packs_epi16(m, m)); } 218 ALWAYS_INLINE GSVector2i pu16() const { return GSVector2i(_mm_packus_epi16(m, m)); } 219 ALWAYS_INLINE GSVector2i ps32() const { return GSVector2i(_mm_packs_epi32(m, m)); } 220 ALWAYS_INLINE GSVector2i pu32() const { return GSVector2i(_mm_packus_epi32(m, m)); } 221 222 ALWAYS_INLINE GSVector2i upl8(const GSVector2i& v) const { return GSVector2i(_mm_unpacklo_epi8(m, v)); } 223 ALWAYS_INLINE GSVector2i uph8(const GSVector2i& v) const { return GSVector2i(_mm_unpackhi_epi8(m, v)); } 224 ALWAYS_INLINE GSVector2i upl16(const GSVector2i& v) const { return GSVector2i(_mm_unpacklo_epi16(m, v)); } 225 ALWAYS_INLINE GSVector2i uph16(const GSVector2i& v) const { return GSVector2i(_mm_unpackhi_epi16(m, v)); } 226 ALWAYS_INLINE GSVector2i upl32(const GSVector2i& v) const { return GSVector2i(_mm_unpacklo_epi32(m, v)); } 227 ALWAYS_INLINE GSVector2i uph32(const GSVector2i& v) const { return GSVector2i(_mm_unpackhi_epi32(m, v)); } 228 229 ALWAYS_INLINE GSVector2i upl8() const { return GSVector2i(_mm_unpacklo_epi8(m, _mm_setzero_si128())); } 230 ALWAYS_INLINE GSVector2i uph8() const { return GSVector2i(_mm_unpackhi_epi8(m, _mm_setzero_si128())); } 231 232 ALWAYS_INLINE GSVector2i upl16() const { return GSVector2i(_mm_unpacklo_epi16(m, _mm_setzero_si128())); } 233 ALWAYS_INLINE GSVector2i uph16() const { return GSVector2i(_mm_unpackhi_epi16(m, _mm_setzero_si128())); } 234 235 ALWAYS_INLINE GSVector2i upl32() const { return GSVector2i(_mm_unpacklo_epi32(m, _mm_setzero_si128())); } 236 ALWAYS_INLINE GSVector2i uph32() const { return GSVector2i(_mm_unpackhi_epi32(m, _mm_setzero_si128())); } 237 238 ALWAYS_INLINE GSVector2i i8to16() const { return GSVector2i(_mm_cvtepi8_epi16(m)); } 239 240 #ifdef CPU_ARCH_SSE41 241 ALWAYS_INLINE GSVector2i u8to16() const { return GSVector2i(_mm_cvtepu8_epi16(m)); } 242 #endif 243 244 template<s32 i> 245 ALWAYS_INLINE GSVector2i srl() const 246 { 247 return GSVector2i(_mm_srli_si128(m, i)); 248 } 249 250 template<s32 i> 251 ALWAYS_INLINE GSVector2i sll() const 252 { 253 return GSVector2i(_mm_slli_si128(m, i)); 254 } 255 256 template<s32 i> 257 ALWAYS_INLINE GSVector2i sll16() const 258 { 259 return GSVector2i(_mm_slli_epi16(m, i)); 260 } 261 262 ALWAYS_INLINE GSVector2i sll16(s32 i) const { return GSVector2i(_mm_sll_epi16(m, _mm_cvtsi32_si128(i))); } 263 264 #ifdef CPU_ARCH_AVX2 265 ALWAYS_INLINE GSVector2i sllv16(const GSVector2i& v) const { return GSVector2i(_mm_sllv_epi16(m, v.m)); } 266 #endif 267 268 template<s32 i> 269 ALWAYS_INLINE GSVector2i srl16() const 270 { 271 return GSVector2i(_mm_srli_epi16(m, i)); 272 } 273 274 ALWAYS_INLINE GSVector2i srl16(s32 i) const { return GSVector2i(_mm_srl_epi16(m, _mm_cvtsi32_si128(i))); } 275 276 #ifdef CPU_ARCH_AVX2 277 ALWAYS_INLINE GSVector2i srlv16(const GSVector2i& v) const { return GSVector2i(_mm_srlv_epi16(m, v.m)); } 278 #endif 279 280 template<s32 i> 281 ALWAYS_INLINE GSVector2i sra16() const 282 { 283 return GSVector2i(_mm_srai_epi16(m, i)); 284 } 285 286 ALWAYS_INLINE GSVector2i sra16(s32 i) const { return GSVector2i(_mm_sra_epi16(m, _mm_cvtsi32_si128(i))); } 287 288 #ifdef CPU_ARCH_AVX2 289 ALWAYS_INLINE GSVector2i srav16(const GSVector2i& v) const { return GSVector2i(_mm_srav_epi16(m, v.m)); } 290 #endif 291 292 template<s32 i> 293 ALWAYS_INLINE GSVector2i sll32() const 294 { 295 return GSVector2i(_mm_slli_epi32(m, i)); 296 } 297 298 ALWAYS_INLINE GSVector2i sll32(s32 i) const { return GSVector2i(_mm_sll_epi32(m, _mm_cvtsi32_si128(i))); } 299 300 #ifdef CPU_ARCH_AVX2 301 ALWAYS_INLINE GSVector2i sllv32(const GSVector2i& v) const { return GSVector2i(_mm_sllv_epi32(m, v.m)); } 302 #endif 303 304 template<s32 i> 305 ALWAYS_INLINE GSVector2i srl32() const 306 { 307 return GSVector2i(_mm_srli_epi32(m, i)); 308 } 309 310 ALWAYS_INLINE GSVector2i srl32(s32 i) const { return GSVector2i(_mm_srl_epi32(m, _mm_cvtsi32_si128(i))); } 311 312 #ifdef CPU_ARCH_AVX2 313 ALWAYS_INLINE GSVector2i srlv32(const GSVector2i& v) const { return GSVector2i(_mm_srlv_epi32(m, v.m)); } 314 #endif 315 316 template<s32 i> 317 ALWAYS_INLINE GSVector2i sra32() const 318 { 319 return GSVector2i(_mm_srai_epi32(m, i)); 320 } 321 322 ALWAYS_INLINE GSVector2i sra32(s32 i) const { return GSVector2i(_mm_sra_epi32(m, _mm_cvtsi32_si128(i))); } 323 324 #ifdef CPU_ARCH_AVX2 325 ALWAYS_INLINE GSVector2i srav32(const GSVector2i& v) const { return GSVector2i(_mm_srav_epi32(m, v.m)); } 326 #endif 327 328 ALWAYS_INLINE GSVector2i add8(const GSVector2i& v) const { return GSVector2i(_mm_add_epi8(m, v.m)); } 329 ALWAYS_INLINE GSVector2i add16(const GSVector2i& v) const { return GSVector2i(_mm_add_epi16(m, v.m)); } 330 ALWAYS_INLINE GSVector2i add32(const GSVector2i& v) const { return GSVector2i(_mm_add_epi32(m, v.m)); } 331 ALWAYS_INLINE GSVector2i adds8(const GSVector2i& v) const { return GSVector2i(_mm_adds_epi8(m, v.m)); } 332 ALWAYS_INLINE GSVector2i adds16(const GSVector2i& v) const { return GSVector2i(_mm_adds_epi16(m, v.m)); } 333 ALWAYS_INLINE GSVector2i addus8(const GSVector2i& v) const { return GSVector2i(_mm_adds_epu8(m, v.m)); } 334 ALWAYS_INLINE GSVector2i addus16(const GSVector2i& v) const { return GSVector2i(_mm_adds_epu16(m, v.m)); } 335 336 ALWAYS_INLINE GSVector2i sub8(const GSVector2i& v) const { return GSVector2i(_mm_sub_epi8(m, v.m)); } 337 ALWAYS_INLINE GSVector2i sub16(const GSVector2i& v) const { return GSVector2i(_mm_sub_epi16(m, v.m)); } 338 ALWAYS_INLINE GSVector2i sub32(const GSVector2i& v) const { return GSVector2i(_mm_sub_epi32(m, v.m)); } 339 ALWAYS_INLINE GSVector2i subs8(const GSVector2i& v) const { return GSVector2i(_mm_subs_epi8(m, v.m)); } 340 ALWAYS_INLINE GSVector2i subs16(const GSVector2i& v) const { return GSVector2i(_mm_subs_epi16(m, v.m)); } 341 ALWAYS_INLINE GSVector2i subus8(const GSVector2i& v) const { return GSVector2i(_mm_subs_epu8(m, v.m)); } 342 ALWAYS_INLINE GSVector2i subus16(const GSVector2i& v) const { return GSVector2i(_mm_subs_epu16(m, v.m)); } 343 344 ALWAYS_INLINE GSVector2i avg8(const GSVector2i& v) const { return GSVector2i(_mm_avg_epu8(m, v.m)); } 345 ALWAYS_INLINE GSVector2i avg16(const GSVector2i& v) const { return GSVector2i(_mm_avg_epu16(m, v.m)); } 346 347 ALWAYS_INLINE GSVector2i mul16l(const GSVector2i& v) const { return GSVector2i(_mm_mullo_epi16(m, v.m)); } 348 ALWAYS_INLINE GSVector2i mul32l(const GSVector2i& v) const { return GSVector2i(_mm_mullo_epi32(m, v.m)); } 349 350 ALWAYS_INLINE bool eq(const GSVector2i& v) const { return eq8(v).alltrue(); } 351 352 ALWAYS_INLINE GSVector2i eq8(const GSVector2i& v) const { return GSVector2i(_mm_cmpeq_epi8(m, v.m)); } 353 ALWAYS_INLINE GSVector2i eq16(const GSVector2i& v) const { return GSVector2i(_mm_cmpeq_epi16(m, v.m)); } 354 ALWAYS_INLINE GSVector2i eq32(const GSVector2i& v) const { return GSVector2i(_mm_cmpeq_epi32(m, v.m)); } 355 356 ALWAYS_INLINE GSVector2i neq8(const GSVector2i& v) const { return ~eq8(v); } 357 ALWAYS_INLINE GSVector2i neq16(const GSVector2i& v) const { return ~eq16(v); } 358 ALWAYS_INLINE GSVector2i neq32(const GSVector2i& v) const { return ~eq32(v); } 359 360 ALWAYS_INLINE GSVector2i gt8(const GSVector2i& v) const { return GSVector2i(_mm_cmpgt_epi8(m, v.m)); } 361 ALWAYS_INLINE GSVector2i gt16(const GSVector2i& v) const { return GSVector2i(_mm_cmpgt_epi16(m, v.m)); } 362 ALWAYS_INLINE GSVector2i gt32(const GSVector2i& v) const { return GSVector2i(_mm_cmpgt_epi32(m, v.m)); } 363 364 ALWAYS_INLINE GSVector2i ge8(const GSVector2i& v) const { return ~GSVector2i(_mm_cmplt_epi8(m, v.m)); } 365 ALWAYS_INLINE GSVector2i ge16(const GSVector2i& v) const { return ~GSVector2i(_mm_cmplt_epi16(m, v.m)); } 366 ALWAYS_INLINE GSVector2i ge32(const GSVector2i& v) const { return ~GSVector2i(_mm_cmplt_epi32(m, v.m)); } 367 368 ALWAYS_INLINE GSVector2i lt8(const GSVector2i& v) const { return GSVector2i(_mm_cmplt_epi8(m, v.m)); } 369 ALWAYS_INLINE GSVector2i lt16(const GSVector2i& v) const { return GSVector2i(_mm_cmplt_epi16(m, v.m)); } 370 ALWAYS_INLINE GSVector2i lt32(const GSVector2i& v) const { return GSVector2i(_mm_cmplt_epi32(m, v.m)); } 371 372 ALWAYS_INLINE GSVector2i le8(const GSVector2i& v) const { return ~GSVector2i(_mm_cmpgt_epi8(m, v.m)); } 373 ALWAYS_INLINE GSVector2i le16(const GSVector2i& v) const { return ~GSVector2i(_mm_cmpgt_epi16(m, v.m)); } 374 ALWAYS_INLINE GSVector2i le32(const GSVector2i& v) const { return ~GSVector2i(_mm_cmpgt_epi32(m, v.m)); } 375 376 ALWAYS_INLINE GSVector2i andnot(const GSVector2i& v) const { return GSVector2i(_mm_andnot_si128(v.m, m)); } 377 378 ALWAYS_INLINE s32 mask() const { return (_mm_movemask_epi8(m) & 0xff); } 379 380 ALWAYS_INLINE bool alltrue() const { return (mask() == 0xff); } 381 ALWAYS_INLINE bool allfalse() const { return (mask() == 0x00); } 382 383 template<s32 i> 384 ALWAYS_INLINE GSVector2i insert8(s32 a) const 385 { 386 return GSVector2i(_mm_insert_epi8(m, a, i)); 387 } 388 389 template<s32 i> 390 ALWAYS_INLINE s32 extract8() const 391 { 392 return _mm_extract_epi8(m, i); 393 } 394 395 template<s32 i> 396 ALWAYS_INLINE GSVector2i insert16(s32 a) const 397 { 398 return GSVector2i(_mm_insert_epi16(m, a, i)); 399 } 400 401 template<s32 i> 402 ALWAYS_INLINE s32 extract16() const 403 { 404 return _mm_extract_epi16(m, i); 405 } 406 407 template<s32 i> 408 ALWAYS_INLINE GSVector2i insert32(s32 a) const 409 { 410 return GSVector2i(_mm_insert_epi32(m, a, i)); 411 } 412 413 template<s32 i> 414 ALWAYS_INLINE s32 extract32() const 415 { 416 if constexpr (i == 0) 417 return GSVector2i::store(*this); 418 419 return _mm_extract_epi32(m, i); 420 } 421 422 ALWAYS_INLINE static GSVector2i load32(const void* p) { return GSVector2i(_mm_loadu_si32(p)); } 423 ALWAYS_INLINE static GSVector2i load(const void* p) 424 { 425 return GSVector2i(_mm_loadl_epi64(static_cast<const __m128i*>(p))); 426 } 427 ALWAYS_INLINE static GSVector2i load(s32 i) { return GSVector2i(_mm_cvtsi32_si128(i)); } 428 ALWAYS_INLINE static GSVector2i loadq(s64 i) { return GSVector2i(_mm_cvtsi64_si128(i)); } 429 430 ALWAYS_INLINE static void store(void* p, const GSVector2i& v) { _mm_storel_epi64(static_cast<__m128i*>(p), v.m); } 431 ALWAYS_INLINE static void store32(void* p, const GSVector2i& v) { _mm_storeu_si32(p, v); } 432 ALWAYS_INLINE static s32 store(const GSVector2i& v) { return _mm_cvtsi128_si32(v.m); } 433 ALWAYS_INLINE static s64 storeq(const GSVector2i& v) { return _mm_cvtsi128_si64(v.m); } 434 435 ALWAYS_INLINE GSVector2i& operator&=(const GSVector2i& v) 436 { 437 m = _mm_and_si128(m, v); 438 return *this; 439 } 440 441 ALWAYS_INLINE GSVector2i& operator|=(const GSVector2i& v) 442 { 443 m = _mm_or_si128(m, v); 444 return *this; 445 } 446 447 ALWAYS_INLINE GSVector2i& operator^=(const GSVector2i& v) 448 { 449 m = _mm_xor_si128(m, v); 450 return *this; 451 } 452 453 ALWAYS_INLINE friend GSVector2i operator&(const GSVector2i& v1, const GSVector2i& v2) 454 { 455 return GSVector2i(_mm_and_si128(v1, v2)); 456 } 457 458 ALWAYS_INLINE friend GSVector2i operator|(const GSVector2i& v1, const GSVector2i& v2) 459 { 460 return GSVector2i(_mm_or_si128(v1, v2)); 461 } 462 463 ALWAYS_INLINE friend GSVector2i operator^(const GSVector2i& v1, const GSVector2i& v2) 464 { 465 return GSVector2i(_mm_xor_si128(v1, v2)); 466 } 467 468 ALWAYS_INLINE friend GSVector2i operator&(const GSVector2i& v, s32 i) { return v & GSVector2i(i); } 469 470 ALWAYS_INLINE friend GSVector2i operator|(const GSVector2i& v, s32 i) { return v | GSVector2i(i); } 471 472 ALWAYS_INLINE friend GSVector2i operator^(const GSVector2i& v, s32 i) { return v ^ GSVector2i(i); } 473 474 ALWAYS_INLINE friend GSVector2i operator~(const GSVector2i& v) { return v ^ v.eq32(v); } 475 476 ALWAYS_INLINE static GSVector2i zero() { return GSVector2i(_mm_setzero_si128()); } 477 ALWAYS_INLINE static GSVector2i cast(const GSVector2& v); 478 479 ALWAYS_INLINE GSVector2i xy() const { return GSVector2i(m); } 480 ALWAYS_INLINE GSVector2i xx() const { return GSVector2i(_mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 0, 0))); } 481 ALWAYS_INLINE GSVector2i yx() const { return GSVector2i(_mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 0, 1))); } 482 ALWAYS_INLINE GSVector2i yy() const { return GSVector2i(_mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 1, 1))); } 483 }; 484 485 class alignas(16) GSVector2 486 { 487 struct cxpr_init_tag 488 { 489 }; 490 static constexpr cxpr_init_tag cxpr_init{}; 491 492 constexpr GSVector2(cxpr_init_tag, float x, float y) : F32{x, y} {} 493 constexpr GSVector2(cxpr_init_tag, int x, int y) : I32{x, y} {} 494 495 public: 496 union 497 { 498 struct 499 { 500 float x, y; 501 }; 502 struct 503 { 504 float r, g; 505 }; 506 float F32[4]; 507 double F64[2]; 508 s8 I8[16]; 509 s16 I16[8]; 510 s32 I32[4]; 511 s64 I64[2]; 512 u8 U8[16]; 513 u16 U16[8]; 514 u32 U32[4]; 515 u64 U64[2]; 516 __m128 m; 517 }; 518 519 GSVector2() = default; 520 521 constexpr static GSVector2 cxpr(float x, float y) { return GSVector2(cxpr_init, x, y); } 522 constexpr static GSVector2 cxpr(float x) { return GSVector2(cxpr_init, x, x); } 523 constexpr static GSVector2 cxpr(int x, int y) { return GSVector2(cxpr_init, x, y); } 524 constexpr static GSVector2 cxpr(int x) { return GSVector2(cxpr_init, x, x); } 525 526 ALWAYS_INLINE GSVector2(float x, float y) { m = _mm_set_ps(0, 0, y, x); } 527 ALWAYS_INLINE GSVector2(int x, int y) 528 { 529 GSVector2i v_(x, y); 530 m = _mm_cvtepi32_ps(v_.m); 531 } 532 533 ALWAYS_INLINE constexpr explicit GSVector2(__m128 m) : m(m) {} 534 ALWAYS_INLINE explicit GSVector2(__m128d m) : m(_mm_castpd_ps(m)) {} 535 ALWAYS_INLINE explicit GSVector2(float f) { *this = f; } 536 ALWAYS_INLINE explicit GSVector2(int i) 537 { 538 #ifdef CPU_ARCH_AVX2 539 m = _mm_cvtepi32_ps(_mm_broadcastd_epi32(_mm_cvtsi32_si128(i))); 540 #else 541 *this = GSVector2(GSVector2i(i)); 542 #endif 543 } 544 545 ALWAYS_INLINE explicit GSVector2(const GSVector2i& v); 546 547 ALWAYS_INLINE GSVector2& operator=(float f) 548 { 549 m = _mm_set1_ps(f); 550 return *this; 551 } 552 553 ALWAYS_INLINE GSVector2& operator=(__m128 m_) 554 { 555 m = m_; 556 return *this; 557 } 558 559 ALWAYS_INLINE operator __m128() const { return m; } 560 561 ALWAYS_INLINE GSVector2 abs() const { return *this & cast(GSVector2i::cxpr(0x7fffffff)); } 562 ALWAYS_INLINE GSVector2 neg() const { return *this ^ cast(GSVector2i::cxpr(0x80000000)); } 563 ALWAYS_INLINE GSVector2 rcp() const { return GSVector2(_mm_rcp_ps(m)); } 564 ALWAYS_INLINE GSVector2 floor() const 565 { 566 return GSVector2(_mm_round_ps(m, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)); 567 } 568 569 ALWAYS_INLINE GSVector2 ceil() const { return GSVector2(_mm_round_ps(m, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)); } 570 571 ALWAYS_INLINE GSVector2 sat(const GSVector2& min, const GSVector2& max) const 572 { 573 return GSVector2(_mm_min_ps(_mm_max_ps(m, min), max)); 574 } 575 576 ALWAYS_INLINE GSVector2 sat(const float scale = 255) const { return sat(zero(), GSVector2(scale)); } 577 578 ALWAYS_INLINE GSVector2 clamp(const float scale = 255) const { return min(GSVector2(scale)); } 579 580 ALWAYS_INLINE GSVector2 min(const GSVector2& v) const { return GSVector2(_mm_min_ps(m, v)); } 581 582 ALWAYS_INLINE GSVector2 max(const GSVector2& v) const { return GSVector2(_mm_max_ps(m, v)); } 583 584 template<int mask> 585 ALWAYS_INLINE GSVector2 blend32(const GSVector2& v) const 586 { 587 return GSVector2(_mm_blend_ps(m, v, mask)); 588 } 589 590 ALWAYS_INLINE GSVector2 blend32(const GSVector2& v, const GSVector2& mask) const 591 { 592 return GSVector2(_mm_blendv_ps(m, v, mask)); 593 } 594 595 ALWAYS_INLINE GSVector2 andnot(const GSVector2& v) const { return GSVector2(_mm_andnot_ps(v.m, m)); } 596 597 ALWAYS_INLINE int mask() const { return (_mm_movemask_ps(m) & 0x3); } 598 599 ALWAYS_INLINE bool alltrue() const { return (mask() == 0x3); } 600 601 ALWAYS_INLINE bool allfalse() const { return (mask() == 0x0); } 602 603 ALWAYS_INLINE GSVector2 replace_nan(const GSVector2& v) const { return v.blend32(*this, *this == *this); } 604 605 template<int src, int dst> 606 ALWAYS_INLINE GSVector2 insert32(const GSVector2& v) const 607 { 608 if constexpr (src == dst) 609 return GSVector2(_mm_blend_ps(m, v.m, 1 << src)); 610 else 611 return GSVector2(_mm_insert_ps(m, v.m, _MM_MK_INSERTPS_NDX(src, dst, 0))); 612 } 613 614 template<int i> 615 ALWAYS_INLINE int extract32() const 616 { 617 return _mm_extract_ps(m, i); 618 } 619 620 ALWAYS_INLINE float dot(const GSVector2& v) const { return _mm_cvtss_f32(_mm_dp_ps(m, v.m, 0x31)); } 621 622 ALWAYS_INLINE static GSVector2 zero() { return GSVector2(_mm_setzero_ps()); } 623 624 ALWAYS_INLINE static GSVector2 xffffffff() { return zero() == zero(); } 625 626 ALWAYS_INLINE static GSVector2 load(const void* p) 627 { 628 return GSVector2(_mm_castpd_ps(_mm_load_sd(static_cast<const double*>(p)))); 629 } 630 631 ALWAYS_INLINE static GSVector2 load(float f) { return GSVector2(_mm_load_ss(&f)); } 632 633 ALWAYS_INLINE static void store(void* p, const GSVector2& v) 634 { 635 _mm_store_sd(static_cast<double*>(p), _mm_castps_pd(v.m)); 636 } 637 638 ALWAYS_INLINE GSVector2 operator-() const { return neg(); } 639 640 ALWAYS_INLINE GSVector2& operator+=(const GSVector2& v_) 641 { 642 m = _mm_add_ps(m, v_); 643 return *this; 644 } 645 ALWAYS_INLINE GSVector2& operator-=(const GSVector2& v_) 646 { 647 m = _mm_sub_ps(m, v_); 648 return *this; 649 } 650 ALWAYS_INLINE GSVector2& operator*=(const GSVector2& v_) 651 { 652 m = _mm_mul_ps(m, v_); 653 return *this; 654 } 655 ALWAYS_INLINE GSVector2& operator/=(const GSVector2& v_) 656 { 657 m = _mm_div_ps(m, v_); 658 return *this; 659 } 660 661 ALWAYS_INLINE GSVector2& operator+=(float f) 662 { 663 *this += GSVector2(f); 664 return *this; 665 } 666 ALWAYS_INLINE GSVector2& operator-=(float f) 667 { 668 *this -= GSVector2(f); 669 return *this; 670 } 671 ALWAYS_INLINE GSVector2& operator*=(float f) 672 { 673 *this *= GSVector2(f); 674 return *this; 675 } 676 ALWAYS_INLINE GSVector2& operator/=(float f) 677 { 678 *this /= GSVector2(f); 679 return *this; 680 } 681 682 ALWAYS_INLINE GSVector2& operator&=(const GSVector2& v_) 683 { 684 m = _mm_and_ps(m, v_); 685 return *this; 686 } 687 ALWAYS_INLINE GSVector2& operator|=(const GSVector2& v_) 688 { 689 m = _mm_or_ps(m, v_); 690 return *this; 691 } 692 ALWAYS_INLINE GSVector2& operator^=(const GSVector2& v_) 693 { 694 m = _mm_xor_ps(m, v_); 695 return *this; 696 } 697 698 ALWAYS_INLINE friend GSVector2 operator+(const GSVector2& v1, const GSVector2& v2) 699 { 700 return GSVector2(_mm_add_ps(v1, v2)); 701 } 702 703 ALWAYS_INLINE friend GSVector2 operator-(const GSVector2& v1, const GSVector2& v2) 704 { 705 return GSVector2(_mm_sub_ps(v1, v2)); 706 } 707 708 ALWAYS_INLINE friend GSVector2 operator*(const GSVector2& v1, const GSVector2& v2) 709 { 710 return GSVector2(_mm_mul_ps(v1, v2)); 711 } 712 713 ALWAYS_INLINE friend GSVector2 operator/(const GSVector2& v1, const GSVector2& v2) 714 { 715 return GSVector2(_mm_div_ps(v1, v2)); 716 } 717 718 ALWAYS_INLINE friend GSVector2 operator+(const GSVector2& v, float f) { return v + GSVector2(f); } 719 720 ALWAYS_INLINE friend GSVector2 operator-(const GSVector2& v, float f) { return v - GSVector2(f); } 721 722 ALWAYS_INLINE friend GSVector2 operator*(const GSVector2& v, float f) { return v * GSVector2(f); } 723 724 ALWAYS_INLINE friend GSVector2 operator/(const GSVector2& v, float f) { return v / GSVector2(f); } 725 726 ALWAYS_INLINE friend GSVector2 operator&(const GSVector2& v1, const GSVector2& v2) 727 { 728 return GSVector2(_mm_and_ps(v1, v2)); 729 } 730 731 ALWAYS_INLINE friend GSVector2 operator|(const GSVector2& v1, const GSVector2& v2) 732 { 733 return GSVector2(_mm_or_ps(v1, v2)); 734 } 735 736 ALWAYS_INLINE friend GSVector2 operator^(const GSVector2& v1, const GSVector2& v2) 737 { 738 return GSVector2(_mm_xor_ps(v1, v2)); 739 } 740 741 ALWAYS_INLINE friend GSVector2 operator==(const GSVector2& v1, const GSVector2& v2) 742 { 743 return GSVector2(_mm_cmpeq_ps(v1, v2)); 744 } 745 746 ALWAYS_INLINE friend GSVector2 operator!=(const GSVector2& v1, const GSVector2& v2) 747 { 748 return GSVector2(_mm_cmpneq_ps(v1, v2)); 749 } 750 751 ALWAYS_INLINE friend GSVector2 operator>(const GSVector2& v1, const GSVector2& v2) 752 { 753 return GSVector2(_mm_cmpgt_ps(v1, v2)); 754 } 755 756 ALWAYS_INLINE friend GSVector2 operator<(const GSVector2& v1, const GSVector2& v2) 757 { 758 return GSVector2(_mm_cmplt_ps(v1, v2)); 759 } 760 761 ALWAYS_INLINE friend GSVector2 operator>=(const GSVector2& v1, const GSVector2& v2) 762 { 763 return GSVector2(_mm_cmpge_ps(v1, v2)); 764 } 765 766 ALWAYS_INLINE friend GSVector2 operator<=(const GSVector2& v1, const GSVector2& v2) 767 { 768 return GSVector2(_mm_cmple_ps(v1, v2)); 769 } 770 771 ALWAYS_INLINE static GSVector2 cast(const GSVector2i& v); 772 773 ALWAYS_INLINE GSVector2 xy() const { return *this; } 774 ALWAYS_INLINE GSVector2 xx() const { return GSVector2(_mm_shuffle_ps(m, m, _MM_SHUFFLE(3, 2, 0, 0))); } 775 ALWAYS_INLINE GSVector2 yx() const { return GSVector2(_mm_shuffle_ps(m, m, _MM_SHUFFLE(3, 2, 0, 1))); } 776 ALWAYS_INLINE GSVector2 yy() const { return GSVector2(_mm_shuffle_ps(m, m, _MM_SHUFFLE(3, 2, 1, 1))); } 777 }; 778 779 class alignas(16) GSVector4i 780 { 781 struct cxpr_init_tag 782 { 783 }; 784 static constexpr cxpr_init_tag cxpr_init{}; 785 786 constexpr GSVector4i(cxpr_init_tag, s32 x, s32 y, s32 z, s32 w) : S32{x, y, z, w} {} 787 788 constexpr GSVector4i(cxpr_init_tag, s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7) 789 : S16{s0, s1, s2, s3, s4, s5, s6, s7} 790 { 791 } 792 793 constexpr GSVector4i(cxpr_init_tag, s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9, s8 b10, 794 s8 b11, s8 b12, s8 b13, s8 b14, s8 b15) 795 : S8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15} 796 { 797 } 798 799 public: 800 union 801 { 802 struct 803 { 804 s32 x, y, z, w; 805 }; 806 struct 807 { 808 s32 r, g, b, a; 809 }; 810 struct 811 { 812 s32 left, top, right, bottom; 813 }; 814 float F32[4]; 815 s8 S8[16]; 816 s16 S16[8]; 817 s32 S32[4]; 818 s64 S64[2]; 819 u8 U8[16]; 820 u16 U16[8]; 821 u32 U32[4]; 822 u64 U64[2]; 823 __m128i m; 824 }; 825 826 GSVector4i() = default; 827 828 ALWAYS_INLINE constexpr static GSVector4i cxpr(s32 x, s32 y, s32 z, s32 w) 829 { 830 return GSVector4i(cxpr_init, x, y, z, w); 831 } 832 ALWAYS_INLINE constexpr static GSVector4i cxpr(s32 x) { return GSVector4i(cxpr_init, x, x, x, x); } 833 834 ALWAYS_INLINE constexpr static GSVector4i cxpr16(s16 x) { return GSVector4i(cxpr_init, x, x, x, x, x, x, x, x); } 835 ALWAYS_INLINE constexpr static GSVector4i cxpr16(s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7) 836 { 837 return GSVector4i(cxpr_init, s0, s1, s2, s3, s4, s5, s6, s7); 838 } 839 840 ALWAYS_INLINE constexpr static GSVector4i cxpr8(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9, 841 s8 b10, s8 b11, s8 b12, s8 b13, s8 b14, s8 b15) 842 { 843 return GSVector4i(cxpr_init, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15); 844 } 845 846 ALWAYS_INLINE GSVector4i(s32 x, s32 y, s32 z, s32 w) { m = _mm_set_epi32(w, z, y, x); } 847 ALWAYS_INLINE GSVector4i(s32 x, s32 y) { *this = load(x).upl32(load(y)); } 848 ALWAYS_INLINE GSVector4i(s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7) 849 { 850 m = _mm_set_epi16(s7, s6, s5, s4, s3, s2, s1, s0); 851 } 852 853 ALWAYS_INLINE constexpr GSVector4i(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9, s8 b10, 854 s8 b11, s8 b12, s8 b13, s8 b14, s8 b15) 855 : S8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15} 856 { 857 } 858 859 ALWAYS_INLINE explicit GSVector4i(const GSVector2i& v) { m = v.m; } 860 861 ALWAYS_INLINE explicit GSVector4i(s32 i) { *this = i; } 862 863 ALWAYS_INLINE explicit GSVector4i(const GSVector2& v); 864 865 ALWAYS_INLINE explicit GSVector4i(const GSVector4& v); 866 867 ALWAYS_INLINE constexpr explicit GSVector4i(__m128i m) : m(m) {} 868 869 ALWAYS_INLINE GSVector4i& operator=(s32 i) 870 { 871 m = _mm_set1_epi32(i); 872 return *this; 873 } 874 ALWAYS_INLINE GSVector4i& operator=(__m128i m_) 875 { 876 m = m_; 877 return *this; 878 } 879 880 ALWAYS_INLINE operator __m128i() const { return m; } 881 882 ALWAYS_INLINE s32 width() const { return right - left; } 883 884 ALWAYS_INLINE s32 height() const { return bottom - top; } 885 886 ALWAYS_INLINE GSVector4i rsize() const 887 { 888 return sub32(xyxy()); // same as GSVector4i(0, 0, width(), height()); 889 } 890 891 ALWAYS_INLINE s32 rarea() const { return width() * height(); } 892 893 ALWAYS_INLINE bool rempty() const { return lt32(zwzw()).mask() != 0x00ff; } 894 895 ALWAYS_INLINE GSVector4i runion(const GSVector4i& v) const { return min_i32(v).upl64(max_i32(v).srl<8>()); } 896 897 ALWAYS_INLINE GSVector4i rintersect(const GSVector4i& v) const { return sat_i32(v); } 898 ALWAYS_INLINE bool rintersects(const GSVector4i& v) const { return !rintersect(v).rempty(); } 899 ALWAYS_INLINE bool rcontains(const GSVector4i& v) const { return rintersect(v).eq(v); } 900 901 ALWAYS_INLINE u32 rgba32() const 902 { 903 GSVector4i v = *this; 904 905 v = v.ps32(v); 906 v = v.pu16(v); 907 908 return (u32)store(v); 909 } 910 911 ALWAYS_INLINE GSVector4i sat_i8(const GSVector4i& min, const GSVector4i& max) const 912 { 913 return max_i8(min).min_i8(max); 914 } 915 ALWAYS_INLINE GSVector4i sat_i8(const GSVector4i& minmax) const 916 { 917 return max_i8(minmax.xyxy()).min_i8(minmax.zwzw()); 918 } 919 ALWAYS_INLINE GSVector4i sat_i16(const GSVector4i& min, const GSVector4i& max) const 920 { 921 return max_i16(min).min_i16(max); 922 } 923 ALWAYS_INLINE GSVector4i sat_i16(const GSVector4i& minmax) const 924 { 925 return max_i16(minmax.xyxy()).min_i16(minmax.zwzw()); 926 } 927 ALWAYS_INLINE GSVector4i sat_i32(const GSVector4i& min, const GSVector4i& max) const 928 { 929 return max_i32(min).min_i32(max); 930 } 931 ALWAYS_INLINE GSVector4i sat_i32(const GSVector4i& minmax) const 932 { 933 return max_i32(minmax.xyxy()).min_i32(minmax.zwzw()); 934 } 935 936 ALWAYS_INLINE GSVector4i sat_u8(const GSVector4i& min, const GSVector4i& max) const 937 { 938 return max_u8(min).min_u8(max); 939 } 940 ALWAYS_INLINE GSVector4i sat_u8(const GSVector4i& minmax) const 941 { 942 return max_u8(minmax.xyxy()).min_u8(minmax.zwzw()); 943 } 944 ALWAYS_INLINE GSVector4i sat_u16(const GSVector4i& min, const GSVector4i& max) const 945 { 946 return max_u16(min).min_u16(max); 947 } 948 ALWAYS_INLINE GSVector4i sat_u16(const GSVector4i& minmax) const 949 { 950 return max_u16(minmax.xyxy()).min_u16(minmax.zwzw()); 951 } 952 ALWAYS_INLINE GSVector4i sat_u32(const GSVector4i& min, const GSVector4i& max) const 953 { 954 return max_u32(min).min_u32(max); 955 } 956 ALWAYS_INLINE GSVector4i sat_u32(const GSVector4i& minmax) const 957 { 958 return max_u32(minmax.xyxy()).min_u32(minmax.zwzw()); 959 } 960 961 ALWAYS_INLINE GSVector4i min_i8(const GSVector4i& v) const { return GSVector4i(_mm_min_epi8(m, v)); } 962 ALWAYS_INLINE GSVector4i max_i8(const GSVector4i& v) const { return GSVector4i(_mm_max_epi8(m, v)); } 963 ALWAYS_INLINE GSVector4i min_i16(const GSVector4i& v) const { return GSVector4i(_mm_min_epi16(m, v)); } 964 ALWAYS_INLINE GSVector4i max_i16(const GSVector4i& v) const { return GSVector4i(_mm_max_epi16(m, v)); } 965 ALWAYS_INLINE GSVector4i min_i32(const GSVector4i& v) const { return GSVector4i(_mm_min_epi32(m, v)); } 966 ALWAYS_INLINE GSVector4i max_i32(const GSVector4i& v) const { return GSVector4i(_mm_max_epi32(m, v)); } 967 968 ALWAYS_INLINE GSVector4i min_u8(const GSVector4i& v) const { return GSVector4i(_mm_min_epu8(m, v)); } 969 ALWAYS_INLINE GSVector4i max_u8(const GSVector4i& v) const { return GSVector4i(_mm_max_epu8(m, v)); } 970 ALWAYS_INLINE GSVector4i min_u16(const GSVector4i& v) const { return GSVector4i(_mm_min_epu16(m, v)); } 971 ALWAYS_INLINE GSVector4i max_u16(const GSVector4i& v) const { return GSVector4i(_mm_max_epu16(m, v)); } 972 ALWAYS_INLINE GSVector4i min_u32(const GSVector4i& v) const { return GSVector4i(_mm_min_epu32(m, v)); } 973 ALWAYS_INLINE GSVector4i max_u32(const GSVector4i& v) const { return GSVector4i(_mm_max_epu32(m, v)); } 974 975 ALWAYS_INLINE GSVector4i madd_s16(const GSVector4i& v) const { return GSVector4i(_mm_madd_epi16(m, v.m)); } 976 977 ALWAYS_INLINE GSVector4i addp_s32() const { return GSVector4i(_mm_hadd_epi32(m, m)); } 978 979 ALWAYS_INLINE s32 addv_s32() const 980 { 981 const __m128i pairs = _mm_hadd_epi32(m, m); 982 return _mm_cvtsi128_si32(_mm_hadd_epi32(pairs, pairs)); 983 } 984 985 ALWAYS_INLINE u8 minv_u8() const 986 { 987 __m128i vmin = _mm_min_epu8(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 3, 2))); 988 vmin = _mm_min_epu8(vmin, _mm_shuffle_epi32(vmin, _MM_SHUFFLE(1, 1, 1, 1))); 989 return static_cast<u8>(std::min( 990 static_cast<u32>(_mm_extract_epi8(vmin, 0)), 991 std::min(static_cast<u32>(_mm_extract_epi8(vmin, 1)), 992 std::min(static_cast<u32>(_mm_extract_epi8(vmin, 2)), static_cast<u32>(_mm_extract_epi8(vmin, 3)))))); 993 } 994 995 ALWAYS_INLINE u16 maxv_u8() const 996 { 997 __m128i vmax = _mm_max_epu8(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 3, 2))); 998 vmax = _mm_max_epu8(vmax, _mm_shuffle_epi32(vmax, _MM_SHUFFLE(1, 1, 1, 1))); 999 return static_cast<u8>(std::max( 1000 static_cast<u32>(_mm_extract_epi8(vmax, 0)), 1001 std::max(static_cast<u32>(_mm_extract_epi8(vmax, 1)), 1002 std::max(static_cast<u32>(_mm_extract_epi8(vmax, 2)), static_cast<u32>(_mm_extract_epi8(vmax, 3)))))); 1003 } 1004 1005 ALWAYS_INLINE u16 minv_u16() const 1006 { 1007 __m128i vmin = _mm_min_epu16(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 3, 2))); 1008 vmin = _mm_min_epu16(vmin, _mm_shuffle_epi32(vmin, _MM_SHUFFLE(1, 1, 1, 1))); 1009 return static_cast<u16>( 1010 std::min(static_cast<u32>(_mm_extract_epi16(vmin, 0)), static_cast<u32>(_mm_extract_epi16(vmin, 1)))); 1011 } 1012 1013 ALWAYS_INLINE u16 maxv_u16() const 1014 { 1015 __m128i vmax = _mm_max_epu16(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 3, 2))); 1016 vmax = _mm_max_epu16(vmax, _mm_shuffle_epi32(vmax, _MM_SHUFFLE(1, 1, 1, 1))); 1017 return static_cast<u16>( 1018 std::max<u32>(static_cast<u32>(_mm_extract_epi16(vmax, 0)), static_cast<u32>(_mm_extract_epi16(vmax, 1)))); 1019 } 1020 1021 ALWAYS_INLINE s32 minv_s32() const 1022 { 1023 const __m128i vmin = _mm_min_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 3, 2))); 1024 return std::min<s32>(_mm_extract_epi32(vmin, 0), _mm_extract_epi32(vmin, 1)); 1025 } 1026 1027 ALWAYS_INLINE u32 minv_u32() const 1028 { 1029 const __m128i vmin = _mm_min_epu32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 3, 2))); 1030 return std::min<u32>(_mm_extract_epi32(vmin, 0), _mm_extract_epi32(vmin, 1)); 1031 } 1032 1033 ALWAYS_INLINE s32 maxv_s32() const 1034 { 1035 const __m128i vmax = _mm_max_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 3, 2))); 1036 return std::max<s32>(_mm_extract_epi32(vmax, 0), _mm_extract_epi32(vmax, 1)); 1037 } 1038 1039 ALWAYS_INLINE u32 maxv_u32() const 1040 { 1041 const __m128i vmax = _mm_max_epu32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 3, 2))); 1042 return std::max<u32>(_mm_extract_epi32(vmax, 0), _mm_extract_epi32(vmax, 1)); 1043 } 1044 1045 ALWAYS_INLINE GSVector4i clamp8() const { return pu16().upl8(); } 1046 1047 ALWAYS_INLINE GSVector4i blend8(const GSVector4i& v, const GSVector4i& mask) const 1048 { 1049 return GSVector4i(_mm_blendv_epi8(m, v, mask)); 1050 } 1051 1052 template<s32 mask> 1053 ALWAYS_INLINE GSVector4i blend16(const GSVector4i& v) const 1054 { 1055 return GSVector4i(_mm_blend_epi16(m, v, mask)); 1056 } 1057 1058 template<s32 mask> 1059 ALWAYS_INLINE GSVector4i blend32(const GSVector4i& v) const 1060 { 1061 #if defined(__AVX2__) 1062 return GSVector4i(_mm_blend_epi32(m, v.m, mask)); 1063 #else 1064 constexpr s32 bit3 = ((mask & 8) * 3) << 3; 1065 constexpr s32 bit2 = ((mask & 4) * 3) << 2; 1066 constexpr s32 bit1 = ((mask & 2) * 3) << 1; 1067 constexpr s32 bit0 = (mask & 1) * 3; 1068 return blend16<bit3 | bit2 | bit1 | bit0>(v); 1069 #endif 1070 } 1071 1072 ALWAYS_INLINE GSVector4i blend(const GSVector4i& v, const GSVector4i& mask) const 1073 { 1074 return GSVector4i(_mm_or_si128(_mm_andnot_si128(mask, m), _mm_and_si128(mask, v))); 1075 } 1076 1077 ALWAYS_INLINE GSVector4i mix16(const GSVector4i& v) const { return blend16<0xaa>(v); } 1078 1079 ALWAYS_INLINE GSVector4i shuffle8(const GSVector4i& mask) const { return GSVector4i(_mm_shuffle_epi8(m, mask)); } 1080 1081 ALWAYS_INLINE GSVector4i ps16(const GSVector4i& v) const { return GSVector4i(_mm_packs_epi16(m, v)); } 1082 ALWAYS_INLINE GSVector4i ps16() const { return GSVector4i(_mm_packs_epi16(m, m)); } 1083 ALWAYS_INLINE GSVector4i pu16(const GSVector4i& v) const { return GSVector4i(_mm_packus_epi16(m, v)); } 1084 ALWAYS_INLINE GSVector4i pu16() const { return GSVector4i(_mm_packus_epi16(m, m)); } 1085 ALWAYS_INLINE GSVector4i ps32(const GSVector4i& v) const { return GSVector4i(_mm_packs_epi32(m, v)); } 1086 ALWAYS_INLINE GSVector4i ps32() const { return GSVector4i(_mm_packs_epi32(m, m)); } 1087 ALWAYS_INLINE GSVector4i pu32(const GSVector4i& v) const { return GSVector4i(_mm_packus_epi32(m, v)); } 1088 ALWAYS_INLINE GSVector4i pu32() const { return GSVector4i(_mm_packus_epi32(m, m)); } 1089 1090 ALWAYS_INLINE GSVector4i upl8(const GSVector4i& v) const { return GSVector4i(_mm_unpacklo_epi8(m, v)); } 1091 ALWAYS_INLINE GSVector4i uph8(const GSVector4i& v) const { return GSVector4i(_mm_unpackhi_epi8(m, v)); } 1092 ALWAYS_INLINE GSVector4i upl16(const GSVector4i& v) const { return GSVector4i(_mm_unpacklo_epi16(m, v)); } 1093 ALWAYS_INLINE GSVector4i uph16(const GSVector4i& v) const { return GSVector4i(_mm_unpackhi_epi16(m, v)); } 1094 ALWAYS_INLINE GSVector4i upl32(const GSVector4i& v) const { return GSVector4i(_mm_unpacklo_epi32(m, v)); } 1095 ALWAYS_INLINE GSVector4i uph32(const GSVector4i& v) const { return GSVector4i(_mm_unpackhi_epi32(m, v)); } 1096 ALWAYS_INLINE GSVector4i upl64(const GSVector4i& v) const { return GSVector4i(_mm_unpacklo_epi64(m, v)); } 1097 ALWAYS_INLINE GSVector4i uph64(const GSVector4i& v) const { return GSVector4i(_mm_unpackhi_epi64(m, v)); } 1098 1099 ALWAYS_INLINE GSVector4i upl8() const { return GSVector4i(_mm_unpacklo_epi8(m, _mm_setzero_si128())); } 1100 ALWAYS_INLINE GSVector4i uph8() const { return GSVector4i(_mm_unpackhi_epi8(m, _mm_setzero_si128())); } 1101 1102 ALWAYS_INLINE GSVector4i upl16() const { return GSVector4i(_mm_unpacklo_epi16(m, _mm_setzero_si128())); } 1103 ALWAYS_INLINE GSVector4i uph16() const { return GSVector4i(_mm_unpackhi_epi16(m, _mm_setzero_si128())); } 1104 1105 ALWAYS_INLINE GSVector4i upl32() const { return GSVector4i(_mm_unpacklo_epi32(m, _mm_setzero_si128())); } 1106 1107 ALWAYS_INLINE GSVector4i uph32() const { return GSVector4i(_mm_unpackhi_epi32(m, _mm_setzero_si128())); } 1108 ALWAYS_INLINE GSVector4i upl64() const { return GSVector4i(_mm_unpacklo_epi64(m, _mm_setzero_si128())); } 1109 ALWAYS_INLINE GSVector4i uph64() const { return GSVector4i(_mm_unpackhi_epi64(m, _mm_setzero_si128())); } 1110 1111 ALWAYS_INLINE GSVector4i s8to16() const { return GSVector4i(_mm_cvtepi8_epi16(m)); } 1112 ALWAYS_INLINE GSVector4i s8to32() const { return GSVector4i(_mm_cvtepi8_epi32(m)); } 1113 ALWAYS_INLINE GSVector4i s8to64() const { return GSVector4i(_mm_cvtepi8_epi64(m)); } 1114 1115 #ifdef CPU_ARCH_SSE41 1116 ALWAYS_INLINE GSVector4i s16to32() const { return GSVector4i(_mm_cvtepi16_epi32(m)); } 1117 ALWAYS_INLINE GSVector4i s16to64() const { return GSVector4i(_mm_cvtepi16_epi64(m)); } 1118 ALWAYS_INLINE GSVector4i s32to64() const { return GSVector4i(_mm_cvtepi32_epi64(m)); } 1119 ALWAYS_INLINE GSVector4i u8to16() const { return GSVector4i(_mm_cvtepu8_epi16(m)); } 1120 ALWAYS_INLINE GSVector4i u8to32() const { return GSVector4i(_mm_cvtepu8_epi32(m)); } 1121 ALWAYS_INLINE GSVector4i u8to64() const { return GSVector4i(_mm_cvtepu16_epi64(m)); } 1122 ALWAYS_INLINE GSVector4i u16to32() const { return GSVector4i(_mm_cvtepu16_epi32(m)); } 1123 ALWAYS_INLINE GSVector4i u16to64() const { return GSVector4i(_mm_cvtepu16_epi64(m)); } 1124 ALWAYS_INLINE GSVector4i u32to64() const { return GSVector4i(_mm_cvtepu32_epi64(m)); } 1125 #endif 1126 1127 template<s32 i> 1128 ALWAYS_INLINE GSVector4i srl() const 1129 { 1130 return GSVector4i(_mm_srli_si128(m, i)); 1131 } 1132 1133 template<s32 i> 1134 ALWAYS_INLINE GSVector4i srl(const GSVector4i& v) 1135 { 1136 return GSVector4i(_mm_alignr_epi8(v.m, m, i)); 1137 } 1138 1139 template<s32 i> 1140 ALWAYS_INLINE GSVector4i sll() const 1141 { 1142 return GSVector4i(_mm_slli_si128(m, i)); 1143 } 1144 1145 template<s32 i> 1146 ALWAYS_INLINE GSVector4i sll16() const 1147 { 1148 return GSVector4i(_mm_slli_epi16(m, i)); 1149 } 1150 1151 ALWAYS_INLINE GSVector4i sll16(s32 i) const { return GSVector4i(_mm_sll_epi16(m, _mm_cvtsi32_si128(i))); } 1152 1153 #ifdef CPU_ARCH_AVX2 1154 ALWAYS_INLINE GSVector4i sllv16(const GSVector4i& v) const { return GSVector4i(_mm_sllv_epi16(m, v.m)); } 1155 #endif 1156 1157 template<s32 i> 1158 ALWAYS_INLINE GSVector4i srl16() const 1159 { 1160 return GSVector4i(_mm_srli_epi16(m, i)); 1161 } 1162 1163 ALWAYS_INLINE GSVector4i srl16(s32 i) const { return GSVector4i(_mm_srl_epi16(m, _mm_cvtsi32_si128(i))); } 1164 1165 #ifdef CPU_ARCH_AVX2 1166 ALWAYS_INLINE GSVector4i srlv16(const GSVector4i& v) const { return GSVector4i(_mm_srlv_epi16(m, v.m)); } 1167 #endif 1168 1169 template<s32 i> 1170 ALWAYS_INLINE GSVector4i sra16() const 1171 { 1172 return GSVector4i(_mm_srai_epi16(m, i)); 1173 } 1174 1175 ALWAYS_INLINE GSVector4i sra16(s32 i) const { return GSVector4i(_mm_sra_epi16(m, _mm_cvtsi32_si128(i))); } 1176 1177 #ifdef CPU_ARCH_AVX2 1178 ALWAYS_INLINE GSVector4i srav16(const GSVector4i& v) const { return GSVector4i(_mm_srav_epi16(m, v.m)); } 1179 #endif 1180 1181 template<s32 i> 1182 ALWAYS_INLINE GSVector4i sll32() const 1183 { 1184 return GSVector4i(_mm_slli_epi32(m, i)); 1185 } 1186 1187 ALWAYS_INLINE GSVector4i sll32(s32 i) const { return GSVector4i(_mm_sll_epi32(m, _mm_cvtsi32_si128(i))); } 1188 1189 #ifdef CPU_ARCH_AVX2 1190 ALWAYS_INLINE GSVector4i sllv32(const GSVector4i& v) const { return GSVector4i(_mm_sllv_epi32(m, v.m)); } 1191 #endif 1192 1193 template<s32 i> 1194 ALWAYS_INLINE GSVector4i srl32() const 1195 { 1196 return GSVector4i(_mm_srli_epi32(m, i)); 1197 } 1198 1199 ALWAYS_INLINE GSVector4i srl32(s32 i) const { return GSVector4i(_mm_srl_epi32(m, _mm_cvtsi32_si128(i))); } 1200 1201 #ifdef CPU_ARCH_AVX2 1202 ALWAYS_INLINE GSVector4i srlv32(const GSVector4i& v) const { return GSVector4i(_mm_srlv_epi32(m, v.m)); } 1203 #endif 1204 1205 template<s32 i> 1206 ALWAYS_INLINE GSVector4i sra32() const 1207 { 1208 return GSVector4i(_mm_srai_epi32(m, i)); 1209 } 1210 1211 ALWAYS_INLINE GSVector4i sra32(s32 i) const { return GSVector4i(_mm_sra_epi32(m, _mm_cvtsi32_si128(i))); } 1212 1213 #ifdef CPU_ARCH_AVX2 1214 ALWAYS_INLINE GSVector4i srav32(const GSVector4i& v) const { return GSVector4i(_mm_srav_epi32(m, v.m)); } 1215 #endif 1216 1217 template<s64 i> 1218 ALWAYS_INLINE GSVector4i sll64() const 1219 { 1220 return GSVector4i(_mm_slli_epi64(m, i)); 1221 } 1222 1223 ALWAYS_INLINE GSVector4i sll64(s32 i) const { return GSVector4i(_mm_sll_epi64(m, _mm_cvtsi32_si128(i))); } 1224 1225 #ifdef CPU_ARCH_AVX2 1226 ALWAYS_INLINE GSVector4i sllv64(const GSVector4i& v) const { return GSVector4i(_mm_sllv_epi64(m, v.m)); } 1227 #endif 1228 1229 template<s64 i> 1230 ALWAYS_INLINE GSVector4i srl64() const 1231 { 1232 return GSVector4i(_mm_srli_epi64(m, i)); 1233 } 1234 1235 ALWAYS_INLINE GSVector4i srl64(s32 i) const { return GSVector4i(_mm_srl_epi64(m, _mm_cvtsi32_si128(i))); } 1236 1237 #ifdef CPU_ARCH_AVX2 1238 ALWAYS_INLINE GSVector4i srlv64(const GSVector4i& v) const { return GSVector4i(_mm_srlv_epi64(m, v.m)); } 1239 #endif 1240 1241 template<s64 i> 1242 ALWAYS_INLINE GSVector4i sra64() const 1243 { 1244 return GSVector4i(_mm_srai_epi64(m, i)); 1245 } 1246 1247 ALWAYS_INLINE GSVector4i sra64(s32 i) const { return GSVector4i(_mm_sra_epi64(m, _mm_cvtsi32_si128(i))); } 1248 1249 #ifdef CPU_ARCH_AVX2 1250 ALWAYS_INLINE GSVector4i srav64(const GSVector4i& v) const { return GSVector4i(_mm_srav_epi64(m, v.m)); } 1251 #endif 1252 1253 ALWAYS_INLINE GSVector4i add8(const GSVector4i& v) const { return GSVector4i(_mm_add_epi8(m, v.m)); } 1254 ALWAYS_INLINE GSVector4i add16(const GSVector4i& v) const { return GSVector4i(_mm_add_epi16(m, v.m)); } 1255 ALWAYS_INLINE GSVector4i add32(const GSVector4i& v) const { return GSVector4i(_mm_add_epi32(m, v.m)); } 1256 ALWAYS_INLINE GSVector4i adds8(const GSVector4i& v) const { return GSVector4i(_mm_adds_epi8(m, v.m)); } 1257 ALWAYS_INLINE GSVector4i adds16(const GSVector4i& v) const { return GSVector4i(_mm_adds_epi16(m, v.m)); } 1258 ALWAYS_INLINE GSVector4i hadds16(const GSVector4i& v) const { return GSVector4i(_mm_hadds_epi16(m, v.m)); } 1259 ALWAYS_INLINE GSVector4i addus8(const GSVector4i& v) const { return GSVector4i(_mm_adds_epu8(m, v.m)); } 1260 ALWAYS_INLINE GSVector4i addus16(const GSVector4i& v) const { return GSVector4i(_mm_adds_epu16(m, v.m)); } 1261 1262 ALWAYS_INLINE GSVector4i sub8(const GSVector4i& v) const { return GSVector4i(_mm_sub_epi8(m, v.m)); } 1263 ALWAYS_INLINE GSVector4i sub16(const GSVector4i& v) const { return GSVector4i(_mm_sub_epi16(m, v.m)); } 1264 ALWAYS_INLINE GSVector4i sub32(const GSVector4i& v) const { return GSVector4i(_mm_sub_epi32(m, v.m)); } 1265 ALWAYS_INLINE GSVector4i subs8(const GSVector4i& v) const { return GSVector4i(_mm_subs_epi8(m, v.m)); } 1266 ALWAYS_INLINE GSVector4i subs16(const GSVector4i& v) const { return GSVector4i(_mm_subs_epi16(m, v.m)); } 1267 ALWAYS_INLINE GSVector4i subus8(const GSVector4i& v) const { return GSVector4i(_mm_subs_epu8(m, v.m)); } 1268 ALWAYS_INLINE GSVector4i subus16(const GSVector4i& v) const { return GSVector4i(_mm_subs_epu16(m, v.m)); } 1269 1270 ALWAYS_INLINE GSVector4i avg8(const GSVector4i& v) const { return GSVector4i(_mm_avg_epu8(m, v.m)); } 1271 ALWAYS_INLINE GSVector4i avg16(const GSVector4i& v) const { return GSVector4i(_mm_avg_epu16(m, v.m)); } 1272 1273 ALWAYS_INLINE GSVector4i mul16hs(const GSVector4i& v) const { return GSVector4i(_mm_mulhi_epi16(m, v.m)); } 1274 ALWAYS_INLINE GSVector4i mul16hu(const GSVector4i& v) const { return GSVector4i(_mm_mulhi_epu16(m, v.m)); } 1275 ALWAYS_INLINE GSVector4i mul16l(const GSVector4i& v) const { return GSVector4i(_mm_mullo_epi16(m, v.m)); } 1276 ALWAYS_INLINE GSVector4i mul16hrs(const GSVector4i& v) const { return GSVector4i(_mm_mulhrs_epi16(m, v.m)); } 1277 ALWAYS_INLINE GSVector4i mul32l(const GSVector4i& v) const { return GSVector4i(_mm_mullo_epi32(m, v.m)); } 1278 1279 ALWAYS_INLINE bool eq(const GSVector4i& v) const 1280 { 1281 const GSVector4i t = *this ^ v; 1282 return _mm_testz_si128(t, t) != 0; 1283 } 1284 1285 ALWAYS_INLINE GSVector4i eq8(const GSVector4i& v) const { return GSVector4i(_mm_cmpeq_epi8(m, v.m)); } 1286 ALWAYS_INLINE GSVector4i eq16(const GSVector4i& v) const { return GSVector4i(_mm_cmpeq_epi16(m, v.m)); } 1287 ALWAYS_INLINE GSVector4i eq32(const GSVector4i& v) const { return GSVector4i(_mm_cmpeq_epi32(m, v.m)); } 1288 ALWAYS_INLINE GSVector4i eq64(const GSVector4i& v) const { return GSVector4i(_mm_cmpeq_epi64(m, v.m)); } 1289 1290 ALWAYS_INLINE GSVector4i neq8(const GSVector4i& v) const { return ~eq8(v); } 1291 ALWAYS_INLINE GSVector4i neq16(const GSVector4i& v) const { return ~eq16(v); } 1292 ALWAYS_INLINE GSVector4i neq32(const GSVector4i& v) const { return ~eq32(v); } 1293 1294 ALWAYS_INLINE GSVector4i gt8(const GSVector4i& v) const { return GSVector4i(_mm_cmpgt_epi8(m, v.m)); } 1295 ALWAYS_INLINE GSVector4i gt16(const GSVector4i& v) const { return GSVector4i(_mm_cmpgt_epi16(m, v.m)); } 1296 ALWAYS_INLINE GSVector4i gt32(const GSVector4i& v) const { return GSVector4i(_mm_cmpgt_epi32(m, v.m)); } 1297 1298 ALWAYS_INLINE GSVector4i ge8(const GSVector4i& v) const { return ~GSVector4i(_mm_cmplt_epi8(m, v.m)); } 1299 ALWAYS_INLINE GSVector4i ge16(const GSVector4i& v) const { return ~GSVector4i(_mm_cmplt_epi16(m, v.m)); } 1300 ALWAYS_INLINE GSVector4i ge32(const GSVector4i& v) const { return ~GSVector4i(_mm_cmplt_epi32(m, v.m)); } 1301 1302 ALWAYS_INLINE GSVector4i lt8(const GSVector4i& v) const { return GSVector4i(_mm_cmplt_epi8(m, v.m)); } 1303 ALWAYS_INLINE GSVector4i lt16(const GSVector4i& v) const { return GSVector4i(_mm_cmplt_epi16(m, v.m)); } 1304 ALWAYS_INLINE GSVector4i lt32(const GSVector4i& v) const { return GSVector4i(_mm_cmplt_epi32(m, v.m)); } 1305 1306 ALWAYS_INLINE GSVector4i le8(const GSVector4i& v) const { return ~GSVector4i(_mm_cmpgt_epi8(m, v.m)); } 1307 ALWAYS_INLINE GSVector4i le16(const GSVector4i& v) const { return ~GSVector4i(_mm_cmpgt_epi16(m, v.m)); } 1308 ALWAYS_INLINE GSVector4i le32(const GSVector4i& v) const { return ~GSVector4i(_mm_cmpgt_epi32(m, v.m)); } 1309 1310 ALWAYS_INLINE GSVector4i andnot(const GSVector4i& v) const { return GSVector4i(_mm_andnot_si128(v.m, m)); } 1311 1312 ALWAYS_INLINE s32 mask() const { return _mm_movemask_epi8(m); } 1313 1314 ALWAYS_INLINE bool alltrue() const { return mask() == 0xffff; } 1315 1316 ALWAYS_INLINE bool allfalse() const { return _mm_testz_si128(m, m) != 0; } 1317 1318 template<s32 i> 1319 ALWAYS_INLINE GSVector4i insert8(s32 a) const 1320 { 1321 return GSVector4i(_mm_insert_epi8(m, a, i)); 1322 } 1323 1324 template<s32 i> 1325 ALWAYS_INLINE s32 extract8() const 1326 { 1327 return _mm_extract_epi8(m, i); 1328 } 1329 1330 template<s32 i> 1331 ALWAYS_INLINE GSVector4i insert16(s32 a) const 1332 { 1333 return GSVector4i(_mm_insert_epi16(m, a, i)); 1334 } 1335 1336 template<s32 i> 1337 ALWAYS_INLINE s32 extract16() const 1338 { 1339 return _mm_extract_epi16(m, i); 1340 } 1341 1342 template<s32 i> 1343 ALWAYS_INLINE GSVector4i insert32(s32 a) const 1344 { 1345 return GSVector4i(_mm_insert_epi32(m, a, i)); 1346 } 1347 1348 template<s32 i> 1349 ALWAYS_INLINE s32 extract32() const 1350 { 1351 if constexpr (i == 0) 1352 return GSVector4i::store(*this); 1353 1354 return _mm_extract_epi32(m, i); 1355 } 1356 1357 template<s32 i> 1358 ALWAYS_INLINE GSVector4i insert64(s64 a) const 1359 { 1360 return GSVector4i(_mm_insert_epi64(m, a, i)); 1361 } 1362 1363 template<s32 i> 1364 ALWAYS_INLINE s64 extract64() const 1365 { 1366 if (i == 0) 1367 return GSVector4i::storeq(*this); 1368 1369 return _mm_extract_epi64(m, i); 1370 } 1371 1372 ALWAYS_INLINE static GSVector4i loadnt(const void* p) 1373 { 1374 return GSVector4i(_mm_stream_load_si128(static_cast<const __m128i*>(p))); 1375 } 1376 1377 ALWAYS_INLINE static GSVector4i load32(const void* p) { return GSVector4i(_mm_loadu_si32(p)); } 1378 1379 ALWAYS_INLINE static GSVector4i loadl(const void* p) 1380 { 1381 return GSVector4i(_mm_loadl_epi64(static_cast<const __m128i*>(p))); 1382 } 1383 1384 ALWAYS_INLINE static GSVector4i loadh(const void* p) 1385 { 1386 return GSVector4i(_mm_castps_si128(_mm_loadh_pi(_mm_setzero_ps(), static_cast<const __m64*>(p)))); 1387 } 1388 1389 ALWAYS_INLINE static GSVector4i loadh(const GSVector2i& v) 1390 { 1391 return GSVector4i(_mm_unpacklo_epi64(_mm_setzero_si128(), v.m)); 1392 } 1393 1394 template<bool aligned> 1395 ALWAYS_INLINE static GSVector4i load(const void* p) 1396 { 1397 return GSVector4i(aligned ? _mm_load_si128(static_cast<const __m128i*>(p)) : 1398 _mm_loadu_si128(static_cast<const __m128i*>(p))); 1399 } 1400 1401 ALWAYS_INLINE static GSVector4i load(s32 i) { return GSVector4i(_mm_cvtsi32_si128(i)); } 1402 ALWAYS_INLINE static GSVector4i loadq(s64 i) { return GSVector4i(_mm_cvtsi64_si128(i)); } 1403 1404 ALWAYS_INLINE static void storent(void* p, const GSVector4i& v) { _mm_stream_si128(static_cast<__m128i*>(p), v.m); } 1405 ALWAYS_INLINE static void storel(void* p, const GSVector4i& v) { _mm_storel_epi64(static_cast<__m128i*>(p), v.m); } 1406 ALWAYS_INLINE static void storeh(void* p, const GSVector4i& v) 1407 { 1408 _mm_storeh_pi(static_cast<__m64*>(p), _mm_castsi128_ps(v.m)); 1409 } 1410 1411 ALWAYS_INLINE static void store(void* pl, void* ph, const GSVector4i& v) 1412 { 1413 GSVector4i::storel(pl, v); 1414 GSVector4i::storeh(ph, v); 1415 } 1416 1417 template<bool aligned> 1418 ALWAYS_INLINE static void store(void* p, const GSVector4i& v) 1419 { 1420 if constexpr (aligned) 1421 _mm_store_si128(static_cast<__m128i*>(p), v.m); 1422 else 1423 _mm_storeu_si128(static_cast<__m128i*>(p), v.m); 1424 } 1425 1426 ALWAYS_INLINE static void store32(void* p, const GSVector4i& v) { _mm_storeu_si32(p, v); } 1427 ALWAYS_INLINE static s32 store(const GSVector4i& v) { return _mm_cvtsi128_si32(v.m); } 1428 ALWAYS_INLINE static s64 storeq(const GSVector4i& v) { return _mm_cvtsi128_si64(v.m); } 1429 1430 ALWAYS_INLINE GSVector4i& operator&=(const GSVector4i& v) 1431 { 1432 m = _mm_and_si128(m, v); 1433 return *this; 1434 } 1435 ALWAYS_INLINE GSVector4i& operator|=(const GSVector4i& v) 1436 { 1437 m = _mm_or_si128(m, v); 1438 return *this; 1439 } 1440 ALWAYS_INLINE GSVector4i& operator^=(const GSVector4i& v) 1441 { 1442 m = _mm_xor_si128(m, v); 1443 return *this; 1444 } 1445 1446 ALWAYS_INLINE friend GSVector4i operator&(const GSVector4i& v1, const GSVector4i& v2) 1447 { 1448 return GSVector4i(_mm_and_si128(v1, v2)); 1449 } 1450 1451 ALWAYS_INLINE friend GSVector4i operator|(const GSVector4i& v1, const GSVector4i& v2) 1452 { 1453 return GSVector4i(_mm_or_si128(v1, v2)); 1454 } 1455 1456 ALWAYS_INLINE friend GSVector4i operator^(const GSVector4i& v1, const GSVector4i& v2) 1457 { 1458 return GSVector4i(_mm_xor_si128(v1, v2)); 1459 } 1460 1461 ALWAYS_INLINE friend GSVector4i operator&(const GSVector4i& v, s32 i) { return v & GSVector4i(i); } 1462 ALWAYS_INLINE friend GSVector4i operator|(const GSVector4i& v, s32 i) { return v | GSVector4i(i); } 1463 ALWAYS_INLINE friend GSVector4i operator^(const GSVector4i& v, s32 i) { return v ^ GSVector4i(i); } 1464 ALWAYS_INLINE friend GSVector4i operator~(const GSVector4i& v) { return v ^ v.eq32(v); } 1465 1466 ALWAYS_INLINE static GSVector4i zero() { return GSVector4i(_mm_setzero_si128()); } 1467 ALWAYS_INLINE static GSVector4i cast(const GSVector4& v); 1468 1469 ALWAYS_INLINE GSVector4i xyxy(const GSVector4i& v) const { return upl64(v); } 1470 1471 ALWAYS_INLINE GSVector2i xy() const { return GSVector2i(m); } 1472 1473 ALWAYS_INLINE GSVector2i zw() const { return GSVector2i(_mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 3, 2))); } 1474 1475 // clang-format off 1476 1477 #define VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \ 1478 ALWAYS_INLINE GSVector4i xs##ys##zs##ws() const {return GSVector4i(_mm_shuffle_epi32(m, _MM_SHUFFLE(wn, zn, yn, xn)));} \ 1479 ALWAYS_INLINE GSVector4i xs##ys##zs##ws##l() const {return GSVector4i(_mm_shufflelo_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn)));} \ 1480 ALWAYS_INLINE GSVector4i xs##ys##zs##ws##h() const {return GSVector4i(_mm_shufflehi_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn)));} \ 1481 ALWAYS_INLINE GSVector4i xs##ys##zs##ws##lh() const {return GSVector4i(_mm_shufflehi_epi16(_mm_shufflelo_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn)), _MM_SHUFFLE(wn, zn, yn, xn)));} \ 1482 1483 #define VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \ 1484 VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \ 1485 VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \ 1486 VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2) \ 1487 VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3) \ 1488 1489 #define VECTOR4i_SHUFFLE_2(xs, xn, ys, yn) \ 1490 VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, x, 0) \ 1491 VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, y, 1) \ 1492 VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, z, 2) \ 1493 VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, w, 3) \ 1494 1495 #define VECTOR4i_SHUFFLE_1(xs, xn) \ 1496 VECTOR4i_SHUFFLE_2(xs, xn, x, 0) \ 1497 VECTOR4i_SHUFFLE_2(xs, xn, y, 1) \ 1498 VECTOR4i_SHUFFLE_2(xs, xn, z, 2) \ 1499 VECTOR4i_SHUFFLE_2(xs, xn, w, 3) \ 1500 1501 VECTOR4i_SHUFFLE_1(x, 0) 1502 VECTOR4i_SHUFFLE_1(y, 1) 1503 VECTOR4i_SHUFFLE_1(z, 2) 1504 VECTOR4i_SHUFFLE_1(w, 3) 1505 1506 // clang-format on 1507 }; 1508 1509 class alignas(16) GSVector4 1510 { 1511 struct cxpr_init_tag 1512 { 1513 }; 1514 static constexpr cxpr_init_tag cxpr_init{}; 1515 1516 constexpr GSVector4(cxpr_init_tag, float x, float y, float z, float w) : F32{x, y, z, w} {} 1517 1518 constexpr GSVector4(cxpr_init_tag, int x, int y, int z, int w) : I32{x, y, z, w} {} 1519 1520 constexpr GSVector4(cxpr_init_tag, u64 x, u64 y) : U64{x, y} {} 1521 1522 constexpr GSVector4(cxpr_init_tag, double x, double y) : F64{x, y} {} 1523 1524 public: 1525 union 1526 { 1527 struct 1528 { 1529 float x, y, z, w; 1530 }; 1531 struct 1532 { 1533 float r, g, b, a; 1534 }; 1535 struct 1536 { 1537 float left, top, right, bottom; 1538 }; 1539 float F32[4]; 1540 double F64[2]; 1541 s8 I8[16]; 1542 s16 I16[8]; 1543 s32 I32[4]; 1544 s64 I64[2]; 1545 u8 U8[16]; 1546 u16 U16[8]; 1547 u32 U32[4]; 1548 u64 U64[2]; 1549 __m128 m; 1550 }; 1551 1552 GSVector4() = default; 1553 1554 constexpr static GSVector4 cxpr(float x, float y, float z, float w) { return GSVector4(cxpr_init, x, y, z, w); } 1555 constexpr static GSVector4 cxpr(float x) { return GSVector4(cxpr_init, x, x, x, x); } 1556 constexpr static GSVector4 cxpr(int x, int y, int z, int w) { return GSVector4(cxpr_init, x, y, z, w); } 1557 constexpr static GSVector4 cxpr(int x) { return GSVector4(cxpr_init, x, x, x, x); } 1558 1559 constexpr static GSVector4 cxpr64(u64 x, u64 y) { return GSVector4(cxpr_init, x, y); } 1560 constexpr static GSVector4 cxpr64(u64 x) { return GSVector4(cxpr_init, x, x); } 1561 1562 constexpr static GSVector4 cxpr64(double x, double y) { return GSVector4(cxpr_init, x, y); } 1563 constexpr static GSVector4 cxpr64(double x) { return GSVector4(cxpr_init, x, x); } 1564 1565 ALWAYS_INLINE GSVector4(float x, float y, float z, float w) { m = _mm_set_ps(w, z, y, x); } 1566 ALWAYS_INLINE GSVector4(float x, float y) { m = _mm_unpacklo_ps(_mm_load_ss(&x), _mm_load_ss(&y)); } 1567 ALWAYS_INLINE GSVector4(int x, int y, int z, int w) 1568 { 1569 GSVector4i v_(x, y, z, w); 1570 m = _mm_cvtepi32_ps(v_.m); 1571 } 1572 ALWAYS_INLINE GSVector4(int x, int y) 1573 { 1574 m = _mm_cvtepi32_ps(_mm_unpacklo_epi32(_mm_cvtsi32_si128(x), _mm_cvtsi32_si128(y))); 1575 } 1576 1577 ALWAYS_INLINE explicit GSVector4(const GSVector2& v) : m(v.m) {} 1578 ALWAYS_INLINE explicit GSVector4(const GSVector2i& v) 1579 : m(_mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(_mm_cvtepi32_ps(v.m)), _mm_setzero_pd()))) 1580 { 1581 } 1582 1583 ALWAYS_INLINE constexpr explicit GSVector4(__m128 m) : m(m) {} 1584 1585 ALWAYS_INLINE explicit GSVector4(__m128d m) : m(_mm_castpd_ps(m)) {} 1586 1587 ALWAYS_INLINE explicit GSVector4(float f) { *this = f; } 1588 1589 ALWAYS_INLINE explicit GSVector4(int i) 1590 { 1591 #ifdef CPU_ARCH_AVX2 1592 m = _mm_cvtepi32_ps(_mm_broadcastd_epi32(_mm_cvtsi32_si128(i))); 1593 #else 1594 *this = GSVector4(GSVector4i(i)); 1595 #endif 1596 } 1597 1598 ALWAYS_INLINE explicit GSVector4(const GSVector4i& v); 1599 1600 ALWAYS_INLINE static GSVector4 f64(double x, double y) { return GSVector4(_mm_castpd_ps(_mm_set_pd(y, x))); } 1601 ALWAYS_INLINE static GSVector4 f64(double x) { return GSVector4(_mm_castpd_ps(_mm_set1_pd(x))); } 1602 1603 ALWAYS_INLINE GSVector4& operator=(float f) 1604 { 1605 m = _mm_set1_ps(f); 1606 return *this; 1607 } 1608 1609 ALWAYS_INLINE GSVector4& operator=(__m128 m_) 1610 { 1611 this->m = m_; 1612 return *this; 1613 } 1614 1615 ALWAYS_INLINE operator __m128() const { return m; } 1616 1617 u32 rgba32() const { return GSVector4i(*this).rgba32(); } 1618 1619 ALWAYS_INLINE static GSVector4 rgba32(u32 rgba) { return GSVector4(GSVector4i::load((int)rgba).u8to32()); } 1620 1621 ALWAYS_INLINE static GSVector4 unorm8(u32 rgba) { return rgba32(rgba) * GSVector4::cxpr(1.0f / 255.0f); } 1622 1623 ALWAYS_INLINE GSVector4 abs() const { return *this & cast(GSVector4i::cxpr(0x7fffffff)); } 1624 1625 ALWAYS_INLINE GSVector4 neg() const { return *this ^ cast(GSVector4i::cxpr(0x80000000)); } 1626 1627 ALWAYS_INLINE GSVector4 rcp() const { return GSVector4(_mm_rcp_ps(m)); } 1628 1629 ALWAYS_INLINE GSVector4 rcpnr() const 1630 { 1631 GSVector4 v_ = rcp(); 1632 1633 return (v_ + v_) - (v_ * v_) * *this; 1634 } 1635 1636 ALWAYS_INLINE GSVector4 floor() const 1637 { 1638 return GSVector4(_mm_round_ps(m, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)); 1639 } 1640 1641 ALWAYS_INLINE GSVector4 ceil() const { return GSVector4(_mm_round_ps(m, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)); } 1642 1643 ALWAYS_INLINE GSVector4 madd(const GSVector4& a_, const GSVector4& b_) const 1644 { 1645 #ifdef CPU_ARCH_AVX2 1646 return GSVector4(_mm_fmadd_ps(m, a_, b_)); 1647 #else 1648 return *this * a_ + b_; 1649 #endif 1650 } 1651 1652 ALWAYS_INLINE GSVector4 msub(const GSVector4& a_, const GSVector4& b_) const 1653 { 1654 #ifdef CPU_ARCH_AVX2 1655 return GSVector4(_mm_fmsub_ps(m, a_, b_)); 1656 #else 1657 return *this * a_ - b_; 1658 #endif 1659 } 1660 1661 ALWAYS_INLINE GSVector4 nmadd(const GSVector4& a_, const GSVector4& b_) const 1662 { 1663 #ifdef CPU_ARCH_AVX2 1664 return GSVector4(_mm_fnmadd_ps(m, a_, b_)); 1665 #else 1666 return b_ - *this * a_; 1667 #endif 1668 } 1669 1670 ALWAYS_INLINE GSVector4 nmsub(const GSVector4& a_, const GSVector4& b_) const 1671 { 1672 #ifdef CPU_ARCH_AVX2 1673 return GSVector4(_mm_fnmsub_ps(m, a_, b_)); 1674 #else 1675 return -b_ - *this * a_; 1676 #endif 1677 } 1678 1679 ALWAYS_INLINE GSVector4 addm(const GSVector4& a_, const GSVector4& b_) const 1680 { 1681 return a_.madd(b_, *this); // *this + a * b 1682 } 1683 1684 ALWAYS_INLINE GSVector4 subm(const GSVector4& a_, const GSVector4& b_) const 1685 { 1686 return a_.nmadd(b_, *this); // *this - a * b 1687 } 1688 1689 ALWAYS_INLINE GSVector4 hadd() const { return GSVector4(_mm_hadd_ps(m, m)); } 1690 1691 ALWAYS_INLINE GSVector4 hadd(const GSVector4& v) const { return GSVector4(_mm_hadd_ps(m, v.m)); } 1692 1693 ALWAYS_INLINE GSVector4 hsub() const { return GSVector4(_mm_hsub_ps(m, m)); } 1694 1695 ALWAYS_INLINE GSVector4 hsub(const GSVector4& v) const { return GSVector4(_mm_hsub_ps(m, v.m)); } 1696 1697 template<int i> 1698 ALWAYS_INLINE GSVector4 dp(const GSVector4& v) const 1699 { 1700 return GSVector4(_mm_dp_ps(m, v.m, i)); 1701 } 1702 1703 ALWAYS_INLINE GSVector4 sat(const GSVector4& min, const GSVector4& max) const 1704 { 1705 return GSVector4(_mm_min_ps(_mm_max_ps(m, min), max)); 1706 } 1707 1708 ALWAYS_INLINE GSVector4 sat(const GSVector4& v) const 1709 { 1710 return GSVector4(_mm_min_ps(_mm_max_ps(m, v.xyxy()), v.zwzw())); 1711 } 1712 1713 ALWAYS_INLINE GSVector4 sat(const float scale = 255) const { return sat(zero(), GSVector4(scale)); } 1714 1715 ALWAYS_INLINE GSVector4 clamp(const float scale = 255) const { return min(GSVector4(scale)); } 1716 1717 ALWAYS_INLINE GSVector4 min(const GSVector4& v) const { return GSVector4(_mm_min_ps(m, v)); } 1718 1719 ALWAYS_INLINE GSVector4 max(const GSVector4& v) const { return GSVector4(_mm_max_ps(m, v)); } 1720 1721 template<int mask> 1722 ALWAYS_INLINE GSVector4 blend32(const GSVector4& v) const 1723 { 1724 return GSVector4(_mm_blend_ps(m, v, mask)); 1725 } 1726 1727 ALWAYS_INLINE GSVector4 blend32(const GSVector4& v, const GSVector4& mask) const 1728 { 1729 return GSVector4(_mm_blendv_ps(m, v, mask)); 1730 } 1731 1732 ALWAYS_INLINE GSVector4 upl(const GSVector4& v) const { return GSVector4(_mm_unpacklo_ps(m, v)); } 1733 1734 ALWAYS_INLINE GSVector4 uph(const GSVector4& v) const { return GSVector4(_mm_unpackhi_ps(m, v)); } 1735 1736 ALWAYS_INLINE GSVector4 upld(const GSVector4& v) const 1737 { 1738 return GSVector4(_mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(m), _mm_castps_pd(v.m)))); 1739 } 1740 1741 ALWAYS_INLINE GSVector4 uphd(const GSVector4& v) const 1742 { 1743 return GSVector4(_mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(m), _mm_castps_pd(v.m)))); 1744 } 1745 1746 ALWAYS_INLINE GSVector4 l2h(const GSVector4& v) const { return GSVector4(_mm_movelh_ps(m, v)); } 1747 1748 ALWAYS_INLINE GSVector4 h2l(const GSVector4& v) const { return GSVector4(_mm_movehl_ps(m, v)); } 1749 1750 ALWAYS_INLINE GSVector4 andnot(const GSVector4& v) const { return GSVector4(_mm_andnot_ps(v.m, m)); } 1751 1752 ALWAYS_INLINE int mask() const { return _mm_movemask_ps(m); } 1753 1754 ALWAYS_INLINE bool alltrue() const { return mask() == 0xf; } 1755 1756 ALWAYS_INLINE bool allfalse() const 1757 { 1758 #ifdef CPU_ARCH_AVX2 1759 return _mm_testz_ps(m, m) != 0; 1760 #else 1761 const __m128i ii = _mm_castps_si128(m); 1762 return _mm_testz_si128(ii, ii) != 0; 1763 #endif 1764 } 1765 1766 ALWAYS_INLINE GSVector4 replace_nan(const GSVector4& v) const { return v.blend32(*this, *this == *this); } 1767 1768 template<int src, int dst> 1769 ALWAYS_INLINE GSVector4 insert32(const GSVector4& v) const 1770 { 1771 if constexpr (src == dst) 1772 return GSVector4(_mm_blend_ps(m, v.m, 1 << src)); 1773 else 1774 return GSVector4(_mm_insert_ps(m, v.m, _MM_MK_INSERTPS_NDX(src, dst, 0))); 1775 } 1776 1777 template<int i> 1778 ALWAYS_INLINE int extract32() const 1779 { 1780 return _mm_extract_ps(m, i); 1781 } 1782 1783 template<int dst> 1784 ALWAYS_INLINE GSVector4 insert64(double v) const 1785 { 1786 if constexpr (dst == 0) 1787 return GSVector4(_mm_move_sd(_mm_castps_pd(m), _mm_load_pd(&v))); 1788 else 1789 return GSVector4(_mm_shuffle_pd(_mm_castps_pd(m), _mm_load_pd(&v), 0)); 1790 } 1791 1792 template<int src> 1793 ALWAYS_INLINE double extract64() const 1794 { 1795 double ret; 1796 if constexpr (src == 0) 1797 _mm_storel_pd(&ret, _mm_castps_pd(m)); 1798 else 1799 _mm_storeh_pd(&ret, _mm_castps_pd(m)); 1800 return ret; 1801 } 1802 1803 ALWAYS_INLINE static GSVector4 zero() { return GSVector4(_mm_setzero_ps()); } 1804 ALWAYS_INLINE static GSVector4 cast(const GSVector4i& v); 1805 1806 ALWAYS_INLINE static GSVector4 xffffffff() { return zero() == zero(); } 1807 1808 ALWAYS_INLINE static GSVector4 loadl(const void* p) 1809 { 1810 return GSVector4(_mm_castpd_ps(_mm_load_sd(static_cast<const double*>(p)))); 1811 } 1812 1813 ALWAYS_INLINE static GSVector4 load(float f) { return GSVector4(_mm_load_ss(&f)); } 1814 1815 template<bool aligned> 1816 ALWAYS_INLINE static GSVector4 load(const void* p) 1817 { 1818 return GSVector4(aligned ? _mm_load_ps(static_cast<const float*>(p)) : _mm_loadu_ps(static_cast<const float*>(p))); 1819 } 1820 1821 ALWAYS_INLINE static void storent(void* p, const GSVector4& v) { _mm_stream_ps(static_cast<float*>(p), v.m); } 1822 ALWAYS_INLINE static void storel(void* p, const GSVector4& v) 1823 { 1824 _mm_store_sd(static_cast<double*>(p), _mm_castps_pd(v.m)); 1825 } 1826 ALWAYS_INLINE static void storeh(void* p, const GSVector4& v) 1827 { 1828 _mm_storeh_pd(static_cast<double*>(p), _mm_castps_pd(v.m)); 1829 } 1830 1831 template<bool aligned> 1832 ALWAYS_INLINE static void store(void* p, const GSVector4& v) 1833 { 1834 if constexpr (aligned) 1835 _mm_store_ps(static_cast<float*>(p), v.m); 1836 else 1837 _mm_storeu_ps(static_cast<float*>(p), v.m); 1838 } 1839 1840 ALWAYS_INLINE static void store(float* p, const GSVector4& v) { _mm_store_ss(p, v.m); } 1841 1842 ALWAYS_INLINE GSVector4 operator-() const { return neg(); } 1843 1844 ALWAYS_INLINE GSVector4& operator+=(const GSVector4& v_) 1845 { 1846 m = _mm_add_ps(m, v_); 1847 return *this; 1848 } 1849 1850 ALWAYS_INLINE GSVector4& operator-=(const GSVector4& v_) 1851 { 1852 m = _mm_sub_ps(m, v_); 1853 return *this; 1854 } 1855 1856 ALWAYS_INLINE GSVector4& operator*=(const GSVector4& v_) 1857 { 1858 m = _mm_mul_ps(m, v_); 1859 return *this; 1860 } 1861 1862 ALWAYS_INLINE GSVector4& operator/=(const GSVector4& v_) 1863 { 1864 m = _mm_div_ps(m, v_); 1865 return *this; 1866 } 1867 1868 ALWAYS_INLINE GSVector4& operator+=(float f) 1869 { 1870 *this += GSVector4(f); 1871 return *this; 1872 } 1873 1874 ALWAYS_INLINE GSVector4& operator-=(float f) 1875 { 1876 *this -= GSVector4(f); 1877 return *this; 1878 } 1879 1880 ALWAYS_INLINE GSVector4& operator*=(float f) 1881 { 1882 *this *= GSVector4(f); 1883 return *this; 1884 } 1885 1886 ALWAYS_INLINE GSVector4& operator/=(float f) 1887 { 1888 *this /= GSVector4(f); 1889 return *this; 1890 } 1891 1892 ALWAYS_INLINE GSVector4& operator&=(const GSVector4& v_) 1893 { 1894 m = _mm_and_ps(m, v_); 1895 return *this; 1896 } 1897 1898 ALWAYS_INLINE GSVector4& operator|=(const GSVector4& v_) 1899 { 1900 m = _mm_or_ps(m, v_); 1901 return *this; 1902 } 1903 1904 ALWAYS_INLINE GSVector4& operator^=(const GSVector4& v_) 1905 { 1906 m = _mm_xor_ps(m, v_); 1907 return *this; 1908 } 1909 1910 ALWAYS_INLINE friend GSVector4 operator+(const GSVector4& v1, const GSVector4& v2) 1911 { 1912 return GSVector4(_mm_add_ps(v1, v2)); 1913 } 1914 1915 ALWAYS_INLINE friend GSVector4 operator-(const GSVector4& v1, const GSVector4& v2) 1916 { 1917 return GSVector4(_mm_sub_ps(v1, v2)); 1918 } 1919 1920 ALWAYS_INLINE friend GSVector4 operator*(const GSVector4& v1, const GSVector4& v2) 1921 { 1922 return GSVector4(_mm_mul_ps(v1, v2)); 1923 } 1924 1925 ALWAYS_INLINE friend GSVector4 operator/(const GSVector4& v1, const GSVector4& v2) 1926 { 1927 return GSVector4(_mm_div_ps(v1, v2)); 1928 } 1929 1930 ALWAYS_INLINE friend GSVector4 operator+(const GSVector4& v, float f) { return v + GSVector4(f); } 1931 1932 ALWAYS_INLINE friend GSVector4 operator-(const GSVector4& v, float f) { return v - GSVector4(f); } 1933 1934 ALWAYS_INLINE friend GSVector4 operator*(const GSVector4& v, float f) { return v * GSVector4(f); } 1935 1936 ALWAYS_INLINE friend GSVector4 operator/(const GSVector4& v, float f) { return v / GSVector4(f); } 1937 1938 ALWAYS_INLINE friend GSVector4 operator&(const GSVector4& v1, const GSVector4& v2) 1939 { 1940 return GSVector4(_mm_and_ps(v1, v2)); 1941 } 1942 1943 ALWAYS_INLINE friend GSVector4 operator|(const GSVector4& v1, const GSVector4& v2) 1944 { 1945 return GSVector4(_mm_or_ps(v1, v2)); 1946 } 1947 1948 ALWAYS_INLINE friend GSVector4 operator^(const GSVector4& v1, const GSVector4& v2) 1949 { 1950 return GSVector4(_mm_xor_ps(v1, v2)); 1951 } 1952 1953 ALWAYS_INLINE friend GSVector4 operator==(const GSVector4& v1, const GSVector4& v2) 1954 { 1955 return GSVector4(_mm_cmpeq_ps(v1, v2)); 1956 } 1957 1958 ALWAYS_INLINE friend GSVector4 operator!=(const GSVector4& v1, const GSVector4& v2) 1959 { 1960 return GSVector4(_mm_cmpneq_ps(v1, v2)); 1961 } 1962 1963 ALWAYS_INLINE friend GSVector4 operator>(const GSVector4& v1, const GSVector4& v2) 1964 { 1965 return GSVector4(_mm_cmpgt_ps(v1, v2)); 1966 } 1967 1968 ALWAYS_INLINE friend GSVector4 operator<(const GSVector4& v1, const GSVector4& v2) 1969 { 1970 return GSVector4(_mm_cmplt_ps(v1, v2)); 1971 } 1972 1973 ALWAYS_INLINE friend GSVector4 operator>=(const GSVector4& v1, const GSVector4& v2) 1974 { 1975 return GSVector4(_mm_cmpge_ps(v1, v2)); 1976 } 1977 1978 ALWAYS_INLINE friend GSVector4 operator<=(const GSVector4& v1, const GSVector4& v2) 1979 { 1980 return GSVector4(_mm_cmple_ps(v1, v2)); 1981 } 1982 1983 ALWAYS_INLINE GSVector4 mul64(const GSVector4& v_) const 1984 { 1985 return GSVector4(_mm_mul_pd(_mm_castps_pd(m), _mm_castps_pd(v_.m))); 1986 } 1987 1988 ALWAYS_INLINE GSVector4 add64(const GSVector4& v_) const 1989 { 1990 return GSVector4(_mm_add_pd(_mm_castps_pd(m), _mm_castps_pd(v_.m))); 1991 } 1992 1993 ALWAYS_INLINE GSVector4 sub64(const GSVector4& v_) const 1994 { 1995 return GSVector4(_mm_sub_pd(_mm_castps_pd(m), _mm_castps_pd(v_.m))); 1996 } 1997 1998 ALWAYS_INLINE GSVector4 div64(const GSVector4& v_) const 1999 { 2000 return GSVector4(_mm_div_pd(_mm_castps_pd(m), _mm_castps_pd(v_.m))); 2001 } 2002 2003 ALWAYS_INLINE GSVector4 gt64(const GSVector4& v2) const 2004 { 2005 return GSVector4(_mm_cmpgt_pd(_mm_castps_pd(m), _mm_castps_pd(v2.m))); 2006 } 2007 2008 ALWAYS_INLINE GSVector4 eq64(const GSVector4& v2) const 2009 { 2010 return GSVector4(_mm_cmpeq_pd(_mm_castps_pd(m), _mm_castps_pd(v2.m))); 2011 } 2012 2013 ALWAYS_INLINE GSVector4 lt64(const GSVector4& v2) const 2014 { 2015 return GSVector4(_mm_cmplt_pd(_mm_castps_pd(m), _mm_castps_pd(v2.m))); 2016 } 2017 2018 ALWAYS_INLINE GSVector4 ge64(const GSVector4& v2) const 2019 { 2020 return GSVector4(_mm_cmpge_pd(_mm_castps_pd(m), _mm_castps_pd(v2.m))); 2021 } 2022 2023 ALWAYS_INLINE GSVector4 le64(const GSVector4& v2) const 2024 { 2025 return GSVector4(_mm_cmple_pd(_mm_castps_pd(m), _mm_castps_pd(v2.m))); 2026 } 2027 2028 ALWAYS_INLINE GSVector4 min64(const GSVector4& v2) const 2029 { 2030 return GSVector4(_mm_min_pd(_mm_castps_pd(m), _mm_castps_pd(v2.m))); 2031 } 2032 2033 ALWAYS_INLINE GSVector4 max64(const GSVector4& v2) const 2034 { 2035 return GSVector4(_mm_max_pd(_mm_castps_pd(m), _mm_castps_pd(v2.m))); 2036 } 2037 2038 ALWAYS_INLINE GSVector4 abs64() const { return *this & GSVector4::cxpr64(static_cast<u64>(0x7FFFFFFFFFFFFFFFULL)); } 2039 2040 ALWAYS_INLINE GSVector4 neg64() const { return *this ^ GSVector4::cxpr64(static_cast<u64>(0x8000000000000000ULL)); } 2041 2042 ALWAYS_INLINE GSVector4 sqrt64() const { return GSVector4(_mm_sqrt_pd(_mm_castps_pd(m))); } 2043 2044 ALWAYS_INLINE GSVector4 sqr64() const { return GSVector4(_mm_mul_pd(_mm_castps_pd(m), _mm_castps_pd(m))); } 2045 2046 ALWAYS_INLINE GSVector4 floor64() const 2047 { 2048 return GSVector4(_mm_round_pd(_mm_castps_pd(m), _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)); 2049 } 2050 2051 ALWAYS_INLINE static GSVector4 f32to64(const GSVector4& v_) { return GSVector4(_mm_cvtps_pd(v_.m)); } 2052 2053 ALWAYS_INLINE static GSVector4 f32to64(const void* p) 2054 { 2055 return GSVector4(_mm_cvtps_pd(_mm_castpd_ps(_mm_load_sd(static_cast<const double*>(p))))); 2056 } 2057 2058 ALWAYS_INLINE GSVector4i f64toi32() const { return GSVector4i(_mm_cvttpd_epi32(_mm_castps_pd(m))); } 2059 2060 // clang-format off 2061 2062 #define VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \ 2063 ALWAYS_INLINE GSVector4 xs##ys##zs##ws() const { return GSVector4(_mm_shuffle_ps(m, m, _MM_SHUFFLE(wn, zn, yn, xn))); } \ 2064 ALWAYS_INLINE GSVector4 xs##ys##zs##ws(const GSVector4& v_) const { return GSVector4(_mm_shuffle_ps(m, v_.m, _MM_SHUFFLE(wn, zn, yn, xn))); } \ 2065 2066 #define VECTOR4_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \ 2067 VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \ 2068 VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \ 2069 VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2) \ 2070 VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3) \ 2071 2072 #define VECTOR4_SHUFFLE_2(xs, xn, ys, yn) \ 2073 VECTOR4_SHUFFLE_3(xs, xn, ys, yn, x, 0) \ 2074 VECTOR4_SHUFFLE_3(xs, xn, ys, yn, y, 1) \ 2075 VECTOR4_SHUFFLE_3(xs, xn, ys, yn, z, 2) \ 2076 VECTOR4_SHUFFLE_3(xs, xn, ys, yn, w, 3) \ 2077 2078 #define VECTOR4_SHUFFLE_1(xs, xn) \ 2079 VECTOR4_SHUFFLE_2(xs, xn, x, 0) \ 2080 VECTOR4_SHUFFLE_2(xs, xn, y, 1) \ 2081 VECTOR4_SHUFFLE_2(xs, xn, z, 2) \ 2082 VECTOR4_SHUFFLE_2(xs, xn, w, 3) \ 2083 2084 VECTOR4_SHUFFLE_1(x, 0) 2085 VECTOR4_SHUFFLE_1(y, 1) 2086 VECTOR4_SHUFFLE_1(z, 2) 2087 VECTOR4_SHUFFLE_1(w, 3) 2088 2089 // clang-format on 2090 2091 #if CPU_ARCH_AVX2 2092 2093 ALWAYS_INLINE GSVector4 broadcast32() const { return GSVector4(_mm_broadcastss_ps(m)); } 2094 2095 ALWAYS_INLINE static GSVector4 broadcast32(const GSVector4& v) { return GSVector4(_mm_broadcastss_ps(v.m)); } 2096 2097 ALWAYS_INLINE static GSVector4 broadcast32(const void* f) 2098 { 2099 return GSVector4(_mm_broadcastss_ps(_mm_load_ss(static_cast<const float*>(f)))); 2100 } 2101 2102 #endif 2103 2104 ALWAYS_INLINE static GSVector4 broadcast64(const void* d) 2105 { 2106 return GSVector4(_mm_loaddup_pd(static_cast<const double*>(d))); 2107 } 2108 }; 2109 2110 ALWAYS_INLINE GSVector2i::GSVector2i(const GSVector2& v) 2111 { 2112 m = _mm_cvttps_epi32(v); 2113 } 2114 2115 ALWAYS_INLINE GSVector2::GSVector2(const GSVector2i& v) 2116 { 2117 m = _mm_cvtepi32_ps(v); 2118 } 2119 2120 ALWAYS_INLINE GSVector2i GSVector2i::cast(const GSVector2& v) 2121 { 2122 return GSVector2i(_mm_castps_si128(v.m)); 2123 } 2124 2125 ALWAYS_INLINE GSVector2 GSVector2::cast(const GSVector2i& v) 2126 { 2127 return GSVector2(_mm_castsi128_ps(v.m)); 2128 } 2129 2130 ALWAYS_INLINE GSVector4i::GSVector4i(const GSVector4& v) 2131 { 2132 m = _mm_cvttps_epi32(v); 2133 } 2134 2135 ALWAYS_INLINE GSVector4::GSVector4(const GSVector4i& v) 2136 { 2137 m = _mm_cvtepi32_ps(v); 2138 } 2139 2140 ALWAYS_INLINE GSVector4i GSVector4i::cast(const GSVector4& v) 2141 { 2142 return GSVector4i(_mm_castps_si128(v.m)); 2143 } 2144 2145 ALWAYS_INLINE GSVector4 GSVector4::cast(const GSVector4i& v) 2146 { 2147 return GSVector4(_mm_castsi128_ps(v.m)); 2148 }