gsvector_neon.h
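// ---------------------------------------------------------------------------
// Illustrative usage sketch (added example, not part of the upstream header):
// the classes below wrap 64-bit and 128-bit NEON registers behind small value
// types. With this header included, typical call sites look like:
//
//   GSVector4i rc(10, 20, 120, 90);                                    // left/top/right/bottom
//   GSVector4i clamped = rc.sat_i32(GSVector4i::cxpr(0, 0, 640, 480)); // clamp rect to bounds
//   const s32 area = clamped.rsize().rarea();                          // width() * height()
//
//   GSVector2 uv(0.25f, 0.75f);
//   const float len2 = uv.dot(uv);                                     // horizontal sum of products
// ---------------------------------------------------------------------------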
// SPDX-FileCopyrightText: 2021-2024 Connor McLaughlin <stenzek@gmail.com>
// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)

#include "common/intrin.h"
#include "common/types.h"

#include <algorithm>
#include <cmath>
#include <cstring> // std::memcpy in load32()/store32()

#define GSVECTOR_HAS_UNSIGNED 1
#define GSVECTOR_HAS_SRLV 1

class GSVector2;
class GSVector2i;
class GSVector4;
class GSVector4i;

class alignas(16) GSVector2i
{
  struct cxpr_init_tag
  {
  };
  static constexpr cxpr_init_tag cxpr_init{};

  constexpr GSVector2i(cxpr_init_tag, s32 x, s32 y) : S32{x, y} {}

  constexpr GSVector2i(cxpr_init_tag, s16 s0, s16 s1, s16 s2, s16 s3) : S16{s0, s1, s2, s3} {}

  constexpr GSVector2i(cxpr_init_tag, s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7)
    : S8{b0, b1, b2, b3, b4, b5, b6, b7}
  {
  }

public:
  union
  {
    struct
    {
      s32 x, y;
    };
    struct
    {
      s32 r, g;
    };
    float F32[2];
    s8 S8[8];
    s16 S16[4];
    s32 S32[2];
    s64 S64[1];
    u8 U8[8];
    u16 U16[4];
    u32 U32[2];
    u64 U64[1];
    int32x2_t v2s;
  };

  GSVector2i() = default;

  ALWAYS_INLINE constexpr static GSVector2i cxpr(s32 x, s32 y) { return GSVector2i(cxpr_init, x, y); }

  ALWAYS_INLINE constexpr static GSVector2i cxpr(s32 x) { return GSVector2i(cxpr_init, x, x); }

  ALWAYS_INLINE constexpr static GSVector2i cxpr16(s16 x) { return GSVector2i(cxpr_init, x, x, x, x); }

  ALWAYS_INLINE constexpr static GSVector2i cxpr16(s16 s0, s16 s1, s16 s2, s16 s3)
  {
    return GSVector2i(cxpr_init, s0, s1, s2, s3);
  }

  ALWAYS_INLINE constexpr static GSVector2i cxpr8(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7)
  {
    return GSVector2i(cxpr_init, b0, b1, b2, b3, b4, b5, b6, b7);
  }

  ALWAYS_INLINE GSVector2i(s32 x, s32 y) { v2s = vset_lane_s32(y, vdup_n_s32(x), 1); }

  ALWAYS_INLINE GSVector2i(s16 s0, s16 s1, s16 s2, s16 s3) : S16{s0, s1, s2, s3} {}

  ALWAYS_INLINE constexpr GSVector2i(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7)
    : S8{b0, b1, b2, b3, b4, b5, b6, b7}
  {
  }

  // MSVC has bad codegen for the constexpr version when applied to non-constexpr things (https://godbolt.org/z/h8qbn7),
  // so leave the non-constexpr version default
  ALWAYS_INLINE explicit GSVector2i(int i) { *this = i; }

  ALWAYS_INLINE constexpr explicit GSVector2i(int32x2_t m) : v2s(m) {}

  ALWAYS_INLINE explicit GSVector2i(const GSVector2& v);

  ALWAYS_INLINE static GSVector2i cast(const GSVector2& v);

  ALWAYS_INLINE void operator=(int i) { v2s = vdup_n_s32(i); }

  ALWAYS_INLINE operator int32x2_t() const { return v2s; }

  ALWAYS_INLINE GSVector2i sat_i8(const GSVector2i& min, const GSVector2i& max) const
  {
    return max_i8(min).min_i8(max);
  }
  ALWAYS_INLINE GSVector2i sat_i16(const GSVector2i& min, const GSVector2i& max) const
  {
    return max_i16(min).min_i16(max);
  }
  ALWAYS_INLINE GSVector2i sat_i32(const GSVector2i& min, const GSVector2i& max) const
  {
    return max_i32(min).min_i32(max);
  }

  ALWAYS_INLINE GSVector2i sat_u8(const GSVector2i& min, const GSVector2i& max) const
  {
    return max_u8(min).min_u8(max);
  }
  ALWAYS_INLINE GSVector2i sat_u16(const GSVector2i& min, const GSVector2i& max) const
  {
    return max_u16(min).min_u16(max);
  }
  ALWAYS_INLINE GSVector2i sat_u32(const GSVector2i& min, const GSVector2i& max) const
  {
    return max_u32(min).min_u32(max);
  }

  ALWAYS_INLINE GSVector2i min_i8(const GSVector2i& v) const
{ 126 return GSVector2i(vreinterpret_s32_s8(vmin_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s)))); 127 } 128 129 ALWAYS_INLINE GSVector2i max_i8(const GSVector2i& v) const 130 { 131 return GSVector2i(vreinterpret_s32_s8(vmax_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s)))); 132 } 133 134 ALWAYS_INLINE GSVector2i min_i16(const GSVector2i& v) const 135 { 136 return GSVector2i(vreinterpret_s32_s16(vmin_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s)))); 137 } 138 139 ALWAYS_INLINE GSVector2i max_i16(const GSVector2i& v) const 140 { 141 return GSVector2i(vreinterpret_s32_s16(vmax_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s)))); 142 } 143 144 ALWAYS_INLINE GSVector2i min_i32(const GSVector2i& v) const { return GSVector2i(vmin_s32(v2s, v.v2s)); } 145 146 ALWAYS_INLINE GSVector2i max_i32(const GSVector2i& v) const { return GSVector2i(vmax_s32(v2s, v.v2s)); } 147 148 ALWAYS_INLINE GSVector2i min_u8(const GSVector2i& v) const 149 { 150 return GSVector2i(vreinterpret_s32_u8(vmin_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(v.v2s)))); 151 } 152 153 ALWAYS_INLINE GSVector2i max_u8(const GSVector2i& v) const 154 { 155 return GSVector2i(vreinterpret_s32_u8(vmax_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(v.v2s)))); 156 } 157 158 ALWAYS_INLINE GSVector2i min_u16(const GSVector2i& v) const 159 { 160 return GSVector2i(vreinterpret_s32_u16(vmin_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(v.v2s)))); 161 } 162 163 ALWAYS_INLINE GSVector2i max_u16(const GSVector2i& v) const 164 { 165 return GSVector2i(vreinterpret_s32_u16(vmax_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(v.v2s)))); 166 } 167 168 ALWAYS_INLINE GSVector2i min_u32(const GSVector2i& v) const 169 { 170 return GSVector2i(vreinterpret_s32_u32(vmin_u32(vreinterpret_u32_s32(v2s), vreinterpret_u32_s32(v.v2s)))); 171 } 172 173 ALWAYS_INLINE GSVector2i max_u32(const GSVector2i& v) const 174 { 175 return GSVector2i(vreinterpret_s32_u32(vmax_u32(vreinterpret_u32_s32(v2s), vreinterpret_u32_s32(v.v2s)))); 176 } 177 178 ALWAYS_INLINE s32 addv_s32() const 179 { 180 #ifdef CPU_ARCH_ARM64 181 return vaddv_s32(v2s); 182 #else 183 return vget_lane_s32(v2s, 0) + vget_lane_s32(v2s, 1); 184 #endif 185 } 186 187 #ifdef CPU_ARCH_ARM64 188 189 ALWAYS_INLINE u8 minv_u8() const { return vminv_u8(vreinterpret_u8_s32(v2s)); } 190 191 ALWAYS_INLINE u16 maxv_u8() const { return vmaxv_u8(vreinterpret_u8_s32(v2s)); } 192 193 ALWAYS_INLINE u16 minv_u16() const { return vminv_u16(vreinterpret_u16_s32(v2s)); } 194 195 ALWAYS_INLINE u16 maxv_u16() const { return vmaxv_u16(vreinterpret_u16_s32(v2s)); } 196 197 ALWAYS_INLINE s32 minv_s32() const { return vminv_s32(v2s); } 198 199 ALWAYS_INLINE u32 minv_u32() const { return vminv_u32(v2s); } 200 201 ALWAYS_INLINE s32 maxv_s32() const { return vmaxv_s32(v2s); } 202 203 ALWAYS_INLINE u32 maxv_u32() const { return vmaxv_u32(v2s); } 204 205 #else 206 207 ALWAYS_INLINE u8 minv_u8() const 208 { 209 uint8x8_t vmin = vmin_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(vdup_lane_s32(v2s, 1))); 210 return static_cast<u8>( 211 std::min(static_cast<u32>(vget_lane_u8(vmin, 0)), 212 std::min(static_cast<u32>(vget_lane_u8(vmin, 1)), 213 std::min(static_cast<u32>(vget_lane_u8(vmin, 2)), static_cast<u32>(vget_lane_u8(vmin, 3)))))); 214 } 215 216 ALWAYS_INLINE u16 maxv_u8() const 217 { 218 uint8x8_t vmax = vmax_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(vdup_lane_s32(v2s, 1))); 219 return static_cast<u8>( 220 std::max(static_cast<u32>(vget_lane_u8(vmax, 0)), 221 
std::max(static_cast<u32>(vget_lane_u8(vmax, 1)), 222 std::max(static_cast<u32>(vget_lane_u8(vmax, 2)), static_cast<u32>(vget_lane_u8(vmax, 3)))))); 223 } 224 225 ALWAYS_INLINE u16 minv_u16() const 226 { 227 uint16x4_t vmin = vmin_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(vdup_lane_s32(v2s, 1))); 228 return static_cast<u16>( 229 std::min(static_cast<u32>(vget_lane_u16(vmin, 0)), static_cast<u32>(vget_lane_u16(vmin, 1)))); 230 } 231 232 ALWAYS_INLINE u16 maxv_u16() const 233 { 234 uint16x4_t vmax = vmax_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(vdup_lane_s32(v2s, 1))); 235 return static_cast<u16>( 236 std::max<u32>(static_cast<u32>(vget_lane_u16(vmax, 0)), static_cast<u32>(vget_lane_u16(vmax, 1)))); 237 } 238 239 ALWAYS_INLINE s32 minv_s32() const { return std::min<s32>(vget_lane_s32(v2s, 0), vget_lane_s32(v2s, 1)); } 240 241 ALWAYS_INLINE u32 minv_u32() const 242 { 243 return std::min<u32>(vget_lane_u32(vreinterpret_u32_s32(v2s), 0), vget_lane_u32(vreinterpret_u32_s32(v2s), 1)); 244 } 245 246 ALWAYS_INLINE s32 maxv_s32() const { return std::max<s32>(vget_lane_s32(v2s, 0), vget_lane_s32(v2s, 1)); } 247 248 ALWAYS_INLINE u32 maxv_u32() const 249 { 250 return std::max<u32>(vget_lane_u32(vreinterpret_u32_s32(v2s), 0), vget_lane_u32(vreinterpret_u32_s32(v2s), 1)); 251 } 252 253 #endif 254 255 ALWAYS_INLINE GSVector2i clamp8() const { return pu16().upl8(); } 256 257 ALWAYS_INLINE GSVector2i blend8(const GSVector2i& a, const GSVector2i& mask) const 258 { 259 uint8x8_t mask2 = vreinterpret_u8_s8(vshr_n_s8(vreinterpret_s8_s32(mask.v2s), 7)); 260 return GSVector2i(vreinterpret_s32_u8(vbsl_u8(mask2, vreinterpret_u8_s32(a.v2s), vreinterpret_u8_s32(v2s)))); 261 } 262 263 template<int mask> 264 ALWAYS_INLINE GSVector2i blend16(const GSVector2i& a) const 265 { 266 static constexpr const uint16_t _mask[4] = { 267 ((mask) & (1 << 0)) ? (uint16_t)-1 : 0x0, ((mask) & (1 << 1)) ? (uint16_t)-1 : 0x0, 268 ((mask) & (1 << 2)) ? (uint16_t)-1 : 0x0, ((mask) & (1 << 3)) ? 
(uint16_t)-1 : 0x0};
    return GSVector2i(
      vreinterpret_s32_u16(vbsl_u16(vld1_u16(_mask), vreinterpret_u16_s32(a.v2s), vreinterpret_u16_s32(v2s))));
  }

  template<int mask>
  ALWAYS_INLINE GSVector2i blend32(const GSVector2i& v) const
  {
    constexpr int bit1 = ((mask & 2) * 3) << 1;
    constexpr int bit0 = (mask & 1) * 3;
    return blend16<bit1 | bit0>(v);
  }

  ALWAYS_INLINE GSVector2i blend(const GSVector2i& v, const GSVector2i& mask) const
  {
    return GSVector2i(vreinterpret_s32_s8(vorr_s8(vbic_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(mask.v2s)),
                                                  vand_s8(vreinterpret_s8_s32(mask.v2s), vreinterpret_s8_s32(v.v2s)))));
  }

  ALWAYS_INLINE GSVector2i mix16(const GSVector2i& v) const { return blend16<0xa>(v); }

  ALWAYS_INLINE GSVector2i shuffle8(const GSVector2i& mask) const
  {
    return GSVector2i(vreinterpret_s32_s8(vtbl1_s8(vreinterpret_s8_s32(v2s), vreinterpret_u8_s32(mask.v2s))));
  }

  ALWAYS_INLINE GSVector2i ps16() const
  {
    return GSVector2i(vreinterpret_s32_s8(vqmovn_s16(vcombine_s16(vreinterpret_s16_s32(v2s), vcreate_s16(0)))));
  }

  ALWAYS_INLINE GSVector2i pu16() const
  {
    return GSVector2i(vreinterpret_s32_u8(vqmovn_u16(vcombine_u16(vreinterpret_u16_s32(v2s), vcreate_u16(0)))));
  }

  ALWAYS_INLINE GSVector2i ps32() const
  {
    // saturating narrow from s32 to s16; the vector is widened with a zero upper half first
    return GSVector2i(vreinterpret_s32_s16(vqmovn_s32(vcombine_s32(v2s, vcreate_s32(0)))));
  }

  ALWAYS_INLINE GSVector2i pu32() const
  {
    return GSVector2i(vreinterpret_s32_u16(vqmovn_u32(vcombine_u32(vreinterpret_u32_s32(v2s), vcreate_u32(0)))));
  }

#ifdef CPU_ARCH_ARM64

  ALWAYS_INLINE GSVector2i upl8(const GSVector2i& v) const
  {
    return GSVector2i(vreinterpret_s32_s8(vzip1_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
  }

  ALWAYS_INLINE GSVector2i upl16(const GSVector2i& v) const
  {
    return GSVector2i(vreinterpret_s32_s16(vzip1_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
  }
  ALWAYS_INLINE GSVector2i upl32(const GSVector2i& v) const { return GSVector2i(vzip1_s32(v2s, v.v2s)); }

  ALWAYS_INLINE GSVector2i upl8() const
  {
    return GSVector2i(vreinterpret_s32_s8(vzip1_s8(vreinterpret_s8_s32(v2s), vdup_n_s8(0))));
  }

  ALWAYS_INLINE GSVector2i upl16() const
  {
    return GSVector2i(vreinterpret_s32_s16(vzip1_s16(vreinterpret_s16_s32(v2s), vdup_n_s16(0))));
  }

  ALWAYS_INLINE GSVector2i upl32() const { return GSVector2i(vzip1_s32(v2s, vdup_n_s32(0))); }

#else

  ALWAYS_INLINE GSVector2i upl8(const GSVector2i& v) const
  {
    return GSVector2i(vreinterpret_s32_s8(vzip_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s)).val[0]));
  }

  ALWAYS_INLINE GSVector2i upl16(const GSVector2i& v) const
  {
    return GSVector2i(vreinterpret_s32_s16(vzip_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s)).val[0]));
  }
  ALWAYS_INLINE GSVector2i upl32(const GSVector2i& v) const { return GSVector2i(vzip_s32(v2s, v.v2s).val[0]); }

  ALWAYS_INLINE GSVector2i upl8() const
  {
    return GSVector2i(vreinterpret_s32_s8(vzip_s8(vreinterpret_s8_s32(v2s), vdup_n_s8(0)).val[0]));
  }

  ALWAYS_INLINE GSVector2i upl16() const
  {
    return GSVector2i(vreinterpret_s32_s16(vzip_s16(vreinterpret_s16_s32(v2s), vdup_n_s16(0)).val[0]));
  }

  ALWAYS_INLINE GSVector2i upl32() const { return GSVector2i(vzip_s32(v2s, vdup_n_s32(0)).val[0]); }

#endif

  ALWAYS_INLINE GSVector2i
i8to16() const 367 { 368 return GSVector2i(vreinterpret_s32_s16(vget_low_s8(vmovl_s8(vreinterpret_s8_s32(v2s))))); 369 } 370 371 ALWAYS_INLINE GSVector2i u8to16() const 372 { 373 return GSVector2i(vreinterpret_s32_u16(vget_low_u8(vmovl_u8(vreinterpret_u8_s32(v2s))))); 374 } 375 376 template<int i> 377 ALWAYS_INLINE GSVector2i srl() const 378 { 379 return GSVector2i(vreinterpret_s32_s8(vext_s8(vreinterpret_s8_s32(v2s), vdup_n_s8(0), i))); 380 } 381 382 template<int i> 383 ALWAYS_INLINE GSVector2i sll() const 384 { 385 return GSVector2i(vreinterpret_s32_s8(vext_s8(vdup_n_s8(0), vreinterpret_s8_s32(v2s), 16 - i))); 386 } 387 388 template<int i> 389 ALWAYS_INLINE GSVector2i sll16() const 390 { 391 return GSVector2i(vreinterpret_s32_s16(vshl_n_s16(vreinterpret_s16_s32(v2s), i))); 392 } 393 394 ALWAYS_INLINE GSVector2i sll16(s32 i) const 395 { 396 return GSVector2i(vreinterpret_s32_s16(vshl_s16(vreinterpret_s16_s32(v2s), vdup_n_s16(i)))); 397 } 398 399 ALWAYS_INLINE GSVector2i sllv16(const GSVector2i& v) const 400 { 401 return GSVector2i(vreinterpret_s32_s16(vshl_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s)))); 402 } 403 404 template<int i> 405 ALWAYS_INLINE GSVector2i srl16() const 406 { 407 return GSVector2i(vreinterpret_s32_u16(vshr_n_u16(vreinterpret_u16_s32(v2s), i))); 408 } 409 410 ALWAYS_INLINE GSVector2i srl16(s32 i) const 411 { 412 return GSVector2i(vreinterpret_s32_u16(vshl_u16(vreinterpret_u16_s32(v2s), vdup_n_u16(-i)))); 413 } 414 415 ALWAYS_INLINE GSVector2i srlv16(const GSVector2i& v) const 416 { 417 return GSVector2i(vreinterpret_s32_s16(vshl_s16(vreinterpret_s16_s32(v2s), vneg_s16(vreinterpret_s16_s32(v.v2s))))); 418 } 419 420 template<int i> 421 ALWAYS_INLINE GSVector2i sra16() const 422 { 423 constexpr int count = (i & ~15) ? 
15 : i; 424 return GSVector2i(vreinterpret_s32_s16(vshr_n_s16(vreinterpret_s16_s32(v2s), count))); 425 } 426 427 ALWAYS_INLINE GSVector2i sra16(s32 i) const 428 { 429 return GSVector2i(vreinterpret_s32_s16(vshl_s16(vreinterpret_s16_s32(v2s), vdup_n_s16(-i)))); 430 } 431 432 ALWAYS_INLINE GSVector2i srav16(const GSVector2i& v) const 433 { 434 return GSVector2i(vreinterpret_s32_u16(vshl_u16(vreinterpret_u16_s32(v2s), vneg_s16(vreinterpret_s16_s32(v.v2s))))); 435 } 436 437 template<int i> 438 ALWAYS_INLINE GSVector2i sll32() const 439 { 440 return GSVector2i(vshl_n_s32(v2s, i)); 441 } 442 443 ALWAYS_INLINE GSVector2i sll32(s32 i) const { return GSVector2i(vshl_s32(v2s, vdup_n_s32(i))); } 444 445 ALWAYS_INLINE GSVector2i sllv32(const GSVector2i& v) const { return GSVector2i(vshl_s32(v2s, v.v2s)); } 446 447 template<int i> 448 ALWAYS_INLINE GSVector2i srl32() const 449 { 450 return GSVector2i(vreinterpret_s32_u32(vshr_n_u32(vreinterpret_u32_s32(v2s), i))); 451 } 452 453 ALWAYS_INLINE GSVector2i srl32(s32 i) const 454 { 455 return GSVector2i(vreinterpret_s32_u32(vshl_u32(vreinterpret_u32_s32(v2s), vdup_n_s32(-i)))); 456 } 457 458 ALWAYS_INLINE GSVector2i srlv32(const GSVector2i& v) const 459 { 460 return GSVector2i(vreinterpret_s32_u32(vshl_u32(vreinterpret_u32_s32(v2s), vneg_s32(v.v2s)))); 461 } 462 463 template<int i> 464 ALWAYS_INLINE GSVector2i sra32() const 465 { 466 return GSVector2i(vshr_n_s32(v2s, i)); 467 } 468 469 ALWAYS_INLINE GSVector2i sra32(s32 i) const { return GSVector2i(vshl_s32(v2s, vdup_n_s32(-i))); } 470 471 ALWAYS_INLINE GSVector2i srav32(const GSVector2i& v) const 472 { 473 return GSVector2i(vshl_s32(vreinterpret_u32_s32(v2s), vneg_s32(v.v2s))); 474 } 475 476 ALWAYS_INLINE GSVector2i add8(const GSVector2i& v) const 477 { 478 return GSVector2i(vreinterpret_s32_s8(vadd_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s)))); 479 } 480 481 ALWAYS_INLINE GSVector2i add16(const GSVector2i& v) const 482 { 483 return GSVector2i(vreinterpret_s32_s16(vadd_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s)))); 484 } 485 486 ALWAYS_INLINE GSVector2i add32(const GSVector2i& v) const { return GSVector2i(vadd_s32(v2s, v.v2s)); } 487 488 ALWAYS_INLINE GSVector2i adds8(const GSVector2i& v) const 489 { 490 return GSVector2i(vreinterpret_s32_s8(vqadd_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s)))); 491 } 492 493 ALWAYS_INLINE GSVector2i adds16(const GSVector2i& v) const 494 { 495 return GSVector2i(vreinterpret_s32_s16(vqadd_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s)))); 496 } 497 498 ALWAYS_INLINE GSVector2i addus8(const GSVector2i& v) const 499 { 500 return GSVector2i(vreinterpret_s32_u8(vqadd_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(v.v2s)))); 501 } 502 503 ALWAYS_INLINE GSVector2i addus16(const GSVector2i& v) const 504 { 505 return GSVector2i(vreinterpret_s32_u16(vqadd_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(v.v2s)))); 506 } 507 508 ALWAYS_INLINE GSVector2i sub8(const GSVector2i& v) const 509 { 510 return GSVector2i(vreinterpret_s32_s8(vsub_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s)))); 511 } 512 513 ALWAYS_INLINE GSVector2i sub16(const GSVector2i& v) const 514 { 515 return GSVector2i(vreinterpret_s32_s16(vsub_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s)))); 516 } 517 518 ALWAYS_INLINE GSVector2i sub32(const GSVector2i& v) const { return GSVector2i(vsub_s32(v2s, v.v2s)); } 519 520 ALWAYS_INLINE GSVector2i subs8(const GSVector2i& v) const 521 { 522 return 
GSVector2i(vreinterpret_s32_s8(vqsub_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s)))); 523 } 524 525 ALWAYS_INLINE GSVector2i subs16(const GSVector2i& v) const 526 { 527 return GSVector2i(vreinterpret_s32_s16(vqsub_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s)))); 528 } 529 530 ALWAYS_INLINE GSVector2i subus8(const GSVector2i& v) const 531 { 532 return GSVector2i(vreinterpret_s32_u8(vqsub_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(v.v2s)))); 533 } 534 535 ALWAYS_INLINE GSVector2i subus16(const GSVector2i& v) const 536 { 537 return GSVector2i(vreinterpret_s32_u16(vqsub_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(v.v2s)))); 538 } 539 540 ALWAYS_INLINE GSVector2i avg8(const GSVector2i& v) const 541 { 542 return GSVector2i(vreinterpret_s32_u8(vrhadd_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(v.v2s)))); 543 } 544 545 ALWAYS_INLINE GSVector2i avg16(const GSVector2i& v) const 546 { 547 return GSVector2i(vreinterpret_s32_u16(vrhadd_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(v.v2s)))); 548 } 549 550 ALWAYS_INLINE GSVector2i mul16l(const GSVector2i& v) const 551 { 552 return GSVector2i(vreinterpret_s32_s16(vmul_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s)))); 553 } 554 555 ALWAYS_INLINE GSVector2i mul32l(const GSVector2i& v) const { return GSVector2i(vmul_s32(v2s, v.v2s)); } 556 557 ALWAYS_INLINE bool eq(const GSVector2i& v) const 558 { 559 return (vget_lane_u64(vreinterpret_u64_s32(veor_s32(v2s, v.v2s)), 0) == 0); 560 } 561 562 ALWAYS_INLINE GSVector2i eq8(const GSVector2i& v) const 563 { 564 return GSVector2i(vreinterpret_s32_u8(vceq_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s)))); 565 } 566 567 ALWAYS_INLINE GSVector2i eq16(const GSVector2i& v) const 568 { 569 return GSVector2i(vreinterpret_s32_u16(vceq_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s)))); 570 } 571 572 ALWAYS_INLINE GSVector2i eq32(const GSVector2i& v) const 573 { 574 return GSVector2i(vreinterpret_s32_u32(vceq_s32(v2s, v.v2s))); 575 } 576 577 ALWAYS_INLINE GSVector2i neq8(const GSVector2i& v) const { return ~eq8(v); } 578 579 ALWAYS_INLINE GSVector2i neq16(const GSVector2i& v) const { return ~eq16(v); } 580 581 ALWAYS_INLINE GSVector2i neq32(const GSVector2i& v) const { return ~eq32(v); } 582 583 ALWAYS_INLINE GSVector2i gt8(const GSVector2i& v) const 584 { 585 return GSVector2i(vreinterpret_s32_s8(vcgt_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s)))); 586 } 587 588 ALWAYS_INLINE GSVector2i gt16(const GSVector2i& v) const 589 { 590 return GSVector2i(vreinterpret_s32_s16(vcgt_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s)))); 591 } 592 593 ALWAYS_INLINE GSVector2i gt32(const GSVector2i& v) const { return GSVector2i(vcgt_s32(v2s, v.v2s)); } 594 595 ALWAYS_INLINE GSVector2i ge8(const GSVector2i& v) const 596 { 597 return GSVector2i(vreinterpret_s32_s8(vcge_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s)))); 598 } 599 ALWAYS_INLINE GSVector2i ge16(const GSVector2i& v) const 600 { 601 return GSVector2i(vreinterpret_s32_s16(vcge_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s)))); 602 } 603 ALWAYS_INLINE GSVector2i ge32(const GSVector2i& v) const { return GSVector2i(vcge_s32(v2s, v.v2s)); } 604 605 ALWAYS_INLINE GSVector2i lt8(const GSVector2i& v) const 606 { 607 return GSVector2i(vreinterpret_s32_s8(vclt_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s)))); 608 } 609 610 ALWAYS_INLINE GSVector2i lt16(const GSVector2i& v) const 611 { 612 return 
GSVector2i(vreinterpret_s32_s16(vclt_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
  }

  ALWAYS_INLINE GSVector2i lt32(const GSVector2i& v) const { return GSVector2i(vclt_s32(v2s, v.v2s)); }

  ALWAYS_INLINE GSVector2i le8(const GSVector2i& v) const
  {
    return GSVector2i(vreinterpret_s32_s8(vcle_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
  }
  ALWAYS_INLINE GSVector2i le16(const GSVector2i& v) const
  {
    return GSVector2i(vreinterpret_s32_s16(vcle_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
  }
  ALWAYS_INLINE GSVector2i le32(const GSVector2i& v) const { return GSVector2i(vcle_s32(v2s, v.v2s)); }

  ALWAYS_INLINE GSVector2i andnot(const GSVector2i& v) const { return GSVector2i(vbic_s32(v2s, v.v2s)); }

  ALWAYS_INLINE int mask() const
  {
    // borrowed from sse2neon: funnel each byte's sign bit into a packed 8-bit movemask by
    // pairing neighbouring lanes with shift-and-accumulate at 16/32/64-bit widths
    const uint16x4_t high_bits = vreinterpret_u16_u8(vshr_n_u8(vreinterpret_u8_s32(v2s), 7));
    const uint32x2_t paired16 = vreinterpret_u32_u16(vsra_n_u16(high_bits, high_bits, 7));
    const uint64x1_t paired32 = vreinterpret_u64_u32(vsra_n_u32(paired16, paired16, 14));
    const uint8x8_t paired64 = vreinterpret_u8_u64(vsra_n_u64(paired32, paired32, 28));
    return static_cast<int>(vget_lane_u8(paired64, 0));
  }

  ALWAYS_INLINE bool alltrue() const
  {
    // MSB should be set in all 8-bit lanes.
#ifdef CPU_ARCH_ARM64
    return (vminv_u8(vreinterpret_u8_s32(v2s)) & 0x80) == 0x80;
#else
    return ((vget_lane_u32(vreinterpret_u32_s32(v2s), 0) & vget_lane_u32(vreinterpret_u32_s32(v2s), 1) & 0x80808080u) ==
            0x80808080u);
#endif
  }

  ALWAYS_INLINE bool allfalse() const
  {
    // MSB should be clear in all 8-bit lanes.
#ifdef CPU_ARCH_ARM64
    return (vmaxv_u8(vreinterpret_u8_s32(v2s)) & 0x80) != 0x80;
#else
    return ((vget_lane_u32(vreinterpret_u32_s32(v2s), 0) | vget_lane_u32(vreinterpret_u32_s32(v2s), 1) & 0x80808080u) ==
            0);
#endif
  }

  template<int i>
  ALWAYS_INLINE GSVector2i insert8(int a) const
  {
    return GSVector2i(vreinterpret_s32_u8(vset_lane_u8(a, vreinterpret_u8_s32(v2s), static_cast<uint8_t>(i))));
  }

  template<int i>
  ALWAYS_INLINE int extract8() const
  {
    return vget_lane_u8(vreinterpret_u8_s32(v2s), i);
  }

  template<int i>
  ALWAYS_INLINE GSVector2i insert16(int a) const
  {
    return GSVector2i(vreinterpret_s32_u16(vset_lane_u16(a, vreinterpret_u16_s32(v2s), static_cast<uint16_t>(i))));
  }

  template<int i>
  ALWAYS_INLINE int extract16() const
  {
    return vget_lane_u16(vreinterpret_u16_s32(v2s), i);
  }

  template<int i>
  ALWAYS_INLINE GSVector2i insert32(int a) const
  {
    return GSVector2i(vset_lane_s32(a, v2s, i));
  }

  template<int i>
  ALWAYS_INLINE int extract32() const
  {
    return vget_lane_s32(v2s, i);
  }

  ALWAYS_INLINE static GSVector2i load32(const void* p)
  {
    // should be ldr s0, [x0]
    u32 val;
    std::memcpy(&val, p, sizeof(u32));
    return GSVector2i(vset_lane_u32(val, vdup_n_u32(0), 0));
  }

  ALWAYS_INLINE static GSVector2i load(const void* p) { return GSVector2i(vld1_s32((const int32_t*)p)); }

  ALWAYS_INLINE static GSVector2i load(int i) { return GSVector2i(vset_lane_s32(i, vdup_n_s32(0), 0)); }

  ALWAYS_INLINE static void store32(void* p, const GSVector2i& v)
  {
    s32 val = vget_lane_s32(v, 0);
    std::memcpy(p, &val, sizeof(s32));
  }

  ALWAYS_INLINE static void store(void* p,
const GSVector2i& v) { vst1_s32((int32_t*)p, v.v2s); } 716 717 ALWAYS_INLINE static int store(const GSVector2i& v) { return vget_lane_s32(v.v2s, 0); } 718 719 ALWAYS_INLINE void operator&=(const GSVector2i& v) 720 { 721 v2s = vreinterpret_s32_s8(vand_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))); 722 } 723 724 ALWAYS_INLINE void operator|=(const GSVector2i& v) 725 { 726 v2s = vreinterpret_s32_s8(vorr_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))); 727 } 728 729 ALWAYS_INLINE void operator^=(const GSVector2i& v) 730 { 731 v2s = vreinterpret_s32_s8(veor_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))); 732 } 733 734 ALWAYS_INLINE friend GSVector2i operator&(const GSVector2i& v1, const GSVector2i& v2) 735 { 736 return GSVector2i(vreinterpret_s32_s8(vand_s8(vreinterpret_s8_s32(v1.v2s), vreinterpret_s8_s32(v2.v2s)))); 737 } 738 739 ALWAYS_INLINE friend GSVector2i operator|(const GSVector2i& v1, const GSVector2i& v2) 740 { 741 return GSVector2i(vreinterpret_s32_s8(vorr_s8(vreinterpret_s8_s32(v1.v2s), vreinterpret_s8_s32(v2.v2s)))); 742 } 743 744 ALWAYS_INLINE friend GSVector2i operator^(const GSVector2i& v1, const GSVector2i& v2) 745 { 746 return GSVector2i(vreinterpret_s32_s8(veor_s8(vreinterpret_s8_s32(v1.v2s), vreinterpret_s8_s32(v2.v2s)))); 747 } 748 749 ALWAYS_INLINE friend GSVector2i operator&(const GSVector2i& v, int i) { return v & GSVector2i(i); } 750 751 ALWAYS_INLINE friend GSVector2i operator|(const GSVector2i& v, int i) { return v | GSVector2i(i); } 752 753 ALWAYS_INLINE friend GSVector2i operator^(const GSVector2i& v, int i) { return v ^ GSVector2i(i); } 754 755 ALWAYS_INLINE friend GSVector2i operator~(const GSVector2i& v) { return GSVector2i(vmvn_s32(v.v2s)); } 756 757 ALWAYS_INLINE static GSVector2i zero() { return GSVector2i(0); } 758 759 ALWAYS_INLINE GSVector2i xy() const { return *this; } 760 ALWAYS_INLINE GSVector2i xx() const { return GSVector2i(__builtin_shufflevector(v2s, v2s, 0, 0)); } 761 ALWAYS_INLINE GSVector2i yx() const { return GSVector2i(__builtin_shufflevector(v2s, v2s, 1, 0)); } 762 ALWAYS_INLINE GSVector2i yy() const { return GSVector2i(__builtin_shufflevector(v2s, v2s, 1, 1)); } 763 }; 764 765 class alignas(16) GSVector2 766 { 767 struct cxpr_init_tag 768 { 769 }; 770 static constexpr cxpr_init_tag cxpr_init{}; 771 772 constexpr GSVector2(cxpr_init_tag, float x, float y) : F32{x, y} {} 773 774 constexpr GSVector2(cxpr_init_tag, int x, int y) : I32{x, y} {} 775 776 public: 777 union 778 { 779 struct 780 { 781 float x, y; 782 }; 783 struct 784 { 785 float r, g; 786 }; 787 float F32[2]; 788 double F64[1]; 789 s8 I8[8]; 790 s16 I16[4]; 791 s32 I32[2]; 792 s64 I64[1]; 793 u8 U8[8]; 794 u16 U16[4]; 795 u32 U32[2]; 796 u64 U64[1]; 797 float32x2_t v2s; 798 }; 799 800 GSVector2() = default; 801 802 constexpr static GSVector2 cxpr(float x, float y) { return GSVector2(cxpr_init, x, y); } 803 804 constexpr static GSVector2 cxpr(float x) { return GSVector2(cxpr_init, x, x); } 805 806 constexpr static GSVector2 cxpr(int x, int y) { return GSVector2(cxpr_init, x, y); } 807 808 constexpr static GSVector2 cxpr(int x) { return GSVector2(cxpr_init, x, x); } 809 810 ALWAYS_INLINE GSVector2(float x, float y) : v2s(vset_lane_f32(y, vdup_n_f32(x), 1)) {} 811 812 ALWAYS_INLINE GSVector2(int x, int y) : v2s(vcvt_f32_s32(vset_lane_s32(y, vdup_n_s32(x), 1))) {} 813 814 ALWAYS_INLINE constexpr explicit GSVector2(float32x2_t m) : v2s(m) {} 815 816 ALWAYS_INLINE explicit GSVector2(float f) { v2s = vdup_n_f32(f); } 817 818 ALWAYS_INLINE explicit 
GSVector2(int i) { v2s = vcvt_f32_s32(vdup_n_s32(i)); } 819 820 ALWAYS_INLINE explicit GSVector2(const GSVector2i& v); 821 822 ALWAYS_INLINE static GSVector2 cast(const GSVector2i& v); 823 824 ALWAYS_INLINE void operator=(float f) { v2s = vdup_n_f32(f); } 825 826 ALWAYS_INLINE void operator=(float32x2_t m) { v2s = m; } 827 828 ALWAYS_INLINE operator float32x2_t() const { return v2s; } 829 830 ALWAYS_INLINE GSVector2 abs() const { return GSVector2(vabs_f32(v2s)); } 831 ALWAYS_INLINE GSVector2 neg() const { return GSVector2(vneg_f32(v2s)); } 832 ALWAYS_INLINE GSVector2 rcp() const 833 { 834 float32x2_t recip = vrecpe_f32(v2s); 835 recip = vmul_f32(recip, vrecps_f32(recip, v2s)); 836 return GSVector2(recip); 837 } 838 839 #ifdef CPU_ARCH_ARM64 840 841 ALWAYS_INLINE GSVector2 floor() const { return GSVector2(vrndm_f32(v2s)); } 842 ALWAYS_INLINE GSVector2 ceil() const { return GSVector2(vrndp_f32(v2s)); } 843 844 #else 845 846 ALWAYS_INLINE GSVector2 floor() const 847 { 848 return GSVector2(std::floor(vget_lane_f32(v2s, 0)), std::floor(vget_lane_f32(v2s, 1))); 849 } 850 851 ALWAYS_INLINE GSVector2 ceil() const 852 { 853 return GSVector2(std::ceil(vget_lane_f32(v2s, 0)), std::ceil(vget_lane_f32(v2s, 1))); 854 } 855 856 #endif 857 858 ALWAYS_INLINE GSVector2 sat(const GSVector2& a, const GSVector2& b) const { return max(a).min(b); } 859 860 ALWAYS_INLINE GSVector2 sat(const float scale = 255) const { return sat(zero(), GSVector2(scale)); } 861 862 ALWAYS_INLINE GSVector2 clamp(const float scale = 255) const { return min(GSVector2(scale)); } 863 864 ALWAYS_INLINE GSVector2 min(const GSVector2& a) const { return GSVector2(vmin_f32(v2s, a.v2s)); } 865 866 ALWAYS_INLINE GSVector2 max(const GSVector2& a) const { return GSVector2(vmax_f32(v2s, a.v2s)); } 867 868 template<int mask> 869 ALWAYS_INLINE GSVector2 blend32(const GSVector2& a) const 870 { 871 return GSVector2(__builtin_shufflevector(v2s, a.v2s, (mask & 1) ? 4 : 0, (mask & 2) ? 
5 : 1));
  }

  ALWAYS_INLINE GSVector2 blend32(const GSVector2& a, const GSVector2& mask) const
  {
    // duplicate sign bit across and bit select
    const uint32x2_t bitmask = vreinterpret_u32_s32(vshr_n_s32(vreinterpret_s32_f32(mask.v2s), 31));
    return GSVector2(vbsl_f32(bitmask, a.v2s, v2s));
  }

  ALWAYS_INLINE GSVector2 andnot(const GSVector2& v) const
  {
    return GSVector2(vreinterpret_f32_s32(vbic_s32(vreinterpret_s32_f32(v2s), vreinterpret_s32_f32(v.v2s))));
  }

  ALWAYS_INLINE int mask() const
  {
    const uint32x2_t masks = vshr_n_u32(vreinterpret_u32_f32(v2s), 31);
    return (vget_lane_u32(masks, 0) | (vget_lane_u32(masks, 1) << 1));
  }

  ALWAYS_INLINE bool alltrue() const { return (vget_lane_u64(vreinterpret_u64_f32(v2s), 0) == 0xFFFFFFFFFFFFFFFFULL); }

  ALWAYS_INLINE bool allfalse() const { return (vget_lane_u64(vreinterpret_u64_f32(v2s), 0) == 0); }

  ALWAYS_INLINE GSVector2 replace_nan(const GSVector2& v) const { return v.blend32(*this, *this == *this); }

  template<int src, int dst>
  ALWAYS_INLINE GSVector2 insert32(const GSVector2& v) const
  {
#ifdef CPU_ARCH_ARM64
    return GSVector2(vcopy_lane_f32(v2s, dst, v.v2s, src));
#else
    return GSVector2(vset_lane_f32(vget_lane_f32(v.v2s, src), v2s, dst));
#endif
  }

  template<int i>
  ALWAYS_INLINE int extract32() const
  {
    return vget_lane_s32(vreinterpret_s32_f32(v2s), i);
  }

  ALWAYS_INLINE float dot(const GSVector2& v) const
  {
#ifdef CPU_ARCH_ARM64
    return vaddv_f32(vmul_f32(v2s, v.v2s));
#else
    const float32x2_t dp = vmul_f32(v2s, v.v2s);
    return vget_lane_f32(vadd_f32(dp, vdup_lane_f32(dp, 1)), 0);
#endif
  }

  ALWAYS_INLINE static GSVector2 zero() { return GSVector2(vdup_n_f32(0.0f)); }

  ALWAYS_INLINE static GSVector2 xffffffff() { return GSVector2(vreinterpret_f32_u32(vdup_n_u32(0xFFFFFFFFu))); }

  ALWAYS_INLINE static GSVector2 load(float f) { return GSVector2(vset_lane_f32(f, vmov_n_f32(0.0f), 0)); }

  ALWAYS_INLINE static GSVector2 load(const void* p) { return GSVector2(vld1_f32((const float*)p)); }

  ALWAYS_INLINE static void store(void* p, const GSVector2& v) { vst1_f32((float*)p, v.v2s); }

  ALWAYS_INLINE GSVector2 operator-() const { return neg(); }

  ALWAYS_INLINE void operator+=(const GSVector2& v) { v2s = vadd_f32(v2s, v.v2s); }
  ALWAYS_INLINE void operator-=(const GSVector2& v) { v2s = vsub_f32(v2s, v.v2s); }
  ALWAYS_INLINE void operator*=(const GSVector2& v) { v2s = vmul_f32(v2s, v.v2s); }
  ALWAYS_INLINE void operator/=(const GSVector2& v)
  {
#ifdef CPU_ARCH_ARM64
    v2s = vdiv_f32(v2s, v.v2s);
#else
    *this = GSVector2(vget_lane_f32(v2s, 0) / vget_lane_f32(v.v2s, 0), vget_lane_f32(v2s, 1) / vget_lane_f32(v.v2s, 1));
#endif
  }

  ALWAYS_INLINE void operator+=(float f) { *this += GSVector2(f); }
  ALWAYS_INLINE void operator-=(float f) { *this -= GSVector2(f); }
  ALWAYS_INLINE void operator*=(float f) { *this *= GSVector2(f); }
  ALWAYS_INLINE void operator/=(float f) { *this /= GSVector2(f); }

  ALWAYS_INLINE void operator&=(const GSVector2& v)
  {
    v2s = vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(v2s), vreinterpret_u32_f32(v.v2s)));
  }

  ALWAYS_INLINE void operator|=(const GSVector2& v)
  {
    v2s = vreinterpret_f32_u32(vorr_u32(vreinterpret_u32_f32(v2s), vreinterpret_u32_f32(v.v2s)));
  }

  ALWAYS_INLINE void operator^=(const GSVector2& v)
  {
965 v2s = vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(v2s), vreinterpret_u32_f32(v.v2s))); 966 } 967 968 ALWAYS_INLINE friend GSVector2 operator+(const GSVector2& v1, const GSVector2& v2) 969 { 970 return GSVector2(vadd_f32(v1.v2s, v2.v2s)); 971 } 972 973 ALWAYS_INLINE friend GSVector2 operator-(const GSVector2& v1, const GSVector2& v2) 974 { 975 return GSVector2(vsub_f32(v1.v2s, v2.v2s)); 976 } 977 978 ALWAYS_INLINE friend GSVector2 operator*(const GSVector2& v1, const GSVector2& v2) 979 { 980 return GSVector2(vmul_f32(v1.v2s, v2.v2s)); 981 } 982 983 ALWAYS_INLINE friend GSVector2 operator/(const GSVector2& v1, const GSVector2& v2) 984 { 985 #ifdef CPU_ARCH_ARM64 986 return GSVector2(vdiv_f32(v1.v2s, v2.v2s)); 987 #else 988 return GSVector2(vget_lane_f32(v1.v2s, 0) / vget_lane_f32(v2.v2s, 0), 989 vget_lane_f32(v1.v2s, 1) / vget_lane_f32(v2.v2s, 1)); 990 #endif 991 } 992 993 ALWAYS_INLINE friend GSVector2 operator+(const GSVector2& v, float f) { return v + GSVector2(f); } 994 ALWAYS_INLINE friend GSVector2 operator-(const GSVector2& v, float f) { return v - GSVector2(f); } 995 ALWAYS_INLINE friend GSVector2 operator*(const GSVector2& v, float f) { return v * GSVector2(f); } 996 ALWAYS_INLINE friend GSVector2 operator/(const GSVector2& v, float f) { return v / GSVector2(f); } 997 998 ALWAYS_INLINE friend GSVector2 operator&(const GSVector2& v1, const GSVector2& v2) 999 { 1000 return GSVector2(vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(v1.v2s), vreinterpret_u32_f32(v2.v2s)))); 1001 } 1002 1003 ALWAYS_INLINE friend GSVector2 operator|(const GSVector2& v1, const GSVector2& v2) 1004 { 1005 return GSVector2(vreinterpret_f32_u32(vorr_u32(vreinterpret_u32_f32(v1.v2s), vreinterpret_u32_f32(v2.v2s)))); 1006 } 1007 1008 ALWAYS_INLINE friend GSVector2 operator^(const GSVector2& v1, const GSVector2& v2) 1009 { 1010 return GSVector2(vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(v1.v2s), vreinterpret_u32_f32(v2.v2s)))); 1011 } 1012 1013 ALWAYS_INLINE friend GSVector2 operator==(const GSVector2& v1, const GSVector2& v2) 1014 { 1015 return GSVector2(vreinterpret_f32_u32(vceq_f32(v1.v2s, v2.v2s))); 1016 } 1017 1018 ALWAYS_INLINE friend GSVector2 operator!=(const GSVector2& v1, const GSVector2& v2) 1019 { 1020 // NEON has no != 1021 return GSVector2(vreinterpret_f32_u32(vmvn_u32(vceq_f32(v1.v2s, v2.v2s)))); 1022 } 1023 1024 ALWAYS_INLINE friend GSVector2 operator>(const GSVector2& v1, const GSVector2& v2) 1025 { 1026 return GSVector2(vreinterpret_f32_u32(vcgt_f32(v1.v2s, v2.v2s))); 1027 } 1028 1029 ALWAYS_INLINE friend GSVector2 operator<(const GSVector2& v1, const GSVector2& v2) 1030 { 1031 return GSVector2(vreinterpret_f32_u32(vclt_f32(v1.v2s, v2.v2s))); 1032 } 1033 1034 ALWAYS_INLINE friend GSVector2 operator>=(const GSVector2& v1, const GSVector2& v2) 1035 { 1036 return GSVector2(vreinterpret_f32_u32(vcge_f32(v1.v2s, v2.v2s))); 1037 } 1038 1039 ALWAYS_INLINE friend GSVector2 operator<=(const GSVector2& v1, const GSVector2& v2) 1040 { 1041 return GSVector2(vreinterpret_f32_u32(vcle_f32(v1.v2s, v2.v2s))); 1042 } 1043 1044 ALWAYS_INLINE GSVector2 xy() const { return *this; } 1045 ALWAYS_INLINE GSVector2 xx() const { return GSVector2(__builtin_shufflevector(v2s, v2s, 0, 0)); } 1046 ALWAYS_INLINE GSVector2 yx() const { return GSVector2(__builtin_shufflevector(v2s, v2s, 1, 0)); } 1047 ALWAYS_INLINE GSVector2 yy() const { return GSVector2(__builtin_shufflevector(v2s, v2s, 1, 1)); } 1048 }; 1049 1050 class alignas(16) GSVector4i 1051 { 1052 struct cxpr_init_tag 1053 { 1054 }; 1055 static 
constexpr cxpr_init_tag cxpr_init{}; 1056 1057 constexpr GSVector4i(cxpr_init_tag, s32 x, s32 y, s32 z, s32 w) : S32{x, y, z, w} {} 1058 1059 constexpr GSVector4i(cxpr_init_tag, s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7) 1060 : S16{s0, s1, s2, s3, s4, s5, s6, s7} 1061 { 1062 } 1063 1064 constexpr GSVector4i(cxpr_init_tag, s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9, s8 b10, 1065 s8 b11, s8 b12, s8 b13, s8 b14, s8 b15) 1066 : S8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15} 1067 { 1068 } 1069 1070 public: 1071 union 1072 { 1073 struct 1074 { 1075 int x, y, z, w; 1076 }; 1077 struct 1078 { 1079 int r, g, b, a; 1080 }; 1081 struct 1082 { 1083 int left, top, right, bottom; 1084 }; 1085 float F32[4]; 1086 s8 S8[16]; 1087 s16 S16[8]; 1088 s32 S32[4]; 1089 s64 S64[2]; 1090 u8 U8[16]; 1091 u16 U16[8]; 1092 u32 U32[4]; 1093 u64 U64[2]; 1094 int32x4_t v4s; 1095 }; 1096 1097 GSVector4i() = default; 1098 1099 ALWAYS_INLINE constexpr static GSVector4i cxpr(s32 x, s32 y, s32 z, s32 w) 1100 { 1101 return GSVector4i(cxpr_init, x, y, z, w); 1102 } 1103 1104 ALWAYS_INLINE constexpr static GSVector4i cxpr(s32 x) { return GSVector4i(cxpr_init, x, x, x, x); } 1105 1106 ALWAYS_INLINE constexpr static GSVector4i cxpr16(s16 x) { return GSVector4i(cxpr_init, x, x, x, x, x, x, x, x); } 1107 1108 ALWAYS_INLINE constexpr static GSVector4i cxpr16(s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7) 1109 { 1110 return GSVector4i(cxpr_init, s0, s1, s2, s3, s4, s5, s6, s7); 1111 } 1112 1113 ALWAYS_INLINE constexpr static GSVector4i cxpr8(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9, 1114 s8 b10, s8 b11, s8 b12, s8 b13, s8 b14, s8 b15) 1115 { 1116 return GSVector4i(cxpr_init, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15); 1117 } 1118 1119 ALWAYS_INLINE GSVector4i(s32 x, s32 y, s32 z, s32 w) 1120 { 1121 GSVector4i xz = load(x).upl32(load(z)); 1122 GSVector4i yw = load(y).upl32(load(w)); 1123 1124 *this = xz.upl32(yw); 1125 } 1126 1127 ALWAYS_INLINE GSVector4i(s32 x, s32 y) { *this = load(x).upl32(load(y)); } 1128 1129 ALWAYS_INLINE GSVector4i(s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7) 1130 : S16{s0, s1, s2, s3, s4, s5, s6, s7} 1131 { 1132 } 1133 1134 constexpr GSVector4i(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9, s8 b10, s8 b11, s8 b12, 1135 s8 b13, s8 b14, s8 b15) 1136 : S8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15} 1137 { 1138 } 1139 1140 // MSVC has bad codegen for the constexpr version when applied to non-constexpr things (https://godbolt.org/z/h8qbn7), 1141 // so leave the non-constexpr version default 1142 ALWAYS_INLINE explicit GSVector4i(int i) { *this = i; } 1143 1144 ALWAYS_INLINE explicit GSVector4i(int32x2_t m) : v4s(vcombine_s32(m, vcreate_s32(0))) {} 1145 ALWAYS_INLINE constexpr explicit GSVector4i(int32x4_t m) : v4s(m) {} 1146 1147 ALWAYS_INLINE explicit GSVector4i(const GSVector2& v); 1148 ALWAYS_INLINE explicit GSVector4i(const GSVector4& v); 1149 1150 ALWAYS_INLINE static GSVector4i cast(const GSVector4& v); 1151 1152 ALWAYS_INLINE void operator=(int i) { v4s = vdupq_n_s32(i); } 1153 1154 ALWAYS_INLINE operator int32x4_t() const { return v4s; } 1155 1156 // rect 1157 1158 ALWAYS_INLINE int width() const { return right - left; } 1159 1160 ALWAYS_INLINE int height() const { return bottom - top; } 1161 1162 ALWAYS_INLINE GSVector4i rsize() const 1163 { 1164 return sub32(xyxy()); // same as GSVector4i(0, 0, 
width(), height()); 1165 } 1166 1167 ALWAYS_INLINE s32 rarea() const { return width() * height(); } 1168 1169 ALWAYS_INLINE bool rempty() const 1170 { 1171 #ifdef CPU_ARCH_ARM64 1172 return (vminv_u32(vreinterpret_u32_s32(vget_low_s32(lt32(zwzw())))) == 0); 1173 #else 1174 return (vget_lane_u64(vreinterpret_u64_u32(vreinterpret_u32_s32(vget_low_s32(lt32(zwzw())))), 0) == 0); 1175 #endif 1176 } 1177 1178 ALWAYS_INLINE GSVector4i runion(const GSVector4i& a) const { return min_i32(a).upl64(max_i32(a).srl<8>()); } 1179 1180 ALWAYS_INLINE GSVector4i rintersect(const GSVector4i& a) const { return sat_i32(a); } 1181 ALWAYS_INLINE bool rintersects(const GSVector4i& v) const { return !rintersect(v).rempty(); } 1182 ALWAYS_INLINE bool rcontains(const GSVector4i& v) const { return rintersect(v).eq(v); } 1183 1184 ALWAYS_INLINE u32 rgba32() const 1185 { 1186 GSVector4i v = *this; 1187 1188 v = v.ps32(v); 1189 v = v.pu16(v); 1190 1191 return (u32)store(v); 1192 } 1193 1194 ALWAYS_INLINE GSVector4i sat_i8(const GSVector4i& min, const GSVector4i& max) const 1195 { 1196 return max_i8(min).min_i8(max); 1197 } 1198 ALWAYS_INLINE GSVector4i sat_i8(const GSVector4i& minmax) const 1199 { 1200 return max_i8(minmax.xyxy()).min_i8(minmax.zwzw()); 1201 } 1202 ALWAYS_INLINE GSVector4i sat_i16(const GSVector4i& min, const GSVector4i& max) const 1203 { 1204 return max_i16(min).min_i16(max); 1205 } 1206 ALWAYS_INLINE GSVector4i sat_i16(const GSVector4i& minmax) const 1207 { 1208 return max_i16(minmax.xyxy()).min_i16(minmax.zwzw()); 1209 } 1210 ALWAYS_INLINE GSVector4i sat_i32(const GSVector4i& min, const GSVector4i& max) const 1211 { 1212 return max_i32(min).min_i32(max); 1213 } 1214 ALWAYS_INLINE GSVector4i sat_i32(const GSVector4i& minmax) const 1215 { 1216 return max_i32(minmax.xyxy()).min_i32(minmax.zwzw()); 1217 } 1218 1219 ALWAYS_INLINE GSVector4i sat_u8(const GSVector4i& min, const GSVector4i& max) const 1220 { 1221 return max_u8(min).min_u8(max); 1222 } 1223 ALWAYS_INLINE GSVector4i sat_u8(const GSVector4i& minmax) const 1224 { 1225 return max_u8(minmax.xyxy()).min_u8(minmax.zwzw()); 1226 } 1227 ALWAYS_INLINE GSVector4i sat_u16(const GSVector4i& min, const GSVector4i& max) const 1228 { 1229 return max_u16(min).min_u16(max); 1230 } 1231 ALWAYS_INLINE GSVector4i sat_u16(const GSVector4i& minmax) const 1232 { 1233 return max_u16(minmax.xyxy()).min_u16(minmax.zwzw()); 1234 } 1235 ALWAYS_INLINE GSVector4i sat_u32(const GSVector4i& min, const GSVector4i& max) const 1236 { 1237 return max_u32(min).min_u32(max); 1238 } 1239 ALWAYS_INLINE GSVector4i sat_u32(const GSVector4i& minmax) const 1240 { 1241 return max_u32(minmax.xyxy()).min_u32(minmax.zwzw()); 1242 } 1243 1244 ALWAYS_INLINE GSVector4i min_i8(const GSVector4i& v) const 1245 { 1246 return GSVector4i(vreinterpretq_s32_s8(vminq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)))); 1247 } 1248 1249 ALWAYS_INLINE GSVector4i max_i8(const GSVector4i& v) const 1250 { 1251 return GSVector4i(vreinterpretq_s32_s8(vmaxq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)))); 1252 } 1253 1254 ALWAYS_INLINE GSVector4i min_i16(const GSVector4i& v) const 1255 { 1256 return GSVector4i(vreinterpretq_s32_s16(vminq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)))); 1257 } 1258 1259 ALWAYS_INLINE GSVector4i max_i16(const GSVector4i& v) const 1260 { 1261 return GSVector4i(vreinterpretq_s32_s16(vmaxq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)))); 1262 } 1263 1264 ALWAYS_INLINE GSVector4i min_i32(const GSVector4i& v) const { return 
GSVector4i(vminq_s32(v4s, v.v4s)); } 1265 1266 ALWAYS_INLINE GSVector4i max_i32(const GSVector4i& v) const { return GSVector4i(vmaxq_s32(v4s, v.v4s)); } 1267 1268 ALWAYS_INLINE GSVector4i min_u8(const GSVector4i& v) const 1269 { 1270 return GSVector4i(vreinterpretq_s32_u8(vminq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s)))); 1271 } 1272 1273 ALWAYS_INLINE GSVector4i max_u8(const GSVector4i& v) const 1274 { 1275 return GSVector4i(vreinterpretq_s32_u8(vmaxq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s)))); 1276 } 1277 1278 ALWAYS_INLINE GSVector4i min_u16(const GSVector4i& v) const 1279 { 1280 return GSVector4i(vreinterpretq_s32_u16(vminq_u16(vreinterpretq_u16_s32(v4s), vreinterpretq_u16_s32(v.v4s)))); 1281 } 1282 1283 ALWAYS_INLINE GSVector4i max_u16(const GSVector4i& v) const 1284 { 1285 return GSVector4i(vreinterpretq_s32_u16(vmaxq_u16(vreinterpretq_u16_s32(v4s), vreinterpretq_u16_s32(v.v4s)))); 1286 } 1287 1288 ALWAYS_INLINE GSVector4i min_u32(const GSVector4i& v) const 1289 { 1290 return GSVector4i(vreinterpretq_s32_u32(vminq_u32(vreinterpretq_u32_s32(v4s), vreinterpretq_u32_s32(v.v4s)))); 1291 } 1292 1293 ALWAYS_INLINE GSVector4i max_u32(const GSVector4i& v) const 1294 { 1295 return GSVector4i(vreinterpretq_s32_u32(vmaxq_u32(vreinterpretq_u32_s32(v4s), vreinterpretq_u32_s32(v.v4s)))); 1296 } 1297 1298 ALWAYS_INLINE GSVector4i madd_s16(const GSVector4i& v) const 1299 { 1300 #ifdef CPU_ARCH_ARM64 1301 const int32x4_t acc = 1302 vmlal_s16(vdupq_n_s32(0), vget_low_s16(vreinterpretq_s16_s32(v4s)), vget_low_s16(vreinterpretq_s16_s32(v.v4s))); 1303 return GSVector4i(vmlal_high_s16(acc, vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))); 1304 #else 1305 // borrowed from sse2neon 1306 const int32x4_t low = 1307 vmlal_s16(vdupq_n_s32(0), vget_low_s16(vreinterpretq_s16_s32(v4s)), vget_low_s16(vreinterpretq_s16_s32(v.v4s))); 1308 const int32x4_t high = 1309 vmlal_s16(vdupq_n_s32(0), vget_high_s16(vreinterpretq_s16_s32(v4s)), vget_high_s16(vreinterpretq_s16_s32(v.v4s))); 1310 return GSVector4i(vcombine_s32(vpadd_s32(vget_low_s32(low), vget_high_s32(low)), 1311 vpadd_s32(vget_low_s32(high), vget_high_s32(high)))); 1312 #endif 1313 } 1314 1315 ALWAYS_INLINE GSVector4i addp_s32() const 1316 { 1317 #ifdef CPU_ARCH_ARM64 1318 return GSVector4i(vpaddq_s32(v4s, v4s)); 1319 #else 1320 const int32x2_t res = vpadd_s32(vget_low_s32(v4s), vget_high_s32(v4s)); 1321 return GSVector4i(vcombine_s32(res, res)); 1322 #endif 1323 } 1324 1325 ALWAYS_INLINE s32 addv_s32() const 1326 { 1327 #ifdef CPU_ARCH_ARM64 1328 return vaddvq_s32(v4s); 1329 #else 1330 const int32x2_t res = vpadd_s32(vget_low_s32(v4s), vget_high_s32(v4s)); 1331 return vget_lane_s32(res, 0) + vget_lane_s32(res, 1); 1332 #endif 1333 } 1334 1335 #ifdef CPU_ARCH_ARM64 1336 1337 ALWAYS_INLINE u8 minv_u8() const { return vminvq_u8(vreinterpretq_u8_s32(v4s)); } 1338 1339 ALWAYS_INLINE u16 maxv_u8() const { return vmaxvq_u8(vreinterpretq_u8_s32(v4s)); } 1340 1341 ALWAYS_INLINE u16 minv_u16() const { return vminvq_u16(vreinterpretq_u16_s32(v4s)); } 1342 1343 ALWAYS_INLINE u16 maxv_u16() const { return vmaxvq_u16(vreinterpretq_u16_s32(v4s)); } 1344 1345 ALWAYS_INLINE s32 minv_s32() const { return vminvq_s32(v4s); } 1346 1347 ALWAYS_INLINE u32 minv_u32() const { return vminvq_u32(v4s); } 1348 1349 ALWAYS_INLINE s32 maxv_s32() const { return vmaxvq_s32(v4s); } 1350 1351 ALWAYS_INLINE u32 maxv_u32() const { return vmaxvq_u32(v4s); } 1352 1353 #else 1354 1355 ALWAYS_INLINE u8 minv_u8() const 1356 { 1357 uint8x8_t vmin = 
vmin_u8(vget_low_u8(vreinterpretq_u8_s32(v4s)), vget_high_u8(vreinterpretq_u8_s32(v4s))); 1358 vmin = vmin_u8(vmin, vreinterpret_u8_s32(vdup_lane_s32(vreinterpret_s32_u8(vmin), 1))); 1359 return static_cast<u8>( 1360 std::min(static_cast<u32>(vget_lane_u8(vmin, 0)), 1361 std::min(static_cast<u32>(vget_lane_u8(vmin, 1)), 1362 std::min(static_cast<u32>(vget_lane_u8(vmin, 2)), static_cast<u32>(vget_lane_u8(vmin, 3)))))); 1363 } 1364 1365 ALWAYS_INLINE u16 maxv_u8() const 1366 { 1367 uint8x8_t vmax = vmax_u8(vget_low_u8(vreinterpretq_u8_s32(v4s)), vget_high_u8(vreinterpretq_u8_s32(v4s))); 1368 vmax = vmax_u8(vmax, vreinterpret_u8_s32(vdup_lane_s32(vreinterpret_s32_u8(vmax), 1))); 1369 return static_cast<u8>( 1370 std::max(static_cast<u32>(vget_lane_u8(vmax, 0)), 1371 std::max(static_cast<u32>(vget_lane_u8(vmax, 1)), 1372 std::max(static_cast<u32>(vget_lane_u8(vmax, 2)), static_cast<u32>(vget_lane_u8(vmax, 3)))))); 1373 } 1374 1375 ALWAYS_INLINE u16 minv_u16() const 1376 { 1377 uint16x4_t vmin = vmin_u16(vget_low_u16(vreinterpretq_u16_s32(v4s)), vget_high_u16(vreinterpretq_u16_s32(v4s))); 1378 vmin = vmin_u16(vmin, vreinterpret_u16_s32(vdup_lane_s32(vreinterpret_s32_u16(vmin), 1))); 1379 return static_cast<u16>( 1380 std::min(static_cast<u32>(vget_lane_u16(vmin, 0)), static_cast<u32>(vget_lane_u16(vmin, 1)))); 1381 } 1382 1383 ALWAYS_INLINE u16 maxv_u16() const 1384 { 1385 uint16x4_t vmax = vmax_u16(vget_low_u16(vreinterpretq_u16_s32(v4s)), vget_high_u16(vreinterpretq_u16_s32(v4s))); 1386 vmax = vmax_u16(vmax, vreinterpret_u16_s32(vdup_lane_s32(vreinterpret_s32_u16(vmax), 1))); 1387 return static_cast<u16>( 1388 std::max<u32>(static_cast<u32>(vget_lane_u16(vmax, 0)), static_cast<u32>(vget_lane_u16(vmax, 1)))); 1389 } 1390 1391 ALWAYS_INLINE s32 minv_s32() const 1392 { 1393 int32x2_t vmin = vmin_s32(vget_low_s32(v4s), vget_high_s32(v4s)); 1394 return std::min<s32>(vget_lane_s32(vmin, 0), vget_lane_s32(vmin, 1)); 1395 } 1396 1397 ALWAYS_INLINE u32 minv_u32() const 1398 { 1399 uint32x2_t vmin = vmin_u32(vget_low_u32(vreinterpretq_u32_s32(v4s)), vget_high_u32(vreinterpretq_u32_s32(v4s))); 1400 return std::min<u32>(vget_lane_u32(vreinterpret_u32_s32(vmin), 0), vget_lane_u32(vreinterpret_u32_s32(vmin), 1)); 1401 } 1402 1403 ALWAYS_INLINE s32 maxv_s32() const 1404 { 1405 int32x2_t vmax = vmax_s32(vget_low_s32(v4s), vget_high_s32(v4s)); 1406 return std::max<s32>(vget_lane_s32(vmax, 0), vget_lane_s32(vmax, 1)); 1407 } 1408 1409 ALWAYS_INLINE u32 maxv_u32() const 1410 { 1411 uint32x2_t vmax = vmax_u32(vget_low_u32(vreinterpretq_u32_s32(v4s)), vget_high_u32(vreinterpretq_u32_s32(v4s))); 1412 return std::max<u32>(vget_lane_u32(vreinterpret_u32_s32(vmax), 0), vget_lane_u32(vreinterpret_u32_s32(vmax), 1)); 1413 } 1414 1415 #endif 1416 1417 ALWAYS_INLINE GSVector4i clamp8() const { return pu16().upl8(); } 1418 1419 ALWAYS_INLINE GSVector4i blend8(const GSVector4i& a, const GSVector4i& mask) const 1420 { 1421 uint8x16_t mask2 = vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_s32(mask.v4s), 7)); 1422 return GSVector4i(vreinterpretq_s32_u8(vbslq_u8(mask2, vreinterpretq_u8_s32(a.v4s), vreinterpretq_u8_s32(v4s)))); 1423 } 1424 1425 template<int mask> 1426 ALWAYS_INLINE GSVector4i blend16(const GSVector4i& a) const 1427 { 1428 static constexpr const uint16_t _mask[8] = { 1429 ((mask) & (1 << 0)) ? (uint16_t)-1 : 0x0, ((mask) & (1 << 1)) ? (uint16_t)-1 : 0x0, 1430 ((mask) & (1 << 2)) ? (uint16_t)-1 : 0x0, ((mask) & (1 << 3)) ? (uint16_t)-1 : 0x0, 1431 ((mask) & (1 << 4)) ? (uint16_t)-1 : 0x0, ((mask) & (1 << 5)) ? 
(uint16_t)-1 : 0x0, 1432 ((mask) & (1 << 6)) ? (uint16_t)-1 : 0x0, ((mask) & (1 << 7)) ? (uint16_t)-1 : 0x0}; 1433 return GSVector4i( 1434 vreinterpretq_s32_u16(vbslq_u16(vld1q_u16(_mask), vreinterpretq_u16_s32(a.v4s), vreinterpretq_u16_s32(v4s)))); 1435 } 1436 1437 template<int mask> 1438 ALWAYS_INLINE GSVector4i blend32(const GSVector4i& v) const 1439 { 1440 constexpr int bit3 = ((mask & 8) * 3) << 3; 1441 constexpr int bit2 = ((mask & 4) * 3) << 2; 1442 constexpr int bit1 = ((mask & 2) * 3) << 1; 1443 constexpr int bit0 = (mask & 1) * 3; 1444 return blend16<bit3 | bit2 | bit1 | bit0>(v); 1445 } 1446 1447 ALWAYS_INLINE GSVector4i blend(const GSVector4i& v, const GSVector4i& mask) const 1448 { 1449 return GSVector4i( 1450 vreinterpretq_s32_s8(vorrq_s8(vbicq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(mask.v4s)), 1451 vandq_s8(vreinterpretq_s8_s32(mask.v4s), vreinterpretq_s8_s32(v.v4s))))); 1452 } 1453 1454 ALWAYS_INLINE GSVector4i mix16(const GSVector4i& v) const { return blend16<0xaa>(v); } 1455 1456 ALWAYS_INLINE GSVector4i shuffle8(const GSVector4i& mask) const 1457 { 1458 #ifdef CPU_ARCH_ARM64 1459 return GSVector4i(vreinterpretq_s32_s8(vqtbl1q_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_u8_s32(mask.v4s)))); 1460 #else 1461 int8x8x2_t split = {vget_low_s8(vreinterpretq_s8_s32(v4s)), vget_high_s8(vreinterpretq_s8_s32(v4s))}; 1462 return GSVector4i(vreinterpretq_s32_s8(vcombine_s8(vtbl2_s8(split, vget_low_s8(vreinterpretq_s8_s32(mask.v4s))), 1463 vtbl2_s8(split, vget_high_s8(vreinterpretq_s8_s32(mask.v4s)))))); 1464 #endif 1465 } 1466 1467 ALWAYS_INLINE GSVector4i ps16(const GSVector4i& v) const 1468 { 1469 return GSVector4i(vreinterpretq_s32_s8( 1470 vcombine_s8(vqmovn_s16(vreinterpretq_s16_s32(v4s)), vqmovn_s16(vreinterpretq_s16_s32(v.v4s))))); 1471 } 1472 1473 ALWAYS_INLINE GSVector4i ps16() const 1474 { 1475 return GSVector4i(vreinterpretq_s32_s8( 1476 vcombine_s8(vqmovn_s16(vreinterpretq_s16_s32(v4s)), vqmovn_s16(vreinterpretq_s16_s32(v4s))))); 1477 } 1478 1479 ALWAYS_INLINE GSVector4i pu16(const GSVector4i& v) const 1480 { 1481 return GSVector4i(vreinterpretq_s32_u8( 1482 vcombine_u8(vqmovun_s16(vreinterpretq_s16_s32(v4s)), vqmovun_s16(vreinterpretq_s16_s32(v.v4s))))); 1483 } 1484 1485 ALWAYS_INLINE GSVector4i pu16() const 1486 { 1487 return GSVector4i(vreinterpretq_s32_u8( 1488 vcombine_u8(vqmovun_s16(vreinterpretq_s16_s32(v4s)), vqmovun_s16(vreinterpretq_s16_s32(v4s))))); 1489 } 1490 1491 ALWAYS_INLINE GSVector4i ps32(const GSVector4i& v) const 1492 { 1493 return GSVector4i(vreinterpretq_s32_s16(vcombine_s16(vqmovn_s32(v4s), vqmovn_s32(v.v4s)))); 1494 } 1495 1496 ALWAYS_INLINE GSVector4i ps32() const 1497 { 1498 return GSVector4i(vreinterpretq_s32_s16(vcombine_s16(vqmovn_s32(v4s), vqmovn_s32(v4s)))); 1499 } 1500 1501 ALWAYS_INLINE GSVector4i pu32(const GSVector4i& v) const 1502 { 1503 return GSVector4i(vreinterpretq_s32_u16(vcombine_u16(vqmovun_s32(v4s), vqmovun_s32(v.v4s)))); 1504 } 1505 1506 ALWAYS_INLINE GSVector4i pu32() const 1507 { 1508 return GSVector4i(vreinterpretq_s32_u16(vcombine_u16(vqmovun_s32(v4s), vqmovun_s32(v4s)))); 1509 } 1510 1511 #ifdef CPU_ARCH_ARM64 1512 1513 ALWAYS_INLINE GSVector4i upl8(const GSVector4i& v) const 1514 { 1515 return GSVector4i(vreinterpretq_s32_s8(vzip1q_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)))); 1516 } 1517 1518 ALWAYS_INLINE GSVector4i uph8(const GSVector4i& v) const 1519 { 1520 return GSVector4i(vreinterpretq_s32_s8(vzip2q_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)))); 1521 } 1522 1523 
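  // Worked illustration (added example, not from the original source): with byte lanes
  //   *this = {a0, a1, ..., a15} and v = {b0, b1, ..., b15},
  //   upl8(v) -> {a0, b0, a1, b1, ..., a7,  b7 }   (vzip1q_s8: interleave the low halves)
  //   uph8(v) -> {a8, b8, a9, b9, ..., a15, b15}   (vzip2q_s8: interleave the high halves)
  // i.e. "upl"/"uph" follow the SSE unpacklo/unpackhi naming, and the 16/32/64-bit
  // variants below behave the same way at wider lane sizes.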
ALWAYS_INLINE GSVector4i upl16(const GSVector4i& v) const 1524 { 1525 return GSVector4i(vreinterpretq_s32_s16(vzip1q_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)))); 1526 } 1527 1528 ALWAYS_INLINE GSVector4i uph16(const GSVector4i& v) const 1529 { 1530 return GSVector4i(vreinterpretq_s32_s16(vzip2q_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)))); 1531 } 1532 1533 ALWAYS_INLINE GSVector4i upl32(const GSVector4i& v) const { return GSVector4i(vzip1q_s32(v4s, v.v4s)); } 1534 1535 ALWAYS_INLINE GSVector4i uph32(const GSVector4i& v) const { return GSVector4i(vzip2q_s32(v4s, v.v4s)); } 1536 1537 ALWAYS_INLINE GSVector4i upl64(const GSVector4i& v) const 1538 { 1539 return GSVector4i(vreinterpretq_s32_s64( 1540 vcombine_s64(vget_low_s64(vreinterpretq_s64_s32(v4s)), vget_low_s64(vreinterpretq_s64_s32(v.v4s))))); 1541 } 1542 1543 ALWAYS_INLINE GSVector4i uph64(const GSVector4i& v) const 1544 { 1545 return GSVector4i(vreinterpretq_s32_s64( 1546 vcombine_s64(vget_high_s64(vreinterpretq_s64_s32(v4s)), vget_high_s64(vreinterpretq_s64_s32(v.v4s))))); 1547 } 1548 1549 ALWAYS_INLINE GSVector4i upl8() const 1550 { 1551 return GSVector4i(vreinterpretq_s32_s8(vzip1q_s8(vreinterpretq_s8_s32(v4s), vdupq_n_s8(0)))); 1552 } 1553 1554 ALWAYS_INLINE GSVector4i uph8() const 1555 { 1556 return GSVector4i(vreinterpretq_s32_s8(vzip2q_s8(vreinterpretq_s8_s32(v4s), vdupq_n_s8(0)))); 1557 } 1558 1559 ALWAYS_INLINE GSVector4i upl16() const 1560 { 1561 return GSVector4i(vreinterpretq_s32_s16(vzip1q_s16(vreinterpretq_s16_s32(v4s), vdupq_n_s16(0)))); 1562 } 1563 1564 ALWAYS_INLINE GSVector4i uph16() const 1565 { 1566 return GSVector4i(vreinterpretq_s32_s16(vzip2q_s16(vreinterpretq_s16_s32(v4s), vdupq_n_s16(0)))); 1567 } 1568 1569 ALWAYS_INLINE GSVector4i upl32() const { return GSVector4i(vzip1q_s32(v4s, vdupq_n_s32(0))); } 1570 1571 ALWAYS_INLINE GSVector4i uph32() const { return GSVector4i(vzip2q_s32(v4s, vdupq_n_s32(0))); } 1572 1573 ALWAYS_INLINE GSVector4i upl64() const 1574 { 1575 return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vget_low_s64(vreinterpretq_s64_s32(v4s)), vdup_n_s64(0)))); 1576 } 1577 1578 ALWAYS_INLINE GSVector4i uph64() const 1579 { 1580 return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vget_high_s64(vreinterpretq_s64_s32(v4s)), vdup_n_s64(0)))); 1581 } 1582 1583 #else 1584 1585 ALWAYS_INLINE GSVector4i upl8(const GSVector4i& v) const 1586 { 1587 const int8x8x2_t res = vzip_s8(vget_low_s8(vreinterpretq_s8_s32(v4s)), vget_low_s8(vreinterpretq_s8_s32(v.v4s))); 1588 return GSVector4i(vreinterpretq_s32_s8(vcombine_s8(res.val[0], res.val[1]))); 1589 } 1590 1591 ALWAYS_INLINE GSVector4i uph8(const GSVector4i& v) const 1592 { 1593 const int8x8x2_t res = vzip_s8(vget_high_s8(vreinterpretq_s8_s32(v4s)), vget_high_s8(vreinterpretq_s8_s32(v.v4s))); 1594 return GSVector4i(vreinterpretq_s32_s8(vcombine_s8(res.val[0], res.val[1]))); 1595 } 1596 1597 ALWAYS_INLINE GSVector4i upl16(const GSVector4i& v) const 1598 { 1599 const int16x4x2_t res = 1600 vzip_s16(vget_low_s16(vreinterpretq_s16_s32(v4s)), vget_low_s16(vreinterpretq_s16_s32(v.v4s))); 1601 return GSVector4i(vreinterpretq_s32_s16(vcombine_s16(res.val[0], res.val[1]))); 1602 } 1603 1604 ALWAYS_INLINE GSVector4i uph16(const GSVector4i& v) const 1605 { 1606 const int16x4x2_t res = 1607 vzip_s16(vget_high_s16(vreinterpretq_s16_s32(v4s)), vget_high_s16(vreinterpretq_s16_s32(v.v4s))); 1608 return GSVector4i(vreinterpretq_s32_s16(vcombine_s16(res.val[0], res.val[1]))); 1609 } 1610 1611 ALWAYS_INLINE GSVector4i upl32(const 
GSVector4i& v) const 1612 { 1613 const int32x2x2_t res = vzip_s32(vget_low_s32(v4s), vget_low_s32(v.v4s)); 1614 return GSVector4i(vcombine_s32(res.val[0], res.val[1])); 1615 } 1616 1617 ALWAYS_INLINE GSVector4i uph32(const GSVector4i& v) const 1618 { 1619 const int32x2x2_t res = vzip_s32(vget_high_s32(v4s), vget_high_s32(v.v4s)); 1620 return GSVector4i(vcombine_s32(res.val[0], res.val[1])); 1621 } 1622 1623 ALWAYS_INLINE GSVector4i upl64(const GSVector4i& v) const 1624 { 1625 return GSVector4i(vreinterpretq_s32_s64( 1626 vcombine_s64(vget_low_s64(vreinterpretq_s64_s32(v4s)), vget_low_s64(vreinterpretq_s64_s32(v.v4s))))); 1627 } 1628 1629 ALWAYS_INLINE GSVector4i uph64(const GSVector4i& v) const 1630 { 1631 return GSVector4i(vreinterpretq_s32_s64( 1632 vcombine_s64(vget_high_s64(vreinterpretq_s64_s32(v4s)), vget_high_s64(vreinterpretq_s64_s32(v.v4s))))); 1633 } 1634 1635 ALWAYS_INLINE GSVector4i upl8() const { return upl8(GSVector4i(vdupq_n_s32(0))); } 1636 1637 ALWAYS_INLINE GSVector4i uph8() const { return uph8(GSVector4i(vdupq_n_s32(0))); } 1638 1639 ALWAYS_INLINE GSVector4i upl16() const { return upl16(GSVector4i(vdupq_n_s32(0))); } 1640 1641 ALWAYS_INLINE GSVector4i uph16() const { return uph16(GSVector4i(vdupq_n_s32(0))); } 1642 1643 ALWAYS_INLINE GSVector4i upl32() const { return upl32(GSVector4i(vdupq_n_s32(0))); } 1644 1645 ALWAYS_INLINE GSVector4i uph32() const { return uph32(GSVector4i(vdupq_n_s32(0))); } 1646 1647 ALWAYS_INLINE GSVector4i upl64() const 1648 { 1649 return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vget_low_s64(vreinterpretq_s64_s32(v4s)), vdup_n_s64(0)))); 1650 } 1651 1652 ALWAYS_INLINE GSVector4i uph64() const 1653 { 1654 return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vget_high_s64(vreinterpretq_s64_s32(v4s)), vdup_n_s64(0)))); 1655 } 1656 #endif 1657 1658 ALWAYS_INLINE GSVector4i s8to16() const 1659 { 1660 return GSVector4i(vreinterpretq_s32_s16(vmovl_s8(vget_low_s8(vreinterpretq_s8_s32(v4s))))); 1661 } 1662 1663 ALWAYS_INLINE GSVector4i u8to16() const 1664 { 1665 return GSVector4i(vreinterpretq_s32_u16(vmovl_u8(vget_low_u8(vreinterpretq_u8_s32(v4s))))); 1666 } 1667 1668 ALWAYS_INLINE GSVector4i s8to32() const 1669 { 1670 return GSVector4i(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(vreinterpretq_s8_s32(v4s)))))); 1671 } 1672 1673 ALWAYS_INLINE GSVector4i u8to32() const 1674 { 1675 return GSVector4i(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vreinterpretq_u8_s32(v4s))))))); 1676 } 1677 1678 ALWAYS_INLINE GSVector4i s8to64() const 1679 { 1680 return GSVector4i(vreinterpretq_s32_s64( 1681 vmovl_s32(vget_low_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(vreinterpretq_s8_s32(v4s))))))))); 1682 } 1683 1684 ALWAYS_INLINE GSVector4i u8to64() const 1685 { 1686 return GSVector4i(vreinterpretq_s32_u64( 1687 vmovl_u32(vget_low_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vreinterpretq_u8_s32(v4s))))))))); 1688 } 1689 1690 ALWAYS_INLINE GSVector4i s16to32() const { return GSVector4i(vmovl_s16(vget_low_s16(vreinterpretq_s16_s32(v4s)))); } 1691 1692 ALWAYS_INLINE GSVector4i u16to32() const 1693 { 1694 return GSVector4i(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vreinterpretq_u16_s32(v4s))))); 1695 } 1696 1697 ALWAYS_INLINE GSVector4i s16to64() const 1698 { 1699 return GSVector4i( 1700 vreinterpretq_s32_s64(vmovl_s32(vget_low_s32(vmovl_s16(vget_low_s16(vreinterpretq_s16_s32(v4s))))))); 1701 } 1702 1703 ALWAYS_INLINE GSVector4i u16to64() const 1704 { 1705 return GSVector4i( 1706 
vreinterpretq_s32_u64(vmovl_u32(vget_low_u32(vmovl_u16(vget_low_u16(vreinterpretq_u16_s32(v4s)))))));
1707 }
1708
1709 ALWAYS_INLINE GSVector4i s32to64() const { return GSVector4i(vreinterpretq_s32_s64(vmovl_s32(vget_low_s32(v4s)))); }
1710
1711 ALWAYS_INLINE GSVector4i u32to64() const
1712 {
1713 return GSVector4i(vreinterpretq_s32_u64(vmovl_u32(vget_low_u32(vreinterpretq_u32_s32(v4s)))));
1714 }
1715
1716 template<int i>
1717 ALWAYS_INLINE GSVector4i srl() const
1718 {
1719 return GSVector4i(vreinterpretq_s32_s8(vextq_s8(vreinterpretq_s8_s32(v4s), vdupq_n_s8(0), i)));
1720 }
1721
1722 template<int i>
1723 ALWAYS_INLINE GSVector4i srl(const GSVector4i& v)
1724 {
1725 if constexpr (i >= 16)
1726 return GSVector4i(vreinterpretq_s32_u8(vextq_u8(vreinterpretq_u8_s32(v.v4s), vdupq_n_u8(0), i - 16)));
1727 else
1728 return GSVector4i(vreinterpretq_s32_u8(vextq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s), i)));
1729 }
1730
1731 template<int i>
1732 ALWAYS_INLINE GSVector4i sll() const
1733 {
1734 return GSVector4i(vreinterpretq_s32_s8(vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_s32(v4s), 16 - i)));
1735 }
1736
1737 template<int i>
1738 ALWAYS_INLINE GSVector4i sll16() const
1739 {
1740 return GSVector4i(vreinterpretq_s32_s16(vshlq_n_s16(vreinterpretq_s16_s32(v4s), i)));
1741 }
1742
1743 ALWAYS_INLINE GSVector4i sll16(s32 i) const
1744 {
1745 return GSVector4i(vreinterpretq_s32_s16(vshlq_s16(vreinterpretq_s16_s32(v4s), vdupq_n_s16(i))));
1746 }
1747
1748 ALWAYS_INLINE GSVector4i sllv16(const GSVector4i& v) const
1749 {
1750 return GSVector4i(vreinterpretq_s32_s16(vshlq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
1751 }
1752
1753 template<int i>
1754 ALWAYS_INLINE GSVector4i srl16() const
1755 {
1756 return GSVector4i(vreinterpretq_s32_u16(vshrq_n_u16(vreinterpretq_u16_s32(v4s), i)));
1757 }
1758
1759 ALWAYS_INLINE GSVector4i srl16(s32 i) const
1760 {
1761 return GSVector4i(vreinterpretq_s32_u16(vshlq_u16(vreinterpretq_u16_s32(v4s), vdupq_n_s16(-i))));
1762 }
1763
1764 ALWAYS_INLINE GSVector4i srlv16(const GSVector4i& v) const
1765 {
1766 return GSVector4i(
1767 vreinterpretq_s32_u16(vshlq_u16(vreinterpretq_u16_s32(v4s), vnegq_s16(vreinterpretq_s16_s32(v.v4s)))));
1768 }
1769
1770 template<int i>
1771 ALWAYS_INLINE GSVector4i sra16() const
1772 {
1773 constexpr int count = (i & ~15) ? 15 : i;
1774 return GSVector4i(vreinterpretq_s32_s16(vshrq_n_s16(vreinterpretq_s16_s32(v4s), count)));
1775 }
1776
1777 ALWAYS_INLINE GSVector4i sra16(s32 i) const
1778 {
1779 return GSVector4i(vreinterpretq_s32_s16(vshlq_s16(vreinterpretq_s16_s32(v4s), vdupq_n_s16(-i))));
1780 }
1781
1782 ALWAYS_INLINE GSVector4i srav16(const GSVector4i& v) const
1783 {
1784 return GSVector4i(
1785 vreinterpretq_s32_s16(vshlq_s16(vreinterpretq_s16_s32(v4s), vnegq_s16(vreinterpretq_s16_s32(v.v4s)))));
1786 }
1787
1788 template<int i>
1789 ALWAYS_INLINE GSVector4i sll32() const
1790 {
1791 return GSVector4i(vshlq_n_s32(v4s, i));
1792 }
1793
1794 ALWAYS_INLINE GSVector4i sll32(s32 i) const { return GSVector4i(vshlq_s32(v4s, vdupq_n_s32(i))); }
1795
1796 ALWAYS_INLINE GSVector4i sllv32(const GSVector4i& v) const { return GSVector4i(vshlq_s32(v4s, v.v4s)); }
1797
1798 template<int i>
1799 ALWAYS_INLINE GSVector4i srl32() const
1800 {
1801 return GSVector4i(vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(v4s), i)));
1802 }
1803
1804 ALWAYS_INLINE GSVector4i srl32(s32 i) const
1805 {
1806 return GSVector4i(vreinterpretq_s32_u32(vshlq_u32(vreinterpretq_u32_s32(v4s), vdupq_n_s32(-i))));
1807 }
1808
1809 ALWAYS_INLINE GSVector4i srlv32(const GSVector4i& v) const
1810 {
1811 return GSVector4i(vreinterpretq_s32_u32(vshlq_u32(vreinterpretq_u32_s32(v4s), vnegq_s32(v.v4s))));
1812 }
1813
1814 template<int i>
1815 ALWAYS_INLINE GSVector4i sra32() const
1816 {
1817 return GSVector4i(vshrq_n_s32(v4s, i));
1818 }
1819
1820 ALWAYS_INLINE GSVector4i sra32(s32 i) const { return GSVector4i(vshlq_s32(v4s, vdupq_n_s32(-i))); }
1821
1822 ALWAYS_INLINE GSVector4i srav32(const GSVector4i& v) const
1823 {
1824 return GSVector4i(vshlq_s32(v4s, vnegq_s32(v.v4s)));
1825 }
1826
1827 template<int i>
1828 ALWAYS_INLINE GSVector4i sll64() const
1829 {
1830 return GSVector4i(vreinterpretq_s32_s64(vshlq_n_s64(vreinterpretq_s64_s32(v4s), i)));
1831 }
1832
1833 ALWAYS_INLINE GSVector4i sll64(s32 i) const
1834 {
1835 return GSVector4i(vreinterpretq_s32_s64(vshlq_s64(vreinterpretq_s64_s32(v4s), vdupq_n_s64(i))));
1836 }
1837
1838 ALWAYS_INLINE GSVector4i sllv64(const GSVector4i& v) const
1839 {
1840 return GSVector4i(vreinterpretq_s32_s64(vshlq_s64(vreinterpretq_s64_s32(v4s), vreinterpretq_s64_s32(v.v4s))));
1841 }
1842
1843 template<int i>
1844 ALWAYS_INLINE GSVector4i sra64() const
1845 {
1846 return GSVector4i(vreinterpretq_s32_s64(vshrq_n_s64(vreinterpretq_s64_s32(v4s), i)));
1847 }
1848
1849 ALWAYS_INLINE GSVector4i sra64(s32 i) const
1850 {
1851 return GSVector4i(vreinterpretq_s32_s64(vshlq_s64(vreinterpretq_s64_s32(v4s), vdupq_n_s64(-i))));
1852 }
1853
1854 #ifdef CPU_ARCH_ARM64
1855 // not on arm32, hopefully we can do without
1856 ALWAYS_INLINE GSVector4i srav64(const GSVector4i& v) const
1857 {
1858 return GSVector4i(
1859 vreinterpretq_s32_s64(vshlq_s64(vreinterpretq_s64_s32(v4s), vnegq_s64(vreinterpretq_s64_s32(v.v4s)))));
1860 }
1861 #endif
1862
1863 template<int i>
1864 ALWAYS_INLINE GSVector4i srl64() const
1865 {
1866 return GSVector4i(vreinterpretq_s32_u64(vshrq_n_u64(vreinterpretq_u64_s32(v4s), i)));
1867 }
1868
1869 ALWAYS_INLINE GSVector4i srl64(s32 i) const
1870 {
1871 return GSVector4i(vreinterpretq_s32_u64(vshlq_u64(vreinterpretq_u64_s32(v4s), vdupq_n_s64(-i))));
1872 }
1873
1874 #ifdef CPU_ARCH_ARM64
1875 ALWAYS_INLINE GSVector4i srlv64(const GSVector4i& v) const
1876 {
1877 return GSVector4i(
1878 vreinterpretq_s32_u64(vshlq_u64(vreinterpretq_u64_s32(v4s),
vnegq_s64(vreinterpretq_s64_s32(v.v4s))))); 1879 } 1880 #endif 1881 1882 ALWAYS_INLINE GSVector4i add8(const GSVector4i& v) const 1883 { 1884 return GSVector4i(vreinterpretq_s32_s8(vaddq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)))); 1885 } 1886 1887 ALWAYS_INLINE GSVector4i add16(const GSVector4i& v) const 1888 { 1889 return GSVector4i(vreinterpretq_s32_s16(vaddq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)))); 1890 } 1891 1892 ALWAYS_INLINE GSVector4i add32(const GSVector4i& v) const { return GSVector4i(vaddq_s32(v4s, v.v4s)); } 1893 1894 ALWAYS_INLINE GSVector4i adds8(const GSVector4i& v) const 1895 { 1896 return GSVector4i(vreinterpretq_s32_s8(vqaddq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)))); 1897 } 1898 1899 ALWAYS_INLINE GSVector4i adds16(const GSVector4i& v) const 1900 { 1901 return GSVector4i(vreinterpretq_s32_s16(vqaddq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)))); 1902 } 1903 1904 ALWAYS_INLINE GSVector4i hadds16(const GSVector4i& v) const 1905 { 1906 // can't use vpaddq_s16() here, because we need saturation. 1907 // return GSVector4i(vreinterpretq_s32_s16(vpaddq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)))); 1908 const int16x8_t a = vreinterpretq_s16_s32(v4s); 1909 const int16x8_t b = vreinterpretq_s16_s32(v.v4s); 1910 #ifdef CPU_ARCH_ARM64 1911 return GSVector4i(vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))); 1912 #else 1913 // sse2neon again 1914 int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b)); 1915 int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16)); 1916 return GSVector4i(vreinterpretq_s32_s16(vqaddq_s16(ab0246, ab1357))); 1917 #endif 1918 } 1919 1920 ALWAYS_INLINE GSVector4i addus8(const GSVector4i& v) const 1921 { 1922 return GSVector4i(vreinterpretq_s32_u8(vqaddq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s)))); 1923 } 1924 1925 ALWAYS_INLINE GSVector4i addus16(const GSVector4i& v) const 1926 { 1927 return GSVector4i(vreinterpretq_s32_u16(vqaddq_u16(vreinterpretq_u16_s32(v4s), vreinterpretq_u16_s32(v.v4s)))); 1928 } 1929 1930 ALWAYS_INLINE GSVector4i sub8(const GSVector4i& v) const 1931 { 1932 return GSVector4i(vreinterpretq_s32_s8(vsubq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)))); 1933 } 1934 1935 ALWAYS_INLINE GSVector4i sub16(const GSVector4i& v) const 1936 { 1937 return GSVector4i(vreinterpretq_s32_s16(vsubq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)))); 1938 } 1939 1940 ALWAYS_INLINE GSVector4i sub32(const GSVector4i& v) const { return GSVector4i(vsubq_s32(v4s, v.v4s)); } 1941 1942 ALWAYS_INLINE GSVector4i subs8(const GSVector4i& v) const 1943 { 1944 return GSVector4i(vreinterpretq_s32_s8(vqsubq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)))); 1945 } 1946 1947 ALWAYS_INLINE GSVector4i subs16(const GSVector4i& v) const 1948 { 1949 return GSVector4i(vreinterpretq_s32_s16(vqsubq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)))); 1950 } 1951 1952 ALWAYS_INLINE GSVector4i subus8(const GSVector4i& v) const 1953 { 1954 return GSVector4i(vreinterpretq_s32_u8(vqsubq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s)))); 1955 } 1956 1957 ALWAYS_INLINE GSVector4i subus16(const GSVector4i& v) const 1958 { 1959 return GSVector4i(vreinterpretq_s32_u16(vqsubq_u16(vreinterpretq_u16_s32(v4s), vreinterpretq_u16_s32(v.v4s)))); 1960 } 1961 1962 ALWAYS_INLINE GSVector4i avg8(const GSVector4i& v) const 1963 { 1964 return GSVector4i(vreinterpretq_s32_u8(vrhaddq_u8(vreinterpretq_u8_s32(v4s), 
vreinterpretq_u8_s32(v.v4s)))); 1965 } 1966 1967 ALWAYS_INLINE GSVector4i avg16(const GSVector4i& v) const 1968 { 1969 return GSVector4i(vreinterpretq_s32_u16(vrhaddq_u16(vreinterpretq_u16_s32(v4s), vreinterpretq_u16_s32(v.v4s)))); 1970 } 1971 1972 ALWAYS_INLINE GSVector4i mul16hs(const GSVector4i& v) const 1973 { 1974 // from sse2neon 1975 int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_s32(v4s)); 1976 int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_s32(v.v4s)); 1977 int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */ 1978 int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_s32(v4s)); 1979 int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_s32(v.v4s)); 1980 int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */ 1981 uint16x8x2_t r = vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654)); 1982 return GSVector4i(vreinterpretq_s32_u16(r.val[1])); 1983 } 1984 1985 ALWAYS_INLINE GSVector4i mul16l(const GSVector4i& v) const 1986 { 1987 return GSVector4i(vreinterpretq_s32_s16(vmulq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)))); 1988 } 1989 1990 ALWAYS_INLINE GSVector4i mul16hrs(const GSVector4i& v) const 1991 { 1992 int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_s32(v4s)), vget_low_s16(vreinterpretq_s16_s32(v.v4s))); 1993 int32x4_t mul_hi = 1994 vmull_s16(vget_high_s16(vreinterpretq_s16_s32(v4s)), vget_high_s16(vreinterpretq_s16_s32(v.v4s))); 1995 int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15); 1996 int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15); 1997 return GSVector4i(vreinterpretq_s32_s16(vcombine_s16(narrow_lo, narrow_hi))); 1998 } 1999 2000 ALWAYS_INLINE GSVector4i mul32l(const GSVector4i& v) const { return GSVector4i(vmulq_s32(v4s, v.v4s)); } 2001 2002 ALWAYS_INLINE bool eq(const GSVector4i& v) const 2003 { 2004 const int32x4_t res = veorq_s32(v4s, v.v4s); 2005 #ifdef CPU_ARCH_ARM64 2006 return (vmaxvq_u32(vreinterpretq_u32_s32(res)) == 0); 2007 #else 2008 const int32x2_t paired = vorr_s32(vget_low_s32(res), vget_high_s32(res)); 2009 return (vget_lane_u64(vreinterpret_u64_s32(paired), 0) == 0); 2010 #endif 2011 } 2012 2013 ALWAYS_INLINE GSVector4i eq8(const GSVector4i& v) const 2014 { 2015 return GSVector4i(vreinterpretq_s32_u8(vceqq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)))); 2016 } 2017 2018 ALWAYS_INLINE GSVector4i eq16(const GSVector4i& v) const 2019 { 2020 return GSVector4i(vreinterpretq_s32_u16(vceqq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)))); 2021 } 2022 2023 ALWAYS_INLINE GSVector4i eq32(const GSVector4i& v) const 2024 { 2025 return GSVector4i(vreinterpretq_s32_u32(vceqq_s32(v4s, v.v4s))); 2026 } 2027 2028 #ifdef CPU_ARCH_ARM64 2029 ALWAYS_INLINE GSVector4i eq64(const GSVector4i& v) const 2030 { 2031 return GSVector4i(vreinterpretq_s32_u64(vceqq_s64(vreinterpretq_s64_s32(v4s), vreinterpretq_s64_s32(v.v4s)))); 2032 } 2033 #endif 2034 2035 ALWAYS_INLINE GSVector4i neq8(const GSVector4i& v) const { return ~eq8(v); } 2036 2037 ALWAYS_INLINE GSVector4i neq16(const GSVector4i& v) const { return ~eq16(v); } 2038 2039 ALWAYS_INLINE GSVector4i neq32(const GSVector4i& v) const { return ~eq32(v); } 2040 2041 ALWAYS_INLINE GSVector4i gt8(const GSVector4i& v) const 2042 { 2043 return GSVector4i(vreinterpretq_s32_s8(vcgtq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)))); 2044 } 2045 2046 ALWAYS_INLINE GSVector4i gt16(const GSVector4i& v) const 2047 { 2048 return GSVector4i(vreinterpretq_s32_s16(vcgtq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)))); 2049 } 
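// Like eq8/eq16/eq32 above, the ordered comparisons return per-lane masks (all bits of a lane set when the
// predicate holds, clear otherwise), so their results can be fed straight into blend(), mask(), alltrue()
// and allfalse().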
2050 2051 ALWAYS_INLINE GSVector4i gt32(const GSVector4i& v) const { return GSVector4i(vcgtq_s32(v4s, v.v4s)); } 2052 2053 ALWAYS_INLINE GSVector4i ge8(const GSVector4i& v) const 2054 { 2055 return GSVector4i(vreinterpretq_s32_s8(vcgeq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)))); 2056 } 2057 ALWAYS_INLINE GSVector4i ge16(const GSVector4i& v) const 2058 { 2059 return GSVector4i(vreinterpretq_s32_s16(vcgeq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)))); 2060 } 2061 ALWAYS_INLINE GSVector4i ge32(const GSVector4i& v) const { return GSVector4i(vcgeq_s32(v4s, v.v4s)); } 2062 2063 ALWAYS_INLINE GSVector4i lt8(const GSVector4i& v) const 2064 { 2065 return GSVector4i(vreinterpretq_s32_s8(vcltq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)))); 2066 } 2067 2068 ALWAYS_INLINE GSVector4i lt16(const GSVector4i& v) const 2069 { 2070 return GSVector4i(vreinterpretq_s32_s16(vcltq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)))); 2071 } 2072 2073 ALWAYS_INLINE GSVector4i lt32(const GSVector4i& v) const { return GSVector4i(vcltq_s32(v4s, v.v4s)); } 2074 2075 ALWAYS_INLINE GSVector4i le8(const GSVector4i& v) const 2076 { 2077 return GSVector4i(vreinterpretq_s32_s8(vcleq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)))); 2078 } 2079 ALWAYS_INLINE GSVector4i le16(const GSVector4i& v) const 2080 { 2081 return GSVector4i(vreinterpretq_s32_s16(vcleq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)))); 2082 } 2083 ALWAYS_INLINE GSVector4i le32(const GSVector4i& v) const { return GSVector4i(vcleq_s32(v4s, v.v4s)); } 2084 2085 ALWAYS_INLINE GSVector4i andnot(const GSVector4i& v) const { return GSVector4i(vbicq_s32(v4s, v.v4s)); } 2086 2087 ALWAYS_INLINE int mask() const 2088 { 2089 // borrowed from sse2neon 2090 const uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(vreinterpretq_u8_s32(v4s), 7)); 2091 const uint32x4_t paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7)); 2092 const uint64x2_t paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14)); 2093 const uint8x16_t paired64 = vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28)); 2094 return static_cast<int>(vgetq_lane_u8(paired64, 0) | ((int)vgetq_lane_u8(paired64, 8) << 8)); 2095 } 2096 2097 ALWAYS_INLINE bool alltrue() const 2098 { 2099 // MSB should be set in all 8-bit lanes. 2100 #ifdef CPU_ARCH_ARM64 2101 return (vminvq_u8(vreinterpretq_u8_s32(v4s)) & 0x80) == 0x80; 2102 #else 2103 const uint32x2_t res = vreinterpret_u32_s32(vand_s32(vget_low_s32(v4s), vget_high_s32(v4s))); 2104 return ((vget_lane_u32(res, 0) & vget_lane_u32(res, 1) & 0x80808080u) == 0x80808080u); 2105 #endif 2106 } 2107 2108 ALWAYS_INLINE bool allfalse() const 2109 { 2110 // MSB should be clear in all 8-bit lanes. 
2111 #ifdef CPU_ARCH_ARM64 2112 return (vmaxvq_u32(vreinterpretq_u8_s32(v4s)) & 0x80) != 0x80; 2113 #else 2114 const uint32x2_t res = vreinterpret_u32_s32(vand_s32(vget_low_s32(v4s), vget_high_s32(v4s))); 2115 return ((vget_lane_u32(res, 0) & vget_lane_u32(res, 1) & 0x80808080u) == 0); 2116 #endif 2117 } 2118 2119 template<int i> 2120 ALWAYS_INLINE GSVector4i insert8(int a) const 2121 { 2122 return GSVector4i(vreinterpretq_s32_u8(vsetq_lane_u8(a, vreinterpretq_u8_s32(v4s), static_cast<uint8_t>(i)))); 2123 } 2124 2125 template<int i> 2126 ALWAYS_INLINE int extract8() const 2127 { 2128 return vgetq_lane_u8(vreinterpretq_u8_s32(v4s), i); 2129 } 2130 2131 template<int i> 2132 ALWAYS_INLINE GSVector4i insert16(int a) const 2133 { 2134 return GSVector4i(vreinterpretq_s32_u16(vsetq_lane_u16(a, vreinterpretq_u16_s32(v4s), static_cast<uint16_t>(i)))); 2135 } 2136 2137 template<int i> 2138 ALWAYS_INLINE int extract16() const 2139 { 2140 return vgetq_lane_u16(vreinterpretq_u16_s32(v4s), i); 2141 } 2142 2143 template<int i> 2144 ALWAYS_INLINE GSVector4i insert32(int a) const 2145 { 2146 return GSVector4i(vsetq_lane_s32(a, v4s, i)); 2147 } 2148 2149 template<int i> 2150 ALWAYS_INLINE int extract32() const 2151 { 2152 return vgetq_lane_s32(v4s, i); 2153 } 2154 2155 template<int i> 2156 ALWAYS_INLINE GSVector4i insert64(s64 a) const 2157 { 2158 return GSVector4i(vreinterpretq_s32_s64(vsetq_lane_s64(a, vreinterpretq_s64_s32(v4s), i))); 2159 } 2160 2161 template<int i> 2162 ALWAYS_INLINE s64 extract64() const 2163 { 2164 return vgetq_lane_s64(vreinterpretq_s64_s32(v4s), i); 2165 } 2166 2167 ALWAYS_INLINE static GSVector4i loadnt(const void* p) 2168 { 2169 #if __has_builtin(__builtin_nontemporal_store) 2170 return GSVector4i(__builtin_nontemporal_load((int32x4_t*)p)); 2171 #else 2172 return GSVector4i(vreinterpretq_s32_s64(vld1q_s64((int64_t*)p))); 2173 #endif 2174 } 2175 2176 ALWAYS_INLINE static GSVector4i load32(const void* p) 2177 { 2178 // should be ldr s0, [x0] 2179 u32 val; 2180 std::memcpy(&val, p, sizeof(u32)); 2181 return GSVector4i(vsetq_lane_u32(val, vdupq_n_u32(0), 0)); 2182 } 2183 2184 ALWAYS_INLINE static GSVector4i loadl(const void* p) 2185 { 2186 return GSVector4i(vcombine_s32(vld1_s32((const int32_t*)p), vcreate_s32(0))); 2187 } 2188 2189 ALWAYS_INLINE static GSVector4i loadh(const void* p) 2190 { 2191 return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vdup_n_s64(0), vld1_s64((int64_t*)p)))); 2192 } 2193 2194 ALWAYS_INLINE static GSVector4i loadh(const void* p, const GSVector4i& v) 2195 { 2196 return GSVector4i( 2197 vreinterpretq_s32_s64(vcombine_s64(vget_low_s64(vreinterpretq_s64_s32(v.v4s)), vld1_s64((int64_t*)p)))); 2198 } 2199 2200 ALWAYS_INLINE static GSVector4i loadh(const GSVector2i& v) { return GSVector4i(vcombine_s32(vcreate_s32(0), v.v2s)); } 2201 2202 ALWAYS_INLINE static GSVector4i load(const void* pl, const void* ph) 2203 { 2204 return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vld1_s64((int64_t*)pl), vld1_s64((int64_t*)ph)))); 2205 } 2206 2207 template<bool aligned> 2208 ALWAYS_INLINE static GSVector4i load(const void* p) 2209 { 2210 return GSVector4i(vreinterpretq_s32_s64(vld1q_s64((int64_t*)p))); 2211 } 2212 2213 ALWAYS_INLINE static GSVector4i load(int i) { return GSVector4i(vsetq_lane_s32(i, vdupq_n_s32(0), 0)); } 2214 2215 ALWAYS_INLINE static GSVector4i loadq(s64 i) 2216 { 2217 return GSVector4i(vreinterpretq_s32_s64(vsetq_lane_s64(i, vdupq_n_s64(0), 0))); 2218 } 2219 2220 ALWAYS_INLINE static void storent(void* p, const GSVector4i& v) 2221 { 2222 #if 
__has_builtin(__builtin_nontemporal_store) 2223 __builtin_nontemporal_store(v.v4s, ((int32x4_t*)p)); 2224 #else 2225 vst1q_s64((int64_t*)p, vreinterpretq_s64_s32(v.v4s)); 2226 #endif 2227 } 2228 2229 ALWAYS_INLINE static void store32(void* p, const GSVector4i& v) 2230 { 2231 u32 val = vgetq_lane_s32(v, 0); 2232 std::memcpy(p, &val, sizeof(u32)); 2233 } 2234 2235 ALWAYS_INLINE static void storel(void* p, const GSVector4i& v) 2236 { 2237 vst1_s64((int64_t*)p, vget_low_s64(vreinterpretq_s64_s32(v.v4s))); 2238 } 2239 2240 ALWAYS_INLINE static void storeh(void* p, const GSVector4i& v) 2241 { 2242 vst1_s64((int64_t*)p, vget_high_s64(vreinterpretq_s64_s32(v.v4s))); 2243 } 2244 2245 ALWAYS_INLINE static void store(void* pl, void* ph, const GSVector4i& v) 2246 { 2247 GSVector4i::storel(pl, v); 2248 GSVector4i::storeh(ph, v); 2249 } 2250 2251 template<bool aligned> 2252 ALWAYS_INLINE static void store(void* p, const GSVector4i& v) 2253 { 2254 vst1q_s64((int64_t*)p, vreinterpretq_s64_s32(v.v4s)); 2255 } 2256 2257 ALWAYS_INLINE static int store(const GSVector4i& v) { return vgetq_lane_s32(v.v4s, 0); } 2258 2259 ALWAYS_INLINE static s64 storeq(const GSVector4i& v) { return vgetq_lane_s64(vreinterpretq_s64_s32(v.v4s), 0); } 2260 2261 ALWAYS_INLINE void operator&=(const GSVector4i& v) 2262 { 2263 v4s = vreinterpretq_s32_s8(vandq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))); 2264 } 2265 2266 ALWAYS_INLINE void operator|=(const GSVector4i& v) 2267 { 2268 v4s = vreinterpretq_s32_s8(vorrq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))); 2269 } 2270 2271 ALWAYS_INLINE void operator^=(const GSVector4i& v) 2272 { 2273 v4s = vreinterpretq_s32_s8(veorq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))); 2274 } 2275 2276 ALWAYS_INLINE friend GSVector4i operator&(const GSVector4i& v1, const GSVector4i& v2) 2277 { 2278 return GSVector4i(vreinterpretq_s32_s8(vandq_s8(vreinterpretq_s8_s32(v1.v4s), vreinterpretq_s8_s32(v2.v4s)))); 2279 } 2280 2281 ALWAYS_INLINE friend GSVector4i operator|(const GSVector4i& v1, const GSVector4i& v2) 2282 { 2283 return GSVector4i(vreinterpretq_s32_s8(vorrq_s8(vreinterpretq_s8_s32(v1.v4s), vreinterpretq_s8_s32(v2.v4s)))); 2284 } 2285 2286 ALWAYS_INLINE friend GSVector4i operator^(const GSVector4i& v1, const GSVector4i& v2) 2287 { 2288 return GSVector4i(vreinterpretq_s32_s8(veorq_s8(vreinterpretq_s8_s32(v1.v4s), vreinterpretq_s8_s32(v2.v4s)))); 2289 } 2290 2291 ALWAYS_INLINE friend GSVector4i operator&(const GSVector4i& v, int i) { return v & GSVector4i(i); } 2292 2293 ALWAYS_INLINE friend GSVector4i operator|(const GSVector4i& v, int i) { return v | GSVector4i(i); } 2294 2295 ALWAYS_INLINE friend GSVector4i operator^(const GSVector4i& v, int i) { return v ^ GSVector4i(i); } 2296 2297 ALWAYS_INLINE friend GSVector4i operator~(const GSVector4i& v) { return GSVector4i(vmvnq_s32(v.v4s)); } 2298 2299 ALWAYS_INLINE static GSVector4i zero() { return GSVector4i(0); } 2300 2301 ALWAYS_INLINE static GSVector4i xffffffff() { return GSVector4i(0xFFFFFFFF); } 2302 2303 ALWAYS_INLINE GSVector4i xyxy(const GSVector4i& v) const { return upl64(v); } 2304 2305 ALWAYS_INLINE GSVector2i xy() const { return GSVector2i(vget_low_s32(v4s)); } 2306 2307 ALWAYS_INLINE GSVector2i zw() const { return GSVector2i(vget_high_s32(v4s)); } 2308 2309 // clang-format off 2310 2311 2312 #define VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \ 2313 ALWAYS_INLINE GSVector4i xs##ys##zs##ws() const { return GSVector4i(__builtin_shufflevector(v4s, v4s, xn, yn, zn, wn)); } 2314 2315 #define 
VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \
2316 VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \
2317 VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \
2318 VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2) \
2319 VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3) \
2320
2321 #define VECTOR4i_SHUFFLE_2(xs, xn, ys, yn) \
2322 VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, x, 0) \
2323 VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, y, 1) \
2324 VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, z, 2) \
2325 VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, w, 3) \
2326
2327 #define VECTOR4i_SHUFFLE_1(xs, xn) \
2328 VECTOR4i_SHUFFLE_2(xs, xn, x, 0) \
2329 VECTOR4i_SHUFFLE_2(xs, xn, y, 1) \
2330 VECTOR4i_SHUFFLE_2(xs, xn, z, 2) \
2331 VECTOR4i_SHUFFLE_2(xs, xn, w, 3) \
2332
2333 VECTOR4i_SHUFFLE_1(x, 0)
2334 VECTOR4i_SHUFFLE_1(y, 1)
2335 VECTOR4i_SHUFFLE_1(z, 2)
2336 VECTOR4i_SHUFFLE_1(w, 3)
2337
2338 // clang-format on
2339 };
2340
2341 class alignas(16) GSVector4
2342 {
2343 struct cxpr_init_tag
2344 {
2345 };
2346 static constexpr cxpr_init_tag cxpr_init{};
2347
2348 constexpr GSVector4(cxpr_init_tag, float x, float y, float z, float w) : F32{x, y, z, w} {}
2349
2350 constexpr GSVector4(cxpr_init_tag, int x, int y, int z, int w) : I32{x, y, z, w} {}
2351
2352 constexpr GSVector4(cxpr_init_tag, u64 x, u64 y) : U64{x, y} {}
2353
2354 constexpr GSVector4(cxpr_init_tag, double x, double y) : F64{x, y} {}
2355
2356 public:
2357 union
2358 {
2359 struct
2360 {
2361 float x, y, z, w;
2362 };
2363 struct
2364 {
2365 float r, g, b, a;
2366 };
2367 struct
2368 {
2369 float left, top, right, bottom;
2370 };
2371 float F32[4];
2372 double F64[2];
2373 s8 I8[16];
2374 s16 I16[8];
2375 s32 I32[4];
2376 s64 I64[2];
2377 u8 U8[16];
2378 u16 U16[8];
2379 u32 U32[4];
2380 u64 U64[2];
2381 float32x4_t v4s;
2382 };
2383
2384 GSVector4() = default;
2385
2386 constexpr static GSVector4 cxpr(float x, float y, float z, float w) { return GSVector4(cxpr_init, x, y, z, w); }
2387
2388 constexpr static GSVector4 cxpr(float x) { return GSVector4(cxpr_init, x, x, x, x); }
2389
2390 constexpr static GSVector4 cxpr(int x, int y, int z, int w) { return GSVector4(cxpr_init, x, y, z, w); }
2391
2392 constexpr static GSVector4 cxpr(int x) { return GSVector4(cxpr_init, x, x, x, x); }
2393
2394 constexpr static GSVector4 cxpr64(u64 x, u64 y) { return GSVector4(cxpr_init, x, y); }
2395
2396 constexpr static GSVector4 cxpr64(u64 x) { return GSVector4(cxpr_init, x, x); }
2397
2398 constexpr static GSVector4 cxpr64(double x, double y) { return GSVector4(cxpr_init, x, y); }
2399
2400 constexpr static GSVector4 cxpr64(double x) { return GSVector4(cxpr_init, x, x); }
2401
2402 ALWAYS_INLINE GSVector4(float x, float y, float z, float w)
2403 {
2404 const float arr[4] = {x, y, z, w};
2405 v4s = vld1q_f32(arr);
2406 }
2407
2408 ALWAYS_INLINE GSVector4(float x, float y) { v4s = vsetq_lane_f32(x, vsetq_lane_f32(y, vdupq_n_f32(0.0f), 1), 0); }
2409
2410 ALWAYS_INLINE GSVector4(int x, int y, int z, int w)
2411 {
2412 const int arr[4] = {x, y, z, w};
2413 v4s = vcvtq_f32_s32(vld1q_s32(arr));
2414 }
2415
2416 ALWAYS_INLINE GSVector4(int x, int y)
2417 {
2418 v4s = vcvtq_f32_s32(vsetq_lane_s32(x, vsetq_lane_s32(y, vdupq_n_s32(0), 1), 0));
2419 }
2420
2421 ALWAYS_INLINE explicit GSVector4(const GSVector2& v) { v4s = vcombine_f32(v.v2s, vcreate_f32(0)); }
2422
2423 ALWAYS_INLINE explicit GSVector4(const GSVector2i& v) { v4s = vcombine_f32(vcvt_f32_s32(v.v2s), vcreate_f32(0)); }
2424
2425 ALWAYS_INLINE constexpr explicit GSVector4(float32x4_t m) : v4s(m) {}
2426
2427 ALWAYS_INLINE explicit
GSVector4(float f) { v4s = vdupq_n_f32(f); } 2428 2429 ALWAYS_INLINE explicit GSVector4(int i) { v4s = vcvtq_f32_s32(vdupq_n_s32(i)); } 2430 2431 ALWAYS_INLINE explicit GSVector4(const GSVector4i& v); 2432 2433 ALWAYS_INLINE static GSVector4 cast(const GSVector4i& v); 2434 2435 ALWAYS_INLINE static GSVector4 f64(double x, double y) 2436 { 2437 #ifdef CPU_ARCH_ARM64 2438 return GSVector4(vreinterpretq_f32_f64(vsetq_lane_f64(y, vdupq_n_f64(x), 1))); 2439 #else 2440 GSVector4 ret; 2441 ret.F64[0] = x; 2442 ret.F64[1] = y; 2443 return ret; 2444 #endif 2445 } 2446 2447 ALWAYS_INLINE static GSVector4 f64(double x) 2448 { 2449 #ifdef CPU_ARCH_ARM64 2450 return GSVector4(vreinterpretq_f32_f64(vdupq_n_f64(x))); 2451 #else 2452 GSVector4 ret; 2453 ret.F64[0] = ret.F64[1] = x; 2454 return ret; 2455 #endif 2456 } 2457 2458 ALWAYS_INLINE void operator=(float f) { v4s = vdupq_n_f32(f); } 2459 2460 ALWAYS_INLINE void operator=(float32x4_t m) { v4s = m; } 2461 2462 ALWAYS_INLINE operator float32x4_t() const { return v4s; } 2463 2464 ALWAYS_INLINE u32 rgba32() const { return GSVector4i(*this).rgba32(); } 2465 2466 ALWAYS_INLINE static GSVector4 rgba32(u32 rgba) { return GSVector4(GSVector4i::load((int)rgba).u8to32()); } 2467 2468 ALWAYS_INLINE static GSVector4 unorm8(u32 rgba) { return rgba32(rgba) * GSVector4::cxpr(1.0f / 255.0f); } 2469 2470 ALWAYS_INLINE GSVector4 abs() const { return GSVector4(vabsq_f32(v4s)); } 2471 2472 ALWAYS_INLINE GSVector4 neg() const { return GSVector4(vnegq_f32(v4s)); } 2473 2474 ALWAYS_INLINE GSVector4 rcp() const { return GSVector4(vrecpeq_f32(v4s)); } 2475 2476 ALWAYS_INLINE GSVector4 rcpnr() const 2477 { 2478 float32x4_t recip = vrecpeq_f32(v4s); 2479 recip = vmulq_f32(recip, vrecpsq_f32(recip, v4s)); 2480 return GSVector4(recip); 2481 } 2482 2483 #ifdef _M_ARM64 2484 2485 ALWAYS_INLINE GSVector4 floor() const { return GSVector4(vrndmq_f32(v4s)); } 2486 2487 ALWAYS_INLINE GSVector4 ceil() const { return GSVector4(vrndpq_f32(v4s)); } 2488 2489 #else 2490 2491 ALWAYS_INLINE GSVector4 floor() const 2492 { 2493 return GSVector4(std::floor(vgetq_lane_f32(v4s, 0)), std::floor(vgetq_lane_f32(v4s, 1)), 2494 std::floor(vgetq_lane_f32(v4s, 2)), std::floor(vgetq_lane_f32(v4s, 3))); 2495 } 2496 2497 ALWAYS_INLINE GSVector4 ceil() const 2498 { 2499 return GSVector4(std::ceil(vgetq_lane_f32(v4s, 0)), std::ceil(vgetq_lane_f32(v4s, 1)), 2500 std::ceil(vgetq_lane_f32(v4s, 2)), std::ceil(vgetq_lane_f32(v4s, 3))); 2501 } 2502 2503 #endif 2504 2505 ALWAYS_INLINE GSVector4 madd(const GSVector4& a, const GSVector4& b) const 2506 { 2507 return GSVector4(vfmaq_f32(b.v4s, v4s, a.v4s)); 2508 } 2509 ALWAYS_INLINE GSVector4 msub(const GSVector4& a, const GSVector4& b) const 2510 { 2511 return GSVector4(vfmsq_f32(b.v4s, v4s, a.v4s)); 2512 } 2513 ALWAYS_INLINE GSVector4 nmadd(const GSVector4& a, const GSVector4& b) const { return b - *this * a; } 2514 ALWAYS_INLINE GSVector4 nmsub(const GSVector4& a, const GSVector4& b) const { return -b - *this * a; } 2515 2516 ALWAYS_INLINE GSVector4 addm(const GSVector4& a, const GSVector4& b) const 2517 { 2518 return a.madd(b, *this); // *this + a * b 2519 } 2520 2521 ALWAYS_INLINE GSVector4 subm(const GSVector4& a, const GSVector4& b) const 2522 { 2523 return a.nmadd(b, *this); // *this - a * b 2524 } 2525 2526 #ifdef CPU_ARCH_ARM64 2527 2528 ALWAYS_INLINE GSVector4 hadd() const { return GSVector4(vpaddq_f32(v4s, v4s)); } 2529 2530 ALWAYS_INLINE GSVector4 hadd(const GSVector4& v) const { return GSVector4(vpaddq_f32(v4s, v.v4s)); } 2531 2532 ALWAYS_INLINE GSVector4 
hsub() const { return GSVector4(vsubq_f32(vuzp1q_f32(v4s, v4s), vuzp2q_f32(v4s, v4s))); }
2533
2534 ALWAYS_INLINE GSVector4 hsub(const GSVector4& v) const
2535 {
2536 return GSVector4(vsubq_f32(vuzp1q_f32(v4s, v.v4s), vuzp2q_f32(v4s, v.v4s)));
2537 }
2538
2539 #else
2540
2541 ALWAYS_INLINE GSVector4 hadd() const
2542 {
2543 const float32x2_t res = vpadd_f32(vget_low_f32(v4s), vget_high_f32(v4s));
2544 return GSVector4(vcombine_f32(res, res));
2545 }
2546
2547 ALWAYS_INLINE GSVector4 hadd(const GSVector4& v) const
2548 {
2549 const float32x2_t res1 = vpadd_f32(vget_low_f32(v4s), vget_high_f32(v4s));
2550 const float32x2_t res2 = vpadd_f32(vget_low_f32(v.v4s), vget_high_f32(v.v4s));
2551 return GSVector4(vcombine_f32(res1, res2));
2552 }
2553
2554 ALWAYS_INLINE GSVector4 hsub() const
2555 {
2556 const float32x4x2_t res = vuzpq_f32(v4s, v4s);
2557 return GSVector4(vsubq_f32(res.val[0], res.val[1]));
2558 }
2559
2560 ALWAYS_INLINE GSVector4 hsub(const GSVector4& v) const
2561 {
2562 const float32x4x2_t res = vuzpq_f32(v4s, v.v4s);
2563 return GSVector4(vsubq_f32(res.val[0], res.val[1]));
2564 }
2565
2566 #endif
2567
2568 ALWAYS_INLINE GSVector4 sat(const GSVector4& a, const GSVector4& b) const { return max(a).min(b); }
2569
2570 ALWAYS_INLINE GSVector4 sat(const GSVector4& a) const
2571 {
2572 #ifdef CPU_ARCH_ARM64
2573 const GSVector4 minv(vreinterpretq_f32_f64(vdupq_laneq_f64(vreinterpretq_f64_f32(a.v4s), 0)));
2574 const GSVector4 maxv(vreinterpretq_f32_f64(vdupq_laneq_f64(vreinterpretq_f64_f32(a.v4s), 1)));
2575 #else
2576 const GSVector4 minv(a.xyxy());
2577 const GSVector4 maxv(a.zwzw());
2578 #endif
2579 return sat(minv, maxv);
2580 }
2581
2582 ALWAYS_INLINE GSVector4 sat(const float scale = 255) const { return sat(zero(), GSVector4(scale)); }
2583
2584 ALWAYS_INLINE GSVector4 clamp(const float scale = 255) const { return min(GSVector4(scale)); }
2585
2586 ALWAYS_INLINE GSVector4 min(const GSVector4& a) const { return GSVector4(vminq_f32(v4s, a.v4s)); }
2587
2588 ALWAYS_INLINE GSVector4 max(const GSVector4& a) const { return GSVector4(vmaxq_f32(v4s, a.v4s)); }
2589
2590 template<int mask>
2591 ALWAYS_INLINE GSVector4 blend32(const GSVector4& a) const
2592 {
2593 return GSVector4(__builtin_shufflevector(v4s, a.v4s, (mask & 1) ? 4 : 0, (mask & 2) ? 5 : 1, (mask & 4) ? 6 : 2,
2594 (mask & 8) ?
7 : 3)); 2595 } 2596 2597 ALWAYS_INLINE GSVector4 blend32(const GSVector4& a, const GSVector4& mask) const 2598 { 2599 // duplicate sign bit across and bit select 2600 const uint32x4_t bitmask = vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_f32(mask.v4s), 31)); 2601 return GSVector4(vbslq_f32(bitmask, a.v4s, v4s)); 2602 } 2603 2604 #ifdef CPU_ARCH_ARM64 2605 2606 ALWAYS_INLINE GSVector4 upl(const GSVector4& a) const { return GSVector4(vzip1q_f32(v4s, a.v4s)); } 2607 2608 ALWAYS_INLINE GSVector4 uph(const GSVector4& a) const { return GSVector4(vzip2q_f32(v4s, a.v4s)); } 2609 2610 ALWAYS_INLINE GSVector4 upld(const GSVector4& a) const 2611 { 2612 return GSVector4(vreinterpretq_f32_f64(vzip1q_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(a.v4s)))); 2613 } 2614 2615 ALWAYS_INLINE GSVector4 uphd(const GSVector4& a) const 2616 { 2617 return GSVector4(vreinterpretq_f32_f64(vzip2q_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(a.v4s)))); 2618 } 2619 2620 #else 2621 2622 ALWAYS_INLINE GSVector4 upl(const GSVector4& a) const 2623 { 2624 const float32x2x2_t res = vzip_f32(vget_low_f32(v4s), vget_low_f32(a.v4s)); 2625 return GSVector4(vcombine_f32(res.val[0], res.val[1])); 2626 } 2627 2628 ALWAYS_INLINE GSVector4 uph(const GSVector4& a) const 2629 { 2630 const float32x2x2_t res = vzip_f32(vget_high_f32(v4s), vget_high_f32(a.v4s)); 2631 return GSVector4(vcombine_f32(res.val[0], res.val[1])); 2632 } 2633 2634 ALWAYS_INLINE GSVector4 upld(const GSVector4& a) const 2635 { 2636 return GSVector4(vreinterpretq_f32_s64( 2637 vcombine_s64(vget_low_s64(vreinterpretq_s64_f32(v4s)), vget_low_s64(vreinterpretq_s64_f32(a.v4s))))); 2638 } 2639 2640 ALWAYS_INLINE GSVector4 uphd(const GSVector4& a) const 2641 { 2642 return GSVector4(vreinterpretq_f32_s64( 2643 vcombine_s64(vget_high_s64(vreinterpretq_s64_f32(v4s)), vget_high_s64(vreinterpretq_s64_f32(a.v4s))))); 2644 } 2645 2646 #endif 2647 2648 ALWAYS_INLINE GSVector4 l2h(const GSVector4& a) const 2649 { 2650 return GSVector4(vcombine_f32(vget_low_f32(v4s), vget_low_f32(a.v4s))); 2651 } 2652 2653 ALWAYS_INLINE GSVector4 h2l(const GSVector4& a) const 2654 { 2655 return GSVector4(vcombine_f32(vget_high_f32(v4s), vget_high_f32(a.v4s))); 2656 } 2657 2658 ALWAYS_INLINE GSVector4 andnot(const GSVector4& v) const 2659 { 2660 return GSVector4(vreinterpretq_f32_s32(vbicq_s32(vreinterpretq_s32_f32(v4s), vreinterpretq_s32_f32(v.v4s)))); 2661 } 2662 2663 ALWAYS_INLINE int mask() const 2664 { 2665 #ifdef CPU_ARCH_ARM64 2666 static constexpr const int32_t shifts[] = {0, 1, 2, 3}; 2667 return static_cast<int>(vaddvq_u32(vshlq_u32(vshrq_n_u32(vreinterpretq_u32_f32(v4s), 31), vld1q_s32(shifts)))); 2668 #else 2669 // sse2neon again 2670 uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(vreinterpretq_u32_f32(v4s), 31)); 2671 uint8x16_t paired = vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31)); 2672 return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2); 2673 #endif 2674 } 2675 2676 ALWAYS_INLINE bool alltrue() const 2677 { 2678 // return mask() == 0xf; 2679 return ~(vgetq_lane_u64(vreinterpretq_u64_f32(v4s), 0) & vgetq_lane_u64(vreinterpretq_u64_f32(v4s), 1)) == 0; 2680 } 2681 2682 ALWAYS_INLINE bool allfalse() const 2683 { 2684 return (vgetq_lane_u64(vreinterpretq_u64_f32(v4s), 0) | vgetq_lane_u64(vreinterpretq_u64_f32(v4s), 1)) == 0; 2685 } 2686 2687 ALWAYS_INLINE GSVector4 replace_nan(const GSVector4& v) const { return v.blend32(*this, *this == *this); } 2688 2689 template<int src, int dst> 2690 ALWAYS_INLINE GSVector4 insert32(const 
GSVector4& v) const 2691 { 2692 #ifdef CPU_ARCH_ARM64 2693 return GSVector4(vcopyq_laneq_f32(v4s, dst, v.v4s, src)); 2694 #else 2695 return GSVector4(vsetq_lane_f32(vgetq_lane_f32(v.v4s, src), v4s, dst)); 2696 #endif 2697 } 2698 2699 template<int i> 2700 ALWAYS_INLINE int extract32() const 2701 { 2702 return vgetq_lane_s32(vreinterpretq_s32_f32(v4s), i); 2703 } 2704 2705 template<int dst> 2706 ALWAYS_INLINE GSVector4 insert64(double v) const 2707 { 2708 #ifdef CPU_ARCH_ARM64 2709 return GSVector4(vreinterpretq_f32_f64(vsetq_lane_f64(v, vreinterpretq_f64_f32(v4s), dst))); 2710 #else 2711 GSVector4 ret; 2712 ret.F64[dst] = v; 2713 return ret; 2714 #endif 2715 } 2716 2717 template<int src> 2718 ALWAYS_INLINE double extract64() const 2719 { 2720 #ifdef CPU_ARCH_ARM64 2721 return vgetq_lane_f64(vreinterpretq_f64_f32(v4s), src); 2722 #else 2723 return F64[src]; 2724 #endif 2725 } 2726 2727 ALWAYS_INLINE static GSVector4 zero() { return GSVector4(vdupq_n_f32(0.0f)); } 2728 2729 ALWAYS_INLINE static GSVector4 xffffffff() { return GSVector4(vreinterpretq_f32_u32(vdupq_n_u32(0xFFFFFFFFu))); } 2730 2731 ALWAYS_INLINE static GSVector4 loadl(const void* p) 2732 { 2733 return GSVector4(vcombine_f32(vld1_f32((const float*)p), vcreate_f32(0))); 2734 } 2735 2736 ALWAYS_INLINE static GSVector4 load(float f) { return GSVector4(vsetq_lane_f32(f, vmovq_n_f32(0.0f), 0)); } 2737 2738 template<bool aligned> 2739 ALWAYS_INLINE static GSVector4 load(const void* p) 2740 { 2741 return GSVector4(vld1q_f32((const float*)p)); 2742 } 2743 2744 ALWAYS_INLINE static void storent(void* p, const GSVector4& v) { vst1q_f32((float*)p, v.v4s); } 2745 2746 ALWAYS_INLINE static void storel(void* p, const GSVector4& v) 2747 { 2748 #ifdef CPU_ARCH_ARM64 2749 vst1_f64((double*)p, vget_low_f64(vreinterpretq_f64_f32(v.v4s))); 2750 #else 2751 vst1_s64((s64*)p, vget_low_s64(vreinterpretq_s64_f32(v.v4s))); 2752 #endif 2753 } 2754 2755 ALWAYS_INLINE static void storeh(void* p, const GSVector4& v) 2756 { 2757 #ifdef CPU_ARCH_ARM64 2758 vst1_f64((double*)p, vget_high_f64(vreinterpretq_f64_f32(v.v4s))); 2759 #else 2760 vst1_s64((s64*)p, vget_high_s64(vreinterpretq_s64_f32(v.v4s))); 2761 #endif 2762 } 2763 2764 template<bool aligned> 2765 ALWAYS_INLINE static void store(void* p, const GSVector4& v) 2766 { 2767 vst1q_f32((float*)p, v.v4s); 2768 } 2769 2770 ALWAYS_INLINE static void store(float* p, const GSVector4& v) { vst1q_lane_f32(p, v.v4s, 0); } 2771 2772 ALWAYS_INLINE GSVector4 operator-() const { return neg(); } 2773 2774 ALWAYS_INLINE void operator+=(const GSVector4& v) { v4s = vaddq_f32(v4s, v.v4s); } 2775 ALWAYS_INLINE void operator-=(const GSVector4& v) { v4s = vsubq_f32(v4s, v.v4s); } 2776 ALWAYS_INLINE void operator*=(const GSVector4& v) { v4s = vmulq_f32(v4s, v.v4s); } 2777 ALWAYS_INLINE void operator/=(const GSVector4& v) 2778 { 2779 #ifdef CPU_ARCH_ARM64 2780 v4s = vdivq_f32(v4s, v.v4s); 2781 #else 2782 *this = 2783 GSVector4(vgetq_lane_f32(v4s, 0) / vgetq_lane_f32(v.v4s, 0), vgetq_lane_f32(v4s, 1) / vgetq_lane_f32(v.v4s, 1), 2784 vgetq_lane_f32(v4s, 2) / vgetq_lane_f32(v.v4s, 2), vgetq_lane_f32(v4s, 3) / vgetq_lane_f32(v.v4s, 3)); 2785 #endif 2786 } 2787 2788 ALWAYS_INLINE void operator+=(float f) { *this += GSVector4(f); } 2789 ALWAYS_INLINE void operator-=(float f) { *this -= GSVector4(f); } 2790 ALWAYS_INLINE void operator*=(float f) { *this *= GSVector4(f); } 2791 ALWAYS_INLINE void operator/=(float f) 2792 { 2793 #ifdef CPU_ARCH_ARM64 2794 *this /= GSVector4(f); 2795 #else 2796 *this = GSVector4(vgetq_lane_f32(v4s, 0) / f, 
vgetq_lane_f32(v4s, 1) / f, vgetq_lane_f32(v4s, 2) / f, 2797 vgetq_lane_f32(v4s, 3) / f); 2798 #endif 2799 } 2800 2801 ALWAYS_INLINE void operator&=(const GSVector4& v) 2802 { 2803 v4s = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(v4s), vreinterpretq_u32_f32(v.v4s))); 2804 } 2805 2806 ALWAYS_INLINE void operator|=(const GSVector4& v) 2807 { 2808 v4s = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(v4s), vreinterpretq_u32_f32(v.v4s))); 2809 } 2810 2811 ALWAYS_INLINE void operator^=(const GSVector4& v) 2812 { 2813 v4s = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v4s), vreinterpretq_u32_f32(v.v4s))); 2814 } 2815 2816 ALWAYS_INLINE friend GSVector4 operator+(const GSVector4& v1, const GSVector4& v2) 2817 { 2818 return GSVector4(vaddq_f32(v1.v4s, v2.v4s)); 2819 } 2820 2821 ALWAYS_INLINE friend GSVector4 operator-(const GSVector4& v1, const GSVector4& v2) 2822 { 2823 return GSVector4(vsubq_f32(v1.v4s, v2.v4s)); 2824 } 2825 2826 ALWAYS_INLINE friend GSVector4 operator*(const GSVector4& v1, const GSVector4& v2) 2827 { 2828 return GSVector4(vmulq_f32(v1.v4s, v2.v4s)); 2829 } 2830 2831 ALWAYS_INLINE friend GSVector4 operator/(const GSVector4& v1, const GSVector4& v2) 2832 { 2833 #ifdef CPU_ARCH_ARM64 2834 return GSVector4(vdivq_f32(v1.v4s, v2.v4s)); 2835 #else 2836 return GSVector4( 2837 vgetq_lane_f32(v1.v4s, 0) / vgetq_lane_f32(v2.v4s, 0), vgetq_lane_f32(v1.v4s, 1) / vgetq_lane_f32(v2.v4s, 1), 2838 vgetq_lane_f32(v1.v4s, 2) / vgetq_lane_f32(v2.v4s, 2), vgetq_lane_f32(v1.v4s, 3) / vgetq_lane_f32(v2.v4s, 3)); 2839 #endif 2840 } 2841 2842 ALWAYS_INLINE friend GSVector4 operator+(const GSVector4& v, float f) { return v + GSVector4(f); } 2843 ALWAYS_INLINE friend GSVector4 operator-(const GSVector4& v, float f) { return v - GSVector4(f); } 2844 ALWAYS_INLINE friend GSVector4 operator*(const GSVector4& v, float f) { return v * GSVector4(f); } 2845 ALWAYS_INLINE friend GSVector4 operator/(const GSVector4& v, float f) 2846 { 2847 #ifdef CPU_ARCH_ARM64 2848 return v / GSVector4(f); 2849 #else 2850 return GSVector4(vgetq_lane_f32(v.v4s, 0) / f, vgetq_lane_f32(v.v4s, 1) / f, vgetq_lane_f32(v.v4s, 2) / f, 2851 vgetq_lane_f32(v.v4s, 3) / f); 2852 #endif 2853 } 2854 2855 ALWAYS_INLINE friend GSVector4 operator&(const GSVector4& v1, const GSVector4& v2) 2856 { 2857 return GSVector4(vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(v1.v4s), vreinterpretq_u32_f32(v2.v4s)))); 2858 } 2859 2860 ALWAYS_INLINE friend GSVector4 operator|(const GSVector4& v1, const GSVector4& v2) 2861 { 2862 return GSVector4(vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(v1.v4s), vreinterpretq_u32_f32(v2.v4s)))); 2863 } 2864 2865 ALWAYS_INLINE friend GSVector4 operator^(const GSVector4& v1, const GSVector4& v2) 2866 { 2867 return GSVector4(vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v1.v4s), vreinterpretq_u32_f32(v2.v4s)))); 2868 } 2869 2870 ALWAYS_INLINE friend GSVector4 operator==(const GSVector4& v1, const GSVector4& v2) 2871 { 2872 return GSVector4(vreinterpretq_f32_u32(vceqq_f32(v1.v4s, v2.v4s))); 2873 } 2874 2875 ALWAYS_INLINE friend GSVector4 operator!=(const GSVector4& v1, const GSVector4& v2) 2876 { 2877 // NEON has no != 2878 return GSVector4(vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(v1.v4s, v2.v4s)))); 2879 } 2880 2881 ALWAYS_INLINE friend GSVector4 operator>(const GSVector4& v1, const GSVector4& v2) 2882 { 2883 return GSVector4(vreinterpretq_f32_u32(vcgtq_f32(v1.v4s, v2.v4s))); 2884 } 2885 2886 ALWAYS_INLINE friend GSVector4 operator<(const GSVector4& v1, const GSVector4& v2) 
2887 {
2888 return GSVector4(vreinterpretq_f32_u32(vcltq_f32(v1.v4s, v2.v4s)));
2889 }
2890
2891 ALWAYS_INLINE friend GSVector4 operator>=(const GSVector4& v1, const GSVector4& v2)
2892 {
2893 return GSVector4(vreinterpretq_f32_u32(vcgeq_f32(v1.v4s, v2.v4s)));
2894 }
2895
2896 ALWAYS_INLINE friend GSVector4 operator<=(const GSVector4& v1, const GSVector4& v2)
2897 {
2898 return GSVector4(vreinterpretq_f32_u32(vcleq_f32(v1.v4s, v2.v4s)));
2899 }
2900
2901 ALWAYS_INLINE GSVector4 mul64(const GSVector4& v) const
2902 {
2903 #ifdef CPU_ARCH_ARM64
2904 return GSVector4(vreinterpretq_f32_f64(vmulq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
2905 #else
2906 return GSVector4::f64(F64[0] * v.F64[0], F64[1] * v.F64[1]);
2907 #endif
2908 }
2909
2910 ALWAYS_INLINE GSVector4 add64(const GSVector4& v) const
2911 {
2912 #ifdef CPU_ARCH_ARM64
2913 return GSVector4(vreinterpretq_f32_f64(vaddq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
2914 #else
2915 return GSVector4::f64(F64[0] + v.F64[0], F64[1] + v.F64[1]);
2916 #endif
2917 }
2918
2919 ALWAYS_INLINE GSVector4 sub64(const GSVector4& v) const
2920 {
2921 #ifdef CPU_ARCH_ARM64
2922 return GSVector4(vreinterpretq_f32_f64(vsubq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
2923 #else
2924 return GSVector4::f64(F64[0] - v.F64[0], F64[1] - v.F64[1]);
2925 #endif
2926 }
2927
2928 ALWAYS_INLINE GSVector4 div64(const GSVector4& v) const
2929 {
2930 #ifdef CPU_ARCH_ARM64
2931 return GSVector4(vreinterpretq_f32_f64(vdivq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
2932 #else
2933 return GSVector4::f64(F64[0] / v.F64[0], F64[1] / v.F64[1]);
2934 #endif
2935 }
2936
2937 ALWAYS_INLINE GSVector4 gt64(const GSVector4& v) const
2938 {
2939 #ifdef CPU_ARCH_ARM64
2940 return GSVector4(vreinterpretq_f32_f64(vcgtq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
2941 #else
2942 GSVector4 ret;
2943 ret.U64[0] = (F64[0] > v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
2944 ret.U64[1] = (F64[1] > v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
2945 return ret;
2946 #endif
2947 }
2948
2949 ALWAYS_INLINE GSVector4 eq64(const GSVector4& v) const
2950 {
2951 #ifdef CPU_ARCH_ARM64
2952 return GSVector4(vreinterpretq_f32_f64(vceqq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
2953 #else
2954 GSVector4 ret;
2955 ret.U64[0] = (F64[0] == v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
2956 ret.U64[1] = (F64[1] == v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
2957 return ret;
2958 #endif
2959 }
2960
2961 ALWAYS_INLINE GSVector4 lt64(const GSVector4& v) const
2962 {
2963 #ifdef CPU_ARCH_ARM64
2964 return GSVector4(vreinterpretq_f32_f64(vcltq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
2965 #else
2966 GSVector4 ret;
2967 ret.U64[0] = (F64[0] < v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
2968 ret.U64[1] = (F64[1] < v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
2969 return ret;
2970 #endif
2971 }
2972
2973 ALWAYS_INLINE GSVector4 ge64(const GSVector4& v) const
2974 {
2975 #ifdef CPU_ARCH_ARM64
2976 return GSVector4(vreinterpretq_f32_f64(vcgeq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
2977 #else
2978 GSVector4 ret;
2979 ret.U64[0] = (F64[0] >= v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
2980 ret.U64[1] = (F64[1] >= v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
2981 return ret;
2982 #endif
2983 }
2984
2985 ALWAYS_INLINE GSVector4 le64(const GSVector4& v) const
2986 {
2987 #ifdef CPU_ARCH_ARM64
2988 return GSVector4(vreinterpretq_f32_f64(vcleq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
2989 #else
2990 GSVector4 ret;
2991 ret.U64[0] = (F64[0] <= v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
2992 ret.U64[1] = (F64[1] <= v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
2993 return ret;
2994 #endif
2995 }
2996
2997 ALWAYS_INLINE GSVector4 min64(const GSVector4& v) const
2998 {
2999 #ifdef CPU_ARCH_ARM64
3000 return GSVector4(vreinterpretq_f32_f64(vminq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
3001 #else
3002 return GSVector4::f64(std::min(F64[0], v.F64[0]), std::min(F64[1], v.F64[1]));
3003 #endif
3004 }
3005
3006 ALWAYS_INLINE GSVector4 max64(const GSVector4& v) const
3007 {
3008 #ifdef CPU_ARCH_ARM64
3009 return GSVector4(vreinterpretq_f32_f64(vmaxq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
3010 #else
3011 return GSVector4::f64(std::max(F64[0], v.F64[0]), std::max(F64[1], v.F64[1]));
3012 #endif
3013 }
3014
3015 ALWAYS_INLINE GSVector4 abs64() const { return *this & GSVector4::cxpr64(static_cast<u64>(0x7FFFFFFFFFFFFFFFULL)); }
3016
3017 ALWAYS_INLINE GSVector4 neg64() const { return *this ^ GSVector4::cxpr64(static_cast<u64>(0x8000000000000000ULL)); }
3018
3019 ALWAYS_INLINE GSVector4 sqrt64() const
3020 {
3021 #ifdef CPU_ARCH_ARM64
3022 return GSVector4(vreinterpretq_f32_f64(vsqrtq_f64(vreinterpretq_f64_f32(v4s))));
3023 #else
3024 return GSVector4::f64(std::sqrt(F64[0]), std::sqrt(F64[1]));
3025 #endif
3026 }
3027
3028 ALWAYS_INLINE GSVector4 sqr64() const
3029 {
3030 #ifdef CPU_ARCH_ARM64
3031 return GSVector4(vreinterpretq_f32_f64(vmulq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v4s))));
3032 #else
3033 return GSVector4::f64(F64[0] * F64[0], F64[1] * F64[1]);
3034 #endif
3035 }
3036
3037 ALWAYS_INLINE GSVector4 floor64() const
3038 {
3039 #ifdef CPU_ARCH_ARM64
3040 return GSVector4(vreinterpretq_f32_f64(vrndmq_f64(vreinterpretq_f64_f32(v4s))));
3041 #else
3042 return GSVector4::f64(std::floor(F64[0]), std::floor(F64[1]));
3043 #endif
3044 }
3045
3046 ALWAYS_INLINE static GSVector4 f32to64(const GSVector4& v)
3047 {
3048 #ifdef CPU_ARCH_ARM64
3049 return GSVector4(vreinterpretq_f32_f64(vcvt_f64_f32(vget_low_f32(v.v4s))));
3050 #else
3051 return GSVector4::f64(static_cast<double>(vgetq_lane_f32(v.v4s, 0)), static_cast<double>(vgetq_lane_f32(v.v4s, 1)));
3052 #endif
3053 }
3054
3055 ALWAYS_INLINE static GSVector4 f32to64(const void* p)
3056 {
3057 #ifdef CPU_ARCH_ARM64
3058 return GSVector4(vreinterpretq_f32_f64(vcvt_f64_f32(vld1_f32(static_cast<const float*>(p)))));
3059 #else
3060 const float* fp = static_cast<const float*>(p);
3061 return GSVector4::f64(static_cast<double>(fp[0]), static_cast<double>(fp[1]));
3062 #endif
3063 }
3064
3065 ALWAYS_INLINE GSVector4i f64toi32() const
3066 {
3067 #ifdef CPU_ARCH_ARM64
3068 const s32 low = static_cast<s32>(vgetq_lane_f64(vreinterpretq_f64_f32(v4s), 0));
3069 const s32 high = static_cast<s32>(vgetq_lane_f64(vreinterpretq_f64_f32(v4s), 1));
3070 #else
3071 const s32 low = static_cast<s32>(F64[0]);
3072 const s32 high = static_cast<s32>(F64[1]);
3073 #endif
3074 return GSVector4i(vsetq_lane_s32(high, vsetq_lane_s32(low, vdupq_n_s32(0), 0), 1));
3075 }
3076
3077 // clang-format off
3078
3079 #define VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
3080 ALWAYS_INLINE GSVector4 xs##ys##zs##ws() const { return
GSVector4(__builtin_shufflevector(v4s, v4s, xn, yn, zn, wn)); } \ 3081 ALWAYS_INLINE GSVector4 xs##ys##zs##ws(const GSVector4& v) const { return GSVector4(__builtin_shufflevector(v4s, v.v4s, xn, yn, 4 + zn, 4 + wn)); } 3082 3083 #define VECTOR4_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \ 3084 VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \ 3085 VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \ 3086 VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2) \ 3087 VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3) \ 3088 3089 #define VECTOR4_SHUFFLE_2(xs, xn, ys, yn) \ 3090 VECTOR4_SHUFFLE_3(xs, xn, ys, yn, x, 0) \ 3091 VECTOR4_SHUFFLE_3(xs, xn, ys, yn, y, 1) \ 3092 VECTOR4_SHUFFLE_3(xs, xn, ys, yn, z, 2) \ 3093 VECTOR4_SHUFFLE_3(xs, xn, ys, yn, w, 3) \ 3094 3095 #define VECTOR4_SHUFFLE_1(xs, xn) \ 3096 VECTOR4_SHUFFLE_2(xs, xn, x, 0) \ 3097 VECTOR4_SHUFFLE_2(xs, xn, y, 1) \ 3098 VECTOR4_SHUFFLE_2(xs, xn, z, 2) \ 3099 VECTOR4_SHUFFLE_2(xs, xn, w, 3) \ 3100 3101 VECTOR4_SHUFFLE_1(x, 0) 3102 VECTOR4_SHUFFLE_1(y, 1) 3103 VECTOR4_SHUFFLE_1(z, 2) 3104 VECTOR4_SHUFFLE_1(w, 3) 3105 3106 // clang-format on 3107 3108 ALWAYS_INLINE GSVector4 broadcast32() const 3109 { 3110 #ifdef CPU_ARCH_ARM64 3111 return GSVector4(vdupq_laneq_f32(v4s, 0)); 3112 #else 3113 return xxxx(); 3114 #endif 3115 } 3116 3117 ALWAYS_INLINE static GSVector4 broadcast32(const GSVector4& v) 3118 { 3119 #ifdef CPU_ARCH_ARM64 3120 return GSVector4(vdupq_laneq_f32(v.v4s, 0)); 3121 #else 3122 return v.xxxx(); 3123 #endif 3124 } 3125 3126 ALWAYS_INLINE static GSVector4 broadcast32(const void* f) { return GSVector4(vld1q_dup_f32((const float*)f)); } 3127 3128 ALWAYS_INLINE static GSVector4 broadcast64(const void* f) 3129 { 3130 #ifdef CPU_ARCH_ARM64 3131 return GSVector4(vreinterpretq_f32_f64(vld1q_dup_f64((const double*)f))); 3132 #else 3133 return GSVector4(vreinterpretq_f32_s64(vld1q_dup_s64((const s64*)f))); 3134 #endif 3135 } 3136 }; 3137 3138 ALWAYS_INLINE GSVector2i::GSVector2i(const GSVector2& v) 3139 { 3140 v2s = vcvt_s32_f32(v.v2s); 3141 } 3142 3143 ALWAYS_INLINE GSVector2::GSVector2(const GSVector2i& v) 3144 { 3145 v2s = vcvt_f32_s32(v.v2s); 3146 } 3147 3148 ALWAYS_INLINE GSVector2i GSVector2i::cast(const GSVector2& v) 3149 { 3150 return GSVector2i(vreinterpret_s32_f32(v.v2s)); 3151 } 3152 3153 ALWAYS_INLINE GSVector2 GSVector2::cast(const GSVector2i& v) 3154 { 3155 return GSVector2(vreinterpret_f32_s32(v.v2s)); 3156 } 3157 3158 ALWAYS_INLINE GSVector4i::GSVector4i(const GSVector4& v) 3159 { 3160 v4s = vcvtq_s32_f32(v.v4s); 3161 } 3162 3163 ALWAYS_INLINE GSVector4::GSVector4(const GSVector4i& v) 3164 { 3165 v4s = vcvtq_f32_s32(v.v4s); 3166 } 3167 3168 ALWAYS_INLINE GSVector4i GSVector4i::cast(const GSVector4& v) 3169 { 3170 return GSVector4i(vreinterpretq_s32_f32(v.v4s)); 3171 } 3172 3173 ALWAYS_INLINE GSVector4 GSVector4::cast(const GSVector4i& v) 3174 { 3175 return GSVector4(vreinterpretq_f32_s32(v.v4s)); 3176 }
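// Usage sketch (illustrative only; the function and variable names below are not part of this header, and the
// loop assumes the byte count is a multiple of 16):
//
//   // Average two streams of RGBA8 pixels, 16 bytes (four pixels) at a time.
//   void average_rgba8(const u8* a, const u8* b, u8* dst, u32 count)
//   {
//     for (u32 i = 0; i < count; i += 16)
//     {
//       const GSVector4i va = GSVector4i::load<false>(a + i);
//       const GSVector4i vb = GSVector4i::load<false>(b + i);
//       GSVector4i::store<false>(dst + i, va.avg8(vb));
//     }
//   }
//
//   // Scale a packed colour in float space and clamp it back to the 0..255 range.
//   const u32 packed = (GSVector4::rgba32(0x80604020u) * GSVector4(1.5f)).sat().rgba32();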