intrin.h (2748B)
1 // SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin <stenzek@gmail.com> 2 // SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0) 3 4 // Includes appropriate intrinsic header based on platform. 5 6 #pragma once 7 8 #include "align.h" 9 #include "types.h" 10 11 #include <type_traits> 12 13 #if defined(CPU_ARCH_X86) || defined(CPU_ARCH_X64) 14 #define CPU_ARCH_SSE 1 15 #include <emmintrin.h> 16 #include <tmmintrin.h> 17 #include <smmintrin.h> 18 #include <immintrin.h> 19 20 #if defined(__AVX2__) 21 #define CPU_ARCH_AVX 1 22 #define CPU_ARCH_AVX2 1 23 #define CPU_ARCH_SSE41 1 24 #elif defined(__AVX__) 25 #define CPU_ARCH_AVX 1 26 #define CPU_ARCH_SSE41 1 27 #elif defined(__SSE4_1__) || defined(_MSC_VER) 28 #define CPU_ARCH_SSE41 1 29 #endif 30 #elif defined(CPU_ARCH_ARM32) || defined(CPU_ARCH_ARM64) 31 #define CPU_ARCH_NEON 1 32 #if defined(_MSC_VER) && !defined(__clang__) 33 #include <arm64_neon.h> 34 #else 35 #include <arm_neon.h> 36 #endif 37 #endif 38 39 #ifdef __APPLE__ 40 #include <stdlib.h> // alloca 41 #else 42 #include <malloc.h> // alloca 43 #endif 44 45 /// Only currently using 128-bit vectors at max. 46 static constexpr u32 VECTOR_ALIGNMENT = 16; 47 48 /// Aligns allocation/pitch size to preferred host size. 49 template<typename T> 50 ALWAYS_INLINE static T VectorAlign(T value) 51 { 52 return Common::AlignUpPow2(value, VECTOR_ALIGNMENT); 53 } 54 55 template<typename T> 56 ALWAYS_INLINE_RELEASE static void MemsetPtrs(T* ptr, T value, u32 count) 57 { 58 static_assert(std::is_pointer_v<T>, "T is pointer type"); 59 static_assert(sizeof(T) == sizeof(void*), "T isn't a fat pointer"); 60 T* dest = ptr; 61 62 #if defined(CPU_ARCH_SSE) || defined(CPU_ARCH_NEON) 63 static constexpr u32 PTRS_PER_VECTOR = (16 / sizeof(T)); 64 const u32 aligned_count = count / PTRS_PER_VECTOR; 65 const u32 remaining_count = count % PTRS_PER_VECTOR; 66 67 #if defined(CPU_ARCH_SSE) 68 const __m128i svalue = _mm_set1_epi64x(reinterpret_cast<intptr_t>(value)); 69 #elif defined(CPU_ARCH_NEON) && defined(CPU_ARCH_ARM64) 70 const uint64x2_t svalue = vdupq_n_u64(reinterpret_cast<uintptr_t>(value)); 71 #elif defined(CPU_ARCH_NEON) && defined(CPU_ARCH_ARM32) 72 const uint32x4_t svalue = vdupq_n_u32(reinterpret_cast<uintptr_t>(value)); 73 #endif 74 75 // Clang gets way too eager and tries to unroll these, emitting thousands of instructions. 76 #ifdef __clang__ 77 #pragma clang loop unroll(disable) 78 #endif 79 for (u32 i = 0; i < aligned_count; i++) 80 { 81 #if defined(CPU_ARCH_SSE) 82 _mm_store_si128(reinterpret_cast<__m128i*>(dest), svalue); 83 #elif defined(CPU_ARCH_NEON) && defined(CPU_ARCH_ARM64) 84 vst1q_u64(reinterpret_cast<u64*>(dest), svalue); 85 #elif defined(CPU_ARCH_NEON) && defined(CPU_ARCH_ARM32) 86 vst1q_u32(reinterpret_cast<u32*>(dest), svalue); 87 #endif 88 dest += PTRS_PER_VECTOR; 89 } 90 #else 91 const u32 remaining_count = count; 92 #endif 93 94 for (u32 i = 0; i < remaining_count; i++) 95 *(dest++) = value; 96 }