duckstation

duckstation, archived from the revision just before upstream changed it to a proprietary software project; this version is the libre one.
git clone https://git.neptards.moe/u3shit/duckstation.git
Log | Files | Refs | README | LICENSE

intrin.h (2748B)


      1 // SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin <stenzek@gmail.com>
      2 // SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
      3 
      4 // Includes appropriate intrinsic header based on platform.
      5 
      6 #pragma once
      7 
      8 #include "align.h"
      9 #include "types.h"
     10 
     11 #include <type_traits>
     12 
     13 #if defined(CPU_ARCH_X86) || defined(CPU_ARCH_X64)
     14 #define CPU_ARCH_SSE 1
     15 #include <emmintrin.h>
     16 #include <tmmintrin.h>
     17 #include <smmintrin.h>
     18 #include <immintrin.h>
     19 
     20 #if defined(__AVX2__)
     21 #define CPU_ARCH_AVX 1
     22 #define CPU_ARCH_AVX2 1
     23 #define CPU_ARCH_SSE41 1
     24 #elif defined(__AVX__)
     25 #define CPU_ARCH_AVX 1
     26 #define CPU_ARCH_SSE41 1
     27 #elif defined(__SSE4_1__) || defined(_MSC_VER)
     28 #define CPU_ARCH_SSE41 1
     29 #endif
     30 #elif defined(CPU_ARCH_ARM32) || defined(CPU_ARCH_ARM64)
     31 #define CPU_ARCH_NEON 1
     32 #if defined(_MSC_VER) && !defined(__clang__)
     33 #include <arm64_neon.h>
     34 #else
     35 #include <arm_neon.h>
     36 #endif
     37 #endif
     38 
     39 #ifdef __APPLE__
     40 #include <stdlib.h> // alloca
     41 #else
     42 #include <malloc.h> // alloca
     43 #endif
     44 
     45 /// Vector alignment (16, i.e. 128 bits): 128-bit vectors are currently the widest in use.
     46 static constexpr u32 VECTOR_ALIGNMENT = 16;
     47 
     48 /// Aligns allocation/pitch size to preferred host size.
     49 template<typename T>
     50 ALWAYS_INLINE static T VectorAlign(T value)
     51 {
     52   return Common::AlignUpPow2(value, VECTOR_ALIGNMENT);
     53 }
     54 
     55 template<typename T>
     56 ALWAYS_INLINE_RELEASE static void MemsetPtrs(T* ptr, T value, u32 count)
     57 {
     58   static_assert(std::is_pointer_v<T>, "T is pointer type");
     59   static_assert(sizeof(T) == sizeof(void*), "T isn't a fat pointer");
     60   T* dest = ptr;
     61 
     62 #if defined(CPU_ARCH_SSE) || defined(CPU_ARCH_NEON)
     63   static constexpr u32 PTRS_PER_VECTOR = (16 / sizeof(T));
     64   const u32 aligned_count = count / PTRS_PER_VECTOR;
     65   const u32 remaining_count = count % PTRS_PER_VECTOR;
     66 
     67 #if defined(CPU_ARCH_SSE)
     68   const __m128i svalue = _mm_set1_epi64x(reinterpret_cast<intptr_t>(value));
     69 #elif defined(CPU_ARCH_NEON) && defined(CPU_ARCH_ARM64)
     70   const uint64x2_t svalue = vdupq_n_u64(reinterpret_cast<uintptr_t>(value));
     71 #elif defined(CPU_ARCH_NEON) && defined(CPU_ARCH_ARM32)
     72   const uint32x4_t svalue = vdupq_n_u32(reinterpret_cast<uintptr_t>(value));
     73 #endif
     74 
     75   // Clang gets way too eager and tries to unroll these, emitting thousands of instructions.
     76 #ifdef __clang__
     77 #pragma clang loop unroll(disable)
     78 #endif
     79   for (u32 i = 0; i < aligned_count; i++)
     80   {
     81 #if defined(CPU_ARCH_SSE)
     82     _mm_store_si128(reinterpret_cast<__m128i*>(dest), svalue);
     83 #elif defined(CPU_ARCH_NEON) && defined(CPU_ARCH_ARM64)
     84     vst1q_u64(reinterpret_cast<u64*>(dest), svalue);
     85 #elif defined(CPU_ARCH_NEON) && defined(CPU_ARCH_ARM32)
     86     vst1q_u32(reinterpret_cast<u32*>(dest), svalue);
     87 #endif
     88     dest += PTRS_PER_VECTOR;
     89   }
     90 #else
     91   const u32 remaining_count = count;
     92 #endif
     93 
     94   for (u32 i = 0; i < remaining_count; i++)
     95     *(dest++) = value;
     96 }