simd_util.cc (2288B)
1 // Copyright (c) the JPEG XL Project Authors. All rights reserved. 2 // 3 // Use of this source code is governed by a BSD-style 4 // license that can be found in the LICENSE file. 5 6 #include "lib/jxl/simd_util.h" 7 8 #include <algorithm> 9 #include <cstddef> 10 11 #undef HWY_TARGET_INCLUDE 12 #define HWY_TARGET_INCLUDE "lib/jxl/simd_util.cc" 13 #include <hwy/foreach_target.h> 14 #include <hwy/highway.h> 15 16 #include "lib/jxl/base/common.h" 17 #include "lib/jxl/base/status.h" 18 #include "lib/jxl/cache_aligned.h" 19 20 HWY_BEFORE_NAMESPACE(); 21 namespace jxl { 22 namespace HWY_NAMESPACE { 23 24 size_t MaxVectorSize() { 25 HWY_FULL(float) df; 26 return Lanes(df) * sizeof(float); 27 } 28 29 // NOLINTNEXTLINE(google-readability-namespace-comments) 30 } // namespace HWY_NAMESPACE 31 } // namespace jxl 32 HWY_AFTER_NAMESPACE(); 33 34 #if HWY_ONCE 35 namespace jxl { 36 37 HWY_EXPORT(MaxVectorSize); 38 39 size_t MaxVectorSize() { 40 // Ideally HWY framework should provide us this value. 41 // Less than ideal is to check all available targets and choose maximal. 42 // As for now, we just ask current active target, assuming it won't change. 43 return HWY_DYNAMIC_DISPATCH(MaxVectorSize)(); 44 } 45 46 size_t BytesPerRow(const size_t xsize, const size_t sizeof_t) { 47 // Special case: we don't allow any ops -> don't need extra padding/ 48 if (xsize == 0) { 49 return 0; 50 } 51 52 const size_t vec_size = MaxVectorSize(); 53 size_t valid_bytes = xsize * sizeof_t; 54 55 // Allow unaligned accesses starting at the last valid value. 56 // Skip for the scalar case because no extra lanes will be loaded. 57 if (vec_size != 0) { 58 valid_bytes += vec_size - sizeof_t; 59 } 60 61 // Round up to vector and cache line size. 62 const size_t align = std::max(vec_size, CacheAligned::kAlignment); 63 size_t bytes_per_row = RoundUpTo(valid_bytes, align); 64 65 // During the lengthy window before writes are committed to memory, CPUs 66 // guard against read after write hazards by checking the address, but 67 // only the lower 11 bits. We avoid a false dependency between writes to 68 // consecutive rows by ensuring their sizes are not multiples of 2 KiB. 69 // Avoid2K prevents the same problem for the planes of an Image3. 70 if (bytes_per_row % CacheAligned::kAlias == 0) { 71 bytes_per_row += align; 72 } 73 74 JXL_ASSERT(bytes_per_row % align == 0); 75 return bytes_per_row; 76 } 77 78 } // namespace jxl 79 #endif