upsample.cc (4647B)
1 // Copyright (c) the JPEG XL Project Authors. All rights reserved. 2 // 3 // Use of this source code is governed by a BSD-style 4 // license that can be found in the LICENSE file. 5 6 #include "lib/jpegli/upsample.h" 7 8 #include <string.h> 9 10 #undef HWY_TARGET_INCLUDE 11 #define HWY_TARGET_INCLUDE "lib/jpegli/upsample.cc" 12 #include <hwy/foreach_target.h> 13 #include <hwy/highway.h> 14 15 HWY_BEFORE_NAMESPACE(); 16 namespace jpegli { 17 namespace HWY_NAMESPACE { 18 19 // These templates are not found via ADL. 20 using hwy::HWY_NAMESPACE::Mul; 21 using hwy::HWY_NAMESPACE::MulAdd; 22 using hwy::HWY_NAMESPACE::Vec; 23 24 #if HWY_CAP_GE512 25 using hwy::HWY_NAMESPACE::Half; 26 using hwy::HWY_NAMESPACE::Vec; 27 template <size_t i, class DF, class V> 28 HWY_INLINE Vec<Half<Half<DF>>> Quarter(const DF df, V v) { 29 using HF = Half<DF>; 30 using HHF = Half<HF>; 31 auto half = i >= 2 ? UpperHalf(HF(), v) : LowerHalf(HF(), v); 32 return i & 1 ? UpperHalf(HHF(), half) : LowerHalf(HHF(), half); 33 } 34 35 template <class DF, class V> 36 HWY_INLINE Vec<DF> Concat4(const DF df, V v0, V v1, V v2, V v3) { 37 using HF = Half<DF>; 38 return Combine(DF(), Combine(HF(), v3, v2), Combine(HF(), v1, v0)); 39 } 40 41 #endif 42 43 // Stores v0[0], v1[0], v0[1], v1[1], ... to mem, in this order. Mem must be 44 // aligned. 45 template <class DF, class V, typename T> 46 void StoreInterleaved(const DF df, V v0, V v1, T* mem) { 47 static_assert(sizeof(T) == 4, "only use StoreInterleaved for 4-byte types"); 48 #if HWY_TARGET == HWY_SCALAR 49 Store(v0, df, mem); 50 Store(v1, df, mem + 1); 51 #elif !HWY_CAP_GE256 52 Store(InterleaveLower(df, v0, v1), df, mem); 53 Store(InterleaveUpper(df, v0, v1), df, mem + Lanes(df)); 54 #else 55 if (!HWY_CAP_GE512 || Lanes(df) == 8) { 56 auto t0 = InterleaveLower(df, v0, v1); 57 auto t1 = InterleaveUpper(df, v0, v1); 58 Store(ConcatLowerLower(df, t1, t0), df, mem); 59 Store(ConcatUpperUpper(df, t1, t0), df, mem + Lanes(df)); 60 } else { 61 #if HWY_CAP_GE512 62 auto t0 = InterleaveLower(df, v0, v1); 63 auto t1 = InterleaveUpper(df, v0, v1); 64 Store(Concat4(df, Quarter<0>(df, t0), Quarter<0>(df, t1), 65 Quarter<1>(df, t0), Quarter<1>(df, t1)), 66 df, mem); 67 Store(Concat4(df, Quarter<2>(df, t0), Quarter<2>(df, t1), 68 Quarter<3>(df, t0), Quarter<3>(df, t1)), 69 df, mem + Lanes(df)); 70 #endif 71 } 72 #endif 73 } 74 75 void Upsample2Horizontal(float* JXL_RESTRICT row, 76 float* JXL_RESTRICT scratch_space, size_t len_out) { 77 HWY_FULL(float) df; 78 auto threefour = Set(df, 0.75f); 79 auto onefour = Set(df, 0.25f); 80 const size_t len_in = (len_out + 1) >> 1; 81 memcpy(scratch_space, row, len_in * sizeof(row[0])); 82 scratch_space[-1] = scratch_space[0]; 83 scratch_space[len_in] = scratch_space[len_in - 1]; 84 for (size_t x = 0; x < len_in; x += Lanes(df)) { 85 auto current = Mul(Load(df, scratch_space + x), threefour); 86 auto prev = LoadU(df, scratch_space + x - 1); 87 auto next = LoadU(df, scratch_space + x + 1); 88 auto left = MulAdd(onefour, prev, current); 89 auto right = MulAdd(onefour, next, current); 90 StoreInterleaved(df, left, right, row + x * 2); 91 } 92 } 93 94 void Upsample2Vertical(const float* JXL_RESTRICT row_top, 95 const float* JXL_RESTRICT row_mid, 96 const float* JXL_RESTRICT row_bot, 97 float* JXL_RESTRICT row_out0, 98 float* JXL_RESTRICT row_out1, size_t len) { 99 HWY_FULL(float) df; 100 auto threefour = Set(df, 0.75f); 101 auto onefour = Set(df, 0.25f); 102 for (size_t x = 0; x < len; x += Lanes(df)) { 103 auto it = Load(df, row_top + x); 104 auto im = Load(df, row_mid + x); 105 auto ib = Load(df, row_bot + x); 106 auto im_scaled = Mul(im, threefour); 107 Store(MulAdd(it, onefour, im_scaled), df, row_out0 + x); 108 Store(MulAdd(ib, onefour, im_scaled), df, row_out1 + x); 109 } 110 } 111 112 // NOLINTNEXTLINE(google-readability-namespace-comments) 113 } // namespace HWY_NAMESPACE 114 } // namespace jpegli 115 HWY_AFTER_NAMESPACE(); 116 117 #if HWY_ONCE 118 namespace jpegli { 119 120 HWY_EXPORT(Upsample2Horizontal); 121 HWY_EXPORT(Upsample2Vertical); 122 123 void Upsample2Horizontal(float* JXL_RESTRICT row, 124 float* JXL_RESTRICT scratch_space, size_t len_out) { 125 HWY_DYNAMIC_DISPATCH(Upsample2Horizontal)(row, scratch_space, len_out); 126 } 127 128 void Upsample2Vertical(const float* JXL_RESTRICT row_top, 129 const float* JXL_RESTRICT row_mid, 130 const float* JXL_RESTRICT row_bot, 131 float* JXL_RESTRICT row_out0, 132 float* JXL_RESTRICT row_out1, size_t len) { 133 HWY_DYNAMIC_DISPATCH(Upsample2Vertical) 134 (row_top, row_mid, row_bot, row_out0, row_out1, len); 135 } 136 } // namespace jpegli 137 #endif // HWY_ONCE