dct-inl.h (8112B)
1 // Copyright (c) the JPEG XL Project Authors. All rights reserved. 2 // 3 // Use of this source code is governed by a BSD-style 4 // license that can be found in the LICENSE file. 5 6 #if defined(LIB_JPEGLI_DCT_INL_H_) == defined(HWY_TARGET_TOGGLE) 7 #ifdef LIB_JPEGLI_DCT_INL_H_ 8 #undef LIB_JPEGLI_DCT_INL_H_ 9 #else 10 #define LIB_JPEGLI_DCT_INL_H_ 11 #endif 12 13 #include "lib/jpegli/transpose-inl.h" 14 #include "lib/jxl/base/compiler_specific.h" 15 16 HWY_BEFORE_NAMESPACE(); 17 namespace jpegli { 18 namespace HWY_NAMESPACE { 19 namespace { 20 21 // These templates are not found via ADL. 22 using hwy::HWY_NAMESPACE::Abs; 23 using hwy::HWY_NAMESPACE::Add; 24 using hwy::HWY_NAMESPACE::DemoteTo; 25 using hwy::HWY_NAMESPACE::Ge; 26 using hwy::HWY_NAMESPACE::IfThenElseZero; 27 using hwy::HWY_NAMESPACE::Mul; 28 using hwy::HWY_NAMESPACE::MulAdd; 29 using hwy::HWY_NAMESPACE::Rebind; 30 using hwy::HWY_NAMESPACE::Round; 31 using hwy::HWY_NAMESPACE::Sub; 32 using hwy::HWY_NAMESPACE::Vec; 33 34 using D = HWY_FULL(float); 35 using DI = HWY_FULL(int32_t); 36 37 template <size_t N> 38 void AddReverse(const float* JXL_RESTRICT ain1, const float* JXL_RESTRICT ain2, 39 float* JXL_RESTRICT aout) { 40 HWY_CAPPED(float, 8) d8; 41 for (size_t i = 0; i < N; i++) { 42 auto in1 = Load(d8, ain1 + i * 8); 43 auto in2 = Load(d8, ain2 + (N - i - 1) * 8); 44 Store(Add(in1, in2), d8, aout + i * 8); 45 } 46 } 47 48 template <size_t N> 49 void SubReverse(const float* JXL_RESTRICT ain1, const float* JXL_RESTRICT ain2, 50 float* JXL_RESTRICT aout) { 51 HWY_CAPPED(float, 8) d8; 52 for (size_t i = 0; i < N; i++) { 53 auto in1 = Load(d8, ain1 + i * 8); 54 auto in2 = Load(d8, ain2 + (N - i - 1) * 8); 55 Store(Sub(in1, in2), d8, aout + i * 8); 56 } 57 } 58 59 template <size_t N> 60 void B(float* JXL_RESTRICT coeff) { 61 HWY_CAPPED(float, 8) d8; 62 constexpr float kSqrt2 = 1.41421356237f; 63 auto sqrt2 = Set(d8, kSqrt2); 64 auto in1 = Load(d8, coeff); 65 auto in2 = Load(d8, coeff + 8); 66 Store(MulAdd(in1, sqrt2, in2), d8, coeff); 67 for (size_t i = 1; i + 1 < N; i++) { 68 auto in1 = Load(d8, coeff + i * 8); 69 auto in2 = Load(d8, coeff + (i + 1) * 8); 70 Store(Add(in1, in2), d8, coeff + i * 8); 71 } 72 } 73 74 // Ideally optimized away by compiler (except the multiply). 75 template <size_t N> 76 void InverseEvenOdd(const float* JXL_RESTRICT ain, float* JXL_RESTRICT aout) { 77 HWY_CAPPED(float, 8) d8; 78 for (size_t i = 0; i < N / 2; i++) { 79 auto in1 = Load(d8, ain + i * 8); 80 Store(in1, d8, aout + 2 * i * 8); 81 } 82 for (size_t i = N / 2; i < N; i++) { 83 auto in1 = Load(d8, ain + i * 8); 84 Store(in1, d8, aout + (2 * (i - N / 2) + 1) * 8); 85 } 86 } 87 88 // Constants for DCT implementation. Generated by the following snippet: 89 // for i in range(N // 2): 90 // print(1.0 / (2 * math.cos((i + 0.5) * math.pi / N)), end=", ") 91 template <size_t N> 92 struct WcMultipliers; 93 94 template <> 95 struct WcMultipliers<4> { 96 static constexpr float kMultipliers[] = { 97 0.541196100146197, 98 1.3065629648763764, 99 }; 100 }; 101 102 template <> 103 struct WcMultipliers<8> { 104 static constexpr float kMultipliers[] = { 105 0.5097955791041592, 106 0.6013448869350453, 107 0.8999762231364156, 108 2.5629154477415055, 109 }; 110 }; 111 112 constexpr float WcMultipliers<4>::kMultipliers[]; 113 constexpr float WcMultipliers<8>::kMultipliers[]; 114 115 // Invoked on full vector. 116 template <size_t N> 117 void Multiply(float* JXL_RESTRICT coeff) { 118 HWY_CAPPED(float, 8) d8; 119 for (size_t i = 0; i < N / 2; i++) { 120 auto in1 = Load(d8, coeff + (N / 2 + i) * 8); 121 auto mul = Set(d8, WcMultipliers<N>::kMultipliers[i]); 122 Store(Mul(in1, mul), d8, coeff + (N / 2 + i) * 8); 123 } 124 } 125 126 void LoadFromBlock(const float* JXL_RESTRICT pixels, size_t pixels_stride, 127 size_t off, float* JXL_RESTRICT coeff) { 128 HWY_CAPPED(float, 8) d8; 129 for (size_t i = 0; i < 8; i++) { 130 Store(LoadU(d8, pixels + i * pixels_stride + off), d8, coeff + i * 8); 131 } 132 } 133 134 void StoreToBlockAndScale(const float* JXL_RESTRICT coeff, float* output, 135 size_t off) { 136 HWY_CAPPED(float, 8) d8; 137 auto mul = Set(d8, 1.0f / 8); 138 for (size_t i = 0; i < 8; i++) { 139 StoreU(Mul(mul, Load(d8, coeff + i * 8)), d8, output + i * 8 + off); 140 } 141 } 142 143 template <size_t N> 144 struct DCT1DImpl; 145 146 template <> 147 struct DCT1DImpl<1> { 148 JXL_INLINE void operator()(float* JXL_RESTRICT mem) {} 149 }; 150 151 template <> 152 struct DCT1DImpl<2> { 153 JXL_INLINE void operator()(float* JXL_RESTRICT mem) { 154 HWY_CAPPED(float, 8) d8; 155 auto in1 = Load(d8, mem); 156 auto in2 = Load(d8, mem + 8); 157 Store(Add(in1, in2), d8, mem); 158 Store(Sub(in1, in2), d8, mem + 8); 159 } 160 }; 161 162 template <size_t N> 163 struct DCT1DImpl { 164 void operator()(float* JXL_RESTRICT mem) { 165 HWY_ALIGN float tmp[N * 8]; 166 AddReverse<N / 2>(mem, mem + N * 4, tmp); 167 DCT1DImpl<N / 2>()(tmp); 168 SubReverse<N / 2>(mem, mem + N * 4, tmp + N * 4); 169 Multiply<N>(tmp); 170 DCT1DImpl<N / 2>()(tmp + N * 4); 171 B<N / 2>(tmp + N * 4); 172 InverseEvenOdd<N>(tmp, mem); 173 } 174 }; 175 176 void DCT1D(const float* JXL_RESTRICT pixels, size_t pixels_stride, 177 float* JXL_RESTRICT output) { 178 HWY_CAPPED(float, 8) d8; 179 HWY_ALIGN float tmp[64]; 180 for (size_t i = 0; i < 8; i += Lanes(d8)) { 181 // TODO(veluca): consider removing the temporary memory here (as is done in 182 // IDCT), if it turns out that some compilers don't optimize away the loads 183 // and this is performance-critical. 184 LoadFromBlock(pixels, pixels_stride, i, tmp); 185 DCT1DImpl<8>()(tmp); 186 StoreToBlockAndScale(tmp, output, i); 187 } 188 } 189 190 JXL_INLINE JXL_MAYBE_UNUSED void TransformFromPixels( 191 const float* JXL_RESTRICT pixels, size_t pixels_stride, 192 float* JXL_RESTRICT coefficients, float* JXL_RESTRICT scratch_space) { 193 DCT1D(pixels, pixels_stride, scratch_space); 194 Transpose8x8Block(scratch_space, coefficients); 195 DCT1D(coefficients, 8, scratch_space); 196 Transpose8x8Block(scratch_space, coefficients); 197 } 198 199 JXL_INLINE JXL_MAYBE_UNUSED void StoreQuantizedValue(const Vec<DI>& ival, 200 int16_t* out) { 201 Rebind<int16_t, DI> di16; 202 Store(DemoteTo(di16, ival), di16, out); 203 } 204 205 JXL_INLINE JXL_MAYBE_UNUSED void StoreQuantizedValue(const Vec<DI>& ival, 206 int32_t* out) { 207 DI di; 208 Store(ival, di, out); 209 } 210 211 template <typename T> 212 void QuantizeBlock(const float* dct, const float* qmc, float aq_strength, 213 const float* zero_bias_offset, const float* zero_bias_mul, 214 T* block) { 215 D d; 216 DI di; 217 const auto aq_mul = Set(d, aq_strength); 218 for (size_t k = 0; k < DCTSIZE2; k += Lanes(d)) { 219 const auto val = Load(d, dct + k); 220 const auto q = Load(d, qmc + k); 221 const auto qval = Mul(val, q); 222 const auto zb_offset = Load(d, zero_bias_offset + k); 223 const auto zb_mul = Load(d, zero_bias_mul + k); 224 const auto threshold = Add(zb_offset, Mul(zb_mul, aq_mul)); 225 const auto nzero_mask = Ge(Abs(qval), threshold); 226 const auto ival = ConvertTo(di, IfThenElseZero(nzero_mask, Round(qval))); 227 StoreQuantizedValue(ival, block + k); 228 } 229 } 230 231 template <typename T> 232 void ComputeCoefficientBlock(const float* JXL_RESTRICT pixels, size_t stride, 233 const float* JXL_RESTRICT qmc, 234 int16_t last_dc_coeff, float aq_strength, 235 const float* zero_bias_offset, 236 const float* zero_bias_mul, 237 float* JXL_RESTRICT tmp, T* block) { 238 float* JXL_RESTRICT dct = tmp; 239 float* JXL_RESTRICT scratch_space = tmp + DCTSIZE2; 240 TransformFromPixels(pixels, stride, dct, scratch_space); 241 QuantizeBlock(dct, qmc, aq_strength, zero_bias_offset, zero_bias_mul, block); 242 // Center DC values around zero. 243 static constexpr float kDCBias = 128.0f; 244 const float dc = (dct[0] - kDCBias) * qmc[0]; 245 float dc_threshold = zero_bias_offset[0] + aq_strength * zero_bias_mul[0]; 246 if (std::abs(dc - last_dc_coeff) < dc_threshold) { 247 block[0] = last_dc_coeff; 248 } else { 249 block[0] = std::round(dc); 250 } 251 } 252 253 // NOLINTNEXTLINE(google-readability-namespace-comments) 254 } // namespace 255 } // namespace HWY_NAMESPACE 256 } // namespace jpegli 257 HWY_AFTER_NAMESPACE(); 258 #endif // LIB_JPEGLI_DCT_INL_H_