libjxl

FORK: libjxl patches used on blog
git clone https://git.neptards.moe/blog/libjxl.git
Log | Files | Refs | Submodules | README | LICENSE

dct-inl.h (8112B)


      1 // Copyright (c) the JPEG XL Project Authors. All rights reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style
      4 // license that can be found in the LICENSE file.
      5 
      6 #if defined(LIB_JPEGLI_DCT_INL_H_) == defined(HWY_TARGET_TOGGLE)
      7 #ifdef LIB_JPEGLI_DCT_INL_H_
      8 #undef LIB_JPEGLI_DCT_INL_H_
      9 #else
     10 #define LIB_JPEGLI_DCT_INL_H_
     11 #endif
     12 
     13 #include "lib/jpegli/transpose-inl.h"
     14 #include "lib/jxl/base/compiler_specific.h"
     15 
     16 HWY_BEFORE_NAMESPACE();
     17 namespace jpegli {
     18 namespace HWY_NAMESPACE {
     19 namespace {
     20 
     21 // These templates are not found via ADL.
     22 using hwy::HWY_NAMESPACE::Abs;
     23 using hwy::HWY_NAMESPACE::Add;
     24 using hwy::HWY_NAMESPACE::DemoteTo;
     25 using hwy::HWY_NAMESPACE::Ge;
     26 using hwy::HWY_NAMESPACE::IfThenElseZero;
     27 using hwy::HWY_NAMESPACE::Mul;
     28 using hwy::HWY_NAMESPACE::MulAdd;
     29 using hwy::HWY_NAMESPACE::Rebind;
     30 using hwy::HWY_NAMESPACE::Round;
     31 using hwy::HWY_NAMESPACE::Sub;
     32 using hwy::HWY_NAMESPACE::Vec;
     33 
     34 using D = HWY_FULL(float);
     35 using DI = HWY_FULL(int32_t);
     36 
     37 template <size_t N>
     38 void AddReverse(const float* JXL_RESTRICT ain1, const float* JXL_RESTRICT ain2,
     39                 float* JXL_RESTRICT aout) {
     40   HWY_CAPPED(float, 8) d8;
     41   for (size_t i = 0; i < N; i++) {
     42     auto in1 = Load(d8, ain1 + i * 8);
     43     auto in2 = Load(d8, ain2 + (N - i - 1) * 8);
     44     Store(Add(in1, in2), d8, aout + i * 8);
     45   }
     46 }
     47 
     48 template <size_t N>
     49 void SubReverse(const float* JXL_RESTRICT ain1, const float* JXL_RESTRICT ain2,
     50                 float* JXL_RESTRICT aout) {
     51   HWY_CAPPED(float, 8) d8;
     52   for (size_t i = 0; i < N; i++) {
     53     auto in1 = Load(d8, ain1 + i * 8);
     54     auto in2 = Load(d8, ain2 + (N - i - 1) * 8);
     55     Store(Sub(in1, in2), d8, aout + i * 8);
     56   }
     57 }
     58 
     59 template <size_t N>
     60 void B(float* JXL_RESTRICT coeff) {
     61   HWY_CAPPED(float, 8) d8;
     62   constexpr float kSqrt2 = 1.41421356237f;
     63   auto sqrt2 = Set(d8, kSqrt2);
     64   auto in1 = Load(d8, coeff);
     65   auto in2 = Load(d8, coeff + 8);
     66   Store(MulAdd(in1, sqrt2, in2), d8, coeff);
     67   for (size_t i = 1; i + 1 < N; i++) {
     68     auto in1 = Load(d8, coeff + i * 8);
     69     auto in2 = Load(d8, coeff + (i + 1) * 8);
     70     Store(Add(in1, in2), d8, coeff + i * 8);
     71   }
     72 }
     73 
     74 // Ideally optimized away by compiler (except the multiply).
     75 template <size_t N>
     76 void InverseEvenOdd(const float* JXL_RESTRICT ain, float* JXL_RESTRICT aout) {
     77   HWY_CAPPED(float, 8) d8;
     78   for (size_t i = 0; i < N / 2; i++) {
     79     auto in1 = Load(d8, ain + i * 8);
     80     Store(in1, d8, aout + 2 * i * 8);
     81   }
     82   for (size_t i = N / 2; i < N; i++) {
     83     auto in1 = Load(d8, ain + i * 8);
     84     Store(in1, d8, aout + (2 * (i - N / 2) + 1) * 8);
     85   }
     86 }
     87 
     88 // Constants for DCT implementation. Generated by the following snippet:
     89 // for i in range(N // 2):
     90 //    print(1.0 / (2 * math.cos((i + 0.5) * math.pi / N)), end=", ")
     91 template <size_t N>
     92 struct WcMultipliers;
     93 
     94 template <>
     95 struct WcMultipliers<4> {
     96   static constexpr float kMultipliers[] = {
     97       0.541196100146197,
     98       1.3065629648763764,
     99   };
    100 };
    101 
    102 template <>
    103 struct WcMultipliers<8> {
    104   static constexpr float kMultipliers[] = {
    105       0.5097955791041592,
    106       0.6013448869350453,
    107       0.8999762231364156,
    108       2.5629154477415055,
    109   };
    110 };
    111 
    112 constexpr float WcMultipliers<4>::kMultipliers[];
    113 constexpr float WcMultipliers<8>::kMultipliers[];
    114 
    115 // Invoked on full vector.
    116 template <size_t N>
    117 void Multiply(float* JXL_RESTRICT coeff) {
    118   HWY_CAPPED(float, 8) d8;
    119   for (size_t i = 0; i < N / 2; i++) {
    120     auto in1 = Load(d8, coeff + (N / 2 + i) * 8);
    121     auto mul = Set(d8, WcMultipliers<N>::kMultipliers[i]);
    122     Store(Mul(in1, mul), d8, coeff + (N / 2 + i) * 8);
    123   }
    124 }
    125 
    126 void LoadFromBlock(const float* JXL_RESTRICT pixels, size_t pixels_stride,
    127                    size_t off, float* JXL_RESTRICT coeff) {
    128   HWY_CAPPED(float, 8) d8;
    129   for (size_t i = 0; i < 8; i++) {
    130     Store(LoadU(d8, pixels + i * pixels_stride + off), d8, coeff + i * 8);
    131   }
    132 }
    133 
    134 void StoreToBlockAndScale(const float* JXL_RESTRICT coeff, float* output,
    135                           size_t off) {
    136   HWY_CAPPED(float, 8) d8;
    137   auto mul = Set(d8, 1.0f / 8);
    138   for (size_t i = 0; i < 8; i++) {
    139     StoreU(Mul(mul, Load(d8, coeff + i * 8)), d8, output + i * 8 + off);
    140   }
    141 }
    142 
    143 template <size_t N>
    144 struct DCT1DImpl;
    145 
    146 template <>
    147 struct DCT1DImpl<1> {
    148   JXL_INLINE void operator()(float* JXL_RESTRICT mem) {}
    149 };
    150 
    151 template <>
    152 struct DCT1DImpl<2> {
    153   JXL_INLINE void operator()(float* JXL_RESTRICT mem) {
    154     HWY_CAPPED(float, 8) d8;
    155     auto in1 = Load(d8, mem);
    156     auto in2 = Load(d8, mem + 8);
    157     Store(Add(in1, in2), d8, mem);
    158     Store(Sub(in1, in2), d8, mem + 8);
    159   }
    160 };
    161 
    162 template <size_t N>
    163 struct DCT1DImpl {
    164   void operator()(float* JXL_RESTRICT mem) {
    165     HWY_ALIGN float tmp[N * 8];
    166     AddReverse<N / 2>(mem, mem + N * 4, tmp);
    167     DCT1DImpl<N / 2>()(tmp);
    168     SubReverse<N / 2>(mem, mem + N * 4, tmp + N * 4);
    169     Multiply<N>(tmp);
    170     DCT1DImpl<N / 2>()(tmp + N * 4);
    171     B<N / 2>(tmp + N * 4);
    172     InverseEvenOdd<N>(tmp, mem);
    173   }
    174 };
    175 
    176 void DCT1D(const float* JXL_RESTRICT pixels, size_t pixels_stride,
    177            float* JXL_RESTRICT output) {
    178   HWY_CAPPED(float, 8) d8;
    179   HWY_ALIGN float tmp[64];
    180   for (size_t i = 0; i < 8; i += Lanes(d8)) {
    181     // TODO(veluca): consider removing the temporary memory here (as is done in
    182     // IDCT), if it turns out that some compilers don't optimize away the loads
    183     // and this is performance-critical.
    184     LoadFromBlock(pixels, pixels_stride, i, tmp);
    185     DCT1DImpl<8>()(tmp);
    186     StoreToBlockAndScale(tmp, output, i);
    187   }
    188 }
    189 
    190 JXL_INLINE JXL_MAYBE_UNUSED void TransformFromPixels(
    191     const float* JXL_RESTRICT pixels, size_t pixels_stride,
    192     float* JXL_RESTRICT coefficients, float* JXL_RESTRICT scratch_space) {
    193   DCT1D(pixels, pixels_stride, scratch_space);
    194   Transpose8x8Block(scratch_space, coefficients);
    195   DCT1D(coefficients, 8, scratch_space);
    196   Transpose8x8Block(scratch_space, coefficients);
    197 }
    198 
    199 JXL_INLINE JXL_MAYBE_UNUSED void StoreQuantizedValue(const Vec<DI>& ival,
    200                                                      int16_t* out) {
    201   Rebind<int16_t, DI> di16;
    202   Store(DemoteTo(di16, ival), di16, out);
    203 }
    204 
    205 JXL_INLINE JXL_MAYBE_UNUSED void StoreQuantizedValue(const Vec<DI>& ival,
    206                                                      int32_t* out) {
    207   DI di;
    208   Store(ival, di, out);
    209 }
    210 
    211 template <typename T>
    212 void QuantizeBlock(const float* dct, const float* qmc, float aq_strength,
    213                    const float* zero_bias_offset, const float* zero_bias_mul,
    214                    T* block) {
    215   D d;
    216   DI di;
    217   const auto aq_mul = Set(d, aq_strength);
    218   for (size_t k = 0; k < DCTSIZE2; k += Lanes(d)) {
    219     const auto val = Load(d, dct + k);
    220     const auto q = Load(d, qmc + k);
    221     const auto qval = Mul(val, q);
    222     const auto zb_offset = Load(d, zero_bias_offset + k);
    223     const auto zb_mul = Load(d, zero_bias_mul + k);
    224     const auto threshold = Add(zb_offset, Mul(zb_mul, aq_mul));
    225     const auto nzero_mask = Ge(Abs(qval), threshold);
    226     const auto ival = ConvertTo(di, IfThenElseZero(nzero_mask, Round(qval)));
    227     StoreQuantizedValue(ival, block + k);
    228   }
    229 }
    230 
    231 template <typename T>
    232 void ComputeCoefficientBlock(const float* JXL_RESTRICT pixels, size_t stride,
    233                              const float* JXL_RESTRICT qmc,
    234                              int16_t last_dc_coeff, float aq_strength,
    235                              const float* zero_bias_offset,
    236                              const float* zero_bias_mul,
    237                              float* JXL_RESTRICT tmp, T* block) {
    238   float* JXL_RESTRICT dct = tmp;
    239   float* JXL_RESTRICT scratch_space = tmp + DCTSIZE2;
    240   TransformFromPixels(pixels, stride, dct, scratch_space);
    241   QuantizeBlock(dct, qmc, aq_strength, zero_bias_offset, zero_bias_mul, block);
    242   // Center DC values around zero.
    243   static constexpr float kDCBias = 128.0f;
    244   const float dc = (dct[0] - kDCBias) * qmc[0];
    245   float dc_threshold = zero_bias_offset[0] + aq_strength * zero_bias_mul[0];
    246   if (std::abs(dc - last_dc_coeff) < dc_threshold) {
    247     block[0] = last_dc_coeff;
    248   } else {
    249     block[0] = std::round(dc);
    250   }
    251 }
    252 
    253 // NOLINTNEXTLINE(google-readability-namespace-comments)
    254 }  // namespace
    255 }  // namespace HWY_NAMESPACE
    256 }  // namespace jpegli
    257 HWY_AFTER_NAMESPACE();
    258 #endif  // LIB_JPEGLI_DCT_INL_H_