libjxl

FORK: libjxl patches used on blog
git clone https://git.neptards.moe/blog/libjxl.git
Log | Files | Refs | Submodules | README | LICENSE

entropy_coding-inl.h (7290B)


      1 // Copyright (c) the JPEG XL Project Authors. All rights reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style
      4 // license that can be found in the LICENSE file.
      5 
      6 #if defined(LIB_JPEGLI_ENTROPY_CODING_INL_H_) == defined(HWY_TARGET_TOGGLE)
      7 #ifdef LIB_JPEGLI_ENTROPY_CODING_INL_H_
      8 #undef LIB_JPEGLI_ENTROPY_CODING_INL_H_
      9 #else
     10 #define LIB_JPEGLI_ENTROPY_CODING_INL_H_
     11 #endif
     12 
     13 #include "lib/jxl/base/compiler_specific.h"
     14 
     15 HWY_BEFORE_NAMESPACE();
     16 namespace jpegli {
     17 namespace HWY_NAMESPACE {
     18 namespace {
     19 
     20 // These templates are not found via ADL.
     21 using hwy::HWY_NAMESPACE::Abs;
     22 using hwy::HWY_NAMESPACE::Add;
     23 using hwy::HWY_NAMESPACE::And;
     24 using hwy::HWY_NAMESPACE::AndNot;
     25 using hwy::HWY_NAMESPACE::Compress;
     26 using hwy::HWY_NAMESPACE::CountTrue;
     27 using hwy::HWY_NAMESPACE::Eq;
     28 using hwy::HWY_NAMESPACE::GetLane;
     29 using hwy::HWY_NAMESPACE::MaskFromVec;
     30 using hwy::HWY_NAMESPACE::Max;
     31 using hwy::HWY_NAMESPACE::Not;
     32 using hwy::HWY_NAMESPACE::Or;
     33 using hwy::HWY_NAMESPACE::ShiftRight;
     34 using hwy::HWY_NAMESPACE::Shl;
     35 using hwy::HWY_NAMESPACE::Sub;
     36 
     37 using DI = HWY_FULL(int32_t);
     38 constexpr DI di;
     39 
     40 template <typename DI, class V>
     41 JXL_INLINE V NumBits(DI di, const V x) {
     42   // TODO(szabadka) Add faster implementations for some specific architectures.
     43   const auto b1 = And(x, Set(di, 1));
     44   const auto b2 = And(x, Set(di, 2));
     45   const auto b3 = Sub((And(x, Set(di, 4))), Set(di, 1));
     46   const auto b4 = Sub((And(x, Set(di, 8))), Set(di, 4));
     47   const auto b5 = Sub((And(x, Set(di, 16))), Set(di, 11));
     48   const auto b6 = Sub((And(x, Set(di, 32))), Set(di, 26));
     49   const auto b7 = Sub((And(x, Set(di, 64))), Set(di, 57));
     50   const auto b8 = Sub((And(x, Set(di, 128))), Set(di, 120));
     51   const auto b9 = Sub((And(x, Set(di, 256))), Set(di, 247));
     52   const auto b10 = Sub((And(x, Set(di, 512))), Set(di, 502));
     53   const auto b11 = Sub((And(x, Set(di, 1024))), Set(di, 1013));
     54   const auto b12 = Sub((And(x, Set(di, 2048))), Set(di, 2036));
     55   return Max(Max(Max(Max(b1, b2), Max(b3, b4)), Max(Max(b5, b6), Max(b7, b8))),
     56              Max(Max(b9, b10), Max(b11, b12)));
     57 }
     58 
     59 // Coefficient indexes pre-multiplied by 16 for the symbol calculation.
     60 HWY_ALIGN constexpr int32_t kIndexes[64] = {
     61     0,   16,  32,  48,  64,  80,  96,  112, 128, 144, 160, 176,  192,
     62     208, 224, 240, 256, 272, 288, 304, 320, 336, 352, 368, 384,  400,
     63     416, 432, 448, 464, 480, 496, 512, 528, 544, 560, 576, 592,  608,
     64     624, 640, 656, 672, 688, 704, 720, 736, 752, 768, 784, 800,  816,
     65     832, 848, 864, 880, 896, 912, 928, 944, 960, 976, 992, 1008,
     66 };
     67 
     68 JXL_INLINE int CompactBlock(int32_t* JXL_RESTRICT block,
     69                             int32_t* JXL_RESTRICT nonzero_idx) {
     70   const auto zero = Zero(di);
     71   HWY_ALIGN constexpr int32_t dc_mask_lanes[HWY_LANES(DI)] = {-1};
     72   const auto dc_mask = MaskFromVec(Load(di, dc_mask_lanes));
     73   int num_nonzeros = 0;
     74   int k = 0;
     75   {
     76     const auto coef = Load(di, block);
     77     const auto idx = Load(di, kIndexes);
     78     const auto nonzero_mask = Or(dc_mask, Not(Eq(coef, zero)));
     79     const auto nzero_coef = Compress(coef, nonzero_mask);
     80     const auto nzero_idx = Compress(idx, nonzero_mask);
     81     StoreU(nzero_coef, di, &block[num_nonzeros]);
     82     StoreU(nzero_idx, di, &nonzero_idx[num_nonzeros]);
     83     num_nonzeros += CountTrue(di, nonzero_mask);
     84     k += Lanes(di);
     85   }
     86   for (; k < DCTSIZE2; k += Lanes(di)) {
     87     const auto coef = Load(di, &block[k]);
     88     const auto idx = Load(di, &kIndexes[k]);
     89     const auto nonzero_mask = Not(Eq(coef, zero));
     90     const auto nzero_coef = Compress(coef, nonzero_mask);
     91     const auto nzero_idx = Compress(idx, nonzero_mask);
     92     StoreU(nzero_coef, di, &block[num_nonzeros]);
     93     StoreU(nzero_idx, di, &nonzero_idx[num_nonzeros]);
     94     num_nonzeros += CountTrue(di, nonzero_mask);
     95   }
     96   return num_nonzeros;
     97 }
     98 
     99 JXL_INLINE void ComputeSymbols(const int num_nonzeros,
    100                                int32_t* JXL_RESTRICT nonzero_idx,
    101                                int32_t* JXL_RESTRICT block,
    102                                int32_t* JXL_RESTRICT symbols) {
    103   nonzero_idx[-1] = -16;
    104   const auto one = Set(di, 1);
    105   const auto offset = Set(di, 16);
    106   for (int i = 0; i < num_nonzeros; i += Lanes(di)) {
    107     const auto idx = Load(di, &nonzero_idx[i]);
    108     const auto prev_idx = LoadU(di, &nonzero_idx[i - 1]);
    109     const auto coeff = Load(di, &block[i]);
    110     const auto nbits = NumBits(di, Abs(coeff));
    111     const auto mask = ShiftRight<8 * sizeof(int32_t) - 1>(coeff);
    112     const auto bits = And(Add(coeff, mask), Sub(Shl(one, nbits), one));
    113     const auto symbol = Sub(Add(nbits, idx), Add(prev_idx, offset));
    114     Store(symbol, di, symbols + i);
    115     Store(bits, di, block + i);
    116   }
    117 }
    118 
    119 template <typename T>
    120 int NumNonZero8x8ExceptDC(const T* block) {
    121   const HWY_CAPPED(T, 8) di;
    122 
    123   const auto zero = Zero(di);
    124   // Add FFFF for every zero coefficient, negate to get #zeros.
    125   auto neg_sum_zero = zero;
    126   {
    127     // First row has DC, so mask
    128     const size_t y = 0;
    129     HWY_ALIGN const T dc_mask_lanes[8] = {-1};
    130 
    131     for (size_t x = 0; x < 8; x += Lanes(di)) {
    132       const auto dc_mask = Load(di, dc_mask_lanes + x);
    133 
    134       // DC counts as zero so we don't include it in nzeros.
    135       const auto coef = AndNot(dc_mask, Load(di, &block[y * 8 + x]));
    136 
    137       neg_sum_zero = Add(neg_sum_zero, VecFromMask(di, Eq(coef, zero)));
    138     }
    139   }
    140   // Remaining rows: no mask
    141   for (size_t y = 1; y < 8; y++) {
    142     for (size_t x = 0; x < 8; x += Lanes(di)) {
    143       const auto coef = Load(di, &block[y * 8 + x]);
    144       neg_sum_zero = Add(neg_sum_zero, VecFromMask(di, Eq(coef, zero)));
    145     }
    146   }
    147 
    148   // We want 64 - sum_zero, add because neg_sum_zero is already negated.
    149   return kDCTBlockSize + GetLane(SumOfLanes(di, neg_sum_zero));
    150 }
    151 
    152 template <typename T, bool zig_zag_order>
    153 void ComputeTokensForBlock(const T* block, int last_dc, int dc_ctx, int ac_ctx,
    154                            Token** tokens_ptr) {
    155   Token* next_token = *tokens_ptr;
    156   coeff_t temp2;
    157   coeff_t temp;
    158   temp = block[0] - last_dc;
    159   if (temp == 0) {
    160     *next_token++ = Token(dc_ctx, 0, 0);
    161   } else {
    162     temp2 = temp;
    163     if (temp < 0) {
    164       temp = -temp;
    165       temp2--;
    166     }
    167     int dc_nbits = jxl::FloorLog2Nonzero<uint32_t>(temp) + 1;
    168     int dc_mask = (1 << dc_nbits) - 1;
    169     *next_token++ = Token(dc_ctx, dc_nbits, temp2 & dc_mask);
    170   }
    171   int num_nonzeros = NumNonZero8x8ExceptDC(block);
    172   for (int k = 1; k < 64; ++k) {
    173     if (num_nonzeros == 0) {
    174       *next_token++ = Token(ac_ctx, 0, 0);
    175       break;
    176     }
    177     int r = 0;
    178     if (zig_zag_order) {
    179       while ((temp = block[k]) == 0) {
    180         r++;
    181         k++;
    182       }
    183     } else {
    184       while ((temp = block[kJPEGNaturalOrder[k]]) == 0) {
    185         r++;
    186         k++;
    187       }
    188     }
    189     --num_nonzeros;
    190     if (temp < 0) {
    191       temp = -temp;
    192       temp2 = ~temp;
    193     } else {
    194       temp2 = temp;
    195     }
    196     while (r > 15) {
    197       *next_token++ = Token(ac_ctx, 0xf0, 0);
    198       r -= 16;
    199     }
    200     int ac_nbits = jxl::FloorLog2Nonzero<uint32_t>(temp) + 1;
    201     int ac_mask = (1 << ac_nbits) - 1;
    202     int symbol = (r << 4u) + ac_nbits;
    203     *next_token++ = Token(ac_ctx, symbol, temp2 & ac_mask);
    204   }
    205   *tokens_ptr = next_token;
    206 }
    207 
    208 // NOLINTNEXTLINE(google-readability-namespace-comments)
    209 }  // namespace
    210 }  // namespace HWY_NAMESPACE
    211 }  // namespace jpegli
    212 HWY_AFTER_NAMESPACE();
    213 #endif  // LIB_JPEGLI_ENTROPY_CODING_INL_H_