enc_entropy_coder.cc
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "lib/jxl/enc_entropy_coder.h"

#include <stddef.h>
#include <stdint.h>

#include <algorithm>
#include <utility>
#include <vector>

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "lib/jxl/enc_entropy_coder.cc"
#include <hwy/foreach_target.h>
#include <hwy/highway.h>

#include "lib/jxl/ac_context.h"
#include "lib/jxl/ac_strategy.h"
#include "lib/jxl/base/bits.h"
#include "lib/jxl/base/compiler_specific.h"
#include "lib/jxl/base/status.h"
#include "lib/jxl/coeff_order.h"
#include "lib/jxl/coeff_order_fwd.h"
#include "lib/jxl/dec_ans.h"
#include "lib/jxl/dec_bit_reader.h"
#include "lib/jxl/dec_context_map.h"
#include "lib/jxl/entropy_coder.h"
#include "lib/jxl/epf.h"
#include "lib/jxl/image.h"
#include "lib/jxl/image_ops.h"
#include "lib/jxl/pack_signed.h"

HWY_BEFORE_NAMESPACE();
namespace jxl {
namespace HWY_NAMESPACE {

// These templates are not found via ADL.
using hwy::HWY_NAMESPACE::Add;
using hwy::HWY_NAMESPACE::AndNot;
using hwy::HWY_NAMESPACE::Eq;
using hwy::HWY_NAMESPACE::GetLane;

// Returns the number of non-zero coefficients, skipping the LLF.
// We cannot rely on block[] being all-zero bits, so first truncate to integer.
// Also writes the per-8x8-block nzeros starting at nzeros_pos.
int32_t NumNonZeroExceptLLF(const size_t cx, const size_t cy,
                            const AcStrategy acs, const size_t covered_blocks,
                            const size_t log2_covered_blocks,
                            const int32_t* JXL_RESTRICT block,
                            const size_t nzeros_stride,
                            int32_t* JXL_RESTRICT nzeros_pos) {
  const HWY_CAPPED(int32_t, kBlockDim) di;

  const auto zero = Zero(di);
  // Add FF..FF for every zero coefficient, negate to get #zeros.
  auto neg_sum_zero = zero;

  {
    // Mask sufficient for one row of coefficients.
    HWY_ALIGN const int32_t
        llf_mask_lanes[AcStrategy::kMaxCoeffBlocks * (1 + kBlockDim)] = {
            -1, -1, -1, -1};
    // First cx=1,2,4 elements are FF..FF, others 0.
    const int32_t* llf_mask_pos =
        llf_mask_lanes + AcStrategy::kMaxCoeffBlocks - cx;

    // Rows with LLF: mask out the LLF
    for (size_t y = 0; y < cy; y++) {
      for (size_t x = 0; x < cx * kBlockDim; x += Lanes(di)) {
        const auto llf_mask = LoadU(di, llf_mask_pos + x);

        // LLF counts as zero so we don't include it in nzeros.
        const auto coef =
            AndNot(llf_mask, Load(di, &block[y * cx * kBlockDim + x]));

        neg_sum_zero = Add(neg_sum_zero, VecFromMask(di, Eq(coef, zero)));
      }
    }
  }

  // Remaining rows: no mask
  for (size_t y = cy; y < cy * kBlockDim; y++) {
    for (size_t x = 0; x < cx * kBlockDim; x += Lanes(di)) {
      const auto coef = Load(di, &block[y * cx * kBlockDim + x]);
      neg_sum_zero = Add(neg_sum_zero, VecFromMask(di, Eq(coef, zero)));
    }
  }

  // We want area - sum_zero; add because neg_sum_zero is already negated.
  const int32_t nzeros = static_cast<int32_t>(cx * cy * kDCTBlockSize) +
                         GetLane(SumOfLanes(di, neg_sum_zero));

  const int32_t shifted_nzeros = static_cast<int32_t>(
      (nzeros + covered_blocks - 1) >> log2_covered_blocks);
  // Need non-canonicalized dimensions!
  for (size_t y = 0; y < acs.covered_blocks_y(); y++) {
    for (size_t x = 0; x < acs.covered_blocks_x(); x++) {
      nzeros_pos[x + y * nzeros_stride] = shifted_nzeros;
    }
  }

  return nzeros;
}
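
// For reference, the SIMD counting above is equivalent to the following
// scalar sketch (illustrative only, not part of the build; the function name
// is hypothetical). Each lane of neg_sum_zero accumulates -1 per zero
// coefficient, so adding the lane sum to the block area yields the count of
// nonzeros, with the top-left cx*cy LLF corner treated as zero:
//
//   int32_t ScalarNumNonZeroExceptLLF(size_t cx, size_t cy,
//                                     const int32_t* block) {
//     int32_t nzeros = 0;
//     for (size_t y = 0; y < cy * kBlockDim; y++) {
//       for (size_t x = 0; x < cx * kBlockDim; x++) {
//         const bool is_llf = (y < cy && x < cx);  // top-left LLF corner
//         if (!is_llf && block[y * cx * kBlockDim + x] != 0) nzeros++;
//       }
//     }
//     return nzeros;
//   }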

// Specialization for 8x8, where only top-left is LLF/DC.
// About 1% overall speedup vs. NumNonZeroExceptLLF.
int32_t NumNonZero8x8ExceptDC(const int32_t* JXL_RESTRICT block,
                              int32_t* JXL_RESTRICT nzeros_pos) {
  const HWY_CAPPED(int32_t, kBlockDim) di;

  const auto zero = Zero(di);
  // Add FF..FF for every zero coefficient, negate to get #zeros.
  auto neg_sum_zero = zero;

  {
    // First row has DC, so mask
    const size_t y = 0;
    HWY_ALIGN const int32_t dc_mask_lanes[kBlockDim] = {-1};

    for (size_t x = 0; x < kBlockDim; x += Lanes(di)) {
      const auto dc_mask = Load(di, dc_mask_lanes + x);

      // DC counts as zero so we don't include it in nzeros.
      const auto coef = AndNot(dc_mask, Load(di, &block[y * kBlockDim + x]));

      neg_sum_zero = Add(neg_sum_zero, VecFromMask(di, Eq(coef, zero)));
    }
  }

  // Remaining rows: no mask
  for (size_t y = 1; y < kBlockDim; y++) {
    for (size_t x = 0; x < kBlockDim; x += Lanes(di)) {
      const auto coef = Load(di, &block[y * kBlockDim + x]);
      neg_sum_zero = Add(neg_sum_zero, VecFromMask(di, Eq(coef, zero)));
    }
  }

  // We want 64 - sum_zero; add because neg_sum_zero is already negated.
  const int32_t nzeros = static_cast<int32_t>(kDCTBlockSize) +
                         GetLane(SumOfLanes(di, neg_sum_zero));

  *nzeros_pos = nzeros;

  return nzeros;
}
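
// The tokenization loop below emits coefficients via PackSigned() (from
// lib/jxl/pack_signed.h), which maps signed values to unsigned tokens with a
// zigzag encoding so that small magnitudes get small token values:
// 0 -> 0, -1 -> 1, 1 -> 2, -2 -> 3, 2 -> 4, ...
// An equivalent scalar sketch (illustrative only, ignoring INT32_MIN):
//
//   uint32_t ZigZag(int32_t s) { return s >= 0 ? 2 * s : -2 * s - 1; }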

// The number of nonzeros of each block is predicted from the top and the left
// blocks, with appropriate scaling to take into account the number of 8x8
// blocks covered by each strategy. The predicted number of nonzeros divided by
// two is used as a context; if this number is above 63, a dedicated context is
// used. If the number of nonzeros of a block is above 63, it is written
// directly using a fixed number of bits (which depends on the size of the
// strategy).
void TokenizeCoefficients(const coeff_order_t* JXL_RESTRICT orders,
                          const Rect& rect,
                          const int32_t* JXL_RESTRICT* JXL_RESTRICT ac_rows,
                          const AcStrategyImage& ac_strategy,
                          const YCbCrChromaSubsampling& cs,
                          Image3I* JXL_RESTRICT tmp_num_nzeroes,
                          std::vector<Token>* JXL_RESTRICT output,
                          const ImageB& qdc, const ImageI& qf,
                          const BlockCtxMap& block_ctx_map) {
  const size_t xsize_blocks = rect.xsize();
  const size_t ysize_blocks = rect.ysize();
  output->clear();
  // TODO(user): update the estimate: usually fewer coefficients are used.
  output->reserve(3 * xsize_blocks * ysize_blocks * kDCTBlockSize);

  size_t offset[3] = {};
  const size_t nzeros_stride = tmp_num_nzeroes->PixelsPerRow();
  for (size_t by = 0; by < ysize_blocks; ++by) {
    size_t sby[3] = {by >> cs.VShift(0), by >> cs.VShift(1),
                     by >> cs.VShift(2)};
    int32_t* JXL_RESTRICT row_nzeros[3] = {
        tmp_num_nzeroes->PlaneRow(0, sby[0]),
        tmp_num_nzeroes->PlaneRow(1, sby[1]),
        tmp_num_nzeroes->PlaneRow(2, sby[2]),
    };
    const int32_t* JXL_RESTRICT row_nzeros_top[3] = {
        sby[0] == 0 ? nullptr : tmp_num_nzeroes->ConstPlaneRow(0, sby[0] - 1),
        sby[1] == 0 ? nullptr : tmp_num_nzeroes->ConstPlaneRow(1, sby[1] - 1),
        sby[2] == 0 ? nullptr : tmp_num_nzeroes->ConstPlaneRow(2, sby[2] - 1),
    };
    const uint8_t* JXL_RESTRICT row_qdc =
        qdc.ConstRow(rect.y0() + by) + rect.x0();
    const int32_t* JXL_RESTRICT row_qf = rect.ConstRow(qf, by);
    AcStrategyRow acs_row = ac_strategy.ConstRow(rect, by);
    for (size_t bx = 0; bx < xsize_blocks; ++bx) {
      AcStrategy acs = acs_row[bx];
      if (!acs.IsFirstBlock()) continue;
      size_t sbx[3] = {bx >> cs.HShift(0), bx >> cs.HShift(1),
                       bx >> cs.HShift(2)};
      size_t cx = acs.covered_blocks_x();
      size_t cy = acs.covered_blocks_y();
      const size_t covered_blocks = cx * cy;  // = #LLF coefficients
      const size_t log2_covered_blocks =
          Num0BitsBelowLS1Bit_Nonzero(covered_blocks);
      const size_t size = covered_blocks * kDCTBlockSize;

      CoefficientLayout(&cy, &cx);  // swap cx/cy to canonical order

      for (int c : {1, 0, 2}) {
        if (sbx[c] << cs.HShift(c) != bx) continue;
        if (sby[c] << cs.VShift(c) != by) continue;
        const int32_t* JXL_RESTRICT block = ac_rows[c] + offset[c];

        int32_t nzeros =
            (covered_blocks == 1)
                ? NumNonZero8x8ExceptDC(block, row_nzeros[c] + sbx[c])
                : NumNonZeroExceptLLF(cx, cy, acs, covered_blocks,
                                      log2_covered_blocks, block,
                                      nzeros_stride, row_nzeros[c] + sbx[c]);

        int ord = kStrategyOrder[acs.RawStrategy()];
        const coeff_order_t* JXL_RESTRICT order =
            &orders[CoeffOrderOffset(ord, c)];

        int32_t predicted_nzeros =
            PredictFromTopAndLeft(row_nzeros_top[c], row_nzeros[c], sbx[c], 32);
        size_t block_ctx =
            block_ctx_map.Context(row_qdc[bx], row_qf[sbx[c]], ord, c);
        const int32_t nzero_ctx =
            block_ctx_map.NonZeroContext(predicted_nzeros, block_ctx);

        output->emplace_back(nzero_ctx, nzeros);
        const size_t histo_offset =
            block_ctx_map.ZeroDensityContextsOffset(block_ctx);
        // Skip LLF.
        size_t prev = (nzeros > static_cast<ssize_t>(size / 16) ? 0 : 1);
        for (size_t k = covered_blocks; k < size && nzeros != 0; ++k) {
          int32_t coeff = block[order[k]];
          size_t ctx =
              histo_offset + ZeroDensityContext(nzeros, k, covered_blocks,
                                                log2_covered_blocks, prev);
          uint32_t u_coeff = PackSigned(coeff);
          output->emplace_back(ctx, u_coeff);
          prev = (coeff != 0) ? 1 : 0;
          nzeros -= prev;
        }
        JXL_DASSERT(nzeros == 0);
        offset[c] += size;
      }
    }
  }
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace jxl
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace jxl {
HWY_EXPORT(TokenizeCoefficients);
void TokenizeCoefficients(const coeff_order_t* JXL_RESTRICT orders,
                          const Rect& rect,
                          const int32_t* JXL_RESTRICT* JXL_RESTRICT ac_rows,
                          const AcStrategyImage& ac_strategy,
                          const YCbCrChromaSubsampling& cs,
                          Image3I* JXL_RESTRICT tmp_num_nzeroes,
                          std::vector<Token>* JXL_RESTRICT output,
                          const ImageB& qdc, const ImageI& qf,
                          const BlockCtxMap& block_ctx_map) {
  HWY_DYNAMIC_DISPATCH(TokenizeCoefficients)
  (orders, rect, ac_rows, ac_strategy, cs, tmp_num_nzeroes, output, qdc, qf,
   block_ctx_map);
}

}  // namespace jxl
#endif  // HWY_ONCE
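
// Usage sketch (hypothetical caller, for illustration only): the encoder
// gathers tokens per group and histogram-codes them afterwards, roughly:
//
//   std::vector<Token> tokens;
//   TokenizeCoefficients(orders, rect, ac_rows, ac_strategy, cs,
//                        &tmp_num_nzeroes, &tokens, qdc, qf, block_ctx_map);
//   // tokens are subsequently entropy-coded (ANS or prefix codes) by the
//   // encoder's histogram-building pass; see lib/jxl/enc_ans.h.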