libjxl

FORK: libjxl with patches used on the blog
git clone https://git.neptards.moe/blog/libjxl.git
Log | Files | Refs | Submodules | README | LICENSE

enc_group.cc (20623B)


      1 // Copyright (c) the JPEG XL Project Authors. All rights reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style
      4 // license that can be found in the LICENSE file.
      5 
      6 #include "lib/jxl/enc_group.h"
      7 
      8 #include <hwy/aligned_allocator.h>
      9 #include <utility>
     10 
     11 #undef HWY_TARGET_INCLUDE
     12 #define HWY_TARGET_INCLUDE "lib/jxl/enc_group.cc"
     13 #include <hwy/foreach_target.h>
     14 #include <hwy/highway.h>
     15 
     16 #include "lib/jxl/ac_strategy.h"
     17 #include "lib/jxl/base/bits.h"
     18 #include "lib/jxl/base/compiler_specific.h"
     19 #include "lib/jxl/common.h"  // kMaxNumPasses
     20 #include "lib/jxl/dct_util.h"
     21 #include "lib/jxl/dec_transforms-inl.h"
     22 #include "lib/jxl/enc_aux_out.h"
     23 #include "lib/jxl/enc_cache.h"
     24 #include "lib/jxl/enc_params.h"
     25 #include "lib/jxl/enc_transforms-inl.h"
     26 #include "lib/jxl/image.h"
     27 #include "lib/jxl/quantizer-inl.h"
     28 #include "lib/jxl/quantizer.h"
     29 #include "lib/jxl/simd_util.h"
     30 HWY_BEFORE_NAMESPACE();
     31 namespace jxl {
     32 namespace HWY_NAMESPACE {
     33 
     34 // These templates are not found via ADL.
     35 using hwy::HWY_NAMESPACE::Abs;
     36 using hwy::HWY_NAMESPACE::Ge;
     37 using hwy::HWY_NAMESPACE::IfThenElse;
     38 using hwy::HWY_NAMESPACE::IfThenElseZero;
     39 using hwy::HWY_NAMESPACE::MaskFromVec;
     40 using hwy::HWY_NAMESPACE::Round;
     41 
     42 // NOTE: caller takes care of extracting quant from rect of RawQuantField.
        // Quantizes one AC block (xsize x ysize 8x8 sub-blocks) of channel c:
        //   block_out[k] = round(block_in[k] * qm[k] * Scale() * (*quant) * qm_multiplier)
        // where values whose scaled magnitude falls below the per-quadrant
        // dead-zone threshold are forced to zero. `thresholds` holds 4 entries
        // indexed by (bottom half ? 2 : 0) + (right half ? 1 : 0) and may be
        // lowered in place for chroma below.
        // NOTE(review): `error_diffusion` is not referenced in this body;
        // presumably kept for interface parity with callers -- confirm.
     43 void QuantizeBlockAC(const Quantizer& quantizer, const bool error_diffusion,
     44                      size_t c, float qm_multiplier, size_t quant_kind,
     45                      size_t xsize, size_t ysize, float* thresholds,
     46                      const float* JXL_RESTRICT block_in, int32_t* quant,
     47                      int32_t* JXL_RESTRICT block_out) {
     48   const float* JXL_RESTRICT qm = quantizer.InvDequantMatrix(quant_kind, c);
     49   float qac = quantizer.Scale() * (*quant);
     50   // Not SIMD-ified for now.
        // For chroma (c != 1) blocks covering at least 4 sub-blocks, lower the
        // dead-zone thresholds proportionally to block area, floored at 0.5.
     51   if (c != 1 && xsize * ysize >= 4) {
     52     for (int i = 0; i < 4; ++i) {
     53       thresholds[i] -= 0.00744f * xsize * ysize;
     54       if (thresholds[i] < 0.5) {
     55         thresholds[i] = 0.5;
     56       }
     57     }
     58   }
     59   HWY_CAPPED(float, kBlockDim) df;
     60   HWY_CAPPED(int32_t, kBlockDim) di;
     61   HWY_CAPPED(uint32_t, kBlockDim) du;
     62   const auto quantv = Set(df, qac * qm_multiplier);
     63   for (size_t y = 0; y < ysize * kBlockDim; y++) {
        // yfix: 0 for rows in the top half of the block, 2 for the bottom half;
        // picks which pair of `thresholds` applies to this row.
     64     size_t yfix = static_cast<size_t>(y >= ysize * kBlockDim / 2) * 2;
     65     const size_t off = y * kBlockDim * xsize;
     66     for (size_t x = 0; x < xsize * kBlockDim; x += Lanes(df)) {
     67       auto thr = Zero(df);
     68       if (xsize == 1) {
        // Single-sub-block-wide row: the left/right threshold split falls
        // inside one vector, so blend per lane. The initializer list assumes
        // kBlockDim == 8 (right half of 8 lanes selected).
     69         HWY_ALIGN uint32_t kMask[kBlockDim] = {0, 0, 0, 0, ~0u, ~0u, ~0u, ~0u};
     70         const auto mask = MaskFromVec(BitCast(df, Load(du, kMask + x)));
     71         thr = IfThenElse(mask, Set(df, thresholds[yfix + 1]),
     72                          Set(df, thresholds[yfix]));
     73       } else {
     74         // Same for all lanes in the vector.
     75         thr = Set(
     76             df,
     77             thresholds[yfix + static_cast<size_t>(x >= xsize * kBlockDim / 2)]);
     78       }
     79       const auto q = Mul(Load(df, qm + off + x), quantv);
     80       const auto in = Load(df, block_in + off + x);
     81       const auto val = Mul(q, in);
        // Dead zone: lanes with |val| < thr quantize to zero.
     82       const auto nzero_mask = Ge(Abs(val), thr);
     83       const auto v = ConvertTo(di, IfThenElseZero(nzero_mask, Round(val)));
     84       Store(v, di, block_out + off + x);
     85     }
     86   }
     87 }
     88 
        // Simulates quantization of the block and heuristically tunes *quant
        // (in/out) and the four per-quadrant dead-zone `thresholds` in place,
        // ahead of the real quantization in QuantizeBlockAC. No-op for the
        // small/partial transform kinds listed below. `c` is the channel
        // index (1 == Y).
     89 void AdjustQuantBlockAC(const Quantizer& quantizer, size_t c,
     90                         float qm_multiplier, size_t quant_kind, size_t xsize,
     91                         size_t ysize, float* thresholds,
     92                         const float* JXL_RESTRICT block_in, int32_t* quant) {
     93   // No quantization adjusting for these small blocks.
     94   // Quantization adjusting attempts to fix some known issues
     95   // with larger blocks and on the 8x8 dct's emerging 8x8 blockiness
     96   // when there are not many non-zeros.
     97   constexpr size_t kPartialBlockKinds =
     98       (1 << AcStrategy::Type::IDENTITY) | (1 << AcStrategy::Type::DCT2X2) |
     99       (1 << AcStrategy::Type::DCT4X4) | (1 << AcStrategy::Type::DCT4X8) |
    100       (1 << AcStrategy::Type::DCT8X4) | (1 << AcStrategy::Type::AFV0) |
    101       (1 << AcStrategy::Type::AFV1) | (1 << AcStrategy::Type::AFV2) |
    102       (1 << AcStrategy::Type::AFV3);
    103   if ((1 << quant_kind) & kPartialBlockKinds) {
    104     return;
    105   }
    106 
    107   const float* JXL_RESTRICT qm = quantizer.InvDequantMatrix(quant_kind, c);
    108   float qac = quantizer.Scale() * (*quant);
        // Slightly lower the dead-zone thresholds for multi-sub-block
        // transforms, floored at 0.54.
    109   if (xsize > 1 || ysize > 1) {
    110     for (int i = 0; i < 4; ++i) {
    111       thresholds[i] -= Clamp1(0.003f * xsize * ysize, 0.f, 0.08f);
    112       if (thresholds[i] < 0.54) {
    113         thresholds[i] = 0.54;
    114       }
    115     }
    116   }
        // Statistics gathered by the simulated quantization pass below:
        //   hfNonZeros[q] - sum of |quantized values| per quadrant q
        //   hfMaxError[q] - largest rounding error among zeroed Y coefficients
        //   sum_of_highest_freq_row_and_column - |scaled value| mass near the
        //     block's highest-frequency border/corner
    117   float sum_of_highest_freq_row_and_column = 0;
    118   float sum_of_error = 0;
    119   float sum_of_vals = 0;
    120   float hfNonZeros[4] = {};
    121   float hfMaxError[4] = {};
    122 
    123   for (size_t y = 0; y < ysize * kBlockDim; y++) {
    124     for (size_t x = 0; x < xsize * kBlockDim; x++) {
    125       const size_t pos = y * kBlockDim * xsize + x;
        // Skip the lowest-frequency xsize*ysize corner of the coefficient
        // layout (handled via DCFromLowestFrequencies, not AC quantization).
    126       if (x < xsize && y < ysize) {
    127         continue;
    128       }
        // Quadrant index: (bottom half ? 2 : 0) + (right half ? 1 : 0),
        // matching the `thresholds` indexing in QuantizeBlockAC.
    129       const size_t hfix = (static_cast<size_t>(y >= ysize * kBlockDim / 2) * 2 +
    130                            static_cast<size_t>(x >= xsize * kBlockDim / 2));
        // Mirror QuantizeBlockAC: scale, apply dead zone, round.
    131       const float val = block_in[pos] * (qm[pos] * qac * qm_multiplier);
    132       const float v = (std::abs(val) < thresholds[hfix]) ? 0 : rintf(val);
    133       const float error = std::abs(val - v);
    134       sum_of_error += error;
    135       sum_of_vals += std::abs(v);
    136       if (c == 1 && v == 0) {
    137         if (hfMaxError[hfix] < error) {
    138           hfMaxError[hfix] = error;
    139         }
    140       }
    141       if (v != 0.0f) {
    142         hfNonZeros[hfix] += std::abs(v);
        // Track energy in the highest-frequency corner and along the block's
        // last row/column (restricted to the outer region).
    143         bool in_corner = y >= 7 * ysize && x >= 7 * xsize;
    144         bool on_border =
    145             y == ysize * kBlockDim - 1 || x == xsize * kBlockDim - 1;
    146         bool in_larger_corner = x >= 4 * xsize && y >= 4 * ysize;
    147         if (in_corner || (on_border && in_larger_corner)) {
    148           sum_of_highest_freq_row_and_column += std::abs(val);
    149         }
    150       }
    151     }
    152   }
        // Near-empty Y block: if any non-DC quadrant quantized entirely to zero
        // but had a large rounding error, bump quant by one and widen that
        // quadrant's dead zone (scaled by new_quant/orig_quant) so it stays
        // zero after requantization.
    153   if (c == 1 && sum_of_vals * 8 < xsize * ysize) {
    154     static const double kLimit[4] = {
    155         0.46,
    156         0.46,
    157         0.46,
    158         0.46,
    159     };
    160     static const double kMul[4] = {
    161         0.9999,
    162         0.9999,
    163         0.9999,
    164         0.9999,
    165     };
    166     const int32_t orig_quant = *quant;
    167     int32_t new_quant = *quant;
    168     for (int i = 1; i < 4; ++i) {
    169       if (hfNonZeros[i] == 0.0 && hfMaxError[i] > kLimit[i]) {
    170         new_quant = orig_quant + 1;
    171         break;
    172       }
    173     }
    174     *quant = new_quant;
    175     if (hfNonZeros[3] == 0.0 && hfMaxError[3] > kLimit[3]) {
    176       thresholds[3] = kMul[3] * hfMaxError[3] * new_quant / orig_quant;
    177     } else if ((hfNonZeros[1] == 0.0 && hfMaxError[1] > kLimit[1]) ||
    178                (hfNonZeros[2] == 0.0 && hfMaxError[2] > kLimit[2])) {
    179       thresholds[1] = kMul[1] * std::max(hfMaxError[1], hfMaxError[2]) *
    180                       new_quant / orig_quant;
    181       thresholds[2] = thresholds[1];
    182     } else if (hfNonZeros[0] == 0.0 && hfMaxError[0] > kLimit[0]) {
    183       thresholds[0] = kMul[0] * hfMaxError[0] * new_quant / orig_quant;
    184     }
    185   }
    186   // Heuristic for improving accuracy of high-frequency patterns
    187   // occurring in an environment with no medium-frequency masking
    188   // patterns.
    189   {
    190     float all =
    191         hfNonZeros[0] + hfNonZeros[1] + hfNonZeros[2] + hfNonZeros[3] + 1;
        // Per-channel weight (X, Y, B) on the border/corner energy.
    192     float mul[3] = {70, 30, 60};
    193     if (mul[c] * sum_of_highest_freq_row_and_column >= all) {
    194       *quant += mul[c] * sum_of_highest_freq_row_and_column / all;
    195       if (*quant >= Quantizer::kQuantMax) {
    196         *quant = Quantizer::kQuantMax - 1;
    197       }
    198     }
    199   }
    200   if (quant_kind == AcStrategy::Type::DCT) {
    201     // If this 8x8 block is too flat, increase the adaptive quantization level
    202     // a bit to reduce visible block boundaries and requantize the block.
    203     if (hfNonZeros[0] + hfNonZeros[1] + hfNonZeros[2] + hfNonZeros[3] < 11) {
    204       *quant += 1;
    205       if (*quant >= Quantizer::kQuantMax) {
    206         *quant = Quantizer::kQuantMax - 1;
    207       }
    208     }
    209   }
        // For the larger DCT kinds, raise quant by up to 2 steps when the total
        // rounding error exceeds a budget that scales with block area (kMul1)
        // and with the quantized-value mass (kMul2); tables are indexed
        // [transform class][channel].
    210   {
    211     static const double kMul1[4][3] = {
    212         {
    213             0.22080615753848404,
    214             0.45797479824262011,
    215             0.29859235095977965,
    216         },
    217         {
    218             0.70109486510286834,
    219             0.16185281305512639,
    220             0.14387691730035473,
    221         },
    222         {
    223             0.114985964456218638,
    224             0.44656840441027695,
    225             0.10587658215149048,
    226         },
    227         {
    228             0.46849665264409396,
    229             0.41239077937781954,
    230             0.088667407767185444,
    231         },
    232     };
    233     static const double kMul2[4][3] = {
    234         {
    235             0.27450281941822197,
    236             1.1255766549984996,
    237             0.98950459134128388,
    238         },
    239         {
    240             0.4652168675598285,
    241             0.40945807983455818,
    242             0.36581899811751367,
    243         },
    244         {
    245             0.28034972424715715,
    246             0.9182653201929738,
    247             1.5581531543057416,
    248         },
    249         {
    250             0.26873118114033728,
    251             0.68863712390392484,
    252             1.2082185408666786,
    253         },
    254     };
    255     static const double kQuantNormalizer = 2.2942708343284721;
    256     sum_of_error *= kQuantNormalizer;
    257     sum_of_vals *= kQuantNormalizer;
    258     if (quant_kind >= AcStrategy::Type::DCT16X16) {
    259       int ix = 3;
    260       if (quant_kind == AcStrategy::Type::DCT32X16 ||
    261           quant_kind == AcStrategy::Type::DCT16X32) {
    262         ix = 1;
    263       } else if (quant_kind == AcStrategy::Type::DCT16X16) {
    264         ix = 0;
    265       } else if (quant_kind == AcStrategy::Type::DCT32X32) {
    266         ix = 2;
    267       }
    268       int step =
    269           sum_of_error / (kMul1[ix][c] * xsize * ysize * kBlockDim * kBlockDim +
    270                           kMul2[ix][c] * sum_of_vals);
    271       if (step >= 2) {
    272         step = 2;
    273       }
    274       if (step < 0) {
    275         step = 0;
    276       }
    277       if (sum_of_error > kMul1[ix][c] * xsize * ysize * kBlockDim * kBlockDim +
    278                              kMul2[ix][c] * sum_of_vals) {
    279         *quant += step;
    280         if (*quant >= Quantizer::kQuantMax) {
    281           *quant = Quantizer::kQuantMax - 1;
    282         }
    283       }
    284     }
    285   }
    286   {
    287     // Reduce quant in highly active areas.
        // activity = the smallest per-quadrant rounded average of quantized
        // magnitude per sub-block, capped at 15; it is subtracted from quant
        // but never below max(4, original quant / 2).
    288     int32_t div = (xsize * ysize);
    289     int32_t activity = (static_cast<int32_t>(hfNonZeros[0]) + div / 2) / div;
    290     int32_t orig_qp_limit = std::max(4, *quant / 2);
    291     for (int i = 1; i < 4; ++i) {
    292       activity = std::min(
    293           activity, (static_cast<int32_t>(hfNonZeros[i]) + div / 2) / div);
    294     }
    295     if (activity >= 15) {
    296       activity = 15;
    297     }
    298     int32_t qp = *quant - activity;
        // For Y, widen the non-top-left quadrants' dead zones to compensate
        // for the lowered quant.
    299     if (c == 1) {
    300       for (int i = 1; i < 4; ++i) {
    301         thresholds[i] += 0.01 * activity;
    302       }
    303     }
    304     if (qp < orig_qp_limit) {
    305       qp = orig_qp_limit;
    306     }
    307     *quant = qp;
    308   }
    309 }
    310 
    311 // NOTE: caller takes care of extracting quant from rect of RawQuantField.
        // Quantizes the Y channel of the block into `quantized + size`, then
        // dequantizes it back into `inout + size` (with the AdjustQuantBias
        // correction) so later X/B decisions see the reconstructed Y values.
        // For speed tiers kHare or slower, first runs AdjustQuantBlockAC on
        // all three channels (Y first, then X and B via `clut`) and keeps the
        // largest resulting quant; the Y thresholds it produced are reused
        // for the actual quantization. `size` is the per-channel coefficient
        // count; channel c lives at inout/quantized + c * size.
    312 void QuantizeRoundtripYBlockAC(PassesEncoderState* enc_state, const size_t size,
    313                                const Quantizer& quantizer,
    314                                const bool error_diffusion, size_t quant_kind,
    315                                size_t xsize, size_t ysize,
    316                                const float* JXL_RESTRICT biases, int32_t* quant,
    317                                float* JXL_RESTRICT inout,
    318                                int32_t* JXL_RESTRICT quantized) {
    319   float thres_y[4] = {0.58f, 0.64f, 0.64f, 0.64f};
    320   if (enc_state->cparams.speed_tier <= SpeedTier::kHare) {
    321     int32_t max_quant = 0;
    322     int quant_orig = *quant;
        // Per-channel qm multipliers, indexed by channel (X, Y, B).
    323     float val[3] = {enc_state->x_qm_multiplier, 1.0f,
    324                     enc_state->b_qm_multiplier};
    325     int clut[3] = {1, 0, 2};
    326     for (int ii = 0; ii < 3; ++ii) {
    327       float thres[4] = {0.58f, 0.64f, 0.64f, 0.64f};
    328       int c = clut[ii];
        // Each channel's adjustment starts from the caller's original quant.
    329       *quant = quant_orig;
    330       AdjustQuantBlockAC(quantizer, c, val[c], quant_kind, xsize, ysize,
    331                          &thres[0], inout + c * size, quant);
    332       // Dead zone adjustment
    333       if (c == 1) {
    334         for (int k = 0; k < 4; ++k) {
    335           thres_y[k] = thres[k];
    336         }
    337       }
    338       max_quant = std::max(*quant, max_quant);
    339     }
    340     *quant = max_quant;
    341   } else {
        // Faster tiers: fixed, slightly narrower dead zone for Y.
    342     thres_y[0] = 0.56;
    343     thres_y[1] = 0.62;
    344     thres_y[2] = 0.62;
    345     thres_y[3] = 0.62;
    346   }
    347 
    348   QuantizeBlockAC(quantizer, error_diffusion, 1, 1.0f, quant_kind, xsize, ysize,
    349                   &thres_y[0], inout + size, quant, quantized + size);
    350 
    351   const float* JXL_RESTRICT dequant_matrix =
    352       quantizer.DequantMatrix(quant_kind, 1);
    353 
    354   HWY_CAPPED(float, kDCTBlockSize) df;
    355   HWY_CAPPED(int32_t, kDCTBlockSize) di;
    356   const auto inv_qac = Set(df, quantizer.inv_quant_ac(*quant));
        // Dequantize Y back into `inout + size` so the caller sees the
        // roundtripped coefficients.
    357   for (size_t k = 0; k < kDCTBlockSize * xsize * ysize; k += Lanes(df)) {
        // NOTE(review): this local vector `quant` shadows the int32_t*
        // parameter `quant`; intentional but easy to misread.
    358     const auto quant = Load(di, quantized + size + k);
    359     const auto adj_quant = AdjustQuantBias(di, 1, quant, biases);
    360     const auto dequantm = Load(df, dequant_matrix + k);
    361     Store(Mul(Mul(adj_quant, dequantm), inv_qac), df, inout + size + k);
    362   }
    363 }
    364 
        // Per-group encoder driver: for every AC-strategy block in the group,
        // DCTs the three channels, roundtrip-quantizes Y, removes the local
        // color correlation from X/B, quantizes X/B, writes the DC image from
        // the lowest-frequency coefficients, and hands the quantized
        // coefficients to the progressive splitter. Also writes back the
        // (possibly adjusted) per-block quant into raw_quant_field.
    365 void ComputeCoefficients(size_t group_idx, PassesEncoderState* enc_state,
    366                          const Image3F& opsin, const Rect& rect, Image3F* dc) {
    367   const Rect block_group_rect =
    368       enc_state->shared.frame_dim.BlockGroupRect(group_idx);
        // Color-correlation map tiles covering this group's blocks.
    369   const Rect cmap_rect(
    370       block_group_rect.x0() / kColorTileDimInBlocks,
    371       block_group_rect.y0() / kColorTileDimInBlocks,
    372       DivCeil(block_group_rect.xsize(), kColorTileDimInBlocks),
    373       DivCeil(block_group_rect.ysize(), kColorTileDimInBlocks));
    374   const Rect group_rect =
    375       enc_state->shared.frame_dim.GroupRect(group_idx).Translate(rect.x0(),
    376                                                                  rect.y0());
    377 
    378   const size_t xsize_blocks = block_group_rect.xsize();
    379   const size_t ysize_blocks = block_group_rect.ysize();
    380 
    381   const size_t dc_stride = static_cast<size_t>(dc->PixelsPerRow());
    382   const size_t opsin_stride = static_cast<size_t>(opsin.PixelsPerRow());
    383 
    384   ImageI& full_quant_field = enc_state->shared.raw_quant_field;
    385   const CompressParams& cparams = enc_state->cparams;
    386 
    387   const size_t dct_scratch_size =
    388       3 * (MaxVectorSize() / sizeof(float)) * AcStrategy::kMaxBlockDim;
    389 
    390   // TODO(veluca): consider strategies to reduce this memory.
        // Scratch buffers sized for the largest transform: `mem` holds the
        // quantized int coefficients (3 channels), `fmem` the float DCT output
        // plus TransformFromPixels scratch space.
    391   auto mem = hwy::AllocateAligned<int32_t>(3 * AcStrategy::kMaxCoeffArea);
    392   auto fmem = hwy::AllocateAligned<float>(5 * AcStrategy::kMaxCoeffArea +
    393                                           dct_scratch_size);
    394   float* JXL_RESTRICT scratch_space =
    395       fmem.get() + 3 * AcStrategy::kMaxCoeffArea;
    396   {
    397     // Only use error diffusion in Squirrel mode or slower.
    398     const bool error_diffusion = cparams.speed_tier <= SpeedTier::kSquirrel;
    399     constexpr HWY_CAPPED(float, kDCTBlockSize) d;
    400 
        // coeffs[c][pass] walks each pass's output stream as blocks are emitted.
    401     int32_t* JXL_RESTRICT coeffs[3][kMaxNumPasses] = {};
    402     size_t num_passes = enc_state->progressive_splitter.GetNumPasses();
    403     JXL_DASSERT(num_passes > 0);
    404     for (size_t i = 0; i < num_passes; i++) {
    405       // TODO(veluca): 16-bit quantized coeffs are not implemented yet.
    406       JXL_ASSERT(enc_state->coeffs[i]->Type() == ACType::k32);
    407       for (size_t c = 0; c < 3; c++) {
    408         coeffs[c][i] = enc_state->coeffs[i]->PlaneRow(c, group_idx, 0).ptr32;
    409       }
    410     }
    411 
    412     HWY_ALIGN float* coeffs_in = fmem.get();
    413     HWY_ALIGN int32_t* quantized = mem.get();
    414 
    415     for (size_t by = 0; by < ysize_blocks; ++by) {
    416       int32_t* JXL_RESTRICT row_quant_ac =
    417           block_group_rect.Row(&full_quant_field, by);
        // Row of color-correlation tiles for this block row (no map for Y).
    418       size_t ty = by / kColorTileDimInBlocks;
    419       const int8_t* JXL_RESTRICT row_cmap[3] = {
    420           cmap_rect.ConstRow(enc_state->shared.cmap.ytox_map, ty),
    421           nullptr,
    422           cmap_rect.ConstRow(enc_state->shared.cmap.ytob_map, ty),
    423       };
    424       const float* JXL_RESTRICT opsin_rows[3] = {
    425           group_rect.ConstPlaneRow(opsin, 0, by * kBlockDim),
    426           group_rect.ConstPlaneRow(opsin, 1, by * kBlockDim),
    427           group_rect.ConstPlaneRow(opsin, 2, by * kBlockDim),
    428       };
    429       float* JXL_RESTRICT dc_rows[3] = {
    430           block_group_rect.PlaneRow(dc, 0, by),
    431           block_group_rect.PlaneRow(dc, 1, by),
    432           block_group_rect.PlaneRow(dc, 2, by),
    433       };
    434       AcStrategyRow ac_strategy_row =
    435           enc_state->shared.ac_strategy.ConstRow(block_group_rect, by);
        // Iterate per color tile: the X/B correlation factors are constant
        // within a tile.
    436       for (size_t tx = 0; tx < DivCeil(xsize_blocks, kColorTileDimInBlocks);
    437            tx++) {
    438         const auto x_factor =
    439             Set(d, enc_state->shared.cmap.YtoXRatio(row_cmap[0][tx]));
    440         const auto b_factor =
    441             Set(d, enc_state->shared.cmap.YtoBRatio(row_cmap[2][tx]));
    442         for (size_t bx = tx * kColorTileDimInBlocks;
    443              bx < xsize_blocks && bx < (tx + 1) * kColorTileDimInBlocks; ++bx) {
    444           const AcStrategy acs = ac_strategy_row[bx];
        // Multi-block transforms are processed once, at their first block.
    445           if (!acs.IsFirstBlock()) continue;
    446 
    447           size_t xblocks = acs.covered_blocks_x();
    448           size_t yblocks = acs.covered_blocks_y();
    449 
    450           CoefficientLayout(&yblocks, &xblocks);
    451 
        // Per-channel coefficient count; channel c's coefficients live at
        // offset c * size in coeffs_in / quantized.
    452           size_t size = kDCTBlockSize * xblocks * yblocks;
    453 
    454           // DCT Y channel, roundtrip-quantize it and set DC.
    455           int32_t quant_ac = row_quant_ac[bx];
    456           for (size_t c : {0, 1, 2}) {
    457             TransformFromPixels(acs.Strategy(), opsin_rows[c] + bx * kBlockDim,
    458                                 opsin_stride, coeffs_in + c * size,
    459                                 scratch_space);
    460           }
        // Y DC comes from the Y plane at coeffs_in + size.
    461           DCFromLowestFrequencies(acs.Strategy(), coeffs_in + size,
    462                                   dc_rows[1] + bx, dc_stride);
    463 
    464           QuantizeRoundtripYBlockAC(
    465               enc_state, size, enc_state->shared.quantizer, error_diffusion,
    466               acs.RawStrategy(), xblocks, yblocks, kDefaultQuantBias, &quant_ac,
    467               coeffs_in, quantized);
    468 
    469           // Unapply color correlation
        // Uses the reconstructed (roundtripped) Y written back by
        // QuantizeRoundtripYBlockAC.
    470           for (size_t k = 0; k < size; k += Lanes(d)) {
    471             const auto in_x = Load(d, coeffs_in + k);
    472             const auto in_y = Load(d, coeffs_in + size + k);
    473             const auto in_b = Load(d, coeffs_in + 2 * size + k);
    474             const auto out_x = NegMulAdd(x_factor, in_y, in_x);
    475             const auto out_b = NegMulAdd(b_factor, in_y, in_b);
    476             Store(out_x, d, coeffs_in + k);
    477             Store(out_b, d, coeffs_in + 2 * size + k);
    478           }
    479 
    480           // Quantize X and B channels and set DC.
    481           for (size_t c : {0, 2}) {
    482             float thres[4] = {0.58f, 0.62f, 0.62f, 0.62f};
    483             QuantizeBlockAC(enc_state->shared.quantizer, error_diffusion, c,
    484                             c == 0 ? enc_state->x_qm_multiplier
    485                                    : enc_state->b_qm_multiplier,
    486                             acs.RawStrategy(), xblocks, yblocks, &thres[0],
    487                             coeffs_in + c * size, &quant_ac,
    488                             quantized + c * size);
    489             DCFromLowestFrequencies(acs.Strategy(), coeffs_in + c * size,
    490                                     dc_rows[c] + bx, dc_stride);
    491           }
        // Persist the (possibly adjusted) quant for this block and advance
        // the per-pass output streams.
    492           row_quant_ac[bx] = quant_ac;
    493           for (size_t c = 0; c < 3; c++) {
    494             enc_state->progressive_splitter.SplitACCoefficients(
    495                 quantized + c * size, acs, bx, by, coeffs[c]);
    496             for (size_t p = 0; p < num_passes; p++) {
    497               coeffs[c][p] += size;
    498             }
    499           }
    500         }
    501       }
    502     }
    503   }
    504 }
    505 
    506 // NOLINTNEXTLINE(google-readability-namespace-comments)
    507 }  // namespace HWY_NAMESPACE
    508 }  // namespace jxl
    509 HWY_AFTER_NAMESPACE();
    510 
    511 #if HWY_ONCE
    512 namespace jxl {
    513 HWY_EXPORT(ComputeCoefficients);
        // Thin dispatcher: forwards to the best available SIMD target's
        // ComputeCoefficients implementation (selected at runtime by highway).
    514 void ComputeCoefficients(size_t group_idx, PassesEncoderState* enc_state,
    515                          const Image3F& opsin, const Rect& rect, Image3F* dc) {
    516   HWY_DYNAMIC_DISPATCH(ComputeCoefficients)
    517   (group_idx, enc_state, opsin, rect, dc);
    518 }
    519 
    520 Status EncodeGroupTokenizedCoefficients(size_t group_idx, size_t pass_idx,
    521                                         size_t histogram_idx,
    522                                         const PassesEncoderState& enc_state,
    523                                         BitWriter* writer, AuxOut* aux_out) {
    524   // Select which histogram to use among those of the current pass.
    525   const size_t num_histograms = enc_state.shared.num_histograms;
    526   // num_histograms is 0 only for lossless.
    527   JXL_ASSERT(num_histograms == 0 || histogram_idx < num_histograms);
    528   size_t histo_selector_bits = CeilLog2Nonzero(num_histograms);
    529 
    530   if (histo_selector_bits != 0) {
    531     BitWriter::Allotment allotment(writer, histo_selector_bits);
    532     writer->Write(histo_selector_bits, histogram_idx);
    533     allotment.ReclaimAndCharge(writer, kLayerAC, aux_out);
    534   }
    535   size_t context_offset =
    536       histogram_idx * enc_state.shared.block_ctx_map.NumACContexts();
    537   WriteTokens(enc_state.passes[pass_idx].ac_tokens[group_idx],
    538               enc_state.passes[pass_idx].codes,
    539               enc_state.passes[pass_idx].context_map, context_offset, writer,
    540               kLayerACTokens, aux_out);
    541 
    542   return true;
    543 }
    544 
    545 }  // namespace jxl
    546 #endif  // HWY_ONCE