libjxl

FORK: libjxl patches used on blog
git clone https://git.neptards.moe/blog/libjxl.git
Log | Files | Refs | Submodules | README | LICENSE

enc_modular.cc (72414B)


      1 // Copyright (c) the JPEG XL Project Authors. All rights reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style
      4 // license that can be found in the LICENSE file.
      5 
      6 #include "lib/jxl/enc_modular.h"
      7 
      8 #include <stddef.h>
      9 #include <stdint.h>
     10 
     11 #include <array>
     12 #include <atomic>
     13 #include <cstdint>
     14 #include <limits>
     15 #include <utility>
     16 #include <vector>
     17 
     18 #include "lib/jxl/base/compiler_specific.h"
     19 #include "lib/jxl/base/printf_macros.h"
     20 #include "lib/jxl/base/status.h"
     21 #include "lib/jxl/compressed_dc.h"
     22 #include "lib/jxl/dec_ans.h"
     23 #include "lib/jxl/enc_aux_out.h"
     24 #include "lib/jxl/enc_bit_writer.h"
     25 #include "lib/jxl/enc_cluster.h"
     26 #include "lib/jxl/enc_fields.h"
     27 #include "lib/jxl/enc_gaborish.h"
     28 #include "lib/jxl/enc_params.h"
     29 #include "lib/jxl/enc_patch_dictionary.h"
     30 #include "lib/jxl/enc_quant_weights.h"
     31 #include "lib/jxl/frame_dimensions.h"
     32 #include "lib/jxl/frame_header.h"
     33 #include "lib/jxl/modular/encoding/context_predict.h"
     34 #include "lib/jxl/modular/encoding/enc_encoding.h"
     35 #include "lib/jxl/modular/encoding/encoding.h"
     36 #include "lib/jxl/modular/encoding/ma_common.h"
     37 #include "lib/jxl/modular/modular_image.h"
     38 #include "lib/jxl/modular/options.h"
     39 #include "lib/jxl/modular/transform/enc_transform.h"
     40 #include "lib/jxl/pack_signed.h"
     41 #include "modular/options.h"
     42 
     43 namespace jxl {
     44 
     45 namespace {
     46 // constexpr bool kPrintTree = false;
     47 
     48 // Squeeze default quantization factors
     49 // these quantization factors are for -Q 50  (other qualities simply scale the
     50 // factors; things are rounded down and obviously cannot get below 1)
     51 const float squeeze_quality_factor =
     52     0.35;  // for easy tweaking of the quality range (decrease this number for
     53            // higher quality)
     54 const float squeeze_luma_factor =
     55     1.1;  // for easy tweaking of the balance between luma (or anything
     56           // non-chroma) and chroma (decrease this number for higher quality
     57           // luma)
     58 const float squeeze_quality_factor_xyb = 2.4f;
     59 const float squeeze_xyb_qtable[3][16] = {
     60     {163.84, 81.92, 40.96, 20.48, 10.24, 5.12, 2.56, 1.28, 0.64, 0.32, 0.16,
     61      0.08, 0.04, 0.02, 0.01, 0.005},  // Y
     62     {1024, 512, 256, 128, 64, 32, 16, 8, 4, 2, 1, 0.5, 0.5, 0.5, 0.5,
     63      0.5},  // X
     64     {2048, 1024, 512, 256, 128, 64, 32, 16, 8, 4, 2, 1, 0.5, 0.5, 0.5,
     65      0.5},  // B-Y
     66 };
     67 
     68 const float squeeze_luma_qtable[16] = {163.84, 81.92, 40.96, 20.48, 10.24, 5.12,
     69                                        2.56,   1.28,  0.64,  0.32,  0.16,  0.08,
     70                                        0.04,   0.02,  0.01,  0.005};
     71 // for 8-bit input, the range of YCoCg chroma is -255..255 so basically this
     72 // does 4:2:0 subsampling (two most fine grained layers get quantized away)
     73 const float squeeze_chroma_qtable[16] = {
     74     1024, 512, 256, 128, 64, 32, 16, 8, 4, 2, 1, 0.5, 0.5, 0.5, 0.5, 0.5};
     75 
     76 // Merges the trees in `trees` using nodes that decide on stream_id, as defined
     77 // by `tree_splits`.
     78 void MergeTrees(const std::vector<Tree>& trees,
     79                 const std::vector<size_t>& tree_splits, size_t begin,
     80                 size_t end, Tree* tree) {
     81   JXL_ASSERT(trees.size() + 1 == tree_splits.size());
     82   JXL_ASSERT(end > begin);
     83   JXL_ASSERT(end <= trees.size());
     84   if (end == begin + 1) {
     85     // Insert the tree, adding the opportune offset to all child nodes.
     86     // This will make the leaf IDs wrong, but subsequent roundtripping will fix
     87     // them.
     88     size_t sz = tree->size();
     89     tree->insert(tree->end(), trees[begin].begin(), trees[begin].end());
     90     for (size_t i = sz; i < tree->size(); i++) {
     91       (*tree)[i].lchild += sz;
     92       (*tree)[i].rchild += sz;
     93     }
     94     return;
     95   }
     96   size_t mid = (begin + end) / 2;
     97   size_t splitval = tree_splits[mid] - 1;
     98   size_t cur = tree->size();
     99   tree->emplace_back(1 /*stream_id*/, splitval, 0, 0, Predictor::Zero, 0, 1);
    100   (*tree)[cur].lchild = tree->size();
    101   MergeTrees(trees, tree_splits, mid, end, tree);
    102   (*tree)[cur].rchild = tree->size();
    103   MergeTrees(trees, tree_splits, begin, mid, tree);
    104 }
    105 
    106 void QuantizeChannel(Channel& ch, const int q) {
    107   if (q == 1) return;
    108   for (size_t y = 0; y < ch.plane.ysize(); y++) {
    109     pixel_type* row = ch.plane.Row(y);
    110     for (size_t x = 0; x < ch.plane.xsize(); x++) {
    111       if (row[x] < 0) {
    112         row[x] = -((-row[x] + q / 2) / q) * q;
    113       } else {
    114         row[x] = ((row[x] + q / 2) / q) * q;
    115       }
    116     }
    117   }
    118 }
    119 
    120 // convert binary32 float that corresponds to custom [bits]-bit float (with
    121 // [exp_bits] exponent bits) to a [bits]-bit integer representation that should
    122 // fit in pixel_type
    123 Status float_to_int(const float* const row_in, pixel_type* const row_out,
    124                     size_t xsize, unsigned int bits, unsigned int exp_bits,
    125                     bool fp, double dfactor) {
    126   JXL_ASSERT(sizeof(pixel_type) * 8 >= bits);
    127   if (!fp) {
    128     if (bits > 22) {
    129       for (size_t x = 0; x < xsize; ++x) {
    130         row_out[x] = row_in[x] * dfactor + (row_in[x] < 0 ? -0.5 : 0.5);
    131       }
    132     } else {
    133       float factor = dfactor;
    134       for (size_t x = 0; x < xsize; ++x) {
    135         row_out[x] = row_in[x] * factor + (row_in[x] < 0 ? -0.5f : 0.5f);
    136       }
    137     }
    138     return true;
    139   }
    140   if (bits == 32 && fp) {
    141     JXL_ASSERT(exp_bits == 8);
    142     memcpy(static_cast<void*>(row_out), static_cast<const void*>(row_in),
    143            4 * xsize);
    144     return true;
    145   }
    146 
    147   JXL_ASSERT(bits > 0);
    148   int exp_bias = (1 << (exp_bits - 1)) - 1;
    149   int max_exp = (1 << exp_bits) - 1;
    150   uint32_t sign = (1u << (bits - 1));
    151   int mant_bits = bits - exp_bits - 1;
    152   int mant_shift = 23 - mant_bits;
    153   for (size_t x = 0; x < xsize; ++x) {
    154     uint32_t f;
    155     memcpy(&f, &row_in[x], 4);
    156     int signbit = (f >> 31);
    157     f &= 0x7fffffff;
    158     if (f == 0) {
    159       row_out[x] = (signbit ? sign : 0);
    160       continue;
    161     }
    162     int exp = (f >> 23) - 127;
    163     if (exp == 128) return JXL_FAILURE("Inf/NaN not allowed");
    164     int mantissa = (f & 0x007fffff);
    165     // broke up the binary32 into its parts, now reassemble into
    166     // arbitrary float
    167     exp += exp_bias;
    168     if (exp < 0) {  // will become a subnormal number
    169       // add implicit leading 1 to mantissa
    170       mantissa |= 0x00800000;
    171       if (exp < -mant_bits) {
    172         return JXL_FAILURE(
    173             "Invalid float number: %g cannot be represented with %i "
    174             "exp_bits and %i mant_bits (exp %i)",
    175             row_in[x], exp_bits, mant_bits, exp);
    176       }
    177       mantissa >>= 1 - exp;
    178       exp = 0;
    179     }
    180     // exp should be representable in exp_bits, otherwise input was
    181     // invalid
    182     if (exp > max_exp) return JXL_FAILURE("Invalid float exponent");
    183     if (mantissa & ((1 << mant_shift) - 1)) {
    184       return JXL_FAILURE("%g is losing precision (mant: %x)", row_in[x],
    185                          mantissa);
    186     }
    187     mantissa >>= mant_shift;
    188     f = (signbit ? sign : 0);
    189     f |= (exp << mant_bits);
    190     f |= mantissa;
    191     row_out[x] = static_cast<pixel_type>(f);
    192   }
    193   return true;
    194 }
    195 
    196 float EstimateWPCost(const Image& img, size_t i) {
    197   size_t extra_bits = 0;
    198   float histo_cost = 0;
    199   HybridUintConfig config;
    200   int32_t cutoffs[] = {-500, -392, -255, -191, -127, -95, -63, -47, -31,
    201                        -23,  -15,  -11,  -7,   -4,   -3,  -1,  0,   1,
    202                        3,    5,    7,    11,   15,   23,  31,  47,  63,
    203                        95,   127,  191,  255,  392,  500};
    204   constexpr size_t nc = sizeof(cutoffs) / sizeof(*cutoffs) + 1;
    205   Histogram histo[nc] = {};
    206   weighted::Header wp_header;
    207   PredictorMode(i, &wp_header);
    208   for (const Channel& ch : img.channel) {
    209     const intptr_t onerow = ch.plane.PixelsPerRow();
    210     weighted::State wp_state(wp_header, ch.w, ch.h);
    211     Properties properties(1);
    212     for (size_t y = 0; y < ch.h; y++) {
    213       const pixel_type* JXL_RESTRICT r = ch.Row(y);
    214       for (size_t x = 0; x < ch.w; x++) {
    215         size_t offset = 0;
    216         pixel_type_w left = (x ? r[x - 1] : y ? *(r + x - onerow) : 0);
    217         pixel_type_w top = (y ? *(r + x - onerow) : left);
    218         pixel_type_w topleft = (x && y ? *(r + x - 1 - onerow) : left);
    219         pixel_type_w topright =
    220             (x + 1 < ch.w && y ? *(r + x + 1 - onerow) : top);
    221         pixel_type_w toptop = (y > 1 ? *(r + x - onerow - onerow) : top);
    222         pixel_type guess = wp_state.Predict</*compute_properties=*/true>(
    223             x, y, ch.w, top, left, topright, topleft, toptop, &properties,
    224             offset);
    225         size_t ctx = 0;
    226         for (int c : cutoffs) {
    227           ctx += (c >= properties[0]) ? 1 : 0;
    228         }
    229         pixel_type res = r[x] - guess;
    230         uint32_t token;
    231         uint32_t nbits;
    232         uint32_t bits;
    233         config.Encode(PackSigned(res), &token, &nbits, &bits);
    234         histo[ctx].Add(token);
    235         extra_bits += nbits;
    236         wp_state.UpdateErrors(r[x], x, y, ch.w);
    237       }
    238     }
    239     for (auto& h : histo) {
    240       histo_cost += h.ShannonEntropy();
    241       h.Clear();
    242     }
    243   }
    244   return histo_cost + extra_bits;
    245 }
    246 
    247 float EstimateCost(const Image& img) {
    248   // TODO(veluca): consider SIMDfication of this code.
    249   size_t extra_bits = 0;
    250   float histo_cost = 0;
    251   HybridUintConfig config;
    252   uint32_t cutoffs[] = {0,  1,  3,  5,   7,   11,  15,  23, 31,
    253                         47, 63, 95, 127, 191, 255, 392, 500};
    254   constexpr size_t nc = sizeof(cutoffs) / sizeof(*cutoffs) + 1;
    255   Histogram histo[nc] = {};
    256   for (const Channel& ch : img.channel) {
    257     const intptr_t onerow = ch.plane.PixelsPerRow();
    258     for (size_t y = 0; y < ch.h; y++) {
    259       const pixel_type* JXL_RESTRICT r = ch.Row(y);
    260       for (size_t x = 0; x < ch.w; x++) {
    261         pixel_type_w left = (x ? r[x - 1] : y ? *(r + x - onerow) : 0);
    262         pixel_type_w top = (y ? *(r + x - onerow) : left);
    263         pixel_type_w topleft = (x && y ? *(r + x - 1 - onerow) : left);
    264         size_t maxdiff = std::max(std::max(left, top), topleft) -
    265                          std::min(std::min(left, top), topleft);
    266         size_t ctx = 0;
    267         for (uint32_t c : cutoffs) {
    268           ctx += (c > maxdiff) ? 1 : 0;
    269         }
    270         pixel_type res = r[x] - ClampedGradient(top, left, topleft);
    271         uint32_t token;
    272         uint32_t nbits;
    273         uint32_t bits;
    274         config.Encode(PackSigned(res), &token, &nbits, &bits);
    275         histo[ctx].Add(token);
    276         extra_bits += nbits;
    277       }
    278     }
    279     for (auto& h : histo) {
    280       histo_cost += h.ShannonEntropy();
    281       h.Clear();
    282     }
    283   }
    284   return histo_cost + extra_bits;
    285 }
    286 
    287 bool do_transform(Image& image, const Transform& tr,
    288                   const weighted::Header& wp_header,
    289                   jxl::ThreadPool* pool = nullptr, bool force_jxlart = false) {
    290   Transform t = tr;
    291   bool did_it = true;
    292   if (force_jxlart) {
    293     if (!t.MetaApply(image)) return false;
    294   } else {
    295     did_it = TransformForward(t, image, wp_header, pool);
    296   }
    297   if (did_it) image.transform.push_back(t);
    298   return did_it;
    299 }
    300 
    301 bool maybe_do_transform(Image& image, const Transform& tr,
    302                         const CompressParams& cparams,
    303                         const weighted::Header& wp_header,
    304                         jxl::ThreadPool* pool = nullptr,
    305                         bool force_jxlart = false) {
    306   if (force_jxlart || cparams.speed_tier >= SpeedTier::kSquirrel) {
    307     return do_transform(image, tr, wp_header, pool, force_jxlart);
    308   }
    309   float cost_before = EstimateCost(image);
    310   bool did_it = do_transform(image, tr, wp_header, pool);
    311   if (did_it) {
    312     float cost_after = EstimateCost(image);
    313     JXL_DEBUG_V(7, "Cost before: %f  cost after: %f", cost_before, cost_after);
    314     if (cost_after > cost_before) {
    315       Transform t = image.transform.back();
    316       JXL_RETURN_IF_ERROR(t.Inverse(image, wp_header, pool));
    317       image.transform.pop_back();
    318       did_it = false;
    319     }
    320   }
    321   return did_it;
    322 }
    323 
    324 }  // namespace
    325 
    326 ModularFrameEncoder::ModularFrameEncoder(const FrameHeader& frame_header,
    327                                          const CompressParams& cparams_orig,
    328                                          bool streaming_mode)
    329     : frame_dim_(frame_header.ToFrameDimensions()), cparams_(cparams_orig) {
    330   size_t num_streams =
    331       ModularStreamId::Num(frame_dim_, frame_header.passes.num_passes);
    332   if (cparams_.ModularPartIsLossless()) {
    333     switch (cparams_.decoding_speed_tier) {
    334       case 0:
    335         break;
    336       case 1:
    337         cparams_.options.wp_tree_mode = ModularOptions::TreeMode::kWPOnly;
    338         break;
    339       case 2: {
    340         cparams_.options.wp_tree_mode = ModularOptions::TreeMode::kGradientOnly;
    341         cparams_.options.predictor = Predictor::Gradient;
    342         break;
    343       }
    344       case 3: {  // LZ77, no Gradient.
    345         cparams_.options.nb_repeats = 0;
    346         cparams_.options.predictor = Predictor::Gradient;
    347         break;
    348       }
    349       default: {  // LZ77, no predictor.
    350         cparams_.options.nb_repeats = 0;
    351         cparams_.options.predictor = Predictor::Zero;
    352         break;
    353       }
    354     }
    355   }
    356   if (cparams_.decoding_speed_tier >= 1 && cparams_.responsive &&
    357       cparams_.ModularPartIsLossless()) {
    358     cparams_.options.tree_kind =
    359         ModularOptions::TreeKind::kTrivialTreeNoPredictor;
    360     cparams_.options.nb_repeats = 0;
    361   }
    362   stream_images_.resize(num_streams);
    363 
    364   // use a sensible default if nothing explicit is specified:
    365   // Squeeze for lossy, no squeeze for lossless
    366   if (cparams_.responsive < 0) {
    367     if (cparams_.ModularPartIsLossless()) {
    368       cparams_.responsive = 0;
    369     } else {
    370       cparams_.responsive = 1;
    371     }
    372   }
    373 
    374   cparams_.options.splitting_heuristics_node_threshold =
    375       82 + 14 * static_cast<int>(cparams_.speed_tier);
    376 
    377   {
    378     // Set properties.
    379     std::vector<uint32_t> prop_order;
    380     if (cparams_.responsive) {
    381       // Properties in order of their likelihood of being useful for Squeeze
    382       // residuals.
    383       prop_order = {0, 1, 4, 5, 6, 7, 8, 15, 9, 10, 11, 12, 13, 14, 2, 3};
    384     } else {
    385       // Same, but for the non-Squeeze case.
    386       prop_order = {0, 1, 15, 9, 10, 11, 12, 13, 14, 2, 3, 4, 5, 6, 7, 8};
    387       // if few groups, don't use group as a property
    388       if (num_streams < 30 && cparams_.speed_tier > SpeedTier::kTortoise &&
    389           cparams_orig.ModularPartIsLossless()) {
    390         prop_order.erase(prop_order.begin() + 1);
    391       }
    392     }
    393     int max_properties = std::min<int>(
    394         cparams_.options.max_properties,
    395         static_cast<int>(
    396             frame_header.nonserialized_metadata->m.num_extra_channels) +
    397             (frame_header.encoding == FrameEncoding::kModular ? 2 : -1));
    398     switch (cparams_.speed_tier) {
    399       case SpeedTier::kHare:
    400         cparams_.options.splitting_heuristics_properties.assign(
    401             prop_order.begin(), prop_order.begin() + 4);
    402         cparams_.options.max_property_values = 24;
    403         break;
    404       case SpeedTier::kWombat:
    405         cparams_.options.splitting_heuristics_properties.assign(
    406             prop_order.begin(), prop_order.begin() + 5);
    407         cparams_.options.max_property_values = 32;
    408         break;
    409       case SpeedTier::kSquirrel:
    410         cparams_.options.splitting_heuristics_properties.assign(
    411             prop_order.begin(), prop_order.begin() + 7);
    412         cparams_.options.max_property_values = 48;
    413         break;
    414       case SpeedTier::kKitten:
    415         cparams_.options.splitting_heuristics_properties.assign(
    416             prop_order.begin(), prop_order.begin() + 10);
    417         cparams_.options.max_property_values = 96;
    418         break;
    419       case SpeedTier::kGlacier:
    420       case SpeedTier::kTortoise:
    421         cparams_.options.splitting_heuristics_properties = prop_order;
    422         cparams_.options.max_property_values = 256;
    423         break;
    424       default:
    425         cparams_.options.splitting_heuristics_properties.assign(
    426             prop_order.begin(), prop_order.begin() + 3);
    427         cparams_.options.max_property_values = 16;
    428         break;
    429     }
    430     if (cparams_.speed_tier > SpeedTier::kTortoise) {
    431       // Gradient in previous channels.
    432       for (int i = 0; i < max_properties; i++) {
    433         cparams_.options.splitting_heuristics_properties.push_back(
    434             kNumNonrefProperties + i * 4 + 3);
    435       }
    436     } else {
    437       // All the extra properties in Tortoise mode.
    438       for (int i = 0; i < max_properties * 4; i++) {
    439         cparams_.options.splitting_heuristics_properties.push_back(
    440             kNumNonrefProperties + i);
    441       }
    442     }
    443   }
    444 
    445   if ((cparams_.options.predictor == Predictor::Average0 ||
    446        cparams_.options.predictor == Predictor::Average1 ||
    447        cparams_.options.predictor == Predictor::Average2 ||
    448        cparams_.options.predictor == Predictor::Average3 ||
    449        cparams_.options.predictor == Predictor::Average4 ||
    450        cparams_.options.predictor == Predictor::Weighted) &&
    451       !cparams_.ModularPartIsLossless()) {
    452     // Lossy + Average/Weighted predictors does not work, so switch to default
    453     // predictors.
    454     cparams_.options.predictor = kUndefinedPredictor;
    455   }
    456 
    457   if (cparams_.options.predictor == kUndefinedPredictor) {
    458     // no explicit predictor(s) given, set a good default
    459     if ((cparams_.speed_tier <= SpeedTier::kGlacier ||
    460          cparams_.modular_mode == false) &&
    461         cparams_.IsLossless() && cparams_.responsive == JXL_FALSE) {
    462       // TODO(veluca): allow all predictors that don't break residual
    463       // multipliers in lossy mode.
    464       cparams_.options.predictor = Predictor::Variable;
    465     } else if (cparams_.responsive || cparams_.lossy_palette) {
    466       // zero predictor for Squeeze residues and lossy palette
    467       cparams_.options.predictor = Predictor::Zero;
    468     } else if (!cparams_.IsLossless()) {
    469       // If not responsive and lossy. TODO(veluca): use near_lossless instead?
    470       cparams_.options.predictor = Predictor::Gradient;
    471     } else if (cparams_.speed_tier < SpeedTier::kFalcon) {
    472       // try median and weighted predictor for anything else
    473       cparams_.options.predictor = Predictor::Best;
    474     } else if (cparams_.speed_tier == SpeedTier::kFalcon) {
    475       // just weighted predictor in falcon mode
    476       cparams_.options.predictor = Predictor::Weighted;
    477     } else if (cparams_.speed_tier > SpeedTier::kFalcon) {
    478       // just gradient predictor in thunder mode
    479       cparams_.options.predictor = Predictor::Gradient;
    480     }
    481   } else {
    482     delta_pred_ = cparams_.options.predictor;
    483     if (cparams_.lossy_palette) cparams_.options.predictor = Predictor::Zero;
    484   }
    485   if (!cparams_.ModularPartIsLossless()) {
    486     if (cparams_.options.predictor == Predictor::Weighted ||
    487         cparams_.options.predictor == Predictor::Variable ||
    488         cparams_.options.predictor == Predictor::Best)
    489       cparams_.options.predictor = Predictor::Zero;
    490   }
    491   tree_splits_.push_back(0);
    492   if (cparams_.modular_mode == false) {
    493     cparams_.options.fast_decode_multiplier = 1.0f;
    494     tree_splits_.push_back(ModularStreamId::VarDCTDC(0).ID(frame_dim_));
    495     tree_splits_.push_back(ModularStreamId::ModularDC(0).ID(frame_dim_));
    496     tree_splits_.push_back(ModularStreamId::ACMetadata(0).ID(frame_dim_));
    497     tree_splits_.push_back(ModularStreamId::QuantTable(0).ID(frame_dim_));
    498     tree_splits_.push_back(ModularStreamId::ModularAC(0, 0).ID(frame_dim_));
    499     ac_metadata_size.resize(frame_dim_.num_dc_groups);
    500     extra_dc_precision.resize(frame_dim_.num_dc_groups);
    501   }
    502   tree_splits_.push_back(num_streams);
    503   cparams_.options.max_chan_size = frame_dim_.group_dim;
    504   cparams_.options.group_dim = frame_dim_.group_dim;
    505 
    506   // TODO(veluca): figure out how to use different predictor sets per channel.
    507   stream_options_.resize(num_streams, cparams_.options);
    508 
    509   stream_options_[0] = cparams_.options;
    510   if (cparams_.speed_tier == SpeedTier::kFalcon) {
    511     stream_options_[0].tree_kind = ModularOptions::TreeKind::kWPFixedDC;
    512   } else if (cparams_.speed_tier == SpeedTier::kThunder) {
    513     stream_options_[0].tree_kind = ModularOptions::TreeKind::kGradientFixedDC;
    514   }
    515   stream_options_[0].histogram_params =
    516       HistogramParams::ForModular(cparams_, {}, streaming_mode);
    517 }
    518 
    519 Status ModularFrameEncoder::ComputeEncodingData(
    520     const FrameHeader& frame_header, const ImageMetadata& metadata,
    521     Image3F* JXL_RESTRICT color, const std::vector<ImageF>& extra_channels,
    522     const Rect& group_rect, const FrameDimensions& patch_dim,
    523     const Rect& frame_area_rect, PassesEncoderState* JXL_RESTRICT enc_state,
    524     const JxlCmsInterface& cms, ThreadPool* pool, AuxOut* aux_out,
    525     bool do_color) {
    526   JXL_DEBUG_V(6, "Computing modular encoding data for frame %s",
    527               frame_header.DebugString().c_str());
    528 
    529   bool groupwise = enc_state->streaming_mode;
    530 
    531   if (do_color && frame_header.loop_filter.gab && !groupwise) {
    532     float w = 0.9908511000000001f;
    533     float weights[3] = {w, w, w};
    534     JXL_RETURN_IF_ERROR(GaborishInverse(color, Rect(*color), weights, pool));
    535   }
    536 
    537   if (do_color && metadata.bit_depth.bits_per_sample <= 16 &&
    538       cparams_.speed_tier < SpeedTier::kCheetah &&
    539       cparams_.decoding_speed_tier < 2 && !groupwise) {
    540     JXL_RETURN_IF_ERROR(FindBestPatchDictionary(
    541         *color, enc_state, cms, nullptr, aux_out,
    542         cparams_.color_transform == ColorTransform::kXYB));
    543     PatchDictionaryEncoder::SubtractFrom(
    544         enc_state->shared.image_features.patches, color);
    545   }
    546 
    547   if (cparams_.custom_splines.HasAny()) {
    548     PassesSharedState& shared = enc_state->shared;
    549     ImageFeatures& image_features = shared.image_features;
    550     image_features.splines = cparams_.custom_splines;
    551   }
    552 
    553   // Convert ImageBundle to modular Image object
    554   const size_t xsize = patch_dim.xsize;
    555   const size_t ysize = patch_dim.ysize;
    556 
    557   int nb_chans = 3;
    558   if (metadata.color_encoding.IsGray() &&
    559       cparams_.color_transform == ColorTransform::kNone) {
    560     nb_chans = 1;
    561   }
    562   if (!do_color) nb_chans = 0;
    563 
    564   nb_chans += extra_channels.size();
    565 
    566   bool fp = metadata.bit_depth.floating_point_sample &&
    567             cparams_.color_transform != ColorTransform::kXYB;
    568 
    569   // bits_per_sample is just metadata for XYB images.
    570   if (metadata.bit_depth.bits_per_sample >= 32 && do_color &&
    571       cparams_.color_transform != ColorTransform::kXYB) {
    572     if (metadata.bit_depth.bits_per_sample == 32 && fp == false) {
    573       return JXL_FAILURE("uint32_t not supported in enc_modular");
    574     } else if (metadata.bit_depth.bits_per_sample > 32) {
    575       return JXL_FAILURE("bits_per_sample > 32 not supported");
    576     }
    577   }
    578 
    579   // in the non-float case, there is an implicit 0 sign bit
    580   int max_bitdepth =
    581       do_color ? metadata.bit_depth.bits_per_sample + (fp ? 0 : 1) : 0;
    582   Image& gi = stream_images_[0];
    583   JXL_ASSIGN_OR_RETURN(
    584       gi, Image::Create(xsize, ysize, metadata.bit_depth.bits_per_sample,
    585                         nb_chans));
    586   int c = 0;
    587   if (cparams_.color_transform == ColorTransform::kXYB &&
    588       cparams_.modular_mode == true) {
    589     float enc_factors[3] = {32768.0f, 2048.0f, 2048.0f};
    590     if (cparams_.butteraugli_distance > 0 && !cparams_.responsive) {
    591       // quantize XYB here and then treat it as a lossless image
    592       enc_factors[0] *= 1.f / (1.f + 23.f * cparams_.butteraugli_distance);
    593       enc_factors[1] *= 1.f / (1.f + 14.f * cparams_.butteraugli_distance);
    594       enc_factors[2] *= 1.f / (1.f + 14.f * cparams_.butteraugli_distance);
    595       cparams_.butteraugli_distance = 0;
    596     }
    597     if (cparams_.manual_xyb_factors.size() == 3) {
    598       DequantMatricesSetCustomDC(&enc_state->shared.matrices,
    599                                  cparams_.manual_xyb_factors.data());
    600       // TODO(jon): update max_bitdepth in this case
    601     } else {
    602       DequantMatricesSetCustomDC(&enc_state->shared.matrices, enc_factors);
    603       max_bitdepth = 12;
    604     }
    605   }
    606   pixel_type maxval = gi.bitdepth < 32 ? (1u << gi.bitdepth) - 1 : 0;
    607   if (do_color) {
    608     for (; c < 3; c++) {
    609       if (metadata.color_encoding.IsGray() &&
    610           cparams_.color_transform == ColorTransform::kNone &&
    611           c != (cparams_.color_transform == ColorTransform::kXYB ? 1 : 0))
    612         continue;
    613       int c_out = c;
    614       // XYB is encoded as YX(B-Y)
    615       if (cparams_.color_transform == ColorTransform::kXYB && c < 2)
    616         c_out = 1 - c_out;
    617       double factor = maxval;
    618       if (cparams_.color_transform == ColorTransform::kXYB)
    619         factor = enc_state->shared.matrices.InvDCQuant(c);
    620       if (c == 2 && cparams_.color_transform == ColorTransform::kXYB) {
    621         JXL_ASSERT(!fp);
    622         for (size_t y = 0; y < ysize; ++y) {
    623           const float* const JXL_RESTRICT row_in = color->PlaneRow(c, y);
    624           pixel_type* const JXL_RESTRICT row_out = gi.channel[c_out].Row(y);
    625           pixel_type* const JXL_RESTRICT row_Y = gi.channel[0].Row(y);
    626           for (size_t x = 0; x < xsize; ++x) {
    627             row_out[x] = row_in[x] * factor + 0.5f;
    628             row_out[x] -= row_Y[x];
    629             // zero the lsb of B
    630             row_out[x] = row_out[x] / 2 * 2;
    631           }
    632         }
    633       } else {
    634         int bits = metadata.bit_depth.bits_per_sample;
    635         int exp_bits = metadata.bit_depth.exponent_bits_per_sample;
    636         gi.channel[c_out].hshift = frame_header.chroma_subsampling.HShift(c);
    637         gi.channel[c_out].vshift = frame_header.chroma_subsampling.VShift(c);
    638         size_t xsize_shifted = DivCeil(xsize, 1 << gi.channel[c_out].hshift);
    639         size_t ysize_shifted = DivCeil(ysize, 1 << gi.channel[c_out].vshift);
    640         JXL_RETURN_IF_ERROR(
    641             gi.channel[c_out].shrink(xsize_shifted, ysize_shifted));
    642         std::atomic<bool> has_error{false};
    643         JXL_RETURN_IF_ERROR(RunOnPool(
    644             pool, 0, ysize_shifted, ThreadPool::NoInit,
    645             [&](const int task, const int thread) {
    646               if (has_error) return;
    647               const size_t y = task;
    648               const float* const JXL_RESTRICT row_in =
    649                   color->PlaneRow(c, y + group_rect.y0()) + group_rect.x0();
    650               pixel_type* const JXL_RESTRICT row_out = gi.channel[c_out].Row(y);
    651               if (!float_to_int(row_in, row_out, xsize_shifted, bits, exp_bits,
    652                                 fp, factor)) {
    653                 has_error = true;
    654                 return;
    655               };
    656             },
    657             "float2int"));
    658         if (has_error) {
    659           return JXL_FAILURE("Error in float to integer conversion");
    660         }
    661       }
    662     }
    663     if (metadata.color_encoding.IsGray() &&
    664         cparams_.color_transform == ColorTransform::kNone)
    665       c = 1;
    666   }
    667 
    668   for (size_t ec = 0; ec < extra_channels.size(); ec++, c++) {
    669     const ExtraChannelInfo& eci = metadata.extra_channel_info[ec];
    670     size_t ecups = frame_header.extra_channel_upsampling[ec];
    671     JXL_RETURN_IF_ERROR(
    672         gi.channel[c].shrink(DivCeil(patch_dim.xsize_upsampled, ecups),
    673                              DivCeil(patch_dim.ysize_upsampled, ecups)));
    674     gi.channel[c].hshift = gi.channel[c].vshift =
    675         CeilLog2Nonzero(ecups) - CeilLog2Nonzero(frame_header.upsampling);
    676 
    677     int bits = eci.bit_depth.bits_per_sample;
    678     int exp_bits = eci.bit_depth.exponent_bits_per_sample;
    679     bool fp = eci.bit_depth.floating_point_sample;
    680     double factor = (fp ? 1 : ((1u << eci.bit_depth.bits_per_sample) - 1));
    681     if (bits + (fp ? 0 : 1) > max_bitdepth) max_bitdepth = bits + (fp ? 0 : 1);
    682     std::atomic<bool> has_error{false};
    683     JXL_RETURN_IF_ERROR(RunOnPool(
    684         pool, 0, gi.channel[c].plane.ysize(), ThreadPool::NoInit,
    685         [&](const int task, const int thread) {
    686           if (has_error) return;
    687           const size_t y = task;
    688           const float* const JXL_RESTRICT row_in =
    689               extra_channels[ec].Row(y + group_rect.y0()) + group_rect.x0();
    690           pixel_type* const JXL_RESTRICT row_out = gi.channel[c].Row(y);
    691           if (!float_to_int(row_in, row_out, gi.channel[c].plane.xsize(), bits,
    692                             exp_bits, fp, factor)) {
    693             has_error = true;
    694             return;
    695           };
    696         },
    697         "float2int"));
    698     if (has_error) return JXL_FAILURE("Error in float to integer conversion");
    699   }
    700   JXL_ASSERT(c == nb_chans);
    701 
    702   int level_max_bitdepth = (cparams_.level == 5 ? 16 : 32);
    703   if (max_bitdepth > level_max_bitdepth) {
    704     return JXL_FAILURE(
    705         "Bitdepth too high for level %i (need %i bits, have only %i in this "
    706         "level)",
    707         cparams_.level, max_bitdepth, level_max_bitdepth);
    708   }
    709 
    710   // Set options and apply transformations
    711   if (!cparams_.ModularPartIsLossless()) {
    712     if (cparams_.palette_colors != 0) {
    713       JXL_DEBUG_V(3, "Lossy encode, not doing palette transforms");
    714     }
    715     if (cparams_.color_transform == ColorTransform::kXYB) {
    716       cparams_.channel_colors_pre_transform_percent = 0;
    717     }
    718     cparams_.channel_colors_percent = 0;
    719     cparams_.palette_colors = 0;
    720     cparams_.lossy_palette = false;
    721   }
    722 
    723   // Global palette
    724   if ((cparams_.palette_colors != 0 || cparams_.lossy_palette) && !groupwise) {
    725     // all-channel palette (e.g. RGBA)
    726     if (gi.channel.size() - gi.nb_meta_channels > 1) {
    727       Transform maybe_palette(TransformId::kPalette);
    728       maybe_palette.begin_c = gi.nb_meta_channels;
    729       maybe_palette.num_c = gi.channel.size() - gi.nb_meta_channels;
    730       maybe_palette.nb_colors = std::min(static_cast<int>(xsize * ysize / 2),
    731                                          std::abs(cparams_.palette_colors));
    732       maybe_palette.ordered_palette = cparams_.palette_colors >= 0;
    733       maybe_palette.lossy_palette =
    734           (cparams_.lossy_palette && maybe_palette.num_c == 3);
    735       if (maybe_palette.lossy_palette) {
    736         maybe_palette.predictor = delta_pred_;
    737       }
    738       // TODO(veluca): use a custom weighted header if using the weighted
    739       // predictor.
    740       maybe_do_transform(gi, maybe_palette, cparams_, weighted::Header(), pool,
    741                          cparams_.options.zero_tokens);
    742     }
    743     // all-minus-one-channel palette (RGB with separate alpha, or CMY with
    744     // separate K)
    745     if (gi.channel.size() - gi.nb_meta_channels > 3) {
    746       Transform maybe_palette_3(TransformId::kPalette);
    747       maybe_palette_3.begin_c = gi.nb_meta_channels;
    748       maybe_palette_3.num_c = gi.channel.size() - gi.nb_meta_channels - 1;
    749       maybe_palette_3.nb_colors = std::min(static_cast<int>(xsize * ysize / 3),
    750                                            std::abs(cparams_.palette_colors));
    751       maybe_palette_3.ordered_palette = cparams_.palette_colors >= 0;
    752       maybe_palette_3.lossy_palette = cparams_.lossy_palette;
    753       if (maybe_palette_3.lossy_palette) {
    754         maybe_palette_3.predictor = delta_pred_;
    755       }
    756       maybe_do_transform(gi, maybe_palette_3, cparams_, weighted::Header(),
    757                          pool, cparams_.options.zero_tokens);
    758     }
    759   }
    760 
    761   // Global channel palette
    762   if (!groupwise && cparams_.channel_colors_pre_transform_percent > 0 &&
    763       !cparams_.lossy_palette &&
    764       (cparams_.speed_tier <= SpeedTier::kThunder ||
    765        (do_color && metadata.bit_depth.bits_per_sample > 8))) {
    766     // single channel palette (like FLIF's ChannelCompact)
    767     size_t nb_channels = gi.channel.size() - gi.nb_meta_channels;
    768     int orig_bitdepth = max_bitdepth;
    769     max_bitdepth = 0;
    770     for (size_t i = 0; i < nb_channels; i++) {
    771       int32_t min;
    772       int32_t max;
    773       compute_minmax(gi.channel[gi.nb_meta_channels + i], &min, &max);
    774       int64_t colors = static_cast<int64_t>(max) - min + 1;
    775       JXL_DEBUG_V(10, "Channel %" PRIuS ": range=%i..%i", i, min, max);
    776       Transform maybe_palette_1(TransformId::kPalette);
    777       maybe_palette_1.begin_c = i + gi.nb_meta_channels;
    778       maybe_palette_1.num_c = 1;
    779       // simple heuristic: if less than X percent of the values in the range
    780       // actually occur, it is probably worth it to do a compaction
    781       // (but only if the channel palette is less than 6% the size of the
    782       // image itself)
    783       maybe_palette_1.nb_colors = std::min(
    784           static_cast<int>(xsize * ysize / 16),
    785           static_cast<int>(cparams_.channel_colors_pre_transform_percent /
    786                            100. * colors));
    787       if (maybe_do_transform(gi, maybe_palette_1, cparams_, weighted::Header(),
    788                              pool)) {
    789         // effective bit depth is lower, adjust quantization accordingly
    790         compute_minmax(gi.channel[gi.nb_meta_channels + i], &min, &max);
    791         if (max < maxval) maxval = max;
    792         int ch_bitdepth =
    793             (max > 0 ? CeilLog2Nonzero(static_cast<uint32_t>(max)) : 0);
    794         if (ch_bitdepth > max_bitdepth) max_bitdepth = ch_bitdepth;
    795       } else
    796         max_bitdepth = orig_bitdepth;
    797     }
    798   }
    799 
    800   // don't do an RCT if we're short on bits
    801   if (cparams_.color_transform == ColorTransform::kNone && do_color &&
    802       gi.channel.size() - gi.nb_meta_channels >= 3 &&
    803       max_bitdepth + 1 < level_max_bitdepth) {
    804     if (cparams_.colorspace < 0 && (!cparams_.ModularPartIsLossless() ||
    805                                     cparams_.speed_tier > SpeedTier::kHare)) {
    806       Transform ycocg{TransformId::kRCT};
    807       ycocg.rct_type = 6;
    808       ycocg.begin_c = gi.nb_meta_channels;
    809       do_transform(gi, ycocg, weighted::Header(), pool);
    810       max_bitdepth++;
    811     } else if (cparams_.colorspace > 0) {
    812       Transform sg(TransformId::kRCT);
    813       sg.begin_c = gi.nb_meta_channels;
    814       sg.rct_type = cparams_.colorspace;
    815       do_transform(gi, sg, weighted::Header(), pool);
    816       max_bitdepth++;
    817     }
    818   }
    819 
    820   if (cparams_.move_to_front_from_channel > 0) {
    821     for (size_t tgt = 0;
    822          tgt + cparams_.move_to_front_from_channel < gi.channel.size(); tgt++) {
    823       size_t pos = cparams_.move_to_front_from_channel;
    824       while (pos > 0) {
    825         Transform move(TransformId::kRCT);
    826         if (pos == 1) {
    827           move.begin_c = tgt;
    828           move.rct_type = 28;  // RGB -> GRB
    829           pos -= 1;
    830         } else {
    831           move.begin_c = tgt + pos - 2;
    832           move.rct_type = 14;  // RGB -> BRG
    833           pos -= 2;
    834         }
    835         do_transform(gi, move, weighted::Header(), pool);
    836       }
    837     }
    838   }
    839 
    840   // don't do squeeze if we don't have some spare bits
    841   if (!groupwise && cparams_.responsive && !gi.channel.empty() &&
    842       max_bitdepth + 2 < level_max_bitdepth) {
    843     Transform t(TransformId::kSqueeze);
    844     do_transform(gi, t, weighted::Header(), pool);
    845     max_bitdepth += 2;
    846   }
    847 
    848   if (max_bitdepth + 1 > level_max_bitdepth) {
    849     // force no group RCTs if we don't have a spare bit
    850     cparams_.colorspace = 0;
    851   }
    852   JXL_ASSERT(max_bitdepth <= level_max_bitdepth);
    853 
    854   if (!cparams_.ModularPartIsLossless()) {
    855     quants_.resize(gi.channel.size(), 1);
    856     float quantizer = 0.25f;
    857     if (!cparams_.responsive) {
    858       JXL_DEBUG_V(1,
    859                   "Warning: lossy compression without Squeeze "
    860                   "transform is just color quantization.");
    861       quantizer *= 0.1f;
    862     }
    863     float bitdepth_correction = 1.f;
    864     if (cparams_.color_transform != ColorTransform::kXYB) {
    865       bitdepth_correction = maxval / 255.f;
    866     }
    867     std::vector<float> quantizers;
    868     for (size_t i = 0; i < 3; i++) {
    869       float dist = cparams_.butteraugli_distance;
    870       quantizers.push_back(quantizer * dist * bitdepth_correction);
    871     }
    872     for (size_t i = 0; i < extra_channels.size(); i++) {
    873       int ec_bitdepth =
    874           metadata.extra_channel_info[i].bit_depth.bits_per_sample;
    875       pixel_type ec_maxval = ec_bitdepth < 32 ? (1u << ec_bitdepth) - 1 : 0;
    876       bitdepth_correction = ec_maxval / 255.f;
    877       float dist = 0;
    878       if (i < cparams_.ec_distance.size()) dist = cparams_.ec_distance[i];
    879       if (dist < 0) dist = cparams_.butteraugli_distance;
    880       quantizers.push_back(quantizer * dist * bitdepth_correction);
    881     }
    882     if (cparams_.options.nb_repeats == 0) {
    883       return JXL_FAILURE("nb_repeats = 0 not supported with modular lossy!");
    884     }
    885     for (uint32_t i = gi.nb_meta_channels; i < gi.channel.size(); i++) {
    886       Channel& ch = gi.channel[i];
    887       int shift = ch.hshift + ch.vshift;  // number of pixel halvings
    888       if (shift > 16) shift = 16;
    889       if (shift > 0) shift--;
    890       int q;
    891       // assuming default Squeeze here
    892       int component =
    893           (do_color ? 0 : 3) + ((i - gi.nb_meta_channels) % nb_chans);
    894       // last 4 channels are final chroma residuals
    895       if (nb_chans > 2 && i >= gi.channel.size() - 4 && cparams_.responsive) {
    896         component = 1;
    897       }
    898       if (cparams_.color_transform == ColorTransform::kXYB && component < 3) {
    899         q = quantizers[component] * squeeze_quality_factor_xyb *
    900             squeeze_xyb_qtable[component][shift];
    901       } else {
    902         if (cparams_.colorspace != 0 && component > 0 && component < 3) {
    903           q = quantizers[component] * squeeze_quality_factor *
    904               squeeze_chroma_qtable[shift];
    905         } else {
    906           q = quantizers[component] * squeeze_quality_factor *
    907               squeeze_luma_factor * squeeze_luma_qtable[shift];
    908         }
    909       }
    910       if (q < 1) q = 1;
    911       QuantizeChannel(gi.channel[i], q);
    912       quants_[i] = q;
    913     }
    914   }
    915 
    916   // Fill other groups.
    917   // DC
    918   for (size_t group_id = 0; group_id < patch_dim.num_dc_groups; group_id++) {
    919     const size_t rgx = group_id % patch_dim.xsize_dc_groups;
    920     const size_t rgy = group_id / patch_dim.xsize_dc_groups;
    921     const Rect rect(rgx * patch_dim.dc_group_dim, rgy * patch_dim.dc_group_dim,
    922                     patch_dim.dc_group_dim, patch_dim.dc_group_dim);
    923     size_t gx = rgx + frame_area_rect.x0() / 2048;
    924     size_t gy = rgy + frame_area_rect.y0() / 2048;
    925     size_t real_group_id = gy * frame_dim_.xsize_dc_groups + gx;
    926     // minShift==3 because (frame_dim.dc_group_dim >> 3) == frame_dim.group_dim
    927     // maxShift==1000 is infinity
    928     stream_params_.push_back(
    929         GroupParams{rect, 3, 1000, ModularStreamId::ModularDC(real_group_id)});
    930   }
    931   // AC global -> nothing.
    932   // AC
    933   for (size_t group_id = 0; group_id < patch_dim.num_groups; group_id++) {
    934     const size_t rgx = group_id % patch_dim.xsize_groups;
    935     const size_t rgy = group_id / patch_dim.xsize_groups;
    936     const Rect mrect(rgx * patch_dim.group_dim, rgy * patch_dim.group_dim,
    937                      patch_dim.group_dim, patch_dim.group_dim);
    938     size_t gx = rgx + frame_area_rect.x0() / (frame_dim_.group_dim);
    939     size_t gy = rgy + frame_area_rect.y0() / (frame_dim_.group_dim);
    940     size_t real_group_id = gy * frame_dim_.xsize_groups + gx;
    941     for (size_t i = 0; i < enc_state->progressive_splitter.GetNumPasses();
    942          i++) {
    943       int maxShift;
    944       int minShift;
    945       frame_header.passes.GetDownsamplingBracket(i, minShift, maxShift);
    946       stream_params_.push_back(
    947           GroupParams{mrect, minShift, maxShift,
    948                       ModularStreamId::ModularAC(real_group_id, i)});
    949     }
    950   }
    951   // if there's only one group, everything ends up in GlobalModular
    952   // in that case, also try RCTs/WP params for the one group
    953   if (stream_params_.size() == 2) {
    954     stream_params_.push_back(GroupParams{Rect(0, 0, xsize, ysize), 0, 1000,
    955                                          ModularStreamId::Global()});
    956   }
    957   gi_channel_.resize(stream_images_.size());
    958 
    959   JXL_RETURN_IF_ERROR(RunOnPool(
    960       pool, 0, stream_params_.size(), ThreadPool::NoInit,
    961       [&](const uint32_t i, size_t /* thread */) {
    962         size_t stream = stream_params_[i].id.ID(frame_dim_);
    963         stream_options_[stream] = stream_options_[0];
    964         JXL_CHECK(PrepareStreamParams(
    965             stream_params_[i].rect, cparams_, stream_params_[i].minShift,
    966             stream_params_[i].maxShift, stream_params_[i].id, do_color,
    967             groupwise));
    968       },
    969       "ChooseParams"));
    970   {
    971     // Clear out channels that have been copied to groups.
    972     Image& full_image = stream_images_[0];
    973     size_t c = full_image.nb_meta_channels;
    974     for (; c < full_image.channel.size(); c++) {
    975       Channel& fc = full_image.channel[c];
    976       if (fc.w > frame_dim_.group_dim || fc.h > frame_dim_.group_dim) break;
    977     }
    978     for (; c < full_image.channel.size(); c++) {
    979       full_image.channel[c].plane = ImageI();
    980     }
    981   }
    982 
    983   JXL_RETURN_IF_ERROR(ValidateChannelDimensions(gi, stream_options_[0]));
    984   return true;
    985 }
    986 
    987 Status ModularFrameEncoder::ComputeTree(ThreadPool* pool) {
    988   std::vector<ModularMultiplierInfo> multiplier_info;
    989   if (!quants_.empty()) {
    990     for (uint32_t stream_id = 0; stream_id < stream_images_.size();
    991          stream_id++) {
    992       // skip non-modular stream_ids
    993       if (stream_id > 0 && gi_channel_[stream_id].empty()) continue;
    994       const Image& image = stream_images_[stream_id];
    995       const ModularOptions& options = stream_options_[stream_id];
    996       for (uint32_t i = image.nb_meta_channels; i < image.channel.size(); i++) {
    997         if (i >= image.nb_meta_channels &&
    998             (image.channel[i].w > options.max_chan_size ||
    999              image.channel[i].h > options.max_chan_size)) {
   1000           continue;
   1001         }
   1002         if (stream_id > 0 && gi_channel_[stream_id].empty()) continue;
   1003         size_t ch_id = stream_id == 0
   1004                            ? i
   1005                            : gi_channel_[stream_id][i - image.nb_meta_channels];
   1006         uint32_t q = quants_[ch_id];
   1007         // Inform the tree splitting heuristics that each channel in each group
   1008         // used this quantization factor. This will produce a tree with the
   1009         // given multipliers.
   1010         if (multiplier_info.empty() ||
   1011             multiplier_info.back().range[1][0] != stream_id ||
   1012             multiplier_info.back().multiplier != q) {
   1013           StaticPropRange range;
   1014           range[0] = {{i, i + 1}};
   1015           range[1] = {{stream_id, stream_id + 1}};
   1016           multiplier_info.push_back({range, static_cast<uint32_t>(q)});
   1017         } else {
   1018           // Previous channel in the same group had the same quantization
   1019           // factor. Don't provide two different ranges, as that creates
   1020           // unnecessary nodes.
   1021           multiplier_info.back().range[0][1] = i + 1;
   1022         }
   1023       }
   1024     }
   1025     // Merge group+channel settings that have the same channels and quantization
   1026     // factors, to avoid unnecessary nodes.
   1027     std::sort(multiplier_info.begin(), multiplier_info.end(),
   1028               [](ModularMultiplierInfo a, ModularMultiplierInfo b) {
   1029                 return std::make_tuple(a.range, a.multiplier) <
   1030                        std::make_tuple(b.range, b.multiplier);
   1031               });
   1032     size_t new_num = 1;
   1033     for (size_t i = 1; i < multiplier_info.size(); i++) {
   1034       ModularMultiplierInfo& prev = multiplier_info[new_num - 1];
   1035       ModularMultiplierInfo& cur = multiplier_info[i];
   1036       if (prev.range[0] == cur.range[0] && prev.multiplier == cur.multiplier &&
   1037           prev.range[1][1] == cur.range[1][0]) {
   1038         prev.range[1][1] = cur.range[1][1];
   1039       } else {
   1040         multiplier_info[new_num++] = multiplier_info[i];
   1041       }
   1042     }
   1043     multiplier_info.resize(new_num);
   1044   }
   1045 
   1046   if (!cparams_.custom_fixed_tree.empty()) {
   1047     tree_ = cparams_.custom_fixed_tree;
   1048   } else if (cparams_.speed_tier < SpeedTier::kFalcon ||
   1049              !cparams_.modular_mode) {
   1050     // Avoid creating a tree with leaves that don't correspond to any pixels.
   1051     std::vector<size_t> useful_splits;
   1052     useful_splits.reserve(tree_splits_.size());
   1053     for (size_t chunk = 0; chunk < tree_splits_.size() - 1; chunk++) {
   1054       bool has_pixels = false;
   1055       size_t start = tree_splits_[chunk];
   1056       size_t stop = tree_splits_[chunk + 1];
   1057       for (size_t i = start; i < stop; i++) {
   1058         if (!stream_images_[i].empty()) has_pixels = true;
   1059       }
   1060       if (has_pixels) {
   1061         useful_splits.push_back(tree_splits_[chunk]);
   1062       }
   1063     }
   1064     // Don't do anything if modular mode does not have any pixels in this image
   1065     if (useful_splits.empty()) return true;
   1066     useful_splits.push_back(tree_splits_.back());
   1067 
   1068     std::atomic_flag invalid_force_wp = ATOMIC_FLAG_INIT;
   1069 
   1070     std::vector<Tree> trees(useful_splits.size() - 1);
   1071     JXL_RETURN_IF_ERROR(RunOnPool(
   1072         pool, 0, useful_splits.size() - 1, ThreadPool::NoInit,
   1073         [&](const uint32_t chunk, size_t /* thread */) {
   1074           // TODO(veluca): parallelize more.
   1075           size_t total_pixels = 0;
   1076           uint32_t start = useful_splits[chunk];
   1077           uint32_t stop = useful_splits[chunk + 1];
   1078           while (start < stop && stream_images_[start].empty()) ++start;
   1079           while (start < stop && stream_images_[stop - 1].empty()) --stop;
   1080           if (stream_options_[start].tree_kind !=
   1081               ModularOptions::TreeKind::kLearn) {
   1082             for (size_t i = start; i < stop; i++) {
   1083               for (const Channel& ch : stream_images_[i].channel) {
   1084                 total_pixels += ch.w * ch.h;
   1085               }
   1086             }
   1087             trees[chunk] =
   1088                 PredefinedTree(stream_options_[start].tree_kind, total_pixels);
   1089             return;
   1090           }
   1091           TreeSamples tree_samples;
   1092           if (!tree_samples.SetPredictor(stream_options_[start].predictor,
   1093                                          stream_options_[start].wp_tree_mode)) {
   1094             invalid_force_wp.test_and_set(std::memory_order_acq_rel);
   1095             return;
   1096           }
   1097           if (!tree_samples.SetProperties(
   1098                   stream_options_[start].splitting_heuristics_properties,
   1099                   stream_options_[start].wp_tree_mode)) {
   1100             invalid_force_wp.test_and_set(std::memory_order_acq_rel);
   1101             return;
   1102           }
   1103           uint32_t max_c = 0;
   1104           std::vector<pixel_type> pixel_samples;
   1105           std::vector<pixel_type> diff_samples;
   1106           std::vector<uint32_t> group_pixel_count;
   1107           std::vector<uint32_t> channel_pixel_count;
   1108           for (size_t i = start; i < stop; i++) {
   1109             max_c = std::max<uint32_t>(stream_images_[i].channel.size(), max_c);
   1110             CollectPixelSamples(stream_images_[i], stream_options_[i], i,
   1111                                 group_pixel_count, channel_pixel_count,
   1112                                 pixel_samples, diff_samples);
   1113           }
   1114           StaticPropRange range;
   1115           range[0] = {{0, max_c}};
   1116           range[1] = {{start, stop}};
   1117 
   1118           tree_samples.PreQuantizeProperties(
   1119               range, multiplier_info, group_pixel_count, channel_pixel_count,
   1120               pixel_samples, diff_samples,
   1121               stream_options_[start].max_property_values);
   1122           for (size_t i = start; i < stop; i++) {
   1123             JXL_CHECK(ModularGenericCompress(
   1124                 stream_images_[i], stream_options_[i], /*writer=*/nullptr,
   1125                 /*aux_out=*/nullptr, 0, i, &tree_samples, &total_pixels));
   1126           }
   1127 
   1128           // TODO(veluca): parallelize more.
   1129           trees[chunk] =
   1130               LearnTree(std::move(tree_samples), total_pixels,
   1131                         stream_options_[start], multiplier_info, range);
   1132         },
   1133         "LearnTrees"));
   1134     if (invalid_force_wp.test_and_set(std::memory_order_acq_rel)) {
   1135       return JXL_FAILURE("PrepareEncoding: force_no_wp with {Weighted}");
   1136     }
   1137     tree_.clear();
   1138     MergeTrees(trees, useful_splits, 0, useful_splits.size() - 1, &tree_);
   1139   } else {
   1140     // Fixed tree.
   1141     size_t total_pixels = 0;
   1142     for (const Image& img : stream_images_) {
   1143       for (const Channel& ch : img.channel) {
   1144         total_pixels += ch.w * ch.h;
   1145       }
   1146     }
   1147     if (cparams_.speed_tier <= SpeedTier::kFalcon) {
   1148       tree_ =
   1149           PredefinedTree(ModularOptions::TreeKind::kWPFixedDC, total_pixels);
   1150     } else if (cparams_.speed_tier <= SpeedTier::kThunder) {
   1151       tree_ = PredefinedTree(ModularOptions::TreeKind::kGradientFixedDC,
   1152                              total_pixels);
   1153     } else {
   1154       tree_ = {PropertyDecisionNode::Leaf(Predictor::Gradient)};
   1155     }
   1156   }
   1157   tree_tokens_.resize(1);
   1158   tree_tokens_[0].clear();
   1159   Tree decoded_tree;
   1160   TokenizeTree(tree_, tree_tokens_.data(), &decoded_tree);
   1161   JXL_ASSERT(tree_.size() == decoded_tree.size());
   1162   tree_ = std::move(decoded_tree);
   1163 
   1164   /* TODO(szabadka) Add text output callback to cparams
   1165   if (kPrintTree && WantDebugOutput(aux_out)) {
   1166     if (frame_header.dc_level > 0) {
   1167       PrintTree(tree_, aux_out->debug_prefix + "/dc_frame_level" +
   1168                            std::to_string(frame_header.dc_level) + "_tree");
   1169     } else {
   1170       PrintTree(tree_, aux_out->debug_prefix + "/global_tree");
   1171     }
   1172   } */
   1173   return true;
   1174 }
   1175 
   1176 Status ModularFrameEncoder::ComputeTokens(ThreadPool* pool) {
   1177   size_t num_streams = stream_images_.size();
   1178   stream_headers_.resize(num_streams);
   1179   tokens_.resize(num_streams);
   1180   image_widths_.resize(num_streams);
   1181   JXL_RETURN_IF_ERROR(RunOnPool(
   1182       pool, 0, num_streams, ThreadPool::NoInit,
   1183       [&](const uint32_t stream_id, size_t /* thread */) {
   1184         AuxOut my_aux_out;
   1185         tokens_[stream_id].clear();
   1186         JXL_CHECK(ModularGenericCompress(
   1187             stream_images_[stream_id], stream_options_[stream_id],
   1188             /*writer=*/nullptr, &my_aux_out, 0, stream_id,
   1189             /*tree_samples=*/nullptr,
   1190             /*total_pixels=*/nullptr,
   1191             /*tree=*/&tree_, /*header=*/&stream_headers_[stream_id],
   1192             /*tokens=*/&tokens_[stream_id],
   1193             /*widths=*/&image_widths_[stream_id]));
   1194       },
   1195       "ComputeTokens"));
   1196   return true;
   1197 }
   1198 
   1199 Status ModularFrameEncoder::EncodeGlobalInfo(bool streaming_mode,
   1200                                              BitWriter* writer,
   1201                                              AuxOut* aux_out) {
   1202   BitWriter::Allotment allotment(writer, 1);
   1203   // If we are using brotli, or not using modular mode.
   1204   if (tree_tokens_.empty() || tree_tokens_[0].empty()) {
   1205     writer->Write(1, 0);
   1206     allotment.ReclaimAndCharge(writer, kLayerModularTree, aux_out);
   1207     return true;
   1208   }
   1209   writer->Write(1, 1);
   1210   allotment.ReclaimAndCharge(writer, kLayerModularTree, aux_out);
   1211 
   1212   // Write tree
   1213   HistogramParams params =
   1214       HistogramParams::ForModular(cparams_, extra_dc_precision, streaming_mode);
   1215   {
   1216     EntropyEncodingData tree_code;
   1217     std::vector<uint8_t> tree_context_map;
   1218     BuildAndEncodeHistograms(params, kNumTreeContexts, tree_tokens_, &tree_code,
   1219                              &tree_context_map, writer, kLayerModularTree,
   1220                              aux_out);
   1221     WriteTokens(tree_tokens_[0], tree_code, tree_context_map, 0, writer,
   1222                 kLayerModularTree, aux_out);
   1223   }
   1224   params.streaming_mode = streaming_mode;
   1225   params.add_missing_symbols = streaming_mode;
   1226   params.image_widths = image_widths_;
   1227   // Write histograms.
   1228   BuildAndEncodeHistograms(params, (tree_.size() + 1) / 2, tokens_, &code_,
   1229                            &context_map_, writer, kLayerModularGlobal, aux_out);
   1230   return true;
   1231 }
   1232 
   1233 Status ModularFrameEncoder::EncodeStream(BitWriter* writer, AuxOut* aux_out,
   1234                                          size_t layer,
   1235                                          const ModularStreamId& stream) {
   1236   size_t stream_id = stream.ID(frame_dim_);
   1237   if (stream_images_[stream_id].channel.empty()) {
   1238     JXL_DEBUG_V(10, "Modular stream %" PRIuS " is empty.", stream_id);
   1239     return true;  // Image with no channels, header never gets decoded.
   1240   }
   1241   if (tokens_.empty()) {
   1242     JXL_RETURN_IF_ERROR(ModularGenericCompress(
   1243         stream_images_[stream_id], stream_options_[stream_id], writer, aux_out,
   1244         layer, stream_id));
   1245   } else {
   1246     JXL_RETURN_IF_ERROR(
   1247         Bundle::Write(stream_headers_[stream_id], writer, layer, aux_out));
   1248     WriteTokens(tokens_[stream_id], code_, context_map_, 0, writer, layer,
   1249                 aux_out);
   1250   }
   1251   return true;
   1252 }
   1253 
   1254 void ModularFrameEncoder::ClearStreamData(const ModularStreamId& stream) {
   1255   size_t stream_id = stream.ID(frame_dim_);
   1256   Image empty_image;
   1257   std::swap(stream_images_[stream_id], empty_image);
   1258 }
   1259 
   1260 void ModularFrameEncoder::ClearModularStreamData() {
   1261   for (const auto& group : stream_params_) {
   1262     ClearStreamData(group.id);
   1263   }
   1264   stream_params_.clear();
   1265 }
   1266 
   1267 size_t ModularFrameEncoder::ComputeStreamingAbsoluteAcGroupId(
   1268     size_t dc_group_id, size_t ac_group_id,
   1269     const FrameDimensions& patch_dim) const {
   1270   size_t dc_group_x = dc_group_id % frame_dim_.xsize_dc_groups;
   1271   size_t dc_group_y = dc_group_id / frame_dim_.xsize_dc_groups;
   1272   size_t ac_group_x = ac_group_id % patch_dim.xsize_groups;
   1273   size_t ac_group_y = ac_group_id / patch_dim.xsize_groups;
   1274   return (dc_group_x * 8 + ac_group_x) +
   1275          (dc_group_y * 8 + ac_group_y) * frame_dim_.xsize_groups;
   1276 }
   1277 
   1278 Status ModularFrameEncoder::PrepareStreamParams(const Rect& rect,
   1279                                                 const CompressParams& cparams_,
   1280                                                 int minShift, int maxShift,
   1281                                                 const ModularStreamId& stream,
   1282                                                 bool do_color, bool groupwise) {
   1283   size_t stream_id = stream.ID(frame_dim_);
   1284   Image& full_image = stream_images_[0];
   1285   const size_t xsize = rect.xsize();
   1286   const size_t ysize = rect.ysize();
   1287   Image& gi = stream_images_[stream_id];
   1288   if (stream_id > 0) {
   1289     JXL_ASSIGN_OR_RETURN(gi,
   1290                          Image::Create(xsize, ysize, full_image.bitdepth, 0));
   1291     // start at the first bigger-than-frame_dim.group_dim non-metachannel
   1292     size_t c = full_image.nb_meta_channels;
   1293     if (!groupwise) {
   1294       for (; c < full_image.channel.size(); c++) {
   1295         Channel& fc = full_image.channel[c];
   1296         if (fc.w > frame_dim_.group_dim || fc.h > frame_dim_.group_dim) break;
   1297       }
   1298     }
   1299     for (; c < full_image.channel.size(); c++) {
   1300       Channel& fc = full_image.channel[c];
   1301       int shift = std::min(fc.hshift, fc.vshift);
   1302       if (shift > maxShift) continue;
   1303       if (shift < minShift) continue;
   1304       Rect r(rect.x0() >> fc.hshift, rect.y0() >> fc.vshift,
   1305              rect.xsize() >> fc.hshift, rect.ysize() >> fc.vshift, fc.w, fc.h);
   1306       if (r.xsize() == 0 || r.ysize() == 0) continue;
   1307       gi_channel_[stream_id].push_back(c);
   1308       JXL_ASSIGN_OR_RETURN(Channel gc, Channel::Create(r.xsize(), r.ysize()));
   1309       gc.hshift = fc.hshift;
   1310       gc.vshift = fc.vshift;
   1311       for (size_t y = 0; y < r.ysize(); ++y) {
   1312         memcpy(gc.Row(y), r.ConstRow(fc.plane, y),
   1313                r.xsize() * sizeof(pixel_type));
   1314       }
   1315       gi.channel.emplace_back(std::move(gc));
   1316     }
   1317 
   1318     if (gi.channel.empty()) return true;
   1319     // Do some per-group transforms
   1320 
   1321     // Local palette
   1322     // TODO(veluca): make this work with quantize-after-prediction in lossy
   1323     // mode.
   1324     if (cparams_.butteraugli_distance == 0.f && cparams_.palette_colors != 0 &&
   1325         cparams_.speed_tier < SpeedTier::kCheetah) {
   1326       // all-channel palette (e.g. RGBA)
   1327       if (gi.channel.size() - gi.nb_meta_channels > 1) {
   1328         Transform maybe_palette(TransformId::kPalette);
   1329         maybe_palette.begin_c = gi.nb_meta_channels;
   1330         maybe_palette.num_c = gi.channel.size() - gi.nb_meta_channels;
   1331         maybe_palette.nb_colors = std::abs(cparams_.palette_colors);
   1332         maybe_palette.ordered_palette = cparams_.palette_colors >= 0;
   1333         maybe_do_transform(gi, maybe_palette, cparams_, weighted::Header());
   1334       }
   1335       // all-minus-one-channel palette (RGB with separate alpha, or CMY with
   1336       // separate K)
   1337       if (gi.channel.size() - gi.nb_meta_channels > 3) {
   1338         Transform maybe_palette_3(TransformId::kPalette);
   1339         maybe_palette_3.begin_c = gi.nb_meta_channels;
   1340         maybe_palette_3.num_c = gi.channel.size() - gi.nb_meta_channels - 1;
   1341         maybe_palette_3.nb_colors = std::abs(cparams_.palette_colors);
   1342         maybe_palette_3.ordered_palette = cparams_.palette_colors >= 0;
   1343         maybe_palette_3.lossy_palette = cparams_.lossy_palette;
   1344         if (maybe_palette_3.lossy_palette) {
   1345           maybe_palette_3.predictor = Predictor::Weighted;
   1346         }
   1347         maybe_do_transform(gi, maybe_palette_3, cparams_, weighted::Header());
   1348       }
   1349     }
   1350 
   1351     // Local channel palette
   1352     if (cparams_.channel_colors_percent > 0 &&
   1353         cparams_.butteraugli_distance == 0.f && !cparams_.lossy_palette &&
   1354         cparams_.speed_tier < SpeedTier::kCheetah &&
   1355         !(cparams_.responsive && cparams_.decoding_speed_tier >= 1)) {
   1356       // single channel palette (like FLIF's ChannelCompact)
   1357       size_t nb_channels = gi.channel.size() - gi.nb_meta_channels;
   1358       for (size_t i = 0; i < nb_channels; i++) {
   1359         int32_t min;
   1360         int32_t max;
   1361         compute_minmax(gi.channel[gi.nb_meta_channels + i], &min, &max);
   1362         int64_t colors = static_cast<int64_t>(max) - min + 1;
   1363         JXL_DEBUG_V(10, "Channel %" PRIuS ": range=%i..%i", i, min, max);
   1364         Transform maybe_palette_1(TransformId::kPalette);
   1365         maybe_palette_1.begin_c = i + gi.nb_meta_channels;
   1366         maybe_palette_1.num_c = 1;
   1367         // simple heuristic: if less than X percent of the values in the range
   1368         // actually occur, it is probably worth it to do a compaction
   1369         // (but only if the channel palette is less than 80% the size of the
   1370         // image itself)
   1371         maybe_palette_1.nb_colors = std::min(
   1372             static_cast<int>(xsize * ysize * 0.8),
   1373             static_cast<int>(cparams_.channel_colors_percent / 100. * colors));
   1374         maybe_do_transform(gi, maybe_palette_1, cparams_, weighted::Header());
   1375       }
   1376     }
   1377   }
   1378 
   1379   // lossless and no specific color transform specified: try Nothing, YCoCg,
   1380   // and 17 RCTs
   1381   if (cparams_.color_transform == ColorTransform::kNone &&
   1382       cparams_.IsLossless() && cparams_.colorspace < 0 &&
   1383       gi.channel.size() - gi.nb_meta_channels >= 3 &&
   1384       cparams_.responsive == JXL_FALSE && do_color &&
   1385       cparams_.speed_tier <= SpeedTier::kHare) {
   1386     Transform sg(TransformId::kRCT);
   1387     sg.begin_c = gi.nb_meta_channels;
   1388     size_t nb_rcts_to_try = 0;
   1389     switch (cparams_.speed_tier) {
   1390       case SpeedTier::kLightning:
   1391       case SpeedTier::kThunder:
   1392       case SpeedTier::kFalcon:
   1393       case SpeedTier::kCheetah:
   1394         nb_rcts_to_try = 0;  // Just do global YCoCg
   1395         break;
   1396       case SpeedTier::kHare:
   1397         nb_rcts_to_try = 4;
   1398         break;
   1399       case SpeedTier::kWombat:
   1400         nb_rcts_to_try = 5;
   1401         break;
   1402       case SpeedTier::kSquirrel:
   1403         nb_rcts_to_try = 7;
   1404         break;
   1405       case SpeedTier::kKitten:
   1406         nb_rcts_to_try = 9;
   1407         break;
   1408       case SpeedTier::kTectonicPlate:
   1409       case SpeedTier::kGlacier:
   1410       case SpeedTier::kTortoise:
   1411         nb_rcts_to_try = 19;
   1412         break;
   1413     }
   1414     float best_cost = std::numeric_limits<float>::max();
   1415     size_t best_rct = 0;
   1416     // These should be 19 actually different transforms; the remaining ones
   1417     // are equivalent to one of these (note that the first two are do-nothing
   1418     // and YCoCg) modulo channel reordering (which only matters in the case of
   1419     // MA-with-prev-channels-properties) and/or sign (e.g. RmG vs GmR)
   1420     for (int i : {0 * 7 + 0, 0 * 7 + 6, 0 * 7 + 5, 1 * 7 + 3, 3 * 7 + 5,
   1421                   5 * 7 + 5, 1 * 7 + 5, 2 * 7 + 5, 1 * 7 + 1, 0 * 7 + 4,
   1422                   1 * 7 + 2, 2 * 7 + 1, 2 * 7 + 2, 2 * 7 + 3, 4 * 7 + 4,
   1423                   4 * 7 + 5, 0 * 7 + 2, 0 * 7 + 1, 0 * 7 + 3}) {
   1424       if (nb_rcts_to_try == 0) break;
   1425       sg.rct_type = i;
   1426       nb_rcts_to_try--;
   1427       if (do_transform(gi, sg, weighted::Header())) {
   1428         float cost = EstimateCost(gi);
   1429         if (cost < best_cost) {
   1430           best_rct = i;
   1431           best_cost = cost;
   1432         }
   1433         Transform t = gi.transform.back();
   1434         JXL_RETURN_IF_ERROR(t.Inverse(gi, weighted::Header(), nullptr));
   1435         gi.transform.pop_back();
   1436       }
   1437     }
   1438     // Apply the best RCT to the image for future encoding.
   1439     sg.rct_type = best_rct;
   1440     do_transform(gi, sg, weighted::Header());
   1441   } else {
   1442     // No need to try anything, just use the default options.
   1443   }
   1444   size_t nb_wp_modes = 1;
   1445   if (cparams_.speed_tier <= SpeedTier::kTortoise) {
   1446     nb_wp_modes = 5;
   1447   } else if (cparams_.speed_tier <= SpeedTier::kKitten) {
   1448     nb_wp_modes = 2;
   1449   }
   1450   if (nb_wp_modes > 1 &&
   1451       (stream_options_[stream_id].predictor == Predictor::Weighted ||
   1452        stream_options_[stream_id].predictor == Predictor::Best ||
   1453        stream_options_[stream_id].predictor == Predictor::Variable)) {
   1454     float best_cost = std::numeric_limits<float>::max();
   1455     stream_options_[stream_id].wp_mode = 0;
   1456     for (size_t i = 0; i < nb_wp_modes; i++) {
   1457       float cost = EstimateWPCost(gi, i);
   1458       if (cost < best_cost) {
   1459         best_cost = cost;
   1460         stream_options_[stream_id].wp_mode = i;
   1461       }
   1462     }
   1463   }
   1464   return true;
   1465 }
   1466 
   1467 constexpr float q_deadzone = 0.62f;
   1468 int QuantizeWP(const int32_t* qrow, size_t onerow, size_t c, size_t x, size_t y,
   1469                size_t w, weighted::State* wp_state, float value,
   1470                float inv_factor) {
   1471   float svalue = value * inv_factor;
   1472   PredictionResult pred =
   1473       PredictNoTreeWP(w, qrow + x, onerow, x, y, Predictor::Weighted, wp_state);
   1474   svalue -= pred.guess;
   1475   if (svalue > -q_deadzone && svalue < q_deadzone) svalue = 0;
   1476   int residual = roundf(svalue);
   1477   if (residual > 2 || residual < -2) residual = roundf(svalue * 0.5) * 2;
   1478   return residual + pred.guess;
   1479 }
   1480 
   1481 int QuantizeGradient(const int32_t* qrow, size_t onerow, size_t c, size_t x,
   1482                      size_t y, size_t w, float value, float inv_factor) {
   1483   float svalue = value * inv_factor;
   1484   PredictionResult pred =
   1485       PredictNoTreeNoWP(w, qrow + x, onerow, x, y, Predictor::Gradient);
   1486   svalue -= pred.guess;
   1487   if (svalue > -q_deadzone && svalue < q_deadzone) svalue = 0;
   1488   int residual = roundf(svalue);
   1489   if (residual > 2 || residual < -2) residual = roundf(svalue * 0.5) * 2;
   1490   return residual + pred.guess;
   1491 }
   1492 
   1493 Status ModularFrameEncoder::AddVarDCTDC(const FrameHeader& frame_header,
   1494                                         const Image3F& dc, const Rect& r,
   1495                                         size_t group_index, bool nl_dc,
   1496                                         PassesEncoderState* enc_state,
   1497                                         bool jpeg_transcode) {
   1498   extra_dc_precision[group_index] = nl_dc ? 1 : 0;
   1499   float mul = 1 << extra_dc_precision[group_index];
   1500 
   1501   size_t stream_id = ModularStreamId::VarDCTDC(group_index).ID(frame_dim_);
   1502   stream_options_[stream_id].max_chan_size = 0xFFFFFF;
   1503   stream_options_[stream_id].predictor = Predictor::Weighted;
   1504   stream_options_[stream_id].wp_tree_mode = ModularOptions::TreeMode::kWPOnly;
   1505   if (cparams_.speed_tier >= SpeedTier::kSquirrel) {
   1506     stream_options_[stream_id].tree_kind = ModularOptions::TreeKind::kWPFixedDC;
   1507   }
   1508   if (cparams_.speed_tier < SpeedTier::kSquirrel && !nl_dc) {
   1509     stream_options_[stream_id].predictor =
   1510         (cparams_.speed_tier < SpeedTier::kKitten ? Predictor::Variable
   1511                                                   : Predictor::Best);
   1512     stream_options_[stream_id].wp_tree_mode =
   1513         ModularOptions::TreeMode::kDefault;
   1514     stream_options_[stream_id].tree_kind = ModularOptions::TreeKind::kLearn;
   1515   }
   1516   if (cparams_.decoding_speed_tier >= 1) {
   1517     stream_options_[stream_id].tree_kind =
   1518         ModularOptions::TreeKind::kGradientFixedDC;
   1519   }
   1520   stream_options_[stream_id].histogram_params =
   1521       stream_options_[0].histogram_params;
   1522 
   1523   JXL_ASSIGN_OR_RETURN(stream_images_[stream_id],
   1524                        Image::Create(r.xsize(), r.ysize(), 8, 3));
   1525   if (nl_dc && stream_options_[stream_id].tree_kind ==
   1526                    ModularOptions::TreeKind::kGradientFixedDC) {
   1527     JXL_ASSERT(frame_header.chroma_subsampling.Is444());
   1528     for (size_t c : {1, 0, 2}) {
   1529       float inv_factor = enc_state->shared.quantizer.GetInvDcStep(c) * mul;
   1530       float y_factor = enc_state->shared.quantizer.GetDcStep(1) / mul;
   1531       float cfl_factor = enc_state->shared.cmap.DCFactors()[c];
   1532       for (size_t y = 0; y < r.ysize(); y++) {
   1533         int32_t* quant_row =
   1534             stream_images_[stream_id].channel[c < 2 ? c ^ 1 : c].plane.Row(y);
   1535         size_t stride = stream_images_[stream_id]
   1536                             .channel[c < 2 ? c ^ 1 : c]
   1537                             .plane.PixelsPerRow();
   1538         const float* row = r.ConstPlaneRow(dc, c, y);
   1539         if (c == 1) {
   1540           for (size_t x = 0; x < r.xsize(); x++) {
   1541             quant_row[x] = QuantizeGradient(quant_row, stride, c, x, y,
   1542                                             r.xsize(), row[x], inv_factor);
   1543           }
   1544         } else {
   1545           int32_t* quant_row_y =
   1546               stream_images_[stream_id].channel[0].plane.Row(y);
   1547           for (size_t x = 0; x < r.xsize(); x++) {
   1548             quant_row[x] = QuantizeGradient(
   1549                 quant_row, stride, c, x, y, r.xsize(),
   1550                 row[x] - quant_row_y[x] * (y_factor * cfl_factor), inv_factor);
   1551           }
   1552         }
   1553       }
   1554     }
   1555   } else if (nl_dc) {
   1556     JXL_ASSERT(frame_header.chroma_subsampling.Is444());
   1557     for (size_t c : {1, 0, 2}) {
   1558       float inv_factor = enc_state->shared.quantizer.GetInvDcStep(c) * mul;
   1559       float y_factor = enc_state->shared.quantizer.GetDcStep(1) / mul;
   1560       float cfl_factor = enc_state->shared.cmap.DCFactors()[c];
   1561       weighted::Header header;
   1562       weighted::State wp_state(header, r.xsize(), r.ysize());
   1563       for (size_t y = 0; y < r.ysize(); y++) {
   1564         int32_t* quant_row =
   1565             stream_images_[stream_id].channel[c < 2 ? c ^ 1 : c].plane.Row(y);
   1566         size_t stride = stream_images_[stream_id]
   1567                             .channel[c < 2 ? c ^ 1 : c]
   1568                             .plane.PixelsPerRow();
   1569         const float* row = r.ConstPlaneRow(dc, c, y);
   1570         if (c == 1) {
   1571           for (size_t x = 0; x < r.xsize(); x++) {
   1572             quant_row[x] = QuantizeWP(quant_row, stride, c, x, y, r.xsize(),
   1573                                       &wp_state, row[x], inv_factor);
   1574             wp_state.UpdateErrors(quant_row[x], x, y, r.xsize());
   1575           }
   1576         } else {
   1577           int32_t* quant_row_y =
   1578               stream_images_[stream_id].channel[0].plane.Row(y);
   1579           for (size_t x = 0; x < r.xsize(); x++) {
   1580             quant_row[x] = QuantizeWP(
   1581                 quant_row, stride, c, x, y, r.xsize(), &wp_state,
   1582                 row[x] - quant_row_y[x] * (y_factor * cfl_factor), inv_factor);
   1583             wp_state.UpdateErrors(quant_row[x], x, y, r.xsize());
   1584           }
   1585         }
   1586       }
   1587     }
   1588   } else if (frame_header.chroma_subsampling.Is444()) {
   1589     for (size_t c : {1, 0, 2}) {
   1590       float inv_factor = enc_state->shared.quantizer.GetInvDcStep(c) * mul;
   1591       float y_factor = enc_state->shared.quantizer.GetDcStep(1) / mul;
   1592       float cfl_factor = enc_state->shared.cmap.DCFactors()[c];
   1593       for (size_t y = 0; y < r.ysize(); y++) {
   1594         int32_t* quant_row =
   1595             stream_images_[stream_id].channel[c < 2 ? c ^ 1 : c].plane.Row(y);
   1596         const float* row = r.ConstPlaneRow(dc, c, y);
   1597         if (c == 1) {
   1598           for (size_t x = 0; x < r.xsize(); x++) {
   1599             quant_row[x] = roundf(row[x] * inv_factor);
   1600           }
   1601         } else {
   1602           int32_t* quant_row_y =
   1603               stream_images_[stream_id].channel[0].plane.Row(y);
   1604           for (size_t x = 0; x < r.xsize(); x++) {
   1605             quant_row[x] =
   1606                 roundf((row[x] - quant_row_y[x] * (y_factor * cfl_factor)) *
   1607                        inv_factor);
   1608           }
   1609         }
   1610       }
   1611     }
   1612   } else {
   1613     for (size_t c : {1, 0, 2}) {
   1614       Rect rect(r.x0() >> frame_header.chroma_subsampling.HShift(c),
   1615                 r.y0() >> frame_header.chroma_subsampling.VShift(c),
   1616                 r.xsize() >> frame_header.chroma_subsampling.HShift(c),
   1617                 r.ysize() >> frame_header.chroma_subsampling.VShift(c));
   1618       float inv_factor = enc_state->shared.quantizer.GetInvDcStep(c) * mul;
   1619       size_t ys = rect.ysize();
   1620       size_t xs = rect.xsize();
   1621       Channel& ch = stream_images_[stream_id].channel[c < 2 ? c ^ 1 : c];
   1622       ch.w = xs;
   1623       ch.h = ys;
   1624       JXL_RETURN_IF_ERROR(ch.shrink());
   1625       for (size_t y = 0; y < ys; y++) {
   1626         int32_t* quant_row = ch.plane.Row(y);
   1627         const float* row = rect.ConstPlaneRow(dc, c, y);
   1628         for (size_t x = 0; x < xs; x++) {
   1629           quant_row[x] = roundf(row[x] * inv_factor);
   1630         }
   1631       }
   1632     }
   1633   }
   1634 
   1635   DequantDC(r, &enc_state->shared.dc_storage, &enc_state->shared.quant_dc,
   1636             stream_images_[stream_id], enc_state->shared.quantizer.MulDC(),
   1637             1.0 / mul, enc_state->shared.cmap.DCFactors(),
   1638             frame_header.chroma_subsampling, enc_state->shared.block_ctx_map);
   1639   return true;
   1640 }
   1641 
   1642 Status ModularFrameEncoder::AddACMetadata(const Rect& r, size_t group_index,
   1643                                           bool jpeg_transcode,
   1644                                           PassesEncoderState* enc_state) {
   1645   size_t stream_id = ModularStreamId::ACMetadata(group_index).ID(frame_dim_);
   1646   stream_options_[stream_id].max_chan_size = 0xFFFFFF;
   1647   if (stream_options_[stream_id].predictor != Predictor::Weighted) {
   1648     stream_options_[stream_id].wp_tree_mode = ModularOptions::TreeMode::kNoWP;
   1649   }
   1650   if (jpeg_transcode) {
   1651     stream_options_[stream_id].tree_kind =
   1652         ModularOptions::TreeKind::kJpegTranscodeACMeta;
   1653   } else if (cparams_.speed_tier >= SpeedTier::kFalcon) {
   1654     stream_options_[stream_id].tree_kind =
   1655         ModularOptions::TreeKind::kFalconACMeta;
   1656   } else if (cparams_.speed_tier > SpeedTier::kKitten) {
   1657     stream_options_[stream_id].tree_kind = ModularOptions::TreeKind::kACMeta;
   1658   }
   1659   // If we are using a non-constant CfL field, and are in a slow enough mode,
   1660   // re-enable tree computation for it.
   1661   if (cparams_.speed_tier < SpeedTier::kSquirrel &&
   1662       cparams_.force_cfl_jpeg_recompression) {
   1663     stream_options_[stream_id].tree_kind = ModularOptions::TreeKind::kLearn;
   1664   }
   1665   stream_options_[stream_id].histogram_params =
   1666       stream_options_[0].histogram_params;
   1667   // YToX, YToB, ACS + QF, EPF
   1668   Image& image = stream_images_[stream_id];
   1669   JXL_ASSIGN_OR_RETURN(image, Image::Create(r.xsize(), r.ysize(), 8, 4));
   1670   static_assert(kColorTileDimInBlocks == 8, "Color tile size changed");
   1671   Rect cr(r.x0() >> 3, r.y0() >> 3, (r.xsize() + 7) >> 3, (r.ysize() + 7) >> 3);
   1672   JXL_ASSIGN_OR_RETURN(image.channel[0],
   1673                        Channel::Create(cr.xsize(), cr.ysize(), 3, 3));
   1674   JXL_ASSIGN_OR_RETURN(image.channel[1],
   1675                        Channel::Create(cr.xsize(), cr.ysize(), 3, 3));
   1676   JXL_ASSIGN_OR_RETURN(image.channel[2],
   1677                        Channel::Create(r.xsize() * r.ysize(), 2, 0, 0));
   1678   ConvertPlaneAndClamp(cr, enc_state->shared.cmap.ytox_map,
   1679                        Rect(image.channel[0].plane), &image.channel[0].plane);
   1680   ConvertPlaneAndClamp(cr, enc_state->shared.cmap.ytob_map,
   1681                        Rect(image.channel[1].plane), &image.channel[1].plane);
   1682   size_t num = 0;
   1683   for (size_t y = 0; y < r.ysize(); y++) {
   1684     AcStrategyRow row_acs = enc_state->shared.ac_strategy.ConstRow(r, y);
   1685     const int32_t* row_qf = r.ConstRow(enc_state->shared.raw_quant_field, y);
   1686     const uint8_t* row_epf = r.ConstRow(enc_state->shared.epf_sharpness, y);
   1687     int32_t* out_acs = image.channel[2].plane.Row(0);
   1688     int32_t* out_qf = image.channel[2].plane.Row(1);
   1689     int32_t* row_out_epf = image.channel[3].plane.Row(y);
   1690     for (size_t x = 0; x < r.xsize(); x++) {
   1691       row_out_epf[x] = row_epf[x];
   1692       if (!row_acs[x].IsFirstBlock()) continue;
   1693       out_acs[num] = row_acs[x].RawStrategy();
   1694       out_qf[num] = row_qf[x] - 1;
   1695       num++;
   1696     }
   1697   }
   1698   image.channel[2].w = num;
   1699   ac_metadata_size[group_index] = num;
   1700   return true;
   1701 }
   1702 
   1703 Status ModularFrameEncoder::EncodeQuantTable(
   1704     size_t size_x, size_t size_y, BitWriter* writer,
   1705     const QuantEncoding& encoding, size_t idx,
   1706     ModularFrameEncoder* modular_frame_encoder) {
   1707   JXL_ASSERT(encoding.qraw.qtable != nullptr);
   1708   JXL_ASSERT(size_x * size_y * 3 == encoding.qraw.qtable->size());
   1709   JXL_CHECK(F16Coder::Write(encoding.qraw.qtable_den, writer));
   1710   if (modular_frame_encoder) {
   1711     JXL_CHECK(modular_frame_encoder->EncodeStream(
   1712         writer, nullptr, 0, ModularStreamId::QuantTable(idx)));
   1713     return true;
   1714   }
   1715   JXL_ASSIGN_OR_RETURN(Image image, Image::Create(size_x, size_y, 8, 3));
   1716   for (size_t c = 0; c < 3; c++) {
   1717     for (size_t y = 0; y < size_y; y++) {
   1718       int32_t* JXL_RESTRICT row = image.channel[c].Row(y);
   1719       for (size_t x = 0; x < size_x; x++) {
   1720         row[x] = (*encoding.qraw.qtable)[c * size_x * size_y + y * size_x + x];
   1721       }
   1722     }
   1723   }
   1724   ModularOptions cfopts;
   1725   JXL_CHECK(ModularGenericCompress(image, cfopts, writer));
   1726   return true;
   1727 }
   1728 
   1729 Status ModularFrameEncoder::AddQuantTable(size_t size_x, size_t size_y,
   1730                                           const QuantEncoding& encoding,
   1731                                           size_t idx) {
   1732   size_t stream_id = ModularStreamId::QuantTable(idx).ID(frame_dim_);
   1733   JXL_ASSERT(encoding.qraw.qtable != nullptr);
   1734   JXL_ASSERT(size_x * size_y * 3 == encoding.qraw.qtable->size());
   1735   Image& image = stream_images_[stream_id];
   1736   JXL_ASSIGN_OR_RETURN(image, Image::Create(size_x, size_y, 8, 3));
   1737   for (size_t c = 0; c < 3; c++) {
   1738     for (size_t y = 0; y < size_y; y++) {
   1739       int32_t* JXL_RESTRICT row = image.channel[c].Row(y);
   1740       for (size_t x = 0; x < size_x; x++) {
   1741         row[x] = (*encoding.qraw.qtable)[c * size_x * size_y + y * size_x + x];
   1742       }
   1743     }
   1744   }
   1745   return true;
   1746 }
   1747 }  // namespace jxl