libjxl

FORK: libjxl patches used on blog
git clone https://git.neptards.moe/blog/libjxl.git
Log | Files | Refs | Submodules | README | LICENSE

butteraugli.cc (78837B)


      1 // Copyright (c) the JPEG XL Project Authors. All rights reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style
      4 // license that can be found in the LICENSE file.
      5 //
      6 // Author: Jyrki Alakuijala (jyrki.alakuijala@gmail.com)
      7 //
      8 // The physical architecture of butteraugli is based on the following naming
      9 // convention:
     10 //   * Opsin - dynamics of the photosensitive chemicals in the retina
     11 //             with their immediate electrical processing
     12 //   * Xyb - hybrid opponent/trichromatic color space
     13 //     x is roughly red-subtract-green.
     14 //     y is yellow.
     15 //     b is blue.
     16 //     Xyb values are computed from Opsin mixing, not directly from rgb.
     17 //   * Mask - for visual masking
     18 //   * Hf - color modeling for spatially high-frequency features
     19 //   * Lf - color modeling for spatially low-frequency features
     20 //   * Diffmap - to cluster and build an image of error between the images
     21 //   * Blur - to hold the smoothing code
     22 
     23 #include "lib/jxl/butteraugli/butteraugli.h"
     24 
     25 #include <stdint.h>
     26 #include <stdio.h>
     27 #include <stdlib.h>
     28 #include <string.h>
     29 
     30 #include <algorithm>
     31 #include <cmath>
     32 #include <memory>
     33 #include <vector>
     34 
     35 #include "lib/jxl/image.h"
     36 
     37 #undef HWY_TARGET_INCLUDE
     38 #define HWY_TARGET_INCLUDE "lib/jxl/butteraugli/butteraugli.cc"
     39 #include <hwy/foreach_target.h>
     40 
     41 #include "lib/jxl/base/fast_math-inl.h"
     42 #include "lib/jxl/base/printf_macros.h"
     43 #include "lib/jxl/base/status.h"
     44 #include "lib/jxl/convolve.h"
     45 #include "lib/jxl/image_ops.h"
     46 
     47 #ifndef JXL_BUTTERAUGLI_ONCE
     48 #define JXL_BUTTERAUGLI_ONCE
     49 
     50 namespace jxl {
     51 
     52 static const double wMfMalta = 37.0819870399;
     53 static const double norm1Mf = 130262059.556;
     54 static const double wMfMaltaX = 8246.75321353;
     55 static const double norm1MfX = 1009002.70582;
     56 static const double wHfMalta = 18.7237414387;
     57 static const double norm1Hf = 4498534.45232;
     58 static const double wHfMaltaX = 6923.99476109;
     59 static const double norm1HfX = 8051.15833247;
     60 static const double wUhfMalta = 1.10039032555;
     61 static const double norm1Uhf = 71.7800275169;
     62 static const double wUhfMaltaX = 173.5;
     63 static const double norm1UhfX = 5.0;
     64 static const double wmul[9] = {
     65     400.0,         1.50815703118,  0,
     66     2150.0,        10.6195433239,  16.2176043152,
     67     29.2353797994, 0.844626970982, 0.703646627719,
     68 };
     69 
     70 std::vector<float> ComputeKernel(float sigma) {
     71   const float m = 2.25;  // Accuracy increases when m is increased.
     72   const double scaler = -1.0 / (2.0 * sigma * sigma);
     73   const int diff = std::max<int>(1, m * std::fabs(sigma));
     74   std::vector<float> kernel(2 * diff + 1);
     75   for (int i = -diff; i <= diff; ++i) {
     76     kernel[i + diff] = std::exp(scaler * i * i);
     77   }
     78   return kernel;
     79 }
     80 
     81 void ConvolveBorderColumn(const ImageF& in, const std::vector<float>& kernel,
     82                           const size_t x, float* BUTTERAUGLI_RESTRICT row_out) {
     83   const size_t offset = kernel.size() / 2;
     84   int minx = x < offset ? 0 : x - offset;
     85   int maxx = std::min<int>(in.xsize() - 1, x + offset);
     86   float weight = 0.0f;
     87   for (int j = minx; j <= maxx; ++j) {
     88     weight += kernel[j - x + offset];
     89   }
     90   float scale = 1.0f / weight;
     91   for (size_t y = 0; y < in.ysize(); ++y) {
     92     const float* BUTTERAUGLI_RESTRICT row_in = in.Row(y);
     93     float sum = 0.0f;
     94     for (int j = minx; j <= maxx; ++j) {
     95       sum += row_in[j] * kernel[j - x + offset];
     96     }
     97     row_out[y] = sum * scale;
     98   }
     99 }
    100 
    101 // Computes a horizontal convolution and transposes the result.
    102 void ConvolutionWithTranspose(const ImageF& in,
    103                               const std::vector<float>& kernel,
    104                               ImageF* BUTTERAUGLI_RESTRICT out) {
    105   JXL_CHECK(out->xsize() == in.ysize());
    106   JXL_CHECK(out->ysize() == in.xsize());
    107   const size_t len = kernel.size();
    108   const size_t offset = len / 2;
    109   float weight_no_border = 0.0f;
    110   for (size_t j = 0; j < len; ++j) {
    111     weight_no_border += kernel[j];
    112   }
    113   const float scale_no_border = 1.0f / weight_no_border;
    114   const size_t border1 = std::min(in.xsize(), offset);
    115   const size_t border2 = in.xsize() > offset ? in.xsize() - offset : 0;
    116   std::vector<float> scaled_kernel(len / 2 + 1);
    117   for (size_t i = 0; i <= len / 2; ++i) {
    118     scaled_kernel[i] = kernel[i] * scale_no_border;
    119   }
    120 
    121   // middle
    122   switch (len) {
    123     case 7: {
    124       const float sk0 = scaled_kernel[0];
    125       const float sk1 = scaled_kernel[1];
    126       const float sk2 = scaled_kernel[2];
    127       const float sk3 = scaled_kernel[3];
    128       for (size_t y = 0; y < in.ysize(); ++y) {
    129         const float* BUTTERAUGLI_RESTRICT row_in = in.Row(y) + border1 - offset;
    130         for (size_t x = border1; x < border2; ++x, ++row_in) {
    131           const float sum0 = (row_in[0] + row_in[6]) * sk0;
    132           const float sum1 = (row_in[1] + row_in[5]) * sk1;
    133           const float sum2 = (row_in[2] + row_in[4]) * sk2;
    134           const float sum = (row_in[3]) * sk3 + sum0 + sum1 + sum2;
    135           float* BUTTERAUGLI_RESTRICT row_out = out->Row(x);
    136           row_out[y] = sum;
    137         }
    138       }
    139     } break;
    140     case 13: {
    141       for (size_t y = 0; y < in.ysize(); ++y) {
    142         const float* BUTTERAUGLI_RESTRICT row_in = in.Row(y) + border1 - offset;
    143         for (size_t x = border1; x < border2; ++x, ++row_in) {
    144           float sum0 = (row_in[0] + row_in[12]) * scaled_kernel[0];
    145           float sum1 = (row_in[1] + row_in[11]) * scaled_kernel[1];
    146           float sum2 = (row_in[2] + row_in[10]) * scaled_kernel[2];
    147           float sum3 = (row_in[3] + row_in[9]) * scaled_kernel[3];
    148           sum0 += (row_in[4] + row_in[8]) * scaled_kernel[4];
    149           sum1 += (row_in[5] + row_in[7]) * scaled_kernel[5];
    150           const float sum = (row_in[6]) * scaled_kernel[6];
    151           float* BUTTERAUGLI_RESTRICT row_out = out->Row(x);
    152           row_out[y] = sum + sum0 + sum1 + sum2 + sum3;
    153         }
    154       }
    155       break;
    156     }
    157     case 15: {
    158       for (size_t y = 0; y < in.ysize(); ++y) {
    159         const float* BUTTERAUGLI_RESTRICT row_in = in.Row(y) + border1 - offset;
    160         for (size_t x = border1; x < border2; ++x, ++row_in) {
    161           float sum0 = (row_in[0] + row_in[14]) * scaled_kernel[0];
    162           float sum1 = (row_in[1] + row_in[13]) * scaled_kernel[1];
    163           float sum2 = (row_in[2] + row_in[12]) * scaled_kernel[2];
    164           float sum3 = (row_in[3] + row_in[11]) * scaled_kernel[3];
    165           sum0 += (row_in[4] + row_in[10]) * scaled_kernel[4];
    166           sum1 += (row_in[5] + row_in[9]) * scaled_kernel[5];
    167           sum2 += (row_in[6] + row_in[8]) * scaled_kernel[6];
    168           const float sum = (row_in[7]) * scaled_kernel[7];
    169           float* BUTTERAUGLI_RESTRICT row_out = out->Row(x);
    170           row_out[y] = sum + sum0 + sum1 + sum2 + sum3;
    171         }
    172       }
    173       break;
    174     }
    175     case 33: {
    176       for (size_t y = 0; y < in.ysize(); ++y) {
    177         const float* BUTTERAUGLI_RESTRICT row_in = in.Row(y) + border1 - offset;
    178         for (size_t x = border1; x < border2; ++x, ++row_in) {
    179           float sum0 = (row_in[0] + row_in[32]) * scaled_kernel[0];
    180           float sum1 = (row_in[1] + row_in[31]) * scaled_kernel[1];
    181           float sum2 = (row_in[2] + row_in[30]) * scaled_kernel[2];
    182           float sum3 = (row_in[3] + row_in[29]) * scaled_kernel[3];
    183           sum0 += (row_in[4] + row_in[28]) * scaled_kernel[4];
    184           sum1 += (row_in[5] + row_in[27]) * scaled_kernel[5];
    185           sum2 += (row_in[6] + row_in[26]) * scaled_kernel[6];
    186           sum3 += (row_in[7] + row_in[25]) * scaled_kernel[7];
    187           sum0 += (row_in[8] + row_in[24]) * scaled_kernel[8];
    188           sum1 += (row_in[9] + row_in[23]) * scaled_kernel[9];
    189           sum2 += (row_in[10] + row_in[22]) * scaled_kernel[10];
    190           sum3 += (row_in[11] + row_in[21]) * scaled_kernel[11];
    191           sum0 += (row_in[12] + row_in[20]) * scaled_kernel[12];
    192           sum1 += (row_in[13] + row_in[19]) * scaled_kernel[13];
    193           sum2 += (row_in[14] + row_in[18]) * scaled_kernel[14];
    194           sum3 += (row_in[15] + row_in[17]) * scaled_kernel[15];
    195           const float sum = (row_in[16]) * scaled_kernel[16];
    196           float* BUTTERAUGLI_RESTRICT row_out = out->Row(x);
    197           row_out[y] = sum + sum0 + sum1 + sum2 + sum3;
    198         }
    199       }
    200       break;
    201     }
    202     default:
    203       JXL_UNREACHABLE("Kernel size %" PRIuS " not implemented", len);
    204   }
    205   // left border
    206   for (size_t x = 0; x < border1; ++x) {
    207     ConvolveBorderColumn(in, kernel, x, out->Row(x));
    208   }
    209 
    210   // right border
    211   for (size_t x = border2; x < in.xsize(); ++x) {
    212     ConvolveBorderColumn(in, kernel, x, out->Row(x));
    213   }
    214 }
    215 
    216 // A blur somewhat similar to a 2D Gaussian blur.
    217 // See: https://en.wikipedia.org/wiki/Gaussian_blur
    218 //
    219 // This is a bottleneck because the sigma can be quite large (>7). We can use
    220 // gauss_blur.cc (runtime independent of sigma, closer to a 4*sigma truncated
    221 // Gaussian and our 2.25 in ComputeKernel), but its boundary conditions are
    222 // zero-valued. This leads to noticeable differences at the edges of diffmaps.
    223 // We retain a special case for 5x5 kernels (even faster than gauss_blur),
    224 // optionally use gauss_blur followed by fixup of the borders for large images,
    225 // or fall back to the previous truncated FIR followed by a transpose.
    226 Status Blur(const ImageF& in, float sigma, const ButteraugliParams& params,
    227             BlurTemp* temp, ImageF* out) {
    228   std::vector<float> kernel = ComputeKernel(sigma);
    229   // Separable5 does an in-place convolution, so this fast path is not safe if
    230   // in aliases out.
    231   if (kernel.size() == 5 && &in != out) {
    232     float sum_weights = 0.0f;
    233     for (const float w : kernel) {
    234       sum_weights += w;
    235     }
    236     const float scale = 1.0f / sum_weights;
    237     const float w0 = kernel[2] * scale;
    238     const float w1 = kernel[1] * scale;
    239     const float w2 = kernel[0] * scale;
    240     const WeightsSeparable5 weights = {
    241         {HWY_REP4(w0), HWY_REP4(w1), HWY_REP4(w2)},
    242         {HWY_REP4(w0), HWY_REP4(w1), HWY_REP4(w2)},
    243     };
    244     Separable5(in, Rect(in), weights, /*pool=*/nullptr, out);
    245     return true;
    246   }
    247 
    248   ImageF* temp_t;
    249   JXL_RETURN_IF_ERROR(temp->GetTransposed(in, &temp_t));
    250   ConvolutionWithTranspose(in, kernel, temp_t);
    251   ConvolutionWithTranspose(*temp_t, kernel, out);
    252   return true;
    253 }
    254 
    255 // Allows PaddedMaltaUnit to call either function via overloading.
    256 struct MaltaTagLF {};
    257 struct MaltaTag {};
    258 
    259 }  // namespace jxl
    260 
    261 #endif  // JXL_BUTTERAUGLI_ONCE
    262 
    263 #include <hwy/highway.h>
    264 HWY_BEFORE_NAMESPACE();
    265 namespace jxl {
    266 namespace HWY_NAMESPACE {
    267 
    268 // These templates are not found via ADL.
    269 using hwy::HWY_NAMESPACE::Abs;
    270 using hwy::HWY_NAMESPACE::Div;
    271 using hwy::HWY_NAMESPACE::Gt;
    272 using hwy::HWY_NAMESPACE::IfThenElse;
    273 using hwy::HWY_NAMESPACE::IfThenElseZero;
    274 using hwy::HWY_NAMESPACE::Lt;
    275 using hwy::HWY_NAMESPACE::Max;
    276 using hwy::HWY_NAMESPACE::Mul;
    277 using hwy::HWY_NAMESPACE::MulAdd;
    278 using hwy::HWY_NAMESPACE::MulSub;
    279 using hwy::HWY_NAMESPACE::Neg;
    280 using hwy::HWY_NAMESPACE::Sub;
    281 using hwy::HWY_NAMESPACE::Vec;
    282 using hwy::HWY_NAMESPACE::ZeroIfNegative;
    283 
    284 template <class D, class V>
    285 HWY_INLINE V MaximumClamp(D d, V v, double kMaxVal) {
    286   static const double kMul = 0.724216145665;
    287   const V mul = Set(d, kMul);
    288   const V maxval = Set(d, kMaxVal);
    289   // If greater than maxval or less than -maxval, replace with if_*.
    290   const V if_pos = MulAdd(Sub(v, maxval), mul, maxval);
    291   const V if_neg = MulSub(Add(v, maxval), mul, maxval);
    292   const V pos_or_v = IfThenElse(Ge(v, maxval), if_pos, v);
    293   return IfThenElse(Lt(v, Neg(maxval)), if_neg, pos_or_v);
    294 }
    295 
    296 // Make area around zero less important (remove it).
    297 template <class D, class V>
    298 HWY_INLINE V RemoveRangeAroundZero(const D d, const double kw, const V x) {
    299   const auto w = Set(d, kw);
    300   return IfThenElse(Gt(x, w), Sub(x, w),
    301                     IfThenElseZero(Lt(x, Neg(w)), Add(x, w)));
    302 }
    303 
    304 // Make area around zero more important (2x it until the limit).
    305 template <class D, class V>
    306 HWY_INLINE V AmplifyRangeAroundZero(const D d, const double kw, const V x) {
    307   const auto w = Set(d, kw);
    308   return IfThenElse(Gt(x, w), Add(x, w),
    309                     IfThenElse(Lt(x, Neg(w)), Sub(x, w), Add(x, x)));
    310 }
    311 
    312 // XybLowFreqToVals converts from low-frequency XYB space to the 'vals' space.
    313 // Vals space can be converted to L2-norm space (Euclidean and normalized)
    314 // through visual masking.
    315 template <class D, class V>
    316 HWY_INLINE void XybLowFreqToVals(const D d, const V& x, const V& y,
    317                                  const V& b_arg, V* HWY_RESTRICT valx,
    318                                  V* HWY_RESTRICT valy, V* HWY_RESTRICT valb) {
    319   static const double xmul_scalar = 33.832837186260;
    320   static const double ymul_scalar = 14.458268100570;
    321   static const double bmul_scalar = 49.87984651440;
    322   static const double y_to_b_mul_scalar = -0.362267051518;
    323   const V xmul = Set(d, xmul_scalar);
    324   const V ymul = Set(d, ymul_scalar);
    325   const V bmul = Set(d, bmul_scalar);
    326   const V y_to_b_mul = Set(d, y_to_b_mul_scalar);
    327   const V b = MulAdd(y_to_b_mul, y, b_arg);
    328   *valb = Mul(b, bmul);
    329   *valx = Mul(x, xmul);
    330   *valy = Mul(y, ymul);
    331 }
    332 
    333 void XybLowFreqToVals(Image3F* xyb_lf) {
    334   // Modify range around zero code only concerns the high frequency
    335   // planes and only the X and Y channels.
    336   // Convert low freq xyb to vals space so that we can do a simple squared sum
    337   // diff on the low frequencies later.
    338   const HWY_FULL(float) d;
    339   for (size_t y = 0; y < xyb_lf->ysize(); ++y) {
    340     float* BUTTERAUGLI_RESTRICT row_x = xyb_lf->PlaneRow(0, y);
    341     float* BUTTERAUGLI_RESTRICT row_y = xyb_lf->PlaneRow(1, y);
    342     float* BUTTERAUGLI_RESTRICT row_b = xyb_lf->PlaneRow(2, y);
    343     for (size_t x = 0; x < xyb_lf->xsize(); x += Lanes(d)) {
    344       auto valx = Undefined(d);
    345       auto valy = Undefined(d);
    346       auto valb = Undefined(d);
    347       XybLowFreqToVals(d, Load(d, row_x + x), Load(d, row_y + x),
    348                        Load(d, row_b + x), &valx, &valy, &valb);
    349       Store(valx, d, row_x + x);
    350       Store(valy, d, row_y + x);
    351       Store(valb, d, row_b + x);
    352     }
    353   }
    354 }
    355 
    356 void SuppressXByY(const ImageF& in_y, ImageF* HWY_RESTRICT inout_x) {
    357   JXL_DASSERT(SameSize(*inout_x, in_y));
    358   const size_t xsize = in_y.xsize();
    359   const size_t ysize = in_y.ysize();
    360   const HWY_FULL(float) d;
    361   static const double suppress = 46.0;
    362   static const double s = 0.653020556257;
    363   const auto sv = Set(d, s);
    364   const auto one_minus_s = Set(d, 1.0 - s);
    365   const auto ywv = Set(d, suppress);
    366 
    367   for (size_t y = 0; y < ysize; ++y) {
    368     const float* HWY_RESTRICT row_y = in_y.ConstRow(y);
    369     float* HWY_RESTRICT row_x = inout_x->Row(y);
    370     for (size_t x = 0; x < xsize; x += Lanes(d)) {
    371       const auto vx = Load(d, row_x + x);
    372       const auto vy = Load(d, row_y + x);
    373       const auto scaler =
    374           MulAdd(Div(ywv, MulAdd(vy, vy, ywv)), one_minus_s, sv);
    375       Store(Mul(scaler, vx), d, row_x + x);
    376     }
    377   }
    378 }
    379 
    380 void Subtract(const ImageF& a, const ImageF& b, ImageF* c) {
    381   const HWY_FULL(float) d;
    382   for (size_t y = 0; y < a.ysize(); ++y) {
    383     const float* row_a = a.ConstRow(y);
    384     const float* row_b = b.ConstRow(y);
    385     float* row_c = c->Row(y);
    386     for (size_t x = 0; x < a.xsize(); x += Lanes(d)) {
    387       Store(Sub(Load(d, row_a + x), Load(d, row_b + x)), d, row_c + x);
    388     }
    389   }
    390 }
    391 
    392 Status SeparateLFAndMF(const ButteraugliParams& params, const Image3F& xyb,
    393                        Image3F* lf, Image3F* mf, BlurTemp* blur_temp) {
    394   static const double kSigmaLf = 7.15593339443;
    395   for (int i = 0; i < 3; ++i) {
    396     // Extract lf ...
    397     JXL_RETURN_IF_ERROR(
    398         Blur(xyb.Plane(i), kSigmaLf, params, blur_temp, &lf->Plane(i)));
    399     // ... and keep everything else in mf.
    400     Subtract(xyb.Plane(i), lf->Plane(i), &mf->Plane(i));
    401   }
    402   XybLowFreqToVals(lf);
    403   return true;
    404 }
    405 
    406 Status SeparateMFAndHF(const ButteraugliParams& params, Image3F* mf, ImageF* hf,
    407                        BlurTemp* blur_temp) {
    408   const HWY_FULL(float) d;
    409   static const double kSigmaHf = 3.22489901262;
    410   const size_t xsize = mf->xsize();
    411   const size_t ysize = mf->ysize();
    412   JXL_ASSIGN_OR_RETURN(hf[0], ImageF::Create(xsize, ysize));
    413   JXL_ASSIGN_OR_RETURN(hf[1], ImageF::Create(xsize, ysize));
    414   for (int i = 0; i < 3; ++i) {
    415     if (i == 2) {
    416       JXL_RETURN_IF_ERROR(
    417           Blur(mf->Plane(i), kSigmaHf, params, blur_temp, &mf->Plane(i)));
    418       break;
    419     }
    420     for (size_t y = 0; y < ysize; ++y) {
    421       float* BUTTERAUGLI_RESTRICT row_mf = mf->PlaneRow(i, y);
    422       float* BUTTERAUGLI_RESTRICT row_hf = hf[i].Row(y);
    423       for (size_t x = 0; x < xsize; x += Lanes(d)) {
    424         Store(Load(d, row_mf + x), d, row_hf + x);
    425       }
    426     }
    427     JXL_RETURN_IF_ERROR(
    428         Blur(mf->Plane(i), kSigmaHf, params, blur_temp, &mf->Plane(i)));
    429     static const double kRemoveMfRange = 0.29;
    430     static const double kAddMfRange = 0.1;
    431     if (i == 0) {
    432       for (size_t y = 0; y < ysize; ++y) {
    433         float* BUTTERAUGLI_RESTRICT row_mf = mf->PlaneRow(0, y);
    434         float* BUTTERAUGLI_RESTRICT row_hf = hf[0].Row(y);
    435         for (size_t x = 0; x < xsize; x += Lanes(d)) {
    436           auto mf = Load(d, row_mf + x);
    437           auto hf = Sub(Load(d, row_hf + x), mf);
    438           mf = RemoveRangeAroundZero(d, kRemoveMfRange, mf);
    439           Store(mf, d, row_mf + x);
    440           Store(hf, d, row_hf + x);
    441         }
    442       }
    443     } else {
    444       for (size_t y = 0; y < ysize; ++y) {
    445         float* BUTTERAUGLI_RESTRICT row_mf = mf->PlaneRow(1, y);
    446         float* BUTTERAUGLI_RESTRICT row_hf = hf[1].Row(y);
    447         for (size_t x = 0; x < xsize; x += Lanes(d)) {
    448           auto mf = Load(d, row_mf + x);
    449           auto hf = Sub(Load(d, row_hf + x), mf);
    450 
    451           mf = AmplifyRangeAroundZero(d, kAddMfRange, mf);
    452           Store(mf, d, row_mf + x);
    453           Store(hf, d, row_hf + x);
    454         }
    455       }
    456     }
    457   }
    458   // Suppress red-green by intensity change in the high freq channels.
    459   SuppressXByY(hf[1], &hf[0]);
    460   return true;
    461 }
    462 
    463 Status SeparateHFAndUHF(const ButteraugliParams& params, ImageF* hf,
    464                         ImageF* uhf, BlurTemp* blur_temp) {
    465   const HWY_FULL(float) d;
    466   const size_t xsize = hf[0].xsize();
    467   const size_t ysize = hf[0].ysize();
    468   static const double kSigmaUhf = 1.56416327805;
    469   JXL_ASSIGN_OR_RETURN(uhf[0], ImageF::Create(xsize, ysize));
    470   JXL_ASSIGN_OR_RETURN(uhf[1], ImageF::Create(xsize, ysize));
    471   for (int i = 0; i < 2; ++i) {
    472     // Divide hf into hf and uhf.
    473     for (size_t y = 0; y < ysize; ++y) {
    474       float* BUTTERAUGLI_RESTRICT row_uhf = uhf[i].Row(y);
    475       float* BUTTERAUGLI_RESTRICT row_hf = hf[i].Row(y);
    476       for (size_t x = 0; x < xsize; ++x) {
    477         row_uhf[x] = row_hf[x];
    478       }
    479     }
    480     JXL_RETURN_IF_ERROR(Blur(hf[i], kSigmaUhf, params, blur_temp, &hf[i]));
    481     static const double kRemoveHfRange = 1.5;
    482     static const double kAddHfRange = 0.132;
    483     static const double kRemoveUhfRange = 0.04;
    484     static const double kMaxclampHf = 28.4691806922;
    485     static const double kMaxclampUhf = 5.19175294647;
    486     static double kMulYHf = 2.155;
    487     static double kMulYUhf = 2.69313763794;
    488     if (i == 0) {
    489       for (size_t y = 0; y < ysize; ++y) {
    490         float* BUTTERAUGLI_RESTRICT row_uhf = uhf[0].Row(y);
    491         float* BUTTERAUGLI_RESTRICT row_hf = hf[0].Row(y);
    492         for (size_t x = 0; x < xsize; x += Lanes(d)) {
    493           auto hf = Load(d, row_hf + x);
    494           auto uhf = Sub(Load(d, row_uhf + x), hf);
    495           hf = RemoveRangeAroundZero(d, kRemoveHfRange, hf);
    496           uhf = RemoveRangeAroundZero(d, kRemoveUhfRange, uhf);
    497           Store(hf, d, row_hf + x);
    498           Store(uhf, d, row_uhf + x);
    499         }
    500       }
    501     } else {
    502       for (size_t y = 0; y < ysize; ++y) {
    503         float* BUTTERAUGLI_RESTRICT row_uhf = uhf[1].Row(y);
    504         float* BUTTERAUGLI_RESTRICT row_hf = hf[1].Row(y);
    505         for (size_t x = 0; x < xsize; x += Lanes(d)) {
    506           auto hf = Load(d, row_hf + x);
    507           hf = MaximumClamp(d, hf, kMaxclampHf);
    508 
    509           auto uhf = Sub(Load(d, row_uhf + x), hf);
    510           uhf = MaximumClamp(d, uhf, kMaxclampUhf);
    511           uhf = Mul(uhf, Set(d, kMulYUhf));
    512           Store(uhf, d, row_uhf + x);
    513 
    514           hf = Mul(hf, Set(d, kMulYHf));
    515           hf = AmplifyRangeAroundZero(d, kAddHfRange, hf);
    516           Store(hf, d, row_hf + x);
    517         }
    518       }
    519     }
    520   }
    521   return true;
    522 }
    523 
    524 void DeallocateHFAndUHF(ImageF* hf, ImageF* uhf) {
    525   for (int i = 0; i < 2; ++i) {
    526     hf[i] = ImageF();
    527     uhf[i] = ImageF();
    528   }
    529 }
    530 
    531 Status SeparateFrequencies(size_t xsize, size_t ysize,
    532                            const ButteraugliParams& params, BlurTemp* blur_temp,
    533                            const Image3F& xyb, PsychoImage& ps) {
    534   JXL_ASSIGN_OR_RETURN(ps.lf, Image3F::Create(xyb.xsize(), xyb.ysize()));
    535   JXL_ASSIGN_OR_RETURN(ps.mf, Image3F::Create(xyb.xsize(), xyb.ysize()));
    536   JXL_RETURN_IF_ERROR(SeparateLFAndMF(params, xyb, &ps.lf, &ps.mf, blur_temp));
    537   JXL_RETURN_IF_ERROR(SeparateMFAndHF(params, &ps.mf, &ps.hf[0], blur_temp));
    538   JXL_RETURN_IF_ERROR(
    539       SeparateHFAndUHF(params, &ps.hf[0], &ps.uhf[0], blur_temp));
    540   return true;
    541 }
    542 
    543 namespace {
    544 template <typename V>
    545 BUTTERAUGLI_INLINE V Sum(V a, V b, V c, V d) {
    546   return Add(Add(a, b), Add(c, d));
    547 }
    548 template <typename V>
    549 BUTTERAUGLI_INLINE V Sum(V a, V b, V c, V d, V e) {
    550   return Sum(a, b, c, Add(d, e));
    551 }
    552 template <typename V>
    553 BUTTERAUGLI_INLINE V Sum(V a, V b, V c, V d, V e, V f, V g) {
    554   return Sum(a, b, c, Sum(d, e, f, g));
    555 }
    556 template <typename V>
    557 BUTTERAUGLI_INLINE V Sum(V a, V b, V c, V d, V e, V f, V g, V h, V i) {
    558   return Add(Add(Sum(a, b, c, d), Sum(e, f, g, h)), i);
    559 }
    560 }  // namespace
    561 
    562 template <class D>
    563 Vec<D> MaltaUnit(MaltaTagLF /*tag*/, const D df,
    564                  const float* BUTTERAUGLI_RESTRICT d, const intptr_t xs) {
    565   const intptr_t xs3 = 3 * xs;
    566 
    567   const auto center = LoadU(df, d);
    568 
    569   // x grows, y constant
    570   const auto sum_yconst = Sum(LoadU(df, d - 4), LoadU(df, d - 2), center,
    571                               LoadU(df, d + 2), LoadU(df, d + 4));
    572   // Will return this, sum of all line kernels
    573   auto retval = Mul(sum_yconst, sum_yconst);
    574   {
    575     // y grows, x constant
    576     auto sum = Sum(LoadU(df, d - xs3 - xs), LoadU(df, d - xs - xs), center,
    577                    LoadU(df, d + xs + xs), LoadU(df, d + xs3 + xs));
    578     retval = MulAdd(sum, sum, retval);
    579   }
    580   {
    581     // both grow
    582     auto sum = Sum(LoadU(df, d - xs3 - 3), LoadU(df, d - xs - xs - 2), center,
    583                    LoadU(df, d + xs + xs + 2), LoadU(df, d + xs3 + 3));
    584     retval = MulAdd(sum, sum, retval);
    585   }
    586   {
    587     // y grows, x shrinks
    588     auto sum = Sum(LoadU(df, d - xs3 + 3), LoadU(df, d - xs - xs + 2), center,
    589                    LoadU(df, d + xs + xs - 2), LoadU(df, d + xs3 - 3));
    590     retval = MulAdd(sum, sum, retval);
    591   }
    592   {
    593     // y grows -4 to 4, x shrinks 1 -> -1
    594     auto sum =
    595         Sum(LoadU(df, d - xs3 - xs + 1), LoadU(df, d - xs - xs + 1), center,
    596             LoadU(df, d + xs + xs - 1), LoadU(df, d + xs3 + xs - 1));
    597     retval = MulAdd(sum, sum, retval);
    598   }
    599   {
    600     //  y grows -4 to 4, x grows -1 -> 1
    601     auto sum =
    602         Sum(LoadU(df, d - xs3 - xs - 1), LoadU(df, d - xs - xs - 1), center,
    603             LoadU(df, d + xs + xs + 1), LoadU(df, d + xs3 + xs + 1));
    604     retval = MulAdd(sum, sum, retval);
    605   }
    606   {
    607     // x grows -4 to 4, y grows -1 to 1
    608     auto sum = Sum(LoadU(df, d - 4 - xs), LoadU(df, d - 2 - xs), center,
    609                    LoadU(df, d + 2 + xs), LoadU(df, d + 4 + xs));
    610     retval = MulAdd(sum, sum, retval);
    611   }
    612   {
    613     // x grows -4 to 4, y shrinks 1 to -1
    614     auto sum = Sum(LoadU(df, d - 4 + xs), LoadU(df, d - 2 + xs), center,
    615                    LoadU(df, d + 2 - xs), LoadU(df, d + 4 - xs));
    616     retval = MulAdd(sum, sum, retval);
    617   }
    618   {
    619     /* 0_________
    620        1__*______
    621        2___*_____
    622        3_________
    623        4____0____
    624        5_________
    625        6_____*___
    626        7______*__
    627        8_________ */
    628     auto sum = Sum(LoadU(df, d - xs3 - 2), LoadU(df, d - xs - xs - 1), center,
    629                    LoadU(df, d + xs + xs + 1), LoadU(df, d + xs3 + 2));
    630     retval = MulAdd(sum, sum, retval);
    631   }
    632   {
    633     /* 0_________
    634        1______*__
    635        2_____*___
    636        3_________
    637        4____0____
    638        5_________
    639        6___*_____
    640        7__*______
    641        8_________ */
    642     auto sum = Sum(LoadU(df, d - xs3 + 2), LoadU(df, d - xs - xs + 1), center,
    643                    LoadU(df, d + xs + xs - 1), LoadU(df, d + xs3 - 2));
    644     retval = MulAdd(sum, sum, retval);
    645   }
    646   {
    647     /* 0_________
    648        1_________
    649        2_*_______
    650        3__*______
    651        4____0____
    652        5______*__
    653        6_______*_
    654        7_________
    655        8_________ */
    656     auto sum = Sum(LoadU(df, d - xs - xs - 3), LoadU(df, d - xs - 2), center,
    657                    LoadU(df, d + xs + 2), LoadU(df, d + xs + xs + 3));
    658     retval = MulAdd(sum, sum, retval);
    659   }
    660   {
    661     /* 0_________
    662        1_________
    663        2_______*_
    664        3______*__
    665        4____0____
    666        5__*______
    667        6_*_______
    668        7_________
    669        8_________ */
    670     auto sum = Sum(LoadU(df, d - xs - xs + 3), LoadU(df, d - xs + 2), center,
    671                    LoadU(df, d + xs - 2), LoadU(df, d + xs + xs - 3));
    672     retval = MulAdd(sum, sum, retval);
    673   }
    674   {
    675     /* 0_________
    676        1_________
    677        2________*
    678        3______*__
    679        4____0____
    680        5__*______
    681        6*________
    682        7_________
    683        8_________ */
    684 
    685     auto sum = Sum(LoadU(df, d + xs + xs - 4), LoadU(df, d + xs - 2), center,
    686                    LoadU(df, d - xs + 2), LoadU(df, d - xs - xs + 4));
    687     retval = MulAdd(sum, sum, retval);
    688   }
    689   {
    690     /* 0_________
    691        1_________
    692        2*________
    693        3__*______
    694        4____0____
    695        5______*__
    696        6________*
    697        7_________
    698        8_________ */
    699     auto sum = Sum(LoadU(df, d - xs - xs - 4), LoadU(df, d - xs - 2), center,
    700                    LoadU(df, d + xs + 2), LoadU(df, d + xs + xs + 4));
    701     retval = MulAdd(sum, sum, retval);
    702   }
    703   {
    704     /* 0__*______
    705        1_________
    706        2___*_____
    707        3_________
    708        4____0____
    709        5_________
    710        6_____*___
    711        7_________
    712        8______*__ */
    713     auto sum =
    714         Sum(LoadU(df, d - xs3 - xs - 2), LoadU(df, d - xs - xs - 1), center,
    715             LoadU(df, d + xs + xs + 1), LoadU(df, d + xs3 + xs + 2));
    716     retval = MulAdd(sum, sum, retval);
    717   }
    718   {
    719     /* 0______*__
    720        1_________
    721        2_____*___
    722        3_________
    723        4____0____
    724        5_________
    725        6___*_____
    726        7_________
    727        8__*______ */
    728     auto sum =
    729         Sum(LoadU(df, d - xs3 - xs + 2), LoadU(df, d - xs - xs + 1), center,
    730             LoadU(df, d + xs + xs - 1), LoadU(df, d + xs3 + xs - 2));
    731     retval = MulAdd(sum, sum, retval);
    732   }
    733   return retval;
    734 }
    735 
    736 template <class D>
    737 Vec<D> MaltaUnit(MaltaTag /*tag*/, const D df,
    738                  const float* BUTTERAUGLI_RESTRICT d, const intptr_t xs) {
    739   const intptr_t xs3 = 3 * xs;
    740 
    741   const auto center = LoadU(df, d);
    742 
    743   // x grows, y constant
    744   const auto sum_yconst =
    745       Sum(LoadU(df, d - 4), LoadU(df, d - 3), LoadU(df, d - 2),
    746           LoadU(df, d - 1), center, LoadU(df, d + 1), LoadU(df, d + 2),
    747           LoadU(df, d + 3), LoadU(df, d + 4));
    748   // Will return this, sum of all line kernels
    749   auto retval = Mul(sum_yconst, sum_yconst);
    750 
    751   {
    752     // y grows, x constant
    753     auto sum = Sum(LoadU(df, d - xs3 - xs), LoadU(df, d - xs3),
    754                    LoadU(df, d - xs - xs), LoadU(df, d - xs), center,
    755                    LoadU(df, d + xs), LoadU(df, d + xs + xs),
    756                    LoadU(df, d + xs3), LoadU(df, d + xs3 + xs));
    757     retval = MulAdd(sum, sum, retval);
    758   }
    759   {
    760     // both grow
    761     auto sum = Sum(LoadU(df, d - xs3 - 3), LoadU(df, d - xs - xs - 2),
    762                    LoadU(df, d - xs - 1), center, LoadU(df, d + xs + 1),
    763                    LoadU(df, d + xs + xs + 2), LoadU(df, d + xs3 + 3));
    764     retval = MulAdd(sum, sum, retval);
    765   }
    766   {
    767     // y grows, x shrinks
    768     auto sum = Sum(LoadU(df, d - xs3 + 3), LoadU(df, d - xs - xs + 2),
    769                    LoadU(df, d - xs + 1), center, LoadU(df, d + xs - 1),
    770                    LoadU(df, d + xs + xs - 2), LoadU(df, d + xs3 - 3));
    771     retval = MulAdd(sum, sum, retval);
    772   }
    773   {
    774     // y grows -4 to 4, x shrinks 1 -> -1
    775     auto sum = Sum(LoadU(df, d - xs3 - xs + 1), LoadU(df, d - xs3 + 1),
    776                    LoadU(df, d - xs - xs + 1), LoadU(df, d - xs), center,
    777                    LoadU(df, d + xs), LoadU(df, d + xs + xs - 1),
    778                    LoadU(df, d + xs3 - 1), LoadU(df, d + xs3 + xs - 1));
    779     retval = MulAdd(sum, sum, retval);
    780   }
    781   {
    782     //  y grows -4 to 4, x grows -1 -> 1
    783     auto sum = Sum(LoadU(df, d - xs3 - xs - 1), LoadU(df, d - xs3 - 1),
    784                    LoadU(df, d - xs - xs - 1), LoadU(df, d - xs), center,
    785                    LoadU(df, d + xs), LoadU(df, d + xs + xs + 1),
    786                    LoadU(df, d + xs3 + 1), LoadU(df, d + xs3 + xs + 1));
    787     retval = MulAdd(sum, sum, retval);
    788   }
    789   {
    790     // x grows -4 to 4, y grows -1 to 1
    791     auto sum =
    792         Sum(LoadU(df, d - 4 - xs), LoadU(df, d - 3 - xs), LoadU(df, d - 2 - xs),
    793             LoadU(df, d - 1), center, LoadU(df, d + 1), LoadU(df, d + 2 + xs),
    794             LoadU(df, d + 3 + xs), LoadU(df, d + 4 + xs));
    795     retval = MulAdd(sum, sum, retval);
    796   }
    797   {
    798     // x grows -4 to 4, y shrinks 1 to -1
    799     auto sum =
    800         Sum(LoadU(df, d - 4 + xs), LoadU(df, d - 3 + xs), LoadU(df, d - 2 + xs),
    801             LoadU(df, d - 1), center, LoadU(df, d + 1), LoadU(df, d + 2 - xs),
    802             LoadU(df, d + 3 - xs), LoadU(df, d + 4 - xs));
    803     retval = MulAdd(sum, sum, retval);
    804   }
    805   {
    806     /* 0_________
    807        1__*______
    808        2___*_____
    809        3___*_____
    810        4____0____
    811        5_____*___
    812        6_____*___
    813        7______*__
    814        8_________ */
    815     auto sum = Sum(LoadU(df, d - xs3 - 2), LoadU(df, d - xs - xs - 1),
    816                    LoadU(df, d - xs - 1), center, LoadU(df, d + xs + 1),
    817                    LoadU(df, d + xs + xs + 1), LoadU(df, d + xs3 + 2));
    818     retval = MulAdd(sum, sum, retval);
    819   }
    820   {
    821     /* 0_________
    822        1______*__
    823        2_____*___
    824        3_____*___
    825        4____0____
    826        5___*_____
    827        6___*_____
    828        7__*______
    829        8_________ */
    830     auto sum = Sum(LoadU(df, d - xs3 + 2), LoadU(df, d - xs - xs + 1),
    831                    LoadU(df, d - xs + 1), center, LoadU(df, d + xs - 1),
    832                    LoadU(df, d + xs + xs - 1), LoadU(df, d + xs3 - 2));
    833     retval = MulAdd(sum, sum, retval);
    834   }
    835   {
    836     /* 0_________
    837        1_________
    838        2_*_______
    839        3__**_____
    840        4____0____
    841        5_____**__
    842        6_______*_
    843        7_________
    844        8_________ */
    845     auto sum = Sum(LoadU(df, d - xs - xs - 3), LoadU(df, d - xs - 2),
    846                    LoadU(df, d - xs - 1), center, LoadU(df, d + xs + 1),
    847                    LoadU(df, d + xs + 2), LoadU(df, d + xs + xs + 3));
    848     retval = MulAdd(sum, sum, retval);
    849   }
    850   {
    851     /* 0_________
    852        1_________
    853        2_______*_
    854        3_____**__
    855        4____0____
    856        5__**_____
    857        6_*_______
    858        7_________
    859        8_________ */
    860     auto sum = Sum(LoadU(df, d - xs - xs + 3), LoadU(df, d - xs + 2),
    861                    LoadU(df, d - xs + 1), center, LoadU(df, d + xs - 1),
    862                    LoadU(df, d + xs - 2), LoadU(df, d + xs + xs - 3));
    863     retval = MulAdd(sum, sum, retval);
    864   }
    865   {
    866     /* 0_________
    867        1_________
    868        2_________
    869        3______***
    870        4___*0*___
    871        5***______
    872        6_________
    873        7_________
    874        8_________ */
    875 
    876     auto sum =
    877         Sum(LoadU(df, d + xs - 4), LoadU(df, d + xs - 3), LoadU(df, d + xs - 2),
    878             LoadU(df, d - 1), center, LoadU(df, d + 1), LoadU(df, d - xs + 2),
    879             LoadU(df, d - xs + 3), LoadU(df, d - xs + 4));
    880     retval = MulAdd(sum, sum, retval);
    881   }
    882   {
    883     /* 0_________
    884        1_________
    885        2_________
    886        3***______
    887        4___*0*___
    888        5______***
    889        6_________
    890        7_________
    891        8_________ */
    892     auto sum =
    893         Sum(LoadU(df, d - xs - 4), LoadU(df, d - xs - 3), LoadU(df, d - xs - 2),
    894             LoadU(df, d - 1), center, LoadU(df, d + 1), LoadU(df, d + xs + 2),
    895             LoadU(df, d + xs + 3), LoadU(df, d + xs + 4));
    896     retval = MulAdd(sum, sum, retval);
    897   }
    898   {
    899     /* 0___*_____
    900        1___*_____
    901        2___*_____
    902        3____*____
    903        4____0____
    904        5____*____
    905        6_____*___
    906        7_____*___
    907        8_____*___ */
    908     auto sum = Sum(LoadU(df, d - xs3 - xs - 1), LoadU(df, d - xs3 - 1),
    909                    LoadU(df, d - xs - xs - 1), LoadU(df, d - xs), center,
    910                    LoadU(df, d + xs), LoadU(df, d + xs + xs + 1),
    911                    LoadU(df, d + xs3 + 1), LoadU(df, d + xs3 + xs + 1));
    912     retval = MulAdd(sum, sum, retval);
    913   }
    914   {
    915     /* 0_____*___
    916        1_____*___
    917        2____ *___
    918        3____*____
    919        4____0____
    920        5____*____
    921        6___*_____
    922        7___*_____
    923        8___*_____ */
    924     auto sum = Sum(LoadU(df, d - xs3 - xs + 1), LoadU(df, d - xs3 + 1),
    925                    LoadU(df, d - xs - xs + 1), LoadU(df, d - xs), center,
    926                    LoadU(df, d + xs), LoadU(df, d + xs + xs - 1),
    927                    LoadU(df, d + xs3 - 1), LoadU(df, d + xs3 + xs - 1));
    928     retval = MulAdd(sum, sum, retval);
    929   }
    930   return retval;
    931 }
    932 
    933 // Returns MaltaUnit. Avoids bounds-checks when x0 and y0 are known
    934 // to be far enough from the image borders. "diffs" is a packed image.
    935 template <class Tag>
    936 static BUTTERAUGLI_INLINE float PaddedMaltaUnit(const ImageF& diffs,
    937                                                 const size_t x0,
    938                                                 const size_t y0) {
    939   const float* BUTTERAUGLI_RESTRICT d = diffs.ConstRow(y0) + x0;
    940   const HWY_CAPPED(float, 1) df;
    941   if ((x0 >= 4 && y0 >= 4 && x0 < (diffs.xsize() - 4) &&
    942        y0 < (diffs.ysize() - 4))) {
    943     return GetLane(MaltaUnit(Tag(), df, d, diffs.PixelsPerRow()));
    944   }
    945 
    946   float borderimage[12 * 9];  // round up to 4
    947   for (int dy = 0; dy < 9; ++dy) {
    948     int y = y0 + dy - 4;
    949     if (y < 0 || static_cast<size_t>(y) >= diffs.ysize()) {
    950       for (int dx = 0; dx < 12; ++dx) {
    951         borderimage[dy * 12 + dx] = 0.0f;
    952       }
    953       continue;
    954     }
    955 
    956     const float* row_diffs = diffs.ConstRow(y);
    957     for (int dx = 0; dx < 9; ++dx) {
    958       int x = x0 + dx - 4;
    959       if (x < 0 || static_cast<size_t>(x) >= diffs.xsize()) {
    960         borderimage[dy * 12 + dx] = 0.0f;
    961       } else {
    962         borderimage[dy * 12 + dx] = row_diffs[x];
    963       }
    964     }
    965     std::fill(borderimage + dy * 12 + 9, borderimage + dy * 12 + 12, 0.0f);
    966   }
    967   return GetLane(MaltaUnit(Tag(), df, &borderimage[4 * 12 + 4], 12));
    968 }
    969 
    970 template <class Tag>
    971 static void MaltaDiffMapT(const Tag tag, const ImageF& lum0, const ImageF& lum1,
    972                           const double w_0gt1, const double w_0lt1,
    973                           const double norm1, const double len,
    974                           const double mulli, ImageF* HWY_RESTRICT diffs,
    975                           ImageF* HWY_RESTRICT block_diff_ac) {
    976   JXL_DASSERT(SameSize(lum0, lum1) && SameSize(lum0, *diffs));
    977   const size_t xsize_ = lum0.xsize();
    978   const size_t ysize_ = lum0.ysize();
    979 
    980   const float kWeight0 = 0.5;
    981   const float kWeight1 = 0.33;
    982 
    983   const double w_pre0gt1 = mulli * std::sqrt(kWeight0 * w_0gt1) / (len * 2 + 1);
    984   const double w_pre0lt1 = mulli * std::sqrt(kWeight1 * w_0lt1) / (len * 2 + 1);
    985   const float norm2_0gt1 = w_pre0gt1 * norm1;
    986   const float norm2_0lt1 = w_pre0lt1 * norm1;
    987 
    988   for (size_t y = 0; y < ysize_; ++y) {
    989     const float* HWY_RESTRICT row0 = lum0.ConstRow(y);
    990     const float* HWY_RESTRICT row1 = lum1.ConstRow(y);
    991     float* HWY_RESTRICT row_diffs = diffs->Row(y);
    992     for (size_t x = 0; x < xsize_; ++x) {
    993       const float absval = 0.5f * (std::abs(row0[x]) + std::abs(row1[x]));
    994       const float diff = row0[x] - row1[x];
    995       const float scaler = norm2_0gt1 / (static_cast<float>(norm1) + absval);
    996 
    997       // Primary symmetric quadratic objective.
    998       row_diffs[x] = scaler * diff;
    999 
   1000       const float scaler2 = norm2_0lt1 / (static_cast<float>(norm1) + absval);
   1001       const double fabs0 = std::fabs(row0[x]);
   1002 
   1003       // Secondary half-open quadratic objectives.
   1004       const double too_small = 0.55 * fabs0;
   1005       const double too_big = 1.05 * fabs0;
   1006 
   1007       if (row0[x] < 0) {
   1008         if (row1[x] > -too_small) {
   1009           double impact = scaler2 * (row1[x] + too_small);
   1010           row_diffs[x] -= impact;
   1011         } else if (row1[x] < -too_big) {
   1012           double impact = scaler2 * (-row1[x] - too_big);
   1013           row_diffs[x] += impact;
   1014         }
   1015       } else {
   1016         if (row1[x] < too_small) {
   1017           double impact = scaler2 * (too_small - row1[x]);
   1018           row_diffs[x] += impact;
   1019         } else if (row1[x] > too_big) {
   1020           double impact = scaler2 * (row1[x] - too_big);
   1021           row_diffs[x] -= impact;
   1022         }
   1023       }
   1024     }
   1025   }
   1026 
   1027   size_t y0 = 0;
   1028   // Top
   1029   for (; y0 < 4; ++y0) {
   1030     float* BUTTERAUGLI_RESTRICT row_diff = block_diff_ac->Row(y0);
   1031     for (size_t x0 = 0; x0 < xsize_; ++x0) {
   1032       row_diff[x0] += PaddedMaltaUnit<Tag>(*diffs, x0, y0);
   1033     }
   1034   }
   1035 
   1036   const HWY_FULL(float) df;
   1037   const size_t aligned_x = std::max(static_cast<size_t>(4), Lanes(df));
   1038   const intptr_t stride = diffs->PixelsPerRow();
   1039 
   1040   // Middle
   1041   for (; y0 < ysize_ - 4; ++y0) {
   1042     const float* BUTTERAUGLI_RESTRICT row_in = diffs->ConstRow(y0);
   1043     float* BUTTERAUGLI_RESTRICT row_diff = block_diff_ac->Row(y0);
   1044     size_t x0 = 0;
   1045     for (; x0 < aligned_x; ++x0) {
   1046       row_diff[x0] += PaddedMaltaUnit<Tag>(*diffs, x0, y0);
   1047     }
   1048     for (; x0 + Lanes(df) + 4 <= xsize_; x0 += Lanes(df)) {
   1049       auto diff = Load(df, row_diff + x0);
   1050       diff = Add(diff, MaltaUnit(Tag(), df, row_in + x0, stride));
   1051       Store(diff, df, row_diff + x0);
   1052     }
   1053 
   1054     for (; x0 < xsize_; ++x0) {
   1055       row_diff[x0] += PaddedMaltaUnit<Tag>(*diffs, x0, y0);
   1056     }
   1057   }
   1058 
   1059   // Bottom
   1060   for (; y0 < ysize_; ++y0) {
   1061     float* BUTTERAUGLI_RESTRICT row_diff = block_diff_ac->Row(y0);
   1062     for (size_t x0 = 0; x0 < xsize_; ++x0) {
   1063       row_diff[x0] += PaddedMaltaUnit<Tag>(*diffs, x0, y0);
   1064     }
   1065   }
   1066 }
   1067 
   1068 // Need non-template wrapper functions for HWY_EXPORT.
   1069 void MaltaDiffMap(const ImageF& lum0, const ImageF& lum1, const double w_0gt1,
   1070                   const double w_0lt1, const double norm1,
   1071                   ImageF* HWY_RESTRICT diffs,
   1072                   ImageF* HWY_RESTRICT block_diff_ac) {
   1073   const double len = 3.75;
   1074   static const double mulli = 0.39905817637;
   1075   MaltaDiffMapT(MaltaTag(), lum0, lum1, w_0gt1, w_0lt1, norm1, len, mulli,
   1076                 diffs, block_diff_ac);
   1077 }
   1078 
   1079 void MaltaDiffMapLF(const ImageF& lum0, const ImageF& lum1, const double w_0gt1,
   1080                     const double w_0lt1, const double norm1,
   1081                     ImageF* HWY_RESTRICT diffs,
   1082                     ImageF* HWY_RESTRICT block_diff_ac) {
   1083   const double len = 3.75;
   1084   static const double mulli = 0.611612573796;
   1085   MaltaDiffMapT(MaltaTagLF(), lum0, lum1, w_0gt1, w_0lt1, norm1, len, mulli,
   1086                 diffs, block_diff_ac);
   1087 }
   1088 
   1089 void CombineChannelsForMasking(const ImageF* hf, const ImageF* uhf,
   1090                                ImageF* out) {
   1091   // Only X and Y components are involved in masking. B's influence
   1092   // is considered less important in the high frequency area, and we
   1093   // don't model masking from lower frequency signals.
   1094   static const float muls[3] = {
   1095       2.5f,
   1096       0.4f,
   1097       0.4f,
   1098   };
   1099   // Silly and unoptimized approach here. TODO(jyrki): rework this.
   1100   for (size_t y = 0; y < hf[0].ysize(); ++y) {
   1101     const float* BUTTERAUGLI_RESTRICT row_y_hf = hf[1].Row(y);
   1102     const float* BUTTERAUGLI_RESTRICT row_y_uhf = uhf[1].Row(y);
   1103     const float* BUTTERAUGLI_RESTRICT row_x_hf = hf[0].Row(y);
   1104     const float* BUTTERAUGLI_RESTRICT row_x_uhf = uhf[0].Row(y);
   1105     float* BUTTERAUGLI_RESTRICT row = out->Row(y);
   1106     for (size_t x = 0; x < hf[0].xsize(); ++x) {
   1107       float xdiff = (row_x_uhf[x] + row_x_hf[x]) * muls[0];
   1108       float ydiff = row_y_uhf[x] * muls[1] + row_y_hf[x] * muls[2];
   1109       row[x] = xdiff * xdiff + ydiff * ydiff;
   1110       row[x] = std::sqrt(row[x]);
   1111     }
   1112   }
   1113 }
   1114 
   1115 void DiffPrecompute(const ImageF& xyb, float mul, float bias_arg, ImageF* out) {
   1116   const size_t xsize = xyb.xsize();
   1117   const size_t ysize = xyb.ysize();
   1118   const float bias = mul * bias_arg;
   1119   const float sqrt_bias = std::sqrt(bias);
   1120   for (size_t y = 0; y < ysize; ++y) {
   1121     const float* BUTTERAUGLI_RESTRICT row_in = xyb.Row(y);
   1122     float* BUTTERAUGLI_RESTRICT row_out = out->Row(y);
   1123     for (size_t x = 0; x < xsize; ++x) {
   1124       // kBias makes sqrt behave more linearly.
   1125       row_out[x] = std::sqrt(mul * std::abs(row_in[x]) + bias) - sqrt_bias;
   1126     }
   1127   }
   1128 }
   1129 
   1130 // std::log(80.0) / std::log(255.0);
   1131 constexpr float kIntensityTargetNormalizationHack = 0.79079917404f;
   1132 static const float kInternalGoodQualityThreshold =
   1133     17.83f * kIntensityTargetNormalizationHack;
   1134 static const float kGlobalScale = 1.0 / kInternalGoodQualityThreshold;
   1135 
   1136 void StoreMin3(const float v, float& min0, float& min1, float& min2) {
   1137   if (v < min2) {
   1138     if (v < min0) {
   1139       min2 = min1;
   1140       min1 = min0;
   1141       min0 = v;
   1142     } else if (v < min1) {
   1143       min2 = min1;
   1144       min1 = v;
   1145     } else {
   1146       min2 = v;
   1147     }
   1148   }
   1149 }
   1150 
   1151 // Look for smooth areas near the area of degradation.
   1152 // If the areas area generally smooth, don't do masking.
   1153 void FuzzyErosion(const ImageF& from, ImageF* to) {
   1154   const size_t xsize = from.xsize();
   1155   const size_t ysize = from.ysize();
   1156   static const int kStep = 3;
   1157   for (size_t y = 0; y < ysize; ++y) {
   1158     for (size_t x = 0; x < xsize; ++x) {
   1159       float min0 = from.Row(y)[x];
   1160       float min1 = 2 * min0;
   1161       float min2 = min1;
   1162       if (x >= kStep) {
   1163         float v = from.Row(y)[x - kStep];
   1164         StoreMin3(v, min0, min1, min2);
   1165         if (y >= kStep) {
   1166           float v = from.Row(y - kStep)[x - kStep];
   1167           StoreMin3(v, min0, min1, min2);
   1168         }
   1169         if (y < ysize - kStep) {
   1170           float v = from.Row(y + kStep)[x - kStep];
   1171           StoreMin3(v, min0, min1, min2);
   1172         }
   1173       }
   1174       if (x < xsize - kStep) {
   1175         float v = from.Row(y)[x + kStep];
   1176         StoreMin3(v, min0, min1, min2);
   1177         if (y >= kStep) {
   1178           float v = from.Row(y - kStep)[x + kStep];
   1179           StoreMin3(v, min0, min1, min2);
   1180         }
   1181         if (y < ysize - kStep) {
   1182           float v = from.Row(y + kStep)[x + kStep];
   1183           StoreMin3(v, min0, min1, min2);
   1184         }
   1185       }
   1186       if (y >= kStep) {
   1187         float v = from.Row(y - kStep)[x];
   1188         StoreMin3(v, min0, min1, min2);
   1189       }
   1190       if (y < ysize - kStep) {
   1191         float v = from.Row(y + kStep)[x];
   1192         StoreMin3(v, min0, min1, min2);
   1193       }
   1194       to->Row(y)[x] = (0.45f * min0 + 0.3f * min1 + 0.25f * min2);
   1195     }
   1196   }
   1197 }
   1198 
   1199 // Compute values of local frequency and dc masking based on the activity
   1200 // in the two images. img_diff_ac may be null.
   1201 Status Mask(const ImageF& mask0, const ImageF& mask1,
   1202             const ButteraugliParams& params, BlurTemp* blur_temp,
   1203             ImageF* BUTTERAUGLI_RESTRICT mask,
   1204             ImageF* BUTTERAUGLI_RESTRICT diff_ac) {
   1205   const size_t xsize = mask0.xsize();
   1206   const size_t ysize = mask0.ysize();
   1207   JXL_ASSIGN_OR_RETURN(*mask, ImageF::Create(xsize, ysize));
   1208   static const float kMul = 6.19424080439;
   1209   static const float kBias = 12.61050594197;
   1210   static const float kRadius = 2.7;
   1211   JXL_ASSIGN_OR_RETURN(ImageF diff0, ImageF::Create(xsize, ysize));
   1212   JXL_ASSIGN_OR_RETURN(ImageF diff1, ImageF::Create(xsize, ysize));
   1213   JXL_ASSIGN_OR_RETURN(ImageF blurred0, ImageF::Create(xsize, ysize));
   1214   JXL_ASSIGN_OR_RETURN(ImageF blurred1, ImageF::Create(xsize, ysize));
   1215   DiffPrecompute(mask0, kMul, kBias, &diff0);
   1216   DiffPrecompute(mask1, kMul, kBias, &diff1);
   1217   JXL_RETURN_IF_ERROR(Blur(diff0, kRadius, params, blur_temp, &blurred0));
   1218   FuzzyErosion(blurred0, &diff0);
   1219   JXL_RETURN_IF_ERROR(Blur(diff1, kRadius, params, blur_temp, &blurred1));
   1220   for (size_t y = 0; y < ysize; ++y) {
   1221     for (size_t x = 0; x < xsize; ++x) {
   1222       mask->Row(y)[x] = diff0.Row(y)[x];
   1223       if (diff_ac != nullptr) {
   1224         static const float kMaskToErrorMul = 10.0;
   1225         float diff = blurred0.Row(y)[x] - blurred1.Row(y)[x];
   1226         diff_ac->Row(y)[x] += kMaskToErrorMul * diff * diff;
   1227       }
   1228     }
   1229   }
   1230   return true;
   1231 }
   1232 
   1233 // `diff_ac` may be null.
   1234 Status MaskPsychoImage(const PsychoImage& pi0, const PsychoImage& pi1,
   1235                        const size_t xsize, const size_t ysize,
   1236                        const ButteraugliParams& params, BlurTemp* blur_temp,
   1237                        ImageF* BUTTERAUGLI_RESTRICT mask,
   1238                        ImageF* BUTTERAUGLI_RESTRICT diff_ac) {
   1239   JXL_ASSIGN_OR_RETURN(ImageF mask0, ImageF::Create(xsize, ysize));
   1240   JXL_ASSIGN_OR_RETURN(ImageF mask1, ImageF::Create(xsize, ysize));
   1241   CombineChannelsForMasking(&pi0.hf[0], &pi0.uhf[0], &mask0);
   1242   CombineChannelsForMasking(&pi1.hf[0], &pi1.uhf[0], &mask1);
   1243   JXL_RETURN_IF_ERROR(Mask(mask0, mask1, params, blur_temp, mask, diff_ac));
   1244   return true;
   1245 }
   1246 
   1247 double MaskY(double delta) {
   1248   static const double offset = 0.829591754942;
   1249   static const double scaler = 0.451936922203;
   1250   static const double mul = 2.5485944793;
   1251   const double c = mul / ((scaler * delta) + offset);
   1252   const double retval = kGlobalScale * (1.0 + c);
   1253   return retval * retval;
   1254 }
   1255 
   1256 double MaskDcY(double delta) {
   1257   static const double offset = 0.20025578522;
   1258   static const double scaler = 3.87449418804;
   1259   static const double mul = 0.505054525019;
   1260   const double c = mul / ((scaler * delta) + offset);
   1261   const double retval = kGlobalScale * (1.0 + c);
   1262   return retval * retval;
   1263 }
   1264 
   1265 inline float MaskColor(const float color[3], const float mask) {
   1266   return color[0] * mask + color[1] * mask + color[2] * mask;
   1267 }
   1268 
   1269 // Diffmap := sqrt of sum{diff images by multiplied by X and Y/B masks}
   1270 void CombineChannelsToDiffmap(const ImageF& mask, const Image3F& block_diff_dc,
   1271                               const Image3F& block_diff_ac, float xmul,
   1272                               ImageF* result) {
   1273   JXL_CHECK(SameSize(mask, *result));
   1274   size_t xsize = mask.xsize();
   1275   size_t ysize = mask.ysize();
   1276   for (size_t y = 0; y < ysize; ++y) {
   1277     float* BUTTERAUGLI_RESTRICT row_out = result->Row(y);
   1278     for (size_t x = 0; x < xsize; ++x) {
   1279       float val = mask.Row(y)[x];
   1280       float maskval = MaskY(val);
   1281       float dc_maskval = MaskDcY(val);
   1282       float diff_dc[3];
   1283       float diff_ac[3];
   1284       for (int i = 0; i < 3; ++i) {
   1285         diff_dc[i] = block_diff_dc.PlaneRow(i, y)[x];
   1286         diff_ac[i] = block_diff_ac.PlaneRow(i, y)[x];
   1287       }
   1288       diff_ac[0] *= xmul;
   1289       diff_dc[0] *= xmul;
   1290       row_out[x] = std::sqrt(MaskColor(diff_dc, dc_maskval) +
   1291                              MaskColor(diff_ac, maskval));
   1292     }
   1293   }
   1294 }
   1295 
   1296 // Adds weighted L2 difference between i0 and i1 to diffmap.
   1297 static void L2Diff(const ImageF& i0, const ImageF& i1, const float w,
   1298                    ImageF* BUTTERAUGLI_RESTRICT diffmap) {
   1299   if (w == 0) return;
   1300 
   1301   const HWY_FULL(float) d;
   1302   const auto weight = Set(d, w);
   1303 
   1304   for (size_t y = 0; y < i0.ysize(); ++y) {
   1305     const float* BUTTERAUGLI_RESTRICT row0 = i0.ConstRow(y);
   1306     const float* BUTTERAUGLI_RESTRICT row1 = i1.ConstRow(y);
   1307     float* BUTTERAUGLI_RESTRICT row_diff = diffmap->Row(y);
   1308 
   1309     for (size_t x = 0; x < i0.xsize(); x += Lanes(d)) {
   1310       const auto diff = Sub(Load(d, row0 + x), Load(d, row1 + x));
   1311       const auto diff2 = Mul(diff, diff);
   1312       const auto prev = Load(d, row_diff + x);
   1313       Store(MulAdd(diff2, weight, prev), d, row_diff + x);
   1314     }
   1315   }
   1316 }
   1317 
   1318 // Initializes diffmap to the weighted L2 difference between i0 and i1.
   1319 static void SetL2Diff(const ImageF& i0, const ImageF& i1, const float w,
   1320                       ImageF* BUTTERAUGLI_RESTRICT diffmap) {
   1321   if (w == 0) return;
   1322 
   1323   const HWY_FULL(float) d;
   1324   const auto weight = Set(d, w);
   1325 
   1326   for (size_t y = 0; y < i0.ysize(); ++y) {
   1327     const float* BUTTERAUGLI_RESTRICT row0 = i0.ConstRow(y);
   1328     const float* BUTTERAUGLI_RESTRICT row1 = i1.ConstRow(y);
   1329     float* BUTTERAUGLI_RESTRICT row_diff = diffmap->Row(y);
   1330 
   1331     for (size_t x = 0; x < i0.xsize(); x += Lanes(d)) {
   1332       const auto diff = Sub(Load(d, row0 + x), Load(d, row1 + x));
   1333       const auto diff2 = Mul(diff, diff);
   1334       Store(Mul(diff2, weight), d, row_diff + x);
   1335     }
   1336   }
   1337 }
   1338 
   1339 // i0 is the original image.
   1340 // i1 is the deformed copy.
   1341 static void L2DiffAsymmetric(const ImageF& i0, const ImageF& i1, float w_0gt1,
   1342                              float w_0lt1,
   1343                              ImageF* BUTTERAUGLI_RESTRICT diffmap) {
   1344   if (w_0gt1 == 0 && w_0lt1 == 0) {
   1345     return;
   1346   }
   1347 
   1348   const HWY_FULL(float) d;
   1349   const auto vw_0gt1 = Set(d, w_0gt1 * 0.8);
   1350   const auto vw_0lt1 = Set(d, w_0lt1 * 0.8);
   1351 
   1352   for (size_t y = 0; y < i0.ysize(); ++y) {
   1353     const float* BUTTERAUGLI_RESTRICT row0 = i0.Row(y);
   1354     const float* BUTTERAUGLI_RESTRICT row1 = i1.Row(y);
   1355     float* BUTTERAUGLI_RESTRICT row_diff = diffmap->Row(y);
   1356 
   1357     for (size_t x = 0; x < i0.xsize(); x += Lanes(d)) {
   1358       const auto val0 = Load(d, row0 + x);
   1359       const auto val1 = Load(d, row1 + x);
   1360 
   1361       // Primary symmetric quadratic objective.
   1362       const auto diff = Sub(val0, val1);
   1363       auto total = MulAdd(Mul(diff, diff), vw_0gt1, Load(d, row_diff + x));
   1364 
   1365       // Secondary half-open quadratic objectives.
   1366       const auto fabs0 = Abs(val0);
   1367       const auto too_small = Mul(Set(d, 0.4), fabs0);
   1368       const auto too_big = fabs0;
   1369 
   1370       const auto if_neg = IfThenElse(
   1371           Gt(val1, Neg(too_small)), Add(val1, too_small),
   1372           IfThenElseZero(Lt(val1, Neg(too_big)), Sub(Neg(val1), too_big)));
   1373       const auto if_pos =
   1374           IfThenElse(Lt(val1, too_small), Sub(too_small, val1),
   1375                      IfThenElseZero(Gt(val1, too_big), Sub(val1, too_big)));
   1376       const auto v = IfThenElse(Lt(val0, Zero(d)), if_neg, if_pos);
   1377       total = MulAdd(vw_0lt1, Mul(v, v), total);
   1378       Store(total, d, row_diff + x);
   1379     }
   1380   }
   1381 }
   1382 
   1383 // A simple HDR compatible gamma function.
   1384 template <class DF, class V>
   1385 V Gamma(const DF df, V v) {
   1386   // ln(2) constant folded in because we want std::log but have FastLog2f.
   1387   const auto kRetMul = Set(df, 19.245013259874995f * 0.693147180559945f);
   1388   const auto kRetAdd = Set(df, -23.16046239805755);
   1389   // This should happen rarely, but may lead to a NaN in log, which is
   1390   // undesirable. Since negative photons don't exist we solve the NaNs by
   1391   // clamping here.
   1392   v = ZeroIfNegative(v);
   1393 
   1394   const auto biased = Add(v, Set(df, 9.9710635769299145));
   1395   const auto log = FastLog2f(df, biased);
   1396   // We could fold this into a custom Log2 polynomial, but there would be
   1397   // relatively little gain.
   1398   return MulAdd(kRetMul, log, kRetAdd);
   1399 }
   1400 
   1401 template <bool Clamp, class DF, class V>
   1402 BUTTERAUGLI_INLINE void OpsinAbsorbance(const DF df, const V& in0, const V& in1,
   1403                                         const V& in2, V* JXL_RESTRICT out0,
   1404                                         V* JXL_RESTRICT out1,
   1405                                         V* JXL_RESTRICT out2) {
   1406   // https://en.wikipedia.org/wiki/Photopsin absorbance modeling.
   1407   static const double mixi0 = 0.29956550340058319;
   1408   static const double mixi1 = 0.63373087833825936;
   1409   static const double mixi2 = 0.077705617820981968;
   1410   static const double mixi3 = 1.7557483643287353;
   1411   static const double mixi4 = 0.22158691104574774;
   1412   static const double mixi5 = 0.69391388044116142;
   1413   static const double mixi6 = 0.0987313588422;
   1414   static const double mixi7 = 1.7557483643287353;
   1415   static const double mixi8 = 0.02;
   1416   static const double mixi9 = 0.02;
   1417   static const double mixi10 = 0.20480129041026129;
   1418   static const double mixi11 = 12.226454707163354;
   1419 
   1420   const V mix0 = Set(df, mixi0);
   1421   const V mix1 = Set(df, mixi1);
   1422   const V mix2 = Set(df, mixi2);
   1423   const V mix3 = Set(df, mixi3);
   1424   const V mix4 = Set(df, mixi4);
   1425   const V mix5 = Set(df, mixi5);
   1426   const V mix6 = Set(df, mixi6);
   1427   const V mix7 = Set(df, mixi7);
   1428   const V mix8 = Set(df, mixi8);
   1429   const V mix9 = Set(df, mixi9);
   1430   const V mix10 = Set(df, mixi10);
   1431   const V mix11 = Set(df, mixi11);
   1432 
   1433   *out0 = MulAdd(mix0, in0, MulAdd(mix1, in1, MulAdd(mix2, in2, mix3)));
   1434   *out1 = MulAdd(mix4, in0, MulAdd(mix5, in1, MulAdd(mix6, in2, mix7)));
   1435   *out2 = MulAdd(mix8, in0, MulAdd(mix9, in1, MulAdd(mix10, in2, mix11)));
   1436 
   1437   if (Clamp) {
   1438     *out0 = Max(*out0, mix3);
   1439     *out1 = Max(*out1, mix7);
   1440     *out2 = Max(*out2, mix11);
   1441   }
   1442 }
   1443 
   1444 // `blurred` is a temporary image used inside this function and not returned.
   1445 Status OpsinDynamicsImage(const Image3F& rgb, const ButteraugliParams& params,
   1446                           Image3F* blurred, BlurTemp* blur_temp, Image3F* xyb) {
   1447   const double kSigma = 1.2;
   1448   JXL_RETURN_IF_ERROR(
   1449       Blur(rgb.Plane(0), kSigma, params, blur_temp, &blurred->Plane(0)));
   1450   JXL_RETURN_IF_ERROR(
   1451       Blur(rgb.Plane(1), kSigma, params, blur_temp, &blurred->Plane(1)));
   1452   JXL_RETURN_IF_ERROR(
   1453       Blur(rgb.Plane(2), kSigma, params, blur_temp, &blurred->Plane(2)));
   1454   const HWY_FULL(float) df;
   1455   const auto intensity_target_multiplier = Set(df, params.intensity_target);
   1456   for (size_t y = 0; y < rgb.ysize(); ++y) {
   1457     const float* row_r = rgb.ConstPlaneRow(0, y);
   1458     const float* row_g = rgb.ConstPlaneRow(1, y);
   1459     const float* row_b = rgb.ConstPlaneRow(2, y);
   1460     const float* row_blurred_r = blurred->ConstPlaneRow(0, y);
   1461     const float* row_blurred_g = blurred->ConstPlaneRow(1, y);
   1462     const float* row_blurred_b = blurred->ConstPlaneRow(2, y);
   1463     float* row_out_x = xyb->PlaneRow(0, y);
   1464     float* row_out_y = xyb->PlaneRow(1, y);
   1465     float* row_out_b = xyb->PlaneRow(2, y);
   1466     const auto min = Set(df, 1e-4f);
   1467     for (size_t x = 0; x < rgb.xsize(); x += Lanes(df)) {
   1468       auto sensitivity0 = Undefined(df);
   1469       auto sensitivity1 = Undefined(df);
   1470       auto sensitivity2 = Undefined(df);
   1471       {
   1472         // Calculate sensitivity based on the smoothed image gamma derivative.
   1473         auto pre_mixed0 = Undefined(df);
   1474         auto pre_mixed1 = Undefined(df);
   1475         auto pre_mixed2 = Undefined(df);
   1476         OpsinAbsorbance<true>(
   1477             df, Mul(Load(df, row_blurred_r + x), intensity_target_multiplier),
   1478             Mul(Load(df, row_blurred_g + x), intensity_target_multiplier),
   1479             Mul(Load(df, row_blurred_b + x), intensity_target_multiplier),
   1480             &pre_mixed0, &pre_mixed1, &pre_mixed2);
   1481         pre_mixed0 = Max(pre_mixed0, min);
   1482         pre_mixed1 = Max(pre_mixed1, min);
   1483         pre_mixed2 = Max(pre_mixed2, min);
   1484         sensitivity0 = Div(Gamma(df, pre_mixed0), pre_mixed0);
   1485         sensitivity1 = Div(Gamma(df, pre_mixed1), pre_mixed1);
   1486         sensitivity2 = Div(Gamma(df, pre_mixed2), pre_mixed2);
   1487         sensitivity0 = Max(sensitivity0, min);
   1488         sensitivity1 = Max(sensitivity1, min);
   1489         sensitivity2 = Max(sensitivity2, min);
   1490       }
   1491       auto cur_mixed0 = Undefined(df);
   1492       auto cur_mixed1 = Undefined(df);
   1493       auto cur_mixed2 = Undefined(df);
   1494       OpsinAbsorbance<false>(
   1495           df, Mul(Load(df, row_r + x), intensity_target_multiplier),
   1496           Mul(Load(df, row_g + x), intensity_target_multiplier),
   1497           Mul(Load(df, row_b + x), intensity_target_multiplier), &cur_mixed0,
   1498           &cur_mixed1, &cur_mixed2);
   1499       cur_mixed0 = Mul(cur_mixed0, sensitivity0);
   1500       cur_mixed1 = Mul(cur_mixed1, sensitivity1);
   1501       cur_mixed2 = Mul(cur_mixed2, sensitivity2);
   1502       // This is a kludge. The negative values should be zeroed away before
   1503       // blurring. Ideally there would be no negative values in the first place.
   1504       const auto min01 = Set(df, 1.7557483643287353f);
   1505       const auto min2 = Set(df, 12.226454707163354f);
   1506       cur_mixed0 = Max(cur_mixed0, min01);
   1507       cur_mixed1 = Max(cur_mixed1, min01);
   1508       cur_mixed2 = Max(cur_mixed2, min2);
   1509 
   1510       Store(Sub(cur_mixed0, cur_mixed1), df, row_out_x + x);
   1511       Store(Add(cur_mixed0, cur_mixed1), df, row_out_y + x);
   1512       Store(cur_mixed2, df, row_out_b + x);
   1513     }
   1514   }
   1515   return true;
   1516 }
   1517 
   1518 Status ButteraugliDiffmapInPlace(Image3F& image0, Image3F& image1,
   1519                                  const ButteraugliParams& params,
   1520                                  ImageF& diffmap) {
   1521   // image0 and image1 are in linear sRGB color space
   1522   const size_t xsize = image0.xsize();
   1523   const size_t ysize = image0.ysize();
   1524   BlurTemp blur_temp;
   1525   {
   1526     // Convert image0 and image1 to XYB in-place
   1527     JXL_ASSIGN_OR_RETURN(Image3F temp, Image3F::Create(xsize, ysize));
   1528     JXL_RETURN_IF_ERROR(
   1529         OpsinDynamicsImage(image0, params, &temp, &blur_temp, &image0));
   1530     JXL_RETURN_IF_ERROR(
   1531         OpsinDynamicsImage(image1, params, &temp, &blur_temp, &image1));
   1532   }
   1533   // image0 and image1 are in XYB color space
   1534   JXL_ASSIGN_OR_RETURN(ImageF block_diff_dc, ImageF::Create(xsize, ysize));
   1535   ZeroFillImage(&block_diff_dc);
   1536   {
   1537     // separate out LF components from image0 and image1 and compute the dc
   1538     // diff image from them
   1539     JXL_ASSIGN_OR_RETURN(Image3F lf0, Image3F::Create(xsize, ysize));
   1540     JXL_ASSIGN_OR_RETURN(Image3F lf1, Image3F::Create(xsize, ysize));
   1541     JXL_RETURN_IF_ERROR(
   1542         SeparateLFAndMF(params, image0, &lf0, &image0, &blur_temp));
   1543     JXL_RETURN_IF_ERROR(
   1544         SeparateLFAndMF(params, image1, &lf1, &image1, &blur_temp));
   1545     for (size_t c = 0; c < 3; ++c) {
   1546       L2Diff(lf0.Plane(c), lf1.Plane(c), wmul[6 + c], &block_diff_dc);
   1547     }
   1548   }
   1549   // image0 and image1 are MF residuals (before blurring) in XYB color space
   1550   ImageF hf0[2];
   1551   ImageF hf1[2];
   1552   JXL_RETURN_IF_ERROR(SeparateMFAndHF(params, &image0, &hf0[0], &blur_temp));
   1553   JXL_RETURN_IF_ERROR(SeparateMFAndHF(params, &image1, &hf1[0], &blur_temp));
   1554   // image0 and image1 are MF-images in XYB color space
   1555 
   1556   JXL_ASSIGN_OR_RETURN(ImageF block_diff_ac, ImageF::Create(xsize, ysize));
   1557   ZeroFillImage(&block_diff_ac);
   1558   // start accumulating ac diff image from MF images
   1559   {
   1560     JXL_ASSIGN_OR_RETURN(ImageF diffs, ImageF::Create(xsize, ysize));
   1561     MaltaDiffMapLF(image0.Plane(1), image1.Plane(1), wMfMalta, wMfMalta,
   1562                    norm1Mf, &diffs, &block_diff_ac);
   1563     MaltaDiffMapLF(image0.Plane(0), image1.Plane(0), wMfMaltaX, wMfMaltaX,
   1564                    norm1MfX, &diffs, &block_diff_ac);
   1565   }
   1566   for (size_t c = 0; c < 3; ++c) {
   1567     L2Diff(image0.Plane(c), image1.Plane(c), wmul[3 + c], &block_diff_ac);
   1568   }
   1569   // we will not need the MF-images and more, so we deallocate them to reduce
   1570   // peak memory usage
   1571   image0 = Image3F();
   1572   image1 = Image3F();
   1573 
   1574   ImageF uhf0[2];
   1575   ImageF uhf1[2];
   1576   JXL_RETURN_IF_ERROR(SeparateHFAndUHF(params, &hf0[0], &uhf0[0], &blur_temp));
   1577   JXL_RETURN_IF_ERROR(SeparateHFAndUHF(params, &hf1[0], &uhf1[0], &blur_temp));
   1578 
   1579   // continue accumulating ac diff image from HF and UHF images
   1580   const float hf_asymmetry = params.hf_asymmetry;
   1581   {
   1582     JXL_ASSIGN_OR_RETURN(ImageF diffs, ImageF::Create(xsize, ysize));
   1583     MaltaDiffMap(uhf0[1], uhf1[1], wUhfMalta * hf_asymmetry,
   1584                  wUhfMalta / hf_asymmetry, norm1Uhf, &diffs, &block_diff_ac);
   1585     MaltaDiffMap(uhf0[0], uhf1[0], wUhfMaltaX * hf_asymmetry,
   1586                  wUhfMaltaX / hf_asymmetry, norm1UhfX, &diffs, &block_diff_ac);
   1587     MaltaDiffMapLF(hf0[1], hf1[1], wHfMalta * std::sqrt(hf_asymmetry),
   1588                    wHfMalta / std::sqrt(hf_asymmetry), norm1Hf, &diffs,
   1589                    &block_diff_ac);
   1590     MaltaDiffMapLF(hf0[0], hf1[0], wHfMaltaX * std::sqrt(hf_asymmetry),
   1591                    wHfMaltaX / std::sqrt(hf_asymmetry), norm1HfX, &diffs,
   1592                    &block_diff_ac);
   1593   }
   1594   for (size_t c = 0; c < 2; ++c) {
   1595     L2DiffAsymmetric(hf0[c], hf1[c], wmul[c] * hf_asymmetry,
   1596                      wmul[c] / hf_asymmetry, &block_diff_ac);
   1597   }
   1598 
   1599   // compute mask image from HF and UHF X and Y images
   1600   JXL_ASSIGN_OR_RETURN(ImageF mask, ImageF::Create(xsize, ysize));
   1601   {
   1602     JXL_ASSIGN_OR_RETURN(ImageF mask0, ImageF::Create(xsize, ysize));
   1603     JXL_ASSIGN_OR_RETURN(ImageF mask1, ImageF::Create(xsize, ysize));
   1604     CombineChannelsForMasking(&hf0[0], &uhf0[0], &mask0);
   1605     CombineChannelsForMasking(&hf1[0], &uhf1[0], &mask1);
   1606     DeallocateHFAndUHF(&hf1[0], &uhf1[0]);
   1607     DeallocateHFAndUHF(&hf0[0], &uhf0[0]);
   1608     JXL_RETURN_IF_ERROR(
   1609         Mask(mask0, mask1, params, &blur_temp, &mask, &block_diff_ac));
   1610   }
   1611 
   1612   // compute final diffmap from mask image and ac and dc diff images
   1613   JXL_ASSIGN_OR_RETURN(diffmap, ImageF::Create(xsize, ysize));
   1614   for (size_t y = 0; y < ysize; ++y) {
   1615     const float* row_dc = block_diff_dc.Row(y);
   1616     const float* row_ac = block_diff_ac.Row(y);
   1617     float* row_out = diffmap.Row(y);
   1618     for (size_t x = 0; x < xsize; ++x) {
   1619       const float val = mask.Row(y)[x];
   1620       row_out[x] = sqrt(row_dc[x] * MaskDcY(val) + row_ac[x] * MaskY(val));
   1621     }
   1622   }
   1623   return true;
   1624 }
   1625 
   1626 // NOLINTNEXTLINE(google-readability-namespace-comments)
   1627 }  // namespace HWY_NAMESPACE
   1628 }  // namespace jxl
   1629 HWY_AFTER_NAMESPACE();
   1630 
   1631 #if HWY_ONCE
   1632 namespace jxl {
   1633 
   1634 HWY_EXPORT(SeparateFrequencies);       // Local function.
   1635 HWY_EXPORT(MaskPsychoImage);           // Local function.
   1636 HWY_EXPORT(L2DiffAsymmetric);          // Local function.
   1637 HWY_EXPORT(L2Diff);                    // Local function.
   1638 HWY_EXPORT(SetL2Diff);                 // Local function.
   1639 HWY_EXPORT(CombineChannelsToDiffmap);  // Local function.
   1640 HWY_EXPORT(MaltaDiffMap);              // Local function.
   1641 HWY_EXPORT(MaltaDiffMapLF);            // Local function.
   1642 HWY_EXPORT(OpsinDynamicsImage);        // Local function.
   1643 HWY_EXPORT(ButteraugliDiffmapInPlace);  // Local function.
   1644 
   1645 #if BUTTERAUGLI_ENABLE_CHECKS
   1646 
   1647 static inline bool IsNan(const float x) {
   1648   uint32_t bits;
   1649   memcpy(&bits, &x, sizeof(bits));
   1650   const uint32_t bitmask_exp = 0x7F800000;
   1651   return (bits & bitmask_exp) == bitmask_exp && (bits & 0x7FFFFF);
   1652 }
   1653 
   1654 static inline bool IsNan(const double x) {
   1655   uint64_t bits;
   1656   memcpy(&bits, &x, sizeof(bits));
   1657   return (0x7ff0000000000001ULL <= bits && bits <= 0x7fffffffffffffffULL) ||
   1658          (0xfff0000000000001ULL <= bits && bits <= 0xffffffffffffffffULL);
   1659 }
   1660 
   1661 static inline void CheckImage(const ImageF& image, const char* name) {
   1662   for (size_t y = 0; y < image.ysize(); ++y) {
   1663     const float* BUTTERAUGLI_RESTRICT row = image.Row(y);
   1664     for (size_t x = 0; x < image.xsize(); ++x) {
   1665       if (IsNan(row[x])) {
   1666         printf("NAN: Image %s @ %" PRIuS ",%" PRIuS " (of %" PRIuS ",%" PRIuS
   1667                ")\n",
   1668                name, x, y, image.xsize(), image.ysize());
   1669         exit(1);
   1670       }
   1671     }
   1672   }
   1673 }
   1674 
   1675 #define CHECK_NAN(x, str)                \
   1676   do {                                   \
   1677     if (IsNan(x)) {                      \
   1678       printf("%d: %s\n", __LINE__, str); \
   1679       abort();                           \
   1680     }                                    \
   1681   } while (0)
   1682 
   1683 #define CHECK_IMAGE(image, name) CheckImage(image, name)
   1684 
   1685 #else  // BUTTERAUGLI_ENABLE_CHECKS
   1686 
   1687 #define CHECK_NAN(x, str)
   1688 #define CHECK_IMAGE(image, name)
   1689 
   1690 #endif  // BUTTERAUGLI_ENABLE_CHECKS
   1691 
   1692 // Calculate a 2x2 subsampled image for purposes of recursive butteraugli at
   1693 // multiresolution.
   1694 static StatusOr<Image3F> SubSample2x(const Image3F& in) {
   1695   size_t xs = (in.xsize() + 1) / 2;
   1696   size_t ys = (in.ysize() + 1) / 2;
   1697   JXL_ASSIGN_OR_RETURN(Image3F retval, Image3F::Create(xs, ys));
   1698   for (size_t c = 0; c < 3; ++c) {
   1699     for (size_t y = 0; y < ys; ++y) {
   1700       for (size_t x = 0; x < xs; ++x) {
   1701         retval.PlaneRow(c, y)[x] = 0;
   1702       }
   1703     }
   1704   }
   1705   for (size_t c = 0; c < 3; ++c) {
   1706     for (size_t y = 0; y < in.ysize(); ++y) {
   1707       for (size_t x = 0; x < in.xsize(); ++x) {
   1708         retval.PlaneRow(c, y / 2)[x / 2] += 0.25f * in.PlaneRow(c, y)[x];
   1709       }
   1710     }
   1711     if ((in.xsize() & 1) != 0) {
   1712       for (size_t y = 0; y < retval.ysize(); ++y) {
   1713         size_t last_column = retval.xsize() - 1;
   1714         retval.PlaneRow(c, y)[last_column] *= 2.0f;
   1715       }
   1716     }
   1717     if ((in.ysize() & 1) != 0) {
   1718       for (size_t x = 0; x < retval.xsize(); ++x) {
   1719         size_t last_row = retval.ysize() - 1;
   1720         retval.PlaneRow(c, last_row)[x] *= 2.0f;
   1721       }
   1722     }
   1723   }
   1724   return retval;
   1725 }
   1726 
   1727 // Supersample src by 2x and add it to dest.
   1728 static void AddSupersampled2x(const ImageF& src, float w, ImageF& dest) {
   1729   for (size_t y = 0; y < dest.ysize(); ++y) {
   1730     for (size_t x = 0; x < dest.xsize(); ++x) {
   1731       // There will be less errors from the more averaged images.
   1732       // We take it into account to some extent using a scaler.
   1733       static const double kHeuristicMixingValue = 0.3;
   1734       dest.Row(y)[x] *= 1.0 - kHeuristicMixingValue * w;
   1735       dest.Row(y)[x] += w * src.Row(y / 2)[x / 2];
   1736     }
   1737   }
   1738 }
   1739 
   1740 Image3F* ButteraugliComparator::Temp() const {
   1741   bool was_in_use = temp_in_use_.test_and_set(std::memory_order_acq_rel);
   1742   JXL_ASSERT(!was_in_use);
   1743   (void)was_in_use;
   1744   return &temp_;
   1745 }
   1746 
   1747 void ButteraugliComparator::ReleaseTemp() const { temp_in_use_.clear(); }
   1748 
   1749 ButteraugliComparator::ButteraugliComparator(size_t xsize, size_t ysize,
   1750                                              const ButteraugliParams& params)
   1751     : xsize_(xsize), ysize_(ysize), params_(params) {}
   1752 
   1753 StatusOr<std::unique_ptr<ButteraugliComparator>> ButteraugliComparator::Make(
   1754     const Image3F& rgb0, const ButteraugliParams& params) {
   1755   size_t xsize = rgb0.xsize();
   1756   size_t ysize = rgb0.ysize();
   1757   std::unique_ptr<ButteraugliComparator> result =
   1758       std::unique_ptr<ButteraugliComparator>(
   1759           new ButteraugliComparator(xsize, ysize, params));
   1760   JXL_ASSIGN_OR_RETURN(result->temp_, Image3F::Create(xsize, ysize));
   1761 
   1762   if (xsize < 8 || ysize < 8) {
   1763     return result;
   1764   }
   1765 
   1766   JXL_ASSIGN_OR_RETURN(Image3F xyb0, Image3F::Create(xsize, ysize));
   1767   JXL_RETURN_IF_ERROR(HWY_DYNAMIC_DISPATCH(OpsinDynamicsImage)(
   1768       rgb0, params, result->Temp(), &result->blur_temp_, &xyb0));
   1769   result->ReleaseTemp();
   1770   JXL_RETURN_IF_ERROR(HWY_DYNAMIC_DISPATCH(SeparateFrequencies)(
   1771       xsize, ysize, params, &result->blur_temp_, xyb0, result->pi0_));
   1772 
   1773   // Awful recursive construction of samples of different resolution.
   1774   // This is an after-thought and possibly somewhat parallel in
   1775   // functionality with the PsychoImage multi-resolution approach.
   1776   JXL_ASSIGN_OR_RETURN(Image3F subsampledRgb0, SubSample2x(rgb0));
   1777   StatusOr<std::unique_ptr<ButteraugliComparator>> sub =
   1778       ButteraugliComparator::Make(subsampledRgb0, params);
   1779   if (!sub.ok()) return sub.status();
   1780   result->sub_ = std::move(sub).value();
   1781 
   1782   return result;
   1783 }
   1784 
   1785 Status ButteraugliComparator::Mask(ImageF* BUTTERAUGLI_RESTRICT mask) const {
   1786   return HWY_DYNAMIC_DISPATCH(MaskPsychoImage)(
   1787       pi0_, pi0_, xsize_, ysize_, params_, &blur_temp_, mask, nullptr);
   1788 }
   1789 
   1790 Status ButteraugliComparator::Diffmap(const Image3F& rgb1,
   1791                                       ImageF& result) const {
   1792   if (xsize_ < 8 || ysize_ < 8) {
   1793     ZeroFillImage(&result);
   1794     return true;
   1795   }
   1796   JXL_ASSIGN_OR_RETURN(Image3F xyb1, Image3F::Create(xsize_, ysize_));
   1797   JXL_RETURN_IF_ERROR(HWY_DYNAMIC_DISPATCH(OpsinDynamicsImage)(
   1798       rgb1, params_, Temp(), &blur_temp_, &xyb1));
   1799   ReleaseTemp();
   1800   JXL_RETURN_IF_ERROR(DiffmapOpsinDynamicsImage(xyb1, result));
   1801   if (sub_) {
   1802     if (sub_->xsize_ < 8 || sub_->ysize_ < 8) {
   1803       return true;
   1804     }
   1805     JXL_ASSIGN_OR_RETURN(Image3F sub_xyb,
   1806                          Image3F::Create(sub_->xsize_, sub_->ysize_));
   1807     JXL_ASSIGN_OR_RETURN(Image3F subsampledRgb1, SubSample2x(rgb1));
   1808     JXL_RETURN_IF_ERROR(HWY_DYNAMIC_DISPATCH(OpsinDynamicsImage)(
   1809         subsampledRgb1, params_, sub_->Temp(), &sub_->blur_temp_, &sub_xyb));
   1810     sub_->ReleaseTemp();
   1811     ImageF subresult;
   1812     JXL_RETURN_IF_ERROR(sub_->DiffmapOpsinDynamicsImage(sub_xyb, subresult));
   1813     AddSupersampled2x(subresult, 0.5, result);
   1814   }
   1815   return true;
   1816 }
   1817 
   1818 Status ButteraugliComparator::DiffmapOpsinDynamicsImage(const Image3F& xyb1,
   1819                                                         ImageF& result) const {
   1820   if (xsize_ < 8 || ysize_ < 8) {
   1821     ZeroFillImage(&result);
   1822     return true;
   1823   }
   1824   PsychoImage pi1;
   1825   JXL_RETURN_IF_ERROR(HWY_DYNAMIC_DISPATCH(SeparateFrequencies)(
   1826       xsize_, ysize_, params_, &blur_temp_, xyb1, pi1));
   1827   JXL_ASSIGN_OR_RETURN(result, ImageF::Create(xsize_, ysize_));
   1828   return DiffmapPsychoImage(pi1, result);
   1829 }
   1830 
   1831 namespace {
   1832 
   1833 void MaltaDiffMap(const ImageF& lum0, const ImageF& lum1, const double w_0gt1,
   1834                   const double w_0lt1, const double norm1,
   1835                   ImageF* HWY_RESTRICT diffs,
   1836                   Image3F* HWY_RESTRICT block_diff_ac, size_t c) {
   1837   HWY_DYNAMIC_DISPATCH(MaltaDiffMap)
   1838   (lum0, lum1, w_0gt1, w_0lt1, norm1, diffs, &block_diff_ac->Plane(c));
   1839 }
   1840 
   1841 void MaltaDiffMapLF(const ImageF& lum0, const ImageF& lum1, const double w_0gt1,
   1842                     const double w_0lt1, const double norm1,
   1843                     ImageF* HWY_RESTRICT diffs,
   1844                     Image3F* HWY_RESTRICT block_diff_ac, size_t c) {
   1845   HWY_DYNAMIC_DISPATCH(MaltaDiffMapLF)
   1846   (lum0, lum1, w_0gt1, w_0lt1, norm1, diffs, &block_diff_ac->Plane(c));
   1847 }
   1848 
   1849 }  // namespace
   1850 
   1851 Status ButteraugliComparator::DiffmapPsychoImage(const PsychoImage& pi1,
   1852                                                  ImageF& diffmap) const {
   1853   if (xsize_ < 8 || ysize_ < 8) {
   1854     ZeroFillImage(&diffmap);
   1855     return true;
   1856   }
   1857 
   1858   const float hf_asymmetry_ = params_.hf_asymmetry;
   1859   const float xmul_ = params_.xmul;
   1860 
   1861   JXL_ASSIGN_OR_RETURN(ImageF diffs, ImageF::Create(xsize_, ysize_));
   1862   JXL_ASSIGN_OR_RETURN(Image3F block_diff_ac, Image3F::Create(xsize_, ysize_));
   1863   ZeroFillImage(&block_diff_ac);
   1864   MaltaDiffMap(pi0_.uhf[1], pi1.uhf[1], wUhfMalta * hf_asymmetry_,
   1865                wUhfMalta / hf_asymmetry_, norm1Uhf, &diffs, &block_diff_ac, 1);
   1866   MaltaDiffMap(pi0_.uhf[0], pi1.uhf[0], wUhfMaltaX * hf_asymmetry_,
   1867                wUhfMaltaX / hf_asymmetry_, norm1UhfX, &diffs, &block_diff_ac,
   1868                0);
   1869   MaltaDiffMapLF(pi0_.hf[1], pi1.hf[1], wHfMalta * std::sqrt(hf_asymmetry_),
   1870                  wHfMalta / std::sqrt(hf_asymmetry_), norm1Hf, &diffs,
   1871                  &block_diff_ac, 1);
   1872   MaltaDiffMapLF(pi0_.hf[0], pi1.hf[0], wHfMaltaX * std::sqrt(hf_asymmetry_),
   1873                  wHfMaltaX / std::sqrt(hf_asymmetry_), norm1HfX, &diffs,
   1874                  &block_diff_ac, 0);
   1875   MaltaDiffMapLF(pi0_.mf.Plane(1), pi1.mf.Plane(1), wMfMalta, wMfMalta, norm1Mf,
   1876                  &diffs, &block_diff_ac, 1);
   1877   MaltaDiffMapLF(pi0_.mf.Plane(0), pi1.mf.Plane(0), wMfMaltaX, wMfMaltaX,
   1878                  norm1MfX, &diffs, &block_diff_ac, 0);
   1879 
   1880   JXL_ASSIGN_OR_RETURN(Image3F block_diff_dc, Image3F::Create(xsize_, ysize_));
   1881   for (size_t c = 0; c < 3; ++c) {
   1882     if (c < 2) {  // No blue channel error accumulated at HF.
   1883       HWY_DYNAMIC_DISPATCH(L2DiffAsymmetric)
   1884       (pi0_.hf[c], pi1.hf[c], wmul[c] * hf_asymmetry_, wmul[c] / hf_asymmetry_,
   1885        &block_diff_ac.Plane(c));
   1886     }
   1887     HWY_DYNAMIC_DISPATCH(L2Diff)
   1888     (pi0_.mf.Plane(c), pi1.mf.Plane(c), wmul[3 + c], &block_diff_ac.Plane(c));
   1889     HWY_DYNAMIC_DISPATCH(SetL2Diff)
   1890     (pi0_.lf.Plane(c), pi1.lf.Plane(c), wmul[6 + c], &block_diff_dc.Plane(c));
   1891   }
   1892 
   1893   ImageF mask;
   1894   JXL_RETURN_IF_ERROR(HWY_DYNAMIC_DISPATCH(MaskPsychoImage)(
   1895       pi0_, pi1, xsize_, ysize_, params_, &blur_temp_, &mask,
   1896       &block_diff_ac.Plane(1)));
   1897 
   1898   HWY_DYNAMIC_DISPATCH(CombineChannelsToDiffmap)
   1899   (mask, block_diff_dc, block_diff_ac, xmul_, &diffmap);
   1900   return true;
   1901 }
   1902 
   1903 double ButteraugliScoreFromDiffmap(const ImageF& diffmap,
   1904                                    const ButteraugliParams* params) {
   1905   float retval = 0.0f;
   1906   for (size_t y = 0; y < diffmap.ysize(); ++y) {
   1907     const float* BUTTERAUGLI_RESTRICT row = diffmap.ConstRow(y);
   1908     for (size_t x = 0; x < diffmap.xsize(); ++x) {
   1909       retval = std::max(retval, row[x]);
   1910     }
   1911   }
   1912   return retval;
   1913 }
   1914 
   1915 Status ButteraugliDiffmap(const Image3F& rgb0, const Image3F& rgb1,
   1916                           double hf_asymmetry, double xmul, ImageF& diffmap) {
   1917   ButteraugliParams params;
   1918   params.hf_asymmetry = hf_asymmetry;
   1919   params.xmul = xmul;
   1920   return ButteraugliDiffmap(rgb0, rgb1, params, diffmap);
   1921 }
   1922 
   1923 template <size_t kMax>
   1924 bool ButteraugliDiffmapSmall(const Image3F& rgb0, const Image3F& rgb1,
   1925                              const ButteraugliParams& params, ImageF& diffmap) {
   1926   const size_t xsize = rgb0.xsize();
   1927   const size_t ysize = rgb0.ysize();
   1928   // Butteraugli values for small (where xsize or ysize is smaller
   1929   // than 8 pixels) images are non-sensical, but most likely it is
   1930   // less disruptive to try to compute something than just give up.
   1931   // Temporarily extend the borders of the image to fit 8 x 8 size.
   1932   size_t xborder = xsize < kMax ? (kMax - xsize) / 2 : 0;
   1933   size_t yborder = ysize < kMax ? (kMax - ysize) / 2 : 0;
   1934   size_t xscaled = std::max<size_t>(kMax, xsize);
   1935   size_t yscaled = std::max<size_t>(kMax, ysize);
   1936   JXL_ASSIGN_OR_RETURN(Image3F scaled0, Image3F::Create(xscaled, yscaled));
   1937   JXL_ASSIGN_OR_RETURN(Image3F scaled1, Image3F::Create(xscaled, yscaled));
   1938   for (int i = 0; i < 3; ++i) {
   1939     for (size_t y = 0; y < yscaled; ++y) {
   1940       for (size_t x = 0; x < xscaled; ++x) {
   1941         size_t x2 = std::min<size_t>(xsize - 1, x > xborder ? x - xborder : 0);
   1942         size_t y2 = std::min<size_t>(ysize - 1, y > yborder ? y - yborder : 0);
   1943         scaled0.PlaneRow(i, y)[x] = rgb0.PlaneRow(i, y2)[x2];
   1944         scaled1.PlaneRow(i, y)[x] = rgb1.PlaneRow(i, y2)[x2];
   1945       }
   1946     }
   1947   }
   1948   ImageF diffmap_scaled;
   1949   const bool ok = ButteraugliDiffmap(scaled0, scaled1, params, diffmap_scaled);
   1950   JXL_ASSIGN_OR_RETURN(diffmap, ImageF::Create(xsize, ysize));
   1951   for (size_t y = 0; y < ysize; ++y) {
   1952     for (size_t x = 0; x < xsize; ++x) {
   1953       diffmap.Row(y)[x] = diffmap_scaled.Row(y + yborder)[x + xborder];
   1954     }
   1955   }
   1956   return ok;
   1957 }
   1958 
   1959 Status ButteraugliDiffmap(const Image3F& rgb0, const Image3F& rgb1,
   1960                           const ButteraugliParams& params, ImageF& diffmap) {
   1961   const size_t xsize = rgb0.xsize();
   1962   const size_t ysize = rgb0.ysize();
   1963   if (xsize < 1 || ysize < 1) {
   1964     return JXL_FAILURE("Zero-sized image");
   1965   }
   1966   if (!SameSize(rgb0, rgb1)) {
   1967     return JXL_FAILURE("Size mismatch");
   1968   }
   1969   static const int kMax = 8;
   1970   if (xsize < kMax || ysize < kMax) {
   1971     return ButteraugliDiffmapSmall<kMax>(rgb0, rgb1, params, diffmap);
   1972   }
   1973   JXL_ASSIGN_OR_RETURN(std::unique_ptr<ButteraugliComparator> butteraugli,
   1974                        ButteraugliComparator::Make(rgb0, params));
   1975   JXL_RETURN_IF_ERROR(butteraugli->Diffmap(rgb1, diffmap));
   1976   return true;
   1977 }
   1978 
   1979 bool ButteraugliInterface(const Image3F& rgb0, const Image3F& rgb1,
   1980                           float hf_asymmetry, float xmul, ImageF& diffmap,
   1981                           double& diffvalue) {
   1982   ButteraugliParams params;
   1983   params.hf_asymmetry = hf_asymmetry;
   1984   params.xmul = xmul;
   1985   return ButteraugliInterface(rgb0, rgb1, params, diffmap, diffvalue);
   1986 }
   1987 
   1988 bool ButteraugliInterface(const Image3F& rgb0, const Image3F& rgb1,
   1989                           const ButteraugliParams& params, ImageF& diffmap,
   1990                           double& diffvalue) {
   1991   if (!ButteraugliDiffmap(rgb0, rgb1, params, diffmap)) {
   1992     return false;
   1993   }
   1994   diffvalue = ButteraugliScoreFromDiffmap(diffmap, &params);
   1995   return true;
   1996 }
   1997 
   1998 Status ButteraugliInterfaceInPlace(Image3F&& rgb0, Image3F&& rgb1,
   1999                                    const ButteraugliParams& params,
   2000                                    ImageF& diffmap, double& diffvalue) {
   2001   const size_t xsize = rgb0.xsize();
   2002   const size_t ysize = rgb0.ysize();
   2003   if (xsize < 1 || ysize < 1) {
   2004     return JXL_FAILURE("Zero-sized image");
   2005   }
   2006   if (!SameSize(rgb0, rgb1)) {
   2007     return JXL_FAILURE("Size mismatch");
   2008   }
   2009   static const int kMax = 8;
   2010   if (xsize < kMax || ysize < kMax) {
   2011     bool ok = ButteraugliDiffmapSmall<kMax>(rgb0, rgb1, params, diffmap);
   2012     diffvalue = ButteraugliScoreFromDiffmap(diffmap, &params);
   2013     return ok;
   2014   }
   2015   ImageF subdiffmap;
   2016   if (xsize >= 15 && ysize >= 15) {
   2017     JXL_ASSIGN_OR_RETURN(Image3F rgb0_sub, SubSample2x(rgb0));
   2018     JXL_ASSIGN_OR_RETURN(Image3F rgb1_sub, SubSample2x(rgb1));
   2019     JXL_RETURN_IF_ERROR(HWY_DYNAMIC_DISPATCH(ButteraugliDiffmapInPlace)(
   2020         rgb0_sub, rgb1_sub, params, subdiffmap));
   2021   }
   2022   JXL_RETURN_IF_ERROR(HWY_DYNAMIC_DISPATCH(ButteraugliDiffmapInPlace)(
   2023       rgb0, rgb1, params, diffmap));
   2024   if (xsize >= 15 && ysize >= 15) {
   2025     AddSupersampled2x(subdiffmap, 0.5, diffmap);
   2026   }
   2027   diffvalue = ButteraugliScoreFromDiffmap(diffmap, &params);
   2028   return true;
   2029 }
   2030 
   2031 double ButteraugliFuzzyClass(double score) {
   2032   static const double fuzzy_width_up = 4.8;
   2033   static const double fuzzy_width_down = 4.8;
   2034   static const double m0 = 2.0;
   2035   static const double scaler = 0.7777;
   2036   double val;
   2037   if (score < 1.0) {
   2038     // val in [scaler .. 2.0]
   2039     val = m0 / (1.0 + exp((score - 1.0) * fuzzy_width_down));
   2040     val -= 1.0;           // from [1 .. 2] to [0 .. 1]
   2041     val *= 2.0 - scaler;  // from [0 .. 1] to [0 .. 2.0 - scaler]
   2042     val += scaler;        // from [0 .. 2.0 - scaler] to [scaler .. 2.0]
   2043   } else {
   2044     // val in [0 .. scaler]
   2045     val = m0 / (1.0 + exp((score - 1.0) * fuzzy_width_up));
   2046     val *= scaler;
   2047   }
   2048   return val;
   2049 }
   2050 
   2051 // #define PRINT_OUT_NORMALIZATION
   2052 
   2053 double ButteraugliFuzzyInverse(double seek) {
   2054   double pos = 0;
   2055   // NOLINTNEXTLINE(clang-analyzer-security.FloatLoopCounter)
   2056   for (double range = 1.0; range >= 1e-10; range *= 0.5) {
   2057     double cur = ButteraugliFuzzyClass(pos);
   2058     if (cur < seek) {
   2059       pos -= range;
   2060     } else {
   2061       pos += range;
   2062     }
   2063   }
   2064 #ifdef PRINT_OUT_NORMALIZATION
   2065   if (seek == 1.0) {
   2066     fprintf(stderr, "Fuzzy inverse %g\n", pos);
   2067   }
   2068 #endif
   2069   return pos;
   2070 }
   2071 
   2072 #ifdef PRINT_OUT_NORMALIZATION
   2073 static double print_out_normalization = ButteraugliFuzzyInverse(1.0);
   2074 #endif
   2075 
   2076 namespace {
   2077 
   2078 void ScoreToRgb(double score, double good_threshold, double bad_threshold,
   2079                 float rgb[3]) {
   2080   double heatmap[12][3] = {
   2081       {0, 0, 0},       {0, 0, 1},
   2082       {0, 1, 1},       {0, 1, 0},  // Good level
   2083       {1, 1, 0},       {1, 0, 0},  // Bad level
   2084       {1, 0, 1},       {0.5, 0.5, 1.0},
   2085       {1.0, 0.5, 0.5},  // Pastel colors for the very bad quality range.
   2086       {1.0, 1.0, 0.5}, {1, 1, 1},
   2087       {1, 1, 1},  // Last color repeated to have a solid range of white.
   2088   };
   2089   if (score < good_threshold) {
   2090     score = (score / good_threshold) * 0.3;
   2091   } else if (score < bad_threshold) {
   2092     score = 0.3 +
   2093             (score - good_threshold) / (bad_threshold - good_threshold) * 0.15;
   2094   } else {
   2095     score = 0.45 + (score - bad_threshold) / (bad_threshold * 12) * 0.5;
   2096   }
   2097   static const int kTableSize = sizeof(heatmap) / sizeof(heatmap[0]);
   2098   score = std::min<double>(std::max<double>(score * (kTableSize - 1), 0.0),
   2099                            kTableSize - 2);
   2100   int ix = static_cast<int>(score);
   2101   ix = std::min(std::max(0, ix), kTableSize - 2);  // Handle NaN
   2102   double mix = score - ix;
   2103   for (int i = 0; i < 3; ++i) {
   2104     double v = mix * heatmap[ix + 1][i] + (1 - mix) * heatmap[ix][i];
   2105     rgb[i] = pow(v, 0.5);
   2106   }
   2107 }
   2108 
   2109 }  // namespace
   2110 
   2111 StatusOr<Image3F> CreateHeatMapImage(const ImageF& distmap,
   2112                                      double good_threshold,
   2113                                      double bad_threshold) {
   2114   JXL_ASSIGN_OR_RETURN(Image3F heatmap,
   2115                        Image3F::Create(distmap.xsize(), distmap.ysize()));
   2116   for (size_t y = 0; y < distmap.ysize(); ++y) {
   2117     const float* BUTTERAUGLI_RESTRICT row_distmap = distmap.ConstRow(y);
   2118     float* BUTTERAUGLI_RESTRICT row_h0 = heatmap.PlaneRow(0, y);
   2119     float* BUTTERAUGLI_RESTRICT row_h1 = heatmap.PlaneRow(1, y);
   2120     float* BUTTERAUGLI_RESTRICT row_h2 = heatmap.PlaneRow(2, y);
   2121     for (size_t x = 0; x < distmap.xsize(); ++x) {
   2122       const float d = row_distmap[x];
   2123       float rgb[3];
   2124       ScoreToRgb(d, good_threshold, bad_threshold, rgb);
   2125       row_h0[x] = rgb[0];
   2126       row_h1[x] = rgb[1];
   2127       row_h2[x] = rgb[2];
   2128     }
   2129   }
   2130   return heatmap;
   2131 }
   2132 
   2133 }  // namespace jxl
   2134 #endif  // HWY_ONCE