libjxl

FORK: libjxl patches used on blog
git clone https://git.neptards.moe/blog/libjxl.git
Log | Files | Refs | Submodules | README | LICENSE

enc_xyb.cc (17614B)


      1 // Copyright (c) the JPEG XL Project Authors. All rights reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style
      4 // license that can be found in the LICENSE file.
      5 
      6 #include "lib/jxl/enc_xyb.h"
      7 
      8 #include <algorithm>
      9 #include <atomic>
     10 #include <cstdlib>
     11 
     12 #undef HWY_TARGET_INCLUDE
     13 #define HWY_TARGET_INCLUDE "lib/jxl/enc_xyb.cc"
     14 #include <hwy/foreach_target.h>
     15 #include <hwy/highway.h>
     16 
     17 #include "lib/jxl/base/compiler_specific.h"
     18 #include "lib/jxl/base/data_parallel.h"
     19 #include "lib/jxl/base/fast_math-inl.h"
     20 #include "lib/jxl/base/status.h"
     21 #include "lib/jxl/cms/opsin_params.h"
     22 #include "lib/jxl/cms/transfer_functions-inl.h"
     23 #include "lib/jxl/color_encoding_internal.h"
     24 #include "lib/jxl/enc_image_bundle.h"
     25 #include "lib/jxl/image_bundle.h"
     26 #include "lib/jxl/image_ops.h"
     27 
     28 HWY_BEFORE_NAMESPACE();
     29 namespace jxl {
     30 namespace HWY_NAMESPACE {
     31 
     32 // These templates are not found via ADL.
     33 using hwy::HWY_NAMESPACE::Add;
     34 using hwy::HWY_NAMESPACE::Mul;
     35 using hwy::HWY_NAMESPACE::MulAdd;
     36 using hwy::HWY_NAMESPACE::Sub;
     37 using hwy::HWY_NAMESPACE::ZeroIfNegative;
     38 
     39 // 4x3 matrix * 3x1 SIMD vectors
     40 template <class V>
     41 JXL_INLINE void OpsinAbsorbance(const V r, const V g, const V b,
     42                                 const float* JXL_RESTRICT premul_absorb,
     43                                 V* JXL_RESTRICT mixed0, V* JXL_RESTRICT mixed1,
     44                                 V* JXL_RESTRICT mixed2) {
     45   const float* bias = jxl::cms::kOpsinAbsorbanceBias.data();
     46   const HWY_FULL(float) d;
     47   const size_t N = Lanes(d);
     48   const auto m0 = Load(d, premul_absorb + 0 * N);
     49   const auto m1 = Load(d, premul_absorb + 1 * N);
     50   const auto m2 = Load(d, premul_absorb + 2 * N);
     51   const auto m3 = Load(d, premul_absorb + 3 * N);
     52   const auto m4 = Load(d, premul_absorb + 4 * N);
     53   const auto m5 = Load(d, premul_absorb + 5 * N);
     54   const auto m6 = Load(d, premul_absorb + 6 * N);
     55   const auto m7 = Load(d, premul_absorb + 7 * N);
     56   const auto m8 = Load(d, premul_absorb + 8 * N);
     57   *mixed0 = MulAdd(m0, r, MulAdd(m1, g, MulAdd(m2, b, Set(d, bias[0]))));
     58   *mixed1 = MulAdd(m3, r, MulAdd(m4, g, MulAdd(m5, b, Set(d, bias[1]))));
     59   *mixed2 = MulAdd(m6, r, MulAdd(m7, g, MulAdd(m8, b, Set(d, bias[2]))));
     60 }
     61 
     62 template <class V>
     63 void StoreXYB(const V r, V g, const V b, float* JXL_RESTRICT valx,
     64               float* JXL_RESTRICT valy, float* JXL_RESTRICT valz) {
     65   const HWY_FULL(float) d;
     66   const V half = Set(d, 0.5f);
     67   Store(Mul(half, Sub(r, g)), d, valx);
     68   Store(Mul(half, Add(r, g)), d, valy);
     69   Store(b, d, valz);
     70 }
     71 
     72 // Converts one RGB vector to XYB.
     73 template <class V>
     74 void LinearRGBToXYB(const V r, const V g, const V b,
     75                     const float* JXL_RESTRICT premul_absorb,
     76                     float* JXL_RESTRICT valx, float* JXL_RESTRICT valy,
     77                     float* JXL_RESTRICT valz) {
     78   V mixed0;
     79   V mixed1;
     80   V mixed2;
     81   OpsinAbsorbance(r, g, b, premul_absorb, &mixed0, &mixed1, &mixed2);
     82 
     83   // mixed* should be non-negative even for wide-gamut, so clamp to zero.
     84   mixed0 = ZeroIfNegative(mixed0);
     85   mixed1 = ZeroIfNegative(mixed1);
     86   mixed2 = ZeroIfNegative(mixed2);
     87 
     88   const HWY_FULL(float) d;
     89   const size_t N = Lanes(d);
     90   mixed0 = CubeRootAndAdd(mixed0, Load(d, premul_absorb + 9 * N));
     91   mixed1 = CubeRootAndAdd(mixed1, Load(d, premul_absorb + 10 * N));
     92   mixed2 = CubeRootAndAdd(mixed2, Load(d, premul_absorb + 11 * N));
     93   StoreXYB(mixed0, mixed1, mixed2, valx, valy, valz);
     94 
     95   // For wide-gamut inputs, r/g/b and valx (but not y/z) are often negative.
     96 }
     97 
     98 void LinearRGBRowToXYB(float* JXL_RESTRICT row0, float* JXL_RESTRICT row1,
     99                        float* JXL_RESTRICT row2,
    100                        const float* JXL_RESTRICT premul_absorb, size_t xsize) {
    101   const HWY_FULL(float) d;
    102   for (size_t x = 0; x < xsize; x += Lanes(d)) {
    103     const auto r = Load(d, row0 + x);
    104     const auto g = Load(d, row1 + x);
    105     const auto b = Load(d, row2 + x);
    106     LinearRGBToXYB(r, g, b, premul_absorb, row0 + x, row1 + x, row2 + x);
    107   }
    108 }
    109 
    110 // Input/output uses the codec.h scaling: nominally 0-1 if in-gamut.
    111 template <class V>
    112 V LinearFromSRGB(V encoded) {
    113   return TF_SRGB().DisplayFromEncoded(encoded);
    114 }
    115 
    116 Status LinearSRGBToXYB(const float* JXL_RESTRICT premul_absorb,
    117                        ThreadPool* pool, Image3F* JXL_RESTRICT image) {
    118   const size_t xsize = image->xsize();
    119 
    120   const HWY_FULL(float) d;
    121   return RunOnPool(
    122       pool, 0, static_cast<uint32_t>(image->ysize()), ThreadPool::NoInit,
    123       [&](const uint32_t task, size_t /*thread*/) {
    124         const size_t y = static_cast<size_t>(task);
    125         float* JXL_RESTRICT row0 = image->PlaneRow(0, y);
    126         float* JXL_RESTRICT row1 = image->PlaneRow(1, y);
    127         float* JXL_RESTRICT row2 = image->PlaneRow(2, y);
    128 
    129         for (size_t x = 0; x < xsize; x += Lanes(d)) {
    130           const auto in_r = Load(d, row0 + x);
    131           const auto in_g = Load(d, row1 + x);
    132           const auto in_b = Load(d, row2 + x);
    133           LinearRGBToXYB(in_r, in_g, in_b, premul_absorb, row0 + x, row1 + x,
    134                          row2 + x);
    135         }
    136       },
    137       "LinearToXYB");
    138 }
    139 
    140 Status SRGBToXYB(const float* JXL_RESTRICT premul_absorb, ThreadPool* pool,
    141                  Image3F* JXL_RESTRICT image) {
    142   const size_t xsize = image->xsize();
    143 
    144   const HWY_FULL(float) d;
    145   return RunOnPool(
    146       pool, 0, static_cast<uint32_t>(image->ysize()), ThreadPool::NoInit,
    147       [&](const uint32_t task, size_t /*thread*/) {
    148         const size_t y = static_cast<size_t>(task);
    149         float* JXL_RESTRICT row0 = image->PlaneRow(0, y);
    150         float* JXL_RESTRICT row1 = image->PlaneRow(1, y);
    151         float* JXL_RESTRICT row2 = image->PlaneRow(2, y);
    152 
    153         for (size_t x = 0; x < xsize; x += Lanes(d)) {
    154           const auto in_r = LinearFromSRGB(Load(d, row0 + x));
    155           const auto in_g = LinearFromSRGB(Load(d, row1 + x));
    156           const auto in_b = LinearFromSRGB(Load(d, row2 + x));
    157           LinearRGBToXYB(in_r, in_g, in_b, premul_absorb, row0 + x, row1 + x,
    158                          row2 + x);
    159         }
    160       },
    161       "SRGBToXYB");
    162 }
    163 
    164 Status SRGBToXYBAndLinear(const float* JXL_RESTRICT premul_absorb,
    165                           ThreadPool* pool, Image3F* JXL_RESTRICT image,
    166                           Image3F* JXL_RESTRICT linear) {
    167   const size_t xsize = image->xsize();
    168 
    169   const HWY_FULL(float) d;
    170   return RunOnPool(
    171       pool, 0, static_cast<uint32_t>(image->ysize()), ThreadPool::NoInit,
    172       [&](const uint32_t task, size_t /*thread*/) {
    173         const size_t y = static_cast<size_t>(task);
    174         float* JXL_RESTRICT row_image0 = image->PlaneRow(0, y);
    175         float* JXL_RESTRICT row_image1 = image->PlaneRow(1, y);
    176         float* JXL_RESTRICT row_image2 = image->PlaneRow(2, y);
    177         float* JXL_RESTRICT row_linear0 = linear->PlaneRow(0, y);
    178         float* JXL_RESTRICT row_linear1 = linear->PlaneRow(1, y);
    179         float* JXL_RESTRICT row_linear2 = linear->PlaneRow(2, y);
    180 
    181         for (size_t x = 0; x < xsize; x += Lanes(d)) {
    182           const auto in_r = LinearFromSRGB(Load(d, row_image0 + x));
    183           const auto in_g = LinearFromSRGB(Load(d, row_image1 + x));
    184           const auto in_b = LinearFromSRGB(Load(d, row_image2 + x));
    185 
    186           Store(in_r, d, row_linear0 + x);
    187           Store(in_g, d, row_linear1 + x);
    188           Store(in_b, d, row_linear2 + x);
    189 
    190           LinearRGBToXYB(in_r, in_g, in_b, premul_absorb, row_image0 + x,
    191                          row_image1 + x, row_image2 + x);
    192         }
    193       },
    194       "SRGBToXYBAndLinear");
    195 }
    196 
    197 void ComputePremulAbsorb(float intensity_target, float* premul_absorb) {
    198   const HWY_FULL(float) d;
    199   const size_t N = Lanes(d);
    200   const float mul = intensity_target / 255.0f;
    201   for (size_t i = 0; i < 9; ++i) {
    202     const auto absorb = Set(d, jxl::cms::kOpsinAbsorbanceMatrix[i] * mul);
    203     Store(absorb, d, premul_absorb + i * N);
    204   }
    205   for (size_t i = 0; i < 3; ++i) {
    206     const auto neg_bias_cbrt =
    207         Set(d, -cbrtf(jxl::cms::kOpsinAbsorbanceBias[i]));
    208     Store(neg_bias_cbrt, d, premul_absorb + (9 + i) * N);
    209   }
    210 }
    211 
    212 StatusOr<Image3F> TransformToLinearRGB(const Image3F& in,
    213                                        const ColorEncoding& color_encoding,
    214                                        float intensity_target,
    215                                        const JxlCmsInterface& cms,
    216                                        ThreadPool* pool) {
    217   ColorSpaceTransform c_transform(cms);
    218   bool is_gray = color_encoding.IsGray();
    219   const ColorEncoding& c_desired = ColorEncoding::LinearSRGB(is_gray);
    220   JXL_ASSIGN_OR_RETURN(Image3F out, Image3F::Create(in.xsize(), in.ysize()));
    221   std::atomic<bool> has_error{false};
    222   JXL_CHECK(RunOnPool(
    223       pool, 0, in.ysize(),
    224       [&](const size_t num_threads) {
    225         return c_transform.Init(color_encoding, c_desired, intensity_target,
    226                                 in.xsize(), num_threads);
    227       },
    228       [&](const uint32_t y, const size_t thread) {
    229         if (has_error) return;
    230         float* mutable_src_buf = c_transform.BufSrc(thread);
    231         const float* src_buf = mutable_src_buf;
    232         // Interleave input.
    233         if (is_gray) {
    234           src_buf = in.ConstPlaneRow(0, y);
    235         } else {
    236           const float* JXL_RESTRICT row_in0 = in.ConstPlaneRow(0, y);
    237           const float* JXL_RESTRICT row_in1 = in.ConstPlaneRow(1, y);
    238           const float* JXL_RESTRICT row_in2 = in.ConstPlaneRow(2, y);
    239           for (size_t x = 0; x < in.xsize(); x++) {
    240             mutable_src_buf[3 * x + 0] = row_in0[x];
    241             mutable_src_buf[3 * x + 1] = row_in1[x];
    242             mutable_src_buf[3 * x + 2] = row_in2[x];
    243           }
    244         }
    245         float* JXL_RESTRICT dst_buf = c_transform.BufDst(thread);
    246         if (!c_transform.Run(thread, src_buf, dst_buf, in.xsize())) {
    247           has_error = true;
    248           return;
    249         }
    250         float* JXL_RESTRICT row_out0 = out.PlaneRow(0, y);
    251         float* JXL_RESTRICT row_out1 = out.PlaneRow(1, y);
    252         float* JXL_RESTRICT row_out2 = out.PlaneRow(2, y);
    253         // De-interleave output and convert type.
    254         if (is_gray) {
    255           for (size_t x = 0; x < in.xsize(); x++) {
    256             row_out0[x] = dst_buf[x];
    257             row_out1[x] = dst_buf[x];
    258             row_out2[x] = dst_buf[x];
    259           }
    260         } else {
    261           for (size_t x = 0; x < in.xsize(); x++) {
    262             row_out0[x] = dst_buf[3 * x + 0];
    263             row_out1[x] = dst_buf[3 * x + 1];
    264             row_out2[x] = dst_buf[3 * x + 2];
    265           }
    266         }
    267       },
    268       "Colorspace transform"));
    269   JXL_CHECK(!has_error);
    270   return out;
    271 }
    272 
    273 // This is different from Butteraugli's OpsinDynamicsImage() in the sense that
    274 // it does not contain a sensitivity multiplier based on the blurred image.
    275 void ToXYB(const ColorEncoding& c_current, float intensity_target,
    276            const ImageF* black, ThreadPool* pool, Image3F* JXL_RESTRICT image,
    277            const JxlCmsInterface& cms, Image3F* const JXL_RESTRICT linear) {
    278   if (black) JXL_ASSERT(SameSize(*image, *black));
    279   if (linear) JXL_ASSERT(SameSize(*image, *linear));
    280 
    281   const HWY_FULL(float) d;
    282   // Pre-broadcasted constants
    283   HWY_ALIGN float premul_absorb[MaxLanes(d) * 12];
    284   ComputePremulAbsorb(intensity_target, premul_absorb);
    285 
    286   const bool want_linear = linear != nullptr;
    287 
    288   const ColorEncoding& c_linear_srgb =
    289       ColorEncoding::LinearSRGB(c_current.IsGray());
    290   // Linear sRGB inputs are rare but can be useful for the fastest encoders, for
    291   // which undoing the sRGB transfer function would be a large part of the cost.
    292   if (c_linear_srgb.SameColorEncoding(c_current)) {
    293     // This only happens if kitten or slower, moving ImageBundle might be
    294     // possible but the encoder is much slower than this copy.
    295     if (want_linear) {
    296       CopyImageTo(*image, linear);
    297     }
    298     JXL_CHECK(LinearSRGBToXYB(premul_absorb, pool, image));
    299     return;
    300   }
    301 
    302   // Common case: already sRGB, can avoid the color transform
    303   if (c_current.IsSRGB()) {
    304     // Common case: can avoid allocating/copying
    305     if (want_linear) {
    306       // Slow encoder also wants linear sRGB.
    307       JXL_CHECK(SRGBToXYBAndLinear(premul_absorb, pool, image, linear));
    308     } else {
    309       JXL_CHECK(SRGBToXYB(premul_absorb, pool, image));
    310     }
    311     return;
    312   }
    313 
    314   JXL_CHECK(ApplyColorTransform(c_current, intensity_target, *image, black,
    315                                 Rect(*image), c_linear_srgb, cms, pool,
    316                                 want_linear ? linear : image));
    317   if (want_linear) {
    318     CopyImageTo(*linear, image);
    319   }
    320   JXL_CHECK(LinearSRGBToXYB(premul_absorb, pool, image));
    321 }
    322 
    323 // Transform RGB to YCbCr.
    324 // Could be performed in-place (i.e. Y, Cb and Cr could alias R, B and B).
    325 Status RgbToYcbcr(const ImageF& r_plane, const ImageF& g_plane,
    326                   const ImageF& b_plane, ImageF* y_plane, ImageF* cb_plane,
    327                   ImageF* cr_plane, ThreadPool* pool) {
    328   const HWY_FULL(float) df;
    329   const size_t S = Lanes(df);  // Step.
    330 
    331   const size_t xsize = r_plane.xsize();
    332   const size_t ysize = r_plane.ysize();
    333   if ((xsize == 0) || (ysize == 0)) return true;
    334 
    335   // Full-range BT.601 as defined by JFIF Clause 7:
    336   // https://www.itu.int/rec/T-REC-T.871-201105-I/en
    337   const auto k128 = Set(df, 128.0f / 255);
    338   const auto kR = Set(df, 0.299f);  // NTSC luma
    339   const auto kG = Set(df, 0.587f);
    340   const auto kB = Set(df, 0.114f);
    341   const auto kAmpR = Set(df, 0.701f);
    342   const auto kAmpB = Set(df, 0.886f);
    343   const auto kDiffR = Add(kAmpR, kR);
    344   const auto kDiffB = Add(kAmpB, kB);
    345   const auto kNormR = Div(Set(df, 1.0f), (Add(kAmpR, Add(kG, kB))));
    346   const auto kNormB = Div(Set(df, 1.0f), (Add(kR, Add(kG, kAmpB))));
    347 
    348   constexpr size_t kGroupArea = kGroupDim * kGroupDim;
    349   const size_t lines_per_group = DivCeil(kGroupArea, xsize);
    350   const size_t num_stripes = DivCeil(ysize, lines_per_group);
    351   const auto transform = [&](int idx, int /* thread*/) {
    352     const size_t y0 = idx * lines_per_group;
    353     const size_t y1 = std::min<size_t>(y0 + lines_per_group, ysize);
    354     for (size_t y = y0; y < y1; ++y) {
    355       const float* r_row = r_plane.ConstRow(y);
    356       const float* g_row = g_plane.ConstRow(y);
    357       const float* b_row = b_plane.ConstRow(y);
    358       float* y_row = y_plane->Row(y);
    359       float* cb_row = cb_plane->Row(y);
    360       float* cr_row = cr_plane->Row(y);
    361       for (size_t x = 0; x < xsize; x += S) {
    362         const auto r = Load(df, r_row + x);
    363         const auto g = Load(df, g_row + x);
    364         const auto b = Load(df, b_row + x);
    365         const auto r_base = Mul(r, kR);
    366         const auto r_diff = Mul(r, kDiffR);
    367         const auto g_base = Mul(g, kG);
    368         const auto b_base = Mul(b, kB);
    369         const auto b_diff = Mul(b, kDiffB);
    370         const auto y_base = Add(r_base, Add(g_base, b_base));
    371         const auto y_vec = Sub(y_base, k128);
    372         const auto cb_vec = Mul(Sub(b_diff, y_base), kNormB);
    373         const auto cr_vec = Mul(Sub(r_diff, y_base), kNormR);
    374         Store(y_vec, df, y_row + x);
    375         Store(cb_vec, df, cb_row + x);
    376         Store(cr_vec, df, cr_row + x);
    377       }
    378     }
    379   };
    380   return RunOnPool(pool, 0, static_cast<int>(num_stripes), ThreadPool::NoInit,
    381                    transform, "RgbToYcbCr");
    382 }
    383 
    384 // NOLINTNEXTLINE(google-readability-namespace-comments)
    385 }  // namespace HWY_NAMESPACE
    386 }  // namespace jxl
    387 HWY_AFTER_NAMESPACE();
    388 
    389 #if HWY_ONCE
    390 namespace jxl {
    391 HWY_EXPORT(ToXYB);
    392 void ToXYB(const ColorEncoding& c_current, float intensity_target,
    393            const ImageF* black, ThreadPool* pool, Image3F* JXL_RESTRICT image,
    394            const JxlCmsInterface& cms, Image3F* const JXL_RESTRICT linear) {
    395   HWY_DYNAMIC_DISPATCH(ToXYB)
    396   (c_current, intensity_target, black, pool, image, cms, linear);
    397 }
    398 
    399 Status ToXYB(const ImageBundle& in, ThreadPool* pool, Image3F* JXL_RESTRICT xyb,
    400              const JxlCmsInterface& cms, Image3F* JXL_RESTRICT linear) {
    401   JXL_ASSIGN_OR_RETURN(*xyb, Image3F::Create(in.xsize(), in.ysize()));
    402   CopyImageTo(in.color(), xyb);
    403   ToXYB(in.c_current(), in.metadata()->IntensityTarget(),
    404         in.HasBlack() ? &in.black() : nullptr, pool, xyb, cms, linear);
    405   return true;
    406 }
    407 
    408 HWY_EXPORT(LinearRGBRowToXYB);
    409 void LinearRGBRowToXYB(float* JXL_RESTRICT row0, float* JXL_RESTRICT row1,
    410                        float* JXL_RESTRICT row2,
    411                        const float* JXL_RESTRICT premul_absorb, size_t xsize) {
    412   HWY_DYNAMIC_DISPATCH(LinearRGBRowToXYB)
    413   (row0, row1, row2, premul_absorb, xsize);
    414 }
    415 
    416 HWY_EXPORT(ComputePremulAbsorb);
    417 void ComputePremulAbsorb(float intensity_target, float* premul_absorb) {
    418   HWY_DYNAMIC_DISPATCH(ComputePremulAbsorb)(intensity_target, premul_absorb);
    419 }
    420 
    421 void ScaleXYBRow(float* JXL_RESTRICT row0, float* JXL_RESTRICT row1,
    422                  float* JXL_RESTRICT row2, size_t xsize) {
    423   for (size_t x = 0; x < xsize; x++) {
    424     row2[x] = (row2[x] - row1[x] + jxl::cms::kScaledXYBOffset[2]) *
    425               jxl::cms::kScaledXYBScale[2];
    426     row0[x] = (row0[x] + jxl::cms::kScaledXYBOffset[0]) *
    427               jxl::cms::kScaledXYBScale[0];
    428     row1[x] = (row1[x] + jxl::cms::kScaledXYBOffset[1]) *
    429               jxl::cms::kScaledXYBScale[1];
    430   }
    431 }
    432 
    433 void ScaleXYB(Image3F* opsin) {
    434   for (size_t y = 0; y < opsin->ysize(); y++) {
    435     float* row0 = opsin->PlaneRow(0, y);
    436     float* row1 = opsin->PlaneRow(1, y);
    437     float* row2 = opsin->PlaneRow(2, y);
    438     ScaleXYBRow(row0, row1, row2, opsin->xsize());
    439   }
    440 }
    441 
    442 HWY_EXPORT(RgbToYcbcr);
    443 Status RgbToYcbcr(const ImageF& r_plane, const ImageF& g_plane,
    444                   const ImageF& b_plane, ImageF* y_plane, ImageF* cb_plane,
    445                   ImageF* cr_plane, ThreadPool* pool) {
    446   return HWY_DYNAMIC_DISPATCH(RgbToYcbcr)(r_plane, g_plane, b_plane, y_plane,
    447                                           cb_plane, cr_plane, pool);
    448 }
    449 
    450 }  // namespace jxl
    451 #endif  // HWY_ONCE