transfer_functions-inl.h - libjxl - FORK: libjxl patches used on blog

transfer_functions-inl.h (13873B)
      1 // Copyright (c) the JPEG XL Project Authors. All rights reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style
      4 // license that can be found in the LICENSE file.
      5 
      6 // Transfer functions for color encodings.
      7 
      8 #if defined(LIB_JXL_CMS_TRANSFER_FUNCTIONS_INL_H_) == defined(HWY_TARGET_TOGGLE)
      9 #ifdef LIB_JXL_CMS_TRANSFER_FUNCTIONS_INL_H_
     10 #undef LIB_JXL_CMS_TRANSFER_FUNCTIONS_INL_H_
     11 #else
     12 #define LIB_JXL_CMS_TRANSFER_FUNCTIONS_INL_H_
     13 #endif
     14 
     15 #include <algorithm>
     16 #include <cmath>
     17 #include <hwy/highway.h>
     18 
     19 #include "lib/jxl/base/compiler_specific.h"
     20 #include "lib/jxl/base/fast_math-inl.h"
     21 #include "lib/jxl/base/rational_polynomial-inl.h"
     22 #include "lib/jxl/base/status.h"
     23 #include "lib/jxl/cms/transfer_functions.h"
     24 
     25 HWY_BEFORE_NAMESPACE();
     26 namespace jxl {
     27 namespace HWY_NAMESPACE {
     28 
     29 // These templates are not found via ADL.
     30 using hwy::HWY_NAMESPACE::And;
     31 using hwy::HWY_NAMESPACE::AndNot;
     32 using hwy::HWY_NAMESPACE::Gt;
     33 using hwy::HWY_NAMESPACE::IfThenElse;
     34 using hwy::HWY_NAMESPACE::Lt;
     35 using hwy::HWY_NAMESPACE::Or;
     36 using hwy::HWY_NAMESPACE::Sqrt;
     37 using hwy::HWY_NAMESPACE::TableLookupBytes;
     38 
     39 // Definitions for BT.2100-2 transfer functions (used inside/outside SIMD):
     40 // "display" is linear light (nits) normalized to [0, 1].
     41 // "encoded" is a nonlinear encoding (e.g. PQ) in [0, 1].
     42 // "scene" is a linear function of photon counts, normalized to [0, 1].
     43 
     44 // Despite the stated ranges, we need unbounded transfer functions: see
     45 // http://www.littlecms.com/CIC18_UnboundedCMM.pdf. Inputs can be negative or
     46 // above 1 due to chromatic adaptation. To avoid severe round-trip errors caused
     47 // by clamping, we mirror negative inputs via copysign (f(-x) = -f(x), see
     48 // https://developer.apple.com/documentation/coregraphics/cgcolorspace/1644735-extendedsrgb)
     49 // and extend the function domains above 1.
     50 
     51 // Hybrid Log-Gamma.
     52 class TF_HLG : TF_HLG_Base {
     53  public:
     54   // Maximum error 5e-7.
     55   template <class D, class V>
     56   JXL_INLINE V EncodedFromDisplay(D d, V x) const {
     57     const hwy::HWY_NAMESPACE::Rebind<uint32_t, D> du;
     58     const V kSign = BitCast(d, Set(du, 0x80000000u));
     59     const V original_sign = And(x, kSign);
     60     x = AndNot(kSign, x);  // abs
     61     const V below_div12 = Sqrt(Mul(Set(d, 3.0f), x));
     62     const V e =
     63         MulAdd(Set(d, kA * 0.693147181f),
     64                FastLog2f(d, MulAdd(Set(d, 12), x, Set(d, -kB))), Set(d, kC));
     65     const V magnitude = IfThenElse(Le(x, Set(d, kDiv12)), below_div12, e);
     66     return Or(AndNot(kSign, magnitude), original_sign);
     67   }
     68 };
     69 
     70 class TF_709 {
     71  public:
     72   JXL_INLINE double EncodedFromDisplay(const double d) const {
     73     if (d < kThresh) return kMulLow * d;
     74     return kMulHi * std::pow(d, kPowHi) + kSub;
     75   }
     76 
     77   // Maximum error 1e-6.
     78   template <class D, class V>
     79   JXL_INLINE V EncodedFromDisplay(D d, V x) const {
     80     auto low = Mul(Set(d, kMulLow), x);
     81     auto hi =
     82         MulAdd(Set(d, kMulHi), FastPowf(d, x, Set(d, kPowHi)), Set(d, kSub));
     83     return IfThenElse(Le(x, Set(d, kThresh)), low, hi);
     84   }
     85 
     86   template <class D, class V>
     87   JXL_INLINE V DisplayFromEncoded(D d, V x) const {
     88     auto low = Mul(Set(d, kInvMulLow), x);
     89     auto hi = FastPowf(d, MulAdd(x, Set(d, kInvMulHi), Set(d, kInvAdd)),
     90                        Set(d, kInvPowHi));
     91     return IfThenElse(Lt(x, Set(d, kInvThresh)), low, hi);
     92   }
     93 
     94  private:
     95   static constexpr double kThresh = 0.018;
     96   static constexpr double kMulLow = 4.5;
     97   static constexpr double kMulHi = 1.099;
     98   static constexpr double kPowHi = 0.45;
     99   static constexpr double kSub = -0.099;
    100 
    101   static constexpr double kInvThresh = 0.081;
    102   static constexpr double kInvMulLow = 1 / 4.5;
    103   static constexpr double kInvMulHi = 1 / 1.099;
    104   static constexpr double kInvPowHi = 1 / 0.45;
    105   static constexpr double kInvAdd = 0.099 * kInvMulHi;
    106 };
    107 
    108 // Perceptual Quantization
    109 class TF_PQ : TF_PQ_Base {
    110  public:
    111   explicit TF_PQ(float display_intensity_target = kDefaultIntensityTarget)
    112       : display_scaling_factor_to_10000_nits_(display_intensity_target *
    113                                               (1.0f / 10000.0f)),
    114         display_scaling_factor_from_10000_nits_(10000.0f /
    115                                                 display_intensity_target) {}
    116 
    117   // Maximum error 3e-6
    118   template <class D, class V>
    119   JXL_INLINE V DisplayFromEncoded(D d, V x) const {
    120     const hwy::HWY_NAMESPACE::Rebind<uint32_t, D> du;
    121     const V kSign = BitCast(d, Set(du, 0x80000000u));
    122     const V original_sign = And(x, kSign);
    123     x = AndNot(kSign, x);  // abs
    124     // 4-over-4-degree rational polynomial approximation on x+x*x. This improves
    125     // the maximum error by about 5x over a rational polynomial for x.
    126     auto xpxx = MulAdd(x, x, x);
    127     HWY_ALIGN constexpr float p[(4 + 1) * 4] = {
    128         HWY_REP4(2.62975656e-04f), HWY_REP4(-6.23553089e-03f),
    129         HWY_REP4(7.38602301e-01f), HWY_REP4(2.64553172e+00f),
    130         HWY_REP4(5.50034862e-01f),
    131     };
    132     HWY_ALIGN constexpr float q[(4 + 1) * 4] = {
    133         HWY_REP4(4.21350107e+02f), HWY_REP4(-4.28736818e+02f),
    134         HWY_REP4(1.74364667e+02f), HWY_REP4(-3.39078883e+01f),
    135         HWY_REP4(2.67718770e+00f),
    136     };
    137     auto magnitude = EvalRationalPolynomial(d, xpxx, p, q);
    138     return Or(
    139         AndNot(kSign,
    140                Mul(magnitude, Set(d, display_scaling_factor_from_10000_nits_))),
    141         original_sign);
    142   }
    143 
    144   // Maximum error 7e-7.
    145   template <class D, class V>
    146   JXL_INLINE V EncodedFromDisplay(D d, V x) const {
    147     const hwy::HWY_NAMESPACE::Rebind<uint32_t, D> du;
    148     const V kSign = BitCast(d, Set(du, 0x80000000u));
    149     const V original_sign = And(x, kSign);
    150     x = AndNot(kSign, x);  // abs
    151     // 4-over-4-degree rational polynomial approximation on x**0.25, with two
    152     // different polynomials above and below 1e-4.
    153     auto xto025 =
    154         Sqrt(Sqrt(Mul(x, Set(d, display_scaling_factor_to_10000_nits_))));
    155     HWY_ALIGN constexpr float p[(4 + 1) * 4] = {
    156         HWY_REP4(1.351392e-02f), HWY_REP4(-1.095778e+00f),
    157         HWY_REP4(5.522776e+01f), HWY_REP4(1.492516e+02f),
    158         HWY_REP4(4.838434e+01f),
    159     };
    160     HWY_ALIGN constexpr float q[(4 + 1) * 4] = {
    161         HWY_REP4(1.012416e+00f), HWY_REP4(2.016708e+01f),
    162         HWY_REP4(9.263710e+01f), HWY_REP4(1.120607e+02f),
    163         HWY_REP4(2.590418e+01f),
    164     };
    165 
    166     HWY_ALIGN constexpr float plo[(4 + 1) * 4] = {
    167         HWY_REP4(9.863406e-06f),  HWY_REP4(3.881234e-01f),
    168         HWY_REP4(1.352821e+02f),  HWY_REP4(6.889862e+04f),
    169         HWY_REP4(-2.864824e+05f),
    170     };
    171     HWY_ALIGN constexpr float qlo[(4 + 1) * 4] = {
    172         HWY_REP4(3.371868e+01f),  HWY_REP4(1.477719e+03f),
    173         HWY_REP4(1.608477e+04f),  HWY_REP4(-4.389884e+04f),
    174         HWY_REP4(-2.072546e+05f),
    175     };
    176 
    177     auto magnitude = IfThenElse(Lt(x, Set(d, 1e-4f)),
    178                                 EvalRationalPolynomial(d, xto025, plo, qlo),
    179                                 EvalRationalPolynomial(d, xto025, p, q));
    180     return Or(AndNot(kSign, magnitude), original_sign);
    181   }
    182 
    183  private:
    184   const float display_scaling_factor_to_10000_nits_;
    185   const float display_scaling_factor_from_10000_nits_;
    186 };
    187 
    188 // sRGB
    189 class TF_SRGB {
    190  public:
    191   template <typename V>
    192   JXL_INLINE V DisplayFromEncoded(V x) const {
    193     const HWY_FULL(float) d;
    194     const HWY_FULL(uint32_t) du;
    195     const V kSign = BitCast(d, Set(du, 0x80000000u));
    196     const V original_sign = And(x, kSign);
    197     x = AndNot(kSign, x);  // abs
    198 
    199     // TODO(janwas): range reduction
    200     // Computed via af_cheb_rational (k=100); replicated 4x.
    201     HWY_ALIGN constexpr float p[(4 + 1) * 4] = {
    202         2.200248328e-04f, 2.200248328e-04f, 2.200248328e-04f, 2.200248328e-04f,
    203         1.043637593e-02f, 1.043637593e-02f, 1.043637593e-02f, 1.043637593e-02f,
    204         1.624820318e-01f, 1.624820318e-01f, 1.624820318e-01f, 1.624820318e-01f,
    205         7.961564959e-01f, 7.961564959e-01f, 7.961564959e-01f, 7.961564959e-01f,
    206         8.210152774e-01f, 8.210152774e-01f, 8.210152774e-01f, 8.210152774e-01f,
    207     };
    208     HWY_ALIGN constexpr float q[(4 + 1) * 4] = {
    209         2.631846970e-01f,  2.631846970e-01f,  2.631846970e-01f,
    210         2.631846970e-01f,  1.076976492e+00f,  1.076976492e+00f,
    211         1.076976492e+00f,  1.076976492e+00f,  4.987528350e-01f,
    212         4.987528350e-01f,  4.987528350e-01f,  4.987528350e-01f,
    213         -5.512498495e-02f, -5.512498495e-02f, -5.512498495e-02f,
    214         -5.512498495e-02f, 6.521209011e-03f,  6.521209011e-03f,
    215         6.521209011e-03f,  6.521209011e-03f,
    216     };
    217     const V linear = Mul(x, Set(d, kLowDivInv));
    218     const V poly = EvalRationalPolynomial(d, x, p, q);
    219     const V magnitude =
    220         IfThenElse(Gt(x, Set(d, kThreshSRGBToLinear)), poly, linear);
    221     return Or(AndNot(kSign, magnitude), original_sign);
    222   }
    223 
    224   // Error ~5e-07
    225   template <class D, class V>
    226   JXL_INLINE V EncodedFromDisplay(D d, V x) const {
    227     const hwy::HWY_NAMESPACE::Rebind<uint32_t, D> du;
    228     const V kSign = BitCast(d, Set(du, 0x80000000u));
    229     const V original_sign = And(x, kSign);
    230     x = AndNot(kSign, x);  // abs
    231 
    232     // Computed via af_cheb_rational (k=100); replicated 4x.
    233     HWY_ALIGN constexpr float p[(4 + 1) * 4] = {
    234         -5.135152395e-04f, -5.135152395e-04f, -5.135152395e-04f,
    235         -5.135152395e-04f, 5.287254571e-03f,  5.287254571e-03f,
    236         5.287254571e-03f,  5.287254571e-03f,  3.903842876e-01f,
    237         3.903842876e-01f,  3.903842876e-01f,  3.903842876e-01f,
    238         1.474205315e+00f,  1.474205315e+00f,  1.474205315e+00f,
    239         1.474205315e+00f,  7.352629620e-01f,  7.352629620e-01f,
    240         7.352629620e-01f,  7.352629620e-01f,
    241     };
    242     HWY_ALIGN constexpr float q[(4 + 1) * 4] = {
    243         1.004519624e-02f, 1.004519624e-02f, 1.004519624e-02f, 1.004519624e-02f,
    244         3.036675394e-01f, 3.036675394e-01f, 3.036675394e-01f, 3.036675394e-01f,
    245         1.340816930e+00f, 1.340816930e+00f, 1.340816930e+00f, 1.340816930e+00f,
    246         9.258482155e-01f, 9.258482155e-01f, 9.258482155e-01f, 9.258482155e-01f,
    247         2.424867759e-02f, 2.424867759e-02f, 2.424867759e-02f, 2.424867759e-02f,
    248     };
    249     const V linear = Mul(x, Set(d, kLowDiv));
    250     const V poly = EvalRationalPolynomial(d, Sqrt(x), p, q);
    251     const V magnitude =
    252         IfThenElse(Gt(x, Set(d, kThreshLinearToSRGB)), poly, linear);
    253     return Or(AndNot(kSign, magnitude), original_sign);
    254   }
    255 
    256  private:
    257   static constexpr float kThreshSRGBToLinear = 0.04045f;
    258   static constexpr float kThreshLinearToSRGB = 0.0031308f;
    259   static constexpr float kLowDiv = 12.92f;
    260   static constexpr float kLowDivInv = 1.0f / kLowDiv;
    261 };
    262 
    263 // Linear to sRGB conversion with error of at most 1.2e-4.
    264 template <typename D, typename V>
    265 V FastLinearToSRGB(D d, V v) {
    266   const hwy::HWY_NAMESPACE::Rebind<uint32_t, D> du;
    267   const hwy::HWY_NAMESPACE::Rebind<int32_t, D> di;
    268   // Convert to 0.25 - 0.5 range.
    269   auto v025_05 = BitCast(
    270       d, And(Or(BitCast(du, v), Set(du, 0x3e800000)), Set(du, 0x3effffff)));
    271   // third degree polynomial approximation between 0.25 and 0.5
    272   // of 1.055/2^(7/2.4) * x^(1/2.4) * 0.5. A degree 4 polynomial only improves
    273   // accuracy by about 3x.
    274   auto d1 = MulAdd(v025_05, Set(d, 0.059914046f), Set(d, -0.108894556f));
    275   auto d2 = MulAdd(d1, v025_05, Set(d, 0.107963754f));
    276   auto pow = MulAdd(d2, v025_05, Set(d, 0.018092343f));
    277   // Compute extra multiplier depending on exponent. Valid exponent range for
    278   // [0.0031308f, 1.0) is 0...8 after subtracting 118.
    279   // The next three constants contain a representation of the powers of
    280   // 2**(1/2.4) = 2**(5/12) times two; in particular, bits from 26 to 31 are
    281   // always the same and in k2to512powers_basebits, and the two arrays contain
    282   // the next groups of 8 bits. This ends up being a 22-bit representation (with
    283   // a mantissa of 13 bits). The choice of polynomial to approximate is such
    284   // that the multiplication factor has the highest 5 bits constant, and that
    285   // the factor for the lowest possible exponent is a power of two (thus making
    286   // the additional bits 0, which is used to correctly merge back together the
    287   // floats).
    288   constexpr uint32_t k2to512powers_basebits = 0x40000000;
    289   HWY_ALIGN constexpr uint8_t k2to512powers_25to18bits[16] = {
    290       0x0,  0xa,  0x19, 0x26, 0x32, 0x41, 0x4d, 0x5c,
    291       0x68, 0x75, 0x83, 0x8f, 0xa0, 0xaa, 0xb9, 0xc6,
    292   };
    293   HWY_ALIGN constexpr uint8_t k2to512powers_17to10bits[16] = {
    294       0x0,  0xb7, 0x4,  0xd,  0xcb, 0xe7, 0x41, 0x68,
    295       0x51, 0xd1, 0xeb, 0xf2, 0x0,  0xb7, 0x4,  0xd,
    296   };
    297   // Note that vld1q_s8_x2 on ARM seems to actually be slower.
    298 #if HWY_TARGET != HWY_SCALAR
    299   using hwy::HWY_NAMESPACE::ShiftLeft;
    300   using hwy::HWY_NAMESPACE::ShiftRight;
    301   // Every lane of exp is now (if cast to byte) {0, 0, 0, <index for lookup>}.
    302   auto exp = Sub(ShiftRight<23>(BitCast(di, v)), Set(di, 118));
    303   auto pow25to18bits = TableLookupBytes(
    304       LoadDup128(di,
    305                  reinterpret_cast<const int32_t*>(k2to512powers_25to18bits)),
    306       exp);
    307   auto pow17to10bits = TableLookupBytes(
    308       LoadDup128(di,
    309                  reinterpret_cast<const int32_t*>(k2to512powers_17to10bits)),
    310       exp);
    311   // Now, pow* contain {0, 0, 0, <part of float repr of multiplier>}. Here
    312   // we take advantage of the fact that each table has its position 0 equal to
    313   // 0.
    314   // We can now just reassemble the float.
    315   auto mul = BitCast(
    316       d, Or(Or(ShiftLeft<18>(pow25to18bits), ShiftLeft<10>(pow17to10bits)),
    317             Set(di, k2to512powers_basebits)));
    318 #else
    319   // Fallback for scalar.
    320   uint32_t exp = ((BitCast(di, v).raw >> 23) - 118) & 0xf;
    321   auto mul = BitCast(d, Set(di, (k2to512powers_25to18bits[exp] << 18) |
    322                                     (k2to512powers_17to10bits[exp] << 10) |
    323                                     k2to512powers_basebits));
    324 #endif
    325   return IfThenElse(Lt(v, Set(d, 0.0031308f)), Mul(v, Set(d, 12.92f)),
    326                     MulAdd(pow, mul, Set(d, -0.055)));
    327 }
    328 
    329 // NOLINTNEXTLINE(google-readability-namespace-comments)
    330 }  // namespace HWY_NAMESPACE
    331 }  // namespace jxl
    332 HWY_AFTER_NAMESPACE();
    333 
    334 #endif  // LIB_JXL_CMS_TRANSFER_FUNCTIONS_INL_H_
	libjxl FORK: libjxl patches used on blog
	git clone https://git.neptards.moe/blog/libjxl.git
	Log \| Files \| Refs \| Submodules \| README \| LICENSE