transfer_functions-inl.h (13873B)
1 // Copyright (c) the JPEG XL Project Authors. All rights reserved. 2 // 3 // Use of this source code is governed by a BSD-style 4 // license that can be found in the LICENSE file. 5 6 // Transfer functions for color encodings. 7 8 #if defined(LIB_JXL_CMS_TRANSFER_FUNCTIONS_INL_H_) == defined(HWY_TARGET_TOGGLE) 9 #ifdef LIB_JXL_CMS_TRANSFER_FUNCTIONS_INL_H_ 10 #undef LIB_JXL_CMS_TRANSFER_FUNCTIONS_INL_H_ 11 #else 12 #define LIB_JXL_CMS_TRANSFER_FUNCTIONS_INL_H_ 13 #endif 14 15 #include <algorithm> 16 #include <cmath> 17 #include <hwy/highway.h> 18 19 #include "lib/jxl/base/compiler_specific.h" 20 #include "lib/jxl/base/fast_math-inl.h" 21 #include "lib/jxl/base/rational_polynomial-inl.h" 22 #include "lib/jxl/base/status.h" 23 #include "lib/jxl/cms/transfer_functions.h" 24 25 HWY_BEFORE_NAMESPACE(); 26 namespace jxl { 27 namespace HWY_NAMESPACE { 28 29 // These templates are not found via ADL. 30 using hwy::HWY_NAMESPACE::And; 31 using hwy::HWY_NAMESPACE::AndNot; 32 using hwy::HWY_NAMESPACE::Gt; 33 using hwy::HWY_NAMESPACE::IfThenElse; 34 using hwy::HWY_NAMESPACE::Lt; 35 using hwy::HWY_NAMESPACE::Or; 36 using hwy::HWY_NAMESPACE::Sqrt; 37 using hwy::HWY_NAMESPACE::TableLookupBytes; 38 39 // Definitions for BT.2100-2 transfer functions (used inside/outside SIMD): 40 // "display" is linear light (nits) normalized to [0, 1]. 41 // "encoded" is a nonlinear encoding (e.g. PQ) in [0, 1]. 42 // "scene" is a linear function of photon counts, normalized to [0, 1]. 43 44 // Despite the stated ranges, we need unbounded transfer functions: see 45 // http://www.littlecms.com/CIC18_UnboundedCMM.pdf. Inputs can be negative or 46 // above 1 due to chromatic adaptation. To avoid severe round-trip errors caused 47 // by clamping, we mirror negative inputs via copysign (f(-x) = -f(x), see 48 // https://developer.apple.com/documentation/coregraphics/cgcolorspace/1644735-extendedsrgb) 49 // and extend the function domains above 1. 50 51 // Hybrid Log-Gamma. 52 class TF_HLG : TF_HLG_Base { 53 public: 54 // Maximum error 5e-7. 55 template <class D, class V> 56 JXL_INLINE V EncodedFromDisplay(D d, V x) const { 57 const hwy::HWY_NAMESPACE::Rebind<uint32_t, D> du; 58 const V kSign = BitCast(d, Set(du, 0x80000000u)); 59 const V original_sign = And(x, kSign); 60 x = AndNot(kSign, x); // abs 61 const V below_div12 = Sqrt(Mul(Set(d, 3.0f), x)); 62 const V e = 63 MulAdd(Set(d, kA * 0.693147181f), 64 FastLog2f(d, MulAdd(Set(d, 12), x, Set(d, -kB))), Set(d, kC)); 65 const V magnitude = IfThenElse(Le(x, Set(d, kDiv12)), below_div12, e); 66 return Or(AndNot(kSign, magnitude), original_sign); 67 } 68 }; 69 70 class TF_709 { 71 public: 72 JXL_INLINE double EncodedFromDisplay(const double d) const { 73 if (d < kThresh) return kMulLow * d; 74 return kMulHi * std::pow(d, kPowHi) + kSub; 75 } 76 77 // Maximum error 1e-6. 78 template <class D, class V> 79 JXL_INLINE V EncodedFromDisplay(D d, V x) const { 80 auto low = Mul(Set(d, kMulLow), x); 81 auto hi = 82 MulAdd(Set(d, kMulHi), FastPowf(d, x, Set(d, kPowHi)), Set(d, kSub)); 83 return IfThenElse(Le(x, Set(d, kThresh)), low, hi); 84 } 85 86 template <class D, class V> 87 JXL_INLINE V DisplayFromEncoded(D d, V x) const { 88 auto low = Mul(Set(d, kInvMulLow), x); 89 auto hi = FastPowf(d, MulAdd(x, Set(d, kInvMulHi), Set(d, kInvAdd)), 90 Set(d, kInvPowHi)); 91 return IfThenElse(Lt(x, Set(d, kInvThresh)), low, hi); 92 } 93 94 private: 95 static constexpr double kThresh = 0.018; 96 static constexpr double kMulLow = 4.5; 97 static constexpr double kMulHi = 1.099; 98 static constexpr double kPowHi = 0.45; 99 static constexpr double kSub = -0.099; 100 101 static constexpr double kInvThresh = 0.081; 102 static constexpr double kInvMulLow = 1 / 4.5; 103 static constexpr double kInvMulHi = 1 / 1.099; 104 static constexpr double kInvPowHi = 1 / 0.45; 105 static constexpr double kInvAdd = 0.099 * kInvMulHi; 106 }; 107 108 // Perceptual Quantization 109 class TF_PQ : TF_PQ_Base { 110 public: 111 explicit TF_PQ(float display_intensity_target = kDefaultIntensityTarget) 112 : display_scaling_factor_to_10000_nits_(display_intensity_target * 113 (1.0f / 10000.0f)), 114 display_scaling_factor_from_10000_nits_(10000.0f / 115 display_intensity_target) {} 116 117 // Maximum error 3e-6 118 template <class D, class V> 119 JXL_INLINE V DisplayFromEncoded(D d, V x) const { 120 const hwy::HWY_NAMESPACE::Rebind<uint32_t, D> du; 121 const V kSign = BitCast(d, Set(du, 0x80000000u)); 122 const V original_sign = And(x, kSign); 123 x = AndNot(kSign, x); // abs 124 // 4-over-4-degree rational polynomial approximation on x+x*x. This improves 125 // the maximum error by about 5x over a rational polynomial for x. 126 auto xpxx = MulAdd(x, x, x); 127 HWY_ALIGN constexpr float p[(4 + 1) * 4] = { 128 HWY_REP4(2.62975656e-04f), HWY_REP4(-6.23553089e-03f), 129 HWY_REP4(7.38602301e-01f), HWY_REP4(2.64553172e+00f), 130 HWY_REP4(5.50034862e-01f), 131 }; 132 HWY_ALIGN constexpr float q[(4 + 1) * 4] = { 133 HWY_REP4(4.21350107e+02f), HWY_REP4(-4.28736818e+02f), 134 HWY_REP4(1.74364667e+02f), HWY_REP4(-3.39078883e+01f), 135 HWY_REP4(2.67718770e+00f), 136 }; 137 auto magnitude = EvalRationalPolynomial(d, xpxx, p, q); 138 return Or( 139 AndNot(kSign, 140 Mul(magnitude, Set(d, display_scaling_factor_from_10000_nits_))), 141 original_sign); 142 } 143 144 // Maximum error 7e-7. 145 template <class D, class V> 146 JXL_INLINE V EncodedFromDisplay(D d, V x) const { 147 const hwy::HWY_NAMESPACE::Rebind<uint32_t, D> du; 148 const V kSign = BitCast(d, Set(du, 0x80000000u)); 149 const V original_sign = And(x, kSign); 150 x = AndNot(kSign, x); // abs 151 // 4-over-4-degree rational polynomial approximation on x**0.25, with two 152 // different polynomials above and below 1e-4. 153 auto xto025 = 154 Sqrt(Sqrt(Mul(x, Set(d, display_scaling_factor_to_10000_nits_)))); 155 HWY_ALIGN constexpr float p[(4 + 1) * 4] = { 156 HWY_REP4(1.351392e-02f), HWY_REP4(-1.095778e+00f), 157 HWY_REP4(5.522776e+01f), HWY_REP4(1.492516e+02f), 158 HWY_REP4(4.838434e+01f), 159 }; 160 HWY_ALIGN constexpr float q[(4 + 1) * 4] = { 161 HWY_REP4(1.012416e+00f), HWY_REP4(2.016708e+01f), 162 HWY_REP4(9.263710e+01f), HWY_REP4(1.120607e+02f), 163 HWY_REP4(2.590418e+01f), 164 }; 165 166 HWY_ALIGN constexpr float plo[(4 + 1) * 4] = { 167 HWY_REP4(9.863406e-06f), HWY_REP4(3.881234e-01f), 168 HWY_REP4(1.352821e+02f), HWY_REP4(6.889862e+04f), 169 HWY_REP4(-2.864824e+05f), 170 }; 171 HWY_ALIGN constexpr float qlo[(4 + 1) * 4] = { 172 HWY_REP4(3.371868e+01f), HWY_REP4(1.477719e+03f), 173 HWY_REP4(1.608477e+04f), HWY_REP4(-4.389884e+04f), 174 HWY_REP4(-2.072546e+05f), 175 }; 176 177 auto magnitude = IfThenElse(Lt(x, Set(d, 1e-4f)), 178 EvalRationalPolynomial(d, xto025, plo, qlo), 179 EvalRationalPolynomial(d, xto025, p, q)); 180 return Or(AndNot(kSign, magnitude), original_sign); 181 } 182 183 private: 184 const float display_scaling_factor_to_10000_nits_; 185 const float display_scaling_factor_from_10000_nits_; 186 }; 187 188 // sRGB 189 class TF_SRGB { 190 public: 191 template <typename V> 192 JXL_INLINE V DisplayFromEncoded(V x) const { 193 const HWY_FULL(float) d; 194 const HWY_FULL(uint32_t) du; 195 const V kSign = BitCast(d, Set(du, 0x80000000u)); 196 const V original_sign = And(x, kSign); 197 x = AndNot(kSign, x); // abs 198 199 // TODO(janwas): range reduction 200 // Computed via af_cheb_rational (k=100); replicated 4x. 201 HWY_ALIGN constexpr float p[(4 + 1) * 4] = { 202 2.200248328e-04f, 2.200248328e-04f, 2.200248328e-04f, 2.200248328e-04f, 203 1.043637593e-02f, 1.043637593e-02f, 1.043637593e-02f, 1.043637593e-02f, 204 1.624820318e-01f, 1.624820318e-01f, 1.624820318e-01f, 1.624820318e-01f, 205 7.961564959e-01f, 7.961564959e-01f, 7.961564959e-01f, 7.961564959e-01f, 206 8.210152774e-01f, 8.210152774e-01f, 8.210152774e-01f, 8.210152774e-01f, 207 }; 208 HWY_ALIGN constexpr float q[(4 + 1) * 4] = { 209 2.631846970e-01f, 2.631846970e-01f, 2.631846970e-01f, 210 2.631846970e-01f, 1.076976492e+00f, 1.076976492e+00f, 211 1.076976492e+00f, 1.076976492e+00f, 4.987528350e-01f, 212 4.987528350e-01f, 4.987528350e-01f, 4.987528350e-01f, 213 -5.512498495e-02f, -5.512498495e-02f, -5.512498495e-02f, 214 -5.512498495e-02f, 6.521209011e-03f, 6.521209011e-03f, 215 6.521209011e-03f, 6.521209011e-03f, 216 }; 217 const V linear = Mul(x, Set(d, kLowDivInv)); 218 const V poly = EvalRationalPolynomial(d, x, p, q); 219 const V magnitude = 220 IfThenElse(Gt(x, Set(d, kThreshSRGBToLinear)), poly, linear); 221 return Or(AndNot(kSign, magnitude), original_sign); 222 } 223 224 // Error ~5e-07 225 template <class D, class V> 226 JXL_INLINE V EncodedFromDisplay(D d, V x) const { 227 const hwy::HWY_NAMESPACE::Rebind<uint32_t, D> du; 228 const V kSign = BitCast(d, Set(du, 0x80000000u)); 229 const V original_sign = And(x, kSign); 230 x = AndNot(kSign, x); // abs 231 232 // Computed via af_cheb_rational (k=100); replicated 4x. 233 HWY_ALIGN constexpr float p[(4 + 1) * 4] = { 234 -5.135152395e-04f, -5.135152395e-04f, -5.135152395e-04f, 235 -5.135152395e-04f, 5.287254571e-03f, 5.287254571e-03f, 236 5.287254571e-03f, 5.287254571e-03f, 3.903842876e-01f, 237 3.903842876e-01f, 3.903842876e-01f, 3.903842876e-01f, 238 1.474205315e+00f, 1.474205315e+00f, 1.474205315e+00f, 239 1.474205315e+00f, 7.352629620e-01f, 7.352629620e-01f, 240 7.352629620e-01f, 7.352629620e-01f, 241 }; 242 HWY_ALIGN constexpr float q[(4 + 1) * 4] = { 243 1.004519624e-02f, 1.004519624e-02f, 1.004519624e-02f, 1.004519624e-02f, 244 3.036675394e-01f, 3.036675394e-01f, 3.036675394e-01f, 3.036675394e-01f, 245 1.340816930e+00f, 1.340816930e+00f, 1.340816930e+00f, 1.340816930e+00f, 246 9.258482155e-01f, 9.258482155e-01f, 9.258482155e-01f, 9.258482155e-01f, 247 2.424867759e-02f, 2.424867759e-02f, 2.424867759e-02f, 2.424867759e-02f, 248 }; 249 const V linear = Mul(x, Set(d, kLowDiv)); 250 const V poly = EvalRationalPolynomial(d, Sqrt(x), p, q); 251 const V magnitude = 252 IfThenElse(Gt(x, Set(d, kThreshLinearToSRGB)), poly, linear); 253 return Or(AndNot(kSign, magnitude), original_sign); 254 } 255 256 private: 257 static constexpr float kThreshSRGBToLinear = 0.04045f; 258 static constexpr float kThreshLinearToSRGB = 0.0031308f; 259 static constexpr float kLowDiv = 12.92f; 260 static constexpr float kLowDivInv = 1.0f / kLowDiv; 261 }; 262 263 // Linear to sRGB conversion with error of at most 1.2e-4. 264 template <typename D, typename V> 265 V FastLinearToSRGB(D d, V v) { 266 const hwy::HWY_NAMESPACE::Rebind<uint32_t, D> du; 267 const hwy::HWY_NAMESPACE::Rebind<int32_t, D> di; 268 // Convert to 0.25 - 0.5 range. 269 auto v025_05 = BitCast( 270 d, And(Or(BitCast(du, v), Set(du, 0x3e800000)), Set(du, 0x3effffff))); 271 // third degree polynomial approximation between 0.25 and 0.5 272 // of 1.055/2^(7/2.4) * x^(1/2.4) * 0.5. A degree 4 polynomial only improves 273 // accuracy by about 3x. 274 auto d1 = MulAdd(v025_05, Set(d, 0.059914046f), Set(d, -0.108894556f)); 275 auto d2 = MulAdd(d1, v025_05, Set(d, 0.107963754f)); 276 auto pow = MulAdd(d2, v025_05, Set(d, 0.018092343f)); 277 // Compute extra multiplier depending on exponent. Valid exponent range for 278 // [0.0031308f, 1.0) is 0...8 after subtracting 118. 279 // The next three constants contain a representation of the powers of 280 // 2**(1/2.4) = 2**(5/12) times two; in particular, bits from 26 to 31 are 281 // always the same and in k2to512powers_basebits, and the two arrays contain 282 // the next groups of 8 bits. This ends up being a 22-bit representation (with 283 // a mantissa of 13 bits). The choice of polynomial to approximate is such 284 // that the multiplication factor has the highest 5 bits constant, and that 285 // the factor for the lowest possible exponent is a power of two (thus making 286 // the additional bits 0, which is used to correctly merge back together the 287 // floats). 288 constexpr uint32_t k2to512powers_basebits = 0x40000000; 289 HWY_ALIGN constexpr uint8_t k2to512powers_25to18bits[16] = { 290 0x0, 0xa, 0x19, 0x26, 0x32, 0x41, 0x4d, 0x5c, 291 0x68, 0x75, 0x83, 0x8f, 0xa0, 0xaa, 0xb9, 0xc6, 292 }; 293 HWY_ALIGN constexpr uint8_t k2to512powers_17to10bits[16] = { 294 0x0, 0xb7, 0x4, 0xd, 0xcb, 0xe7, 0x41, 0x68, 295 0x51, 0xd1, 0xeb, 0xf2, 0x0, 0xb7, 0x4, 0xd, 296 }; 297 // Note that vld1q_s8_x2 on ARM seems to actually be slower. 298 #if HWY_TARGET != HWY_SCALAR 299 using hwy::HWY_NAMESPACE::ShiftLeft; 300 using hwy::HWY_NAMESPACE::ShiftRight; 301 // Every lane of exp is now (if cast to byte) {0, 0, 0, <index for lookup>}. 302 auto exp = Sub(ShiftRight<23>(BitCast(di, v)), Set(di, 118)); 303 auto pow25to18bits = TableLookupBytes( 304 LoadDup128(di, 305 reinterpret_cast<const int32_t*>(k2to512powers_25to18bits)), 306 exp); 307 auto pow17to10bits = TableLookupBytes( 308 LoadDup128(di, 309 reinterpret_cast<const int32_t*>(k2to512powers_17to10bits)), 310 exp); 311 // Now, pow* contain {0, 0, 0, <part of float repr of multiplier>}. Here 312 // we take advantage of the fact that each table has its position 0 equal to 313 // 0. 314 // We can now just reassemble the float. 315 auto mul = BitCast( 316 d, Or(Or(ShiftLeft<18>(pow25to18bits), ShiftLeft<10>(pow17to10bits)), 317 Set(di, k2to512powers_basebits))); 318 #else 319 // Fallback for scalar. 320 uint32_t exp = ((BitCast(di, v).raw >> 23) - 118) & 0xf; 321 auto mul = BitCast(d, Set(di, (k2to512powers_25to18bits[exp] << 18) | 322 (k2to512powers_17to10bits[exp] << 10) | 323 k2to512powers_basebits)); 324 #endif 325 return IfThenElse(Lt(v, Set(d, 0.0031308f)), Mul(v, Set(d, 12.92f)), 326 MulAdd(pow, mul, Set(d, -0.055))); 327 } 328 329 // NOLINTNEXTLINE(google-readability-namespace-comments) 330 } // namespace HWY_NAMESPACE 331 } // namespace jxl 332 HWY_AFTER_NAMESPACE(); 333 334 #endif // LIB_JXL_CMS_TRANSFER_FUNCTIONS_INL_H_