dec_xyb-inl.h (14318B)
1 // Copyright (c) the JPEG XL Project Authors. All rights reserved. 2 // 3 // Use of this source code is governed by a BSD-style 4 // license that can be found in the LICENSE file. 5 6 // XYB -> linear sRGB helper function. 7 8 #if defined(LIB_JXL_DEC_XYB_INL_H_) == defined(HWY_TARGET_TOGGLE) 9 #ifdef LIB_JXL_DEC_XYB_INL_H_ 10 #undef LIB_JXL_DEC_XYB_INL_H_ 11 #else 12 #define LIB_JXL_DEC_XYB_INL_H_ 13 #endif 14 15 #include <hwy/highway.h> 16 17 #include "lib/jxl/dec_xyb.h" 18 HWY_BEFORE_NAMESPACE(); 19 namespace jxl { 20 namespace HWY_NAMESPACE { 21 namespace { 22 23 // These templates are not found via ADL. 24 using hwy::HWY_NAMESPACE::Add; 25 using hwy::HWY_NAMESPACE::Broadcast; 26 using hwy::HWY_NAMESPACE::Mul; 27 using hwy::HWY_NAMESPACE::MulAdd; 28 using hwy::HWY_NAMESPACE::Sub; 29 30 // Inverts the pixel-wise RGB->XYB conversion in OpsinDynamicsImage() (including 31 // the gamma mixing and simple gamma). Avoids clamping to [0, 1] - out of (sRGB) 32 // gamut values may be in-gamut after transforming to a wider space. 33 // "inverse_matrix" points to 9 broadcasted vectors, which are the 3x3 entries 34 // of the (row-major) opsin absorbance matrix inverse. Pre-multiplying its 35 // entries by c is equivalent to multiplying linear_* by c afterwards. 
// Converts one vector of XYB samples to linear RGB (not clamped to [0, 1]).
// D is the highway descriptor type, V the corresponding vector type.
// Outputs are written through the three HWY_RESTRICT pointers.
template <class D, class V>
HWY_INLINE HWY_MAYBE_UNUSED void XybToRgb(D d, const V opsin_x, const V opsin_y,
                                          const V opsin_b,
                                          const OpsinParams& opsin_params,
                                          V* const HWY_RESTRICT linear_r,
                                          V* const HWY_RESTRICT linear_g,
                                          V* const HWY_RESTRICT linear_b) {
#if HWY_TARGET == HWY_SCALAR
  // Scalar target has no 128-bit lanes to broadcast from: load each bias
  // component individually.
  const auto neg_bias_r = Set(d, opsin_params.opsin_biases[0]);
  const auto neg_bias_g = Set(d, opsin_params.opsin_biases[1]);
  const auto neg_bias_b = Set(d, opsin_params.opsin_biases[2]);
#else
  // One 128-bit load of the {r, g, b, pad} bias quad, then splat each lane.
  const auto neg_bias_rgb = LoadDup128(d, opsin_params.opsin_biases);
  const auto neg_bias_r = Broadcast<0>(neg_bias_rgb);
  const auto neg_bias_g = Broadcast<1>(neg_bias_rgb);
  const auto neg_bias_b = Broadcast<2>(neg_bias_rgb);
#endif

  // Color space: XYB -> RGB.
  // X is the R-G opponent channel, Y the luma-like channel: r = y + x,
  // g = y - x; b passes through.
  auto gamma_r = Add(opsin_y, opsin_x);
  auto gamma_g = Sub(opsin_y, opsin_x);
  auto gamma_b = opsin_b;

  // Remove the cube-root-domain bias before undoing the gamma.
  gamma_r = Sub(gamma_r, Set(d, opsin_params.opsin_biases_cbrt[0]));
  gamma_g = Sub(gamma_g, Set(d, opsin_params.opsin_biases_cbrt[1]));
  gamma_b = Sub(gamma_b, Set(d, opsin_params.opsin_biases_cbrt[2]));

  // Undo gamma compression: linear = gamma^3 for efficiency.
  const auto gamma_r2 = Mul(gamma_r, gamma_r);
  const auto gamma_g2 = Mul(gamma_g, gamma_g);
  const auto gamma_b2 = Mul(gamma_b, gamma_b);
  // gamma^3 + (negative) bias, fused: mixed = gamma^2 * gamma + neg_bias.
  const auto mixed_r = MulAdd(gamma_r2, gamma_r, neg_bias_r);
  const auto mixed_g = MulAdd(gamma_g2, gamma_g, neg_bias_g);
  const auto mixed_b = MulAdd(gamma_b2, gamma_b, neg_bias_b);

  const float* HWY_RESTRICT inverse_matrix = opsin_params.inverse_opsin_matrix;

  // Unmix (multiply by 3x3 inverse_matrix). Each matrix entry is stored as a
  // broadcasted group of 4 floats, hence the stride-4 indices: entry (row,
  // col) lives at inverse_matrix[(3 * row + col) * 4]. The product is
  // accumulated column by column (first the mixed_r column, then mixed_g,
  // then mixed_b) so each output stays in a register.
  // TODO(eustas): ref would be more readable than pointer
  *linear_r = Mul(LoadDup128(d, &inverse_matrix[0 * 4]), mixed_r);
  *linear_g = Mul(LoadDup128(d, &inverse_matrix[3 * 4]), mixed_r);
  *linear_b = Mul(LoadDup128(d, &inverse_matrix[6 * 4]), mixed_r);
  *linear_r = MulAdd(LoadDup128(d, &inverse_matrix[1 * 4]), mixed_g, *linear_r);
  *linear_g = MulAdd(LoadDup128(d, &inverse_matrix[4 * 4]), mixed_g, *linear_g);
  *linear_b = MulAdd(LoadDup128(d, &inverse_matrix[7 * 4]), mixed_g, *linear_b);
  *linear_r = MulAdd(LoadDup128(d, &inverse_matrix[2 * 4]), mixed_b, *linear_r);
  *linear_g = MulAdd(LoadDup128(d, &inverse_matrix[5 * 4]), mixed_b, *linear_g);
  *linear_b = MulAdd(LoadDup128(d, &inverse_matrix[8 * 4]), mixed_b, *linear_b);
}

// Returns whether FastXYBTosRGB8 has a fast path for the current target
// (only the NEON target has one; elsewhere it must not be called).
inline HWY_MAYBE_UNUSED bool HasFastXYBTosRGB8() {
#if HWY_TARGET == HWY_NEON
  return true;
#else
  return false;
#endif
}

// Converts one row of XYB samples directly to interleaved 8-bit sRGB using
// NEON fixed-point arithmetic, 8 pixels per iteration.
// input[0..2] are the X/Y/B planes; input[3] is the alpha plane (may be null,
// in which case alpha defaults to 1.0 when is_rgba is set).
// output receives xsize * (is_rgba ? 4 : 3) bytes.
// NOTE(review): the vld1q_f32 loads always read 4 floats (the "right" load
// re-reads the left half on short tails, but the left load does not) — this
// assumes input rows remain readable up to a multiple of 4 floats; confirm
// against callers' row padding.
inline HWY_MAYBE_UNUSED void FastXYBTosRGB8(const float* input[4],
                                            uint8_t* output, bool is_rgba,
                                            size_t xsize) {
  // This function is very NEON-specific. As such, it uses intrinsics directly.
#if HWY_TARGET == HWY_NEON
  // WARNING: doing fixed point arithmetic correctly is very complicated.
  // Changes to this function should be thoroughly tested.

  // Fixed-point sRGB transfer function (linear -> gamma-encoded).
  // Note that the input is assumed to have 13 bits of mantissa, and the output
  // will have 14 bits.
  auto srgb_tf = [&](int16x8_t v16) {
    // Count of leading zeros doubles as a cheap base-2 exponent estimate.
    int16x8_t clz = vclzq_s16(v16);
    // Convert to [0.25, 0.5) range by left-shifting the mantissa by clz - 2.
    int16x8_t v025_05_16 = vqshlq_s16(v16, vqsubq_s16(clz, vdupq_n_s16(2)));

    // third degree polynomial approximation between 0.25 and 0.5
    // of 1.055/2^(7/2.4) * x^(1/2.4) / 32.
    // poly ~ ((0.95x-1.75)*x+1.72)*x+0.29
    // We actually compute ~ ((0.47x-0.87)*x+0.86)*(2x)+0.29 as 1.75 and 1.72
    // overflow our fixed point representation.

    int16x8_t twov = vqaddq_s16(v025_05_16, v025_05_16);

    // 0.47 * x
    int16x8_t step1 = vqrdmulhq_n_s16(v025_05_16, 15706);
    // - 0.87
    int16x8_t step2 = vsubq_s16(step1, vdupq_n_s16(28546));
    // * x
    int16x8_t step3 = vqrdmulhq_s16(step2, v025_05_16);
    // + 0.86
    int16x8_t step4 = vaddq_s16(step3, vdupq_n_s16(28302));
    // * 2x
    int16x8_t step5 = vqrdmulhq_s16(step4, twov);
    // + 0.29
    int16x8_t mul16 = vaddq_s16(step5, vdupq_n_s16(9485));

    // Recover the exponent the normalization above shifted away.
    int16x8_t exp16 = vsubq_s16(vdupq_n_s16(11), clz);
    // Compute 2**(1/2.4*exp16)/32. Values of exp16 that would overflow are
    // capped to 1.
    // Generated with the following Python script:
    // a = []
    // b = []
    //
    // for i in range(0, 16):
    //   v = 2**(5/12.*i)
    //   v /= 16
    //   v *= 256 * 128
    //   v = int(v)
    //   a.append(v // 256)
    //   b.append(v % 256)
    //
    // print(", ".join("0x%02x" % x for x in a))
    //
    // print(", ".join("0x%02x" % x for x in b))

    // High and low bytes of the 16-bit table entries, looked up separately
    // because the byte-table lookup instruction works on 8-bit lanes.
    HWY_ALIGN constexpr uint8_t k2to512powersm1div32_high[16] = {
        0x08, 0x0a, 0x0e, 0x13, 0x19, 0x21, 0x2d, 0x3c,
        0x50, 0x6b, 0x8f, 0x8f, 0x8f, 0x8f, 0x8f, 0x8f,
    };
    HWY_ALIGN constexpr uint8_t k2to512powersm1div32_low[16] = {
        0x00, 0xad, 0x41, 0x06, 0x65, 0xe7, 0x41, 0x68,
        0xa2, 0xa2, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    };
    // Using the highway implementation here since vqtbl1q is aarch64-only.
    using hwy::HWY_NAMESPACE::Vec128;
    uint8x16_t pow_low =
        TableLookupBytes(
            Vec128<uint8_t, 16>(vld1q_u8(k2to512powersm1div32_low)),
            Vec128<uint8_t, 16>(vreinterpretq_u8_s16(exp16)))
            .raw;
    uint8x16_t pow_high =
        TableLookupBytes(
            Vec128<uint8_t, 16>(vld1q_u8(k2to512powersm1div32_high)),
            Vec128<uint8_t, 16>(vreinterpretq_u8_s16(exp16)))
            .raw;
    // Re-assemble 16-bit entries: shift-left-and-insert merges high bytes
    // (shifted up by 8) into the low bytes.
    int16x8_t pow16 = vreinterpretq_s16_u16(vsliq_n_u16(
        vreinterpretq_u16_u8(pow_low), vreinterpretq_u16_u8(pow_high), 8));

    // approximation of v * 12.92, divided by 2
    // Note that our input is using 13 mantissa bits instead of 15.
    int16x8_t v16_linear = vrshrq_n_s16(vmulq_n_s16(v16, 826), 5);
    // 1.055*pow(v, 1/2.4) - 0.055, divided by 2
    auto v16_pow = vsubq_s16(vqrdmulhq_s16(mul16, pow16), vdupq_n_s16(901));
    // Select per lane between the linear segment and the power segment:
    // threshold is > 0.0031308f (note that v16 has 13 mantissa bits).
    return vbslq_s16(vcgeq_s16(v16, vdupq_n_s16(26)), v16_pow, v16_linear);
  };

  const float* JXL_RESTRICT row_in_x = input[0];
  const float* JXL_RESTRICT row_in_y = input[1];
  const float* JXL_RESTRICT row_in_b = input[2];
  const float* JXL_RESTRICT row_in_a = input[3];
  for (size_t x = 0; x < xsize; x += 8) {
    // Normal ranges for xyb for in-gamut sRGB colors:
    // x: -0.015386 0.028100
    // y: 0.000000 0.845308
    // b: 0.000000 0.845308

    // Load 8 floats as two 4-lane halves and narrow to 16-bit fixed point.
    // The "right" half re-reads the left half (offset 0 instead of 4) when
    // fewer than 8 pixels remain, to avoid reading past the end of the row.
    // We actually want x * 8 to have some extra precision (hence the 18
    // fractional bits here vs. 15 for y and b below).
    // TODO(veluca): consider different approaches here, like vld1q_f32_x2.
    float32x4_t opsin_x_left = vld1q_f32(row_in_x + x);
    int16x4_t opsin_x16_times8_left =
        vqmovn_s32(vcvtq_n_s32_f32(opsin_x_left, 18));
    float32x4_t opsin_x_right =
        vld1q_f32(row_in_x + x + (x + 4 < xsize ? 4 : 0));
    int16x4_t opsin_x16_times8_right =
        vqmovn_s32(vcvtq_n_s32_f32(opsin_x_right, 18));
    int16x8_t opsin_x16_times8 =
        vcombine_s16(opsin_x16_times8_left, opsin_x16_times8_right);

    float32x4_t opsin_y_left = vld1q_f32(row_in_y + x);
    int16x4_t opsin_y16_left = vqmovn_s32(vcvtq_n_s32_f32(opsin_y_left, 15));
    float32x4_t opsin_y_right =
        vld1q_f32(row_in_y + x + (x + 4 < xsize ? 4 : 0));
    int16x4_t opsin_y16_right = vqmovn_s32(vcvtq_n_s32_f32(opsin_y_right, 15));
    int16x8_t opsin_y16 = vcombine_s16(opsin_y16_left, opsin_y16_right);

    float32x4_t opsin_b_left = vld1q_f32(row_in_b + x);
    int16x4_t opsin_b16_left = vqmovn_s32(vcvtq_n_s32_f32(opsin_b_left, 15));
    float32x4_t opsin_b_right =
        vld1q_f32(row_in_b + x + (x + 4 < xsize ? 4 : 0));
    int16x4_t opsin_b16_right = vqmovn_s32(vcvtq_n_s32_f32(opsin_b_right, 15));
    int16x8_t opsin_b16 = vcombine_s16(opsin_b16_left, opsin_b16_right);

    // Fixed-point opsin bias constants (15 fractional bits).
    int16x8_t neg_bias16 = vdupq_n_s16(-124);        // -0.0037930732552754493
    int16x8_t neg_bias_cbrt16 = vdupq_n_s16(-5110);  // -0.155954201
    int16x8_t neg_bias_half16 = vdupq_n_s16(-62);    // neg_bias16 / 2

    // Color space: XYB -> RGB
    // Compute ((y+x-bias_cbrt)^3-(y-x-bias_cbrt)^3)/2,
    // ((y+x-bias_cbrt)^3+(y-x-bias_cbrt)^3)/2+bias, (b-bias_cbrt)^3+bias.
    // Note that ignoring x2 in the formulas below (as x << y) results in
    // errors of at least 3 in the final sRGB values.
    int16x8_t opsin_yp16 = vqsubq_s16(opsin_y16, neg_bias_cbrt16);
    int16x8_t ysq16 = vqrdmulhq_s16(opsin_yp16, opsin_yp16);
    int16x8_t twentyfourx16 = vmulq_n_s16(opsin_x16_times8, 3);
    int16x8_t twentyfourxy16 = vqrdmulhq_s16(opsin_yp16, twentyfourx16);
    int16x8_t threexsq16 =
        vrshrq_n_s16(vqrdmulhq_s16(opsin_x16_times8, twentyfourx16), 6);

    // mixed_r - mixed_g (the odd part of the cubes).
    // We can ignore x^3 here. Note that this is multiplied by 8.
    int16x8_t mixed_rmg16 = vqrdmulhq_s16(twentyfourxy16, opsin_yp16);

    // (mixed_r + mixed_g) / 2: halving adds fold in the /4 scaling for free.
    int16x8_t mixed_rpg_sos_half = vhaddq_s16(ysq16, threexsq16);
    int16x8_t mixed_rpg16 = vhaddq_s16(
        vqrdmulhq_s16(opsin_yp16, mixed_rpg_sos_half), neg_bias_half16);

    // mixed_b = (b - bias_cbrt)^3 + bias.
    int16x8_t gamma_b16 = vqsubq_s16(opsin_b16, neg_bias_cbrt16);
    int16x8_t gamma_bsq16 = vqrdmulhq_s16(gamma_b16, gamma_b16);
    int16x8_t gamma_bcb16 = vqrdmulhq_s16(gamma_bsq16, gamma_b16);
    int16x8_t mixed_b16 = vqaddq_s16(gamma_bcb16, neg_bias16);
    // mixed_rpg and mixed_b are in 0-1 range.
    // mixed_rmg has a smaller range (-0.035 to 0.035 for valid sRGB). Note
    // that at this point it is already multiplied by 8.

    // We multiply all the mixed values by 1/4 (i.e. shift them to 13-bit
    // fixed point) to ensure intermediate quantities are in range. Note that
    // r-g is not shifted, and was x8 before here; this corresponds to a x32
    // overall multiplicative factor and ensures that all the matrix constants
    // are in 0-1 range.
    // Similarly, mixed_rpg16 is already multiplied by 1/4 because of the two
    // vhadd + using neg_bias_half.
    mixed_b16 = vshrq_n_s16(mixed_b16, 2);

    // Unmix (multiply by 3x3 inverse_matrix)
    // For increased precision, we use a matrix for converting from
    // ((mixed_r - mixed_g)/2, (mixed_r + mixed_g)/2, mixed_b) to rgb. This
    // avoids cancellation effects when computing (y+x)^3-(y-x)^3.
    // We compute mixed_rpg - mixed_b because the (1+c)*mixed_rpg - c *
    // mixed_b pattern is repeated frequently in the code below. This allows
    // us to save a multiply per channel, and removes the presence of
    // some constants above 1. Moreover, mixed_rmg - mixed_b is in (-1, 1)
    // range, so the subtraction is safe.
    // All the magic-looking constants here are derived by computing the
    // inverse opsin matrix for the transformation modified as described
    // above.

    // Precomputation common to multiple color values.
    int16x8_t mixed_rpgmb16 = vqsubq_s16(mixed_rpg16, mixed_b16);
    int16x8_t mixed_rpgmb_times_016 = vqrdmulhq_n_s16(mixed_rpgmb16, 5394);
    int16x8_t mixed_rg16 = vqaddq_s16(mixed_rpgmb_times_016, mixed_rpg16);

    // R
    int16x8_t linear_r16 =
        vqaddq_s16(mixed_rg16, vqrdmulhq_n_s16(mixed_rmg16, 21400));

    // G
    int16x8_t linear_g16 =
        vqaddq_s16(mixed_rg16, vqrdmulhq_n_s16(mixed_rmg16, -7857));

    // B
    int16x8_t linear_b16 = vqrdmulhq_n_s16(mixed_rpgmb16, -30996);
    linear_b16 = vqaddq_s16(linear_b16, mixed_b16);
    linear_b16 = vqaddq_s16(linear_b16, vqrdmulhq_n_s16(mixed_rmg16, -6525));

    // Apply SRGB transfer function.
    int16x8_t r = srgb_tf(linear_r16);
    int16x8_t g = srgb_tf(linear_g16);
    int16x8_t b = srgb_tf(linear_b16);

    // Narrow 14-bit fixed point to 8 bits with saturation:
    // (v - v/256 + 32) >> 6 ~= v * 255 / 2^14.
    uint8x8_t r8 =
        vqmovun_s16(vrshrq_n_s16(vsubq_s16(r, vshrq_n_s16(r, 8)), 6));
    uint8x8_t g8 =
        vqmovun_s16(vrshrq_n_s16(vsubq_s16(g, vshrq_n_s16(g, 8)), 6));
    uint8x8_t b8 =
        vqmovun_s16(vrshrq_n_s16(vsubq_s16(b, vshrq_n_s16(b, 8)), 6));

    // Pixels remaining in this iteration; may be < 8 on the last one, in
    // which case the interleaved store goes through a stack buffer.
    size_t n = xsize - x;
    if (is_rgba) {
      // Missing alpha plane means fully opaque.
      float32x4_t a_f32_left =
          row_in_a ? vld1q_f32(row_in_a + x) : vdupq_n_f32(1.0f);
      float32x4_t a_f32_right =
          row_in_a ? vld1q_f32(row_in_a + x + (x + 4 < xsize ? 4 : 0))
                   : vdupq_n_f32(1.0f);
      int16x4_t a16_left = vqmovn_s32(vcvtq_n_s32_f32(a_f32_left, 8));
      int16x4_t a16_right = vqmovn_s32(vcvtq_n_s32_f32(a_f32_right, 8));
      uint8x8_t a8 = vqmovun_s16(vcombine_s16(a16_left, a16_right));
      uint8_t* buf = output + 4 * x;
      uint8x8x4_t data = {r8, g8, b8, a8};
      if (n >= 8) {
        vst4_u8(buf, data);
      } else {
        // Partial tail: store interleaved to a temporary, copy what fits.
        uint8_t tmp[8 * 4];
        vst4_u8(tmp, data);
        memcpy(buf, tmp, n * 4);
      }
    } else {
      uint8_t* buf = output + 3 * x;
      uint8x8x3_t data = {r8, g8, b8};
      if (n >= 8) {
        vst3_u8(buf, data);
      } else {
        // Partial tail: store interleaved to a temporary, copy what fits.
        uint8_t tmp[8 * 3];
        vst3_u8(tmp, data);
        memcpy(buf, tmp, n * 3);
      }
    }
  }
#else
  (void)input;
  (void)output;
  (void)is_rgba;
  (void)xsize;
  JXL_UNREACHABLE("Unreachable");
#endif
}

}  // namespace
// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace jxl
HWY_AFTER_NAMESPACE();

#endif  // LIB_JXL_DEC_XYB_INL_H_