libjxl

FORK: libjxl patches used on blog
git clone https://git.neptards.moe/blog/libjxl.git
Log | Files | Refs | Submodules | README | LICENSE

dec_xyb-inl.h (14318B)


      1 // Copyright (c) the JPEG XL Project Authors. All rights reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style
      4 // license that can be found in the LICENSE file.
      5 
      6 // XYB -> linear sRGB helper function.
      7 
      8 #if defined(LIB_JXL_DEC_XYB_INL_H_) == defined(HWY_TARGET_TOGGLE)
      9 #ifdef LIB_JXL_DEC_XYB_INL_H_
     10 #undef LIB_JXL_DEC_XYB_INL_H_
     11 #else
     12 #define LIB_JXL_DEC_XYB_INL_H_
     13 #endif
     14 
     15 #include <hwy/highway.h>
     16 
     17 #include "lib/jxl/dec_xyb.h"
     18 HWY_BEFORE_NAMESPACE();
     19 namespace jxl {
     20 namespace HWY_NAMESPACE {
     21 namespace {
     22 
     23 // These templates are not found via ADL.
     24 using hwy::HWY_NAMESPACE::Add;
     25 using hwy::HWY_NAMESPACE::Broadcast;
     26 using hwy::HWY_NAMESPACE::Mul;
     27 using hwy::HWY_NAMESPACE::MulAdd;
     28 using hwy::HWY_NAMESPACE::Sub;
     29 
     30 // Inverts the pixel-wise RGB->XYB conversion in OpsinDynamicsImage() (including
     31 // the gamma mixing and simple gamma). Avoids clamping to [0, 1] - out of (sRGB)
     32 // gamut values may be in-gamut after transforming to a wider space.
     33 // "inverse_matrix" points to 9 broadcasted vectors, which are the 3x3 entries
     34 // of the (row-major) opsin absorbance matrix inverse. Pre-multiplying its
     35 // entries by c is equivalent to multiplying linear_* by c afterwards.
     36 template <class D, class V>
     37 HWY_INLINE HWY_MAYBE_UNUSED void XybToRgb(D d, const V opsin_x, const V opsin_y,
     38                                           const V opsin_b,
     39                                           const OpsinParams& opsin_params,
     40                                           V* const HWY_RESTRICT linear_r,
     41                                           V* const HWY_RESTRICT linear_g,
     42                                           V* const HWY_RESTRICT linear_b) {
     43 #if HWY_TARGET == HWY_SCALAR
     44   const auto neg_bias_r = Set(d, opsin_params.opsin_biases[0]);
     45   const auto neg_bias_g = Set(d, opsin_params.opsin_biases[1]);
     46   const auto neg_bias_b = Set(d, opsin_params.opsin_biases[2]);
     47 #else
     48   const auto neg_bias_rgb = LoadDup128(d, opsin_params.opsin_biases);
     49   const auto neg_bias_r = Broadcast<0>(neg_bias_rgb);
     50   const auto neg_bias_g = Broadcast<1>(neg_bias_rgb);
     51   const auto neg_bias_b = Broadcast<2>(neg_bias_rgb);
     52 #endif
     53 
     54   // Color space: XYB -> RGB
     55   auto gamma_r = Add(opsin_y, opsin_x);
     56   auto gamma_g = Sub(opsin_y, opsin_x);
     57   auto gamma_b = opsin_b;
     58 
     59   gamma_r = Sub(gamma_r, Set(d, opsin_params.opsin_biases_cbrt[0]));
     60   gamma_g = Sub(gamma_g, Set(d, opsin_params.opsin_biases_cbrt[1]));
     61   gamma_b = Sub(gamma_b, Set(d, opsin_params.opsin_biases_cbrt[2]));
     62 
     63   // Undo gamma compression: linear = gamma^3 for efficiency.
     64   const auto gamma_r2 = Mul(gamma_r, gamma_r);
     65   const auto gamma_g2 = Mul(gamma_g, gamma_g);
     66   const auto gamma_b2 = Mul(gamma_b, gamma_b);
     67   const auto mixed_r = MulAdd(gamma_r2, gamma_r, neg_bias_r);
     68   const auto mixed_g = MulAdd(gamma_g2, gamma_g, neg_bias_g);
     69   const auto mixed_b = MulAdd(gamma_b2, gamma_b, neg_bias_b);
     70 
     71   const float* HWY_RESTRICT inverse_matrix = opsin_params.inverse_opsin_matrix;
     72 
     73   // Unmix (multiply by 3x3 inverse_matrix)
     74   // TODO(eustas): ref would be more readable than pointer
     75   *linear_r = Mul(LoadDup128(d, &inverse_matrix[0 * 4]), mixed_r);
     76   *linear_g = Mul(LoadDup128(d, &inverse_matrix[3 * 4]), mixed_r);
     77   *linear_b = Mul(LoadDup128(d, &inverse_matrix[6 * 4]), mixed_r);
     78   *linear_r = MulAdd(LoadDup128(d, &inverse_matrix[1 * 4]), mixed_g, *linear_r);
     79   *linear_g = MulAdd(LoadDup128(d, &inverse_matrix[4 * 4]), mixed_g, *linear_g);
     80   *linear_b = MulAdd(LoadDup128(d, &inverse_matrix[7 * 4]), mixed_g, *linear_b);
     81   *linear_r = MulAdd(LoadDup128(d, &inverse_matrix[2 * 4]), mixed_b, *linear_r);
     82   *linear_g = MulAdd(LoadDup128(d, &inverse_matrix[5 * 4]), mixed_b, *linear_g);
     83   *linear_b = MulAdd(LoadDup128(d, &inverse_matrix[8 * 4]), mixed_b, *linear_b);
     84 }
     85 
     86 inline HWY_MAYBE_UNUSED bool HasFastXYBTosRGB8() {
     87 #if HWY_TARGET == HWY_NEON
     88   return true;
     89 #else
     90   return false;
     91 #endif
     92 }
     93 
// Converts one row of XYB samples (input[0]=x, input[1]=y, input[2]=b;
// input[3] is an optional alpha row, may be null) into interleaved 8-bit
// sRGB, writing xsize pixels to `output` (3 bytes/pixel, or 4 when is_rgba).
// NEON-only: on other targets this is unreachable — callers are expected to
// check HasFastXYBTosRGB8() first.
inline HWY_MAYBE_UNUSED void FastXYBTosRGB8(const float* input[4],
                                            uint8_t* output, bool is_rgba,
                                            size_t xsize) {
  // This function is very NEON-specific. As such, it uses intrinsics directly.
#if HWY_TARGET == HWY_NEON
  // WARNING: doing fixed point arithmetic correctly is very complicated.
  // Changes to this function should be thoroughly tested.

  // Note that the input is assumed to have 13 bits of mantissa, and the output
  // will have 14 bits.
  // srgb_tf: fixed-point sRGB transfer function applied to 8 lanes at once.
  auto srgb_tf = [&](int16x8_t v16) {
    int16x8_t clz = vclzq_s16(v16);
    // Convert to [0.25, 0.5) range.
    int16x8_t v025_05_16 = vqshlq_s16(v16, vqsubq_s16(clz, vdupq_n_s16(2)));

    // third degree polynomial approximation between 0.25 and 0.5
    // of 1.055/2^(7/2.4) * x^(1/2.4) / 32.
    // poly ~ ((0.95x-1.75)*x+1.72)*x+0.29
    // We actually compute ~ ((0.47x-0.87)*x+0.86)*(2x)+0.29 as 1.75 and 1.72
    // overflow our fixed point representation.

    int16x8_t twov = vqaddq_s16(v025_05_16, v025_05_16);

    // 0.47 * x
    int16x8_t step1 = vqrdmulhq_n_s16(v025_05_16, 15706);
    // - 0.87
    int16x8_t step2 = vsubq_s16(step1, vdupq_n_s16(28546));
    // * x
    int16x8_t step3 = vqrdmulhq_s16(step2, v025_05_16);
    // + 0.86
    int16x8_t step4 = vaddq_s16(step3, vdupq_n_s16(28302));
    // * 2x
    int16x8_t step5 = vqrdmulhq_s16(step4, twov);
    // + 0.29
    int16x8_t mul16 = vaddq_s16(step5, vdupq_n_s16(9485));

    int16x8_t exp16 = vsubq_s16(vdupq_n_s16(11), clz);
    // Compute 2**(1/2.4*exp16)/32. Values of exp16 that would overflow are
    // capped to 1.
    // Generated with the following Python script:
    // a = []
    // b = []
    //
    // for i in range(0, 16):
    //   v = 2**(5/12.*i)
    //   v /= 16
    //   v *= 256 * 128
    //   v = int(v)
    //   a.append(v // 256)
    //   b.append(v % 256)
    //
    // print(", ".join("0x%02x" % x for x in a))
    //
    // print(", ".join("0x%02x" % x for x in b))

    HWY_ALIGN constexpr uint8_t k2to512powersm1div32_high[16] = {
        0x08, 0x0a, 0x0e, 0x13, 0x19, 0x21, 0x2d, 0x3c,
        0x50, 0x6b, 0x8f, 0x8f, 0x8f, 0x8f, 0x8f, 0x8f,
    };
    HWY_ALIGN constexpr uint8_t k2to512powersm1div32_low[16] = {
        0x00, 0xad, 0x41, 0x06, 0x65, 0xe7, 0x41, 0x68,
        0xa2, 0xa2, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    };
    // Using the highway implementation here since vqtbl1q is aarch64-only.
    using hwy::HWY_NAMESPACE::Vec128;
    uint8x16_t pow_low =
        TableLookupBytes(
            Vec128<uint8_t, 16>(vld1q_u8(k2to512powersm1div32_low)),
            Vec128<uint8_t, 16>(vreinterpretq_u8_s16(exp16)))
            .raw;
    uint8x16_t pow_high =
        TableLookupBytes(
            Vec128<uint8_t, 16>(vld1q_u8(k2to512powersm1div32_high)),
            Vec128<uint8_t, 16>(vreinterpretq_u8_s16(exp16)))
            .raw;
    // Shift-left-insert recombines the looked-up low/high bytes into 16-bit
    // power-of-two values.
    int16x8_t pow16 = vreinterpretq_s16_u16(vsliq_n_u16(
        vreinterpretq_u16_u8(pow_low), vreinterpretq_u16_u8(pow_high), 8));

    // approximation of v * 12.92, divided by 2
    // Note that our input is using 13 mantissa bits instead of 15.
    int16x8_t v16_linear = vrshrq_n_s16(vmulq_n_s16(v16, 826), 5);
    // 1.055*pow(v, 1/2.4) - 0.055, divided by 2
    auto v16_pow = vsubq_s16(vqrdmulhq_s16(mul16, pow16), vdupq_n_s16(901));
    // > 0.0031308f (note that v16 has 13 mantissa bits)
    // Select per lane between the linear segment and the power segment.
    return vbslq_s16(vcgeq_s16(v16, vdupq_n_s16(26)), v16_pow, v16_linear);
  };

  const float* JXL_RESTRICT row_in_x = input[0];
  const float* JXL_RESTRICT row_in_y = input[1];
  const float* JXL_RESTRICT row_in_b = input[2];
  const float* JXL_RESTRICT row_in_a = input[3];
  for (size_t x = 0; x < xsize; x += 8) {
    // Normal ranges for xyb for in-gamut sRGB colors:
    // x: -0.015386 0.028100
    // y: 0.000000 0.845308
    // b: 0.000000 0.845308

    // We actually want x * 8 to have some extra precision.
    // TODO(veluca): consider different approaches here, like vld1q_f32_x2.
    // The `x + (x + 4 < xsize ? 4 : 0)` pattern below re-loads the first
    // 4 floats when fewer than 8 pixels remain, avoiding an out-of-bounds
    // read; the duplicate lanes are discarded by the tail memcpy below.
    float32x4_t opsin_x_left = vld1q_f32(row_in_x + x);
    int16x4_t opsin_x16_times8_left =
        vqmovn_s32(vcvtq_n_s32_f32(opsin_x_left, 18));
    float32x4_t opsin_x_right =
        vld1q_f32(row_in_x + x + (x + 4 < xsize ? 4 : 0));
    int16x4_t opsin_x16_times8_right =
        vqmovn_s32(vcvtq_n_s32_f32(opsin_x_right, 18));
    int16x8_t opsin_x16_times8 =
        vcombine_s16(opsin_x16_times8_left, opsin_x16_times8_right);

    float32x4_t opsin_y_left = vld1q_f32(row_in_y + x);
    int16x4_t opsin_y16_left = vqmovn_s32(vcvtq_n_s32_f32(opsin_y_left, 15));
    float32x4_t opsin_y_right =
        vld1q_f32(row_in_y + x + (x + 4 < xsize ? 4 : 0));
    int16x4_t opsin_y16_right = vqmovn_s32(vcvtq_n_s32_f32(opsin_y_right, 15));
    int16x8_t opsin_y16 = vcombine_s16(opsin_y16_left, opsin_y16_right);

    float32x4_t opsin_b_left = vld1q_f32(row_in_b + x);
    int16x4_t opsin_b16_left = vqmovn_s32(vcvtq_n_s32_f32(opsin_b_left, 15));
    float32x4_t opsin_b_right =
        vld1q_f32(row_in_b + x + (x + 4 < xsize ? 4 : 0));
    int16x4_t opsin_b16_right = vqmovn_s32(vcvtq_n_s32_f32(opsin_b_right, 15));
    int16x8_t opsin_b16 = vcombine_s16(opsin_b16_left, opsin_b16_right);

    int16x8_t neg_bias16 = vdupq_n_s16(-124);        // -0.0037930732552754493
    int16x8_t neg_bias_cbrt16 = vdupq_n_s16(-5110);  // -0.155954201
    int16x8_t neg_bias_half16 = vdupq_n_s16(-62);

    // Color space: XYB -> RGB
    // Compute ((y+x-bias_cbrt)^3-(y-x-bias_cbrt)^3)/2,
    // ((y+x-bias_cbrt)^3+(y-x-bias_cbrt)^3)/2+bias, (b-bias_cbrt)^3+bias.
    // Note that ignoring x2 in the formulas below (as x << y) results in
    // errors of at least 3 in the final sRGB values.
    int16x8_t opsin_yp16 = vqsubq_s16(opsin_y16, neg_bias_cbrt16);
    int16x8_t ysq16 = vqrdmulhq_s16(opsin_yp16, opsin_yp16);
    int16x8_t twentyfourx16 = vmulq_n_s16(opsin_x16_times8, 3);
    int16x8_t twentyfourxy16 = vqrdmulhq_s16(opsin_yp16, twentyfourx16);
    int16x8_t threexsq16 =
        vrshrq_n_s16(vqrdmulhq_s16(opsin_x16_times8, twentyfourx16), 6);

    // We can ignore x^3 here. Note that this is multiplied by 8.
    int16x8_t mixed_rmg16 = vqrdmulhq_s16(twentyfourxy16, opsin_yp16);

    int16x8_t mixed_rpg_sos_half = vhaddq_s16(ysq16, threexsq16);
    int16x8_t mixed_rpg16 = vhaddq_s16(
        vqrdmulhq_s16(opsin_yp16, mixed_rpg_sos_half), neg_bias_half16);

    int16x8_t gamma_b16 = vqsubq_s16(opsin_b16, neg_bias_cbrt16);
    int16x8_t gamma_bsq16 = vqrdmulhq_s16(gamma_b16, gamma_b16);
    int16x8_t gamma_bcb16 = vqrdmulhq_s16(gamma_bsq16, gamma_b16);
    int16x8_t mixed_b16 = vqaddq_s16(gamma_bcb16, neg_bias16);
    // mixed_rpg and mixed_b are in 0-1 range.
    // mixed_rmg has a smaller range (-0.035 to 0.035 for valid sRGB). Note
    // that at this point it is already multiplied by 8.

    // We multiply all the mixed values by 1/4 (i.e. shift them to 13-bit
    // fixed point) to ensure intermediate quantities are in range. Note that
    // r-g is not shifted, and was x8 before here; this corresponds to a x32
    // overall multiplicative factor and ensures that all the matrix constants
    // are in 0-1 range.
    // Similarly, mixed_rpg16 is already multiplied by 1/4 because of the two
    // vhadd + using neg_bias_half.
    mixed_b16 = vshrq_n_s16(mixed_b16, 2);

    // Unmix (multiply by 3x3 inverse_matrix)
    // For increased precision, we use a matrix for converting from
    // ((mixed_r - mixed_g)/2, (mixed_r + mixed_g)/2, mixed_b) to rgb. This
    // avoids cancellation effects when computing (y+x)^3-(y-x)^3.
    // We compute mixed_rpg - mixed_b because the (1+c)*mixed_rpg - c *
    // mixed_b pattern is repeated frequently in the code below. This allows
    // us to save a multiply per channel, and removes the presence of
    // some constants above 1. Moreover, mixed_rmg - mixed_b is in (-1, 1)
    // range, so the subtraction is safe.
    // All the magic-looking constants here are derived by computing the
    // inverse opsin matrix for the transformation modified as described
    // above.

    // Precomputation common to multiple color values.
    int16x8_t mixed_rpgmb16 = vqsubq_s16(mixed_rpg16, mixed_b16);
    int16x8_t mixed_rpgmb_times_016 = vqrdmulhq_n_s16(mixed_rpgmb16, 5394);
    int16x8_t mixed_rg16 = vqaddq_s16(mixed_rpgmb_times_016, mixed_rpg16);

    // R
    int16x8_t linear_r16 =
        vqaddq_s16(mixed_rg16, vqrdmulhq_n_s16(mixed_rmg16, 21400));

    // G
    int16x8_t linear_g16 =
        vqaddq_s16(mixed_rg16, vqrdmulhq_n_s16(mixed_rmg16, -7857));

    // B
    int16x8_t linear_b16 = vqrdmulhq_n_s16(mixed_rpgmb16, -30996);
    linear_b16 = vqaddq_s16(linear_b16, mixed_b16);
    linear_b16 = vqaddq_s16(linear_b16, vqrdmulhq_n_s16(mixed_rmg16, -6525));

    // Apply SRGB transfer function.
    int16x8_t r = srgb_tf(linear_r16);
    int16x8_t g = srgb_tf(linear_g16);
    int16x8_t b = srgb_tf(linear_b16);

    // Narrow the 14-bit transfer-function output to 8 bits: scale by
    // 255/256 (v - (v >> 8)), then rounding-shift right by 6 and saturate
    // to unsigned bytes.
    uint8x8_t r8 =
        vqmovun_s16(vrshrq_n_s16(vsubq_s16(r, vshrq_n_s16(r, 8)), 6));
    uint8x8_t g8 =
        vqmovun_s16(vrshrq_n_s16(vsubq_s16(g, vshrq_n_s16(g, 8)), 6));
    uint8x8_t b8 =
        vqmovun_s16(vrshrq_n_s16(vsubq_s16(b, vshrq_n_s16(b, 8)), 6));

    size_t n = xsize - x;
    if (is_rgba) {
      // Missing alpha row means fully opaque (1.0f).
      float32x4_t a_f32_left =
          row_in_a ? vld1q_f32(row_in_a + x) : vdupq_n_f32(1.0f);
      float32x4_t a_f32_right =
          row_in_a ? vld1q_f32(row_in_a + x + (x + 4 < xsize ? 4 : 0))
                   : vdupq_n_f32(1.0f);
      int16x4_t a16_left = vqmovn_s32(vcvtq_n_s32_f32(a_f32_left, 8));
      int16x4_t a16_right = vqmovn_s32(vcvtq_n_s32_f32(a_f32_right, 8));
      uint8x8_t a8 = vqmovun_s16(vcombine_s16(a16_left, a16_right));
      uint8_t* buf = output + 4 * x;
      uint8x8x4_t data = {r8, g8, b8, a8};
      if (n >= 8) {
        vst4_u8(buf, data);
      } else {
        // Partial tail: store the full interleaved vector to a scratch
        // buffer and copy only the n remaining pixels.
        uint8_t tmp[8 * 4];
        vst4_u8(tmp, data);
        memcpy(buf, tmp, n * 4);
      }
    } else {
      uint8_t* buf = output + 3 * x;
      uint8x8x3_t data = {r8, g8, b8};
      if (n >= 8) {
        vst3_u8(buf, data);
      } else {
        // Partial tail: same scratch-buffer trick as the RGBA path.
        uint8_t tmp[8 * 3];
        vst3_u8(tmp, data);
        memcpy(buf, tmp, n * 3);
      }
    }
  }
#else
  // Non-NEON targets: HasFastXYBTosRGB8() returns false, so this path must
  // never be reached.
  (void)input;
  (void)output;
  (void)is_rgba;
  (void)xsize;
  JXL_UNREACHABLE("Unreachable");
#endif
}
    339 
    340 }  // namespace
    341 // NOLINTNEXTLINE(google-readability-namespace-comments)
    342 }  // namespace HWY_NAMESPACE
    343 }  // namespace jxl
    344 HWY_AFTER_NAMESPACE();
    345 
    346 #endif  // LIB_JXL_DEC_XYB_INL_H_