input.cc - libjxl - FORK: libjxl patches used on blog

input.cc (16428B)
      1 // Copyright (c) the JPEG XL Project Authors. All rights reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style
      4 // license that can be found in the LICENSE file.
      5 
      6 #include "lib/jpegli/input.h"
      7 
      8 #undef HWY_TARGET_INCLUDE
      9 #define HWY_TARGET_INCLUDE "lib/jpegli/input.cc"
     10 #include <hwy/foreach_target.h>
     11 #include <hwy/highway.h>
     12 
     13 #include "lib/jpegli/encode_internal.h"
     14 #include "lib/jpegli/error.h"
     15 #include "lib/jxl/base/byte_order.h"
     16 #include "lib/jxl/base/compiler_specific.h"
     17 
     18 HWY_BEFORE_NAMESPACE();
     19 namespace jpegli {
     20 namespace HWY_NAMESPACE {
     21 
// Highway names that are used unqualified in this file.
using hwy::HWY_NAMESPACE::Mul;
using hwy::HWY_NAMESPACE::Rebind;
using hwy::HWY_NAMESPACE::Vec;

// Descriptor types: full-width float and uint32_t vectors, plus uint8_t and
// uint16_t descriptors rebound from D (i.e. with the same lane count as D).
using D = HWY_FULL(float);
using DU = HWY_FULL(uint32_t);
using DU8 = Rebind<uint8_t, D>;
using DU16 = Rebind<uint16_t, D>;

// Tag objects passed to the Highway ops below.
constexpr D d;
constexpr DU du;
constexpr DU8 du8;
constexpr DU16 du16;

// Scale applied to 16-bit samples by the uint16 readers; 65535 / 257 == 255,
// so presumably full-scale 16-bit input maps onto the 8-bit range.
static constexpr double kMul16 = 1.0 / 257.0;
// Scale applied to float samples by the float readers (presumably the input
// is nominally in [0, 1] — confirm against the public API documentation).
static constexpr double kMulFloat = 255.0;
     38 
     39 template <size_t C>
     40 void ReadUint8Row(const uint8_t* row_in, size_t x0, size_t len,
     41                   float* row_out[kMaxComponents]) {
     42   for (size_t x = x0; x < len; ++x) {
     43     for (size_t c = 0; c < C; ++c) {
     44       row_out[c][x] = row_in[C * x + c];
     45     }
     46   }
     47 }
     48 
     49 template <size_t C, bool swap_endianness = false>
     50 void ReadUint16Row(const uint8_t* row_in, size_t x0, size_t len,
     51                    float* row_out[kMaxComponents]) {
     52   const uint16_t* row16 = reinterpret_cast<const uint16_t*>(row_in);
     53   for (size_t x = x0; x < len; ++x) {
     54     for (size_t c = 0; c < C; ++c) {
     55       uint16_t val = row16[C * x + c];
     56       if (swap_endianness) val = JXL_BSWAP16(val);
     57       row_out[c][x] = val * kMul16;
     58     }
     59   }
     60 }
     61 
     62 template <size_t C, bool swap_endianness = false>
     63 void ReadFloatRow(const uint8_t* row_in, size_t x0, size_t len,
     64                   float* row_out[kMaxComponents]) {
     65   const float* rowf = reinterpret_cast<const float*>(row_in);
     66   for (size_t x = x0; x < len; ++x) {
     67     for (size_t c = 0; c < C; ++c) {
     68       float val = rowf[C * x + c];
     69       if (swap_endianness) val = BSwapFloat(val);
     70       row_out[c][x] = val * kMulFloat;
     71     }
     72   }
     73 }
     74 
     75 void ReadUint8RowSingle(const uint8_t* row_in, size_t len,
     76                         float* row_out[kMaxComponents]) {
     77   const size_t N = Lanes(d);
     78   const size_t simd_len = len & (~(N - 1));
     79   float* JXL_RESTRICT const row0 = row_out[0];
     80   for (size_t x = 0; x < simd_len; x += N) {
     81     Store(ConvertTo(d, PromoteTo(du, LoadU(du8, row_in + x))), d, row0 + x);
     82   }
     83   ReadUint8Row<1>(row_in, simd_len, len, row_out);
     84 }
     85 
     86 void ReadUint8RowInterleaved2(const uint8_t* row_in, size_t len,
     87                               float* row_out[kMaxComponents]) {
     88   const size_t N = Lanes(d);
     89   const size_t simd_len = len & (~(N - 1));
     90   float* JXL_RESTRICT const row0 = row_out[0];
     91   float* JXL_RESTRICT const row1 = row_out[1];
     92   Vec<DU8> out0, out1;  // NOLINT
     93   for (size_t x = 0; x < simd_len; x += N) {
     94     LoadInterleaved2(du8, row_in + 2 * x, out0, out1);
     95     Store(ConvertTo(d, PromoteTo(du, out0)), d, row0 + x);
     96     Store(ConvertTo(d, PromoteTo(du, out1)), d, row1 + x);
     97   }
     98   ReadUint8Row<2>(row_in, simd_len, len, row_out);
     99 }
    100 
    101 void ReadUint8RowInterleaved3(const uint8_t* row_in, size_t len,
    102                               float* row_out[kMaxComponents]) {
    103   const size_t N = Lanes(d);
    104   const size_t simd_len = len & (~(N - 1));
    105   float* JXL_RESTRICT const row0 = row_out[0];
    106   float* JXL_RESTRICT const row1 = row_out[1];
    107   float* JXL_RESTRICT const row2 = row_out[2];
    108   Vec<DU8> out0, out1, out2;  // NOLINT
    109   for (size_t x = 0; x < simd_len; x += N) {
    110     LoadInterleaved3(du8, row_in + 3 * x, out0, out1, out2);
    111     Store(ConvertTo(d, PromoteTo(du, out0)), d, row0 + x);
    112     Store(ConvertTo(d, PromoteTo(du, out1)), d, row1 + x);
    113     Store(ConvertTo(d, PromoteTo(du, out2)), d, row2 + x);
    114   }
    115   ReadUint8Row<3>(row_in, simd_len, len, row_out);
    116 }
    117 
    118 void ReadUint8RowInterleaved4(const uint8_t* row_in, size_t len,
    119                               float* row_out[kMaxComponents]) {
    120   const size_t N = Lanes(d);
    121   const size_t simd_len = len & (~(N - 1));
    122   float* JXL_RESTRICT const row0 = row_out[0];
    123   float* JXL_RESTRICT const row1 = row_out[1];
    124   float* JXL_RESTRICT const row2 = row_out[2];
    125   float* JXL_RESTRICT const row3 = row_out[3];
    126   Vec<DU8> out0, out1, out2, out3;  // NOLINT
    127   for (size_t x = 0; x < simd_len; x += N) {
    128     LoadInterleaved4(du8, row_in + 4 * x, out0, out1, out2, out3);
    129     Store(ConvertTo(d, PromoteTo(du, out0)), d, row0 + x);
    130     Store(ConvertTo(d, PromoteTo(du, out1)), d, row1 + x);
    131     Store(ConvertTo(d, PromoteTo(du, out2)), d, row2 + x);
    132     Store(ConvertTo(d, PromoteTo(du, out3)), d, row3 + x);
    133   }
    134   ReadUint8Row<4>(row_in, simd_len, len, row_out);
    135 }
    136 
    137 void ReadUint16RowSingle(const uint8_t* row_in, size_t len,
    138                          float* row_out[kMaxComponents]) {
    139   const size_t N = Lanes(d);
    140   const size_t simd_len = len & (~(N - 1));
    141   const auto mul = Set(d, kMul16);
    142   const uint16_t* JXL_RESTRICT const row =
    143       reinterpret_cast<const uint16_t*>(row_in);
    144   float* JXL_RESTRICT const row0 = row_out[0];
    145   for (size_t x = 0; x < simd_len; x += N) {
    146     Store(Mul(mul, ConvertTo(d, PromoteTo(du, LoadU(du16, row + x)))), d,
    147           row0 + x);
    148   }
    149   ReadUint16Row<1>(row_in, simd_len, len, row_out);
    150 }
    151 
    152 void ReadUint16RowInterleaved2(const uint8_t* row_in, size_t len,
    153                                float* row_out[kMaxComponents]) {
    154   const size_t N = Lanes(d);
    155   const size_t simd_len = len & (~(N - 1));
    156   const auto mul = Set(d, kMul16);
    157   const uint16_t* JXL_RESTRICT const row =
    158       reinterpret_cast<const uint16_t*>(row_in);
    159   float* JXL_RESTRICT const row0 = row_out[0];
    160   float* JXL_RESTRICT const row1 = row_out[1];
    161   Vec<DU16> out0, out1;  // NOLINT
    162   for (size_t x = 0; x < simd_len; x += N) {
    163     LoadInterleaved2(du16, row + 2 * x, out0, out1);
    164     Store(Mul(mul, ConvertTo(d, PromoteTo(du, out0))), d, row0 + x);
    165     Store(Mul(mul, ConvertTo(d, PromoteTo(du, out1))), d, row1 + x);
    166   }
    167   ReadUint16Row<2>(row_in, simd_len, len, row_out);
    168 }
    169 
    170 void ReadUint16RowInterleaved3(const uint8_t* row_in, size_t len,
    171                                float* row_out[kMaxComponents]) {
    172   const size_t N = Lanes(d);
    173   const size_t simd_len = len & (~(N - 1));
    174   const auto mul = Set(d, kMul16);
    175   const uint16_t* JXL_RESTRICT const row =
    176       reinterpret_cast<const uint16_t*>(row_in);
    177   float* JXL_RESTRICT const row0 = row_out[0];
    178   float* JXL_RESTRICT const row1 = row_out[1];
    179   float* JXL_RESTRICT const row2 = row_out[2];
    180   Vec<DU16> out0, out1, out2;  // NOLINT
    181   for (size_t x = 0; x < simd_len; x += N) {
    182     LoadInterleaved3(du16, row + 3 * x, out0, out1, out2);
    183     Store(Mul(mul, ConvertTo(d, PromoteTo(du, out0))), d, row0 + x);
    184     Store(Mul(mul, ConvertTo(d, PromoteTo(du, out1))), d, row1 + x);
    185     Store(Mul(mul, ConvertTo(d, PromoteTo(du, out2))), d, row2 + x);
    186   }
    187   ReadUint16Row<3>(row_in, simd_len, len, row_out);
    188 }
    189 
    190 void ReadUint16RowInterleaved4(const uint8_t* row_in, size_t len,
    191                                float* row_out[kMaxComponents]) {
    192   const size_t N = Lanes(d);
    193   const size_t simd_len = len & (~(N - 1));
    194   const auto mul = Set(d, kMul16);
    195   const uint16_t* JXL_RESTRICT const row =
    196       reinterpret_cast<const uint16_t*>(row_in);
    197   float* JXL_RESTRICT const row0 = row_out[0];
    198   float* JXL_RESTRICT const row1 = row_out[1];
    199   float* JXL_RESTRICT const row2 = row_out[2];
    200   float* JXL_RESTRICT const row3 = row_out[3];
    201   Vec<DU16> out0, out1, out2, out3;  // NOLINT
    202   for (size_t x = 0; x < simd_len; x += N) {
    203     LoadInterleaved4(du16, row + 4 * x, out0, out1, out2, out3);
    204     Store(Mul(mul, ConvertTo(d, PromoteTo(du, out0))), d, row0 + x);
    205     Store(Mul(mul, ConvertTo(d, PromoteTo(du, out1))), d, row1 + x);
    206     Store(Mul(mul, ConvertTo(d, PromoteTo(du, out2))), d, row2 + x);
    207     Store(Mul(mul, ConvertTo(d, PromoteTo(du, out3))), d, row3 + x);
    208   }
    209   ReadUint16Row<4>(row_in, simd_len, len, row_out);
    210 }
    211 
    212 void ReadUint16RowSingleSwap(const uint8_t* row_in, size_t len,
    213                              float* row_out[kMaxComponents]) {
    214   ReadUint16Row<1, true>(row_in, 0, len, row_out);
    215 }
    216 
    217 void ReadUint16RowInterleaved2Swap(const uint8_t* row_in, size_t len,
    218                                    float* row_out[kMaxComponents]) {
    219   ReadUint16Row<2, true>(row_in, 0, len, row_out);
    220 }
    221 
    222 void ReadUint16RowInterleaved3Swap(const uint8_t* row_in, size_t len,
    223                                    float* row_out[kMaxComponents]) {
    224   ReadUint16Row<3, true>(row_in, 0, len, row_out);
    225 }
    226 
    227 void ReadUint16RowInterleaved4Swap(const uint8_t* row_in, size_t len,
    228                                    float* row_out[kMaxComponents]) {
    229   ReadUint16Row<4, true>(row_in, 0, len, row_out);
    230 }
    231 
    232 void ReadFloatRowSingle(const uint8_t* row_in, size_t len,
    233                         float* row_out[kMaxComponents]) {
    234   const size_t N = Lanes(d);
    235   const size_t simd_len = len & (~(N - 1));
    236   const auto mul = Set(d, kMulFloat);
    237   const float* JXL_RESTRICT const row = reinterpret_cast<const float*>(row_in);
    238   float* JXL_RESTRICT const row0 = row_out[0];
    239   for (size_t x = 0; x < simd_len; x += N) {
    240     Store(Mul(mul, LoadU(d, row + x)), d, row0 + x);
    241   }
    242   ReadFloatRow<1>(row_in, simd_len, len, row_out);
    243 }
    244 
    245 void ReadFloatRowInterleaved2(const uint8_t* row_in, size_t len,
    246                               float* row_out[kMaxComponents]) {
    247   const size_t N = Lanes(d);
    248   const size_t simd_len = len & (~(N - 1));
    249   const auto mul = Set(d, kMulFloat);
    250   const float* JXL_RESTRICT const row = reinterpret_cast<const float*>(row_in);
    251   float* JXL_RESTRICT const row0 = row_out[0];
    252   float* JXL_RESTRICT const row1 = row_out[1];
    253   Vec<D> out0, out1;  // NOLINT
    254   for (size_t x = 0; x < simd_len; x += N) {
    255     LoadInterleaved2(d, row + 2 * x, out0, out1);
    256     Store(Mul(mul, out0), d, row0 + x);
    257     Store(Mul(mul, out1), d, row1 + x);
    258   }
    259   ReadFloatRow<2>(row_in, simd_len, len, row_out);
    260 }
    261 
    262 void ReadFloatRowInterleaved3(const uint8_t* row_in, size_t len,
    263                               float* row_out[kMaxComponents]) {
    264   const size_t N = Lanes(d);
    265   const size_t simd_len = len & (~(N - 1));
    266   const auto mul = Set(d, kMulFloat);
    267   const float* JXL_RESTRICT const row = reinterpret_cast<const float*>(row_in);
    268   float* JXL_RESTRICT const row0 = row_out[0];
    269   float* JXL_RESTRICT const row1 = row_out[1];
    270   float* JXL_RESTRICT const row2 = row_out[2];
    271   Vec<D> out0, out1, out2;  // NOLINT
    272   for (size_t x = 0; x < simd_len; x += N) {
    273     LoadInterleaved3(d, row + 3 * x, out0, out1, out2);
    274     Store(Mul(mul, out0), d, row0 + x);
    275     Store(Mul(mul, out1), d, row1 + x);
    276     Store(Mul(mul, out2), d, row2 + x);
    277   }
    278   ReadFloatRow<3>(row_in, simd_len, len, row_out);
    279 }
    280 
    281 void ReadFloatRowInterleaved4(const uint8_t* row_in, size_t len,
    282                               float* row_out[kMaxComponents]) {
    283   const size_t N = Lanes(d);
    284   const size_t simd_len = len & (~(N - 1));
    285   const auto mul = Set(d, kMulFloat);
    286   const float* JXL_RESTRICT const row = reinterpret_cast<const float*>(row_in);
    287   float* JXL_RESTRICT const row0 = row_out[0];
    288   float* JXL_RESTRICT const row1 = row_out[1];
    289   float* JXL_RESTRICT const row2 = row_out[2];
    290   float* JXL_RESTRICT const row3 = row_out[3];
    291   Vec<D> out0, out1, out2, out3;  // NOLINT
    292   for (size_t x = 0; x < simd_len; x += N) {
    293     LoadInterleaved4(d, row + 4 * x, out0, out1, out2, out3);
    294     Store(Mul(mul, out0), d, row0 + x);
    295     Store(Mul(mul, out1), d, row1 + x);
    296     Store(Mul(mul, out2), d, row2 + x);
    297     Store(Mul(mul, out3), d, row3 + x);
    298   }
    299   ReadFloatRow<4>(row_in, simd_len, len, row_out);
    300 }
    301 
    302 void ReadFloatRowSingleSwap(const uint8_t* row_in, size_t len,
    303                             float* row_out[kMaxComponents]) {
    304   ReadFloatRow<1, true>(row_in, 0, len, row_out);
    305 }
    306 
    307 void ReadFloatRowInterleaved2Swap(const uint8_t* row_in, size_t len,
    308                                   float* row_out[kMaxComponents]) {
    309   ReadFloatRow<2, true>(row_in, 0, len, row_out);
    310 }
    311 
    312 void ReadFloatRowInterleaved3Swap(const uint8_t* row_in, size_t len,
    313                                   float* row_out[kMaxComponents]) {
    314   ReadFloatRow<3, true>(row_in, 0, len, row_out);
    315 }
    316 
    317 void ReadFloatRowInterleaved4Swap(const uint8_t* row_in, size_t len,
    318                                   float* row_out[kMaxComponents]) {
    319   ReadFloatRow<4, true>(row_in, 0, len, row_out);
    320 }
    321 
    322 // NOLINTNEXTLINE(google-readability-namespace-comments)
    323 }  // namespace HWY_NAMESPACE
    324 }  // namespace jpegli
    325 HWY_AFTER_NAMESPACE();
    326 
    327 #if HWY_ONCE
    328 namespace jpegli {
    329 
// Export every per-target row reader defined above so that
// HWY_DYNAMIC_DISPATCH(name) in ChooseInputMethod can refer to it.
// 8-bit readers.
HWY_EXPORT(ReadUint8RowSingle);
HWY_EXPORT(ReadUint8RowInterleaved2);
HWY_EXPORT(ReadUint8RowInterleaved3);
HWY_EXPORT(ReadUint8RowInterleaved4);
// 16-bit readers, native byte order.
HWY_EXPORT(ReadUint16RowSingle);
HWY_EXPORT(ReadUint16RowInterleaved2);
HWY_EXPORT(ReadUint16RowInterleaved3);
HWY_EXPORT(ReadUint16RowInterleaved4);
// 16-bit readers, byte-swapped input.
HWY_EXPORT(ReadUint16RowSingleSwap);
HWY_EXPORT(ReadUint16RowInterleaved2Swap);
HWY_EXPORT(ReadUint16RowInterleaved3Swap);
HWY_EXPORT(ReadUint16RowInterleaved4Swap);
// Float readers, native byte order.
HWY_EXPORT(ReadFloatRowSingle);
HWY_EXPORT(ReadFloatRowInterleaved2);
HWY_EXPORT(ReadFloatRowInterleaved3);
HWY_EXPORT(ReadFloatRowInterleaved4);
// Float readers, byte-swapped input.
HWY_EXPORT(ReadFloatRowSingleSwap);
HWY_EXPORT(ReadFloatRowInterleaved2Swap);
HWY_EXPORT(ReadFloatRowInterleaved3Swap);
HWY_EXPORT(ReadFloatRowInterleaved4Swap);
    350 
    351 void ChooseInputMethod(j_compress_ptr cinfo) {
    352   jpeg_comp_master* m = cinfo->master;
    353   bool swap_endianness =
    354       (m->endianness == JPEGLI_LITTLE_ENDIAN && !IsLittleEndian()) ||
    355       (m->endianness == JPEGLI_BIG_ENDIAN && IsLittleEndian());
    356   m->input_method = nullptr;
    357   if (m->data_type == JPEGLI_TYPE_UINT8) {
    358     if (cinfo->raw_data_in || cinfo->input_components == 1) {
    359       m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint8RowSingle);
    360     } else if (cinfo->input_components == 2) {
    361       m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint8RowInterleaved2);
    362     } else if (cinfo->input_components == 3) {
    363       m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint8RowInterleaved3);
    364     } else if (cinfo->input_components == 4) {
    365       m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint8RowInterleaved4);
    366     }
    367   } else if (m->data_type == JPEGLI_TYPE_UINT16 && !swap_endianness) {
    368     if (cinfo->raw_data_in || cinfo->input_components == 1) {
    369       m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowSingle);
    370     } else if (cinfo->input_components == 2) {
    371       m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowInterleaved2);
    372     } else if (cinfo->input_components == 3) {
    373       m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowInterleaved3);
    374     } else if (cinfo->input_components == 4) {
    375       m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowInterleaved4);
    376     }
    377   } else if (m->data_type == JPEGLI_TYPE_UINT16 && swap_endianness) {
    378     if (cinfo->raw_data_in || cinfo->input_components == 1) {
    379       m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowSingleSwap);
    380     } else if (cinfo->input_components == 2) {
    381       m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowInterleaved2Swap);
    382     } else if (cinfo->input_components == 3) {
    383       m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowInterleaved3Swap);
    384     } else if (cinfo->input_components == 4) {
    385       m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowInterleaved4Swap);
    386     }
    387   } else if (m->data_type == JPEGLI_TYPE_FLOAT && !swap_endianness) {
    388     if (cinfo->raw_data_in || cinfo->input_components == 1) {
    389       m->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowSingle);
    390     } else if (cinfo->input_components == 2) {
    391       m->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowInterleaved2);
    392     } else if (cinfo->input_components == 3) {
    393       m->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowInterleaved3);
    394     } else if (cinfo->input_components == 4) {
    395       m->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowInterleaved4);
    396     }
    397   } else if (m->data_type == JPEGLI_TYPE_FLOAT && swap_endianness) {
    398     if (cinfo->raw_data_in || cinfo->input_components == 1) {
    399       m->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowSingleSwap);
    400     } else if (cinfo->input_components == 2) {
    401       m->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowInterleaved2Swap);
    402     } else if (cinfo->input_components == 3) {
    403       m->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowInterleaved3Swap);
    404     } else if (cinfo->input_components == 4) {
    405       m->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowInterleaved4Swap);
    406     }
    407   }
    408   if (m->input_method == nullptr) {
    409     JPEGLI_ERROR("Could not find input method.");
    410   }
    411 }
    412 
    413 }  // namespace jpegli
    414 #endif  // HWY_ONCE
	libjxl FORK: libjxl patches used on blog
	git clone https://git.neptards.moe/blog/libjxl.git
	Log \| Files \| Refs \| Submodules \| README \| LICENSE