input.cc (16428B)
1 // Copyright (c) the JPEG XL Project Authors. All rights reserved. 2 // 3 // Use of this source code is governed by a BSD-style 4 // license that can be found in the LICENSE file. 5 6 #include "lib/jpegli/input.h" 7 8 #undef HWY_TARGET_INCLUDE 9 #define HWY_TARGET_INCLUDE "lib/jpegli/input.cc" 10 #include <hwy/foreach_target.h> 11 #include <hwy/highway.h> 12 13 #include "lib/jpegli/encode_internal.h" 14 #include "lib/jpegli/error.h" 15 #include "lib/jxl/base/byte_order.h" 16 #include "lib/jxl/base/compiler_specific.h" 17 18 HWY_BEFORE_NAMESPACE(); 19 namespace jpegli { 20 namespace HWY_NAMESPACE { 21 22 using hwy::HWY_NAMESPACE::Mul; 23 using hwy::HWY_NAMESPACE::Rebind; 24 using hwy::HWY_NAMESPACE::Vec; 25 26 using D = HWY_FULL(float); 27 using DU = HWY_FULL(uint32_t); 28 using DU8 = Rebind<uint8_t, D>; 29 using DU16 = Rebind<uint16_t, D>; 30 31 constexpr D d; 32 constexpr DU du; 33 constexpr DU8 du8; 34 constexpr DU16 du16; 35 36 static constexpr double kMul16 = 1.0 / 257.0; 37 static constexpr double kMulFloat = 255.0; 38 39 template <size_t C> 40 void ReadUint8Row(const uint8_t* row_in, size_t x0, size_t len, 41 float* row_out[kMaxComponents]) { 42 for (size_t x = x0; x < len; ++x) { 43 for (size_t c = 0; c < C; ++c) { 44 row_out[c][x] = row_in[C * x + c]; 45 } 46 } 47 } 48 49 template <size_t C, bool swap_endianness = false> 50 void ReadUint16Row(const uint8_t* row_in, size_t x0, size_t len, 51 float* row_out[kMaxComponents]) { 52 const uint16_t* row16 = reinterpret_cast<const uint16_t*>(row_in); 53 for (size_t x = x0; x < len; ++x) { 54 for (size_t c = 0; c < C; ++c) { 55 uint16_t val = row16[C * x + c]; 56 if (swap_endianness) val = JXL_BSWAP16(val); 57 row_out[c][x] = val * kMul16; 58 } 59 } 60 } 61 62 template <size_t C, bool swap_endianness = false> 63 void ReadFloatRow(const uint8_t* row_in, size_t x0, size_t len, 64 float* row_out[kMaxComponents]) { 65 const float* rowf = reinterpret_cast<const float*>(row_in); 66 for (size_t x = x0; x < len; ++x) { 67 for (size_t c = 0; c < C; ++c) { 68 float val = rowf[C * x + c]; 69 if (swap_endianness) val = BSwapFloat(val); 70 row_out[c][x] = val * kMulFloat; 71 } 72 } 73 } 74 75 void ReadUint8RowSingle(const uint8_t* row_in, size_t len, 76 float* row_out[kMaxComponents]) { 77 const size_t N = Lanes(d); 78 const size_t simd_len = len & (~(N - 1)); 79 float* JXL_RESTRICT const row0 = row_out[0]; 80 for (size_t x = 0; x < simd_len; x += N) { 81 Store(ConvertTo(d, PromoteTo(du, LoadU(du8, row_in + x))), d, row0 + x); 82 } 83 ReadUint8Row<1>(row_in, simd_len, len, row_out); 84 } 85 86 void ReadUint8RowInterleaved2(const uint8_t* row_in, size_t len, 87 float* row_out[kMaxComponents]) { 88 const size_t N = Lanes(d); 89 const size_t simd_len = len & (~(N - 1)); 90 float* JXL_RESTRICT const row0 = row_out[0]; 91 float* JXL_RESTRICT const row1 = row_out[1]; 92 Vec<DU8> out0, out1; // NOLINT 93 for (size_t x = 0; x < simd_len; x += N) { 94 LoadInterleaved2(du8, row_in + 2 * x, out0, out1); 95 Store(ConvertTo(d, PromoteTo(du, out0)), d, row0 + x); 96 Store(ConvertTo(d, PromoteTo(du, out1)), d, row1 + x); 97 } 98 ReadUint8Row<2>(row_in, simd_len, len, row_out); 99 } 100 101 void ReadUint8RowInterleaved3(const uint8_t* row_in, size_t len, 102 float* row_out[kMaxComponents]) { 103 const size_t N = Lanes(d); 104 const size_t simd_len = len & (~(N - 1)); 105 float* JXL_RESTRICT const row0 = row_out[0]; 106 float* JXL_RESTRICT const row1 = row_out[1]; 107 float* JXL_RESTRICT const row2 = row_out[2]; 108 Vec<DU8> out0, out1, out2; // NOLINT 109 for (size_t x = 0; x < simd_len; x += N) { 110 LoadInterleaved3(du8, row_in + 3 * x, out0, out1, out2); 111 Store(ConvertTo(d, PromoteTo(du, out0)), d, row0 + x); 112 Store(ConvertTo(d, PromoteTo(du, out1)), d, row1 + x); 113 Store(ConvertTo(d, PromoteTo(du, out2)), d, row2 + x); 114 } 115 ReadUint8Row<3>(row_in, simd_len, len, row_out); 116 } 117 118 void ReadUint8RowInterleaved4(const uint8_t* row_in, size_t len, 119 float* row_out[kMaxComponents]) { 120 const size_t N = Lanes(d); 121 const size_t simd_len = len & (~(N - 1)); 122 float* JXL_RESTRICT const row0 = row_out[0]; 123 float* JXL_RESTRICT const row1 = row_out[1]; 124 float* JXL_RESTRICT const row2 = row_out[2]; 125 float* JXL_RESTRICT const row3 = row_out[3]; 126 Vec<DU8> out0, out1, out2, out3; // NOLINT 127 for (size_t x = 0; x < simd_len; x += N) { 128 LoadInterleaved4(du8, row_in + 4 * x, out0, out1, out2, out3); 129 Store(ConvertTo(d, PromoteTo(du, out0)), d, row0 + x); 130 Store(ConvertTo(d, PromoteTo(du, out1)), d, row1 + x); 131 Store(ConvertTo(d, PromoteTo(du, out2)), d, row2 + x); 132 Store(ConvertTo(d, PromoteTo(du, out3)), d, row3 + x); 133 } 134 ReadUint8Row<4>(row_in, simd_len, len, row_out); 135 } 136 137 void ReadUint16RowSingle(const uint8_t* row_in, size_t len, 138 float* row_out[kMaxComponents]) { 139 const size_t N = Lanes(d); 140 const size_t simd_len = len & (~(N - 1)); 141 const auto mul = Set(d, kMul16); 142 const uint16_t* JXL_RESTRICT const row = 143 reinterpret_cast<const uint16_t*>(row_in); 144 float* JXL_RESTRICT const row0 = row_out[0]; 145 for (size_t x = 0; x < simd_len; x += N) { 146 Store(Mul(mul, ConvertTo(d, PromoteTo(du, LoadU(du16, row + x)))), d, 147 row0 + x); 148 } 149 ReadUint16Row<1>(row_in, simd_len, len, row_out); 150 } 151 152 void ReadUint16RowInterleaved2(const uint8_t* row_in, size_t len, 153 float* row_out[kMaxComponents]) { 154 const size_t N = Lanes(d); 155 const size_t simd_len = len & (~(N - 1)); 156 const auto mul = Set(d, kMul16); 157 const uint16_t* JXL_RESTRICT const row = 158 reinterpret_cast<const uint16_t*>(row_in); 159 float* JXL_RESTRICT const row0 = row_out[0]; 160 float* JXL_RESTRICT const row1 = row_out[1]; 161 Vec<DU16> out0, out1; // NOLINT 162 for (size_t x = 0; x < simd_len; x += N) { 163 LoadInterleaved2(du16, row + 2 * x, out0, out1); 164 Store(Mul(mul, ConvertTo(d, PromoteTo(du, out0))), d, row0 + x); 165 Store(Mul(mul, ConvertTo(d, PromoteTo(du, out1))), d, row1 + x); 166 } 167 ReadUint16Row<2>(row_in, simd_len, len, row_out); 168 } 169 170 void ReadUint16RowInterleaved3(const uint8_t* row_in, size_t len, 171 float* row_out[kMaxComponents]) { 172 const size_t N = Lanes(d); 173 const size_t simd_len = len & (~(N - 1)); 174 const auto mul = Set(d, kMul16); 175 const uint16_t* JXL_RESTRICT const row = 176 reinterpret_cast<const uint16_t*>(row_in); 177 float* JXL_RESTRICT const row0 = row_out[0]; 178 float* JXL_RESTRICT const row1 = row_out[1]; 179 float* JXL_RESTRICT const row2 = row_out[2]; 180 Vec<DU16> out0, out1, out2; // NOLINT 181 for (size_t x = 0; x < simd_len; x += N) { 182 LoadInterleaved3(du16, row + 3 * x, out0, out1, out2); 183 Store(Mul(mul, ConvertTo(d, PromoteTo(du, out0))), d, row0 + x); 184 Store(Mul(mul, ConvertTo(d, PromoteTo(du, out1))), d, row1 + x); 185 Store(Mul(mul, ConvertTo(d, PromoteTo(du, out2))), d, row2 + x); 186 } 187 ReadUint16Row<3>(row_in, simd_len, len, row_out); 188 } 189 190 void ReadUint16RowInterleaved4(const uint8_t* row_in, size_t len, 191 float* row_out[kMaxComponents]) { 192 const size_t N = Lanes(d); 193 const size_t simd_len = len & (~(N - 1)); 194 const auto mul = Set(d, kMul16); 195 const uint16_t* JXL_RESTRICT const row = 196 reinterpret_cast<const uint16_t*>(row_in); 197 float* JXL_RESTRICT const row0 = row_out[0]; 198 float* JXL_RESTRICT const row1 = row_out[1]; 199 float* JXL_RESTRICT const row2 = row_out[2]; 200 float* JXL_RESTRICT const row3 = row_out[3]; 201 Vec<DU16> out0, out1, out2, out3; // NOLINT 202 for (size_t x = 0; x < simd_len; x += N) { 203 LoadInterleaved4(du16, row + 4 * x, out0, out1, out2, out3); 204 Store(Mul(mul, ConvertTo(d, PromoteTo(du, out0))), d, row0 + x); 205 Store(Mul(mul, ConvertTo(d, PromoteTo(du, out1))), d, row1 + x); 206 Store(Mul(mul, ConvertTo(d, PromoteTo(du, out2))), d, row2 + x); 207 Store(Mul(mul, ConvertTo(d, PromoteTo(du, out3))), d, row3 + x); 208 } 209 ReadUint16Row<4>(row_in, simd_len, len, row_out); 210 } 211 212 void ReadUint16RowSingleSwap(const uint8_t* row_in, size_t len, 213 float* row_out[kMaxComponents]) { 214 ReadUint16Row<1, true>(row_in, 0, len, row_out); 215 } 216 217 void ReadUint16RowInterleaved2Swap(const uint8_t* row_in, size_t len, 218 float* row_out[kMaxComponents]) { 219 ReadUint16Row<2, true>(row_in, 0, len, row_out); 220 } 221 222 void ReadUint16RowInterleaved3Swap(const uint8_t* row_in, size_t len, 223 float* row_out[kMaxComponents]) { 224 ReadUint16Row<3, true>(row_in, 0, len, row_out); 225 } 226 227 void ReadUint16RowInterleaved4Swap(const uint8_t* row_in, size_t len, 228 float* row_out[kMaxComponents]) { 229 ReadUint16Row<4, true>(row_in, 0, len, row_out); 230 } 231 232 void ReadFloatRowSingle(const uint8_t* row_in, size_t len, 233 float* row_out[kMaxComponents]) { 234 const size_t N = Lanes(d); 235 const size_t simd_len = len & (~(N - 1)); 236 const auto mul = Set(d, kMulFloat); 237 const float* JXL_RESTRICT const row = reinterpret_cast<const float*>(row_in); 238 float* JXL_RESTRICT const row0 = row_out[0]; 239 for (size_t x = 0; x < simd_len; x += N) { 240 Store(Mul(mul, LoadU(d, row + x)), d, row0 + x); 241 } 242 ReadFloatRow<1>(row_in, simd_len, len, row_out); 243 } 244 245 void ReadFloatRowInterleaved2(const uint8_t* row_in, size_t len, 246 float* row_out[kMaxComponents]) { 247 const size_t N = Lanes(d); 248 const size_t simd_len = len & (~(N - 1)); 249 const auto mul = Set(d, kMulFloat); 250 const float* JXL_RESTRICT const row = reinterpret_cast<const float*>(row_in); 251 float* JXL_RESTRICT const row0 = row_out[0]; 252 float* JXL_RESTRICT const row1 = row_out[1]; 253 Vec<D> out0, out1; // NOLINT 254 for (size_t x = 0; x < simd_len; x += N) { 255 LoadInterleaved2(d, row + 2 * x, out0, out1); 256 Store(Mul(mul, out0), d, row0 + x); 257 Store(Mul(mul, out1), d, row1 + x); 258 } 259 ReadFloatRow<2>(row_in, simd_len, len, row_out); 260 } 261 262 void ReadFloatRowInterleaved3(const uint8_t* row_in, size_t len, 263 float* row_out[kMaxComponents]) { 264 const size_t N = Lanes(d); 265 const size_t simd_len = len & (~(N - 1)); 266 const auto mul = Set(d, kMulFloat); 267 const float* JXL_RESTRICT const row = reinterpret_cast<const float*>(row_in); 268 float* JXL_RESTRICT const row0 = row_out[0]; 269 float* JXL_RESTRICT const row1 = row_out[1]; 270 float* JXL_RESTRICT const row2 = row_out[2]; 271 Vec<D> out0, out1, out2; // NOLINT 272 for (size_t x = 0; x < simd_len; x += N) { 273 LoadInterleaved3(d, row + 3 * x, out0, out1, out2); 274 Store(Mul(mul, out0), d, row0 + x); 275 Store(Mul(mul, out1), d, row1 + x); 276 Store(Mul(mul, out2), d, row2 + x); 277 } 278 ReadFloatRow<3>(row_in, simd_len, len, row_out); 279 } 280 281 void ReadFloatRowInterleaved4(const uint8_t* row_in, size_t len, 282 float* row_out[kMaxComponents]) { 283 const size_t N = Lanes(d); 284 const size_t simd_len = len & (~(N - 1)); 285 const auto mul = Set(d, kMulFloat); 286 const float* JXL_RESTRICT const row = reinterpret_cast<const float*>(row_in); 287 float* JXL_RESTRICT const row0 = row_out[0]; 288 float* JXL_RESTRICT const row1 = row_out[1]; 289 float* JXL_RESTRICT const row2 = row_out[2]; 290 float* JXL_RESTRICT const row3 = row_out[3]; 291 Vec<D> out0, out1, out2, out3; // NOLINT 292 for (size_t x = 0; x < simd_len; x += N) { 293 LoadInterleaved4(d, row + 4 * x, out0, out1, out2, out3); 294 Store(Mul(mul, out0), d, row0 + x); 295 Store(Mul(mul, out1), d, row1 + x); 296 Store(Mul(mul, out2), d, row2 + x); 297 Store(Mul(mul, out3), d, row3 + x); 298 } 299 ReadFloatRow<4>(row_in, simd_len, len, row_out); 300 } 301 302 void ReadFloatRowSingleSwap(const uint8_t* row_in, size_t len, 303 float* row_out[kMaxComponents]) { 304 ReadFloatRow<1, true>(row_in, 0, len, row_out); 305 } 306 307 void ReadFloatRowInterleaved2Swap(const uint8_t* row_in, size_t len, 308 float* row_out[kMaxComponents]) { 309 ReadFloatRow<2, true>(row_in, 0, len, row_out); 310 } 311 312 void ReadFloatRowInterleaved3Swap(const uint8_t* row_in, size_t len, 313 float* row_out[kMaxComponents]) { 314 ReadFloatRow<3, true>(row_in, 0, len, row_out); 315 } 316 317 void ReadFloatRowInterleaved4Swap(const uint8_t* row_in, size_t len, 318 float* row_out[kMaxComponents]) { 319 ReadFloatRow<4, true>(row_in, 0, len, row_out); 320 } 321 322 // NOLINTNEXTLINE(google-readability-namespace-comments) 323 } // namespace HWY_NAMESPACE 324 } // namespace jpegli 325 HWY_AFTER_NAMESPACE(); 326 327 #if HWY_ONCE 328 namespace jpegli { 329 330 HWY_EXPORT(ReadUint8RowSingle); 331 HWY_EXPORT(ReadUint8RowInterleaved2); 332 HWY_EXPORT(ReadUint8RowInterleaved3); 333 HWY_EXPORT(ReadUint8RowInterleaved4); 334 HWY_EXPORT(ReadUint16RowSingle); 335 HWY_EXPORT(ReadUint16RowInterleaved2); 336 HWY_EXPORT(ReadUint16RowInterleaved3); 337 HWY_EXPORT(ReadUint16RowInterleaved4); 338 HWY_EXPORT(ReadUint16RowSingleSwap); 339 HWY_EXPORT(ReadUint16RowInterleaved2Swap); 340 HWY_EXPORT(ReadUint16RowInterleaved3Swap); 341 HWY_EXPORT(ReadUint16RowInterleaved4Swap); 342 HWY_EXPORT(ReadFloatRowSingle); 343 HWY_EXPORT(ReadFloatRowInterleaved2); 344 HWY_EXPORT(ReadFloatRowInterleaved3); 345 HWY_EXPORT(ReadFloatRowInterleaved4); 346 HWY_EXPORT(ReadFloatRowSingleSwap); 347 HWY_EXPORT(ReadFloatRowInterleaved2Swap); 348 HWY_EXPORT(ReadFloatRowInterleaved3Swap); 349 HWY_EXPORT(ReadFloatRowInterleaved4Swap); 350 351 void ChooseInputMethod(j_compress_ptr cinfo) { 352 jpeg_comp_master* m = cinfo->master; 353 bool swap_endianness = 354 (m->endianness == JPEGLI_LITTLE_ENDIAN && !IsLittleEndian()) || 355 (m->endianness == JPEGLI_BIG_ENDIAN && IsLittleEndian()); 356 m->input_method = nullptr; 357 if (m->data_type == JPEGLI_TYPE_UINT8) { 358 if (cinfo->raw_data_in || cinfo->input_components == 1) { 359 m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint8RowSingle); 360 } else if (cinfo->input_components == 2) { 361 m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint8RowInterleaved2); 362 } else if (cinfo->input_components == 3) { 363 m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint8RowInterleaved3); 364 } else if (cinfo->input_components == 4) { 365 m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint8RowInterleaved4); 366 } 367 } else if (m->data_type == JPEGLI_TYPE_UINT16 && !swap_endianness) { 368 if (cinfo->raw_data_in || cinfo->input_components == 1) { 369 m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowSingle); 370 } else if (cinfo->input_components == 2) { 371 m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowInterleaved2); 372 } else if (cinfo->input_components == 3) { 373 m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowInterleaved3); 374 } else if (cinfo->input_components == 4) { 375 m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowInterleaved4); 376 } 377 } else if (m->data_type == JPEGLI_TYPE_UINT16 && swap_endianness) { 378 if (cinfo->raw_data_in || cinfo->input_components == 1) { 379 m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowSingleSwap); 380 } else if (cinfo->input_components == 2) { 381 m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowInterleaved2Swap); 382 } else if (cinfo->input_components == 3) { 383 m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowInterleaved3Swap); 384 } else if (cinfo->input_components == 4) { 385 m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowInterleaved4Swap); 386 } 387 } else if (m->data_type == JPEGLI_TYPE_FLOAT && !swap_endianness) { 388 if (cinfo->raw_data_in || cinfo->input_components == 1) { 389 m->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowSingle); 390 } else if (cinfo->input_components == 2) { 391 m->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowInterleaved2); 392 } else if (cinfo->input_components == 3) { 393 m->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowInterleaved3); 394 } else if (cinfo->input_components == 4) { 395 m->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowInterleaved4); 396 } 397 } else if (m->data_type == JPEGLI_TYPE_FLOAT && swap_endianness) { 398 if (cinfo->raw_data_in || cinfo->input_components == 1) { 399 m->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowSingleSwap); 400 } else if (cinfo->input_components == 2) { 401 m->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowInterleaved2Swap); 402 } else if (cinfo->input_components == 3) { 403 m->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowInterleaved3Swap); 404 } else if (cinfo->input_components == 4) { 405 m->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowInterleaved4Swap); 406 } 407 } 408 if (m->input_method == nullptr) { 409 JPEGLI_ERROR("Could not find input method."); 410 } 411 } 412 413 } // namespace jpegli 414 #endif // HWY_ONCE