entropy_coding-inl.h (7290B)
1 // Copyright (c) the JPEG XL Project Authors. All rights reserved. 2 // 3 // Use of this source code is governed by a BSD-style 4 // license that can be found in the LICENSE file. 5 6 #if defined(LIB_JPEGLI_ENTROPY_CODING_INL_H_) == defined(HWY_TARGET_TOGGLE) 7 #ifdef LIB_JPEGLI_ENTROPY_CODING_INL_H_ 8 #undef LIB_JPEGLI_ENTROPY_CODING_INL_H_ 9 #else 10 #define LIB_JPEGLI_ENTROPY_CODING_INL_H_ 11 #endif 12 13 #include "lib/jxl/base/compiler_specific.h" 14 15 HWY_BEFORE_NAMESPACE(); 16 namespace jpegli { 17 namespace HWY_NAMESPACE { 18 namespace { 19 20 // These templates are not found via ADL. 21 using hwy::HWY_NAMESPACE::Abs; 22 using hwy::HWY_NAMESPACE::Add; 23 using hwy::HWY_NAMESPACE::And; 24 using hwy::HWY_NAMESPACE::AndNot; 25 using hwy::HWY_NAMESPACE::Compress; 26 using hwy::HWY_NAMESPACE::CountTrue; 27 using hwy::HWY_NAMESPACE::Eq; 28 using hwy::HWY_NAMESPACE::GetLane; 29 using hwy::HWY_NAMESPACE::MaskFromVec; 30 using hwy::HWY_NAMESPACE::Max; 31 using hwy::HWY_NAMESPACE::Not; 32 using hwy::HWY_NAMESPACE::Or; 33 using hwy::HWY_NAMESPACE::ShiftRight; 34 using hwy::HWY_NAMESPACE::Shl; 35 using hwy::HWY_NAMESPACE::Sub; 36 37 using DI = HWY_FULL(int32_t); 38 constexpr DI di; 39 40 template <typename DI, class V> 41 JXL_INLINE V NumBits(DI di, const V x) { 42 // TODO(szabadka) Add faster implementations for some specific architectures. 43 const auto b1 = And(x, Set(di, 1)); 44 const auto b2 = And(x, Set(di, 2)); 45 const auto b3 = Sub((And(x, Set(di, 4))), Set(di, 1)); 46 const auto b4 = Sub((And(x, Set(di, 8))), Set(di, 4)); 47 const auto b5 = Sub((And(x, Set(di, 16))), Set(di, 11)); 48 const auto b6 = Sub((And(x, Set(di, 32))), Set(di, 26)); 49 const auto b7 = Sub((And(x, Set(di, 64))), Set(di, 57)); 50 const auto b8 = Sub((And(x, Set(di, 128))), Set(di, 120)); 51 const auto b9 = Sub((And(x, Set(di, 256))), Set(di, 247)); 52 const auto b10 = Sub((And(x, Set(di, 512))), Set(di, 502)); 53 const auto b11 = Sub((And(x, Set(di, 1024))), Set(di, 1013)); 54 const auto b12 = Sub((And(x, Set(di, 2048))), Set(di, 2036)); 55 return Max(Max(Max(Max(b1, b2), Max(b3, b4)), Max(Max(b5, b6), Max(b7, b8))), 56 Max(Max(b9, b10), Max(b11, b12))); 57 } 58 59 // Coefficient indexes pre-multiplied by 16 for the symbol calculation. 60 HWY_ALIGN constexpr int32_t kIndexes[64] = { 61 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 62 208, 224, 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 63 416, 432, 448, 464, 480, 496, 512, 528, 544, 560, 576, 592, 608, 64 624, 640, 656, 672, 688, 704, 720, 736, 752, 768, 784, 800, 816, 65 832, 848, 864, 880, 896, 912, 928, 944, 960, 976, 992, 1008, 66 }; 67 68 JXL_INLINE int CompactBlock(int32_t* JXL_RESTRICT block, 69 int32_t* JXL_RESTRICT nonzero_idx) { 70 const auto zero = Zero(di); 71 HWY_ALIGN constexpr int32_t dc_mask_lanes[HWY_LANES(DI)] = {-1}; 72 const auto dc_mask = MaskFromVec(Load(di, dc_mask_lanes)); 73 int num_nonzeros = 0; 74 int k = 0; 75 { 76 const auto coef = Load(di, block); 77 const auto idx = Load(di, kIndexes); 78 const auto nonzero_mask = Or(dc_mask, Not(Eq(coef, zero))); 79 const auto nzero_coef = Compress(coef, nonzero_mask); 80 const auto nzero_idx = Compress(idx, nonzero_mask); 81 StoreU(nzero_coef, di, &block[num_nonzeros]); 82 StoreU(nzero_idx, di, &nonzero_idx[num_nonzeros]); 83 num_nonzeros += CountTrue(di, nonzero_mask); 84 k += Lanes(di); 85 } 86 for (; k < DCTSIZE2; k += Lanes(di)) { 87 const auto coef = Load(di, &block[k]); 88 const auto idx = Load(di, &kIndexes[k]); 89 const auto nonzero_mask = Not(Eq(coef, zero)); 90 const auto nzero_coef = Compress(coef, nonzero_mask); 91 const auto nzero_idx = Compress(idx, nonzero_mask); 92 StoreU(nzero_coef, di, &block[num_nonzeros]); 93 StoreU(nzero_idx, di, &nonzero_idx[num_nonzeros]); 94 num_nonzeros += CountTrue(di, nonzero_mask); 95 } 96 return num_nonzeros; 97 } 98 99 JXL_INLINE void ComputeSymbols(const int num_nonzeros, 100 int32_t* JXL_RESTRICT nonzero_idx, 101 int32_t* JXL_RESTRICT block, 102 int32_t* JXL_RESTRICT symbols) { 103 nonzero_idx[-1] = -16; 104 const auto one = Set(di, 1); 105 const auto offset = Set(di, 16); 106 for (int i = 0; i < num_nonzeros; i += Lanes(di)) { 107 const auto idx = Load(di, &nonzero_idx[i]); 108 const auto prev_idx = LoadU(di, &nonzero_idx[i - 1]); 109 const auto coeff = Load(di, &block[i]); 110 const auto nbits = NumBits(di, Abs(coeff)); 111 const auto mask = ShiftRight<8 * sizeof(int32_t) - 1>(coeff); 112 const auto bits = And(Add(coeff, mask), Sub(Shl(one, nbits), one)); 113 const auto symbol = Sub(Add(nbits, idx), Add(prev_idx, offset)); 114 Store(symbol, di, symbols + i); 115 Store(bits, di, block + i); 116 } 117 } 118 119 template <typename T> 120 int NumNonZero8x8ExceptDC(const T* block) { 121 const HWY_CAPPED(T, 8) di; 122 123 const auto zero = Zero(di); 124 // Add FFFF for every zero coefficient, negate to get #zeros. 125 auto neg_sum_zero = zero; 126 { 127 // First row has DC, so mask 128 const size_t y = 0; 129 HWY_ALIGN const T dc_mask_lanes[8] = {-1}; 130 131 for (size_t x = 0; x < 8; x += Lanes(di)) { 132 const auto dc_mask = Load(di, dc_mask_lanes + x); 133 134 // DC counts as zero so we don't include it in nzeros. 135 const auto coef = AndNot(dc_mask, Load(di, &block[y * 8 + x])); 136 137 neg_sum_zero = Add(neg_sum_zero, VecFromMask(di, Eq(coef, zero))); 138 } 139 } 140 // Remaining rows: no mask 141 for (size_t y = 1; y < 8; y++) { 142 for (size_t x = 0; x < 8; x += Lanes(di)) { 143 const auto coef = Load(di, &block[y * 8 + x]); 144 neg_sum_zero = Add(neg_sum_zero, VecFromMask(di, Eq(coef, zero))); 145 } 146 } 147 148 // We want 64 - sum_zero, add because neg_sum_zero is already negated. 149 return kDCTBlockSize + GetLane(SumOfLanes(di, neg_sum_zero)); 150 } 151 152 template <typename T, bool zig_zag_order> 153 void ComputeTokensForBlock(const T* block, int last_dc, int dc_ctx, int ac_ctx, 154 Token** tokens_ptr) { 155 Token* next_token = *tokens_ptr; 156 coeff_t temp2; 157 coeff_t temp; 158 temp = block[0] - last_dc; 159 if (temp == 0) { 160 *next_token++ = Token(dc_ctx, 0, 0); 161 } else { 162 temp2 = temp; 163 if (temp < 0) { 164 temp = -temp; 165 temp2--; 166 } 167 int dc_nbits = jxl::FloorLog2Nonzero<uint32_t>(temp) + 1; 168 int dc_mask = (1 << dc_nbits) - 1; 169 *next_token++ = Token(dc_ctx, dc_nbits, temp2 & dc_mask); 170 } 171 int num_nonzeros = NumNonZero8x8ExceptDC(block); 172 for (int k = 1; k < 64; ++k) { 173 if (num_nonzeros == 0) { 174 *next_token++ = Token(ac_ctx, 0, 0); 175 break; 176 } 177 int r = 0; 178 if (zig_zag_order) { 179 while ((temp = block[k]) == 0) { 180 r++; 181 k++; 182 } 183 } else { 184 while ((temp = block[kJPEGNaturalOrder[k]]) == 0) { 185 r++; 186 k++; 187 } 188 } 189 --num_nonzeros; 190 if (temp < 0) { 191 temp = -temp; 192 temp2 = ~temp; 193 } else { 194 temp2 = temp; 195 } 196 while (r > 15) { 197 *next_token++ = Token(ac_ctx, 0xf0, 0); 198 r -= 16; 199 } 200 int ac_nbits = jxl::FloorLog2Nonzero<uint32_t>(temp) + 1; 201 int ac_mask = (1 << ac_nbits) - 1; 202 int symbol = (r << 4u) + ac_nbits; 203 *next_token++ = Token(ac_ctx, symbol, temp2 & ac_mask); 204 } 205 *tokens_ptr = next_token; 206 } 207 208 // NOLINTNEXTLINE(google-readability-namespace-comments) 209 } // namespace 210 } // namespace HWY_NAMESPACE 211 } // namespace jpegli 212 HWY_AFTER_NAMESPACE(); 213 #endif // LIB_JPEGLI_ENTROPY_CODING_INL_H_