enc_entropy_coder.cc
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "lib/jxl/enc_entropy_coder.h"

#include <stddef.h>
#include <stdint.h>

#include <algorithm>
#include <utility>
#include <vector>

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "lib/jxl/enc_entropy_coder.cc"
#include <hwy/foreach_target.h>
#include <hwy/highway.h>

#include "lib/jxl/ac_context.h"
#include "lib/jxl/ac_strategy.h"
#include "lib/jxl/base/bits.h"
#include "lib/jxl/base/compiler_specific.h"
#include "lib/jxl/base/status.h"
#include "lib/jxl/coeff_order.h"
#include "lib/jxl/coeff_order_fwd.h"
#include "lib/jxl/dec_ans.h"
#include "lib/jxl/dec_bit_reader.h"
#include "lib/jxl/dec_context_map.h"
#include "lib/jxl/entropy_coder.h"
#include "lib/jxl/epf.h"
#include "lib/jxl/image.h"
#include "lib/jxl/image_ops.h"
#include "lib/jxl/pack_signed.h"

HWY_BEFORE_NAMESPACE();
namespace jxl {
namespace HWY_NAMESPACE {

// These templates are not found via ADL.
using hwy::HWY_NAMESPACE::Add;
using hwy::HWY_NAMESPACE::AndNot;
using hwy::HWY_NAMESPACE::Eq;
using hwy::HWY_NAMESPACE::GetLane;

// Returns the number of non-zero coefficients, skipping the LLF.
// We cannot rely on block[] being all-zero bits, so first truncate to integer.
// Also writes the per-8x8-block nzeros starting at nzeros_pos.
int32_t NumNonZeroExceptLLF(const size_t cx, const size_t cy,
                            const AcStrategy acs, const size_t covered_blocks,
                            const size_t log2_covered_blocks,
                            const int32_t* JXL_RESTRICT block,
                            const size_t nzeros_stride,
                            int32_t* JXL_RESTRICT nzeros_pos) {
  const HWY_CAPPED(int32_t, kBlockDim) di;

  const auto zero = Zero(di);
  // Add FF..FF for every zero coefficient, negate to get #zeros.
  auto neg_sum_zero = zero;

  {
    // Mask sufficient for one row of coefficients.
    HWY_ALIGN const int32_t
        llf_mask_lanes[AcStrategy::kMaxCoeffBlocks * (1 + kBlockDim)] = {
            -1, -1, -1, -1};
    // First cx=1,2,4 elements are FF..FF, others 0.
    const int32_t* llf_mask_pos =
        llf_mask_lanes + AcStrategy::kMaxCoeffBlocks - cx;

    // Rows with LLF: mask out the LLF
    for (size_t y = 0; y < cy; y++) {
      for (size_t x = 0; x < cx * kBlockDim; x += Lanes(di)) {
        const auto llf_mask = LoadU(di, llf_mask_pos + x);

        // LLF counts as zero so we don't include it in nzeros.
        const auto coef =
            AndNot(llf_mask, Load(di, &block[y * cx * kBlockDim + x]));

        neg_sum_zero = Add(neg_sum_zero, VecFromMask(di, Eq(coef, zero)));
      }
    }
  }

  // Remaining rows: no mask
  for (size_t y = cy; y < cy * kBlockDim; y++) {
    for (size_t x = 0; x < cx * kBlockDim; x += Lanes(di)) {
      const auto coef = Load(di, &block[y * cx * kBlockDim + x]);
      neg_sum_zero = Add(neg_sum_zero, VecFromMask(di, Eq(coef, zero)));
    }
  }

  // We want area - sum_zero; add because neg_sum_zero is already negated.
  const int32_t nzeros = static_cast<int32_t>(cx * cy * kDCTBlockSize) +
                         GetLane(SumOfLanes(di, neg_sum_zero));

  const int32_t shifted_nzeros = static_cast<int32_t>(
      (nzeros + covered_blocks - 1) >> log2_covered_blocks);
  // Need non-canonicalized dimensions!
  for (size_t y = 0; y < acs.covered_blocks_y(); y++) {
    for (size_t x = 0; x < acs.covered_blocks_x(); x++) {
      nzeros_pos[x + y * nzeros_stride] = shifted_nzeros;
    }
  }

  return nzeros;
}
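
// For reference, the SIMD counting above is equivalent to the following
// scalar sketch (illustrative only, not part of the build; the function name
// is hypothetical). Each lane of neg_sum_zero accumulates -1 per zero
// coefficient, so adding the lane sum to the block area yields the count of
// nonzeros, with the top-left cx*cy LLF corner treated as zero:
//
//   int32_t ScalarNumNonZeroExceptLLF(size_t cx, size_t cy,
//                                     const int32_t* block) {
//     int32_t nzeros = 0;
//     for (size_t y = 0; y < cy * kBlockDim; y++) {
//       for (size_t x = 0; x < cx * kBlockDim; x++) {
//         const bool is_llf = (y < cy && x < cx);  // top-left LLF corner
//         if (!is_llf && block[y * cx * kBlockDim + x] != 0) nzeros++;
//       }
//     }
//     return nzeros;
//   }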

// Specialization for 8x8, where only top-left is LLF/DC.
// About 1% overall speedup vs. NumNonZeroExceptLLF.
int32_t NumNonZero8x8ExceptDC(const int32_t* JXL_RESTRICT block,
                              int32_t* JXL_RESTRICT nzeros_pos) {
  const HWY_CAPPED(int32_t, kBlockDim) di;

  const auto zero = Zero(di);
  // Add FF..FF for every zero coefficient, negate to get #zeros.
  auto neg_sum_zero = zero;

  {
    // First row has DC, so mask
    const size_t y = 0;
    HWY_ALIGN const int32_t dc_mask_lanes[kBlockDim] = {-1};

    for (size_t x = 0; x < kBlockDim; x += Lanes(di)) {
      const auto dc_mask = Load(di, dc_mask_lanes + x);

      // DC counts as zero so we don't include it in nzeros.
      const auto coef = AndNot(dc_mask, Load(di, &block[y * kBlockDim + x]));

      neg_sum_zero = Add(neg_sum_zero, VecFromMask(di, Eq(coef, zero)));
    }
  }

  // Remaining rows: no mask
  for (size_t y = 1; y < kBlockDim; y++) {
    for (size_t x = 0; x < kBlockDim; x += Lanes(di)) {
      const auto coef = Load(di, &block[y * kBlockDim + x]);
      neg_sum_zero = Add(neg_sum_zero, VecFromMask(di, Eq(coef, zero)));
    }
  }

  // We want 64 - sum_zero; add because neg_sum_zero is already negated.
  const int32_t nzeros = static_cast<int32_t>(kDCTBlockSize) +
                         GetLane(SumOfLanes(di, neg_sum_zero));

  *nzeros_pos = nzeros;

  return nzeros;
}
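
// The tokenization loop below emits coefficients via PackSigned() (from
// lib/jxl/pack_signed.h), which maps signed values to unsigned tokens with a
// zigzag encoding so that small magnitudes get small token values:
// 0 -> 0, -1 -> 1, 1 -> 2, -2 -> 3, 2 -> 4, ...
// An equivalent scalar sketch (illustrative only, ignoring INT32_MIN):
//
//   uint32_t ZigZag(int32_t s) { return s >= 0 ? 2 * s : -2 * s - 1; }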

// The number of nonzeros of each block is predicted from the top and the left
// blocks, with appropriate scaling to take into account the number of 8x8
// blocks covered by each strategy. The predicted number of nonzeros divided by
// two is used as a context; if this number is above 63, a dedicated context is
// used. If the number of nonzeros of a block is above 63, it is written
// directly using a fixed number of bits (which depends on the size of the
// strategy).
void TokenizeCoefficients(const coeff_order_t* JXL_RESTRICT orders,
                          const Rect& rect,
                          const int32_t* JXL_RESTRICT* JXL_RESTRICT ac_rows,
                          const AcStrategyImage& ac_strategy,
                          const YCbCrChromaSubsampling& cs,
                          Image3I* JXL_RESTRICT tmp_num_nzeroes,
                          std::vector<Token>* JXL_RESTRICT output,
                          const ImageB& qdc, const ImageI& qf,
                          const BlockCtxMap& block_ctx_map) {
  const size_t xsize_blocks = rect.xsize();
  const size_t ysize_blocks = rect.ysize();
  output->clear();
  // TODO(user): update the estimate: usually fewer coefficients are used.
  output->reserve(3 * xsize_blocks * ysize_blocks * kDCTBlockSize);

  size_t offset[3] = {};
  const size_t nzeros_stride = tmp_num_nzeroes->PixelsPerRow();
  for (size_t by = 0; by < ysize_blocks; ++by) {
    size_t sby[3] = {by >> cs.VShift(0), by >> cs.VShift(1),
                     by >> cs.VShift(2)};
    int32_t* JXL_RESTRICT row_nzeros[3] = {
        tmp_num_nzeroes->PlaneRow(0, sby[0]),
        tmp_num_nzeroes->PlaneRow(1, sby[1]),
        tmp_num_nzeroes->PlaneRow(2, sby[2]),
    };
    const int32_t* JXL_RESTRICT row_nzeros_top[3] = {
        sby[0] == 0 ? nullptr : tmp_num_nzeroes->ConstPlaneRow(0, sby[0] - 1),
        sby[1] == 0 ? nullptr : tmp_num_nzeroes->ConstPlaneRow(1, sby[1] - 1),
        sby[2] == 0 ? nullptr : tmp_num_nzeroes->ConstPlaneRow(2, sby[2] - 1),
    };
    const uint8_t* JXL_RESTRICT row_qdc =
        qdc.ConstRow(rect.y0() + by) + rect.x0();
    const int32_t* JXL_RESTRICT row_qf = rect.ConstRow(qf, by);
    AcStrategyRow acs_row = ac_strategy.ConstRow(rect, by);
    for (size_t bx = 0; bx < xsize_blocks; ++bx) {
      AcStrategy acs = acs_row[bx];
      if (!acs.IsFirstBlock()) continue;
      size_t sbx[3] = {bx >> cs.HShift(0), bx >> cs.HShift(1),
                       bx >> cs.HShift(2)};
      size_t cx = acs.covered_blocks_x();
      size_t cy = acs.covered_blocks_y();
      const size_t covered_blocks = cx * cy;  // = #LLF coefficients
      const size_t log2_covered_blocks =
          Num0BitsBelowLS1Bit_Nonzero(covered_blocks);
      const size_t size = covered_blocks * kDCTBlockSize;

      CoefficientLayout(&cy, &cx);  // swap cx/cy to canonical order

      for (int c : {1, 0, 2}) {
        if (sbx[c] << cs.HShift(c) != bx) continue;
        if (sby[c] << cs.VShift(c) != by) continue;
        const int32_t* JXL_RESTRICT block = ac_rows[c] + offset[c];

        int32_t nzeros =
            (covered_blocks == 1)
                ? NumNonZero8x8ExceptDC(block, row_nzeros[c] + sbx[c])
                : NumNonZeroExceptLLF(cx, cy, acs, covered_blocks,
                                      log2_covered_blocks, block,
                                      nzeros_stride, row_nzeros[c] + sbx[c]);

        int ord = kStrategyOrder[acs.RawStrategy()];
        const coeff_order_t* JXL_RESTRICT order =
            &orders[CoeffOrderOffset(ord, c)];

        int32_t predicted_nzeros =
            PredictFromTopAndLeft(row_nzeros_top[c], row_nzeros[c], sbx[c], 32);
        size_t block_ctx =
            block_ctx_map.Context(row_qdc[bx], row_qf[sbx[c]], ord, c);
        const int32_t nzero_ctx =
            block_ctx_map.NonZeroContext(predicted_nzeros, block_ctx);

        output->emplace_back(nzero_ctx, nzeros);
        const size_t histo_offset =
            block_ctx_map.ZeroDensityContextsOffset(block_ctx);
        // Skip LLF.
        size_t prev = (nzeros > static_cast<ssize_t>(size / 16) ? 0 : 1);
        for (size_t k = covered_blocks; k < size && nzeros != 0; ++k) {
          int32_t coeff = block[order[k]];
          size_t ctx =
              histo_offset + ZeroDensityContext(nzeros, k, covered_blocks,
                                                log2_covered_blocks, prev);
          uint32_t u_coeff = PackSigned(coeff);
          output->emplace_back(ctx, u_coeff);
          prev = (coeff != 0) ? 1 : 0;
          nzeros -= prev;
        }
        JXL_DASSERT(nzeros == 0);
        offset[c] += size;
      }
    }
  }
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace jxl
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace jxl {
HWY_EXPORT(TokenizeCoefficients);
void TokenizeCoefficients(const coeff_order_t* JXL_RESTRICT orders,
                          const Rect& rect,
                          const int32_t* JXL_RESTRICT* JXL_RESTRICT ac_rows,
                          const AcStrategyImage& ac_strategy,
                          const YCbCrChromaSubsampling& cs,
                          Image3I* JXL_RESTRICT tmp_num_nzeroes,
                          std::vector<Token>* JXL_RESTRICT output,
                          const ImageB& qdc, const ImageI& qf,
                          const BlockCtxMap& block_ctx_map) {
  HWY_DYNAMIC_DISPATCH(TokenizeCoefficients)
  (orders, rect, ac_rows, ac_strategy, cs, tmp_num_nzeroes, output, qdc, qf,
   block_ctx_map);
}

}  // namespace jxl
#endif  // HWY_ONCE
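
// Usage sketch (hypothetical caller, for illustration only): the encoder
// gathers tokens per group and histogram-codes them afterwards, roughly:
//
//   std::vector<Token> tokens;
//   TokenizeCoefficients(orders, rect, ac_rows, ac_strategy, cs,
//                        &tmp_num_nzeroes, &tokens, qdc, qf, block_ctx_map);
//   // tokens are subsequently entropy-coded (ANS or prefix codes) by the
//   // encoder's histogram-building pass; see lib/jxl/enc_ans.h.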