libjxl

FORK: libjxl patches used on blog
git clone https://git.neptards.moe/blog/libjxl.git
Log | Files | Refs | Submodules | README | LICENSE

dec_cache.h (8933B)


      1 // Copyright (c) the JPEG XL Project Authors. All rights reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style
      4 // license that can be found in the LICENSE file.
      5 
      6 #ifndef LIB_JXL_DEC_CACHE_H_
      7 #define LIB_JXL_DEC_CACHE_H_
      8 
      9 #include <jxl/decode.h>
     10 #include <jxl/types.h>
     11 #include <stdint.h>
     12 
     13 #include <algorithm>
     14 #include <atomic>
     15 #include <cmath>
     16 #include <hwy/base.h>  // HWY_ALIGN_MAX
     17 #include <memory>
     18 #include <vector>
     19 
     20 #include "hwy/aligned_allocator.h"
     21 #include "lib/jxl/ac_strategy.h"
     22 #include "lib/jxl/base/common.h"  // kMaxNumPasses
     23 #include "lib/jxl/base/compiler_specific.h"
     24 #include "lib/jxl/base/data_parallel.h"
     25 #include "lib/jxl/base/status.h"
     26 #include "lib/jxl/coeff_order.h"
     27 #include "lib/jxl/common.h"
     28 #include "lib/jxl/dct_util.h"
     29 #include "lib/jxl/dec_ans.h"
     30 #include "lib/jxl/dec_xyb.h"
     31 #include "lib/jxl/frame_dimensions.h"
     32 #include "lib/jxl/frame_header.h"
     33 #include "lib/jxl/image.h"
     34 #include "lib/jxl/image_bundle.h"
     35 #include "lib/jxl/image_metadata.h"
     36 #include "lib/jxl/passes_state.h"
     37 #include "lib/jxl/render_pipeline/render_pipeline.h"
     38 #include "lib/jxl/render_pipeline/render_pipeline_stage.h"
     39 #include "lib/jxl/render_pipeline/stage_upsampling.h"
     40 
     41 namespace jxl {
     42 
     // Sigma-image constants. kSigmaPadding is added twice to each dimension of
     // the sigma image allocated in PassesDecoderState::Init(); kSigmaBorder is
     // not referenced in this header (presumably used by the EPF code).
     constexpr size_t kSigmaBorder{1};
     constexpr size_t kSigmaPadding{2};
     45 
     46 struct PixelCallback {
     47   PixelCallback() = default;
     48   PixelCallback(JxlImageOutInitCallback init, JxlImageOutRunCallback run,
     49                 JxlImageOutDestroyCallback destroy, void* init_opaque)
     50       : init(init), run(run), destroy(destroy), init_opaque(init_opaque) {
     51 #if JXL_ENABLE_ASSERT
     52     const bool has_init = init != nullptr;
     53     const bool has_run = run != nullptr;
     54     const bool has_destroy = destroy != nullptr;
     55     const bool healthy = (has_init == has_run) && (has_run == has_destroy);
     56     JXL_ASSERT(healthy);
     57 #endif
     58   }
     59 
     60   bool IsPresent() const { return run != nullptr; }
     61 
     62   void* Init(size_t num_threads, size_t num_pixels) const {
     63     return init(init_opaque, num_threads, num_pixels);
     64   }
     65 
     66   JxlImageOutInitCallback init = nullptr;
     67   JxlImageOutRunCallback run = nullptr;
     68   JxlImageOutDestroyCallback destroy = nullptr;
     69   void* init_opaque = nullptr;
     70 };
     71 
     72 struct ImageOutput {
     73   // Pixel format of the output pixels, used for buffer and callback output.
     74   JxlPixelFormat format;
     75   // Output bit depth for unsigned data types, used for float to int conversion.
     76   size_t bits_per_sample;
     77   // Callback for line-by-line output.
     78   PixelCallback callback;
     79   // Pixel buffer for image output.
     80   void* buffer;
     81   size_t buffer_size;
     82   // Length of a row of image_buffer in bytes (based on oriented width).
     83   size_t stride;
     84 };
     85 
     86 // Per-frame decoder state. All the images here should be accessed through a
     87 // group rect (either with block units or pixel units).
     88 struct PassesDecoderState {
     89   PassesSharedState shared_storage;
     90   // Allows avoiding copies for encoder loop.
     91   const PassesSharedState* JXL_RESTRICT shared = &shared_storage;
     92 
     93   // 8x upsampling stage for DC.
     94   std::unique_ptr<RenderPipelineStage> upsampler8x;
     95 
     96   // For ANS decoding.
     97   std::vector<ANSCode> code;
     98   std::vector<std::vector<uint8_t>> context_map;
     99 
    100   // Multiplier to be applied to the quant matrices of the x channel.
    101   float x_dm_multiplier;
    102   float b_dm_multiplier;
    103 
    104   // Sigma values for EPF.
    105   ImageF sigma;
    106 
    107   // Image dimensions before applying undo_orientation.
    108   size_t width;
    109   size_t height;
    110   ImageOutput main_output;
    111   std::vector<ImageOutput> extra_output;
    112 
    113   // Whether to use int16 float-XYB-to-uint8-srgb conversion.
    114   bool fast_xyb_srgb8_conversion;
    115 
    116   // If true, the RGBA output will be unpremultiplied before writing to the
    117   // output.
    118   bool unpremul_alpha;
    119 
    120   // The render pipeline will apply this orientation to bring the image to the
    121   // intended display orientation.
    122   Orientation undo_orientation;
    123 
    124   // Used for seeding noise.
    125   size_t visible_frame_index = 0;
    126   size_t nonvisible_frame_index = 0;
    127 
    128   // Keep track of the transform types used.
    129   std::atomic<uint32_t> used_acs{0};
    130 
    131   // Storage for coefficients if in "accumulate" mode.
    132   std::unique_ptr<ACImage> coefficients = make_unique<ACImageT<int32_t>>();
    133 
    134   // Rendering pipeline.
    135   std::unique_ptr<RenderPipeline> render_pipeline;
    136 
    137   // Storage for the current frame if it can be referenced by future frames.
    138   ImageBundle frame_storage_for_referencing;
    139 
    140   struct PipelineOptions {
    141     bool use_slow_render_pipeline;
    142     bool coalescing;
    143     bool render_spotcolors;
    144     bool render_noise;
    145   };
    146 
    147   Status PreparePipeline(const FrameHeader& frame_header, ImageBundle* decoded,
    148                          PipelineOptions options);
    149 
    150   // Information for colour conversions.
    151   OutputEncodingInfo output_encoding_info;
    152 
    153   // Initializes decoder-specific structures using information from *shared.
    154   Status Init(const FrameHeader& frame_header) {
    155     x_dm_multiplier = std::pow(1 / (1.25f), frame_header.x_qm_scale - 2.0f);
    156     b_dm_multiplier = std::pow(1 / (1.25f), frame_header.b_qm_scale - 2.0f);
    157 
    158     main_output.callback = PixelCallback();
    159     main_output.buffer = nullptr;
    160     extra_output.clear();
    161 
    162     fast_xyb_srgb8_conversion = false;
    163     unpremul_alpha = false;
    164     undo_orientation = Orientation::kIdentity;
    165 
    166     used_acs = 0;
    167 
    168     upsampler8x = GetUpsamplingStage(shared->metadata->transform_data, 0, 3);
    169     if (frame_header.loop_filter.epf_iters > 0) {
    170       JXL_ASSIGN_OR_RETURN(
    171           sigma,
    172           ImageF::Create(shared->frame_dim.xsize_blocks + 2 * kSigmaPadding,
    173                          shared->frame_dim.ysize_blocks + 2 * kSigmaPadding));
    174     }
    175     return true;
    176   }
    177 
    178   // Initialize the decoder state after all of DC is decoded.
    179   Status InitForAC(size_t num_passes, ThreadPool* pool) {
    180     shared_storage.coeff_order_size = 0;
    181     for (uint8_t o = 0; o < AcStrategy::kNumValidStrategies; ++o) {
    182       if (((1 << o) & used_acs) == 0) continue;
    183       uint8_t ord = kStrategyOrder[o];
    184       shared_storage.coeff_order_size =
    185           std::max(kCoeffOrderOffset[3 * (ord + 1)] * kDCTBlockSize,
    186                    shared_storage.coeff_order_size);
    187     }
    188     size_t sz = num_passes * shared_storage.coeff_order_size;
    189     if (sz > shared_storage.coeff_orders.size()) {
    190       shared_storage.coeff_orders.resize(sz);
    191     }
    192     return true;
    193   }
    194 };
    195 
    196 // Temp images required for decoding a single group. Reduces memory allocations
    197 // for large images because we only initialize min(#threads, #groups) instances.
    198 struct GroupDecCache {
    199   Status InitOnce(size_t num_passes, size_t used_acs) {
    200     for (size_t i = 0; i < num_passes; i++) {
    201       if (num_nzeroes[i].xsize() == 0) {
    202         // Allocate enough for a whole group - partial groups on the
    203         // right/bottom border just use a subset. The valid size is passed via
    204         // Rect.
    205 
    206         JXL_ASSIGN_OR_RETURN(
    207             num_nzeroes[i],
    208             Image3I::Create(kGroupDimInBlocks, kGroupDimInBlocks));
    209       }
    210     }
    211     size_t max_block_area = 0;
    212 
    213     for (uint8_t o = 0; o < AcStrategy::kNumValidStrategies; ++o) {
    214       AcStrategy acs = AcStrategy::FromRawStrategy(o);
    215       if ((used_acs & (1 << o)) == 0) continue;
    216       size_t area =
    217           acs.covered_blocks_x() * acs.covered_blocks_y() * kDCTBlockSize;
    218       max_block_area = std::max(area, max_block_area);
    219     }
    220 
    221     if (max_block_area > max_block_area_) {
    222       max_block_area_ = max_block_area;
    223       // We need 3x float blocks for dequantized coefficients and 1x for scratch
    224       // space for transforms.
    225       float_memory_ = hwy::AllocateAligned<float>(max_block_area_ * 7);
    226       // We need 3x int32 or int16 blocks for quantized coefficients.
    227       int32_memory_ = hwy::AllocateAligned<int32_t>(max_block_area_ * 3);
    228       int16_memory_ = hwy::AllocateAligned<int16_t>(max_block_area_ * 3);
    229     }
    230 
    231     dec_group_block = float_memory_.get();
    232     scratch_space = dec_group_block + max_block_area_ * 3;
    233     dec_group_qblock = int32_memory_.get();
    234     dec_group_qblock16 = int16_memory_.get();
    235     return true;
    236   }
    237 
    238   Status InitDCBufferOnce() {
    239     if (dc_buffer.xsize() == 0) {
    240       JXL_ASSIGN_OR_RETURN(
    241           dc_buffer,
    242           ImageF::Create(kGroupDimInBlocks + kRenderPipelineXOffset * 2,
    243                          kGroupDimInBlocks + 4));
    244     }
    245     return true;
    246   }
    247 
    248   // Scratch space used by DecGroupImpl().
    249   float* dec_group_block;
    250   int32_t* dec_group_qblock;
    251   int16_t* dec_group_qblock16;
    252 
    253   // For TransformToPixels.
    254   float* scratch_space;
    255   // Note that scratch_space is never used at the same time as dec_group_qblock.
    256   // Moreover, only one of dec_group_qblock16 is ever used.
    257   // TODO(veluca): figure out if we can save allocations.
    258 
    259   // AC decoding
    260   Image3I num_nzeroes[kMaxNumPasses];
    261 
    262   // Buffer for DC upsampling.
    263   ImageF dc_buffer;
    264 
    265  private:
    266   hwy::AlignedFreeUniquePtr<float[]> float_memory_;
    267   hwy::AlignedFreeUniquePtr<int32_t[]> int32_memory_;
    268   hwy::AlignedFreeUniquePtr<int16_t[]> int16_memory_;
    269   size_t max_block_area_ = 0;
    270 };
    271 
    272 }  // namespace jxl
    273 
    274 #endif  // LIB_JXL_DEC_CACHE_H_