libjxl

FORK: libjxl patches used on the blog
git clone https://git.neptards.moe/blog/libjxl.git

low_memory_render_pipeline.cc (36622B)


      1 // Copyright (c) the JPEG XL Project Authors. All rights reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style
      4 // license that can be found in the LICENSE file.
      5 
      6 #include "lib/jxl/render_pipeline/low_memory_render_pipeline.h"
      7 
      8 #include <algorithm>
      9 
     10 #include "lib/jxl/base/arch_macros.h"
     11 #include "lib/jxl/base/status.h"
     12 #include "lib/jxl/image_ops.h"
     13 
     14 namespace jxl {
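        // Converts a size expressed in color-channel ("base") coordinates into
        // channel c's own coordinates at the given stage, i.e.
        // DivCeil(in << base_color_shift_, 1 << channel_shifts_[stage][c]).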
     15 std::pair<size_t, size_t>
     16 LowMemoryRenderPipeline::ColorDimensionsToChannelDimensions(
     17     std::pair<size_t, size_t> in, size_t c, size_t stage) const {
     18   std::pair<size_t, size_t> ret;
     19   std::pair<size_t, size_t> shift = channel_shifts_[stage][c];
     20   ret.first =
     21       ((in.first << base_color_shift_) + (1 << shift.first) - 1) >> shift.first;
     22   ret.second = ((in.second << base_color_shift_) + (1 << shift.second) - 1) >>
     23                shift.second;
     24   return ret;
     25 }
     26 
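        // Number of border pixels (x, y) stored per group edge for channel c:
        // the shared group border converted to this channel's resolution, plus
        // the input padding padding_[0][c] needed for that channel.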
     27 std::pair<size_t, size_t> LowMemoryRenderPipeline::BorderToStore(
     28     size_t c) const {
     29   auto ret = ColorDimensionsToChannelDimensions(group_border_, c, 0);
     30   ret.first += padding_[0][c].first;
     31   ret.second += padding_[0][c].second;
     32   return ret;
     33 }
     34 
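        // Copies the borders of group (gx, gy) of channel c from the group
        // buffer `in` into the shared border storage. borders_horizontal_[c]
        // is a stack of strips of height bordery_write: strip 2*gy-1 holds the
        // top edge of the groups in row gy and strip 2*gy their bottom edge,
        // each at the group's x range. borders_vertical_[c] holds the
        // left/right edges the same way, as strips of width borderx_write
        // indexed by gx.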
     35 void LowMemoryRenderPipeline::SaveBorders(size_t group_id, size_t c,
     36                                           const ImageF& in) {
     37   size_t gy = group_id / frame_dimensions_.xsize_groups;
     38   size_t gx = group_id % frame_dimensions_.xsize_groups;
     39   size_t hshift = channel_shifts_[0][c].first;
     40   size_t vshift = channel_shifts_[0][c].second;
     41   size_t x0 = gx * GroupInputXSize(c);
     42   size_t x1 = std::min((gx + 1) * GroupInputXSize(c),
     43                        DivCeil(frame_dimensions_.xsize_upsampled, 1 << hshift));
     44   size_t y0 = gy * GroupInputYSize(c);
     45   size_t y1 = std::min((gy + 1) * GroupInputYSize(c),
     46                        DivCeil(frame_dimensions_.ysize_upsampled, 1 << vshift));
     47 
     48   auto borders = BorderToStore(c);
     49   size_t borderx_write = borders.first;
     50   size_t bordery_write = borders.second;
     51 
     52   if (gy > 0) {
     53     Rect from(group_data_x_border_, group_data_y_border_, x1 - x0,
     54               bordery_write);
     55     Rect to(x0, (gy * 2 - 1) * bordery_write, x1 - x0, bordery_write);
     56     CopyImageTo(from, in, to, &borders_horizontal_[c]);
     57   }
     58   if (gy + 1 < frame_dimensions_.ysize_groups) {
     59     Rect from(group_data_x_border_,
     60               group_data_y_border_ + y1 - y0 - bordery_write, x1 - x0,
     61               bordery_write);
     62     Rect to(x0, (gy * 2) * bordery_write, x1 - x0, bordery_write);
     63     CopyImageTo(from, in, to, &borders_horizontal_[c]);
     64   }
     65   if (gx > 0) {
     66     Rect from(group_data_x_border_, group_data_y_border_, borderx_write,
     67               y1 - y0);
     68     Rect to((gx * 2 - 1) * borderx_write, y0, borderx_write, y1 - y0);
     69     CopyImageTo(from, in, to, &borders_vertical_[c]);
     70   }
     71   if (gx + 1 < frame_dimensions_.xsize_groups) {
     72     Rect from(group_data_x_border_ + x1 - x0 - borderx_write,
     73               group_data_y_border_, borderx_write, y1 - y0);
     74     Rect to((gx * 2) * borderx_write, y0, borderx_write, y1 - y0);
     75     CopyImageTo(from, in, to, &borders_vertical_[c]);
     76   }
     77 }
     78 
     79 void LowMemoryRenderPipeline::LoadBorders(size_t group_id, size_t c,
     80                                           const Rect& r, ImageF* out) {
     81   size_t gy = group_id / frame_dimensions_.xsize_groups;
     82   size_t gx = group_id % frame_dimensions_.xsize_groups;
     83   size_t hshift = channel_shifts_[0][c].first;
     84   size_t vshift = channel_shifts_[0][c].second;
     85   // Coordinates of the group in the image.
     86   size_t x0 = gx * GroupInputXSize(c);
     87   size_t x1 = std::min((gx + 1) * GroupInputXSize(c),
     88                        DivCeil(frame_dimensions_.xsize_upsampled, 1 << hshift));
     89   size_t y0 = gy * GroupInputYSize(c);
     90   size_t y1 = std::min((gy + 1) * GroupInputYSize(c),
     91                        DivCeil(frame_dimensions_.ysize_upsampled, 1 << vshift));
     92 
     93   size_t paddingx = padding_[0][c].first;
     94   size_t paddingy = padding_[0][c].second;
     95 
     96   auto borders = BorderToStore(c);
     97   size_t borderx_write = borders.first;
     98   size_t bordery_write = borders.second;
     99 
    100   // Limits of the area to copy from, in image coordinates.
    101   JXL_DASSERT(r.x0() == 0 || (r.x0() << base_color_shift_) >= paddingx);
    102   size_t x0src = DivCeil(r.x0() << base_color_shift_, 1 << hshift);
    103   if (x0src != 0) {
    104     x0src -= paddingx;
    105   }
    106   // r may be such that r.x1 (namely x0() + xsize()) is within paddingx of the
    107   // right side of the image, so we use min() here.
    108   size_t x1src =
    109       DivCeil((r.x0() + r.xsize()) << base_color_shift_, 1 << hshift);
    110   x1src = std::min(x1src + paddingx,
    111                    DivCeil(frame_dimensions_.xsize_upsampled, 1 << hshift));
    112 
    113   // Similar computation for y.
    114   JXL_DASSERT(r.y0() == 0 || (r.y0() << base_color_shift_) >= paddingy);
    115   size_t y0src = DivCeil(r.y0() << base_color_shift_, 1 << vshift);
    116   if (y0src != 0) {
    117     y0src -= paddingy;
    118   }
    119   size_t y1src =
    120       DivCeil((r.y0() + r.ysize()) << base_color_shift_, 1 << vshift);
    121   y1src = std::min(y1src + paddingy,
    122                    DivCeil(frame_dimensions_.ysize_upsampled, 1 << vshift));
    123 
    124   // Copy other groups' borders from the border storage.
    125   if (y0src < y0) {
    126     JXL_DASSERT(gy > 0);
    127     CopyImageTo(
    128         Rect(x0src, (gy * 2 - 2) * bordery_write, x1src - x0src, bordery_write),
    129         borders_horizontal_[c],
    130         Rect(group_data_x_border_ + x0src - x0,
    131              group_data_y_border_ - bordery_write, x1src - x0src,
    132              bordery_write),
    133         out);
    134   }
    135   if (y1src > y1) {
    136     // When copying the bottom border we must not be in the bottom row of groups.
    137     JXL_DASSERT(gy + 1 < frame_dimensions_.ysize_groups);
    138     CopyImageTo(
    139         Rect(x0src, (gy * 2 + 1) * bordery_write, x1src - x0src, bordery_write),
    140         borders_horizontal_[c],
    141         Rect(group_data_x_border_ + x0src - x0, group_data_y_border_ + y1 - y0,
    142              x1src - x0src, bordery_write),
    143         out);
    144   }
    145   if (x0src < x0) {
    146     JXL_DASSERT(gx > 0);
    147     CopyImageTo(
    148         Rect((gx * 2 - 2) * borderx_write, y0src, borderx_write, y1src - y0src),
    149         borders_vertical_[c],
    150         Rect(group_data_x_border_ - borderx_write,
    151              group_data_y_border_ + y0src - y0, borderx_write, y1src - y0src),
    152         out);
    153   }
    154   if (x1src > x1) {
    155     // When copying the right border we must not be in the rightmost column of groups.
    156     JXL_DASSERT(gx + 1 < frame_dimensions_.xsize_groups);
    157     CopyImageTo(
    158         Rect((gx * 2 + 1) * borderx_write, y0src, borderx_write, y1src - y0src),
    159         borders_vertical_[c],
    160         Rect(group_data_x_border_ + x1 - x0, group_data_y_border_ + y0src - y0,
    161              borderx_write, y1src - y0src),
    162         out);
    163   }
    164 }
    165 
    166 size_t LowMemoryRenderPipeline::GroupInputXSize(size_t c) const {
    167   return (frame_dimensions_.group_dim << base_color_shift_) >>
    168          channel_shifts_[0][c].first;
    169 }
    170 
    171 size_t LowMemoryRenderPipeline::GroupInputYSize(size_t c) const {
    172   return (frame_dimensions_.group_dim << base_color_shift_) >>
    173          channel_shifts_[0][c].second;
    174 }
    175 
    176 Status LowMemoryRenderPipeline::EnsureBordersStorage() {
    177   const auto& shifts = channel_shifts_[0];
    178   if (borders_horizontal_.size() < shifts.size()) {
    179     borders_horizontal_.resize(shifts.size());
    180     borders_vertical_.resize(shifts.size());
    181   }
    182   for (size_t c = 0; c < shifts.size(); c++) {
    183     auto borders = BorderToStore(c);
    184     size_t borderx = borders.first;
    185     size_t bordery = borders.second;
    186     JXL_DASSERT(frame_dimensions_.xsize_groups > 0);
    187     size_t num_xborders = (frame_dimensions_.xsize_groups - 1) * 2;
    188     JXL_DASSERT(frame_dimensions_.ysize_groups > 0);
    189     size_t num_yborders = (frame_dimensions_.ysize_groups - 1) * 2;
    190     size_t downsampled_xsize =
    191         DivCeil(frame_dimensions_.xsize_upsampled_padded, 1 << shifts[c].first);
    192     size_t downsampled_ysize = DivCeil(frame_dimensions_.ysize_upsampled_padded,
    193                                        1 << shifts[c].second);
    194     Rect horizontal = Rect(0, 0, downsampled_xsize, bordery * num_yborders);
    195     if (!SameSize(horizontal, borders_horizontal_[c])) {
    196       JXL_ASSIGN_OR_RETURN(
    197           borders_horizontal_[c],
    198           ImageF::Create(horizontal.xsize(), horizontal.ysize()));
    199     }
    200     Rect vertical = Rect(0, 0, borderx * num_xborders, downsampled_ysize);
    201     if (!SameSize(vertical, borders_vertical_[c])) {
    202       JXL_ASSIGN_OR_RETURN(borders_vertical_[c],
    203                            ImageF::Create(vertical.xsize(), vertical.ysize()));
    204     }
    205   }
    206   return true;
    207 }
    208 
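        // Precomputes everything that does not depend on the number of
        // threads: the shared group border (in color-channel coordinates), the
        // per-group buffer borders, the border storage, the first trailing
        // stage (the stage after the last kInOut stage), the first stage that
        // operates in full-image coordinates, and per-stage bookkeeping (a
        // representative non-ignored channel, the input stage per channel, the
        // image rect at that stage's resolution, and the padding to produce).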
    209 Status LowMemoryRenderPipeline::Init() {
    210   group_border_ = {0, 0};
    211   base_color_shift_ = CeilLog2Nonzero(frame_dimensions_.xsize_upsampled_padded /
    212                                       frame_dimensions_.xsize_padded);
    213 
    214   const auto& shifts = channel_shifts_[0];
    215 
    216   // Ensure that each channel has enough border pixels.
    217   for (size_t c = 0; c < shifts.size(); c++) {
    218     group_border_.first =
    219         std::max(group_border_.first,
    220                  DivCeil(padding_[0][c].first << channel_shifts_[0][c].first,
    221                          1 << base_color_shift_));
    222     group_border_.second =
    223         std::max(group_border_.second,
    224                  DivCeil(padding_[0][c].second << channel_shifts_[0][c].second,
    225                          1 << base_color_shift_));
    226   }
    227 
    228   // Ensure that all channels have an integer number of border pixels in the
    229   // input.
    230   for (size_t c = 0; c < shifts.size(); c++) {
    231     if (channel_shifts_[0][c].first >= base_color_shift_) {
    232       group_border_.first =
    233           RoundUpTo(group_border_.first,
    234                     1 << (channel_shifts_[0][c].first - base_color_shift_));
    235     }
    236     if (channel_shifts_[0][c].second >= base_color_shift_) {
    237       group_border_.second =
    238           RoundUpTo(group_border_.second,
    239                     1 << (channel_shifts_[0][c].second - base_color_shift_));
    240     }
    241   }
    242   // Ensure that the X border on color channels is a multiple of kBlockDim or
    243   // the vector size (required for EPF stages). Vectors on ARM NEON are never
    244   // wider than 4 floats, so rounding to multiples of 4 is enough.
    245 #if JXL_ARCH_ARM
    246   constexpr size_t kGroupXAlign = 4;
    247 #else
    248   constexpr size_t kGroupXAlign = 16;
    249 #endif
    250   group_border_.first = RoundUpTo(group_border_.first, kGroupXAlign);
    251   // Allocate borders in group images that are just enough for storing the
    252   // borders to be copied in, plus any rounding to ensure alignment.
    253   std::pair<size_t, size_t> max_border = {0, 0};
    254   for (size_t c = 0; c < shifts.size(); c++) {
    255     max_border.first = std::max(BorderToStore(c).first, max_border.first);
    256     max_border.second = std::max(BorderToStore(c).second, max_border.second);
    257   }
    258   group_data_x_border_ = RoundUpTo(max_border.first, kGroupXAlign);
    259   group_data_y_border_ = max_border.second;
    260 
    261   JXL_RETURN_IF_ERROR(EnsureBordersStorage());
    262   group_border_assigner_.Init(frame_dimensions_);
    263 
    264   for (first_trailing_stage_ = stages_.size(); first_trailing_stage_ > 0;
    265        first_trailing_stage_--) {
    266     bool has_inout_c = false;
    267     for (size_t c = 0; c < shifts.size(); c++) {
    268       if (stages_[first_trailing_stage_ - 1]->GetChannelMode(c) ==
    269           RenderPipelineChannelMode::kInOut) {
    270         has_inout_c = true;
    271       }
    272     }
    273     if (has_inout_c) {
    274       break;
    275     }
    276   }
    277 
    278   first_image_dim_stage_ = stages_.size();
    279   for (size_t i = 0; i < stages_.size(); i++) {
    280     std::vector<std::pair<size_t, size_t>> input_sizes(shifts.size());
    281     for (size_t c = 0; c < shifts.size(); c++) {
    282       input_sizes[c] =
    283           std::make_pair(DivCeil(frame_dimensions_.xsize_upsampled,
    284                                  1 << channel_shifts_[i][c].first),
    285                          DivCeil(frame_dimensions_.ysize_upsampled,
    286                                  1 << channel_shifts_[i][c].second));
    287     }
    288     JXL_RETURN_IF_ERROR(stages_[i]->SetInputSizes(input_sizes));
    289     if (stages_[i]->SwitchToImageDimensions()) {
    290       // We don't allow kInOut after switching to image dimensions.
    291       JXL_ASSERT(i >= first_trailing_stage_);
    292       first_image_dim_stage_ = i + 1;
    293       stages_[i]->GetImageDimensions(&full_image_xsize_, &full_image_ysize_,
    294                                      &frame_origin_);
    295       break;
    296     }
    297   }
    298   for (size_t i = first_image_dim_stage_; i < stages_.size(); i++) {
    299     if (stages_[i]->SwitchToImageDimensions()) {
    300       JXL_UNREACHABLE("Cannot switch to image dimensions multiple times");
    301     }
    302     std::vector<std::pair<size_t, size_t>> input_sizes(shifts.size());
    303     for (size_t c = 0; c < shifts.size(); c++) {
    304       input_sizes[c] = {full_image_xsize_, full_image_ysize_};
    305     }
    306     JXL_RETURN_IF_ERROR(stages_[i]->SetInputSizes(input_sizes));
    307   }
    308 
    309   anyc_.resize(stages_.size());
    310   for (size_t i = 0; i < stages_.size(); i++) {
    311     for (size_t c = 0; c < shifts.size(); c++) {
    312       if (stages_[i]->GetChannelMode(c) !=
    313           RenderPipelineChannelMode::kIgnored) {
    314         anyc_[i] = c;
    315       }
    316     }
    317   }
    318 
    319   stage_input_for_channel_ = std::vector<std::vector<int32_t>>(
    320       stages_.size(), std::vector<int32_t>(shifts.size()));
    321   for (size_t c = 0; c < shifts.size(); c++) {
    322     int input = -1;
    323     for (size_t i = 0; i < stages_.size(); i++) {
    324       stage_input_for_channel_[i][c] = input;
    325       if (stages_[i]->GetChannelMode(c) == RenderPipelineChannelMode::kInOut) {
    326         input = i;
    327       }
    328     }
    329   }
    330 
    331   image_rect_.resize(stages_.size());
    332   for (size_t i = 0; i < stages_.size(); i++) {
    333     size_t x1 = DivCeil(frame_dimensions_.xsize_upsampled,
    334                         1 << channel_shifts_[i][anyc_[i]].first);
    335     size_t y1 = DivCeil(frame_dimensions_.ysize_upsampled,
    336                         1 << channel_shifts_[i][anyc_[i]].second);
    337     image_rect_[i] = Rect(0, 0, x1, y1);
    338   }
    339 
    340   virtual_ypadding_for_output_.resize(stages_.size());
    341   xpadding_for_output_.resize(stages_.size());
    342   for (size_t c = 0; c < shifts.size(); c++) {
    343     int ypad = 0;
    344     int xpad = 0;
    345     for (size_t i = stages_.size(); i-- > 0;) {
    346       if (stages_[i]->GetChannelMode(c) !=
    347           RenderPipelineChannelMode::kIgnored) {
    348         virtual_ypadding_for_output_[i] =
    349             std::max(ypad, virtual_ypadding_for_output_[i]);
    350         xpadding_for_output_[i] = std::max(xpad, xpadding_for_output_[i]);
    351       }
    352       if (stages_[i]->GetChannelMode(c) == RenderPipelineChannelMode::kInOut) {
    353         ypad = (DivCeil(ypad, 1 << channel_shifts_[i][c].second) +
    354                 stages_[i]->settings_.border_y)
    355                << channel_shifts_[i][c].second;
    356         xpad = DivCeil(xpad, 1 << stages_[i]->settings_.shift_x) +
    357                stages_[i]->settings_.border_x;
    358       }
    359     }
    360   }
    361   return true;
    362 }
    363 
    364 Status LowMemoryRenderPipeline::PrepareForThreadsInternal(size_t num,
    365                                                           bool use_group_ids) {
    366   const auto& shifts = channel_shifts_[0];
    367   use_group_ids_ = use_group_ids;
    368   size_t num_buffers = use_group_ids_ ? frame_dimensions_.num_groups : num;
    369   for (size_t t = group_data_.size(); t < num_buffers; t++) {
    370     group_data_.emplace_back();
    371     group_data_[t].resize(shifts.size());
    372     for (size_t c = 0; c < shifts.size(); c++) {
    373       JXL_ASSIGN_OR_RETURN(
    374           group_data_[t][c],
    375           ImageF::Create(GroupInputXSize(c) + group_data_x_border_ * 2,
    376                          GroupInputYSize(c) + group_data_y_border_ * 2));
    377     }
    378   }
    379   // TODO(veluca): avoid reallocating buffers if not needed.
    380   stage_data_.resize(num);
    381   size_t upsampling = 1u << base_color_shift_;
    382   size_t group_dim = frame_dimensions_.group_dim * upsampling;
    383   size_t padding =
    384       2 * group_data_x_border_ * upsampling +  // maximum size of a rect
    385       2 * kRenderPipelineXOffset;              // extra padding for processing
    386   size_t stage_buffer_xsize = group_dim + padding;
    387   for (size_t t = 0; t < num; t++) {
    388     stage_data_[t].resize(shifts.size());
    389     for (size_t c = 0; c < shifts.size(); c++) {
    390       stage_data_[t][c].resize(stages_.size());
    391       size_t next_y_border = 0;
    392       for (size_t i = stages_.size(); i-- > 0;) {
    393         if (stages_[i]->GetChannelMode(c) ==
    394             RenderPipelineChannelMode::kInOut) {
    395           size_t stage_buffer_ysize =
    396               2 * next_y_border + (1 << stages_[i]->settings_.shift_y);
    397           stage_buffer_ysize = 1 << CeilLog2Nonzero(stage_buffer_ysize);
    398           next_y_border = stages_[i]->settings_.border_y;
    399           JXL_ASSIGN_OR_RETURN(
    400               stage_data_[t][c][i],
    401               ImageF::Create(stage_buffer_xsize, stage_buffer_ysize));
    402         }
    403       }
    404     }
    405   }
    406   if (first_image_dim_stage_ != stages_.size()) {
    407     RectT<ssize_t> image_rect(0, 0, frame_dimensions_.xsize_upsampled,
    408                               frame_dimensions_.ysize_upsampled);
    409     RectT<ssize_t> full_image_rect(0, 0, full_image_xsize_, full_image_ysize_);
    410     image_rect = image_rect.Translate(frame_origin_.x0, frame_origin_.y0);
    411     image_rect = image_rect.Intersection(full_image_rect);
    412     if (image_rect.xsize() == 0 || image_rect.ysize() == 0) {
    413       image_rect = RectT<ssize_t>(0, 0, 0, 0);
    414     }
    415     size_t left_padding = image_rect.x0();
    416     size_t middle_padding = group_dim;
    417     size_t right_padding = full_image_xsize_ - image_rect.x1();
    418     size_t out_of_frame_xsize =
    419         padding +
    420         std::max(left_padding, std::max(middle_padding, right_padding));
    421     out_of_frame_data_.resize(num);
    422     for (size_t t = 0; t < num; t++) {
    423       JXL_ASSIGN_OR_RETURN(out_of_frame_data_[t],
    424                            ImageF::Create(out_of_frame_xsize, shifts.size()));
    425     }
    426   }
    427   return true;
    428 }
    429 
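        // Returns, for each channel, the buffer the decoder should write this
        // group's pixels into and the rect inside it where they go: offset by
        // the group data border, with the extra Rect arguments keeping the
        // rect from extending past the image at that channel's resolution.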
    430 std::vector<std::pair<ImageF*, Rect>> LowMemoryRenderPipeline::PrepareBuffers(
    431     size_t group_id, size_t thread_id) {
    432   std::vector<std::pair<ImageF*, Rect>> ret(channel_shifts_[0].size());
    433   const size_t gx = group_id % frame_dimensions_.xsize_groups;
    434   const size_t gy = group_id / frame_dimensions_.xsize_groups;
    435   for (size_t c = 0; c < channel_shifts_[0].size(); c++) {
    436     ret[c].first = &group_data_[use_group_ids_ ? group_id : thread_id][c];
    437     ret[c].second = Rect(group_data_x_border_, group_data_y_border_,
    438                          GroupInputXSize(c), GroupInputYSize(c),
    439                          DivCeil(frame_dimensions_.xsize_upsampled,
    440                                  1 << channel_shifts_[0][c].first) -
    441                              gx * GroupInputXSize(c) + group_data_x_border_,
    442                          DivCeil(frame_dimensions_.ysize_upsampled,
    443                                  1 << channel_shifts_[0][c].second) -
    444                              gy * GroupInputYSize(c) + group_data_y_border_);
    445   }
    446   return ret;
    447 }
    448 
    449 namespace {
    450 
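        // Returns the group-relative y coordinate to actually read from;
        // coordinates falling above or below the image are mirrored back into
        // it. `y` is relative to the group origin `group_y0`.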
    451 JXL_INLINE int GetMirroredY(int y, ssize_t group_y0, ssize_t image_ysize) {
    452   if (group_y0 == 0 && (y < 0 || y + group_y0 >= image_ysize)) {
    453     return Mirror(y, image_ysize);
    454   }
    455   if (y + group_y0 >= image_ysize) {
    456     // Here we know that one mirroring step is sufficient.
    457     return 2 * image_ysize - (y + group_y0) - 1 - group_y0;
    458   }
    459   return y;
    460 }
    461 
    462 JXL_INLINE void ApplyXMirroring(float* row, ssize_t borderx, ssize_t group_x0,
    463                                 ssize_t group_xsize, ssize_t image_xsize) {
    464   if (image_xsize <= borderx) {
    465     if (group_x0 == 0) {
    466       for (ssize_t ix = 0; ix < borderx; ix++) {
    467         row[kRenderPipelineXOffset - ix - 1] =
    468             row[kRenderPipelineXOffset + Mirror(-ix - 1, image_xsize)];
    469       }
    470     }
    471     if (group_xsize + borderx + group_x0 >= image_xsize) {
    472       for (ssize_t ix = 0; ix < borderx; ix++) {
    473         row[kRenderPipelineXOffset + image_xsize + ix - group_x0] =
    474             row[kRenderPipelineXOffset + Mirror(image_xsize + ix, image_xsize) -
    475                 group_x0];
    476       }
    477     }
    478   } else {
    479     // Here we know that one mirroring step is sufficient.
    480     if (group_x0 == 0) {
    481       for (ssize_t ix = 0; ix < borderx; ix++) {
    482         row[kRenderPipelineXOffset - ix - 1] = row[kRenderPipelineXOffset + ix];
    483       }
    484     }
    485     if (group_xsize + borderx + group_x0 >= image_xsize) {
    486       for (ssize_t ix = 0; ix < borderx; ix++) {
    487         row[kRenderPipelineXOffset + image_xsize - group_x0 + ix] =
    488             row[kRenderPipelineXOffset + image_xsize - group_x0 - ix - 1];
    489       }
    490     }
    491   }
    492 }
    493 
    494 // Information about where the *output* of each stage is stored.
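        // For each kInOut stage the output rows live in a small ring buffer
        // whose height is a power of two; GetBuffer() addresses it with
        // y & ymod_minus_1. The input data (stage -1) uses ymod_minus_1 == -1,
        // so the mask is a no-op and rows are addressed directly.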
    495 class Rows {
    496  public:
    497   Rows(const std::vector<std::unique_ptr<RenderPipelineStage>>& stages,
    498        const Rect data_max_color_channel_rect, int group_data_x_border,
    499        int group_data_y_border,
    500        const std::vector<std::pair<size_t, size_t>>& group_data_shift,
    501        size_t base_color_shift, std::vector<std::vector<ImageF>>& thread_data,
    502        std::vector<ImageF>& input_data) {
    503     size_t num_stages = stages.size();
    504     size_t num_channels = input_data.size();
    505 
    506     JXL_ASSERT(thread_data.size() == num_channels);
    507     JXL_ASSERT(group_data_shift.size() == num_channels);
    508 
    509 #if JXL_ENABLE_ASSERT
    510     for (const auto& td : thread_data) {
    511       JXL_ASSERT(td.size() == num_stages);
    512     }
    513 #endif
    514 
    515     rows_.resize(num_stages + 1, std::vector<RowInfo>(num_channels));
    516 
    517     for (size_t i = 0; i < num_stages; i++) {
    518       for (size_t c = 0; c < input_data.size(); c++) {
    519         if (stages[i]->GetChannelMode(c) == RenderPipelineChannelMode::kInOut) {
    520           rows_[i + 1][c].ymod_minus_1 = thread_data[c][i].ysize() - 1;
    521           rows_[i + 1][c].base_ptr = thread_data[c][i].Row(0);
    522           rows_[i + 1][c].stride = thread_data[c][i].PixelsPerRow();
    523         }
    524       }
    525     }
    526 
    527     for (size_t c = 0; c < input_data.size(); c++) {
    528       auto channel_group_data_rect =
    529           data_max_color_channel_rect.As<ssize_t>()
    530               .Translate(-group_data_x_border, -group_data_y_border)
    531               .ShiftLeft(base_color_shift)
    532               .CeilShiftRight(group_data_shift[c])
    533               .Translate(group_data_x_border -
    534                              static_cast<ssize_t>(kRenderPipelineXOffset),
    535                          group_data_y_border);
    536       rows_[0][c].base_ptr = channel_group_data_rect.Row(&input_data[c], 0);
    537       rows_[0][c].stride = input_data[c].PixelsPerRow();
    538       rows_[0][c].ymod_minus_1 = -1;
    539     }
    540   }
    541 
    542   // Stage -1 refers to the input data; all other values must be nonnegative and
    543   // refer to the data for the output of that stage.
    544   JXL_INLINE float* GetBuffer(int stage, int y, size_t c) const {
    545     JXL_DASSERT(stage >= -1);
    546     const RowInfo& info = rows_[stage + 1][c];
    547     return info.base_ptr +
    548            static_cast<ssize_t>(info.stride) * (y & info.ymod_minus_1);
    549   }
    550 
    551  private:
    552   struct RowInfo {
    553     // Pointer to beginning of the first row.
    554     float* base_ptr;
    555     // Modulo value for the y axis minus 1 (ymod is guaranteed to be a power of
    556     // 2, which allows efficient mod computation by masking).
    557     int ymod_minus_1;
    558     // Number of floats per row.
    559     size_t stride;
    560   };
    561   std::vector<std::vector<RowInfo>> rows_;
    562 };
    563 
    564 }  // namespace
    565 
    566 Status LowMemoryRenderPipeline::RenderRect(size_t thread_id,
    567                                            std::vector<ImageF>& input_data,
    568                                            Rect data_max_color_channel_rect,
    569                                            Rect image_max_color_channel_rect) {
    570   // For each stage, the rect corresponding to the image area currently being
    571   // processed, in the coordinates of that stage (i.e. with the scaling factor
    572   // that that stage has).
    573   std::vector<Rect> group_rect;
    574   group_rect.resize(stages_.size());
    575   Rect image_area_rect =
    576       image_max_color_channel_rect.ShiftLeft(base_color_shift_)
    577           .Crop(frame_dimensions_.xsize_upsampled,
    578                 frame_dimensions_.ysize_upsampled);
    579   for (size_t i = 0; i < stages_.size(); i++) {
    580     group_rect[i] =
    581         image_area_rect.CeilShiftRight(channel_shifts_[i][anyc_[i]]);
    582   }
    583 
    584   ssize_t frame_x0 =
    585       first_image_dim_stage_ == stages_.size() ? 0 : frame_origin_.x0;
    586   ssize_t frame_y0 =
    587       first_image_dim_stage_ == stages_.size() ? 0 : frame_origin_.y0;
    588   size_t full_image_xsize = first_image_dim_stage_ == stages_.size()
    589                                 ? frame_dimensions_.xsize_upsampled
    590                                 : full_image_xsize_;
    591   size_t full_image_ysize = first_image_dim_stage_ == stages_.size()
    592                                 ? frame_dimensions_.ysize_upsampled
    593                                 : full_image_ysize_;
    594 
    595   // Compute actual x-axis bounds for the current image area in the context of
    596   // the full image this frame is part of. As the left boundary may be negative,
    597   // we also create the x_pixels_skip value, defined as follows:
    598   // - both x_pixels_skip and full_image_x0 are >= 0, and at least one is 0;
    599   // - full_image_x0 - x_pixels_skip is the position of the current frame area
    600   //   in the full image.
    601   ssize_t full_image_x0 = frame_x0 + image_area_rect.x0();
    602   ssize_t x_pixels_skip = 0;
    603   if (full_image_x0 < 0) {
    604     x_pixels_skip = -full_image_x0;
    605     full_image_x0 = 0;
    606   }
    607   ssize_t full_image_x1 = frame_x0 + image_area_rect.x1();
    608   full_image_x1 = std::min<ssize_t>(full_image_x1, full_image_xsize);
    609 
    610   // If the current image area is entirely outside of the visible image, there
    611   // is no point in proceeding. Note: this uses the assumption that if there is
    612   // a stage with observable effects (i.e. a kInput stage), it only appears
    613   // after the stage that switches to image dimensions.
    614   if (full_image_x1 <= full_image_x0) return true;
    615 
    616   // Data structures to hold information about input/output rows and their
    617   // buffers.
    618   Rows rows(stages_, data_max_color_channel_rect, group_data_x_border_,
    619             group_data_y_border_, channel_shifts_[0], base_color_shift_,
    620             stage_data_[thread_id], input_data);
    621 
    622   std::vector<RenderPipelineStage::RowInfo> input_rows(first_trailing_stage_ +
    623                                                        1);
    624   for (size_t i = 0; i < first_trailing_stage_; i++) {
    625     input_rows[i].resize(input_data.size());
    626   }
    627   input_rows[first_trailing_stage_].resize(input_data.size(),
    628                                            std::vector<float*>(1));
    629 
    630   // Maximum possible shift is 3.
    631   RenderPipelineStage::RowInfo output_rows(input_data.size(),
    632                                            std::vector<float*>(8));
    633 
    634   // Fills in input_rows and output_rows for a given y value (relative to the
    635   // start of the group, measured in actual pixels at the appropriate vertical
    636   // scaling factor) and a given stage, applying mirroring if necessary. This
    637   // function is somewhat inefficient for trailing kInOut or kInput stages,
    638   // where just filling the input row once ought to be sufficient.
    639   auto prepare_io_rows = [&](int y, size_t i) {
    640     ssize_t bordery = stages_[i]->settings_.border_y;
    641     size_t shifty = stages_[i]->settings_.shift_y;
    642     auto make_row = [&](size_t c, ssize_t iy) {
    643       size_t mirrored_y = GetMirroredY(y + iy - bordery, group_rect[i].y0(),
    644                                        image_rect_[i].ysize());
    645       input_rows[i][c][iy] =
    646           rows.GetBuffer(stage_input_for_channel_[i][c], mirrored_y, c);
    647       ApplyXMirroring(input_rows[i][c][iy], stages_[i]->settings_.border_x,
    648                       group_rect[i].x0(), group_rect[i].xsize(),
    649                       image_rect_[i].xsize());
    650     };
    651     for (size_t c = 0; c < input_data.size(); c++) {
    652       RenderPipelineChannelMode mode = stages_[i]->GetChannelMode(c);
    653       if (mode == RenderPipelineChannelMode::kIgnored) {
    654         continue;
    655       }
    656       // If we already have rows from a previous iteration, we can just shift
    657       // the rows by 1 and insert the new one.
    658       if (input_rows[i][c].size() == 2 * static_cast<size_t>(bordery) + 1) {
    659         for (ssize_t iy = 0; iy < 2 * bordery; iy++) {
    660           input_rows[i][c][iy] = input_rows[i][c][iy + 1];
    661         }
    662         make_row(c, bordery * 2);
    663       } else {
    664         input_rows[i][c].resize(2 * bordery + 1);
    665         for (ssize_t iy = 0; iy < 2 * bordery + 1; iy++) {
    666           make_row(c, iy);
    667         }
    668       }
    669 
    670       // If necessary, get the output buffers.
    671       if (mode == RenderPipelineChannelMode::kInOut) {
    672         for (size_t iy = 0; iy < (1u << shifty); iy++) {
    673           output_rows[c][iy] = rows.GetBuffer(i, y * (1 << shifty) + iy, c);
    674         }
    675       }
    676     }
    677   };
    678 
    679   // We pretend that every stage has a vertical shift of 0, i.e. it is as tall
    680   // as the final image.
    681   // We call each such row a "virtual" row, because it may or may not correspond
    682   // to an actual row of the current processing stage; actual processing happens
    683   // when vy % (1<<vshift) == 0.
    684 
    685   int num_extra_rows = *std::max_element(virtual_ypadding_for_output_.begin(),
    686                                          virtual_ypadding_for_output_.end());
    687 
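          // Each stage is offset so that stages with more output padding run
          // ahead: at virtual row vy, stage i works on virtual row
          // vy - num_extra_rows + virtual_ypadding_for_output_[i], giving
          // upstream stages the head start needed to produce the border rows
          // their consumers require.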
    688   for (int vy = -num_extra_rows;
    689        vy < static_cast<int>(image_area_rect.ysize()) + num_extra_rows; vy++) {
    690     for (size_t i = 0; i < first_trailing_stage_; i++) {
    691       int stage_vy = vy - num_extra_rows + virtual_ypadding_for_output_[i];
    692 
    693       if (stage_vy % (1 << channel_shifts_[i][anyc_[i]].second) != 0) {
    694         continue;
    695       }
    696 
    697       if (stage_vy < -virtual_ypadding_for_output_[i]) {
    698         continue;
    699       }
    700 
    701       int y = stage_vy >> channel_shifts_[i][anyc_[i]].second;
    702 
    703       ssize_t image_y = static_cast<ssize_t>(group_rect[i].y0()) + y;
    704       // Do not produce rows in out-of-bounds areas.
    705       if (image_y < 0 ||
    706           image_y >= static_cast<ssize_t>(image_rect_[i].ysize())) {
    707         continue;
    708       }
    709 
    710       // Get the input/output rows and potentially apply mirroring to the input.
    711       prepare_io_rows(y, i);
    712 
    713       // Produce output rows.
    714       JXL_RETURN_IF_ERROR(stages_[i]->ProcessRow(
    715           input_rows[i], output_rows, xpadding_for_output_[i],
    716           group_rect[i].xsize(), group_rect[i].x0(), image_y, thread_id));
    717     }
    718 
    719     // Process trailing stages, i.e. the final set of non-kInOut stages; they
    720     // all have the same input buffer and no need to use any mirroring.
    721 
    722     int y = vy - num_extra_rows;
    723 
    724     for (size_t c = 0; c < input_data.size(); c++) {
    725       // Skip pixels that are not part of the actual final image area.
    726       input_rows[first_trailing_stage_][c][0] =
    727           rows.GetBuffer(stage_input_for_channel_[first_trailing_stage_][c], y,
    728                          c) +
    729           x_pixels_skip;
    730     }
    731 
    732     // Check that we are not outside of the bounds for the current rendering
    733     // rect. Not doing so might result in overwriting some rows that have been
    734     // written (or will be written) by other threads.
    735     if (y < 0 || y >= static_cast<ssize_t>(image_area_rect.ysize())) {
    736       continue;
    737     }
    738 
    739     // Avoid running pipeline stages on pixels that are outside the full image
    740     // area. As trailing stages have no borders, this is a free optimization
    741     // (and may be necessary for correctness, as some stages assume coordinates
    742     // are within bounds).
    743     ssize_t full_image_y = frame_y0 + image_area_rect.y0() + y;
    744     if (full_image_y < 0 ||
    745         full_image_y >= static_cast<ssize_t>(full_image_ysize)) {
    746       continue;
    747     }
    748 
    749     for (size_t i = first_trailing_stage_; i < stages_.size(); i++) {
    750       // Before the first_image_dim_stage_, coordinates are relative to the
    751       // current frame.
    752       size_t x0 =
    753           i < first_image_dim_stage_ ? full_image_x0 - frame_x0 : full_image_x0;
    754       size_t y =
    755           i < first_image_dim_stage_ ? full_image_y - frame_y0 : full_image_y;
    756       JXL_RETURN_IF_ERROR(stages_[i]->ProcessRow(
    757           input_rows[first_trailing_stage_], output_rows,
    758           /*xextra=*/0, full_image_x1 - full_image_x0, x0, y, thread_id));
    759     }
    760   }
    761   return true;
    762 }
    763 
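        // Renders the padding area `rect` (pixels of the full image that are
        // not covered by this frame): the stage that switches to image
        // dimensions fills a padding row, which is then run through the
        // remaining image-dimension stages.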
    764 Status LowMemoryRenderPipeline::RenderPadding(size_t thread_id, Rect rect) {
    765   if (rect.xsize() == 0) return true;
    766   size_t numc = channel_shifts_[0].size();
    767   RenderPipelineStage::RowInfo input_rows(numc, std::vector<float*>(1));
    768   RenderPipelineStage::RowInfo output_rows;
    769 
    770   for (size_t c = 0; c < numc; c++) {
    771     input_rows[c][0] = out_of_frame_data_[thread_id].Row(c);
    772   }
    773 
    774   for (size_t y = 0; y < rect.ysize(); y++) {
    775     stages_[first_image_dim_stage_ - 1]->ProcessPaddingRow(
    776         input_rows, rect.xsize(), rect.x0(), rect.y0() + y);
    777     for (size_t i = first_image_dim_stage_; i < stages_.size(); i++) {
    778       JXL_RETURN_IF_ERROR(stages_[i]->ProcessRow(
    779           input_rows, output_rows,
    780           /*xextra=*/0, rect.xsize(), rect.x0(), rect.y0() + y, thread_id));
    781     }
    782   }
    783   return true;
    784 }
    785 
    786 Status LowMemoryRenderPipeline::ProcessBuffers(size_t group_id,
    787                                                size_t thread_id) {
    788   std::vector<ImageF>& input_data =
    789       group_data_[use_group_ids_ ? group_id : thread_id];
    790 
    791   // Copy the group borders to the border storage.
    792   for (size_t c = 0; c < input_data.size(); c++) {
    793     SaveBorders(group_id, c, input_data[c]);
    794   }
    795 
    796   size_t gy = group_id / frame_dimensions_.xsize_groups;
    797   size_t gx = group_id % frame_dimensions_.xsize_groups;
    798 
    799   if (first_image_dim_stage_ != stages_.size()) {
    800     size_t group_dim = frame_dimensions_.group_dim << base_color_shift_;
    801     RectT<ssize_t> group_rect(gx * group_dim, gy * group_dim, group_dim,
    802                               group_dim);
    803     RectT<ssize_t> image_rect(0, 0, frame_dimensions_.xsize_upsampled,
    804                               frame_dimensions_.ysize_upsampled);
    805     RectT<ssize_t> full_image_rect(0, 0, full_image_xsize_, full_image_ysize_);
    806     group_rect = group_rect.Translate(frame_origin_.x0, frame_origin_.y0);
    807     image_rect = image_rect.Translate(frame_origin_.x0, frame_origin_.y0);
    808     image_rect = image_rect.Intersection(full_image_rect);
    809     group_rect = group_rect.Intersection(image_rect);
    810     size_t x0 = group_rect.x0();
    811     size_t y0 = group_rect.y0();
    812     size_t x1 = group_rect.x1();
    813     size_t y1 = group_rect.y1();
    814     JXL_DEBUG_V(6,
    815                 "Rendering padding for full image rect %s "
    816                 "outside group rect %s",
    817                 Description(full_image_rect).c_str(),
    818                 Description(group_rect).c_str());
    819 
    820     if (group_id == 0 && (image_rect.xsize() == 0 || image_rect.ysize() == 0)) {
    821       // If this frame does not intersect with the full image, we have to
    822       // initialize the whole image area with RenderPadding.
    823       JXL_RETURN_IF_ERROR(RenderPadding(
    824           thread_id, Rect(0, 0, full_image_xsize_, full_image_ysize_)));
    825     }
    826 
    827     // Render padding for groups that intersect with the full image. The case
    828     // where no groups intersect was handled above.
    829     if (group_rect.xsize() > 0 && group_rect.ysize() > 0) {
    830       if (gx == 0 && gy == 0) {
    831         JXL_RETURN_IF_ERROR(RenderPadding(thread_id, Rect(0, 0, x0, y0)));
    832       }
    833       if (gy == 0) {
    834         JXL_RETURN_IF_ERROR(RenderPadding(thread_id, Rect(x0, 0, x1 - x0, y0)));
    835       }
    836       if (gx == 0) {
    837         JXL_RETURN_IF_ERROR(RenderPadding(thread_id, Rect(0, y0, x0, y1 - y0)));
    838       }
    839       if (gx == 0 && gy + 1 == frame_dimensions_.ysize_groups) {
    840         JXL_RETURN_IF_ERROR(
    841             RenderPadding(thread_id, Rect(0, y1, x0, full_image_ysize_ - y1)));
    842       }
    843       if (gy + 1 == frame_dimensions_.ysize_groups) {
    844         JXL_RETURN_IF_ERROR(RenderPadding(
    845             thread_id, Rect(x0, y1, x1 - x0, full_image_ysize_ - y1)));
    846       }
    847       if (gy == 0 && gx + 1 == frame_dimensions_.xsize_groups) {
    848         JXL_RETURN_IF_ERROR(
    849             RenderPadding(thread_id, Rect(x1, 0, full_image_xsize_ - x1, y0)));
    850       }
    851       if (gx + 1 == frame_dimensions_.xsize_groups) {
    852         JXL_RETURN_IF_ERROR(RenderPadding(
    853             thread_id, Rect(x1, y0, full_image_xsize_ - x1, y1 - y0)));
    854       }
    855       if (gy + 1 == frame_dimensions_.ysize_groups &&
    856           gx + 1 == frame_dimensions_.xsize_groups) {
    857         JXL_RETURN_IF_ERROR(RenderPadding(
    858             thread_id,
    859             Rect(x1, y1, full_image_xsize_ - x1, full_image_ysize_ - y1)));
    860       }
    861     }
    862   }
    863 
    864   Rect ready_rects[GroupBorderAssigner::kMaxToFinalize];
    865   size_t num_ready_rects = 0;
    866   group_border_assigner_.GroupDone(group_id, group_border_.first,
    867                                    group_border_.second, ready_rects,
    868                                    &num_ready_rects);
    869   for (size_t i = 0; i < num_ready_rects; i++) {
    870     const Rect& image_max_color_channel_rect = ready_rects[i];
    871     for (size_t c = 0; c < input_data.size(); c++) {
    872       LoadBorders(group_id, c, image_max_color_channel_rect, &input_data[c]);
    873     }
    874     Rect data_max_color_channel_rect(
    875         group_data_x_border_ + image_max_color_channel_rect.x0() -
    876             gx * frame_dimensions_.group_dim,
    877         group_data_y_border_ + image_max_color_channel_rect.y0() -
    878             gy * frame_dimensions_.group_dim,
    879         image_max_color_channel_rect.xsize(),
    880         image_max_color_channel_rect.ysize());
    881     JXL_RETURN_IF_ERROR(RenderRect(thread_id, input_data,
    882                                    data_max_color_channel_rect,
    883                                    image_max_color_channel_rect));
    884   }
    885   return true;
    886 }
    887 }  // namespace jxl