low_memory_render_pipeline.cc
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "lib/jxl/render_pipeline/low_memory_render_pipeline.h"

#include <algorithm>

#include "lib/jxl/base/arch_macros.h"
#include "lib/jxl/base/status.h"
#include "lib/jxl/image_ops.h"

namespace jxl {
std::pair<size_t, size_t>
LowMemoryRenderPipeline::ColorDimensionsToChannelDimensions(
    std::pair<size_t, size_t> in, size_t c, size_t stage) const {
  std::pair<size_t, size_t> ret;
  std::pair<size_t, size_t> shift = channel_shifts_[stage][c];
  ret.first =
      ((in.first << base_color_shift_) + (1 << shift.first) - 1) >> shift.first;
  ret.second = ((in.second << base_color_shift_) + (1 << shift.second) - 1) >>
               shift.second;
  return ret;
}

std::pair<size_t, size_t> LowMemoryRenderPipeline::BorderToStore(
    size_t c) const {
  auto ret = ColorDimensionsToChannelDimensions(group_border_, c, 0);
  ret.first += padding_[0][c].first;
  ret.second += padding_[0][c].second;
  return ret;
}

void LowMemoryRenderPipeline::SaveBorders(size_t group_id, size_t c,
                                          const ImageF& in) {
  size_t gy = group_id / frame_dimensions_.xsize_groups;
  size_t gx = group_id % frame_dimensions_.xsize_groups;
  size_t hshift = channel_shifts_[0][c].first;
  size_t vshift = channel_shifts_[0][c].second;
  size_t x0 = gx * GroupInputXSize(c);
  size_t x1 = std::min((gx + 1) * GroupInputXSize(c),
                       DivCeil(frame_dimensions_.xsize_upsampled, 1 << hshift));
  size_t y0 = gy * GroupInputYSize(c);
  size_t y1 = std::min((gy + 1) * GroupInputYSize(c),
                       DivCeil(frame_dimensions_.ysize_upsampled, 1 << vshift));

  auto borders = BorderToStore(c);
  size_t borderx_write = borders.first;
  size_t bordery_write = borders.second;

  if (gy > 0) {
    Rect from(group_data_x_border_, group_data_y_border_, x1 - x0,
              bordery_write);
    Rect to(x0, (gy * 2 - 1) * bordery_write, x1 - x0, bordery_write);
    CopyImageTo(from, in, to, &borders_horizontal_[c]);
  }
  if (gy + 1 < frame_dimensions_.ysize_groups) {
    Rect from(group_data_x_border_,
              group_data_y_border_ + y1 - y0 - bordery_write, x1 - x0,
              bordery_write);
    Rect to(x0, (gy * 2) * bordery_write, x1 - x0, bordery_write);
    CopyImageTo(from, in, to, &borders_horizontal_[c]);
  }
  if (gx > 0) {
    Rect from(group_data_x_border_, group_data_y_border_, borderx_write,
              y1 - y0);
    Rect to((gx * 2 - 1) * borderx_write, y0, borderx_write, y1 - y0);
    CopyImageTo(from, in, to, &borders_vertical_[c]);
  }
  if (gx + 1 < frame_dimensions_.xsize_groups) {
    Rect from(group_data_x_border_ + x1 - x0 - borderx_write,
              group_data_y_border_, borderx_write, y1 - y0);
    Rect to((gx * 2) * borderx_write, y0, borderx_write, y1 - y0);
    CopyImageTo(from, in, to, &borders_vertical_[c]);
  }
}

void LowMemoryRenderPipeline::LoadBorders(size_t group_id, size_t c,
                                          const Rect& r, ImageF* out) {
  size_t gy = group_id / frame_dimensions_.xsize_groups;
  size_t gx = group_id % frame_dimensions_.xsize_groups;
  size_t hshift = channel_shifts_[0][c].first;
  size_t vshift = channel_shifts_[0][c].second;
  // Coordinates of the group in the image.
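  // (x1 and y1 are clamped because the rightmost and bottom rows of groups
  // may extend past the downsampled frame size.)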
  size_t x0 = gx * GroupInputXSize(c);
  size_t x1 = std::min((gx + 1) * GroupInputXSize(c),
                       DivCeil(frame_dimensions_.xsize_upsampled, 1 << hshift));
  size_t y0 = gy * GroupInputYSize(c);
  size_t y1 = std::min((gy + 1) * GroupInputYSize(c),
                       DivCeil(frame_dimensions_.ysize_upsampled, 1 << vshift));

  size_t paddingx = padding_[0][c].first;
  size_t paddingy = padding_[0][c].second;

  auto borders = BorderToStore(c);
  size_t borderx_write = borders.first;
  size_t bordery_write = borders.second;

  // Limits of the area to copy from, in image coordinates.
  JXL_DASSERT(r.x0() == 0 || (r.x0() << base_color_shift_) >= paddingx);
  size_t x0src = DivCeil(r.x0() << base_color_shift_, 1 << hshift);
  if (x0src != 0) {
    x0src -= paddingx;
  }
  // r may be such that r.x1 (namely x0() + xsize()) is within paddingx of the
  // right side of the image, so we use min() here.
  size_t x1src =
      DivCeil((r.x0() + r.xsize()) << base_color_shift_, 1 << hshift);
  x1src = std::min(x1src + paddingx,
                   DivCeil(frame_dimensions_.xsize_upsampled, 1 << hshift));

  // Similar computation for y.
  JXL_DASSERT(r.y0() == 0 || (r.y0() << base_color_shift_) >= paddingy);
  size_t y0src = DivCeil(r.y0() << base_color_shift_, 1 << vshift);
  if (y0src != 0) {
    y0src -= paddingy;
  }
  size_t y1src =
      DivCeil((r.y0() + r.ysize()) << base_color_shift_, 1 << vshift);
  y1src = std::min(y1src + paddingy,
                   DivCeil(frame_dimensions_.ysize_upsampled, 1 << vshift));

  // Copy other groups' borders from the border storage.
  if (y0src < y0) {
    JXL_DASSERT(gy > 0);
    CopyImageTo(
        Rect(x0src, (gy * 2 - 2) * bordery_write, x1src - x0src, bordery_write),
        borders_horizontal_[c],
        Rect(group_data_x_border_ + x0src - x0,
             group_data_y_border_ - bordery_write, x1src - x0src,
             bordery_write),
        out);
  }
  if (y1src > y1) {
    // When copying the bottom border we must not be in the bottom row of
    // groups.
    JXL_DASSERT(gy + 1 < frame_dimensions_.ysize_groups);
    CopyImageTo(
        Rect(x0src, (gy * 2 + 1) * bordery_write, x1src - x0src, bordery_write),
        borders_horizontal_[c],
        Rect(group_data_x_border_ + x0src - x0, group_data_y_border_ + y1 - y0,
             x1src - x0src, bordery_write),
        out);
  }
  if (x0src < x0) {
    JXL_DASSERT(gx > 0);
    CopyImageTo(
        Rect((gx * 2 - 2) * borderx_write, y0src, borderx_write, y1src - y0src),
        borders_vertical_[c],
        Rect(group_data_x_border_ - borderx_write,
             group_data_y_border_ + y0src - y0, borderx_write, y1src - y0src),
        out);
  }
  if (x1src > x1) {
    // When copying the right border we must not be in the rightmost column of
    // groups.
    JXL_DASSERT(gx + 1 < frame_dimensions_.xsize_groups);
    CopyImageTo(
        Rect((gx * 2 + 1) * borderx_write, y0src, borderx_write, y1src - y0src),
        borders_vertical_[c],
        Rect(group_data_x_border_ + x1 - x0, group_data_y_border_ + y0src - y0,
             borderx_write, y1src - y0src),
        out);
  }
}

size_t LowMemoryRenderPipeline::GroupInputXSize(size_t c) const {
  return (frame_dimensions_.group_dim << base_color_shift_) >>
         channel_shifts_[0][c].first;
}

size_t LowMemoryRenderPipeline::GroupInputYSize(size_t c) const {
  return (frame_dimensions_.group_dim << base_color_shift_) >>
         channel_shifts_[0][c].second;
}

Status LowMemoryRenderPipeline::EnsureBordersStorage() {
  const auto& shifts = channel_shifts_[0];
  if (borders_horizontal_.size() < shifts.size()) {
    borders_horizontal_.resize(shifts.size());
    borders_vertical_.resize(shifts.size());
  }
  for (size_t c = 0; c < shifts.size(); c++) {
    auto borders = BorderToStore(c);
    size_t borderx = borders.first;
    size_t bordery = borders.second;
    JXL_DASSERT(frame_dimensions_.xsize_groups > 0);
    size_t num_xborders = (frame_dimensions_.xsize_groups - 1) * 2;
    JXL_DASSERT(frame_dimensions_.ysize_groups > 0);
    size_t num_yborders = (frame_dimensions_.ysize_groups - 1) * 2;
    size_t downsampled_xsize =
        DivCeil(frame_dimensions_.xsize_upsampled_padded, 1 << shifts[c].first);
    size_t downsampled_ysize = DivCeil(frame_dimensions_.ysize_upsampled_padded,
                                       1 << shifts[c].second);
    Rect horizontal = Rect(0, 0, downsampled_xsize, bordery * num_yborders);
    if (!SameSize(horizontal, borders_horizontal_[c])) {
      JXL_ASSIGN_OR_RETURN(
          borders_horizontal_[c],
          ImageF::Create(horizontal.xsize(), horizontal.ysize()));
    }
    Rect vertical = Rect(0, 0, borderx * num_xborders, downsampled_ysize);
    if (!SameSize(vertical, borders_vertical_[c])) {
      JXL_ASSIGN_OR_RETURN(borders_vertical_[c],
                           ImageF::Create(vertical.xsize(), vertical.ysize()));
    }
  }
  return true;
}

Status LowMemoryRenderPipeline::Init() {
  group_border_ = {0, 0};
  base_color_shift_ = CeilLog2Nonzero(frame_dimensions_.xsize_upsampled_padded /
                                      frame_dimensions_.xsize_padded);

  const auto& shifts = channel_shifts_[0];

  // Ensure that each channel has enough border pixels.
  for (size_t c = 0; c < shifts.size(); c++) {
    group_border_.first =
        std::max(group_border_.first,
                 DivCeil(padding_[0][c].first << channel_shifts_[0][c].first,
                         1 << base_color_shift_));
    group_border_.second =
        std::max(group_border_.second,
                 DivCeil(padding_[0][c].second << channel_shifts_[0][c].second,
                         1 << base_color_shift_));
  }

  // Ensure that all channels have an integer number of border pixels in the
  // input.
  for (size_t c = 0; c < shifts.size(); c++) {
    if (channel_shifts_[0][c].first >= base_color_shift_) {
      group_border_.first =
          RoundUpTo(group_border_.first,
                    1 << (channel_shifts_[0][c].first - base_color_shift_));
    }
    if (channel_shifts_[0][c].second >= base_color_shift_) {
      group_border_.second =
          RoundUpTo(group_border_.second,
                    1 << (channel_shifts_[0][c].second - base_color_shift_));
    }
  }
  // Ensure that the X border on color channels is a multiple of kBlockDim or
  // the vector size (required for EPF stages). Vectors on ARM NEON are never
  // wider than 4 floats, so rounding to multiples of 4 is enough.
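  // On other architectures, 16 floats covers both kBlockDim and the widest
  // (AVX-512) vectors.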
#if JXL_ARCH_ARM
  constexpr size_t kGroupXAlign = 4;
#else
  constexpr size_t kGroupXAlign = 16;
#endif
  group_border_.first = RoundUpTo(group_border_.first, kGroupXAlign);
  // Allocate borders in group images that are just enough for storing the
  // borders to be copied in, plus any rounding to ensure alignment.
  std::pair<size_t, size_t> max_border = {0, 0};
  for (size_t c = 0; c < shifts.size(); c++) {
    max_border.first = std::max(BorderToStore(c).first, max_border.first);
    max_border.second = std::max(BorderToStore(c).second, max_border.second);
  }
  group_data_x_border_ = RoundUpTo(max_border.first, kGroupXAlign);
  group_data_y_border_ = max_border.second;

  JXL_RETURN_IF_ERROR(EnsureBordersStorage());
  group_border_assigner_.Init(frame_dimensions_);

  for (first_trailing_stage_ = stages_.size(); first_trailing_stage_ > 0;
       first_trailing_stage_--) {
    bool has_inout_c = false;
    for (size_t c = 0; c < shifts.size(); c++) {
      if (stages_[first_trailing_stage_ - 1]->GetChannelMode(c) ==
          RenderPipelineChannelMode::kInOut) {
        has_inout_c = true;
      }
    }
    if (has_inout_c) {
      break;
    }
  }

  first_image_dim_stage_ = stages_.size();
  for (size_t i = 0; i < stages_.size(); i++) {
    std::vector<std::pair<size_t, size_t>> input_sizes(shifts.size());
    for (size_t c = 0; c < shifts.size(); c++) {
      input_sizes[c] =
          std::make_pair(DivCeil(frame_dimensions_.xsize_upsampled,
                                 1 << channel_shifts_[i][c].first),
                         DivCeil(frame_dimensions_.ysize_upsampled,
                                 1 << channel_shifts_[i][c].second));
    }
    JXL_RETURN_IF_ERROR(stages_[i]->SetInputSizes(input_sizes));
    if (stages_[i]->SwitchToImageDimensions()) {
      // We don't allow kInOut after switching to image dimensions.
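      // first_trailing_stage_ is the index of the first stage that has no
      // kInOut channels, so the switch may only happen among trailing stages.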
      JXL_ASSERT(i >= first_trailing_stage_);
      first_image_dim_stage_ = i + 1;
      stages_[i]->GetImageDimensions(&full_image_xsize_, &full_image_ysize_,
                                     &frame_origin_);
      break;
    }
  }
  for (size_t i = first_image_dim_stage_; i < stages_.size(); i++) {
    if (stages_[i]->SwitchToImageDimensions()) {
      JXL_UNREACHABLE("Cannot switch to image dimensions multiple times");
    }
    std::vector<std::pair<size_t, size_t>> input_sizes(shifts.size());
    for (size_t c = 0; c < shifts.size(); c++) {
      input_sizes[c] = {full_image_xsize_, full_image_ysize_};
    }
    JXL_RETURN_IF_ERROR(stages_[i]->SetInputSizes(input_sizes));
  }

  anyc_.resize(stages_.size());
  for (size_t i = 0; i < stages_.size(); i++) {
    for (size_t c = 0; c < shifts.size(); c++) {
      if (stages_[i]->GetChannelMode(c) !=
          RenderPipelineChannelMode::kIgnored) {
        anyc_[i] = c;
      }
    }
  }

  stage_input_for_channel_ = std::vector<std::vector<int32_t>>(
      stages_.size(), std::vector<int32_t>(shifts.size()));
  for (size_t c = 0; c < shifts.size(); c++) {
    int input = -1;
    for (size_t i = 0; i < stages_.size(); i++) {
      stage_input_for_channel_[i][c] = input;
      if (stages_[i]->GetChannelMode(c) == RenderPipelineChannelMode::kInOut) {
        input = i;
      }
    }
  }

  image_rect_.resize(stages_.size());
  for (size_t i = 0; i < stages_.size(); i++) {
    size_t x1 = DivCeil(frame_dimensions_.xsize_upsampled,
                        1 << channel_shifts_[i][anyc_[i]].first);
    size_t y1 = DivCeil(frame_dimensions_.ysize_upsampled,
                        1 << channel_shifts_[i][anyc_[i]].second);
    image_rect_[i] = Rect(0, 0, x1, y1);
  }

  virtual_ypadding_for_output_.resize(stages_.size());
  xpadding_for_output_.resize(stages_.size());
  for (size_t c = 0; c < shifts.size(); c++) {
    int ypad = 0;
    int xpad = 0;
    for (size_t i = stages_.size(); i-- > 0;) {
      if (stages_[i]->GetChannelMode(c) !=
          RenderPipelineChannelMode::kIgnored) {
        virtual_ypadding_for_output_[i] =
            std::max(ypad, virtual_ypadding_for_output_[i]);
        xpadding_for_output_[i] = std::max(xpad, xpadding_for_output_[i]);
      }
      if (stages_[i]->GetChannelMode(c) == RenderPipelineChannelMode::kInOut) {
        ypad = (DivCeil(ypad, 1 << channel_shifts_[i][c].second) +
                stages_[i]->settings_.border_y)
               << channel_shifts_[i][c].second;
        xpad = DivCeil(xpad, 1 << stages_[i]->settings_.shift_x) +
               stages_[i]->settings_.border_x;
      }
    }
  }
  return true;
}

Status LowMemoryRenderPipeline::PrepareForThreadsInternal(size_t num,
                                                          bool use_group_ids) {
  const auto& shifts = channel_shifts_[0];
  use_group_ids_ = use_group_ids;
  size_t num_buffers = use_group_ids_ ? frame_dimensions_.num_groups : num;
  for (size_t t = group_data_.size(); t < num_buffers; t++) {
    group_data_.emplace_back();
    group_data_[t].resize(shifts.size());
    for (size_t c = 0; c < shifts.size(); c++) {
      JXL_ASSIGN_OR_RETURN(
          group_data_[t][c],
          ImageF::Create(GroupInputXSize(c) + group_data_x_border_ * 2,
                         GroupInputYSize(c) + group_data_y_border_ * 2));
    }
  }
  // TODO(veluca): avoid reallocating buffers if not needed.
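  // Per-thread, per-channel, per-stage ring buffers for rows flowing between
  // kInOut stages; each buffer's height is rounded up to a power of two so
  // that Rows::GetBuffer can wrap the y coordinate with a simple mask.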
  stage_data_.resize(num);
  size_t upsampling = 1u << base_color_shift_;
  size_t group_dim = frame_dimensions_.group_dim * upsampling;
  size_t padding =
      2 * group_data_x_border_ * upsampling +  // maximum size of a rect
      2 * kRenderPipelineXOffset;              // extra padding for processing
  size_t stage_buffer_xsize = group_dim + padding;
  for (size_t t = 0; t < num; t++) {
    stage_data_[t].resize(shifts.size());
    for (size_t c = 0; c < shifts.size(); c++) {
      stage_data_[t][c].resize(stages_.size());
      size_t next_y_border = 0;
      for (size_t i = stages_.size(); i-- > 0;) {
        if (stages_[i]->GetChannelMode(c) ==
            RenderPipelineChannelMode::kInOut) {
          size_t stage_buffer_ysize =
              2 * next_y_border + (1 << stages_[i]->settings_.shift_y);
          stage_buffer_ysize = 1 << CeilLog2Nonzero(stage_buffer_ysize);
          next_y_border = stages_[i]->settings_.border_y;
          JXL_ASSIGN_OR_RETURN(
              stage_data_[t][c][i],
              ImageF::Create(stage_buffer_xsize, stage_buffer_ysize));
        }
      }
    }
  }
  if (first_image_dim_stage_ != stages_.size()) {
    RectT<ssize_t> image_rect(0, 0, frame_dimensions_.xsize_upsampled,
                              frame_dimensions_.ysize_upsampled);
    RectT<ssize_t> full_image_rect(0, 0, full_image_xsize_, full_image_ysize_);
    image_rect = image_rect.Translate(frame_origin_.x0, frame_origin_.y0);
    image_rect = image_rect.Intersection(full_image_rect);
    if (image_rect.xsize() == 0 || image_rect.ysize() == 0) {
      image_rect = RectT<ssize_t>(0, 0, 0, 0);
    }
    size_t left_padding = image_rect.x0();
    size_t middle_padding = group_dim;
    size_t right_padding = full_image_xsize_ - image_rect.x1();
    size_t out_of_frame_xsize =
        padding +
        std::max(left_padding, std::max(middle_padding, right_padding));
    out_of_frame_data_.resize(num);
    for (size_t t = 0; t < num; t++) {
      JXL_ASSIGN_OR_RETURN(out_of_frame_data_[t],
                           ImageF::Create(out_of_frame_xsize, shifts.size()));
    }
  }
  return true;
}

std::vector<std::pair<ImageF*, Rect>> LowMemoryRenderPipeline::PrepareBuffers(
    size_t group_id, size_t thread_id) {
  std::vector<std::pair<ImageF*, Rect>> ret(channel_shifts_[0].size());
  const size_t gx = group_id % frame_dimensions_.xsize_groups;
  const size_t gy = group_id / frame_dimensions_.xsize_groups;
  for (size_t c = 0; c < channel_shifts_[0].size(); c++) {
    ret[c].first = &group_data_[use_group_ids_ ? group_id : thread_id][c];
    ret[c].second = Rect(group_data_x_border_, group_data_y_border_,
                         GroupInputXSize(c), GroupInputYSize(c),
                         DivCeil(frame_dimensions_.xsize_upsampled,
                                 1 << channel_shifts_[0][c].first) -
                             gx * GroupInputXSize(c) + group_data_x_border_,
                         DivCeil(frame_dimensions_.ysize_upsampled,
                                 1 << channel_shifts_[0][c].second) -
                             gy * GroupInputYSize(c) + group_data_y_border_);
  }
  return ret;
}

namespace {

JXL_INLINE int GetMirroredY(int y, ssize_t group_y0, ssize_t image_ysize) {
  if (group_y0 == 0 && (y < 0 || y + group_y0 >= image_ysize)) {
    return Mirror(y, image_ysize);
  }
  if (y + group_y0 >= image_ysize) {
    // Here we know that one mirroring step is sufficient.
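    // Reflect the global coordinate y + group_y0 across the bottom edge of
    // the image, then translate back to group-local coordinates.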
    return 2 * image_ysize - (y + group_y0) - 1 - group_y0;
  }
  return y;
}

JXL_INLINE void ApplyXMirroring(float* row, ssize_t borderx, ssize_t group_x0,
                                ssize_t group_xsize, ssize_t image_xsize) {
  if (image_xsize <= borderx) {
    if (group_x0 == 0) {
      for (ssize_t ix = 0; ix < borderx; ix++) {
        row[kRenderPipelineXOffset - ix - 1] =
            row[kRenderPipelineXOffset + Mirror(-ix - 1, image_xsize)];
      }
    }
    if (group_xsize + borderx + group_x0 >= image_xsize) {
      for (ssize_t ix = 0; ix < borderx; ix++) {
        row[kRenderPipelineXOffset + image_xsize + ix - group_x0] =
            row[kRenderPipelineXOffset + Mirror(image_xsize + ix, image_xsize) -
                group_x0];
      }
    }
  } else {
    // Here we know that one mirroring step is sufficient.
    if (group_x0 == 0) {
      for (ssize_t ix = 0; ix < borderx; ix++) {
        row[kRenderPipelineXOffset - ix - 1] = row[kRenderPipelineXOffset + ix];
      }
    }
    if (group_xsize + borderx + group_x0 >= image_xsize) {
      for (ssize_t ix = 0; ix < borderx; ix++) {
        row[kRenderPipelineXOffset + image_xsize - group_x0 + ix] =
            row[kRenderPipelineXOffset + image_xsize - group_x0 - ix - 1];
      }
    }
  }
}

// Information about where the input data and the *output* of each stage are
// stored.
class Rows {
 public:
  Rows(const std::vector<std::unique_ptr<RenderPipelineStage>>& stages,
       const Rect data_max_color_channel_rect, int group_data_x_border,
       int group_data_y_border,
       const std::vector<std::pair<size_t, size_t>>& group_data_shift,
       size_t base_color_shift, std::vector<std::vector<ImageF>>& thread_data,
       std::vector<ImageF>& input_data) {
    size_t num_stages = stages.size();
    size_t num_channels = input_data.size();

    JXL_ASSERT(thread_data.size() == num_channels);
    JXL_ASSERT(group_data_shift.size() == num_channels);

#if JXL_ENABLE_ASSERT
    for (const auto& td : thread_data) {
      JXL_ASSERT(td.size() == num_stages);
    }
#endif

    rows_.resize(num_stages + 1, std::vector<RowInfo>(num_channels));

    for (size_t i = 0; i < num_stages; i++) {
      for (size_t c = 0; c < input_data.size(); c++) {
        if (stages[i]->GetChannelMode(c) == RenderPipelineChannelMode::kInOut) {
          rows_[i + 1][c].ymod_minus_1 = thread_data[c][i].ysize() - 1;
          rows_[i + 1][c].base_ptr = thread_data[c][i].Row(0);
          rows_[i + 1][c].stride = thread_data[c][i].PixelsPerRow();
        }
      }
    }

    for (size_t c = 0; c < input_data.size(); c++) {
      auto channel_group_data_rect =
          data_max_color_channel_rect.As<ssize_t>()
              .Translate(-group_data_x_border, -group_data_y_border)
              .ShiftLeft(base_color_shift)
              .CeilShiftRight(group_data_shift[c])
              .Translate(group_data_x_border -
                             static_cast<ssize_t>(kRenderPipelineXOffset),
                         group_data_y_border);
      rows_[0][c].base_ptr = channel_group_data_rect.Row(&input_data[c], 0);
      rows_[0][c].stride = input_data[c].PixelsPerRow();
      rows_[0][c].ymod_minus_1 = -1;
    }
  }

  // Stage -1 refers to the input data; all other values must be nonnegative
  // and refer to the data for the output of that stage.
  JXL_INLINE float* GetBuffer(int stage, int y, size_t c) const {
    JXL_DASSERT(stage >= -1);
    const RowInfo& info = rows_[stage + 1][c];
    return info.base_ptr +
           static_cast<ssize_t>(info.stride) * (y & info.ymod_minus_1);
  }

 private:
  struct RowInfo {
    // Pointer to beginning of the first row.
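    // (For the input data this already includes the rect offset; for stage
    // output buffers it is simply Row(0).)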
    float* base_ptr;
    // Modulo value for the y axis minus 1 (ymod is guaranteed to be a power
    // of 2, which allows efficient mod computation by masking).
    int ymod_minus_1;
    // Number of floats per row.
    size_t stride;
  };
  std::vector<std::vector<RowInfo>> rows_;
};

}  // namespace

Status LowMemoryRenderPipeline::RenderRect(size_t thread_id,
                                           std::vector<ImageF>& input_data,
                                           Rect data_max_color_channel_rect,
                                           Rect image_max_color_channel_rect) {
  // For each stage, the rect corresponding to the image area currently being
  // processed, in the coordinates of that stage (i.e. with the scaling factor
  // that that stage has).
  std::vector<Rect> group_rect;
  group_rect.resize(stages_.size());
  Rect image_area_rect =
      image_max_color_channel_rect.ShiftLeft(base_color_shift_)
          .Crop(frame_dimensions_.xsize_upsampled,
                frame_dimensions_.ysize_upsampled);
  for (size_t i = 0; i < stages_.size(); i++) {
    group_rect[i] =
        image_area_rect.CeilShiftRight(channel_shifts_[i][anyc_[i]]);
  }

  ssize_t frame_x0 =
      first_image_dim_stage_ == stages_.size() ? 0 : frame_origin_.x0;
  ssize_t frame_y0 =
      first_image_dim_stage_ == stages_.size() ? 0 : frame_origin_.y0;
  size_t full_image_xsize = first_image_dim_stage_ == stages_.size()
                                ? frame_dimensions_.xsize_upsampled
                                : full_image_xsize_;
  size_t full_image_ysize = first_image_dim_stage_ == stages_.size()
                                ? frame_dimensions_.ysize_upsampled
                                : full_image_ysize_;

  // Compute actual x-axis bounds for the current image area in the context of
  // the full image this frame is part of. As the left boundary may be
  // negative, we also create the x_pixels_skip value, defined as follows:
  // - both x_pixels_skip and full_image_x0 are >= 0, and at least one is 0;
  // - full_image_x0 - x_pixels_skip is the position of the current frame area
  //   in the full image.
  ssize_t full_image_x0 = frame_x0 + image_area_rect.x0();
  ssize_t x_pixels_skip = 0;
  if (full_image_x0 < 0) {
    x_pixels_skip = -full_image_x0;
    full_image_x0 = 0;
  }
  ssize_t full_image_x1 = frame_x0 + image_area_rect.x1();
  full_image_x1 = std::min<ssize_t>(full_image_x1, full_image_xsize);

  // If the current image area is entirely outside of the visible image, there
  // is no point in proceeding. Note: this uses the assumption that if there is
  // a stage with observable effects (i.e. a kInput stage), it only appears
  // after the stage that switches to image dimensions.
  if (full_image_x1 <= full_image_x0) return true;

  // Data structures to hold information about input/output rows and their
  // buffers.
  Rows rows(stages_, data_max_color_channel_rect, group_data_x_border_,
            group_data_y_border_, channel_shifts_[0], base_color_shift_,
            stage_data_[thread_id], input_data);

  std::vector<RenderPipelineStage::RowInfo> input_rows(first_trailing_stage_ +
                                                       1);
  for (size_t i = 0; i < first_trailing_stage_; i++) {
    input_rows[i].resize(input_data.size());
  }
  input_rows[first_trailing_stage_].resize(input_data.size(),
                                           std::vector<float*>(1));

  // Maximum possible shift is 3.
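  // Hence a kInOut stage can emit at most 1 << 3 = 8 output rows per input
  // row, which bounds the size of output_rows.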
  RenderPipelineStage::RowInfo output_rows(input_data.size(),
                                           std::vector<float*>(8));

  // Fills in input_rows and output_rows for a given y value (relative to the
  // start of the group, measured in actual pixels at the appropriate vertical
  // scaling factor) and a given stage, applying mirroring if necessary. This
  // function is somewhat inefficient for trailing kInOut or kInput stages,
  // where just filling the input row once ought to be sufficient.
  auto prepare_io_rows = [&](int y, size_t i) {
    ssize_t bordery = stages_[i]->settings_.border_y;
    size_t shifty = stages_[i]->settings_.shift_y;
    auto make_row = [&](size_t c, ssize_t iy) {
      size_t mirrored_y = GetMirroredY(y + iy - bordery, group_rect[i].y0(),
                                       image_rect_[i].ysize());
      input_rows[i][c][iy] =
          rows.GetBuffer(stage_input_for_channel_[i][c], mirrored_y, c);
      ApplyXMirroring(input_rows[i][c][iy], stages_[i]->settings_.border_x,
                      group_rect[i].x0(), group_rect[i].xsize(),
                      image_rect_[i].xsize());
    };
    for (size_t c = 0; c < input_data.size(); c++) {
      RenderPipelineChannelMode mode = stages_[i]->GetChannelMode(c);
      if (mode == RenderPipelineChannelMode::kIgnored) {
        continue;
      }
      // If we already have rows from a previous iteration, we can just shift
      // the rows by 1 and insert the new one.
      if (input_rows[i][c].size() == 2 * static_cast<size_t>(bordery) + 1) {
        for (ssize_t iy = 0; iy < 2 * bordery; iy++) {
          input_rows[i][c][iy] = input_rows[i][c][iy + 1];
        }
        make_row(c, bordery * 2);
      } else {
        input_rows[i][c].resize(2 * bordery + 1);
        for (ssize_t iy = 0; iy < 2 * bordery + 1; iy++) {
          make_row(c, iy);
        }
      }

      // If necessary, get the output buffers.
      if (mode == RenderPipelineChannelMode::kInOut) {
        for (size_t iy = 0; iy < (1u << shifty); iy++) {
          output_rows[c][iy] = rows.GetBuffer(i, y * (1 << shifty) + iy, c);
        }
      }
    }
  };

  // We pretend that every stage has a vertical shift of 0, i.e. it is as tall
  // as the final image.
  // We call each such row a "virtual" row, because it may or may not
  // correspond to an actual row of the current processing stage; actual
  // processing happens when vy % (1 << vshift) == 0.

  int num_extra_rows = *std::max_element(virtual_ypadding_for_output_.begin(),
                                         virtual_ypadding_for_output_.end());

  for (int vy = -num_extra_rows;
       vy < static_cast<int>(image_area_rect.ysize()) + num_extra_rows; vy++) {
    for (size_t i = 0; i < first_trailing_stage_; i++) {
      int stage_vy = vy - num_extra_rows + virtual_ypadding_for_output_[i];

      if (stage_vy % (1 << channel_shifts_[i][anyc_[i]].second) != 0) {
        continue;
      }

      if (stage_vy < -virtual_ypadding_for_output_[i]) {
        continue;
      }

      int y = stage_vy >> channel_shifts_[i][anyc_[i]].second;

      ssize_t image_y = static_cast<ssize_t>(group_rect[i].y0()) + y;
      // Do not produce rows in out-of-bounds areas.
      if (image_y < 0 ||
          image_y >= static_cast<ssize_t>(image_rect_[i].ysize())) {
        continue;
      }

      // Get the input/output rows and potentially apply mirroring to the
      // input.
      prepare_io_rows(y, i);

      // Produce output rows.
      JXL_RETURN_IF_ERROR(stages_[i]->ProcessRow(
          input_rows[i], output_rows, xpadding_for_output_[i],
          group_rect[i].xsize(), group_rect[i].x0(), image_y, thread_id));
    }

    // Process trailing stages, i.e. the final set of non-kInOut stages; they
    // all have the same input buffer and no need to use any mirroring.

    int y = vy - num_extra_rows;

    for (size_t c = 0; c < input_data.size(); c++) {
      // Skip pixels that are not part of the actual final image area.
      input_rows[first_trailing_stage_][c][0] =
          rows.GetBuffer(stage_input_for_channel_[first_trailing_stage_][c], y,
                         c) +
          x_pixels_skip;
    }

    // Check that we are not outside of the bounds for the current rendering
    // rect. Not doing so might result in overwriting some rows that have been
    // written (or will be written) by other threads.
    if (y < 0 || y >= static_cast<ssize_t>(image_area_rect.ysize())) {
      continue;
    }

    // Avoid running pipeline stages on pixels that are outside the full image
    // area. As trailing stages have no borders, this is a free optimization
    // (and may be necessary for correctness, as some stages assume coordinates
    // are within bounds).
    ssize_t full_image_y = frame_y0 + image_area_rect.y0() + y;
    if (full_image_y < 0 ||
        full_image_y >= static_cast<ssize_t>(full_image_ysize)) {
      continue;
    }

    for (size_t i = first_trailing_stage_; i < stages_.size(); i++) {
      // Before the first_image_dim_stage_, coordinates are relative to the
      // current frame.
      size_t x0 =
          i < first_image_dim_stage_ ? full_image_x0 - frame_x0 : full_image_x0;
      size_t y =
          i < first_image_dim_stage_ ? full_image_y - frame_y0 : full_image_y;
      JXL_RETURN_IF_ERROR(stages_[i]->ProcessRow(
          input_rows[first_trailing_stage_], output_rows,
          /*xextra=*/0, full_image_x1 - full_image_x0, x0, y, thread_id));
    }
  }
  return true;
}

Status LowMemoryRenderPipeline::RenderPadding(size_t thread_id, Rect rect) {
  if (rect.xsize() == 0) return true;
  size_t numc = channel_shifts_[0].size();
  RenderPipelineStage::RowInfo input_rows(numc, std::vector<float*>(1));
  RenderPipelineStage::RowInfo output_rows;

  for (size_t c = 0; c < numc; c++) {
    input_rows[c][0] = out_of_frame_data_[thread_id].Row(c);
  }

  for (size_t y = 0; y < rect.ysize(); y++) {
    stages_[first_image_dim_stage_ - 1]->ProcessPaddingRow(
        input_rows, rect.xsize(), rect.x0(), rect.y0() + y);
    for (size_t i = first_image_dim_stage_; i < stages_.size(); i++) {
      JXL_RETURN_IF_ERROR(stages_[i]->ProcessRow(
          input_rows, output_rows,
          /*xextra=*/0, rect.xsize(), rect.x0(), rect.y0() + y, thread_id));
    }
  }
  return true;
}

Status LowMemoryRenderPipeline::ProcessBuffers(size_t group_id,
                                               size_t thread_id) {
  std::vector<ImageF>& input_data =
      group_data_[use_group_ids_ ? group_id : thread_id];

  // Copy the group borders to the border storage.
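  // This must happen before rendering: these borders may be consumed by
  // whichever group (possibly this one) finalizes the adjacent rects.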
  for (size_t c = 0; c < input_data.size(); c++) {
    SaveBorders(group_id, c, input_data[c]);
  }

  size_t gy = group_id / frame_dimensions_.xsize_groups;
  size_t gx = group_id % frame_dimensions_.xsize_groups;

  if (first_image_dim_stage_ != stages_.size()) {
    size_t group_dim = frame_dimensions_.group_dim << base_color_shift_;
    RectT<ssize_t> group_rect(gx * group_dim, gy * group_dim, group_dim,
                              group_dim);
    RectT<ssize_t> image_rect(0, 0, frame_dimensions_.xsize_upsampled,
                              frame_dimensions_.ysize_upsampled);
    RectT<ssize_t> full_image_rect(0, 0, full_image_xsize_, full_image_ysize_);
    group_rect = group_rect.Translate(frame_origin_.x0, frame_origin_.y0);
    image_rect = image_rect.Translate(frame_origin_.x0, frame_origin_.y0);
    image_rect = image_rect.Intersection(full_image_rect);
    group_rect = group_rect.Intersection(image_rect);
    size_t x0 = group_rect.x0();
    size_t y0 = group_rect.y0();
    size_t x1 = group_rect.x1();
    size_t y1 = group_rect.y1();
    JXL_DEBUG_V(6,
                "Rendering padding for full image rect %s "
                "outside group rect %s",
                Description(full_image_rect).c_str(),
                Description(group_rect).c_str());

    if (group_id == 0 && (image_rect.xsize() == 0 || image_rect.ysize() == 0)) {
      // If this frame does not intersect with the full image, we have to
      // initialize the whole image area with RenderPadding.
      JXL_RETURN_IF_ERROR(RenderPadding(
          thread_id, Rect(0, 0, full_image_xsize_, full_image_ysize_)));
    }

    // Render padding for groups that intersect with the full image. The case
    // where no groups intersect was handled above.
    if (group_rect.xsize() > 0 && group_rect.ysize() > 0) {
      if (gx == 0 && gy == 0) {
        JXL_RETURN_IF_ERROR(RenderPadding(thread_id, Rect(0, 0, x0, y0)));
      }
      if (gy == 0) {
        JXL_RETURN_IF_ERROR(RenderPadding(thread_id, Rect(x0, 0, x1 - x0, y0)));
      }
      if (gx == 0) {
        JXL_RETURN_IF_ERROR(RenderPadding(thread_id, Rect(0, y0, x0, y1 - y0)));
      }
      if (gx == 0 && gy + 1 == frame_dimensions_.ysize_groups) {
        JXL_RETURN_IF_ERROR(
            RenderPadding(thread_id, Rect(0, y1, x0, full_image_ysize_ - y1)));
      }
      if (gy + 1 == frame_dimensions_.ysize_groups) {
        JXL_RETURN_IF_ERROR(RenderPadding(
            thread_id, Rect(x0, y1, x1 - x0, full_image_ysize_ - y1)));
      }
      if (gy == 0 && gx + 1 == frame_dimensions_.xsize_groups) {
        JXL_RETURN_IF_ERROR(
            RenderPadding(thread_id, Rect(x1, 0, full_image_xsize_ - x1, y0)));
      }
      if (gx + 1 == frame_dimensions_.xsize_groups) {
        JXL_RETURN_IF_ERROR(RenderPadding(
            thread_id, Rect(x1, y0, full_image_xsize_ - x1, y1 - y0)));
      }
      if (gy + 1 == frame_dimensions_.ysize_groups &&
          gx + 1 == frame_dimensions_.xsize_groups) {
        JXL_RETURN_IF_ERROR(RenderPadding(
            thread_id,
            Rect(x1, y1, full_image_xsize_ - x1, full_image_ysize_ - y1)));
      }
    }
  }

  Rect ready_rects[GroupBorderAssigner::kMaxToFinalize];
  size_t num_ready_rects = 0;
  group_border_assigner_.GroupDone(group_id, group_border_.first,
                                   group_border_.second, ready_rects,
                                   &num_ready_rects);
  for (size_t i = 0; i < num_ready_rects; i++) {
    const Rect& image_max_color_channel_rect = ready_rects[i];
    for (size_t c = 0; c < input_data.size(); c++) {
      LoadBorders(group_id, c, image_max_color_channel_rect, &input_data[c]);
    }
    Rect data_max_color_channel_rect(
        group_data_x_border_ + image_max_color_channel_rect.x0() -
            gx * frame_dimensions_.group_dim,
        group_data_y_border_ + image_max_color_channel_rect.y0() -
            gy * frame_dimensions_.group_dim,
        image_max_color_channel_rect.xsize(),
        image_max_color_channel_rect.ysize());
    JXL_RETURN_IF_ERROR(RenderRect(thread_id, input_data,
                                   data_max_color_channel_rect,
                                   image_max_color_channel_rect));
  }
  return true;
}
}  // namespace jxl