downsample.cc (12314B)
1 // Copyright (c) the JPEG XL Project Authors. All rights reserved. 2 // 3 // Use of this source code is governed by a BSD-style 4 // license that can be found in the LICENSE file. 5 6 #include "lib/jpegli/downsample.h" 7 8 #undef HWY_TARGET_INCLUDE 9 #define HWY_TARGET_INCLUDE "lib/jpegli/downsample.cc" 10 #include <hwy/foreach_target.h> 11 #include <hwy/highway.h> 12 13 #include "lib/jpegli/encode_internal.h" 14 #include "lib/jpegli/error.h" 15 16 HWY_BEFORE_NAMESPACE(); 17 namespace jpegli { 18 namespace HWY_NAMESPACE { 19 20 // These templates are not found via ADL. 21 using hwy::HWY_NAMESPACE::Add; 22 using hwy::HWY_NAMESPACE::Mul; 23 using hwy::HWY_NAMESPACE::Vec; 24 25 using D = HWY_CAPPED(float, 8); 26 constexpr D d; 27 28 void DownsampleRow2x1(const float* row_in, size_t len, float* row_out) { 29 const size_t N = Lanes(d); 30 const size_t len_out = len / 2; 31 const auto mul = Set(d, 0.5f); 32 Vec<D> v0, v1; // NOLINT 33 for (size_t x = 0; x < len_out; x += N) { 34 LoadInterleaved2(d, row_in + 2 * x, v0, v1); 35 Store(Mul(mul, Add(v0, v1)), d, row_out + x); 36 } 37 } 38 39 void DownsampleRow3x1(const float* row_in, size_t len, float* row_out) { 40 const size_t N = Lanes(d); 41 const size_t len_out = len / 3; 42 const auto mul = Set(d, 1.0f / 3); 43 Vec<D> v0, v1, v2; // NOLINT 44 for (size_t x = 0; x < len_out; x += N) { 45 LoadInterleaved3(d, row_in + 3 * x, v0, v1, v2); 46 Store(Mul(mul, Add(Add(v0, v1), v2)), d, row_out + x); 47 } 48 } 49 50 void DownsampleRow4x1(const float* row_in, size_t len, float* row_out) { 51 const size_t N = Lanes(d); 52 const size_t len_out = len / 4; 53 const auto mul = Set(d, 0.25f); 54 Vec<D> v0, v1, v2, v3; // NOLINT 55 for (size_t x = 0; x < len_out; x += N) { 56 LoadInterleaved4(d, row_in + 4 * x, v0, v1, v2, v3); 57 Store(Mul(mul, Add(Add(v0, v1), Add(v2, v3))), d, row_out + x); 58 } 59 } 60 61 void Downsample2x1(float* rows_in[MAX_SAMP_FACTOR], size_t len, 62 float* row_out) { 63 DownsampleRow2x1(rows_in[0], len, row_out); 64 } 65 66 void Downsample3x1(float* rows_in[MAX_SAMP_FACTOR], size_t len, 67 float* row_out) { 68 DownsampleRow3x1(rows_in[0], len, row_out); 69 } 70 71 void Downsample4x1(float* rows_in[MAX_SAMP_FACTOR], size_t len, 72 float* row_out) { 73 DownsampleRow4x1(rows_in[0], len, row_out); 74 } 75 76 void Downsample1x2(float* rows_in[MAX_SAMP_FACTOR], size_t len, 77 float* row_out) { 78 const size_t N = Lanes(d); 79 const auto mul = Set(d, 0.5f); 80 float* row0 = rows_in[0]; 81 float* row1 = rows_in[1]; 82 for (size_t x = 0; x < len; x += N) { 83 Store(Mul(mul, Add(Load(d, row0 + x), Load(d, row1 + x))), d, row_out + x); 84 } 85 } 86 87 void Downsample2x2(float* rows_in[MAX_SAMP_FACTOR], size_t len, 88 float* row_out) { 89 const size_t N = Lanes(d); 90 const size_t len_out = len / 2; 91 const auto mul = Set(d, 0.25f); 92 float* row0 = rows_in[0]; 93 float* row1 = rows_in[1]; 94 Vec<D> v0, v1, v2, v3; // NOLINT 95 for (size_t x = 0; x < len_out; x += N) { 96 LoadInterleaved2(d, row0 + 2 * x, v0, v1); 97 LoadInterleaved2(d, row1 + 2 * x, v2, v3); 98 Store(Mul(mul, Add(Add(v0, v1), Add(v2, v3))), d, row_out + x); 99 } 100 } 101 102 void Downsample3x2(float* rows_in[MAX_SAMP_FACTOR], size_t len, 103 float* row_out) { 104 DownsampleRow3x1(rows_in[0], len, rows_in[0]); 105 DownsampleRow3x1(rows_in[1], len, rows_in[1]); 106 Downsample1x2(rows_in, len / 3, row_out); 107 } 108 109 void Downsample4x2(float* rows_in[MAX_SAMP_FACTOR], size_t len, 110 float* row_out) { 111 DownsampleRow4x1(rows_in[0], len, rows_in[0]); 112 DownsampleRow4x1(rows_in[1], len, rows_in[1]); 113 Downsample1x2(rows_in, len / 4, row_out); 114 } 115 116 void Downsample1x3(float* rows_in[MAX_SAMP_FACTOR], size_t len, 117 float* row_out) { 118 const size_t N = Lanes(d); 119 const auto mul = Set(d, 1.0f / 3); 120 float* row0 = rows_in[0]; 121 float* row1 = rows_in[1]; 122 float* row2 = rows_in[2]; 123 for (size_t x = 0; x < len; x += N) { 124 const auto in0 = Load(d, row0 + x); 125 const auto in1 = Load(d, row1 + x); 126 const auto in2 = Load(d, row2 + x); 127 Store(Mul(mul, Add(Add(in0, in1), in2)), d, row_out + x); 128 } 129 } 130 131 void Downsample2x3(float* rows_in[MAX_SAMP_FACTOR], size_t len, 132 float* row_out) { 133 DownsampleRow2x1(rows_in[0], len, rows_in[0]); 134 DownsampleRow2x1(rows_in[1], len, rows_in[1]); 135 DownsampleRow2x1(rows_in[2], len, rows_in[2]); 136 Downsample1x3(rows_in, len / 2, row_out); 137 } 138 139 void Downsample3x3(float* rows_in[MAX_SAMP_FACTOR], size_t len, 140 float* row_out) { 141 DownsampleRow3x1(rows_in[0], len, rows_in[0]); 142 DownsampleRow3x1(rows_in[1], len, rows_in[1]); 143 DownsampleRow3x1(rows_in[2], len, rows_in[2]); 144 Downsample1x3(rows_in, len / 3, row_out); 145 } 146 147 void Downsample4x3(float* rows_in[MAX_SAMP_FACTOR], size_t len, 148 float* row_out) { 149 DownsampleRow4x1(rows_in[0], len, rows_in[0]); 150 DownsampleRow4x1(rows_in[1], len, rows_in[1]); 151 DownsampleRow4x1(rows_in[2], len, rows_in[2]); 152 Downsample1x3(rows_in, len / 4, row_out); 153 } 154 155 void Downsample1x4(float* rows_in[MAX_SAMP_FACTOR], size_t len, 156 float* row_out) { 157 const size_t N = Lanes(d); 158 const auto mul = Set(d, 0.25f); 159 float* row0 = rows_in[0]; 160 float* row1 = rows_in[1]; 161 float* row2 = rows_in[2]; 162 float* row3 = rows_in[3]; 163 for (size_t x = 0; x < len; x += N) { 164 const auto in0 = Load(d, row0 + x); 165 const auto in1 = Load(d, row1 + x); 166 const auto in2 = Load(d, row2 + x); 167 const auto in3 = Load(d, row3 + x); 168 Store(Mul(mul, Add(Add(in0, in1), Add(in2, in3))), d, row_out + x); 169 } 170 } 171 172 void Downsample2x4(float* rows_in[MAX_SAMP_FACTOR], size_t len, 173 float* row_out) { 174 DownsampleRow2x1(rows_in[0], len, rows_in[0]); 175 DownsampleRow2x1(rows_in[1], len, rows_in[1]); 176 DownsampleRow2x1(rows_in[2], len, rows_in[2]); 177 DownsampleRow2x1(rows_in[3], len, rows_in[3]); 178 Downsample1x4(rows_in, len / 2, row_out); 179 } 180 181 void Downsample3x4(float* rows_in[MAX_SAMP_FACTOR], size_t len, 182 float* row_out) { 183 DownsampleRow3x1(rows_in[0], len, rows_in[0]); 184 DownsampleRow3x1(rows_in[1], len, rows_in[1]); 185 DownsampleRow3x1(rows_in[2], len, rows_in[2]); 186 DownsampleRow3x1(rows_in[3], len, rows_in[3]); 187 Downsample1x4(rows_in, len / 3, row_out); 188 } 189 190 void Downsample4x4(float* rows_in[MAX_SAMP_FACTOR], size_t len, 191 float* row_out) { 192 DownsampleRow4x1(rows_in[0], len, rows_in[0]); 193 DownsampleRow4x1(rows_in[1], len, rows_in[1]); 194 DownsampleRow4x1(rows_in[2], len, rows_in[2]); 195 DownsampleRow4x1(rows_in[3], len, rows_in[3]); 196 Downsample1x4(rows_in, len / 4, row_out); 197 } 198 199 // NOLINTNEXTLINE(google-readability-namespace-comments) 200 } // namespace HWY_NAMESPACE 201 } // namespace jpegli 202 HWY_AFTER_NAMESPACE(); 203 204 #if HWY_ONCE 205 namespace jpegli { 206 207 HWY_EXPORT(Downsample1x2); 208 HWY_EXPORT(Downsample1x3); 209 HWY_EXPORT(Downsample1x4); 210 HWY_EXPORT(Downsample2x1); 211 HWY_EXPORT(Downsample2x2); 212 HWY_EXPORT(Downsample2x3); 213 HWY_EXPORT(Downsample2x4); 214 HWY_EXPORT(Downsample3x1); 215 HWY_EXPORT(Downsample3x2); 216 HWY_EXPORT(Downsample3x3); 217 HWY_EXPORT(Downsample3x4); 218 HWY_EXPORT(Downsample4x1); 219 HWY_EXPORT(Downsample4x2); 220 HWY_EXPORT(Downsample4x3); 221 HWY_EXPORT(Downsample4x4); 222 223 void NullDownsample(float* rows_in[MAX_SAMP_FACTOR], size_t len, 224 float* row_out) {} 225 226 void ChooseDownsampleMethods(j_compress_ptr cinfo) { 227 jpeg_comp_master* m = cinfo->master; 228 for (int c = 0; c < cinfo->num_components; c++) { 229 m->downsample_method[c] = nullptr; 230 jpeg_component_info* comp = &cinfo->comp_info[c]; 231 const int h_factor = cinfo->max_h_samp_factor / comp->h_samp_factor; 232 const int v_factor = cinfo->max_v_samp_factor / comp->v_samp_factor; 233 if (v_factor == 1) { 234 if (h_factor == 1) { 235 m->downsample_method[c] = NullDownsample; 236 } else if (h_factor == 2) { 237 m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample2x1); 238 } else if (h_factor == 3) { 239 m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample3x1); 240 } else if (h_factor == 4) { 241 m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample4x1); 242 } 243 } else if (v_factor == 2) { 244 if (h_factor == 1) { 245 m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample1x2); 246 } else if (h_factor == 2) { 247 m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample2x2); 248 } else if (h_factor == 3) { 249 m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample3x2); 250 } else if (h_factor == 4) { 251 m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample4x2); 252 } 253 } else if (v_factor == 3) { 254 if (h_factor == 1) { 255 m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample1x2); 256 } else if (h_factor == 2) { 257 m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample2x2); 258 } else if (h_factor == 3) { 259 m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample3x2); 260 } else if (h_factor == 4) { 261 m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample4x2); 262 } 263 } else if (v_factor == 4) { 264 if (h_factor == 1) { 265 m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample1x4); 266 } else if (h_factor == 2) { 267 m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample2x4); 268 } else if (h_factor == 3) { 269 m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample3x4); 270 } else if (h_factor == 4) { 271 m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample4x4); 272 } 273 } 274 if (m->downsample_method[c] == nullptr) { 275 JPEGLI_ERROR("Unsupported downsampling ratio %dx%d", h_factor, v_factor); 276 } 277 } 278 } 279 280 void DownsampleInputBuffer(j_compress_ptr cinfo) { 281 if (cinfo->max_h_samp_factor == 1 && cinfo->max_v_samp_factor == 1) { 282 return; 283 } 284 jpeg_comp_master* m = cinfo->master; 285 const size_t iMCU_height = DCTSIZE * cinfo->max_v_samp_factor; 286 const size_t y0 = m->next_iMCU_row * iMCU_height; 287 const size_t y1 = y0 + iMCU_height; 288 const size_t xsize_padded = m->xsize_blocks * DCTSIZE; 289 for (int c = 0; c < cinfo->num_components; c++) { 290 jpeg_component_info* comp = &cinfo->comp_info[c]; 291 const int h_factor = cinfo->max_h_samp_factor / comp->h_samp_factor; 292 const int v_factor = cinfo->max_v_samp_factor / comp->v_samp_factor; 293 if (h_factor == 1 && v_factor == 1) { 294 continue; 295 } 296 auto& input = *m->smooth_input[c]; 297 auto& output = *m->raw_data[c]; 298 const size_t yout0 = y0 / v_factor; 299 float* rows_in[MAX_SAMP_FACTOR]; 300 for (size_t yin = y0, yout = yout0; yin < y1; yin += v_factor, ++yout) { 301 for (int iy = 0; iy < v_factor; ++iy) { 302 rows_in[iy] = input.Row(yin + iy); 303 } 304 float* row_out = output.Row(yout); 305 (*m->downsample_method[c])(rows_in, xsize_padded, row_out); 306 } 307 } 308 } 309 310 void ApplyInputSmoothing(j_compress_ptr cinfo) { 311 if (!cinfo->smoothing_factor) { 312 return; 313 } 314 jpeg_comp_master* m = cinfo->master; 315 const float kW1 = cinfo->smoothing_factor / 1024.0; 316 const float kW0 = 1.0f - 8.0f * kW1; 317 const size_t iMCU_height = DCTSIZE * cinfo->max_v_samp_factor; 318 const ssize_t y0 = m->next_iMCU_row * iMCU_height; 319 const ssize_t y1 = y0 + iMCU_height; 320 const ssize_t xsize_padded = m->xsize_blocks * DCTSIZE; 321 for (int c = 0; c < cinfo->num_components; c++) { 322 auto& input = m->input_buffer[c]; 323 auto& output = *m->smooth_input[c]; 324 if (m->next_iMCU_row == 0) { 325 input.CopyRow(-1, 0, 1); 326 } 327 if (m->next_iMCU_row + 1 == cinfo->total_iMCU_rows) { 328 size_t last_row = m->ysize_blocks * DCTSIZE - 1; 329 input.CopyRow(last_row + 1, last_row, 1); 330 } 331 // TODO(szabadka) SIMDify this. 332 for (ssize_t y = y0; y < y1; ++y) { 333 const float* row_t = input.Row(y - 1); 334 const float* row_m = input.Row(y); 335 const float* row_b = input.Row(y + 1); 336 float* row_out = output.Row(y); 337 for (ssize_t x = 0; x < xsize_padded; ++x) { 338 float val_tl = row_t[x - 1]; 339 float val_tm = row_t[x]; 340 float val_tr = row_t[x + 1]; 341 float val_ml = row_m[x - 1]; 342 float val_mm = row_m[x]; 343 float val_mr = row_m[x + 1]; 344 float val_bl = row_b[x - 1]; 345 float val_bm = row_b[x]; 346 float val_br = row_b[x + 1]; 347 float val1 = (val_tl + val_tm + val_tr + val_ml + val_mr + val_bl + 348 val_bm + val_br); 349 row_out[x] = val_mm * kW0 + val1 * kW1; 350 } 351 } 352 } 353 } 354 355 } // namespace jpegli 356 #endif // HWY_ONCE