libjxl

FORK: libjxl patches used on blog
git clone https://git.neptards.moe/blog/libjxl.git
Log | Files | Refs | Submodules | README | LICENSE

downsample.cc (12314B)


      1 // Copyright (c) the JPEG XL Project Authors. All rights reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style
      4 // license that can be found in the LICENSE file.
      5 
      6 #include "lib/jpegli/downsample.h"
      7 
      8 #undef HWY_TARGET_INCLUDE
      9 #define HWY_TARGET_INCLUDE "lib/jpegli/downsample.cc"
     10 #include <hwy/foreach_target.h>
     11 #include <hwy/highway.h>
     12 
     13 #include "lib/jpegli/encode_internal.h"
     14 #include "lib/jpegli/error.h"
     15 
     16 HWY_BEFORE_NAMESPACE();
     17 namespace jpegli {
     18 namespace HWY_NAMESPACE {
     19 
     20 // These templates are not found via ADL.
     21 using hwy::HWY_NAMESPACE::Add;
     22 using hwy::HWY_NAMESPACE::Mul;
     23 using hwy::HWY_NAMESPACE::Vec;
     24 
     25 using D = HWY_CAPPED(float, 8);
     26 constexpr D d;
     27 
     28 void DownsampleRow2x1(const float* row_in, size_t len, float* row_out) {
     29   const size_t N = Lanes(d);
     30   const size_t len_out = len / 2;
     31   const auto mul = Set(d, 0.5f);
     32   Vec<D> v0, v1;  // NOLINT
     33   for (size_t x = 0; x < len_out; x += N) {
     34     LoadInterleaved2(d, row_in + 2 * x, v0, v1);
     35     Store(Mul(mul, Add(v0, v1)), d, row_out + x);
     36   }
     37 }
     38 
     39 void DownsampleRow3x1(const float* row_in, size_t len, float* row_out) {
     40   const size_t N = Lanes(d);
     41   const size_t len_out = len / 3;
     42   const auto mul = Set(d, 1.0f / 3);
     43   Vec<D> v0, v1, v2;  // NOLINT
     44   for (size_t x = 0; x < len_out; x += N) {
     45     LoadInterleaved3(d, row_in + 3 * x, v0, v1, v2);
     46     Store(Mul(mul, Add(Add(v0, v1), v2)), d, row_out + x);
     47   }
     48 }
     49 
     50 void DownsampleRow4x1(const float* row_in, size_t len, float* row_out) {
     51   const size_t N = Lanes(d);
     52   const size_t len_out = len / 4;
     53   const auto mul = Set(d, 0.25f);
     54   Vec<D> v0, v1, v2, v3;  // NOLINT
     55   for (size_t x = 0; x < len_out; x += N) {
     56     LoadInterleaved4(d, row_in + 4 * x, v0, v1, v2, v3);
     57     Store(Mul(mul, Add(Add(v0, v1), Add(v2, v3))), d, row_out + x);
     58   }
     59 }
     60 
     61 void Downsample2x1(float* rows_in[MAX_SAMP_FACTOR], size_t len,
     62                    float* row_out) {
     63   DownsampleRow2x1(rows_in[0], len, row_out);
     64 }
     65 
     66 void Downsample3x1(float* rows_in[MAX_SAMP_FACTOR], size_t len,
     67                    float* row_out) {
     68   DownsampleRow3x1(rows_in[0], len, row_out);
     69 }
     70 
     71 void Downsample4x1(float* rows_in[MAX_SAMP_FACTOR], size_t len,
     72                    float* row_out) {
     73   DownsampleRow4x1(rows_in[0], len, row_out);
     74 }
     75 
     76 void Downsample1x2(float* rows_in[MAX_SAMP_FACTOR], size_t len,
     77                    float* row_out) {
     78   const size_t N = Lanes(d);
     79   const auto mul = Set(d, 0.5f);
     80   float* row0 = rows_in[0];
     81   float* row1 = rows_in[1];
     82   for (size_t x = 0; x < len; x += N) {
     83     Store(Mul(mul, Add(Load(d, row0 + x), Load(d, row1 + x))), d, row_out + x);
     84   }
     85 }
     86 
     87 void Downsample2x2(float* rows_in[MAX_SAMP_FACTOR], size_t len,
     88                    float* row_out) {
     89   const size_t N = Lanes(d);
     90   const size_t len_out = len / 2;
     91   const auto mul = Set(d, 0.25f);
     92   float* row0 = rows_in[0];
     93   float* row1 = rows_in[1];
     94   Vec<D> v0, v1, v2, v3;  // NOLINT
     95   for (size_t x = 0; x < len_out; x += N) {
     96     LoadInterleaved2(d, row0 + 2 * x, v0, v1);
     97     LoadInterleaved2(d, row1 + 2 * x, v2, v3);
     98     Store(Mul(mul, Add(Add(v0, v1), Add(v2, v3))), d, row_out + x);
     99   }
    100 }
    101 
    102 void Downsample3x2(float* rows_in[MAX_SAMP_FACTOR], size_t len,
    103                    float* row_out) {
    104   DownsampleRow3x1(rows_in[0], len, rows_in[0]);
    105   DownsampleRow3x1(rows_in[1], len, rows_in[1]);
    106   Downsample1x2(rows_in, len / 3, row_out);
    107 }
    108 
    109 void Downsample4x2(float* rows_in[MAX_SAMP_FACTOR], size_t len,
    110                    float* row_out) {
    111   DownsampleRow4x1(rows_in[0], len, rows_in[0]);
    112   DownsampleRow4x1(rows_in[1], len, rows_in[1]);
    113   Downsample1x2(rows_in, len / 4, row_out);
    114 }
    115 
    116 void Downsample1x3(float* rows_in[MAX_SAMP_FACTOR], size_t len,
    117                    float* row_out) {
    118   const size_t N = Lanes(d);
    119   const auto mul = Set(d, 1.0f / 3);
    120   float* row0 = rows_in[0];
    121   float* row1 = rows_in[1];
    122   float* row2 = rows_in[2];
    123   for (size_t x = 0; x < len; x += N) {
    124     const auto in0 = Load(d, row0 + x);
    125     const auto in1 = Load(d, row1 + x);
    126     const auto in2 = Load(d, row2 + x);
    127     Store(Mul(mul, Add(Add(in0, in1), in2)), d, row_out + x);
    128   }
    129 }
    130 
    131 void Downsample2x3(float* rows_in[MAX_SAMP_FACTOR], size_t len,
    132                    float* row_out) {
    133   DownsampleRow2x1(rows_in[0], len, rows_in[0]);
    134   DownsampleRow2x1(rows_in[1], len, rows_in[1]);
    135   DownsampleRow2x1(rows_in[2], len, rows_in[2]);
    136   Downsample1x3(rows_in, len / 2, row_out);
    137 }
    138 
    139 void Downsample3x3(float* rows_in[MAX_SAMP_FACTOR], size_t len,
    140                    float* row_out) {
    141   DownsampleRow3x1(rows_in[0], len, rows_in[0]);
    142   DownsampleRow3x1(rows_in[1], len, rows_in[1]);
    143   DownsampleRow3x1(rows_in[2], len, rows_in[2]);
    144   Downsample1x3(rows_in, len / 3, row_out);
    145 }
    146 
    147 void Downsample4x3(float* rows_in[MAX_SAMP_FACTOR], size_t len,
    148                    float* row_out) {
    149   DownsampleRow4x1(rows_in[0], len, rows_in[0]);
    150   DownsampleRow4x1(rows_in[1], len, rows_in[1]);
    151   DownsampleRow4x1(rows_in[2], len, rows_in[2]);
    152   Downsample1x3(rows_in, len / 4, row_out);
    153 }
    154 
    155 void Downsample1x4(float* rows_in[MAX_SAMP_FACTOR], size_t len,
    156                    float* row_out) {
    157   const size_t N = Lanes(d);
    158   const auto mul = Set(d, 0.25f);
    159   float* row0 = rows_in[0];
    160   float* row1 = rows_in[1];
    161   float* row2 = rows_in[2];
    162   float* row3 = rows_in[3];
    163   for (size_t x = 0; x < len; x += N) {
    164     const auto in0 = Load(d, row0 + x);
    165     const auto in1 = Load(d, row1 + x);
    166     const auto in2 = Load(d, row2 + x);
    167     const auto in3 = Load(d, row3 + x);
    168     Store(Mul(mul, Add(Add(in0, in1), Add(in2, in3))), d, row_out + x);
    169   }
    170 }
    171 
    172 void Downsample2x4(float* rows_in[MAX_SAMP_FACTOR], size_t len,
    173                    float* row_out) {
    174   DownsampleRow2x1(rows_in[0], len, rows_in[0]);
    175   DownsampleRow2x1(rows_in[1], len, rows_in[1]);
    176   DownsampleRow2x1(rows_in[2], len, rows_in[2]);
    177   DownsampleRow2x1(rows_in[3], len, rows_in[3]);
    178   Downsample1x4(rows_in, len / 2, row_out);
    179 }
    180 
    181 void Downsample3x4(float* rows_in[MAX_SAMP_FACTOR], size_t len,
    182                    float* row_out) {
    183   DownsampleRow3x1(rows_in[0], len, rows_in[0]);
    184   DownsampleRow3x1(rows_in[1], len, rows_in[1]);
    185   DownsampleRow3x1(rows_in[2], len, rows_in[2]);
    186   DownsampleRow3x1(rows_in[3], len, rows_in[3]);
    187   Downsample1x4(rows_in, len / 3, row_out);
    188 }
    189 
    190 void Downsample4x4(float* rows_in[MAX_SAMP_FACTOR], size_t len,
    191                    float* row_out) {
    192   DownsampleRow4x1(rows_in[0], len, rows_in[0]);
    193   DownsampleRow4x1(rows_in[1], len, rows_in[1]);
    194   DownsampleRow4x1(rows_in[2], len, rows_in[2]);
    195   DownsampleRow4x1(rows_in[3], len, rows_in[3]);
    196   Downsample1x4(rows_in, len / 4, row_out);
    197 }
    198 
    199 // NOLINTNEXTLINE(google-readability-namespace-comments)
    200 }  // namespace HWY_NAMESPACE
    201 }  // namespace jpegli
    202 HWY_AFTER_NAMESPACE();
    203 
    204 #if HWY_ONCE
    205 namespace jpegli {
    206 
    207 HWY_EXPORT(Downsample1x2);
    208 HWY_EXPORT(Downsample1x3);
    209 HWY_EXPORT(Downsample1x4);
    210 HWY_EXPORT(Downsample2x1);
    211 HWY_EXPORT(Downsample2x2);
    212 HWY_EXPORT(Downsample2x3);
    213 HWY_EXPORT(Downsample2x4);
    214 HWY_EXPORT(Downsample3x1);
    215 HWY_EXPORT(Downsample3x2);
    216 HWY_EXPORT(Downsample3x3);
    217 HWY_EXPORT(Downsample3x4);
    218 HWY_EXPORT(Downsample4x1);
    219 HWY_EXPORT(Downsample4x2);
    220 HWY_EXPORT(Downsample4x3);
    221 HWY_EXPORT(Downsample4x4);
    222 
    223 void NullDownsample(float* rows_in[MAX_SAMP_FACTOR], size_t len,
    224                     float* row_out) {}
    225 
    226 void ChooseDownsampleMethods(j_compress_ptr cinfo) {
    227   jpeg_comp_master* m = cinfo->master;
    228   for (int c = 0; c < cinfo->num_components; c++) {
    229     m->downsample_method[c] = nullptr;
    230     jpeg_component_info* comp = &cinfo->comp_info[c];
    231     const int h_factor = cinfo->max_h_samp_factor / comp->h_samp_factor;
    232     const int v_factor = cinfo->max_v_samp_factor / comp->v_samp_factor;
    233     if (v_factor == 1) {
    234       if (h_factor == 1) {
    235         m->downsample_method[c] = NullDownsample;
    236       } else if (h_factor == 2) {
    237         m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample2x1);
    238       } else if (h_factor == 3) {
    239         m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample3x1);
    240       } else if (h_factor == 4) {
    241         m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample4x1);
    242       }
    243     } else if (v_factor == 2) {
    244       if (h_factor == 1) {
    245         m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample1x2);
    246       } else if (h_factor == 2) {
    247         m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample2x2);
    248       } else if (h_factor == 3) {
    249         m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample3x2);
    250       } else if (h_factor == 4) {
    251         m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample4x2);
    252       }
    253     } else if (v_factor == 3) {
    254       if (h_factor == 1) {
    255         m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample1x2);
    256       } else if (h_factor == 2) {
    257         m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample2x2);
    258       } else if (h_factor == 3) {
    259         m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample3x2);
    260       } else if (h_factor == 4) {
    261         m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample4x2);
    262       }
    263     } else if (v_factor == 4) {
    264       if (h_factor == 1) {
    265         m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample1x4);
    266       } else if (h_factor == 2) {
    267         m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample2x4);
    268       } else if (h_factor == 3) {
    269         m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample3x4);
    270       } else if (h_factor == 4) {
    271         m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample4x4);
    272       }
    273     }
    274     if (m->downsample_method[c] == nullptr) {
    275       JPEGLI_ERROR("Unsupported downsampling ratio %dx%d", h_factor, v_factor);
    276     }
    277   }
    278 }
    279 
    280 void DownsampleInputBuffer(j_compress_ptr cinfo) {
    281   if (cinfo->max_h_samp_factor == 1 && cinfo->max_v_samp_factor == 1) {
    282     return;
    283   }
    284   jpeg_comp_master* m = cinfo->master;
    285   const size_t iMCU_height = DCTSIZE * cinfo->max_v_samp_factor;
    286   const size_t y0 = m->next_iMCU_row * iMCU_height;
    287   const size_t y1 = y0 + iMCU_height;
    288   const size_t xsize_padded = m->xsize_blocks * DCTSIZE;
    289   for (int c = 0; c < cinfo->num_components; c++) {
    290     jpeg_component_info* comp = &cinfo->comp_info[c];
    291     const int h_factor = cinfo->max_h_samp_factor / comp->h_samp_factor;
    292     const int v_factor = cinfo->max_v_samp_factor / comp->v_samp_factor;
    293     if (h_factor == 1 && v_factor == 1) {
    294       continue;
    295     }
    296     auto& input = *m->smooth_input[c];
    297     auto& output = *m->raw_data[c];
    298     const size_t yout0 = y0 / v_factor;
    299     float* rows_in[MAX_SAMP_FACTOR];
    300     for (size_t yin = y0, yout = yout0; yin < y1; yin += v_factor, ++yout) {
    301       for (int iy = 0; iy < v_factor; ++iy) {
    302         rows_in[iy] = input.Row(yin + iy);
    303       }
    304       float* row_out = output.Row(yout);
    305       (*m->downsample_method[c])(rows_in, xsize_padded, row_out);
    306     }
    307   }
    308 }
    309 
    310 void ApplyInputSmoothing(j_compress_ptr cinfo) {
    311   if (!cinfo->smoothing_factor) {
    312     return;
    313   }
    314   jpeg_comp_master* m = cinfo->master;
    315   const float kW1 = cinfo->smoothing_factor / 1024.0;
    316   const float kW0 = 1.0f - 8.0f * kW1;
    317   const size_t iMCU_height = DCTSIZE * cinfo->max_v_samp_factor;
    318   const ssize_t y0 = m->next_iMCU_row * iMCU_height;
    319   const ssize_t y1 = y0 + iMCU_height;
    320   const ssize_t xsize_padded = m->xsize_blocks * DCTSIZE;
    321   for (int c = 0; c < cinfo->num_components; c++) {
    322     auto& input = m->input_buffer[c];
    323     auto& output = *m->smooth_input[c];
    324     if (m->next_iMCU_row == 0) {
    325       input.CopyRow(-1, 0, 1);
    326     }
    327     if (m->next_iMCU_row + 1 == cinfo->total_iMCU_rows) {
    328       size_t last_row = m->ysize_blocks * DCTSIZE - 1;
    329       input.CopyRow(last_row + 1, last_row, 1);
    330     }
    331     // TODO(szabadka) SIMDify this.
    332     for (ssize_t y = y0; y < y1; ++y) {
    333       const float* row_t = input.Row(y - 1);
    334       const float* row_m = input.Row(y);
    335       const float* row_b = input.Row(y + 1);
    336       float* row_out = output.Row(y);
    337       for (ssize_t x = 0; x < xsize_padded; ++x) {
    338         float val_tl = row_t[x - 1];
    339         float val_tm = row_t[x];
    340         float val_tr = row_t[x + 1];
    341         float val_ml = row_m[x - 1];
    342         float val_mm = row_m[x];
    343         float val_mr = row_m[x + 1];
    344         float val_bl = row_b[x - 1];
    345         float val_bm = row_b[x];
    346         float val_br = row_b[x + 1];
    347         float val1 = (val_tl + val_tm + val_tr + val_ml + val_mr + val_bl +
    348                       val_bm + val_br);
    349         row_out[x] = val_mm * kW0 + val1 * kW1;
    350       }
    351     }
    352   }
    353 }
    354 
    355 }  // namespace jpegli
    356 #endif  // HWY_ONCE