libjxl

FORK: libjxl patches used on blog
git clone https://git.neptards.moe/blog/libjxl.git
Log | Files | Refs | Submodules | README | LICENSE

convolve_symmetric5.cc (6944B)


      1 // Copyright (c) the JPEG XL Project Authors. All rights reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style
      4 // license that can be found in the LICENSE file.
      5 
      6 #include "lib/jxl/convolve.h"
      7 
      8 #undef HWY_TARGET_INCLUDE
      9 #define HWY_TARGET_INCLUDE "lib/jxl/convolve_symmetric5.cc"
     10 #include <hwy/foreach_target.h>
     11 #include <hwy/highway.h>
     12 
     13 #include "lib/jxl/base/common.h"
     14 #include "lib/jxl/convolve-inl.h"
     15 
     16 HWY_BEFORE_NAMESPACE();
     17 namespace jxl {
     18 namespace HWY_NAMESPACE {
     19 
     20 // These templates are not found via ADL.
     21 using hwy::HWY_NAMESPACE::Add;
     22 using hwy::HWY_NAMESPACE::Mul;
     23 using hwy::HWY_NAMESPACE::Vec;
     24 
     25 // Weighted sum of 1x5 pixels around ix, iy with [wx2 wx1 wx0 wx1 wx2].
     26 template <class WrapY>
     27 static float WeightedSumBorder(const ImageF& in, const WrapY wrap_y,
     28                                const int64_t ix, const int64_t iy,
     29                                const size_t xsize, const size_t ysize,
     30                                const float wx0, const float wx1,
     31                                const float wx2) {
     32   const WrapMirror wrap_x;
     33   const float* JXL_RESTRICT row = in.ConstRow(wrap_y(iy, ysize));
     34   const float in_m2 = row[wrap_x(ix - 2, xsize)];
     35   const float in_p2 = row[wrap_x(ix + 2, xsize)];
     36   const float in_m1 = row[wrap_x(ix - 1, xsize)];
     37   const float in_p1 = row[wrap_x(ix + 1, xsize)];
     38   const float in_00 = row[ix];
     39   const float sum_2 = wx2 * (in_m2 + in_p2);
     40   const float sum_1 = wx1 * (in_m1 + in_p1);
     41   const float sum_0 = wx0 * in_00;
     42   return sum_2 + (sum_1 + sum_0);
     43 }
     44 
     45 template <class WrapY, class V>
     46 static V WeightedSum(const ImageF& in, const WrapY wrap_y, const size_t ix,
     47                      const int64_t iy, const size_t ysize, const V wx0,
     48                      const V wx1, const V wx2) {
     49   const HWY_FULL(float) d;
     50   const float* JXL_RESTRICT center = in.ConstRow(wrap_y(iy, ysize)) + ix;
     51   const auto in_m2 = LoadU(d, center - 2);
     52   const auto in_p2 = LoadU(d, center + 2);
     53   const auto in_m1 = LoadU(d, center - 1);
     54   const auto in_p1 = LoadU(d, center + 1);
     55   const auto in_00 = LoadU(d, center);
     56   const auto sum_2 = Mul(wx2, Add(in_m2, in_p2));
     57   const auto sum_1 = Mul(wx1, Add(in_m1, in_p1));
     58   const auto sum_0 = Mul(wx0, in_00);
     59   return Add(sum_2, Add(sum_1, sum_0));
     60 }
     61 
     62 // Produces result for one pixel
     63 template <class WrapY>
     64 float Symmetric5Border(const ImageF& in, const int64_t ix, const int64_t iy,
     65                        const WeightsSymmetric5& weights) {
     66   const float w0 = weights.c[0];
     67   const float w1 = weights.r[0];
     68   const float w2 = weights.R[0];
     69   const float w4 = weights.d[0];
     70   const float w5 = weights.L[0];
     71   const float w8 = weights.D[0];
     72 
     73   const size_t xsize = in.xsize();
     74   const size_t ysize = in.ysize();
     75   const WrapY wrap_y;
     76   // Unrolled loop over all 5 rows of the kernel.
     77   float sum0 = WeightedSumBorder(in, wrap_y, ix, iy, xsize, ysize, w0, w1, w2);
     78 
     79   sum0 += WeightedSumBorder(in, wrap_y, ix, iy - 2, xsize, ysize, w2, w5, w8);
     80   float sum1 =
     81       WeightedSumBorder(in, wrap_y, ix, iy + 2, xsize, ysize, w2, w5, w8);
     82 
     83   sum0 += WeightedSumBorder(in, wrap_y, ix, iy - 1, xsize, ysize, w1, w4, w5);
     84   sum1 += WeightedSumBorder(in, wrap_y, ix, iy + 1, xsize, ysize, w1, w4, w5);
     85 
     86   return sum0 + sum1;
     87 }
     88 
     89 // Produces result for one vector's worth of pixels
     90 template <class WrapY>
     91 static void Symmetric5Interior(const ImageF& in, const int64_t ix,
     92                                const int64_t rix, const int64_t iy,
     93                                const WeightsSymmetric5& weights,
     94                                float* JXL_RESTRICT row_out) {
     95   const HWY_FULL(float) d;
     96 
     97   const auto w0 = LoadDup128(d, weights.c);
     98   const auto w1 = LoadDup128(d, weights.r);
     99   const auto w2 = LoadDup128(d, weights.R);
    100   const auto w4 = LoadDup128(d, weights.d);
    101   const auto w5 = LoadDup128(d, weights.L);
    102   const auto w8 = LoadDup128(d, weights.D);
    103 
    104   const size_t ysize = in.ysize();
    105   const WrapY wrap_y;
    106   // Unrolled loop over all 5 rows of the kernel.
    107   auto sum0 = WeightedSum(in, wrap_y, ix, iy, ysize, w0, w1, w2);
    108 
    109   sum0 = Add(sum0, WeightedSum(in, wrap_y, ix, iy - 2, ysize, w2, w5, w8));
    110   auto sum1 = WeightedSum(in, wrap_y, ix, iy + 2, ysize, w2, w5, w8);
    111 
    112   sum0 = Add(sum0, WeightedSum(in, wrap_y, ix, iy - 1, ysize, w1, w4, w5));
    113   sum1 = Add(sum1, WeightedSum(in, wrap_y, ix, iy + 1, ysize, w1, w4, w5));
    114 
    115   StoreU(Add(sum0, sum1), d, row_out + rix);
    116 }
    117 
    118 template <class WrapY>
    119 static void Symmetric5Row(const ImageF& in, const Rect& rect, const int64_t iy,
    120                           const WeightsSymmetric5& weights,
    121                           float* JXL_RESTRICT row_out) {
    122   const int64_t kRadius = 2;
    123   const size_t xend = rect.x1();
    124 
    125   size_t rix = 0;
    126   size_t ix = rect.x0();
    127   const HWY_FULL(float) d;
    128   const size_t N = Lanes(d);
    129   const size_t aligned_x = RoundUpTo(kRadius, N);
    130   for (; ix < std::min(aligned_x, xend); ++ix, ++rix) {
    131     row_out[rix] = Symmetric5Border<WrapY>(in, ix, iy, weights);
    132   }
    133   for (; ix + N + kRadius <= xend; ix += N, rix += N) {
    134     Symmetric5Interior<WrapY>(in, ix, rix, iy, weights, row_out);
    135   }
    136   for (; ix < xend; ++ix, ++rix) {
    137     row_out[rix] = Symmetric5Border<WrapY>(in, ix, iy, weights);
    138   }
    139 }
    140 
    141 // Semi-vectorized (interior pixels Fonly); called directly like slow::, unlike
    142 // the fully vectorized strategies below.
    143 void Symmetric5(const ImageF& in, const Rect& in_rect,
    144                 const WeightsSymmetric5& weights, ThreadPool* pool,
    145                 ImageF* JXL_RESTRICT out, const Rect& out_rect) {
    146   JXL_ASSERT(in_rect.xsize() == out_rect.xsize());
    147   JXL_ASSERT(in_rect.ysize() == out_rect.ysize());
    148   const size_t ysize = in_rect.ysize();
    149   JXL_CHECK(RunOnPool(
    150       pool, 0, static_cast<uint32_t>(ysize), ThreadPool::NoInit,
    151       [&](const uint32_t task, size_t /*thread*/) {
    152         const int64_t riy = task;
    153         const int64_t iy = in_rect.y0() + riy;
    154 
    155         if (iy < 2 || iy >= static_cast<ssize_t>(in.ysize()) - 2) {
    156           Symmetric5Row<WrapMirror>(in, in_rect, iy, weights,
    157                                     out_rect.Row(out, riy));
    158         } else {
    159           Symmetric5Row<WrapUnchanged>(in, in_rect, iy, weights,
    160                                        out_rect.Row(out, riy));
    161         }
    162       },
    163       "Symmetric5x5Convolution"));
    164 }
    165 
    166 // NOLINTNEXTLINE(google-readability-namespace-comments)
    167 }  // namespace HWY_NAMESPACE
    168 }  // namespace jxl
    169 HWY_AFTER_NAMESPACE();
    170 
    171 #if HWY_ONCE
    172 namespace jxl {
    173 
    174 HWY_EXPORT(Symmetric5);
    175 void Symmetric5(const ImageF& in, const Rect& in_rect,
    176                 const WeightsSymmetric5& weights, ThreadPool* pool,
    177                 ImageF* JXL_RESTRICT out, const Rect& out_rect) {
    178   HWY_DYNAMIC_DISPATCH(Symmetric5)(in, in_rect, weights, pool, out, out_rect);
    179 }
    180 
    181 void Symmetric5(const ImageF& in, const Rect& rect,
    182                 const WeightsSymmetric5& weights, ThreadPool* pool,
    183                 ImageF* JXL_RESTRICT out) {
    184   Symmetric5(in, rect, weights, pool, out, Rect(*out));
    185 }
    186 
    187 }  // namespace jxl
    188 #endif  // HWY_ONCE