libjxl

FORK: libjxl patches used on blog
git clone https://git.neptards.moe/blog/libjxl.git
Log | Files | Refs | Submodules | README | LICENSE

convolve_symmetric3.cc (6819B)


      1 // Copyright (c) the JPEG XL Project Authors. All rights reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style
      4 // license that can be found in the LICENSE file.
      5 
      6 #include "lib/jxl/convolve.h"
      7 
      8 #undef HWY_TARGET_INCLUDE
      9 #define HWY_TARGET_INCLUDE "lib/jxl/convolve_symmetric3.cc"
     10 #include <hwy/foreach_target.h>
     11 #include <hwy/highway.h>
     12 
     13 #include "lib/jxl/convolve-inl.h"
     14 
     15 HWY_BEFORE_NAMESPACE();
     16 namespace jxl {
     17 namespace HWY_NAMESPACE {
     18 
     19 // These templates are not found via ADL.
     20 using hwy::HWY_NAMESPACE::Add;
     21 using hwy::HWY_NAMESPACE::Mul;
     22 using hwy::HWY_NAMESPACE::MulAdd;
     23 using hwy::HWY_NAMESPACE::Vec;
     24 
     25 template <class WrapY, class V>
     26 static V WeightedSum(const ImageF& in, const WrapY wrap_y, const size_t ix,
     27                      const int64_t iy, const size_t ysize, const V wx0,
     28                      const V wx1, const V wx2) {
     29   const HWY_FULL(float) d;
     30   const float* JXL_RESTRICT center = in.ConstRow(wrap_y(iy, ysize)) + ix;
     31   const auto in_m2 = LoadU(d, center - 2);
     32   const auto in_p2 = LoadU(d, center + 2);
     33   const auto in_m1 = LoadU(d, center - 1);
     34   const auto in_p1 = LoadU(d, center + 1);
     35   const auto in_00 = Load(d, center);
     36   const auto sum_2 = Mul(wx2, Add(in_m2, in_p2));
     37   const auto sum_1 = Mul(wx1, Add(in_m1, in_p1));
     38   const auto sum_0 = Mul(wx0, in_00);
     39   return Add(sum_2, Add(sum_1, sum_0));
     40 }
     41 
     42 // 3x3 convolution by symmetric kernel with a single scan through the input.
     43 class Symmetric3Strategy {
     44   using D = HWY_CAPPED(float, 16);
     45   using V = Vec<D>;
     46 
     47  public:
     48   static constexpr int64_t kRadius = 1;
     49 
     50   // Only accesses pixels in [0, xsize).
     51   template <size_t kSizeModN, class WrapRow>
     52   static JXL_MAYBE_INLINE void ConvolveRow(
     53       const float* const JXL_RESTRICT row_m, const size_t xsize,
     54       const int64_t stride, const WrapRow& wrap_row,
     55       const WeightsSymmetric3& weights, float* const JXL_RESTRICT row_out) {
     56     const D d;
     57     // t, m, b = top, middle, bottom row;
     58     const float* const JXL_RESTRICT row_t = wrap_row(row_m - stride, stride);
     59     const float* const JXL_RESTRICT row_b = wrap_row(row_m + stride, stride);
     60 
     61     // Must load in advance - compiler doesn't understand LoadDup128 and
     62     // schedules them too late.
     63     const V w0 = LoadDup128(d, weights.c);
     64     const V w1 = LoadDup128(d, weights.r);
     65     const V w2 = LoadDup128(d, weights.d);
     66 
     67     // l, c, r = left, center, right. Leftmost vector: need FirstL1.
     68     {
     69       const V tc = LoadU(d, row_t + 0);
     70       const V mc = LoadU(d, row_m + 0);
     71       const V bc = LoadU(d, row_b + 0);
     72       const V tl = Neighbors::FirstL1(tc);
     73       const V tr = LoadU(d, row_t + 0 + 1);
     74       const V ml = Neighbors::FirstL1(mc);
     75       const V mr = LoadU(d, row_m + 0 + 1);
     76       const V bl = Neighbors::FirstL1(bc);
     77       const V br = LoadU(d, row_b + 0 + 1);
     78       const V conv =
     79           WeightedSum(tl, tc, tr, ml, mc, mr, bl, bc, br, w0, w1, w2);
     80       Store(conv, d, row_out + 0);
     81     }
     82 
     83     // Loop as long as we can load enough new values:
     84     const size_t N = Lanes(d);
     85     size_t x = N;
     86     for (; x + N + kRadius <= xsize; x += N) {
     87       const auto conv = ConvolveValid(row_t, row_m, row_b, x, w0, w1, w2);
     88       Store(conv, d, row_out + x);
     89     }
     90 
     91     // For final (partial) vector:
     92     const V tc = LoadU(d, row_t + x);
     93     const V mc = LoadU(d, row_m + x);
     94     const V bc = LoadU(d, row_b + x);
     95 
     96     V tr;
     97     V mr;
     98     V br;
     99 #if HWY_TARGET == HWY_SCALAR
    100     tr = tc;  // Single-lane => mirrored right neighbor = center value.
    101     mr = mc;
    102     br = bc;
    103 #else
    104     if (kSizeModN == 0) {
    105       // The above loop didn't handle the last vector because it needs an
    106       // additional right neighbor (generated via mirroring).
    107       auto mirror = SetTableIndices(d, MirrorLanes(N - 1));
    108       tr = TableLookupLanes(tc, mirror);
    109       mr = TableLookupLanes(mc, mirror);
    110       br = TableLookupLanes(bc, mirror);
    111     } else {
    112       auto mirror = SetTableIndices(d, MirrorLanes((xsize % N) - 1));
    113       // Loads last valid value into uppermost lane and mirrors.
    114       tr = TableLookupLanes(LoadU(d, row_t + xsize - N), mirror);
    115       mr = TableLookupLanes(LoadU(d, row_m + xsize - N), mirror);
    116       br = TableLookupLanes(LoadU(d, row_b + xsize - N), mirror);
    117     }
    118 #endif
    119 
    120     const V tl = LoadU(d, row_t + x - 1);
    121     const V ml = LoadU(d, row_m + x - 1);
    122     const V bl = LoadU(d, row_b + x - 1);
    123     const V conv = WeightedSum(tl, tc, tr, ml, mc, mr, bl, bc, br, w0, w1, w2);
    124     Store(conv, d, row_out + x);
    125   }
    126 
    127  private:
    128   // Returns sum{x_i * w_i}.
    129   template <class V>
    130   static JXL_MAYBE_INLINE V WeightedSum(const V tl, const V tc, const V tr,
    131                                         const V ml, const V mc, const V mr,
    132                                         const V bl, const V bc, const V br,
    133                                         const V w0, const V w1, const V w2) {
    134     const V sum_tb = Add(tc, bc);
    135 
    136     // Faster than 5 mul + 4 FMA.
    137     const V mul0 = Mul(mc, w0);
    138     const V sum_lr = Add(ml, mr);
    139 
    140     const V x1 = Add(sum_tb, sum_lr);
    141     const V mul1 = MulAdd(x1, w1, mul0);
    142 
    143     const V sum_t2 = Add(tl, tr);
    144     const V sum_b2 = Add(bl, br);
    145     const V x2 = Add(sum_t2, sum_b2);
    146     const V mul2 = MulAdd(x2, w2, mul1);
    147     return mul2;
    148   }
    149 
    150   static JXL_MAYBE_INLINE V ConvolveValid(const float* JXL_RESTRICT row_t,
    151                                           const float* JXL_RESTRICT row_m,
    152                                           const float* JXL_RESTRICT row_b,
    153                                           const int64_t x, const V w0,
    154                                           const V w1, const V w2) {
    155     const D d;
    156     const V tc = LoadU(d, row_t + x);
    157     const V mc = LoadU(d, row_m + x);
    158     const V bc = LoadU(d, row_b + x);
    159     const V tl = LoadU(d, row_t + x - 1);
    160     const V tr = LoadU(d, row_t + x + 1);
    161     const V ml = LoadU(d, row_m + x - 1);
    162     const V mr = LoadU(d, row_m + x + 1);
    163     const V bl = LoadU(d, row_b + x - 1);
    164     const V br = LoadU(d, row_b + x + 1);
    165     return WeightedSum(tl, tc, tr, ml, mc, mr, bl, bc, br, w0, w1, w2);
    166   }
    167 };
    168 
    169 void Symmetric3(const ImageF& in, const Rect& rect,
    170                 const WeightsSymmetric3& weights, ThreadPool* pool,
    171                 ImageF* out) {
    172   using Conv = ConvolveT<Symmetric3Strategy>;
    173   if (rect.xsize() >= Conv::MinWidth()) {
    174     Conv::Run(in, rect, weights, pool, out);
    175     return;
    176   }
    177 
    178   SlowSymmetric3(in, rect, weights, pool, out);
    179 }
    180 
    181 // NOLINTNEXTLINE(google-readability-namespace-comments)
    182 }  // namespace HWY_NAMESPACE
    183 }  // namespace jxl
    184 HWY_AFTER_NAMESPACE();
    185 
    186 #if HWY_ONCE
    187 namespace jxl {
    188 
    189 HWY_EXPORT(Symmetric3);
    190 void Symmetric3(const ImageF& in, const Rect& rect,
    191                 const WeightsSymmetric3& weights, ThreadPool* pool,
    192                 ImageF* out) {
    193   HWY_DYNAMIC_DISPATCH(Symmetric3)(in, rect, weights, pool, out);
    194 }
    195 
    196 }  // namespace jxl
    197 #endif  // HWY_ONCE