convolve_symmetric5.cc (6944B)
1 // Copyright (c) the JPEG XL Project Authors. All rights reserved. 2 // 3 // Use of this source code is governed by a BSD-style 4 // license that can be found in the LICENSE file. 5 6 #include "lib/jxl/convolve.h" 7 8 #undef HWY_TARGET_INCLUDE 9 #define HWY_TARGET_INCLUDE "lib/jxl/convolve_symmetric5.cc" 10 #include <hwy/foreach_target.h> 11 #include <hwy/highway.h> 12 13 #include "lib/jxl/base/common.h" 14 #include "lib/jxl/convolve-inl.h" 15 16 HWY_BEFORE_NAMESPACE(); 17 namespace jxl { 18 namespace HWY_NAMESPACE { 19 20 // These templates are not found via ADL. 21 using hwy::HWY_NAMESPACE::Add; 22 using hwy::HWY_NAMESPACE::Mul; 23 using hwy::HWY_NAMESPACE::Vec; 24 25 // Weighted sum of 1x5 pixels around ix, iy with [wx2 wx1 wx0 wx1 wx2]. 26 template <class WrapY> 27 static float WeightedSumBorder(const ImageF& in, const WrapY wrap_y, 28 const int64_t ix, const int64_t iy, 29 const size_t xsize, const size_t ysize, 30 const float wx0, const float wx1, 31 const float wx2) { 32 const WrapMirror wrap_x; 33 const float* JXL_RESTRICT row = in.ConstRow(wrap_y(iy, ysize)); 34 const float in_m2 = row[wrap_x(ix - 2, xsize)]; 35 const float in_p2 = row[wrap_x(ix + 2, xsize)]; 36 const float in_m1 = row[wrap_x(ix - 1, xsize)]; 37 const float in_p1 = row[wrap_x(ix + 1, xsize)]; 38 const float in_00 = row[ix]; 39 const float sum_2 = wx2 * (in_m2 + in_p2); 40 const float sum_1 = wx1 * (in_m1 + in_p1); 41 const float sum_0 = wx0 * in_00; 42 return sum_2 + (sum_1 + sum_0); 43 } 44 45 template <class WrapY, class V> 46 static V WeightedSum(const ImageF& in, const WrapY wrap_y, const size_t ix, 47 const int64_t iy, const size_t ysize, const V wx0, 48 const V wx1, const V wx2) { 49 const HWY_FULL(float) d; 50 const float* JXL_RESTRICT center = in.ConstRow(wrap_y(iy, ysize)) + ix; 51 const auto in_m2 = LoadU(d, center - 2); 52 const auto in_p2 = LoadU(d, center + 2); 53 const auto in_m1 = LoadU(d, center - 1); 54 const auto in_p1 = LoadU(d, center + 1); 55 const auto in_00 = LoadU(d, center); 56 const auto sum_2 = Mul(wx2, Add(in_m2, in_p2)); 57 const auto sum_1 = Mul(wx1, Add(in_m1, in_p1)); 58 const auto sum_0 = Mul(wx0, in_00); 59 return Add(sum_2, Add(sum_1, sum_0)); 60 } 61 62 // Produces result for one pixel 63 template <class WrapY> 64 float Symmetric5Border(const ImageF& in, const int64_t ix, const int64_t iy, 65 const WeightsSymmetric5& weights) { 66 const float w0 = weights.c[0]; 67 const float w1 = weights.r[0]; 68 const float w2 = weights.R[0]; 69 const float w4 = weights.d[0]; 70 const float w5 = weights.L[0]; 71 const float w8 = weights.D[0]; 72 73 const size_t xsize = in.xsize(); 74 const size_t ysize = in.ysize(); 75 const WrapY wrap_y; 76 // Unrolled loop over all 5 rows of the kernel. 77 float sum0 = WeightedSumBorder(in, wrap_y, ix, iy, xsize, ysize, w0, w1, w2); 78 79 sum0 += WeightedSumBorder(in, wrap_y, ix, iy - 2, xsize, ysize, w2, w5, w8); 80 float sum1 = 81 WeightedSumBorder(in, wrap_y, ix, iy + 2, xsize, ysize, w2, w5, w8); 82 83 sum0 += WeightedSumBorder(in, wrap_y, ix, iy - 1, xsize, ysize, w1, w4, w5); 84 sum1 += WeightedSumBorder(in, wrap_y, ix, iy + 1, xsize, ysize, w1, w4, w5); 85 86 return sum0 + sum1; 87 } 88 89 // Produces result for one vector's worth of pixels 90 template <class WrapY> 91 static void Symmetric5Interior(const ImageF& in, const int64_t ix, 92 const int64_t rix, const int64_t iy, 93 const WeightsSymmetric5& weights, 94 float* JXL_RESTRICT row_out) { 95 const HWY_FULL(float) d; 96 97 const auto w0 = LoadDup128(d, weights.c); 98 const auto w1 = LoadDup128(d, weights.r); 99 const auto w2 = LoadDup128(d, weights.R); 100 const auto w4 = LoadDup128(d, weights.d); 101 const auto w5 = LoadDup128(d, weights.L); 102 const auto w8 = LoadDup128(d, weights.D); 103 104 const size_t ysize = in.ysize(); 105 const WrapY wrap_y; 106 // Unrolled loop over all 5 rows of the kernel. 107 auto sum0 = WeightedSum(in, wrap_y, ix, iy, ysize, w0, w1, w2); 108 109 sum0 = Add(sum0, WeightedSum(in, wrap_y, ix, iy - 2, ysize, w2, w5, w8)); 110 auto sum1 = WeightedSum(in, wrap_y, ix, iy + 2, ysize, w2, w5, w8); 111 112 sum0 = Add(sum0, WeightedSum(in, wrap_y, ix, iy - 1, ysize, w1, w4, w5)); 113 sum1 = Add(sum1, WeightedSum(in, wrap_y, ix, iy + 1, ysize, w1, w4, w5)); 114 115 StoreU(Add(sum0, sum1), d, row_out + rix); 116 } 117 118 template <class WrapY> 119 static void Symmetric5Row(const ImageF& in, const Rect& rect, const int64_t iy, 120 const WeightsSymmetric5& weights, 121 float* JXL_RESTRICT row_out) { 122 const int64_t kRadius = 2; 123 const size_t xend = rect.x1(); 124 125 size_t rix = 0; 126 size_t ix = rect.x0(); 127 const HWY_FULL(float) d; 128 const size_t N = Lanes(d); 129 const size_t aligned_x = RoundUpTo(kRadius, N); 130 for (; ix < std::min(aligned_x, xend); ++ix, ++rix) { 131 row_out[rix] = Symmetric5Border<WrapY>(in, ix, iy, weights); 132 } 133 for (; ix + N + kRadius <= xend; ix += N, rix += N) { 134 Symmetric5Interior<WrapY>(in, ix, rix, iy, weights, row_out); 135 } 136 for (; ix < xend; ++ix, ++rix) { 137 row_out[rix] = Symmetric5Border<WrapY>(in, ix, iy, weights); 138 } 139 } 140 141 // Semi-vectorized (interior pixels Fonly); called directly like slow::, unlike 142 // the fully vectorized strategies below. 143 void Symmetric5(const ImageF& in, const Rect& in_rect, 144 const WeightsSymmetric5& weights, ThreadPool* pool, 145 ImageF* JXL_RESTRICT out, const Rect& out_rect) { 146 JXL_ASSERT(in_rect.xsize() == out_rect.xsize()); 147 JXL_ASSERT(in_rect.ysize() == out_rect.ysize()); 148 const size_t ysize = in_rect.ysize(); 149 JXL_CHECK(RunOnPool( 150 pool, 0, static_cast<uint32_t>(ysize), ThreadPool::NoInit, 151 [&](const uint32_t task, size_t /*thread*/) { 152 const int64_t riy = task; 153 const int64_t iy = in_rect.y0() + riy; 154 155 if (iy < 2 || iy >= static_cast<ssize_t>(in.ysize()) - 2) { 156 Symmetric5Row<WrapMirror>(in, in_rect, iy, weights, 157 out_rect.Row(out, riy)); 158 } else { 159 Symmetric5Row<WrapUnchanged>(in, in_rect, iy, weights, 160 out_rect.Row(out, riy)); 161 } 162 }, 163 "Symmetric5x5Convolution")); 164 } 165 166 // NOLINTNEXTLINE(google-readability-namespace-comments) 167 } // namespace HWY_NAMESPACE 168 } // namespace jxl 169 HWY_AFTER_NAMESPACE(); 170 171 #if HWY_ONCE 172 namespace jxl { 173 174 HWY_EXPORT(Symmetric5); 175 void Symmetric5(const ImageF& in, const Rect& in_rect, 176 const WeightsSymmetric5& weights, ThreadPool* pool, 177 ImageF* JXL_RESTRICT out, const Rect& out_rect) { 178 HWY_DYNAMIC_DISPATCH(Symmetric5)(in, in_rect, weights, pool, out, out_rect); 179 } 180 181 void Symmetric5(const ImageF& in, const Rect& rect, 182 const WeightsSymmetric5& weights, ThreadPool* pool, 183 ImageF* JXL_RESTRICT out) { 184 Symmetric5(in, rect, weights, pool, out, Rect(*out)); 185 } 186 187 } // namespace jxl 188 #endif // HWY_ONCE