rational_polynomial-inl.h (3969B)
1 // Copyright (c) the JPEG XL Project Authors. All rights reserved. 2 // 3 // Use of this source code is governed by a BSD-style 4 // license that can be found in the LICENSE file. 5 6 // Fast SIMD evaluation of rational polynomials for approximating functions. 7 8 #if defined(LIB_JXL_BASE_RATIONAL_POLYNOMIAL_INL_H_) == \ 9 defined(HWY_TARGET_TOGGLE) 10 #ifdef LIB_JXL_BASE_RATIONAL_POLYNOMIAL_INL_H_ 11 #undef LIB_JXL_BASE_RATIONAL_POLYNOMIAL_INL_H_ 12 #else 13 #define LIB_JXL_BASE_RATIONAL_POLYNOMIAL_INL_H_ 14 #endif 15 16 #include <jxl/types.h> 17 #include <stddef.h> 18 19 #include <hwy/highway.h> 20 HWY_BEFORE_NAMESPACE(); 21 namespace jxl { 22 namespace HWY_NAMESPACE { 23 namespace { 24 25 // These templates are not found via ADL. 26 using hwy::HWY_NAMESPACE::Div; 27 using hwy::HWY_NAMESPACE::MulAdd; 28 29 // Primary template: default to actual division. 30 template <typename T, class V> 31 struct FastDivision { 32 HWY_INLINE V operator()(const V n, const V d) const { return n / d; } 33 }; 34 // Partial specialization for float vectors. 35 template <class V> 36 struct FastDivision<float, V> { 37 // One Newton-Raphson iteration. 38 static HWY_INLINE V ReciprocalNR(const V x) { 39 const auto rcp = ApproximateReciprocal(x); 40 const auto sum = Add(rcp, rcp); 41 const auto x_rcp = Mul(x, rcp); 42 return NegMulAdd(x_rcp, rcp, sum); 43 } 44 45 V operator()(const V n, const V d) const { 46 #if JXL_TRUE // Faster on SKX 47 return Div(n, d); 48 #else 49 return n * ReciprocalNR(d); 50 #endif 51 } 52 }; 53 54 // Approximates smooth functions via rational polynomials (i.e. dividing two 55 // polynomials). Evaluates polynomials via Horner's scheme, which is faster than 56 // Clenshaw recurrence for Chebyshev polynomials. LoadDup128 allows us to 57 // specify constants (replicated 4x) independently of the lane count. 58 template <size_t NP, size_t NQ, class D, class V, typename T> 59 HWY_INLINE HWY_MAYBE_UNUSED V EvalRationalPolynomial(const D d, const V x, 60 const T (&p)[NP], 61 const T (&q)[NQ]) { 62 constexpr size_t kDegP = NP / 4 - 1; 63 constexpr size_t kDegQ = NQ / 4 - 1; 64 auto yp = LoadDup128(d, &p[kDegP * 4]); 65 auto yq = LoadDup128(d, &q[kDegQ * 4]); 66 // We use pointer arithmetic to refer to &p[(kDegP - n) * 4] to avoid a 67 // compiler warning that the index is out of bounds since we are already 68 // checking that it is not out of bounds with (kDegP >= n) and the access 69 // will be optimized away. Similarly with q and kDegQ. 70 HWY_FENCE; 71 if (kDegP >= 1) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 1) * 4))); 72 if (kDegQ >= 1) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 1) * 4))); 73 HWY_FENCE; 74 if (kDegP >= 2) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 2) * 4))); 75 if (kDegQ >= 2) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 2) * 4))); 76 HWY_FENCE; 77 if (kDegP >= 3) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 3) * 4))); 78 if (kDegQ >= 3) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 3) * 4))); 79 HWY_FENCE; 80 if (kDegP >= 4) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 4) * 4))); 81 if (kDegQ >= 4) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 4) * 4))); 82 HWY_FENCE; 83 if (kDegP >= 5) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 5) * 4))); 84 if (kDegQ >= 5) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 5) * 4))); 85 HWY_FENCE; 86 if (kDegP >= 6) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 6) * 4))); 87 if (kDegQ >= 6) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 6) * 4))); 88 HWY_FENCE; 89 if (kDegP >= 7) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 7) * 4))); 90 if (kDegQ >= 7) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 7) * 4))); 91 92 static_assert(kDegP < 8, "Polynomial degree is too high"); 93 static_assert(kDegQ < 8, "Polynomial degree is too high"); 94 95 return FastDivision<T, V>()(yp, yq); 96 } 97 98 } // namespace 99 // NOLINTNEXTLINE(google-readability-namespace-comments) 100 } // namespace HWY_NAMESPACE 101 } // namespace jxl 102 HWY_AFTER_NAMESPACE(); 103 #endif // LIB_JXL_BASE_RATIONAL_POLYNOMIAL_INL_H_