libjxl

FORK: libjxl patches used on blog
git clone https://git.neptards.moe/blog/libjxl.git
Log | Files | Refs | Submodules | README | LICENSE

upsample.cc (4647B)


      1 // Copyright (c) the JPEG XL Project Authors. All rights reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style
      4 // license that can be found in the LICENSE file.
      5 
      6 #include "lib/jpegli/upsample.h"
      7 
      8 #include <string.h>
      9 
     10 #undef HWY_TARGET_INCLUDE
     11 #define HWY_TARGET_INCLUDE "lib/jpegli/upsample.cc"
     12 #include <hwy/foreach_target.h>
     13 #include <hwy/highway.h>
     14 
     15 HWY_BEFORE_NAMESPACE();
     16 namespace jpegli {
     17 namespace HWY_NAMESPACE {
     18 
     19 // These templates are not found via ADL.
     20 using hwy::HWY_NAMESPACE::Mul;
     21 using hwy::HWY_NAMESPACE::MulAdd;
     22 using hwy::HWY_NAMESPACE::Vec;
     23 
     24 #if HWY_CAP_GE512
     25 using hwy::HWY_NAMESPACE::Half;
     26 using hwy::HWY_NAMESPACE::Vec;
     27 template <size_t i, class DF, class V>
     28 HWY_INLINE Vec<Half<Half<DF>>> Quarter(const DF df, V v) {
     29   using HF = Half<DF>;
     30   using HHF = Half<HF>;
     31   auto half = i >= 2 ? UpperHalf(HF(), v) : LowerHalf(HF(), v);
     32   return i & 1 ? UpperHalf(HHF(), half) : LowerHalf(HHF(), half);
     33 }
     34 
     35 template <class DF, class V>
     36 HWY_INLINE Vec<DF> Concat4(const DF df, V v0, V v1, V v2, V v3) {
     37   using HF = Half<DF>;
     38   return Combine(DF(), Combine(HF(), v3, v2), Combine(HF(), v1, v0));
     39 }
     40 
     41 #endif
     42 
     43 // Stores v0[0], v1[0], v0[1], v1[1], ... to mem, in this order. Mem must be
     44 // aligned.
     45 template <class DF, class V, typename T>
     46 void StoreInterleaved(const DF df, V v0, V v1, T* mem) {
     47   static_assert(sizeof(T) == 4, "only use StoreInterleaved for 4-byte types");
     48 #if HWY_TARGET == HWY_SCALAR
     49   Store(v0, df, mem);
     50   Store(v1, df, mem + 1);
     51 #elif !HWY_CAP_GE256
     52   Store(InterleaveLower(df, v0, v1), df, mem);
     53   Store(InterleaveUpper(df, v0, v1), df, mem + Lanes(df));
     54 #else
     55   if (!HWY_CAP_GE512 || Lanes(df) == 8) {
     56     auto t0 = InterleaveLower(df, v0, v1);
     57     auto t1 = InterleaveUpper(df, v0, v1);
     58     Store(ConcatLowerLower(df, t1, t0), df, mem);
     59     Store(ConcatUpperUpper(df, t1, t0), df, mem + Lanes(df));
     60   } else {
     61 #if HWY_CAP_GE512
     62     auto t0 = InterleaveLower(df, v0, v1);
     63     auto t1 = InterleaveUpper(df, v0, v1);
     64     Store(Concat4(df, Quarter<0>(df, t0), Quarter<0>(df, t1),
     65                   Quarter<1>(df, t0), Quarter<1>(df, t1)),
     66           df, mem);
     67     Store(Concat4(df, Quarter<2>(df, t0), Quarter<2>(df, t1),
     68                   Quarter<3>(df, t0), Quarter<3>(df, t1)),
     69           df, mem + Lanes(df));
     70 #endif
     71   }
     72 #endif
     73 }
     74 
     75 void Upsample2Horizontal(float* JXL_RESTRICT row,
     76                          float* JXL_RESTRICT scratch_space, size_t len_out) {
     77   HWY_FULL(float) df;
     78   auto threefour = Set(df, 0.75f);
     79   auto onefour = Set(df, 0.25f);
     80   const size_t len_in = (len_out + 1) >> 1;
     81   memcpy(scratch_space, row, len_in * sizeof(row[0]));
     82   scratch_space[-1] = scratch_space[0];
     83   scratch_space[len_in] = scratch_space[len_in - 1];
     84   for (size_t x = 0; x < len_in; x += Lanes(df)) {
     85     auto current = Mul(Load(df, scratch_space + x), threefour);
     86     auto prev = LoadU(df, scratch_space + x - 1);
     87     auto next = LoadU(df, scratch_space + x + 1);
     88     auto left = MulAdd(onefour, prev, current);
     89     auto right = MulAdd(onefour, next, current);
     90     StoreInterleaved(df, left, right, row + x * 2);
     91   }
     92 }
     93 
     94 void Upsample2Vertical(const float* JXL_RESTRICT row_top,
     95                        const float* JXL_RESTRICT row_mid,
     96                        const float* JXL_RESTRICT row_bot,
     97                        float* JXL_RESTRICT row_out0,
     98                        float* JXL_RESTRICT row_out1, size_t len) {
     99   HWY_FULL(float) df;
    100   auto threefour = Set(df, 0.75f);
    101   auto onefour = Set(df, 0.25f);
    102   for (size_t x = 0; x < len; x += Lanes(df)) {
    103     auto it = Load(df, row_top + x);
    104     auto im = Load(df, row_mid + x);
    105     auto ib = Load(df, row_bot + x);
    106     auto im_scaled = Mul(im, threefour);
    107     Store(MulAdd(it, onefour, im_scaled), df, row_out0 + x);
    108     Store(MulAdd(ib, onefour, im_scaled), df, row_out1 + x);
    109   }
    110 }
    111 
    112 // NOLINTNEXTLINE(google-readability-namespace-comments)
    113 }  // namespace HWY_NAMESPACE
    114 }  // namespace jpegli
    115 HWY_AFTER_NAMESPACE();
    116 
    117 #if HWY_ONCE
    118 namespace jpegli {
    119 
    120 HWY_EXPORT(Upsample2Horizontal);
    121 HWY_EXPORT(Upsample2Vertical);
    122 
    123 void Upsample2Horizontal(float* JXL_RESTRICT row,
    124                          float* JXL_RESTRICT scratch_space, size_t len_out) {
    125   HWY_DYNAMIC_DISPATCH(Upsample2Horizontal)(row, scratch_space, len_out);
    126 }
    127 
    128 void Upsample2Vertical(const float* JXL_RESTRICT row_top,
    129                        const float* JXL_RESTRICT row_mid,
    130                        const float* JXL_RESTRICT row_bot,
    131                        float* JXL_RESTRICT row_out0,
    132                        float* JXL_RESTRICT row_out1, size_t len) {
    133   HWY_DYNAMIC_DISPATCH(Upsample2Vertical)
    134   (row_top, row_mid, row_bot, row_out0, row_out1, len);
    135 }
    136 }  // namespace jpegli
    137 #endif  // HWY_ONCE