libjxl

FORK: libjxl patches used on blog
git clone https://git.neptards.moe/blog/libjxl.git
Log | Files | Refs | Submodules | README | LICENSE

simd_util-inl.h (12991B)


      1 // Copyright (c) the JPEG XL Project Authors. All rights reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style
      4 // license that can be found in the LICENSE file.
      5 
      6 // Misc utilities for SIMD operations
      7 
      8 #if defined(LIB_JXL_SIMD_UTIL_INL_H_) == defined(HWY_TARGET_TOGGLE)
      9 #ifdef LIB_JXL_SIMD_UTIL_INL_H_
     10 #undef LIB_JXL_SIMD_UTIL_INL_H_
     11 #else
     12 #define LIB_JXL_SIMD_UTIL_INL_H_
     13 #endif
     14 
     15 #include <hwy/highway.h>
     16 
     17 #include "lib/jxl/base/compiler_specific.h"
     18 
     19 HWY_BEFORE_NAMESPACE();
     20 namespace jxl {
     21 namespace HWY_NAMESPACE {
     22 
     23 #if HWY_CAP_GE512
     24 using hwy::HWY_NAMESPACE::Half;
     25 using hwy::HWY_NAMESPACE::Vec;
     26 template <size_t i, class DF, class V>
     27 HWY_INLINE Vec<Half<Half<DF>>> Quarter(const DF df, V v) {
     28   using HF = Half<DF>;
     29   using HHF = Half<HF>;
     30   auto half = i >= 2 ? UpperHalf(HF(), v) : LowerHalf(HF(), v);
     31   return i & 1 ? UpperHalf(HHF(), half) : LowerHalf(HHF(), half);
     32 }
     33 
     34 template <class DF, class V>
     35 HWY_INLINE Vec<DF> Concat4(const DF df, V v0, V v1, V v2, V v3) {
     36   using HF = Half<DF>;
     37   return Combine(DF(), Combine(HF(), v3, v2), Combine(HF(), v1, v0));
     38 }
     39 
     40 #endif
     41 
     42 // Stores v0[0], v1[0], v0[1], v1[1], ... to mem, in this order. Mem must be
     43 // aligned.
     44 template <class DF, class V, typename T>
     45 void StoreInterleaved(const DF df, V v0, V v1, T* mem) {
     46   static_assert(sizeof(T) == 4, "only use StoreInterleaved for 4-byte types");
     47 #if HWY_TARGET == HWY_SCALAR
     48   Store(v0, df, mem);
     49   Store(v1, df, mem + 1);
     50 #elif !HWY_CAP_GE256
     51   Store(InterleaveLower(df, v0, v1), df, mem);
     52   Store(InterleaveUpper(df, v0, v1), df, mem + Lanes(df));
     53 #else
     54   if (!HWY_CAP_GE512 || Lanes(df) == 8) {
     55     auto t0 = InterleaveLower(df, v0, v1);
     56     auto t1 = InterleaveUpper(df, v0, v1);
     57     Store(ConcatLowerLower(df, t1, t0), df, mem);
     58     Store(ConcatUpperUpper(df, t1, t0), df, mem + Lanes(df));
     59   } else {
     60 #if HWY_CAP_GE512
     61     auto t0 = InterleaveLower(df, v0, v1);
     62     auto t1 = InterleaveUpper(df, v0, v1);
     63     Store(Concat4(df, Quarter<0>(df, t0), Quarter<0>(df, t1),
     64                   Quarter<1>(df, t0), Quarter<1>(df, t1)),
     65           df, mem);
     66     Store(Concat4(df, Quarter<2>(df, t0), Quarter<2>(df, t1),
     67                   Quarter<3>(df, t0), Quarter<3>(df, t1)),
     68           df, mem + Lanes(df));
     69 #endif
     70   }
     71 #endif
     72 }
     73 
     74 // Stores v0[0], v1[0], v2[0], v3[0], v0[1] ... to mem, in this order. Mem must
     75 // be aligned.
     76 template <class DF, class V, typename T>
     77 void StoreInterleaved(const DF df, V v0, V v1, V v2, V v3, T* mem) {
     78   static_assert(sizeof(T) == 4, "only use StoreInterleaved for 4-byte types");
     79 #if HWY_TARGET == HWY_SCALAR
     80   Store(v0, df, mem);
     81   Store(v1, df, mem + 1);
     82   Store(v2, df, mem + 2);
     83   Store(v3, df, mem + 3);
     84 #elif !HWY_CAP_GE256
     85   auto t0 = InterleaveLower(df, v0, v2);
     86   auto t1 = InterleaveLower(df, v1, v3);
     87   auto t2 = InterleaveUpper(df, v0, v2);
     88   auto t3 = InterleaveUpper(df, v1, v3);
     89   Store(InterleaveLower(df, t0, t1), df, mem);
     90   Store(InterleaveUpper(df, t0, t1), df, mem + Lanes(df));
     91   Store(InterleaveLower(df, t2, t3), df, mem + 2 * Lanes(df));
     92   Store(InterleaveUpper(df, t2, t3), df, mem + 3 * Lanes(df));
     93 #elif !HWY_CAP_GE512
     94   auto t0 = InterleaveLower(df, v0, v2);
     95   auto t1 = InterleaveLower(df, v1, v3);
     96   auto t2 = InterleaveUpper(df, v0, v2);
     97   auto t3 = InterleaveUpper(df, v1, v3);
     98 
     99   auto m0 = InterleaveLower(df, t0, t1);
    100   auto m1 = InterleaveUpper(df, t0, t1);
    101   auto m2 = InterleaveLower(df, t2, t3);
    102   auto m3 = InterleaveUpper(df, t2, t3);
    103 
    104   Store(ConcatLowerLower(df, m1, m0), df, mem);
    105   Store(ConcatLowerLower(df, m3, m2), df, mem + Lanes(df));
    106   Store(ConcatUpperUpper(df, m1, m0), df, mem + 2 * Lanes(df));
    107   Store(ConcatUpperUpper(df, m3, m2), df, mem + 3 * Lanes(df));
    108 #else
    109   auto t0 = InterleaveLower(df, v0, v2);
    110   auto t1 = InterleaveLower(df, v1, v3);
    111   auto t2 = InterleaveUpper(df, v0, v2);
    112   auto t3 = InterleaveUpper(df, v1, v3);
    113 
    114   auto m0 = InterleaveLower(df, t0, t1);
    115   auto m1 = InterleaveUpper(df, t0, t1);
    116   auto m2 = InterleaveLower(df, t2, t3);
    117   auto m3 = InterleaveUpper(df, t2, t3);
    118 
    119   Store(Concat4(df, Quarter<0>(df, m0), Quarter<0>(df, m1), Quarter<0>(df, m2),
    120                 Quarter<0>(df, m3)),
    121         df, mem);
    122   Store(Concat4(df, Quarter<1>(df, m0), Quarter<1>(df, m1), Quarter<1>(df, m2),
    123                 Quarter<1>(df, m3)),
    124         df, mem + Lanes(df));
    125   Store(Concat4(df, Quarter<2>(df, m0), Quarter<2>(df, m1), Quarter<2>(df, m2),
    126                 Quarter<2>(df, m3)),
    127         df, mem + 2 * Lanes(df));
    128   Store(Concat4(df, Quarter<3>(df, m0), Quarter<3>(df, m1), Quarter<3>(df, m2),
    129                 Quarter<3>(df, m3)),
    130         df, mem + 3 * Lanes(df));
    131 #endif
    132 }
    133 
    134 // Stores v0[0], v1[0], v2[0], v3[0], v4[0], v5[0], v6[0], v7[0], v0[1] ... to
    135 // mem, in this order. Mem must be aligned.
    136 template <class DF, class V>
    137 void StoreInterleaved(const DF df, V v0, V v1, V v2, V v3, V v4, V v5, V v6,
    138                       V v7, float* mem) {
    139 #if HWY_TARGET == HWY_SCALAR
    140   Store(v0, df, mem);
    141   Store(v1, df, mem + 1);
    142   Store(v2, df, mem + 2);
    143   Store(v3, df, mem + 3);
    144   Store(v4, df, mem + 4);
    145   Store(v5, df, mem + 5);
    146   Store(v6, df, mem + 6);
    147   Store(v7, df, mem + 7);
    148 #elif !HWY_CAP_GE256
    149   auto t0 = InterleaveLower(df, v0, v4);
    150   auto t1 = InterleaveLower(df, v1, v5);
    151   auto t2 = InterleaveLower(df, v2, v6);
    152   auto t3 = InterleaveLower(df, v3, v7);
    153   auto t4 = InterleaveUpper(df, v0, v4);
    154   auto t5 = InterleaveUpper(df, v1, v5);
    155   auto t6 = InterleaveUpper(df, v2, v6);
    156   auto t7 = InterleaveUpper(df, v3, v7);
    157 
    158   auto w0 = InterleaveLower(df, t0, t2);
    159   auto w1 = InterleaveLower(df, t1, t3);
    160   auto w2 = InterleaveUpper(df, t0, t2);
    161   auto w3 = InterleaveUpper(df, t1, t3);
    162   auto w4 = InterleaveLower(df, t4, t6);
    163   auto w5 = InterleaveLower(df, t5, t7);
    164   auto w6 = InterleaveUpper(df, t4, t6);
    165   auto w7 = InterleaveUpper(df, t5, t7);
    166 
    167   Store(InterleaveLower(df, w0, w1), df, mem);
    168   Store(InterleaveUpper(df, w0, w1), df, mem + Lanes(df));
    169   Store(InterleaveLower(df, w2, w3), df, mem + 2 * Lanes(df));
    170   Store(InterleaveUpper(df, w2, w3), df, mem + 3 * Lanes(df));
    171   Store(InterleaveLower(df, w4, w5), df, mem + 4 * Lanes(df));
    172   Store(InterleaveUpper(df, w4, w5), df, mem + 5 * Lanes(df));
    173   Store(InterleaveLower(df, w6, w7), df, mem + 6 * Lanes(df));
    174   Store(InterleaveUpper(df, w6, w7), df, mem + 7 * Lanes(df));
    175 #elif !HWY_CAP_GE512
    176   auto t0 = InterleaveLower(df, v0, v4);
    177   auto t1 = InterleaveLower(df, v1, v5);
    178   auto t2 = InterleaveLower(df, v2, v6);
    179   auto t3 = InterleaveLower(df, v3, v7);
    180   auto t4 = InterleaveUpper(df, v0, v4);
    181   auto t5 = InterleaveUpper(df, v1, v5);
    182   auto t6 = InterleaveUpper(df, v2, v6);
    183   auto t7 = InterleaveUpper(df, v3, v7);
    184 
    185   auto w0 = InterleaveLower(df, t0, t2);
    186   auto w1 = InterleaveLower(df, t1, t3);
    187   auto w2 = InterleaveUpper(df, t0, t2);
    188   auto w3 = InterleaveUpper(df, t1, t3);
    189   auto w4 = InterleaveLower(df, t4, t6);
    190   auto w5 = InterleaveLower(df, t5, t7);
    191   auto w6 = InterleaveUpper(df, t4, t6);
    192   auto w7 = InterleaveUpper(df, t5, t7);
    193 
    194   auto m0 = InterleaveLower(df, w0, w1);
    195   auto m1 = InterleaveUpper(df, w0, w1);
    196   auto m2 = InterleaveLower(df, w2, w3);
    197   auto m3 = InterleaveUpper(df, w2, w3);
    198   auto m4 = InterleaveLower(df, w4, w5);
    199   auto m5 = InterleaveUpper(df, w4, w5);
    200   auto m6 = InterleaveLower(df, w6, w7);
    201   auto m7 = InterleaveUpper(df, w6, w7);
    202 
    203   Store(ConcatLowerLower(df, m1, m0), df, mem);
    204   Store(ConcatLowerLower(df, m3, m2), df, mem + Lanes(df));
    205   Store(ConcatLowerLower(df, m5, m4), df, mem + 2 * Lanes(df));
    206   Store(ConcatLowerLower(df, m7, m6), df, mem + 3 * Lanes(df));
    207   Store(ConcatUpperUpper(df, m1, m0), df, mem + 4 * Lanes(df));
    208   Store(ConcatUpperUpper(df, m3, m2), df, mem + 5 * Lanes(df));
    209   Store(ConcatUpperUpper(df, m5, m4), df, mem + 6 * Lanes(df));
    210   Store(ConcatUpperUpper(df, m7, m6), df, mem + 7 * Lanes(df));
    211 #else
    212   auto t0 = InterleaveLower(df, v0, v4);
    213   auto t1 = InterleaveLower(df, v1, v5);
    214   auto t2 = InterleaveLower(df, v2, v6);
    215   auto t3 = InterleaveLower(df, v3, v7);
    216   auto t4 = InterleaveUpper(df, v0, v4);
    217   auto t5 = InterleaveUpper(df, v1, v5);
    218   auto t6 = InterleaveUpper(df, v2, v6);
    219   auto t7 = InterleaveUpper(df, v3, v7);
    220 
    221   auto w0 = InterleaveLower(df, t0, t2);
    222   auto w1 = InterleaveLower(df, t1, t3);
    223   auto w2 = InterleaveUpper(df, t0, t2);
    224   auto w3 = InterleaveUpper(df, t1, t3);
    225   auto w4 = InterleaveLower(df, t4, t6);
    226   auto w5 = InterleaveLower(df, t5, t7);
    227   auto w6 = InterleaveUpper(df, t4, t6);
    228   auto w7 = InterleaveUpper(df, t5, t7);
    229 
    230   auto m0 = InterleaveLower(df, w0, w1);
    231   auto m1 = InterleaveUpper(df, w0, w1);
    232   auto m2 = InterleaveLower(df, w2, w3);
    233   auto m3 = InterleaveUpper(df, w2, w3);
    234   auto m4 = InterleaveLower(df, w4, w5);
    235   auto m5 = InterleaveUpper(df, w4, w5);
    236   auto m6 = InterleaveLower(df, w6, w7);
    237   auto m7 = InterleaveUpper(df, w6, w7);
    238 
    239   Store(Concat4(df, Quarter<0>(df, m0), Quarter<0>(df, m1), Quarter<0>(df, m2),
    240                 Quarter<0>(df, m3)),
    241         df, mem);
    242   Store(Concat4(df, Quarter<0>(df, m4), Quarter<0>(df, m5), Quarter<0>(df, m6),
    243                 Quarter<0>(df, m7)),
    244         df, mem + Lanes(df));
    245   Store(Concat4(df, Quarter<1>(df, m0), Quarter<1>(df, m1), Quarter<1>(df, m2),
    246                 Quarter<1>(df, m3)),
    247         df, mem + 2 * Lanes(df));
    248   Store(Concat4(df, Quarter<1>(df, m4), Quarter<1>(df, m5), Quarter<1>(df, m6),
    249                 Quarter<1>(df, m7)),
    250         df, mem + 3 * Lanes(df));
    251   Store(Concat4(df, Quarter<2>(df, m0), Quarter<2>(df, m1), Quarter<2>(df, m2),
    252                 Quarter<2>(df, m3)),
    253         df, mem + 4 * Lanes(df));
    254   Store(Concat4(df, Quarter<2>(df, m4), Quarter<2>(df, m5), Quarter<2>(df, m6),
    255                 Quarter<2>(df, m7)),
    256         df, mem + 5 * Lanes(df));
    257   Store(Concat4(df, Quarter<3>(df, m0), Quarter<3>(df, m1), Quarter<3>(df, m2),
    258                 Quarter<3>(df, m3)),
    259         df, mem + 6 * Lanes(df));
    260   Store(Concat4(df, Quarter<3>(df, m4), Quarter<3>(df, m5), Quarter<3>(df, m6),
    261                 Quarter<3>(df, m7)),
    262         df, mem + 7 * Lanes(df));
    263 #endif
    264 }
    265 
    266 #if HWY_CAP_GE256
    267 JXL_INLINE void Transpose8x8Block(const int32_t* JXL_RESTRICT from,
    268                                   int32_t* JXL_RESTRICT to, size_t fromstride) {
    269   const HWY_CAPPED(int32_t, 8) d;
    270   auto i0 = Load(d, from);
    271   auto i1 = Load(d, from + 1 * fromstride);
    272   auto i2 = Load(d, from + 2 * fromstride);
    273   auto i3 = Load(d, from + 3 * fromstride);
    274   auto i4 = Load(d, from + 4 * fromstride);
    275   auto i5 = Load(d, from + 5 * fromstride);
    276   auto i6 = Load(d, from + 6 * fromstride);
    277   auto i7 = Load(d, from + 7 * fromstride);
    278 
    279   const auto q0 = InterleaveLower(d, i0, i2);
    280   const auto q1 = InterleaveLower(d, i1, i3);
    281   const auto q2 = InterleaveUpper(d, i0, i2);
    282   const auto q3 = InterleaveUpper(d, i1, i3);
    283   const auto q4 = InterleaveLower(d, i4, i6);
    284   const auto q5 = InterleaveLower(d, i5, i7);
    285   const auto q6 = InterleaveUpper(d, i4, i6);
    286   const auto q7 = InterleaveUpper(d, i5, i7);
    287 
    288   const auto r0 = InterleaveLower(d, q0, q1);
    289   const auto r1 = InterleaveUpper(d, q0, q1);
    290   const auto r2 = InterleaveLower(d, q2, q3);
    291   const auto r3 = InterleaveUpper(d, q2, q3);
    292   const auto r4 = InterleaveLower(d, q4, q5);
    293   const auto r5 = InterleaveUpper(d, q4, q5);
    294   const auto r6 = InterleaveLower(d, q6, q7);
    295   const auto r7 = InterleaveUpper(d, q6, q7);
    296 
    297   i0 = ConcatLowerLower(d, r4, r0);
    298   i1 = ConcatLowerLower(d, r5, r1);
    299   i2 = ConcatLowerLower(d, r6, r2);
    300   i3 = ConcatLowerLower(d, r7, r3);
    301   i4 = ConcatUpperUpper(d, r4, r0);
    302   i5 = ConcatUpperUpper(d, r5, r1);
    303   i6 = ConcatUpperUpper(d, r6, r2);
    304   i7 = ConcatUpperUpper(d, r7, r3);
    305 
    306   Store(i0, d, to);
    307   Store(i1, d, to + 1 * 8);
    308   Store(i2, d, to + 2 * 8);
    309   Store(i3, d, to + 3 * 8);
    310   Store(i4, d, to + 4 * 8);
    311   Store(i5, d, to + 5 * 8);
    312   Store(i6, d, to + 6 * 8);
    313   Store(i7, d, to + 7 * 8);
    314 }
    315 #elif HWY_TARGET != HWY_SCALAR
    316 JXL_INLINE void Transpose8x8Block(const int32_t* JXL_RESTRICT from,
    317                                   int32_t* JXL_RESTRICT to, size_t fromstride) {
    318   const HWY_CAPPED(int32_t, 4) d;
    319   for (size_t n = 0; n < 8; n += 4) {
    320     for (size_t m = 0; m < 8; m += 4) {
    321       auto p0 = Load(d, from + n * fromstride + m);
    322       auto p1 = Load(d, from + (n + 1) * fromstride + m);
    323       auto p2 = Load(d, from + (n + 2) * fromstride + m);
    324       auto p3 = Load(d, from + (n + 3) * fromstride + m);
    325       const auto q0 = InterleaveLower(d, p0, p2);
    326       const auto q1 = InterleaveLower(d, p1, p3);
    327       const auto q2 = InterleaveUpper(d, p0, p2);
    328       const auto q3 = InterleaveUpper(d, p1, p3);
    329 
    330       const auto r0 = InterleaveLower(d, q0, q1);
    331       const auto r1 = InterleaveUpper(d, q0, q1);
    332       const auto r2 = InterleaveLower(d, q2, q3);
    333       const auto r3 = InterleaveUpper(d, q2, q3);
    334       Store(r0, d, to + m * 8 + n);
    335       Store(r1, d, to + (1 + m) * 8 + n);
    336       Store(r2, d, to + (2 + m) * 8 + n);
    337       Store(r3, d, to + (3 + m) * 8 + n);
    338     }
    339   }
    340 }
    341 
    342 #endif
    343 
    344 // NOLINTNEXTLINE(google-readability-namespace-comments)
    345 }  // namespace HWY_NAMESPACE
    346 }  // namespace jxl
    347 HWY_AFTER_NAMESPACE();
    348 
    349 #endif  // LIB_JXL_SIMD_UTIL_INL_H_