libjxl

FORK: libjxl patches used on blog
git clone https://git.neptards.moe/blog/libjxl.git
Log | Files | Refs | Submodules | README | LICENSE

transpose-inl.h (3684B)


      1 // Copyright (c) the JPEG XL Project Authors. All rights reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style
      4 // license that can be found in the LICENSE file.
      5 
      6 #if defined(LIB_JPEGLI_TRANSPOSE_INL_H_) == defined(HWY_TARGET_TOGGLE)
      7 #ifdef LIB_JPEGLI_TRANSPOSE_INL_H_
      8 #undef LIB_JPEGLI_TRANSPOSE_INL_H_
      9 #else
     10 #define LIB_JPEGLI_TRANSPOSE_INL_H_
     11 #endif
     12 
     13 #include "lib/jxl/base/compiler_specific.h"
     14 
     15 HWY_BEFORE_NAMESPACE();
     16 namespace jpegli {
     17 namespace HWY_NAMESPACE {
     18 namespace {
     19 
     20 #if HWY_CAP_GE256
// Transposes an 8x8 block of floats stored row-major with a stride of 8:
// element (row, col) of `from` ends up at (col, row) of `to`.
//
// This variant is selected when HWY_CAP_GE256 is set; each row is handled as
// a single vector of (up to) 8 float lanes.
//
// from: 64 input floats; rows are read at offsets 0, 8, ..., 56.
// to:   64 output floats; rows are written at offsets 0, 8, ..., 56.
// Both pointers are JXL_RESTRICT, i.e. the caller promises they do not alias.
// NOTE(review): Load/Store are Highway's aligned accessors, so both pointers
// presumably have to be aligned for an 8-float vector - confirm at call sites.
JXL_INLINE void Transpose8x8Block(const float* JXL_RESTRICT from,
                                  float* JXL_RESTRICT to) {
  // Vector descriptor for float lanes, capped at 8 lanes.
  const HWY_CAPPED(float, 8) d;
  // Load the eight input rows; i0..i7 are reused below for the output rows.
  auto i0 = Load(d, from);
  auto i1 = Load(d, from + 1 * 8);
  auto i2 = Load(d, from + 2 * 8);
  auto i3 = Load(d, from + 3 * 8);
  auto i4 = Load(d, from + 4 * 8);
  auto i5 = Load(d, from + 5 * 8);
  auto i6 = Load(d, from + 6 * 8);
  auto i7 = Load(d, from + 7 * 8);

  // Stage 1: interleave rows that are two apart (0/2, 1/3, 4/6, 5/7).
  // Rows 0-3 feed q0..q3 and rows 4-7 feed q4..q7.
  const auto q0 = InterleaveLower(d, i0, i2);
  const auto q1 = InterleaveLower(d, i1, i3);
  const auto q2 = InterleaveUpper(d, i0, i2);
  const auto q3 = InterleaveUpper(d, i1, i3);
  const auto q4 = InterleaveLower(d, i4, i6);
  const auto q5 = InterleaveLower(d, i5, i7);
  const auto q6 = InterleaveUpper(d, i4, i6);
  const auto q7 = InterleaveUpper(d, i5, i7);

  // Stage 2: interleave the stage-1 results pairwise. With Highway's
  // per-128-bit-block Interleave semantics (see the Highway quick reference),
  // r0..r3 then hold columns 0..3 of input rows 0-3 in their lower half and
  // columns 4..7 of rows 0-3 in their upper half; r4..r7 hold the same for
  // input rows 4-7.
  const auto r0 = InterleaveLower(d, q0, q1);
  const auto r1 = InterleaveUpper(d, q0, q1);
  const auto r2 = InterleaveLower(d, q2, q3);
  const auto r3 = InterleaveUpper(d, q2, q3);
  const auto r4 = InterleaveLower(d, q4, q5);
  const auto r5 = InterleaveUpper(d, q4, q5);
  const auto r6 = InterleaveLower(d, q6, q7);
  const auto r7 = InterleaveUpper(d, q6, q7);

  // Stage 3: stitch the halves together. The second vector argument supplies
  // the low half of the result and the first supplies the high half, so the
  // rows 0-3 part (r0..r3) comes first and the rows 4-7 part (r4..r7) second.
  // Lower halves give output rows 0..3, upper halves give output rows 4..7.
  i0 = ConcatLowerLower(d, r4, r0);
  i1 = ConcatLowerLower(d, r5, r1);
  i2 = ConcatLowerLower(d, r6, r2);
  i3 = ConcatLowerLower(d, r7, r3);
  i4 = ConcatUpperUpper(d, r4, r0);
  i5 = ConcatUpperUpper(d, r5, r1);
  i6 = ConcatUpperUpper(d, r6, r2);
  i7 = ConcatUpperUpper(d, r7, r3);

  // Write the eight transposed rows.
  Store(i0, d, to);
  Store(i1, d, to + 1 * 8);
  Store(i2, d, to + 2 * 8);
  Store(i3, d, to + 3 * 8);
  Store(i4, d, to + 4 * 8);
  Store(i5, d, to + 5 * 8);
  Store(i6, d, to + 6 * 8);
  Store(i7, d, to + 7 * 8);
}
     69 #elif HWY_TARGET != HWY_SCALAR
     70 JXL_INLINE void Transpose8x8Block(const float* JXL_RESTRICT from,
     71                                   float* JXL_RESTRICT to) {
     72   const HWY_CAPPED(float, 4) d;
     73   for (size_t n = 0; n < 8; n += 4) {
     74     for (size_t m = 0; m < 8; m += 4) {
     75       auto p0 = Load(d, from + n * 8 + m);
     76       auto p1 = Load(d, from + (n + 1) * 8 + m);
     77       auto p2 = Load(d, from + (n + 2) * 8 + m);
     78       auto p3 = Load(d, from + (n + 3) * 8 + m);
     79       const auto q0 = InterleaveLower(d, p0, p2);
     80       const auto q1 = InterleaveLower(d, p1, p3);
     81       const auto q2 = InterleaveUpper(d, p0, p2);
     82       const auto q3 = InterleaveUpper(d, p1, p3);
     83 
     84       const auto r0 = InterleaveLower(d, q0, q1);
     85       const auto r1 = InterleaveUpper(d, q0, q1);
     86       const auto r2 = InterleaveLower(d, q2, q3);
     87       const auto r3 = InterleaveUpper(d, q2, q3);
     88       Store(r0, d, to + m * 8 + n);
     89       Store(r1, d, to + (1 + m) * 8 + n);
     90       Store(r2, d, to + (2 + m) * 8 + n);
     91       Store(r3, d, to + (3 + m) * 8 + n);
     92     }
     93   }
     94 }
     95 #else
     96 static JXL_INLINE void Transpose8x8Block(const float* JXL_RESTRICT from,
     97                                          float* JXL_RESTRICT to) {
     98   for (size_t n = 0; n < 8; ++n) {
     99     for (size_t m = 0; m < 8; ++m) {
    100       to[8 * n + m] = from[8 * m + n];
    101     }
    102   }
    103 }
    104 #endif
    105 
    106 // NOLINTNEXTLINE(google-readability-namespace-comments)
    107 }  // namespace
    108 }  // namespace HWY_NAMESPACE
    109 }  // namespace jpegli
    110 HWY_AFTER_NAMESPACE();
    111 #endif  // LIB_JPEGLI_TRANSPOSE_INL_H_