libjxl

FORK: libjxl patches used on blog
git clone https://git.neptards.moe/blog/libjxl.git
Log | Files | Refs | Submodules | README | LICENSE

fast_dct8-inl.h (3411B)


      1 // Copyright (c) the JPEG XL Project Authors. All rights reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style
      4 // license that can be found in the LICENSE file.
      5 
      6 /* This file is automatically generated. Do not modify it directly. */
      7 #if HWY_TARGET != HWY_NEON
      8 #error "only include this file from fast_dct-inl.h"
      9 #endif
     10 
     11 constexpr size_t FastIDCTIntegerBits(FastDCTTag<8>) { return 1; }
     12 
     13 void FastIDCT(FastDCTTag<8>, const int16_t* in, size_t in_stride, int16_t* out,
     14               size_t out_stride, size_t count) {
     15   JXL_ASSERT(count % 8 == 0);
     16   for (size_t i = 0; i < count; i += 8) {
     17     int16x8_t v0 = vld1q_s16(in + in_stride * 0 + i);
     18     int16x8_t v1 = vld1q_s16(in + in_stride * 4 + i);
     19     int16x8_t v2 = vaddq_s16(v0, v1);
     20     int16x8_t v3 = vld1q_s16(in + in_stride * 2 + i);
     21     int16x8_t v4_tmp = vqrdmulhq_n_s16(v3, 13573);
     22     int16x8_t v4 = vaddq_s16(v4_tmp, v3);
     23     int16x8_t v5 = vld1q_s16(in + in_stride * 6 + i);
     24     int16x8_t v6 = vaddq_s16(v5, v3);
     25     int16x8_t v7 = vaddq_s16(v4, v6);
     26     int16x8_t v8 = vqrdmulhq_n_s16(v7, 17734);
     27     int16x8_t v9 = vaddq_s16(v2, v8);
     28     int16x8_t v10 = vld1q_s16(in + in_stride * 1 + i);
     29     int16x8_t v11_tmp = vqrdmulhq_n_s16(v10, 13573);
     30     int16x8_t v11 = vaddq_s16(v11_tmp, v10);
     31     int16x8_t v12 = vld1q_s16(in + in_stride * 5 + i);
     32     int16x8_t v13 = vld1q_s16(in + in_stride * 3 + i);
     33     int16x8_t v14 = vaddq_s16(v12, v13);
     34     int16x8_t v15 = vaddq_s16(v11, v14);
     35     int16x8_t v16 = vaddq_s16(v13, v10);
     36     int16x8_t v17 = vqrdmulhq_n_s16(v16, 25080);
     37     int16x8_t v18 = vld1q_s16(in + in_stride * 7 + i);
     38     int16x8_t v19 = vaddq_s16(v18, v12);
     39     int16x8_t v20 = vaddq_s16(v16, v19);
     40     int16x8_t v21 = vqrdmulhq_n_s16(v20, 17734);
     41     int16x8_t v22 = vaddq_s16(v17, v21);
     42     int16x8_t v23 = vaddq_s16(v15, v22);
     43     int16x8_t v24 = vqrdmulhq_n_s16(v23, 16705);
     44     int16x8_t v25 = vaddq_s16(v9, v24);
     45     int16x8_t v26 = vsubq_s16(v0, v1);
     46     int16x8_t v27 = vsubq_s16(v4, v6);
     47     int16x8_t v28_tmp = vqrdmulhq_n_s16(v27, 10045);
     48     int16x8_t v28 = vaddq_s16(v28_tmp, v27);
     49     int16x8_t v29 = vaddq_s16(v26, v28);
     50     int16x8_t v30 = vsubq_s16(v11, v14);
     51     int16x8_t v31 = vqrdmulhq_n_s16(v16, 17734);
     52     int16x8_t v32_tmp = vqrdmulhq_n_s16(v19, 10045);
     53     int16x8_t v32 = vaddq_s16(v32_tmp, v19);
     54     int16x8_t v33 = vsubq_s16(v31, v32);
     55     int16x8_t v34 = vaddq_s16(v30, v33);
     56     int16x8_t v35 = vqrdmulhq_n_s16(v34, 19705);
     57     int16x8_t v36 = vaddq_s16(v29, v35);
     58     int16x8_t v37 = vsubq_s16(v26, v28);
     59     int16x8_t v38 = vsubq_s16(v30, v33);
     60     int16x8_t v39 = vqrdmulhq_n_s16(v38, 29490);
     61     int16x8_t v40 = vaddq_s16(v37, v39);
     62     int16x8_t v41 = vsubq_s16(v2, v8);
     63     int16x8_t v42 = vsubq_s16(v15, v22);
     64     int16x8_t v43_tmp = vqrdmulhq_n_s16(v42, 18446);
     65     int16x8_t v43 = vmlaq_n_s16(v43_tmp, v42, 2);
     66     int16x8_t v44 = vaddq_s16(v41, v43);
     67     int16x8_t v45 = vsubq_s16(v41, v43);
     68     int16x8_t v46 = vsubq_s16(v37, v39);
     69     int16x8_t v47 = vsubq_s16(v29, v35);
     70     int16x8_t v48 = vsubq_s16(v9, v24);
     71     vst1q_s16(out + out_stride * 0 + i, v25);
     72     vst1q_s16(out + out_stride * 1 + i, v36);
     73     vst1q_s16(out + out_stride * 2 + i, v40);
     74     vst1q_s16(out + out_stride * 3 + i, v44);
     75     vst1q_s16(out + out_stride * 4 + i, v45);
     76     vst1q_s16(out + out_stride * 5 + i, v46);
     77     vst1q_s16(out + out_stride * 6 + i, v47);
     78     vst1q_s16(out + out_stride * 7 + i, v48);
     79   }
     80 }