libjxl

FORK: libjxl patches used on blog
git clone https://git.neptards.moe/blog/libjxl.git
Log | Files | Refs | Submodules | README | LICENSE

fast_dct16-inl.h (8037B)


      1 // Copyright (c) the JPEG XL Project Authors. All rights reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style
      4 // license that can be found in the LICENSE file.
      5 
      6 /* This file is automatically generated. Do not modify it directly. */
      7 #if HWY_TARGET != HWY_NEON
      8 #error "only include this file from fast_dct-inl.h"
      9 #endif
     10 
     11 constexpr size_t FastIDCTIntegerBits(FastDCTTag<16>) { return 1; }
     12 
     13 void FastIDCT(FastDCTTag<16>, const int16_t* in, size_t in_stride, int16_t* out,
     14               size_t out_stride, size_t count) {
          // One-dimensional 16-point inverse DCT over 16-bit fixed-point data,
          // tag-dispatched for N=16. Reads 16 input rows (in + in_stride * r,
          // r = 0..15) and writes 16 output rows (out + out_stride * r),
          // processing `count` lanes in batches of 8 with NEON.
          //
          // vqrdmulhq_n_s16(v, c) is a saturating rounding doubling multiply
          // returning the high half, i.e. round(v * c / 2^15) — a Q15 multiply
          // by the constant c. Pairs like `x_tmp = vqrdmulhq_n_s16(x, c);
          // x = vaddq_s16(x_tmp, x)` therefore compute x * (1 + c/2^15),
          // e.g. 13573/2^15 ~= sqrt(2) - 1, so that pair is ~= x * sqrt(2).
          // vmlaq_n_s16(acc, x, k) is acc + x * k, used for factors > 2.
          // The constants are presumably scaled cosine rotation factors from
          // the generator's DCT factorization — this file is machine-generated
          // (see header), so do not hand-tune them.
     15   JXL_ASSERT(count % 8 == 0);
     16   for (size_t i = 0; i < count; i += 8) {
            // Even part, first stage: spectral rows 0, 8, 4, 12.
     17     int16x8_t v0 = vld1q_s16(in + in_stride * 0 + i);
     18     int16x8_t v1 = vld1q_s16(in + in_stride * 8 + i);
     19     int16x8_t v2 = vaddq_s16(v0, v1);
     20     int16x8_t v3 = vld1q_s16(in + in_stride * 4 + i);
     21     int16x8_t v4_tmp = vqrdmulhq_n_s16(v3, 13573);
     22     int16x8_t v4 = vaddq_s16(v4_tmp, v3);
     23     int16x8_t v5 = vld1q_s16(in + in_stride * 12 + i);
     24     int16x8_t v6 = vaddq_s16(v5, v3);
     25     int16x8_t v7 = vaddq_s16(v4, v6);
     26     int16x8_t v8 = vqrdmulhq_n_s16(v7, 17734);
     27     int16x8_t v9 = vaddq_s16(v2, v8);
            // Even part, second stage: spectral rows 2, 10, 6, 14.
     28     int16x8_t v10 = vld1q_s16(in + in_stride * 2 + i);
     29     int16x8_t v11_tmp = vqrdmulhq_n_s16(v10, 13573);
     30     int16x8_t v11 = vaddq_s16(v11_tmp, v10);
     31     int16x8_t v12 = vld1q_s16(in + in_stride * 10 + i);
     32     int16x8_t v13 = vld1q_s16(in + in_stride * 6 + i);
     33     int16x8_t v14 = vaddq_s16(v12, v13);
     34     int16x8_t v15 = vaddq_s16(v11, v14);
     35     int16x8_t v16 = vaddq_s16(v13, v10);
     36     int16x8_t v17 = vqrdmulhq_n_s16(v16, 25080);
     37     int16x8_t v18 = vld1q_s16(in + in_stride * 14 + i);
     38     int16x8_t v19 = vaddq_s16(v18, v12);
     39     int16x8_t v20 = vaddq_s16(v16, v19);
     40     int16x8_t v21 = vqrdmulhq_n_s16(v20, 17734);
     41     int16x8_t v22 = vaddq_s16(v17, v21);
     42     int16x8_t v23 = vaddq_s16(v15, v22);
     43     int16x8_t v24 = vqrdmulhq_n_s16(v23, 16705);
     44     int16x8_t v25 = vaddq_s16(v9, v24);
            // Odd part: spectral rows 15, 13, 11, 9 ...
     45     int16x8_t v26 = vld1q_s16(in + in_stride * 15 + i);
     46     int16x8_t v27 = vld1q_s16(in + in_stride * 13 + i);
     47     int16x8_t v28 = vaddq_s16(v26, v27);
     48     int16x8_t v29 = vld1q_s16(in + in_stride * 11 + i);
     49     int16x8_t v30 = vld1q_s16(in + in_stride * 9 + i);
     50     int16x8_t v31 = vaddq_s16(v29, v30);
     51     int16x8_t v32 = vaddq_s16(v28, v31);
     52     int16x8_t v33 = vqrdmulhq_n_s16(v32, 17734);
            // ... and spectral rows 3, 1, 7, 5.
     53     int16x8_t v34 = vld1q_s16(in + in_stride * 3 + i);
     54     int16x8_t v35 = vld1q_s16(in + in_stride * 1 + i);
     55     int16x8_t v36 = vaddq_s16(v34, v35);
     56     int16x8_t v37 = vld1q_s16(in + in_stride * 7 + i);
     57     int16x8_t v38 = vld1q_s16(in + in_stride * 5 + i);
     58     int16x8_t v39 = vaddq_s16(v37, v38);
     59     int16x8_t v40 = vaddq_s16(v36, v39);
     60     int16x8_t v41_tmp = vqrdmulhq_n_s16(v40, 10045);
     61     int16x8_t v41 = vaddq_s16(v41_tmp, v40);
     62     int16x8_t v42 = vaddq_s16(v33, v41);
     63     int16x8_t v43 = vqrdmulhq_n_s16(v42, 16705);
     64     int16x8_t v44_tmp = vqrdmulhq_n_s16(v36, 13573);
     65     int16x8_t v44 = vaddq_s16(v44_tmp, v36);
     66     int16x8_t v45 = vaddq_s16(v39, v31);
     67     int16x8_t v46 = vaddq_s16(v44, v45);
     68     int16x8_t v47 = vqrdmulhq_n_s16(v46, 16705);
     69     int16x8_t v48 = vaddq_s16(v43, v47);
     70     int16x8_t v49_tmp = vqrdmulhq_n_s16(v35, 13573);
     71     int16x8_t v49 = vaddq_s16(v49_tmp, v35);
     72     int16x8_t v50 = vaddq_s16(v30, v37);
     73     int16x8_t v51 = vaddq_s16(v49, v50);
     74     int16x8_t v52 = vaddq_s16(v38, v34);
     75     int16x8_t v53 = vaddq_s16(v27, v29);
     76     int16x8_t v54 = vaddq_s16(v52, v53);
     77     int16x8_t v55 = vqrdmulhq_n_s16(v54, 17734);
     78     int16x8_t v56 = vqrdmulhq_n_s16(v52, 25080);
     79     int16x8_t v57 = vaddq_s16(v55, v56);
     80     int16x8_t v58 = vaddq_s16(v51, v57);
     81     int16x8_t v59 = vaddq_s16(v48, v58);
     82     int16x8_t v60 = vqrdmulhq_n_s16(v59, 16463);
            // v61 = sum of even and odd parts -> output row 0.
     83     int16x8_t v61 = vaddq_s16(v25, v60);
            // Difference halves of the butterflies above (vsubq mirrors each
            // earlier vaddq pairing); these feed output rows 1..15.
     84     int16x8_t v62 = vsubq_s16(v0, v1);
     85     int16x8_t v63 = vsubq_s16(v4, v6);
     86     int16x8_t v64_tmp = vqrdmulhq_n_s16(v63, 10045);
     87     int16x8_t v64 = vaddq_s16(v64_tmp, v63);
     88     int16x8_t v65 = vaddq_s16(v62, v64);
     89     int16x8_t v66 = vsubq_s16(v11, v14);
     90     int16x8_t v67 = vqrdmulhq_n_s16(v16, 17734);
     91     int16x8_t v68_tmp = vqrdmulhq_n_s16(v19, 10045);
     92     int16x8_t v68 = vaddq_s16(v68_tmp, v19);
     93     int16x8_t v69 = vsubq_s16(v67, v68);
     94     int16x8_t v70 = vaddq_s16(v66, v69);
     95     int16x8_t v71 = vqrdmulhq_n_s16(v70, 19705);
     96     int16x8_t v72 = vaddq_s16(v65, v71);
     97     int16x8_t v73 = vsubq_s16(v49, v50);
     98     int16x8_t v74 = vqrdmulhq_n_s16(v52, 17734);
     99     int16x8_t v75_tmp = vqrdmulhq_n_s16(v53, 10045);
    100     int16x8_t v75 = vaddq_s16(v75_tmp, v53);
    101     int16x8_t v76 = vsubq_s16(v74, v75);
    102     int16x8_t v77 = vaddq_s16(v73, v76);
    103     int16x8_t v78 = vsubq_s16(v44, v45);
    104     int16x8_t v79 = vqrdmulhq_n_s16(v78, 19705);
    105     int16x8_t v80 = vqrdmulhq_n_s16(v40, 13573);
    106     int16x8_t v81 = vsubq_s16(v80, v32);
    107     int16x8_t v82 = vqrdmulhq_n_s16(v81, 25746);
    108     int16x8_t v83 = vaddq_s16(v79, v82);
    109     int16x8_t v84 = vaddq_s16(v77, v83);
    110     int16x8_t v85 = vqrdmulhq_n_s16(v84, 17121);
    111     int16x8_t v86 = vaddq_s16(v72, v85);
    112     int16x8_t v87 = vsubq_s16(v62, v64);
    113     int16x8_t v88 = vsubq_s16(v66, v69);
    114     int16x8_t v89 = vqrdmulhq_n_s16(v88, 29490);
    115     int16x8_t v90 = vaddq_s16(v87, v89);
    116     int16x8_t v91 = vsubq_s16(v73, v76);
    117     int16x8_t v92 = vqrdmulhq_n_s16(v78, 29490);
    118     int16x8_t v93_tmp = vqrdmulhq_n_s16(v81, 5763);
    119     int16x8_t v93 = vaddq_s16(v93_tmp, v81);
    120     int16x8_t v94 = vsubq_s16(v92, v93);
    121     int16x8_t v95 = vaddq_s16(v91, v94);
    122     int16x8_t v96 = vqrdmulhq_n_s16(v95, 18578);
    123     int16x8_t v97 = vaddq_s16(v90, v96);
    124     int16x8_t v98 = vsubq_s16(v46, v42);
            // _tmp + 2*x via vmlaq: multiply by (2 + 18446/2^15), a factor > 1
            // that a single Q15 multiply cannot represent.
    125     int16x8_t v99_tmp = vqrdmulhq_n_s16(v98, 18446);
    126     int16x8_t v99 = vmlaq_n_s16(v99_tmp, v98, 2);
    127     int16x8_t v100 = vsubq_s16(v51, v57);
    128     int16x8_t v101 = vaddq_s16(v99, v100);
    129     int16x8_t v102 = vqrdmulhq_n_s16(v101, 21195);
    130     int16x8_t v103 = vsubq_s16(v2, v8);
    131     int16x8_t v104 = vsubq_s16(v15, v22);
    132     int16x8_t v105_tmp = vqrdmulhq_n_s16(v104, 18446);
    133     int16x8_t v105 = vmlaq_n_s16(v105_tmp, v104, 2);
    134     int16x8_t v106 = vaddq_s16(v103, v105);
    135     int16x8_t v107 = vaddq_s16(v102, v106);
    136     int16x8_t v108 = vsubq_s16(v103, v105);
    137     int16x8_t v109 = vsubq_s16(v100, v99);
    138     int16x8_t v110 = vqrdmulhq_n_s16(v109, 25826);
    139     int16x8_t v111 = vaddq_s16(v108, v110);
    140     int16x8_t v112 = vsubq_s16(v87, v89);
    141     int16x8_t v113 = vsubq_s16(v91, v94);
    142     int16x8_t v114_tmp = vqrdmulhq_n_s16(v113, 1988);
    143     int16x8_t v114 = vaddq_s16(v114_tmp, v113);
    144     int16x8_t v115 = vaddq_s16(v112, v114);
    145     int16x8_t v116 = vsubq_s16(v65, v71);
    146     int16x8_t v117 = vsubq_s16(v77, v83);
    147     int16x8_t v118_tmp = vqrdmulhq_n_s16(v117, 23673);
    148     int16x8_t v118 = vaddq_s16(v118_tmp, v117);
    149     int16x8_t v119 = vaddq_s16(v116, v118);
    150     int16x8_t v120 = vsubq_s16(v58, v48);
            // Factor (5 + 3314/2^15): largest rotation scale in this network.
    151     int16x8_t v121_tmp = vqrdmulhq_n_s16(v120, 3314);
    152     int16x8_t v121 = vmlaq_n_s16(v121_tmp, v120, 5);
    153     int16x8_t v122 = vsubq_s16(v9, v24);
    154     int16x8_t v123 = vaddq_s16(v121, v122);
    155     int16x8_t v124 = vsubq_s16(v122, v121);
    156     int16x8_t v125 = vsubq_s16(v116, v118);
    157     int16x8_t v126 = vsubq_s16(v112, v114);
    158     int16x8_t v127 = vsubq_s16(v108, v110);
    159     int16x8_t v128 = vsubq_s16(v106, v102);
    160     int16x8_t v129 = vsubq_s16(v90, v96);
    161     int16x8_t v130 = vsubq_s16(v72, v85);
    162     int16x8_t v131 = vsubq_s16(v25, v60);
            // Store the 16 spatial-domain rows for this batch of 8 lanes.
    163     vst1q_s16(out + out_stride * 0 + i, v61);
    164     vst1q_s16(out + out_stride * 1 + i, v86);
    165     vst1q_s16(out + out_stride * 2 + i, v97);
    166     vst1q_s16(out + out_stride * 3 + i, v107);
    167     vst1q_s16(out + out_stride * 4 + i, v111);
    168     vst1q_s16(out + out_stride * 5 + i, v115);
    169     vst1q_s16(out + out_stride * 6 + i, v119);
    170     vst1q_s16(out + out_stride * 7 + i, v123);
    171     vst1q_s16(out + out_stride * 8 + i, v124);
    172     vst1q_s16(out + out_stride * 9 + i, v125);
    173     vst1q_s16(out + out_stride * 10 + i, v126);
    174     vst1q_s16(out + out_stride * 11 + i, v127);
    175     vst1q_s16(out + out_stride * 12 + i, v128);
    176     vst1q_s16(out + out_stride * 13 + i, v129);
    177     vst1q_s16(out + out_stride * 14 + i, v130);
    178     vst1q_s16(out + out_stride * 15 + i, v131);
    179   }
    180 }