libjxl

FORK: libjxl patches used on blog
git clone https://git.neptards.moe/blog/libjxl.git
Log | Files | Refs | Submodules | README | LICENSE

fast_dct32-inl.h (19429B)


      1 // Copyright (c) the JPEG XL Project Authors. All rights reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style
      4 // license that can be found in the LICENSE file.
      5 
      6 /* This file is automatically generated. Do not modify it directly. */
      7 #if HWY_TARGET != HWY_NEON
      8 #error "only include this file from fast_dct-inl.h"
      9 #endif
     10 
     // Compile-time constant selected by tag dispatch for the FastDCTTag<32>
     // variant; always evaluates to 1.
     // NOTE(review): presumably the number of integer bits the caller must assume
     // for the fixed-point data handled by FastIDCT(FastDCTTag<32>, ...) -- confirm
     // against the including header (fast_dct-inl.h).
     11 constexpr size_t FastIDCTIntegerBits(FastDCTTag<32>) { return 1; }
     12 
     // FastIDCT overload for FastDCTTag<32>: a fixed-point transform over 32 rows
     // of int16 data written with NEON intrinsics, handling 8 adjacent columns per
     // loop iteration.
     //
     // in / in_stride:   input row k starts at in + in_stride * k (stride counted
     //                   in int16_t elements); rows 0..31 are read.
     // out / out_stride: output row k starts at out + out_stride * k; rows 0..31
     //                   are written.
     // count:            number of columns to process; must be a multiple of 8
     //                   (enforced with JXL_ASSERT).
     //
     // Arithmetic conventions used throughout the body:
     //  - vqrdmulhq_n_s16(x, c) is a saturating, rounding multiply by c / 32768,
     //    so on its own it can only express factors below 1.
     //  - `vN_tmp = vqrdmulhq_n_s16(x, c); vN = vaddq_s16(vN_tmp, x);` therefore
     //    multiplies x by 1 + c / 32768, and `vmlaq_n_s16(vN_tmp, x, k)`
     //    multiplies x by k + c / 32768.
     //  - vaddq_s16 / vsubq_s16 / vmlaq_n_s16 do not saturate (they wrap).
     //
     // Within one iteration every input row is loaded before any output row is
     // stored.
     // NOTE(review): by its name this is presumably a 32-point inverse DCT; the
     // overall scaling convention is not visible in this file -- confirm against
     // fast_dct-inl.h and the generator script.
     13 void FastIDCT(FastDCTTag<32>, const int16_t* in, size_t in_stride, int16_t* out,
     14               size_t out_stride, size_t count) {
     15   JXL_ASSERT(count % 8 == 0);
          // Each iteration covers columns i .. i+7 (one int16x8_t per row).
     16   for (size_t i = 0; i < count; i += 8) {
            // v0..v61: built exclusively from the even-indexed input rows
            // (0, 16, 8, 24, 4, 20, 12, 28, 2, 18, 14, 10, 6, 26, 22, 30).
     17     int16x8_t v0 = vld1q_s16(in + in_stride * 0 + i);
     18     int16x8_t v1 = vld1q_s16(in + in_stride * 16 + i);
     19     int16x8_t v2 = vaddq_s16(v0, v1);
     20     int16x8_t v3 = vld1q_s16(in + in_stride * 8 + i);
            // v4 = v3 * (1 + 13573 / 32768) ~= v3 * 1.41421.
     21     int16x8_t v4_tmp = vqrdmulhq_n_s16(v3, 13573);
     22     int16x8_t v4 = vaddq_s16(v4_tmp, v3);
     23     int16x8_t v5 = vld1q_s16(in + in_stride * 24 + i);
     24     int16x8_t v6 = vaddq_s16(v5, v3);
     25     int16x8_t v7 = vaddq_s16(v4, v6);
     26     int16x8_t v8 = vqrdmulhq_n_s16(v7, 17734);
     27     int16x8_t v9 = vaddq_s16(v2, v8);
     28     int16x8_t v10 = vld1q_s16(in + in_stride * 4 + i);
     29     int16x8_t v11_tmp = vqrdmulhq_n_s16(v10, 13573);
     30     int16x8_t v11 = vaddq_s16(v11_tmp, v10);
     31     int16x8_t v12 = vld1q_s16(in + in_stride * 20 + i);
     32     int16x8_t v13 = vld1q_s16(in + in_stride * 12 + i);
     33     int16x8_t v14 = vaddq_s16(v12, v13);
     34     int16x8_t v15 = vaddq_s16(v11, v14);
     35     int16x8_t v16 = vld1q_s16(in + in_stride * 28 + i);
     36     int16x8_t v17 = vaddq_s16(v16, v12);
     37     int16x8_t v18 = vaddq_s16(v13, v10);
     38     int16x8_t v19 = vaddq_s16(v17, v18);
     39     int16x8_t v20 = vqrdmulhq_n_s16(v19, 17734);
     40     int16x8_t v21 = vqrdmulhq_n_s16(v18, 25080);
     41     int16x8_t v22 = vaddq_s16(v20, v21);
     42     int16x8_t v23 = vaddq_s16(v15, v22);
     43     int16x8_t v24 = vqrdmulhq_n_s16(v23, 16705);
     44     int16x8_t v25 = vaddq_s16(v9, v24);
     45     int16x8_t v26 = vld1q_s16(in + in_stride * 2 + i);
     46     int16x8_t v27_tmp = vqrdmulhq_n_s16(v26, 13573);
     47     int16x8_t v27 = vaddq_s16(v27_tmp, v26);
     48     int16x8_t v28 = vld1q_s16(in + in_stride * 18 + i);
     49     int16x8_t v29 = vld1q_s16(in + in_stride * 14 + i);
     50     int16x8_t v30 = vaddq_s16(v28, v29);
     51     int16x8_t v31 = vaddq_s16(v27, v30);
     52     int16x8_t v32 = vld1q_s16(in + in_stride * 10 + i);
     53     int16x8_t v33 = vld1q_s16(in + in_stride * 6 + i);
     54     int16x8_t v34 = vaddq_s16(v32, v33);
     55     int16x8_t v35 = vqrdmulhq_n_s16(v34, 25080);
     56     int16x8_t v36 = vld1q_s16(in + in_stride * 26 + i);
     57     int16x8_t v37 = vld1q_s16(in + in_stride * 22 + i);
     58     int16x8_t v38 = vaddq_s16(v36, v37);
     59     int16x8_t v39 = vaddq_s16(v38, v34);
     60     int16x8_t v40 = vqrdmulhq_n_s16(v39, 17734);
     61     int16x8_t v41 = vaddq_s16(v35, v40);
     62     int16x8_t v42 = vaddq_s16(v31, v41);
     63     int16x8_t v43 = vaddq_s16(v33, v26);
     64     int16x8_t v44_tmp = vqrdmulhq_n_s16(v43, 13573);
     65     int16x8_t v44 = vaddq_s16(v44_tmp, v43);
     66     int16x8_t v45 = vaddq_s16(v29, v32);
     67     int16x8_t v46 = vaddq_s16(v37, v28);
     68     int16x8_t v47 = vaddq_s16(v45, v46);
     69     int16x8_t v48 = vaddq_s16(v44, v47);
     70     int16x8_t v49 = vqrdmulhq_n_s16(v48, 16705);
     71     int16x8_t v50 = vld1q_s16(in + in_stride * 30 + i);
     72     int16x8_t v51 = vaddq_s16(v50, v36);
     73     int16x8_t v52 = vaddq_s16(v51, v46);
     74     int16x8_t v53 = vqrdmulhq_n_s16(v52, 17734);
     75     int16x8_t v54 = vaddq_s16(v45, v43);
     76     int16x8_t v55_tmp = vqrdmulhq_n_s16(v54, 10045);
     77     int16x8_t v55 = vaddq_s16(v55_tmp, v54);
     78     int16x8_t v56 = vaddq_s16(v53, v55);
     79     int16x8_t v57 = vqrdmulhq_n_s16(v56, 16705);
     80     int16x8_t v58 = vaddq_s16(v49, v57);
     81     int16x8_t v59 = vaddq_s16(v42, v58);
     82     int16x8_t v60 = vqrdmulhq_n_s16(v59, 16463);
     83     int16x8_t v61 = vaddq_s16(v25, v60);
            // v62..v142: built exclusively from the odd-indexed input rows
            // (13, 11, 5, 3, 21, 19, 29, 27, 1, 17, 15, 9, 7, 25, 23, 31).
     84     int16x8_t v62 = vld1q_s16(in + in_stride * 13 + i);
     85     int16x8_t v63 = vld1q_s16(in + in_stride * 11 + i);
     86     int16x8_t v64 = vaddq_s16(v62, v63);
     87     int16x8_t v65 = vld1q_s16(in + in_stride * 5 + i);
     88     int16x8_t v66 = vld1q_s16(in + in_stride * 3 + i);
     89     int16x8_t v67 = vaddq_s16(v65, v66);
     90     int16x8_t v68 = vaddq_s16(v64, v67);
     91     int16x8_t v69_tmp = vqrdmulhq_n_s16(v68, 10045);
     92     int16x8_t v69 = vaddq_s16(v69_tmp, v68);
     93     int16x8_t v70 = vld1q_s16(in + in_stride * 21 + i);
     94     int16x8_t v71 = vld1q_s16(in + in_stride * 19 + i);
     95     int16x8_t v72 = vaddq_s16(v70, v71);
     96     int16x8_t v73 = vld1q_s16(in + in_stride * 29 + i);
     97     int16x8_t v74 = vld1q_s16(in + in_stride * 27 + i);
     98     int16x8_t v75 = vaddq_s16(v73, v74);
     99     int16x8_t v76 = vaddq_s16(v72, v75);
    100     int16x8_t v77 = vqrdmulhq_n_s16(v76, 17734);
    101     int16x8_t v78 = vaddq_s16(v69, v77);
    102     int16x8_t v79 = vqrdmulhq_n_s16(v78, 16705);
    103     int16x8_t v80_tmp = vqrdmulhq_n_s16(v67, 13573);
    104     int16x8_t v80 = vaddq_s16(v80_tmp, v67);
    105     int16x8_t v81 = vaddq_s16(v64, v72);
    106     int16x8_t v82 = vaddq_s16(v80, v81);
    107     int16x8_t v83 = vqrdmulhq_n_s16(v82, 16705);
    108     int16x8_t v84 = vaddq_s16(v79, v83);
    109     int16x8_t v85 = vld1q_s16(in + in_stride * 1 + i);
    110     int16x8_t v86_tmp = vqrdmulhq_n_s16(v85, 13573);
    111     int16x8_t v86 = vaddq_s16(v86_tmp, v85);
    112     int16x8_t v87 = vld1q_s16(in + in_stride * 17 + i);
    113     int16x8_t v88 = vld1q_s16(in + in_stride * 15 + i);
    114     int16x8_t v89 = vaddq_s16(v87, v88);
    115     int16x8_t v90 = vaddq_s16(v86, v89);
    116     int16x8_t v91 = vld1q_s16(in + in_stride * 9 + i);
    117     int16x8_t v92 = vld1q_s16(in + in_stride * 7 + i);
    118     int16x8_t v93 = vaddq_s16(v91, v92);
    119     int16x8_t v94 = vqrdmulhq_n_s16(v93, 25080);
    120     int16x8_t v95 = vld1q_s16(in + in_stride * 25 + i);
    121     int16x8_t v96 = vld1q_s16(in + in_stride * 23 + i);
    122     int16x8_t v97 = vaddq_s16(v95, v96);
    123     int16x8_t v98 = vaddq_s16(v97, v93);
    124     int16x8_t v99 = vqrdmulhq_n_s16(v98, 17734);
    125     int16x8_t v100 = vaddq_s16(v94, v99);
    126     int16x8_t v101 = vaddq_s16(v90, v100);
    127     int16x8_t v102 = vaddq_s16(v84, v101);
    128     int16x8_t v103 = vaddq_s16(v92, v65);
    129     int16x8_t v104 = vaddq_s16(v66, v85);
    130     int16x8_t v105 = vaddq_s16(v103, v104);
    131     int16x8_t v106_tmp = vqrdmulhq_n_s16(v105, 13573);
    132     int16x8_t v106 = vaddq_s16(v106_tmp, v105);
    133     int16x8_t v107 = vaddq_s16(v96, v70);
    134     int16x8_t v108 = vaddq_s16(v71, v87);
    135     int16x8_t v109 = vaddq_s16(v107, v108);
    136     int16x8_t v110 = vaddq_s16(v63, v91);
    137     int16x8_t v111 = vaddq_s16(v88, v62);
    138     int16x8_t v112 = vaddq_s16(v110, v111);
    139     int16x8_t v113 = vaddq_s16(v109, v112);
    140     int16x8_t v114 = vaddq_s16(v106, v113);
    141     int16x8_t v115 = vqrdmulhq_n_s16(v114, 16705);
    142     int16x8_t v116 = vaddq_s16(v112, v105);
    143     int16x8_t v117 = vqrdmulhq_n_s16(v116, 25080);
    144     int16x8_t v118 = vqrdmulhq_n_s16(v116, 17734);
    145     int16x8_t v119 = vaddq_s16(v74, v95);
    146     int16x8_t v120 = vld1q_s16(in + in_stride * 31 + i);
    147     int16x8_t v121 = vaddq_s16(v120, v73);
    148     int16x8_t v122 = vaddq_s16(v119, v121);
    149     int16x8_t v123 = vaddq_s16(v122, v109);
    150     int16x8_t v124 = vqrdmulhq_n_s16(v123, 17734);
    151     int16x8_t v125 = vaddq_s16(v118, v124);
    152     int16x8_t v126 = vaddq_s16(v117, v125);
    153     int16x8_t v127 = vqrdmulhq_n_s16(v126, 16705);
    154     int16x8_t v128 = vaddq_s16(v115, v127);
    155     int16x8_t v129 = vqrdmulhq_n_s16(v128, 16463);
    156     int16x8_t v130_tmp = vqrdmulhq_n_s16(v104, 13573);
    157     int16x8_t v130 = vaddq_s16(v130_tmp, v104);
    158     int16x8_t v131 = vaddq_s16(v108, v111);
    159     int16x8_t v132 = vaddq_s16(v130, v131);
    160     int16x8_t v133 = vaddq_s16(v119, v107);
    161     int16x8_t v134 = vqrdmulhq_n_s16(v133, 17734);
    162     int16x8_t v135 = vaddq_s16(v110, v103);
    163     int16x8_t v136_tmp = vqrdmulhq_n_s16(v135, 10045);
    164     int16x8_t v136 = vaddq_s16(v136_tmp, v135);
    165     int16x8_t v137 = vaddq_s16(v134, v136);
    166     int16x8_t v138 = vaddq_s16(v132, v137);
    167     int16x8_t v139 = vqrdmulhq_n_s16(v138, 16463);
    168     int16x8_t v140 = vaddq_s16(v129, v139);
    169     int16x8_t v141 = vaddq_s16(v102, v140);
    170     int16x8_t v142 = vqrdmulhq_n_s16(v141, 16404);
            // Output row 0: even-row part (v61) plus scaled odd-row part (v142).
    171     int16x8_t v143 = vaddq_s16(v61, v142);
            // No further loads below this point: the remaining values are derived
            // from the intermediates above, now also using subtractions.
    172     int16x8_t v144 = vsubq_s16(v0, v1);
    173     int16x8_t v145 = vsubq_s16(v4, v6);
    174     int16x8_t v146_tmp = vqrdmulhq_n_s16(v145, 10045);
    175     int16x8_t v146 = vaddq_s16(v146_tmp, v145);
    176     int16x8_t v147 = vaddq_s16(v144, v146);
    177     int16x8_t v148 = vsubq_s16(v11, v14);
    178     int16x8_t v149 = vqrdmulhq_n_s16(v18, 17734);
    179     int16x8_t v150_tmp = vqrdmulhq_n_s16(v17, 10045);
    180     int16x8_t v150 = vaddq_s16(v150_tmp, v17);
    181     int16x8_t v151 = vsubq_s16(v149, v150);
    182     int16x8_t v152 = vaddq_s16(v148, v151);
    183     int16x8_t v153 = vqrdmulhq_n_s16(v152, 19705);
    184     int16x8_t v154 = vaddq_s16(v147, v153);
    185     int16x8_t v155 = vsubq_s16(v27, v30);
    186     int16x8_t v156 = vqrdmulhq_n_s16(v34, 17734);
    187     int16x8_t v157_tmp = vqrdmulhq_n_s16(v38, 10045);
    188     int16x8_t v157 = vaddq_s16(v157_tmp, v38);
    189     int16x8_t v158 = vsubq_s16(v156, v157);
    190     int16x8_t v159 = vaddq_s16(v155, v158);
    191     int16x8_t v160 = vqrdmulhq_n_s16(v54, 13573);
    192     int16x8_t v161 = vsubq_s16(v160, v52);
    193     int16x8_t v162 = vqrdmulhq_n_s16(v161, 25746);
    194     int16x8_t v163 = vsubq_s16(v44, v47);
    195     int16x8_t v164 = vqrdmulhq_n_s16(v163, 19705);
    196     int16x8_t v165 = vaddq_s16(v162, v164);
    197     int16x8_t v166 = vaddq_s16(v159, v165);
    198     int16x8_t v167 = vqrdmulhq_n_s16(v166, 17121);
    199     int16x8_t v168 = vaddq_s16(v154, v167);
    200     int16x8_t v169 = vsubq_s16(v86, v89);
    201     int16x8_t v170 = vqrdmulhq_n_s16(v93, 17734);
    202     int16x8_t v171_tmp = vqrdmulhq_n_s16(v97, 10045);
    203     int16x8_t v171 = vaddq_s16(v171_tmp, v97);
    204     int16x8_t v172 = vsubq_s16(v170, v171);
    205     int16x8_t v173 = vaddq_s16(v169, v172);
    206     int16x8_t v174 = vsubq_s16(v80, v81);
    207     int16x8_t v175 = vqrdmulhq_n_s16(v174, 19705);
    208     int16x8_t v176 = vqrdmulhq_n_s16(v68, 13573);
    209     int16x8_t v177 = vsubq_s16(v176, v76);
    210     int16x8_t v178 = vqrdmulhq_n_s16(v177, 25746);
    211     int16x8_t v179 = vaddq_s16(v175, v178);
    212     int16x8_t v180 = vaddq_s16(v173, v179);
    213     int16x8_t v181 = vsubq_s16(v130, v131);
    214     int16x8_t v182 = vqrdmulhq_n_s16(v135, 13573);
    215     int16x8_t v183 = vsubq_s16(v182, v133);
    216     int16x8_t v184_tmp = vqrdmulhq_n_s16(v183, 10045);
    217     int16x8_t v184 = vaddq_s16(v184_tmp, v183);
    218     int16x8_t v185 = vaddq_s16(v181, v184);
    219     int16x8_t v186 = vqrdmulhq_n_s16(v185, 17121);
    220     int16x8_t v187 = vqrdmulhq_n_s16(v105, 27867);
    221     int16x8_t v188 = vqrdmulhq_n_s16(v113, 19705);
    222     int16x8_t v189 = vsubq_s16(v187, v188);
    223     int16x8_t v190 = vqrdmulhq_n_s16(v116, 13573);
    224     int16x8_t v191 = vsubq_s16(v190, v123);
    225     int16x8_t v192 = vqrdmulhq_n_s16(v191, 25746);
    226     int16x8_t v193 = vaddq_s16(v189, v192);
    227     int16x8_t v194 = vqrdmulhq_n_s16(v193, 17121);
    228     int16x8_t v195 = vaddq_s16(v186, v194);
    229     int16x8_t v196 = vaddq_s16(v180, v195);
    230     int16x8_t v197 = vqrdmulhq_n_s16(v196, 16563);
    231     int16x8_t v198 = vaddq_s16(v168, v197);
    232     int16x8_t v199 = vsubq_s16(v144, v146);
    233     int16x8_t v200 = vsubq_s16(v148, v151);
    234     int16x8_t v201 = vqrdmulhq_n_s16(v200, 29490);
    235     int16x8_t v202 = vaddq_s16(v199, v201);
    236     int16x8_t v203 = vsubq_s16(v155, v158);
    237     int16x8_t v204 = vqrdmulhq_n_s16(v163, 29490);
    238     int16x8_t v205_tmp = vqrdmulhq_n_s16(v161, 5763);
    239     int16x8_t v205 = vaddq_s16(v205_tmp, v161);
    240     int16x8_t v206 = vsubq_s16(v204, v205);
    241     int16x8_t v207 = vaddq_s16(v203, v206);
    242     int16x8_t v208 = vqrdmulhq_n_s16(v207, 18578);
    243     int16x8_t v209 = vaddq_s16(v202, v208);
    244     int16x8_t v210 = vsubq_s16(v169, v172);
    245     int16x8_t v211 = vqrdmulhq_n_s16(v174, 29490);
    246     int16x8_t v212_tmp = vqrdmulhq_n_s16(v177, 5763);
    247     int16x8_t v212 = vaddq_s16(v212_tmp, v177);
    248     int16x8_t v213 = vsubq_s16(v211, v212);
    249     int16x8_t v214 = vaddq_s16(v210, v213);
    250     int16x8_t v215 = vsubq_s16(v181, v184);
    251     int16x8_t v216 = vqrdmulhq_n_s16(v215, 18578);
    252     int16x8_t v217 = vqrdmulhq_n_s16(v189, 27803);
    253     int16x8_t v218 = vqrdmulhq_n_s16(v191, 21845);
    254     int16x8_t v219 = vsubq_s16(v217, v218);
    255     int16x8_t v220 = vaddq_s16(v216, v219);
    256     int16x8_t v221 = vaddq_s16(v214, v220);
    257     int16x8_t v222 = vqrdmulhq_n_s16(v221, 16890);
    258     int16x8_t v223 = vaddq_s16(v209, v222);
    259     int16x8_t v224 = vsubq_s16(v2, v8);
    260     int16x8_t v225 = vsubq_s16(v15, v22);
            // v226 = v225 * (2 + 18446 / 32768): the integer part of the factor is
            // applied by vmlaq_n_s16 (accumulator + v225 * 2).
    261     int16x8_t v226_tmp = vqrdmulhq_n_s16(v225, 18446);
    262     int16x8_t v226 = vmlaq_n_s16(v226_tmp, v225, 2);
    263     int16x8_t v227 = vaddq_s16(v224, v226);
    264     int16x8_t v228 = vsubq_s16(v31, v41);
    265     int16x8_t v229 = vsubq_s16(v48, v56);
    266     int16x8_t v230_tmp = vqrdmulhq_n_s16(v229, 18446);
    267     int16x8_t v230 = vmlaq_n_s16(v230_tmp, v229, 2);
    268     int16x8_t v231 = vaddq_s16(v228, v230);
    269     int16x8_t v232 = vqrdmulhq_n_s16(v231, 21195);
    270     int16x8_t v233 = vaddq_s16(v227, v232);
    271     int16x8_t v234 = vsubq_s16(v82, v78);
    272     int16x8_t v235_tmp = vqrdmulhq_n_s16(v234, 18446);
    273     int16x8_t v235 = vmlaq_n_s16(v235_tmp, v234, 2);
    274     int16x8_t v236 = vsubq_s16(v90, v100);
    275     int16x8_t v237 = vaddq_s16(v235, v236);
    276     int16x8_t v238 = vsubq_s16(v132, v137);
    277     int16x8_t v239 = vsubq_s16(v114, v126);
    278     int16x8_t v240_tmp = vqrdmulhq_n_s16(v239, 18446);
    279     int16x8_t v240 = vmlaq_n_s16(v240_tmp, v239, 2);
    280     int16x8_t v241 = vaddq_s16(v238, v240);
    281     int16x8_t v242 = vqrdmulhq_n_s16(v241, 21195);
    282     int16x8_t v243 = vaddq_s16(v237, v242);
    283     int16x8_t v244 = vqrdmulhq_n_s16(v243, 17401);
    284     int16x8_t v245 = vaddq_s16(v233, v244);
    285     int16x8_t v246 = vsubq_s16(v228, v230);
    286     int16x8_t v247 = vqrdmulhq_n_s16(v246, 25826);
    287     int16x8_t v248 = vsubq_s16(v224, v226);
    288     int16x8_t v249 = vaddq_s16(v247, v248);
    289     int16x8_t v250 = vsubq_s16(v238, v240);
    290     int16x8_t v251 = vqrdmulhq_n_s16(v250, 25826);
    291     int16x8_t v252 = vsubq_s16(v236, v235);
    292     int16x8_t v253 = vaddq_s16(v251, v252);
    293     int16x8_t v254 = vqrdmulhq_n_s16(v253, 18124);
    294     int16x8_t v255 = vaddq_s16(v249, v254);
    295     int16x8_t v256 = vsubq_s16(v199, v201);
    296     int16x8_t v257 = vsubq_s16(v203, v206);
    297     int16x8_t v258_tmp = vqrdmulhq_n_s16(v257, 1988);
    298     int16x8_t v258 = vaddq_s16(v258_tmp, v257);
    299     int16x8_t v259 = vaddq_s16(v256, v258);
    300     int16x8_t v260 = vsubq_s16(v210, v213);
    301     int16x8_t v261_tmp = vqrdmulhq_n_s16(v219, 25030);
    302     int16x8_t v261 = vaddq_s16(v261_tmp, v219);
    303     int16x8_t v262 = vsubq_s16(v215, v261);
    304     int16x8_t v263_tmp = vqrdmulhq_n_s16(v262, 1988);
    305     int16x8_t v263 = vaddq_s16(v263_tmp, v262);
    306     int16x8_t v264 = vaddq_s16(v260, v263);
    307     int16x8_t v265 = vqrdmulhq_n_s16(v264, 19102);
    308     int16x8_t v266 = vaddq_s16(v259, v265);
    309     int16x8_t v267 = vsubq_s16(v147, v153);
    310     int16x8_t v268 = vsubq_s16(v159, v165);
    311     int16x8_t v269_tmp = vqrdmulhq_n_s16(v268, 23673);
    312     int16x8_t v269 = vaddq_s16(v269_tmp, v268);
    313     int16x8_t v270 = vaddq_s16(v267, v269);
    314     int16x8_t v271 = vsubq_s16(v173, v179);
    315     int16x8_t v272 = vsubq_s16(v185, v193);
    316     int16x8_t v273_tmp = vqrdmulhq_n_s16(v272, 23673);
    317     int16x8_t v273 = vaddq_s16(v273_tmp, v272);
    318     int16x8_t v274 = vaddq_s16(v271, v273);
    319     int16x8_t v275 = vqrdmulhq_n_s16(v274, 20398);
    320     int16x8_t v276 = vaddq_s16(v270, v275);
    321     int16x8_t v277 = vsubq_s16(v9, v24);
    322     int16x8_t v278 = vsubq_s16(v42, v58);
    323     int16x8_t v279_tmp = vqrdmulhq_n_s16(v278, 3314);
    324     int16x8_t v279 = vmlaq_n_s16(v279_tmp, v278, 5);
    325     int16x8_t v280 = vaddq_s16(v277, v279);
    326     int16x8_t v281 = vsubq_s16(v138, v128);
    327     int16x8_t v282_tmp = vqrdmulhq_n_s16(v281, 3314);
    328     int16x8_t v282 = vmlaq_n_s16(v282_tmp, v281, 5);
    329     int16x8_t v283 = vsubq_s16(v101, v84);
    330     int16x8_t v284 = vaddq_s16(v282, v283);
    331     int16x8_t v285 = vqrdmulhq_n_s16(v284, 22112);
    332     int16x8_t v286 = vaddq_s16(v280, v285);
    333     int16x8_t v287 = vsubq_s16(v277, v279);
    334     int16x8_t v288 = vsubq_s16(v283, v282);
    335     int16x8_t v289 = vqrdmulhq_n_s16(v288, 24397);
    336     int16x8_t v290 = vaddq_s16(v287, v289);
    337     int16x8_t v291 = vsubq_s16(v267, v269);
    338     int16x8_t v292 = vsubq_s16(v271, v273);
    339     int16x8_t v293 = vqrdmulhq_n_s16(v292, 27504);
    340     int16x8_t v294 = vaddq_s16(v291, v293);
    341     int16x8_t v295 = vsubq_s16(v260, v263);
    342     int16x8_t v296 = vqrdmulhq_n_s16(v295, 31869);
    343     int16x8_t v297 = vsubq_s16(v256, v258);
    344     int16x8_t v298 = vaddq_s16(v296, v297);
    345     int16x8_t v299 = vsubq_s16(v248, v247);
    346     int16x8_t v300 = vsubq_s16(v252, v251);
    347     int16x8_t v301_tmp = vqrdmulhq_n_s16(v300, 5552);
    348     int16x8_t v301 = vaddq_s16(v301_tmp, v300);
    349     int16x8_t v302 = vaddq_s16(v299, v301);
    350     int16x8_t v303 = vsubq_s16(v227, v232);
    351     int16x8_t v304 = vsubq_s16(v237, v242);
    352     int16x8_t v305_tmp = vqrdmulhq_n_s16(v304, 15865);
    353     int16x8_t v305 = vaddq_s16(v305_tmp, v304);
    354     int16x8_t v306 = vaddq_s16(v303, v305);
    355     int16x8_t v307 = vsubq_s16(v202, v208);
    356     int16x8_t v308 = vsubq_s16(v214, v220);
    357     int16x8_t v309_tmp = vqrdmulhq_n_s16(v308, 1893);
    358     int16x8_t v309 = vmlaq_n_s16(v309_tmp, v308, 2);
    359     int16x8_t v310 = vaddq_s16(v307, v309);
    360     int16x8_t v311 = vsubq_s16(v154, v167);
    361     int16x8_t v312 = vsubq_s16(v180, v195);
    362     int16x8_t v313_tmp = vqrdmulhq_n_s16(v312, 13357);
    363     int16x8_t v313 = vmlaq_n_s16(v313_tmp, v312, 3);
    364     int16x8_t v314 = vaddq_s16(v311, v313);
    365     int16x8_t v315 = vsubq_s16(v102, v140);
    366     int16x8_t v316_tmp = vqrdmulhq_n_s16(v315, 6226);
    367     int16x8_t v316 = vmlaq_n_s16(v316_tmp, v315, 10);
    368     int16x8_t v317 = vsubq_s16(v25, v60);
    369     int16x8_t v318 = vaddq_s16(v316, v317);
            // v319..v334 (output rows 16..31) are the differences of the same
            // operand pairs whose sums form output rows 15..0: row k gets a + b,
            // row 31 - k gets a - b.
    370     int16x8_t v319 = vsubq_s16(v317, v316);
    371     int16x8_t v320 = vsubq_s16(v311, v313);
    372     int16x8_t v321 = vsubq_s16(v307, v309);
    373     int16x8_t v322 = vsubq_s16(v303, v305);
    374     int16x8_t v323 = vsubq_s16(v299, v301);
    375     int16x8_t v324 = vsubq_s16(v297, v296);
    376     int16x8_t v325 = vsubq_s16(v291, v293);
    377     int16x8_t v326 = vsubq_s16(v287, v289);
    378     int16x8_t v327 = vsubq_s16(v280, v285);
    379     int16x8_t v328 = vsubq_s16(v270, v275);
    380     int16x8_t v329 = vsubq_s16(v259, v265);
    381     int16x8_t v330 = vsubq_s16(v249, v254);
    382     int16x8_t v331 = vsubq_s16(v233, v244);
    383     int16x8_t v332 = vsubq_s16(v209, v222);
    384     int16x8_t v333 = vsubq_s16(v168, v197);
    385     int16x8_t v334 = vsubq_s16(v61, v142);
            // Write the 32 output rows for columns i .. i+7.
    386     vst1q_s16(out + out_stride * 0 + i, v143);
    387     vst1q_s16(out + out_stride * 1 + i, v198);
    388     vst1q_s16(out + out_stride * 2 + i, v223);
    389     vst1q_s16(out + out_stride * 3 + i, v245);
    390     vst1q_s16(out + out_stride * 4 + i, v255);
    391     vst1q_s16(out + out_stride * 5 + i, v266);
    392     vst1q_s16(out + out_stride * 6 + i, v276);
    393     vst1q_s16(out + out_stride * 7 + i, v286);
    394     vst1q_s16(out + out_stride * 8 + i, v290);
    395     vst1q_s16(out + out_stride * 9 + i, v294);
    396     vst1q_s16(out + out_stride * 10 + i, v298);
    397     vst1q_s16(out + out_stride * 11 + i, v302);
    398     vst1q_s16(out + out_stride * 12 + i, v306);
    399     vst1q_s16(out + out_stride * 13 + i, v310);
    400     vst1q_s16(out + out_stride * 14 + i, v314);
    401     vst1q_s16(out + out_stride * 15 + i, v318);
    402     vst1q_s16(out + out_stride * 16 + i, v319);
    403     vst1q_s16(out + out_stride * 17 + i, v320);
    404     vst1q_s16(out + out_stride * 18 + i, v321);
    405     vst1q_s16(out + out_stride * 19 + i, v322);
    406     vst1q_s16(out + out_stride * 20 + i, v323);
    407     vst1q_s16(out + out_stride * 21 + i, v324);
    408     vst1q_s16(out + out_stride * 22 + i, v325);
    409     vst1q_s16(out + out_stride * 23 + i, v326);
    410     vst1q_s16(out + out_stride * 24 + i, v327);
    411     vst1q_s16(out + out_stride * 25 + i, v328);
    412     vst1q_s16(out + out_stride * 26 + i, v329);
    413     vst1q_s16(out + out_stride * 27 + i, v330);
    414     vst1q_s16(out + out_stride * 28 + i, v331);
    415     vst1q_s16(out + out_stride * 29 + i, v332);
    416     vst1q_s16(out + out_stride * 30 + i, v333);
    417     vst1q_s16(out + out_stride * 31 + i, v334);
    418   }
    419 }