libjxl

FORK: libjxl patches used on blog
git clone https://git.neptards.moe/blog/libjxl.git
Log | Files | Refs | Submodules | README | LICENSE

fast_dct128-inl.h (103005B)


      1 // Copyright (c) the JPEG XL Project Authors. All rights reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style
      4 // license that can be found in the LICENSE file.
      5 
      6 /* This file is automatically generated. Do not modify it directly. */
      7 #if HWY_TARGET != HWY_NEON  // Build guard: the code below uses NEON intrinsics (vld1q_s16, vqrdmulhq_n_s16, ...), so any other HWY_TARGET is rejected.
      8 #error "only include this file from fast_dct-inl.h"
      9 #endif  // HWY_TARGET != HWY_NEON
     10 
     11 constexpr size_t FastIDCTIntegerBits(FastDCTTag<128>) { return 2; }  // Compile-time constant 2 for the 128-point tag; presumably the integer-bit headroom of FastIDCT's int16 fixed-point format -- TODO confirm against fast_dct-inl.h.
     12 
     13 void FastIDCT(FastDCTTag<128>, const int16_t* in, size_t in_stride,
     14               int16_t* out, size_t out_stride, size_t count) {
     15   JXL_ASSERT(count % 8 == 0);
     16   for (size_t i = 0; i < count; i += 8) {
     17     int16x8_t v0 = vld1q_s16(in + in_stride * 0 + i);
     18     int16x8_t v1 = vld1q_s16(in + in_stride * 64 + i);
     19     int16x8_t v2 = vaddq_s16(v0, v1);
     20     int16x8_t v3 = vld1q_s16(in + in_stride * 32 + i);
     21     int16x8_t v4_tmp = vqrdmulhq_n_s16(v3, 13573);
     22     int16x8_t v4 = vaddq_s16(v4_tmp, v3);
     23     int16x8_t v5 = vld1q_s16(in + in_stride * 96 + i);
     24     int16x8_t v6 = vaddq_s16(v5, v3);
     25     int16x8_t v7 = vaddq_s16(v4, v6);
     26     int16x8_t v8 = vqrdmulhq_n_s16(v7, 17734);
     27     int16x8_t v9 = vaddq_s16(v2, v8);
     28     int16x8_t v10 = vld1q_s16(in + in_stride * 16 + i);
     29     int16x8_t v11_tmp = vqrdmulhq_n_s16(v10, 13573);
     30     int16x8_t v11 = vaddq_s16(v11_tmp, v10);
     31     int16x8_t v12 = vld1q_s16(in + in_stride * 80 + i);
     32     int16x8_t v13 = vld1q_s16(in + in_stride * 48 + i);
     33     int16x8_t v14 = vaddq_s16(v12, v13);
     34     int16x8_t v15 = vaddq_s16(v11, v14);
     35     int16x8_t v16 = vaddq_s16(v13, v10);
     36     int16x8_t v17_tmp = vqrdmulhq_n_s16(v16, 13573);
     37     int16x8_t v17 = vaddq_s16(v17_tmp, v16);
     38     int16x8_t v18 = vld1q_s16(in + in_stride * 112 + i);
     39     int16x8_t v19 = vaddq_s16(v18, v12);
     40     int16x8_t v20 = vaddq_s16(v19, v16);
     41     int16x8_t v21 = vaddq_s16(v17, v20);
     42     int16x8_t v22 = vqrdmulhq_n_s16(v21, 17734);
     43     int16x8_t v23 = vaddq_s16(v15, v22);
     44     int16x8_t v24 = vqrdmulhq_n_s16(v23, 16705);
     45     int16x8_t v25 = vaddq_s16(v9, v24);
     46     int16x8_t v26 = vld1q_s16(in + in_stride * 8 + i);
     47     int16x8_t v27_tmp = vqrdmulhq_n_s16(v26, 13573);
     48     int16x8_t v27 = vaddq_s16(v27_tmp, v26);
     49     int16x8_t v28 = vld1q_s16(in + in_stride * 72 + i);
     50     int16x8_t v29 = vld1q_s16(in + in_stride * 56 + i);
     51     int16x8_t v30 = vaddq_s16(v28, v29);
     52     int16x8_t v31 = vaddq_s16(v27, v30);
     53     int16x8_t v32 = vld1q_s16(in + in_stride * 40 + i);
     54     int16x8_t v33 = vld1q_s16(in + in_stride * 24 + i);
     55     int16x8_t v34 = vaddq_s16(v32, v33);
     56     int16x8_t v35_tmp = vqrdmulhq_n_s16(v34, 13573);
     57     int16x8_t v35 = vaddq_s16(v35_tmp, v34);
     58     int16x8_t v36 = vld1q_s16(in + in_stride * 104 + i);
     59     int16x8_t v37 = vld1q_s16(in + in_stride * 88 + i);
     60     int16x8_t v38 = vaddq_s16(v36, v37);
     61     int16x8_t v39 = vaddq_s16(v38, v34);
     62     int16x8_t v40 = vaddq_s16(v35, v39);
     63     int16x8_t v41 = vqrdmulhq_n_s16(v40, 17734);
     64     int16x8_t v42 = vaddq_s16(v31, v41);
     65     int16x8_t v43 = vaddq_s16(v33, v26);
     66     int16x8_t v44_tmp = vqrdmulhq_n_s16(v43, 13573);
     67     int16x8_t v44 = vaddq_s16(v44_tmp, v43);
     68     int16x8_t v45 = vaddq_s16(v37, v28);
     69     int16x8_t v46 = vaddq_s16(v29, v32);
     70     int16x8_t v47 = vaddq_s16(v45, v46);
     71     int16x8_t v48 = vaddq_s16(v44, v47);
     72     int16x8_t v49 = vaddq_s16(v46, v43);
     73     int16x8_t v50_tmp = vqrdmulhq_n_s16(v49, 13573);
     74     int16x8_t v50 = vaddq_s16(v50_tmp, v49);
     75     int16x8_t v51 = vld1q_s16(in + in_stride * 120 + i);
     76     int16x8_t v52 = vaddq_s16(v51, v36);
     77     int16x8_t v53 = vaddq_s16(v52, v45);
     78     int16x8_t v54 = vaddq_s16(v53, v49);
     79     int16x8_t v55 = vaddq_s16(v50, v54);
     80     int16x8_t v56 = vqrdmulhq_n_s16(v55, 17734);
     81     int16x8_t v57 = vaddq_s16(v48, v56);
     82     int16x8_t v58 = vqrdmulhq_n_s16(v57, 16705);
     83     int16x8_t v59 = vaddq_s16(v42, v58);
     84     int16x8_t v60 = vqrdmulhq_n_s16(v59, 16463);
     85     int16x8_t v61 = vaddq_s16(v25, v60);
     86     int16x8_t v62 = vld1q_s16(in + in_stride * 4 + i);
     87     int16x8_t v63_tmp = vqrdmulhq_n_s16(v62, 13573);
     88     int16x8_t v63 = vaddq_s16(v63_tmp, v62);
     89     int16x8_t v64 = vld1q_s16(in + in_stride * 68 + i);
     90     int16x8_t v65 = vld1q_s16(in + in_stride * 60 + i);
     91     int16x8_t v66 = vaddq_s16(v64, v65);
     92     int16x8_t v67 = vaddq_s16(v63, v66);
     93     int16x8_t v68 = vld1q_s16(in + in_stride * 36 + i);
     94     int16x8_t v69 = vld1q_s16(in + in_stride * 28 + i);
     95     int16x8_t v70 = vaddq_s16(v68, v69);
     96     int16x8_t v71_tmp = vqrdmulhq_n_s16(v70, 13573);
     97     int16x8_t v71 = vaddq_s16(v71_tmp, v70);
     98     int16x8_t v72 = vld1q_s16(in + in_stride * 100 + i);
     99     int16x8_t v73 = vld1q_s16(in + in_stride * 92 + i);
    100     int16x8_t v74 = vaddq_s16(v72, v73);
    101     int16x8_t v75 = vaddq_s16(v74, v70);
    102     int16x8_t v76 = vaddq_s16(v71, v75);
    103     int16x8_t v77 = vqrdmulhq_n_s16(v76, 17734);
    104     int16x8_t v78 = vaddq_s16(v67, v77);
    105     int16x8_t v79 = vld1q_s16(in + in_stride * 20 + i);
    106     int16x8_t v80 = vld1q_s16(in + in_stride * 12 + i);
    107     int16x8_t v81 = vaddq_s16(v79, v80);
    108     int16x8_t v82_tmp = vqrdmulhq_n_s16(v81, 13573);
    109     int16x8_t v82 = vaddq_s16(v82_tmp, v81);
    110     int16x8_t v83 = vld1q_s16(in + in_stride * 84 + i);
    111     int16x8_t v84 = vld1q_s16(in + in_stride * 76 + i);
    112     int16x8_t v85 = vaddq_s16(v83, v84);
    113     int16x8_t v86 = vld1q_s16(in + in_stride * 52 + i);
    114     int16x8_t v87 = vld1q_s16(in + in_stride * 44 + i);
    115     int16x8_t v88 = vaddq_s16(v86, v87);
    116     int16x8_t v89 = vaddq_s16(v85, v88);
    117     int16x8_t v90 = vaddq_s16(v82, v89);
    118     int16x8_t v91 = vaddq_s16(v88, v81);
    119     int16x8_t v92_tmp = vqrdmulhq_n_s16(v91, 13573);
    120     int16x8_t v92 = vaddq_s16(v92_tmp, v91);
    121     int16x8_t v93 = vld1q_s16(in + in_stride * 116 + i);
    122     int16x8_t v94 = vld1q_s16(in + in_stride * 108 + i);
    123     int16x8_t v95 = vaddq_s16(v93, v94);
    124     int16x8_t v96 = vaddq_s16(v95, v85);
    125     int16x8_t v97 = vaddq_s16(v96, v91);
    126     int16x8_t v98 = vaddq_s16(v92, v97);
    127     int16x8_t v99 = vqrdmulhq_n_s16(v98, 17734);
    128     int16x8_t v100 = vaddq_s16(v90, v99);
    129     int16x8_t v101 = vqrdmulhq_n_s16(v100, 16705);
    130     int16x8_t v102 = vaddq_s16(v78, v101);
    131     int16x8_t v103 = vaddq_s16(v80, v62);
    132     int16x8_t v104_tmp = vqrdmulhq_n_s16(v103, 13573);
    133     int16x8_t v104 = vaddq_s16(v104_tmp, v103);
    134     int16x8_t v105 = vaddq_s16(v84, v64);
    135     int16x8_t v106 = vaddq_s16(v65, v86);
    136     int16x8_t v107 = vaddq_s16(v105, v106);
    137     int16x8_t v108 = vaddq_s16(v104, v107);
    138     int16x8_t v109 = vaddq_s16(v87, v68);
    139     int16x8_t v110 = vaddq_s16(v69, v79);
    140     int16x8_t v111 = vaddq_s16(v109, v110);
    141     int16x8_t v112_tmp = vqrdmulhq_n_s16(v111, 13573);
    142     int16x8_t v112 = vaddq_s16(v112_tmp, v111);
    143     int16x8_t v113 = vaddq_s16(v94, v72);
    144     int16x8_t v114 = vaddq_s16(v73, v83);
    145     int16x8_t v115 = vaddq_s16(v113, v114);
    146     int16x8_t v116 = vaddq_s16(v115, v111);
    147     int16x8_t v117 = vaddq_s16(v112, v116);
    148     int16x8_t v118 = vqrdmulhq_n_s16(v117, 17734);
    149     int16x8_t v119 = vaddq_s16(v108, v118);
    150     int16x8_t v120 = vaddq_s16(v110, v103);
    151     int16x8_t v121_tmp = vqrdmulhq_n_s16(v120, 13573);
    152     int16x8_t v121 = vaddq_s16(v121_tmp, v120);
    153     int16x8_t v122 = vaddq_s16(v114, v105);
    154     int16x8_t v123 = vaddq_s16(v106, v109);
    155     int16x8_t v124 = vaddq_s16(v122, v123);
    156     int16x8_t v125 = vaddq_s16(v121, v124);
    157     int16x8_t v126 = vaddq_s16(v123, v120);
    158     int16x8_t v127_tmp = vqrdmulhq_n_s16(v126, 13573);
    159     int16x8_t v127 = vaddq_s16(v127_tmp, v126);
    160     int16x8_t v128 = vld1q_s16(in + in_stride * 124 + i);
    161     int16x8_t v129 = vaddq_s16(v128, v93);
    162     int16x8_t v130 = vaddq_s16(v129, v113);
    163     int16x8_t v131 = vaddq_s16(v130, v122);
    164     int16x8_t v132 = vaddq_s16(v131, v126);
    165     int16x8_t v133 = vaddq_s16(v127, v132);
    166     int16x8_t v134 = vqrdmulhq_n_s16(v133, 17734);
    167     int16x8_t v135 = vaddq_s16(v125, v134);
    168     int16x8_t v136 = vqrdmulhq_n_s16(v135, 16705);
    169     int16x8_t v137 = vaddq_s16(v119, v136);
    170     int16x8_t v138 = vqrdmulhq_n_s16(v137, 16463);
    171     int16x8_t v139 = vaddq_s16(v102, v138);
    172     int16x8_t v140 = vqrdmulhq_n_s16(v139, 16404);
    173     int16x8_t v141 = vaddq_s16(v61, v140);
    174     int16x8_t v142 = vld1q_s16(in + in_stride * 2 + i);
    175     int16x8_t v143_tmp = vqrdmulhq_n_s16(v142, 13573);
    176     int16x8_t v143 = vaddq_s16(v143_tmp, v142);
    177     int16x8_t v144 = vld1q_s16(in + in_stride * 66 + i);
    178     int16x8_t v145 = vld1q_s16(in + in_stride * 62 + i);
    179     int16x8_t v146 = vaddq_s16(v144, v145);
    180     int16x8_t v147 = vaddq_s16(v143, v146);
    181     int16x8_t v148 = vld1q_s16(in + in_stride * 34 + i);
    182     int16x8_t v149 = vld1q_s16(in + in_stride * 30 + i);
    183     int16x8_t v150 = vaddq_s16(v148, v149);
    184     int16x8_t v151_tmp = vqrdmulhq_n_s16(v150, 13573);
    185     int16x8_t v151 = vaddq_s16(v151_tmp, v150);
    186     int16x8_t v152 = vld1q_s16(in + in_stride * 98 + i);
    187     int16x8_t v153 = vld1q_s16(in + in_stride * 94 + i);
    188     int16x8_t v154 = vaddq_s16(v152, v153);
    189     int16x8_t v155 = vaddq_s16(v154, v150);
    190     int16x8_t v156 = vaddq_s16(v151, v155);
    191     int16x8_t v157 = vqrdmulhq_n_s16(v156, 17734);
    192     int16x8_t v158 = vaddq_s16(v147, v157);
    193     int16x8_t v159 = vld1q_s16(in + in_stride * 18 + i);
    194     int16x8_t v160 = vld1q_s16(in + in_stride * 14 + i);
    195     int16x8_t v161 = vaddq_s16(v159, v160);
    196     int16x8_t v162_tmp = vqrdmulhq_n_s16(v161, 13573);
    197     int16x8_t v162 = vaddq_s16(v162_tmp, v161);
    198     int16x8_t v163 = vld1q_s16(in + in_stride * 82 + i);
    199     int16x8_t v164 = vld1q_s16(in + in_stride * 78 + i);
    200     int16x8_t v165 = vaddq_s16(v163, v164);
    201     int16x8_t v166 = vld1q_s16(in + in_stride * 50 + i);
    202     int16x8_t v167 = vld1q_s16(in + in_stride * 46 + i);
    203     int16x8_t v168 = vaddq_s16(v166, v167);
    204     int16x8_t v169 = vaddq_s16(v165, v168);
    205     int16x8_t v170 = vaddq_s16(v162, v169);
    206     int16x8_t v171 = vaddq_s16(v168, v161);
    207     int16x8_t v172_tmp = vqrdmulhq_n_s16(v171, 13573);
    208     int16x8_t v172 = vaddq_s16(v172_tmp, v171);
    209     int16x8_t v173 = vld1q_s16(in + in_stride * 114 + i);
    210     int16x8_t v174 = vld1q_s16(in + in_stride * 110 + i);
    211     int16x8_t v175 = vaddq_s16(v173, v174);
    212     int16x8_t v176 = vaddq_s16(v175, v165);
    213     int16x8_t v177 = vaddq_s16(v176, v171);
    214     int16x8_t v178 = vaddq_s16(v172, v177);
    215     int16x8_t v179 = vqrdmulhq_n_s16(v178, 17734);
    216     int16x8_t v180 = vaddq_s16(v170, v179);
    217     int16x8_t v181 = vqrdmulhq_n_s16(v180, 16705);
    218     int16x8_t v182 = vaddq_s16(v158, v181);
    219     int16x8_t v183 = vld1q_s16(in + in_stride * 10 + i);
    220     int16x8_t v184 = vld1q_s16(in + in_stride * 6 + i);
    221     int16x8_t v185 = vaddq_s16(v183, v184);
    222     int16x8_t v186_tmp = vqrdmulhq_n_s16(v185, 13573);
    223     int16x8_t v186 = vaddq_s16(v186_tmp, v185);
    224     int16x8_t v187 = vld1q_s16(in + in_stride * 74 + i);
    225     int16x8_t v188 = vld1q_s16(in + in_stride * 70 + i);
    226     int16x8_t v189 = vaddq_s16(v187, v188);
    227     int16x8_t v190 = vld1q_s16(in + in_stride * 58 + i);
    228     int16x8_t v191 = vld1q_s16(in + in_stride * 54 + i);
    229     int16x8_t v192 = vaddq_s16(v190, v191);
    230     int16x8_t v193 = vaddq_s16(v189, v192);
    231     int16x8_t v194 = vaddq_s16(v186, v193);
    232     int16x8_t v195 = vld1q_s16(in + in_stride * 42 + i);
    233     int16x8_t v196 = vld1q_s16(in + in_stride * 38 + i);
    234     int16x8_t v197 = vaddq_s16(v195, v196);
    235     int16x8_t v198 = vld1q_s16(in + in_stride * 26 + i);
    236     int16x8_t v199 = vld1q_s16(in + in_stride * 22 + i);
    237     int16x8_t v200 = vaddq_s16(v198, v199);
    238     int16x8_t v201 = vaddq_s16(v197, v200);
    239     int16x8_t v202_tmp = vqrdmulhq_n_s16(v201, 13573);
    240     int16x8_t v202 = vaddq_s16(v202_tmp, v201);
    241     int16x8_t v203 = vld1q_s16(in + in_stride * 106 + i);
    242     int16x8_t v204 = vld1q_s16(in + in_stride * 102 + i);
    243     int16x8_t v205 = vaddq_s16(v203, v204);
    244     int16x8_t v206 = vld1q_s16(in + in_stride * 90 + i);
    245     int16x8_t v207 = vld1q_s16(in + in_stride * 86 + i);
    246     int16x8_t v208 = vaddq_s16(v206, v207);
    247     int16x8_t v209 = vaddq_s16(v205, v208);
    248     int16x8_t v210 = vaddq_s16(v209, v201);
    249     int16x8_t v211 = vaddq_s16(v202, v210);
    250     int16x8_t v212 = vqrdmulhq_n_s16(v211, 17734);
    251     int16x8_t v213 = vaddq_s16(v194, v212);
    252     int16x8_t v214 = vaddq_s16(v200, v185);
    253     int16x8_t v215_tmp = vqrdmulhq_n_s16(v214, 13573);
    254     int16x8_t v215 = vaddq_s16(v215_tmp, v214);
    255     int16x8_t v216 = vaddq_s16(v208, v189);
    256     int16x8_t v217 = vaddq_s16(v192, v197);
    257     int16x8_t v218 = vaddq_s16(v216, v217);
    258     int16x8_t v219 = vaddq_s16(v215, v218);
    259     int16x8_t v220 = vaddq_s16(v217, v214);
    260     int16x8_t v221_tmp = vqrdmulhq_n_s16(v220, 13573);
    261     int16x8_t v221 = vaddq_s16(v221_tmp, v220);
    262     int16x8_t v222 = vld1q_s16(in + in_stride * 122 + i);
    263     int16x8_t v223 = vld1q_s16(in + in_stride * 118 + i);
    264     int16x8_t v224 = vaddq_s16(v222, v223);
    265     int16x8_t v225 = vaddq_s16(v224, v205);
    266     int16x8_t v226 = vaddq_s16(v225, v216);
    267     int16x8_t v227 = vaddq_s16(v226, v220);
    268     int16x8_t v228 = vaddq_s16(v221, v227);
    269     int16x8_t v229 = vqrdmulhq_n_s16(v228, 17734);
    270     int16x8_t v230 = vaddq_s16(v219, v229);
    271     int16x8_t v231 = vqrdmulhq_n_s16(v230, 16705);
    272     int16x8_t v232 = vaddq_s16(v213, v231);
    273     int16x8_t v233 = vqrdmulhq_n_s16(v232, 16463);
    274     int16x8_t v234 = vaddq_s16(v182, v233);
    275     int16x8_t v235 = vaddq_s16(v184, v142);
    276     int16x8_t v236_tmp = vqrdmulhq_n_s16(v235, 13573);
    277     int16x8_t v236 = vaddq_s16(v236_tmp, v235);
    278     int16x8_t v237 = vaddq_s16(v188, v144);
    279     int16x8_t v238 = vaddq_s16(v145, v190);
    280     int16x8_t v239 = vaddq_s16(v237, v238);
    281     int16x8_t v240 = vaddq_s16(v236, v239);
    282     int16x8_t v241 = vaddq_s16(v196, v148);
    283     int16x8_t v242 = vaddq_s16(v149, v198);
    284     int16x8_t v243 = vaddq_s16(v241, v242);
    285     int16x8_t v244_tmp = vqrdmulhq_n_s16(v243, 13573);
    286     int16x8_t v244 = vaddq_s16(v244_tmp, v243);
    287     int16x8_t v245 = vaddq_s16(v204, v152);
    288     int16x8_t v246 = vaddq_s16(v153, v206);
    289     int16x8_t v247 = vaddq_s16(v245, v246);
    290     int16x8_t v248 = vaddq_s16(v247, v243);
    291     int16x8_t v249 = vaddq_s16(v244, v248);
    292     int16x8_t v250 = vqrdmulhq_n_s16(v249, 17734);
    293     int16x8_t v251 = vaddq_s16(v240, v250);
    294     int16x8_t v252 = vaddq_s16(v199, v159);
    295     int16x8_t v253 = vaddq_s16(v160, v183);
    296     int16x8_t v254 = vaddq_s16(v252, v253);
    297     int16x8_t v255_tmp = vqrdmulhq_n_s16(v254, 13573);
    298     int16x8_t v255 = vaddq_s16(v255_tmp, v254);
    299     int16x8_t v256 = vaddq_s16(v207, v163);
    300     int16x8_t v257 = vaddq_s16(v164, v187);
    301     int16x8_t v258 = vaddq_s16(v256, v257);
    302     int16x8_t v259 = vaddq_s16(v191, v166);
    303     int16x8_t v260 = vaddq_s16(v167, v195);
    304     int16x8_t v261 = vaddq_s16(v259, v260);
    305     int16x8_t v262 = vaddq_s16(v258, v261);
    306     int16x8_t v263 = vaddq_s16(v255, v262);
    307     int16x8_t v264 = vaddq_s16(v261, v254);
    308     int16x8_t v265_tmp = vqrdmulhq_n_s16(v264, 13573);
    309     int16x8_t v265 = vaddq_s16(v265_tmp, v264);
    310     int16x8_t v266 = vaddq_s16(v223, v173);
    311     int16x8_t v267 = vaddq_s16(v174, v203);
    312     int16x8_t v268 = vaddq_s16(v266, v267);
    313     int16x8_t v269 = vaddq_s16(v268, v258);
    314     int16x8_t v270 = vaddq_s16(v269, v264);
    315     int16x8_t v271 = vaddq_s16(v265, v270);
    316     int16x8_t v272 = vqrdmulhq_n_s16(v271, 17734);
    317     int16x8_t v273 = vaddq_s16(v263, v272);
    318     int16x8_t v274 = vqrdmulhq_n_s16(v273, 16705);
    319     int16x8_t v275 = vaddq_s16(v251, v274);
    320     int16x8_t v276 = vaddq_s16(v253, v235);
    321     int16x8_t v277_tmp = vqrdmulhq_n_s16(v276, 13573);
    322     int16x8_t v277 = vaddq_s16(v277_tmp, v276);
    323     int16x8_t v278 = vaddq_s16(v257, v237);
    324     int16x8_t v279 = vaddq_s16(v238, v259);
    325     int16x8_t v280 = vaddq_s16(v278, v279);
    326     int16x8_t v281 = vaddq_s16(v277, v280);
    327     int16x8_t v282 = vaddq_s16(v260, v241);
    328     int16x8_t v283 = vaddq_s16(v242, v252);
    329     int16x8_t v284 = vaddq_s16(v282, v283);
    330     int16x8_t v285_tmp = vqrdmulhq_n_s16(v284, 13573);
    331     int16x8_t v285 = vaddq_s16(v285_tmp, v284);
    332     int16x8_t v286 = vaddq_s16(v267, v245);
    333     int16x8_t v287 = vaddq_s16(v246, v256);
    334     int16x8_t v288 = vaddq_s16(v286, v287);
    335     int16x8_t v289 = vaddq_s16(v288, v284);
    336     int16x8_t v290 = vaddq_s16(v285, v289);
    337     int16x8_t v291 = vqrdmulhq_n_s16(v290, 17734);
    338     int16x8_t v292 = vaddq_s16(v281, v291);
    339     int16x8_t v293 = vaddq_s16(v283, v276);
    340     int16x8_t v294_tmp = vqrdmulhq_n_s16(v293, 13573);
    341     int16x8_t v294 = vaddq_s16(v294_tmp, v293);
    342     int16x8_t v295 = vaddq_s16(v287, v278);
    343     int16x8_t v296 = vaddq_s16(v279, v282);
    344     int16x8_t v297 = vaddq_s16(v295, v296);
    345     int16x8_t v298 = vaddq_s16(v294, v297);
    346     int16x8_t v299 = vaddq_s16(v296, v293);
    347     int16x8_t v300_tmp = vqrdmulhq_n_s16(v299, 13573);
    348     int16x8_t v300 = vaddq_s16(v300_tmp, v299);
    349     int16x8_t v301 = vld1q_s16(in + in_stride * 126 + i);
    350     int16x8_t v302 = vaddq_s16(v301, v222);
    351     int16x8_t v303 = vaddq_s16(v302, v266);
    352     int16x8_t v304 = vaddq_s16(v303, v286);
    353     int16x8_t v305 = vaddq_s16(v304, v295);
    354     int16x8_t v306 = vaddq_s16(v305, v299);
    355     int16x8_t v307 = vaddq_s16(v300, v306);
    356     int16x8_t v308 = vqrdmulhq_n_s16(v307, 17734);
    357     int16x8_t v309 = vaddq_s16(v298, v308);
    358     int16x8_t v310 = vqrdmulhq_n_s16(v309, 16705);
    359     int16x8_t v311 = vaddq_s16(v292, v310);
    360     int16x8_t v312 = vqrdmulhq_n_s16(v311, 16463);
    361     int16x8_t v313 = vaddq_s16(v275, v312);
    362     int16x8_t v314 = vqrdmulhq_n_s16(v313, 16404);
    363     int16x8_t v315 = vaddq_s16(v234, v314);
    364     int16x8_t v316 = vqrdmulhq_n_s16(v315, 16389);
    365     int16x8_t v317 = vaddq_s16(v141, v316);
    366     int16x8_t v318 = vld1q_s16(in + in_stride * 1 + i);
    367     int16x8_t v319_tmp = vqrdmulhq_n_s16(v318, 13573);
    368     int16x8_t v319 = vaddq_s16(v319_tmp, v318);
    369     int16x8_t v320 = vld1q_s16(in + in_stride * 65 + i);
    370     int16x8_t v321 = vld1q_s16(in + in_stride * 63 + i);
    371     int16x8_t v322 = vaddq_s16(v320, v321);
    372     int16x8_t v323 = vaddq_s16(v319, v322);
    373     int16x8_t v324 = vld1q_s16(in + in_stride * 33 + i);
    374     int16x8_t v325 = vld1q_s16(in + in_stride * 31 + i);
    375     int16x8_t v326 = vaddq_s16(v324, v325);
    376     int16x8_t v327_tmp = vqrdmulhq_n_s16(v326, 13573);
    377     int16x8_t v327 = vaddq_s16(v327_tmp, v326);
    378     int16x8_t v328 = vld1q_s16(in + in_stride * 97 + i);
    379     int16x8_t v329 = vld1q_s16(in + in_stride * 95 + i);
    380     int16x8_t v330 = vaddq_s16(v328, v329);
    381     int16x8_t v331 = vaddq_s16(v330, v326);
    382     int16x8_t v332 = vaddq_s16(v327, v331);
    383     int16x8_t v333 = vqrdmulhq_n_s16(v332, 17734);
    384     int16x8_t v334 = vaddq_s16(v323, v333);
    385     int16x8_t v335 = vld1q_s16(in + in_stride * 17 + i);
    386     int16x8_t v336 = vld1q_s16(in + in_stride * 15 + i);
    387     int16x8_t v337 = vaddq_s16(v335, v336);
    388     int16x8_t v338_tmp = vqrdmulhq_n_s16(v337, 13573);
    389     int16x8_t v338 = vaddq_s16(v338_tmp, v337);
    390     int16x8_t v339 = vld1q_s16(in + in_stride * 81 + i);
    391     int16x8_t v340 = vld1q_s16(in + in_stride * 79 + i);
    392     int16x8_t v341 = vaddq_s16(v339, v340);
    393     int16x8_t v342 = vld1q_s16(in + in_stride * 49 + i);
    394     int16x8_t v343 = vld1q_s16(in + in_stride * 47 + i);
    395     int16x8_t v344 = vaddq_s16(v342, v343);
    396     int16x8_t v345 = vaddq_s16(v341, v344);
    397     int16x8_t v346 = vaddq_s16(v338, v345);
    398     int16x8_t v347 = vaddq_s16(v344, v337);
    399     int16x8_t v348_tmp = vqrdmulhq_n_s16(v347, 13573);
    400     int16x8_t v348 = vaddq_s16(v348_tmp, v347);
    401     int16x8_t v349 = vld1q_s16(in + in_stride * 113 + i);
    402     int16x8_t v350 = vld1q_s16(in + in_stride * 111 + i);
    403     int16x8_t v351 = vaddq_s16(v349, v350);
    404     int16x8_t v352 = vaddq_s16(v351, v341);
    405     int16x8_t v353 = vaddq_s16(v352, v347);
    406     int16x8_t v354 = vaddq_s16(v348, v353);
    407     int16x8_t v355 = vqrdmulhq_n_s16(v354, 17734);
    408     int16x8_t v356 = vaddq_s16(v346, v355);
    409     int16x8_t v357 = vqrdmulhq_n_s16(v356, 16705);
    410     int16x8_t v358 = vaddq_s16(v334, v357);
    411     int16x8_t v359 = vld1q_s16(in + in_stride * 9 + i);
    412     int16x8_t v360 = vld1q_s16(in + in_stride * 7 + i);
    413     int16x8_t v361 = vaddq_s16(v359, v360);
    414     int16x8_t v362_tmp = vqrdmulhq_n_s16(v361, 13573);
    415     int16x8_t v362 = vaddq_s16(v362_tmp, v361);
    416     int16x8_t v363 = vld1q_s16(in + in_stride * 73 + i);
    417     int16x8_t v364 = vld1q_s16(in + in_stride * 71 + i);
    418     int16x8_t v365 = vaddq_s16(v363, v364);
    419     int16x8_t v366 = vld1q_s16(in + in_stride * 57 + i);
    420     int16x8_t v367 = vld1q_s16(in + in_stride * 55 + i);
    421     int16x8_t v368 = vaddq_s16(v366, v367);
    422     int16x8_t v369 = vaddq_s16(v365, v368);
    423     int16x8_t v370 = vaddq_s16(v362, v369);
    424     int16x8_t v371 = vld1q_s16(in + in_stride * 41 + i);
    425     int16x8_t v372 = vld1q_s16(in + in_stride * 39 + i);
    426     int16x8_t v373 = vaddq_s16(v371, v372);
    427     int16x8_t v374 = vld1q_s16(in + in_stride * 25 + i);
    428     int16x8_t v375 = vld1q_s16(in + in_stride * 23 + i);
    429     int16x8_t v376 = vaddq_s16(v374, v375);
    430     int16x8_t v377 = vaddq_s16(v373, v376);
    431     int16x8_t v378_tmp = vqrdmulhq_n_s16(v377, 13573);
    432     int16x8_t v378 = vaddq_s16(v378_tmp, v377);
    433     int16x8_t v379 = vld1q_s16(in + in_stride * 105 + i);
    434     int16x8_t v380 = vld1q_s16(in + in_stride * 103 + i);
    435     int16x8_t v381 = vaddq_s16(v379, v380);
    436     int16x8_t v382 = vld1q_s16(in + in_stride * 89 + i);
    437     int16x8_t v383 = vld1q_s16(in + in_stride * 87 + i);
    438     int16x8_t v384 = vaddq_s16(v382, v383);
    439     int16x8_t v385 = vaddq_s16(v381, v384);
    440     int16x8_t v386 = vaddq_s16(v385, v377);
    441     int16x8_t v387 = vaddq_s16(v378, v386);
    442     int16x8_t v388 = vqrdmulhq_n_s16(v387, 17734);
    443     int16x8_t v389 = vaddq_s16(v370, v388);
    444     int16x8_t v390 = vaddq_s16(v376, v361);
    445     int16x8_t v391_tmp = vqrdmulhq_n_s16(v390, 13573);
    446     int16x8_t v391 = vaddq_s16(v391_tmp, v390);
    447     int16x8_t v392 = vaddq_s16(v384, v365);
    448     int16x8_t v393 = vaddq_s16(v368, v373);
    449     int16x8_t v394 = vaddq_s16(v392, v393);
    450     int16x8_t v395 = vaddq_s16(v391, v394);
    451     int16x8_t v396 = vaddq_s16(v393, v390);
    452     int16x8_t v397_tmp = vqrdmulhq_n_s16(v396, 13573);
    453     int16x8_t v397 = vaddq_s16(v397_tmp, v396);
    454     int16x8_t v398 = vld1q_s16(in + in_stride * 121 + i);
    455     int16x8_t v399 = vld1q_s16(in + in_stride * 119 + i);
    456     int16x8_t v400 = vaddq_s16(v398, v399);
    457     int16x8_t v401 = vaddq_s16(v400, v381);
    458     int16x8_t v402 = vaddq_s16(v401, v392);
    459     int16x8_t v403 = vaddq_s16(v402, v396);
    460     int16x8_t v404 = vaddq_s16(v397, v403);
    461     int16x8_t v405 = vqrdmulhq_n_s16(v404, 17734);
    462     int16x8_t v406 = vaddq_s16(v395, v405);
    463     int16x8_t v407 = vqrdmulhq_n_s16(v406, 16705);
    464     int16x8_t v408 = vaddq_s16(v389, v407);
    465     int16x8_t v409 = vqrdmulhq_n_s16(v408, 16463);
    466     int16x8_t v410 = vaddq_s16(v358, v409);
    467     int16x8_t v411 = vld1q_s16(in + in_stride * 5 + i);
    468     int16x8_t v412 = vld1q_s16(in + in_stride * 3 + i);
    469     int16x8_t v413 = vaddq_s16(v411, v412);
    470     int16x8_t v414_tmp = vqrdmulhq_n_s16(v413, 13573);
    471     int16x8_t v414 = vaddq_s16(v414_tmp, v413);
    472     int16x8_t v415 = vld1q_s16(in + in_stride * 69 + i);
    473     int16x8_t v416 = vld1q_s16(in + in_stride * 67 + i);
    474     int16x8_t v417 = vaddq_s16(v415, v416);
    475     int16x8_t v418 = vld1q_s16(in + in_stride * 61 + i);
    476     int16x8_t v419 = vld1q_s16(in + in_stride * 59 + i);
    477     int16x8_t v420 = vaddq_s16(v418, v419);
    478     int16x8_t v421 = vaddq_s16(v417, v420);
    479     int16x8_t v422 = vaddq_s16(v414, v421);
    480     int16x8_t v423 = vld1q_s16(in + in_stride * 37 + i);
    481     int16x8_t v424 = vld1q_s16(in + in_stride * 35 + i);
    482     int16x8_t v425 = vaddq_s16(v423, v424);
    483     int16x8_t v426 = vld1q_s16(in + in_stride * 29 + i);
    484     int16x8_t v427 = vld1q_s16(in + in_stride * 27 + i);
    485     int16x8_t v428 = vaddq_s16(v426, v427);
    486     int16x8_t v429 = vaddq_s16(v425, v428);
    487     int16x8_t v430_tmp = vqrdmulhq_n_s16(v429, 13573);
    488     int16x8_t v430 = vaddq_s16(v430_tmp, v429);
    489     int16x8_t v431 = vld1q_s16(in + in_stride * 101 + i);
    490     int16x8_t v432 = vld1q_s16(in + in_stride * 99 + i);
    491     int16x8_t v433 = vaddq_s16(v431, v432);
    492     int16x8_t v434 = vld1q_s16(in + in_stride * 93 + i);
    493     int16x8_t v435 = vld1q_s16(in + in_stride * 91 + i);
    494     int16x8_t v436 = vaddq_s16(v434, v435);
    495     int16x8_t v437 = vaddq_s16(v433, v436);
    496     int16x8_t v438 = vaddq_s16(v437, v429);
    497     int16x8_t v439 = vaddq_s16(v430, v438);
    498     int16x8_t v440 = vqrdmulhq_n_s16(v439, 17734);
    499     int16x8_t v441 = vaddq_s16(v422, v440);
    500     int16x8_t v442 = vld1q_s16(in + in_stride * 21 + i);
    501     int16x8_t v443 = vld1q_s16(in + in_stride * 19 + i);
    502     int16x8_t v444 = vaddq_s16(v442, v443);
    503     int16x8_t v445 = vld1q_s16(in + in_stride * 13 + i);
    504     int16x8_t v446 = vld1q_s16(in + in_stride * 11 + i);
    505     int16x8_t v447 = vaddq_s16(v445, v446);
    506     int16x8_t v448 = vaddq_s16(v444, v447);
    507     int16x8_t v449_tmp = vqrdmulhq_n_s16(v448, 13573);
    508     int16x8_t v449 = vaddq_s16(v449_tmp, v448);
    509     int16x8_t v450 = vld1q_s16(in + in_stride * 85 + i);
    510     int16x8_t v451 = vld1q_s16(in + in_stride * 83 + i);
    511     int16x8_t v452 = vaddq_s16(v450, v451);
    512     int16x8_t v453 = vld1q_s16(in + in_stride * 77 + i);
    513     int16x8_t v454 = vld1q_s16(in + in_stride * 75 + i);
    514     int16x8_t v455 = vaddq_s16(v453, v454);
    515     int16x8_t v456 = vaddq_s16(v452, v455);
    516     int16x8_t v457 = vld1q_s16(in + in_stride * 53 + i);
    517     int16x8_t v458 = vld1q_s16(in + in_stride * 51 + i);
    518     int16x8_t v459 = vaddq_s16(v457, v458);
    519     int16x8_t v460 = vld1q_s16(in + in_stride * 45 + i);
    520     int16x8_t v461 = vld1q_s16(in + in_stride * 43 + i);
    521     int16x8_t v462 = vaddq_s16(v460, v461);
    522     int16x8_t v463 = vaddq_s16(v459, v462);
    523     int16x8_t v464 = vaddq_s16(v456, v463);
    524     int16x8_t v465 = vaddq_s16(v449, v464);
    525     int16x8_t v466 = vaddq_s16(v463, v448);
    526     int16x8_t v467_tmp = vqrdmulhq_n_s16(v466, 13573);
    527     int16x8_t v467 = vaddq_s16(v467_tmp, v466);
    528     int16x8_t v468 = vld1q_s16(in + in_stride * 117 + i);
    529     int16x8_t v469 = vld1q_s16(in + in_stride * 115 + i);
    530     int16x8_t v470 = vaddq_s16(v468, v469);
    531     int16x8_t v471 = vld1q_s16(in + in_stride * 109 + i);
    532     int16x8_t v472 = vld1q_s16(in + in_stride * 107 + i);
    533     int16x8_t v473 = vaddq_s16(v471, v472);
    534     int16x8_t v474 = vaddq_s16(v470, v473);
    535     int16x8_t v475 = vaddq_s16(v474, v456);
    536     int16x8_t v476 = vaddq_s16(v475, v466);
    537     int16x8_t v477 = vaddq_s16(v467, v476);
    538     int16x8_t v478 = vqrdmulhq_n_s16(v477, 17734);
    539     int16x8_t v479 = vaddq_s16(v465, v478);
    540     int16x8_t v480 = vqrdmulhq_n_s16(v479, 16705);
    541     int16x8_t v481 = vaddq_s16(v441, v480);
    542     int16x8_t v482 = vaddq_s16(v447, v413);
    543     int16x8_t v483_tmp = vqrdmulhq_n_s16(v482, 13573);
    544     int16x8_t v483 = vaddq_s16(v483_tmp, v482);
    545     int16x8_t v484 = vaddq_s16(v455, v417);
    546     int16x8_t v485 = vaddq_s16(v420, v459);
    547     int16x8_t v486 = vaddq_s16(v484, v485);
    548     int16x8_t v487 = vaddq_s16(v483, v486);
    549     int16x8_t v488 = vaddq_s16(v462, v425);
    550     int16x8_t v489 = vaddq_s16(v428, v444);
    551     int16x8_t v490 = vaddq_s16(v488, v489);
    552     int16x8_t v491_tmp = vqrdmulhq_n_s16(v490, 13573);
    553     int16x8_t v491 = vaddq_s16(v491_tmp, v490);
    554     int16x8_t v492 = vaddq_s16(v473, v433);
    555     int16x8_t v493 = vaddq_s16(v436, v452);
    556     int16x8_t v494 = vaddq_s16(v492, v493);
    557     int16x8_t v495 = vaddq_s16(v494, v490);
    558     int16x8_t v496 = vaddq_s16(v491, v495);
    559     int16x8_t v497 = vqrdmulhq_n_s16(v496, 17734);
    560     int16x8_t v498 = vaddq_s16(v487, v497);
    561     int16x8_t v499 = vaddq_s16(v489, v482);
    562     int16x8_t v500_tmp = vqrdmulhq_n_s16(v499, 13573);
    563     int16x8_t v500 = vaddq_s16(v500_tmp, v499);
    564     int16x8_t v501 = vaddq_s16(v493, v484);
    565     int16x8_t v502 = vaddq_s16(v485, v488);
    566     int16x8_t v503 = vaddq_s16(v501, v502);
    567     int16x8_t v504 = vaddq_s16(v500, v503);
    568     int16x8_t v505 = vaddq_s16(v502, v499);
    569     int16x8_t v506_tmp = vqrdmulhq_n_s16(v505, 13573);
    570     int16x8_t v506 = vaddq_s16(v506_tmp, v505);
    571     int16x8_t v507 = vld1q_s16(in + in_stride * 125 + i);
    572     int16x8_t v508 = vld1q_s16(in + in_stride * 123 + i);
    573     int16x8_t v509 = vaddq_s16(v507, v508);
    574     int16x8_t v510 = vaddq_s16(v509, v470);
    575     int16x8_t v511 = vaddq_s16(v510, v492);
    576     int16x8_t v512 = vaddq_s16(v511, v501);
    577     int16x8_t v513 = vaddq_s16(v512, v505);
    578     int16x8_t v514 = vaddq_s16(v506, v513);
    579     int16x8_t v515 = vqrdmulhq_n_s16(v514, 17734);
    580     int16x8_t v516 = vaddq_s16(v504, v515);
    581     int16x8_t v517 = vqrdmulhq_n_s16(v516, 16705);
    582     int16x8_t v518 = vaddq_s16(v498, v517);
    583     int16x8_t v519 = vqrdmulhq_n_s16(v518, 16463);
    584     int16x8_t v520 = vaddq_s16(v481, v519);
    585     int16x8_t v521 = vqrdmulhq_n_s16(v520, 16404);
    586     int16x8_t v522 = vaddq_s16(v410, v521);
    587     int16x8_t v523 = vaddq_s16(v412, v318);
    588     int16x8_t v524_tmp = vqrdmulhq_n_s16(v523, 13573);
    589     int16x8_t v524 = vaddq_s16(v524_tmp, v523);
    590     int16x8_t v525 = vaddq_s16(v416, v320);
    591     int16x8_t v526 = vaddq_s16(v321, v418);
    592     int16x8_t v527 = vaddq_s16(v525, v526);
    593     int16x8_t v528 = vaddq_s16(v524, v527);
    594     int16x8_t v529 = vaddq_s16(v424, v324);
    595     int16x8_t v530 = vaddq_s16(v325, v426);
    596     int16x8_t v531 = vaddq_s16(v529, v530);
    597     int16x8_t v532_tmp = vqrdmulhq_n_s16(v531, 13573);
    598     int16x8_t v532 = vaddq_s16(v532_tmp, v531);
    599     int16x8_t v533 = vaddq_s16(v432, v328);
    600     int16x8_t v534 = vaddq_s16(v329, v434);
    601     int16x8_t v535 = vaddq_s16(v533, v534);
    602     int16x8_t v536 = vaddq_s16(v535, v531);
    603     int16x8_t v537 = vaddq_s16(v532, v536);
    604     int16x8_t v538 = vqrdmulhq_n_s16(v537, 17734);
    605     int16x8_t v539 = vaddq_s16(v528, v538);
    606     int16x8_t v540 = vaddq_s16(v443, v335);
    607     int16x8_t v541 = vaddq_s16(v336, v445);
    608     int16x8_t v542 = vaddq_s16(v540, v541);
    609     int16x8_t v543_tmp = vqrdmulhq_n_s16(v542, 13573);
    610     int16x8_t v543 = vaddq_s16(v543_tmp, v542);
    611     int16x8_t v544 = vaddq_s16(v451, v339);
    612     int16x8_t v545 = vaddq_s16(v340, v453);
    613     int16x8_t v546 = vaddq_s16(v544, v545);
    614     int16x8_t v547 = vaddq_s16(v458, v342);
    615     int16x8_t v548 = vaddq_s16(v343, v460);
    616     int16x8_t v549 = vaddq_s16(v547, v548);
    617     int16x8_t v550 = vaddq_s16(v546, v549);
    618     int16x8_t v551 = vaddq_s16(v543, v550);
    619     int16x8_t v552 = vaddq_s16(v549, v542);
    620     int16x8_t v553_tmp = vqrdmulhq_n_s16(v552, 13573);
    621     int16x8_t v553 = vaddq_s16(v553_tmp, v552);
    622     int16x8_t v554 = vaddq_s16(v469, v349);
    623     int16x8_t v555 = vaddq_s16(v350, v471);
    624     int16x8_t v556 = vaddq_s16(v554, v555);
    625     int16x8_t v557 = vaddq_s16(v556, v546);
    626     int16x8_t v558 = vaddq_s16(v557, v552);
    627     int16x8_t v559 = vaddq_s16(v553, v558);
    628     int16x8_t v560 = vqrdmulhq_n_s16(v559, 17734);
    629     int16x8_t v561 = vaddq_s16(v551, v560);
    630     int16x8_t v562 = vqrdmulhq_n_s16(v561, 16705);
    631     int16x8_t v563 = vaddq_s16(v539, v562);
    632     int16x8_t v564 = vaddq_s16(v446, v359);
    633     int16x8_t v565 = vaddq_s16(v360, v411);
    634     int16x8_t v566 = vaddq_s16(v564, v565);
    635     int16x8_t v567_tmp = vqrdmulhq_n_s16(v566, 13573);
    636     int16x8_t v567 = vaddq_s16(v567_tmp, v566);
    637     int16x8_t v568 = vaddq_s16(v454, v363);
    638     int16x8_t v569 = vaddq_s16(v364, v415);
    639     int16x8_t v570 = vaddq_s16(v568, v569);
    640     int16x8_t v571 = vaddq_s16(v419, v366);
    641     int16x8_t v572 = vaddq_s16(v367, v457);
    642     int16x8_t v573 = vaddq_s16(v571, v572);
    643     int16x8_t v574 = vaddq_s16(v570, v573);
    644     int16x8_t v575 = vaddq_s16(v567, v574);
    645     int16x8_t v576 = vaddq_s16(v461, v371);
    646     int16x8_t v577 = vaddq_s16(v372, v423);
    647     int16x8_t v578 = vaddq_s16(v576, v577);
    648     int16x8_t v579 = vaddq_s16(v427, v374);
    649     int16x8_t v580 = vaddq_s16(v375, v442);
    650     int16x8_t v581 = vaddq_s16(v579, v580);
    651     int16x8_t v582 = vaddq_s16(v578, v581);
    652     int16x8_t v583_tmp = vqrdmulhq_n_s16(v582, 13573);
    653     int16x8_t v583 = vaddq_s16(v583_tmp, v582);
    654     int16x8_t v584 = vaddq_s16(v472, v379);
    655     int16x8_t v585 = vaddq_s16(v380, v431);
    656     int16x8_t v586 = vaddq_s16(v584, v585);
    657     int16x8_t v587 = vaddq_s16(v435, v382);
    658     int16x8_t v588 = vaddq_s16(v383, v450);
    659     int16x8_t v589 = vaddq_s16(v587, v588);
    660     int16x8_t v590 = vaddq_s16(v586, v589);
    661     int16x8_t v591 = vaddq_s16(v590, v582);
    662     int16x8_t v592 = vaddq_s16(v583, v591);
    663     int16x8_t v593 = vqrdmulhq_n_s16(v592, 17734);
    664     int16x8_t v594 = vaddq_s16(v575, v593);
    665     int16x8_t v595 = vaddq_s16(v581, v566);
    666     int16x8_t v596_tmp = vqrdmulhq_n_s16(v595, 13573);
    667     int16x8_t v596 = vaddq_s16(v596_tmp, v595);
    668     int16x8_t v597 = vaddq_s16(v589, v570);
    669     int16x8_t v598 = vaddq_s16(v573, v578);
    670     int16x8_t v599 = vaddq_s16(v597, v598);
    671     int16x8_t v600 = vaddq_s16(v596, v599);
    672     int16x8_t v601 = vaddq_s16(v598, v595);
    673     int16x8_t v602_tmp = vqrdmulhq_n_s16(v601, 13573);
    674     int16x8_t v602 = vaddq_s16(v602_tmp, v601);
    675     int16x8_t v603 = vaddq_s16(v508, v398);
    676     int16x8_t v604 = vaddq_s16(v399, v468);
    677     int16x8_t v605 = vaddq_s16(v603, v604);
    678     int16x8_t v606 = vaddq_s16(v605, v586);
    679     int16x8_t v607 = vaddq_s16(v606, v597);
    680     int16x8_t v608 = vaddq_s16(v607, v601);
    681     int16x8_t v609 = vaddq_s16(v602, v608);
    682     int16x8_t v610 = vqrdmulhq_n_s16(v609, 17734);
    683     int16x8_t v611 = vaddq_s16(v600, v610);
    684     int16x8_t v612 = vqrdmulhq_n_s16(v611, 16705);
    685     int16x8_t v613 = vaddq_s16(v594, v612);
    686     int16x8_t v614 = vqrdmulhq_n_s16(v613, 16463);
    687     int16x8_t v615 = vaddq_s16(v563, v614);
    688     int16x8_t v616 = vaddq_s16(v565, v523);
    689     int16x8_t v617_tmp = vqrdmulhq_n_s16(v616, 13573);
    690     int16x8_t v617 = vaddq_s16(v617_tmp, v616);
    691     int16x8_t v618 = vaddq_s16(v569, v525);
    692     int16x8_t v619 = vaddq_s16(v526, v571);
    693     int16x8_t v620 = vaddq_s16(v618, v619);
    694     int16x8_t v621 = vaddq_s16(v617, v620);
    695     int16x8_t v622 = vaddq_s16(v577, v529);
    696     int16x8_t v623 = vaddq_s16(v530, v579);
    697     int16x8_t v624 = vaddq_s16(v622, v623);
    698     int16x8_t v625_tmp = vqrdmulhq_n_s16(v624, 13573);
    699     int16x8_t v625 = vaddq_s16(v625_tmp, v624);
    700     int16x8_t v626 = vaddq_s16(v585, v533);
    701     int16x8_t v627 = vaddq_s16(v534, v587);
    702     int16x8_t v628 = vaddq_s16(v626, v627);
    703     int16x8_t v629 = vaddq_s16(v628, v624);
    704     int16x8_t v630 = vaddq_s16(v625, v629);
    705     int16x8_t v631 = vqrdmulhq_n_s16(v630, 17734);
    706     int16x8_t v632 = vaddq_s16(v621, v631);
    707     int16x8_t v633 = vaddq_s16(v580, v540);
    708     int16x8_t v634 = vaddq_s16(v541, v564);
    709     int16x8_t v635 = vaddq_s16(v633, v634);
    710     int16x8_t v636_tmp = vqrdmulhq_n_s16(v635, 13573);
    711     int16x8_t v636 = vaddq_s16(v636_tmp, v635);
    712     int16x8_t v637 = vaddq_s16(v588, v544);
    713     int16x8_t v638 = vaddq_s16(v545, v568);
    714     int16x8_t v639 = vaddq_s16(v637, v638);
    715     int16x8_t v640 = vaddq_s16(v572, v547);
    716     int16x8_t v641 = vaddq_s16(v548, v576);
    717     int16x8_t v642 = vaddq_s16(v640, v641);
    718     int16x8_t v643 = vaddq_s16(v639, v642);
    719     int16x8_t v644 = vaddq_s16(v636, v643);
    720     int16x8_t v645 = vaddq_s16(v642, v635);
    721     int16x8_t v646_tmp = vqrdmulhq_n_s16(v645, 13573);
    722     int16x8_t v646 = vaddq_s16(v646_tmp, v645);
    723     int16x8_t v647 = vaddq_s16(v604, v554);
    724     int16x8_t v648 = vaddq_s16(v555, v584);
    725     int16x8_t v649 = vaddq_s16(v647, v648);
    726     int16x8_t v650 = vaddq_s16(v649, v639);
    727     int16x8_t v651 = vaddq_s16(v650, v645);
    728     int16x8_t v652 = vaddq_s16(v646, v651);
    729     int16x8_t v653 = vqrdmulhq_n_s16(v652, 17734);
    730     int16x8_t v654 = vaddq_s16(v644, v653);
    731     int16x8_t v655 = vqrdmulhq_n_s16(v654, 16705);
    732     int16x8_t v656 = vaddq_s16(v632, v655);
    733     int16x8_t v657 = vaddq_s16(v634, v616);
    734     int16x8_t v658_tmp = vqrdmulhq_n_s16(v657, 13573);
    735     int16x8_t v658 = vaddq_s16(v658_tmp, v657);
    736     int16x8_t v659 = vaddq_s16(v638, v618);
    737     int16x8_t v660 = vaddq_s16(v619, v640);
    738     int16x8_t v661 = vaddq_s16(v659, v660);
    739     int16x8_t v662 = vaddq_s16(v658, v661);
    740     int16x8_t v663 = vaddq_s16(v641, v622);
    741     int16x8_t v664 = vaddq_s16(v623, v633);
    742     int16x8_t v665 = vaddq_s16(v663, v664);
    743     int16x8_t v666_tmp = vqrdmulhq_n_s16(v665, 13573);
    744     int16x8_t v666 = vaddq_s16(v666_tmp, v665);
    745     int16x8_t v667 = vaddq_s16(v648, v626);
    746     int16x8_t v668 = vaddq_s16(v627, v637);
    747     int16x8_t v669 = vaddq_s16(v667, v668);
    748     int16x8_t v670 = vaddq_s16(v669, v665);
    749     int16x8_t v671 = vaddq_s16(v666, v670);
    750     int16x8_t v672 = vqrdmulhq_n_s16(v671, 17734);
    751     int16x8_t v673 = vaddq_s16(v662, v672);
    752     int16x8_t v674 = vaddq_s16(v664, v657);
    753     int16x8_t v675_tmp = vqrdmulhq_n_s16(v674, 13573);
    754     int16x8_t v675 = vaddq_s16(v675_tmp, v674);
    755     int16x8_t v676 = vaddq_s16(v668, v659);
    756     int16x8_t v677 = vaddq_s16(v660, v663);
    757     int16x8_t v678 = vaddq_s16(v676, v677);
    758     int16x8_t v679 = vaddq_s16(v675, v678);
    759     int16x8_t v680 = vaddq_s16(v677, v674);
    760     int16x8_t v681_tmp = vqrdmulhq_n_s16(v680, 13573);
    761     int16x8_t v681 = vaddq_s16(v681_tmp, v680);
    762     int16x8_t v682 = vld1q_s16(in + in_stride * 127 + i);
    763     int16x8_t v683 = vaddq_s16(v682, v507);
    764     int16x8_t v684 = vaddq_s16(v683, v603);
    765     int16x8_t v685 = vaddq_s16(v684, v647);
    766     int16x8_t v686 = vaddq_s16(v685, v667);
    767     int16x8_t v687 = vaddq_s16(v686, v676);
    768     int16x8_t v688 = vaddq_s16(v687, v680);
    769     int16x8_t v689 = vaddq_s16(v681, v688);
    770     int16x8_t v690 = vqrdmulhq_n_s16(v689, 17734);
    771     int16x8_t v691 = vaddq_s16(v679, v690);
    772     int16x8_t v692 = vqrdmulhq_n_s16(v691, 16705);
    773     int16x8_t v693 = vaddq_s16(v673, v692);
    774     int16x8_t v694 = vqrdmulhq_n_s16(v693, 16463);
    775     int16x8_t v695 = vaddq_s16(v656, v694);
    776     int16x8_t v696 = vqrdmulhq_n_s16(v695, 16404);
    777     int16x8_t v697 = vaddq_s16(v615, v696);
    778     int16x8_t v698 = vqrdmulhq_n_s16(v697, 16389);
    779     int16x8_t v699 = vaddq_s16(v522, v698);
    780     int16x8_t v700 = vqrdmulhq_n_s16(v699, 16385);
    781     int16x8_t v701 = vaddq_s16(v317, v700);
    782     int16x8_t v702 = vsubq_s16(v0, v1);
    783     int16x8_t v703 = vsubq_s16(v4, v6);
    784     int16x8_t v704_tmp = vqrdmulhq_n_s16(v703, 10045);
    785     int16x8_t v704 = vaddq_s16(v704_tmp, v703);
    786     int16x8_t v705 = vaddq_s16(v702, v704);
    787     int16x8_t v706 = vsubq_s16(v11, v14);
    788     int16x8_t v707 = vsubq_s16(v17, v20);
    789     int16x8_t v708_tmp = vqrdmulhq_n_s16(v707, 10045);
    790     int16x8_t v708 = vaddq_s16(v708_tmp, v707);
    791     int16x8_t v709 = vaddq_s16(v706, v708);
    792     int16x8_t v710 = vqrdmulhq_n_s16(v709, 19705);
    793     int16x8_t v711 = vaddq_s16(v705, v710);
    794     int16x8_t v712 = vsubq_s16(v27, v30);
    795     int16x8_t v713 = vsubq_s16(v35, v39);
    796     int16x8_t v714_tmp = vqrdmulhq_n_s16(v713, 10045);
    797     int16x8_t v714 = vaddq_s16(v714_tmp, v713);
    798     int16x8_t v715 = vaddq_s16(v712, v714);
    799     int16x8_t v716 = vsubq_s16(v44, v47);
    800     int16x8_t v717 = vsubq_s16(v50, v54);
    801     int16x8_t v718_tmp = vqrdmulhq_n_s16(v717, 10045);
    802     int16x8_t v718 = vaddq_s16(v718_tmp, v717);
    803     int16x8_t v719 = vaddq_s16(v716, v718);
    804     int16x8_t v720 = vqrdmulhq_n_s16(v719, 19705);
    805     int16x8_t v721 = vaddq_s16(v715, v720);
    806     int16x8_t v722 = vqrdmulhq_n_s16(v721, 17121);
    807     int16x8_t v723 = vaddq_s16(v711, v722);
    808     int16x8_t v724 = vsubq_s16(v63, v66);
    809     int16x8_t v725 = vsubq_s16(v71, v75);
    810     int16x8_t v726_tmp = vqrdmulhq_n_s16(v725, 10045);
    811     int16x8_t v726 = vaddq_s16(v726_tmp, v725);
    812     int16x8_t v727 = vaddq_s16(v724, v726);
    813     int16x8_t v728 = vsubq_s16(v82, v89);
    814     int16x8_t v729 = vsubq_s16(v92, v97);
    815     int16x8_t v730_tmp = vqrdmulhq_n_s16(v729, 10045);
    816     int16x8_t v730 = vaddq_s16(v730_tmp, v729);
    817     int16x8_t v731 = vaddq_s16(v728, v730);
    818     int16x8_t v732 = vqrdmulhq_n_s16(v731, 19705);
    819     int16x8_t v733 = vaddq_s16(v727, v732);
    820     int16x8_t v734 = vsubq_s16(v104, v107);
    821     int16x8_t v735 = vsubq_s16(v112, v116);
    822     int16x8_t v736_tmp = vqrdmulhq_n_s16(v735, 10045);
    823     int16x8_t v736 = vaddq_s16(v736_tmp, v735);
    824     int16x8_t v737 = vaddq_s16(v734, v736);
    825     int16x8_t v738 = vsubq_s16(v121, v124);
    826     int16x8_t v739 = vsubq_s16(v127, v132);
    827     int16x8_t v740_tmp = vqrdmulhq_n_s16(v739, 10045);
    828     int16x8_t v740 = vaddq_s16(v740_tmp, v739);
    829     int16x8_t v741 = vaddq_s16(v738, v740);
    830     int16x8_t v742 = vqrdmulhq_n_s16(v741, 19705);
    831     int16x8_t v743 = vaddq_s16(v737, v742);
    832     int16x8_t v744 = vqrdmulhq_n_s16(v743, 17121);
    833     int16x8_t v745 = vaddq_s16(v733, v744);
    834     int16x8_t v746 = vqrdmulhq_n_s16(v745, 16563);
    835     int16x8_t v747 = vaddq_s16(v723, v746);
    836     int16x8_t v748 = vsubq_s16(v143, v146);
    837     int16x8_t v749 = vsubq_s16(v151, v155);
    838     int16x8_t v750_tmp = vqrdmulhq_n_s16(v749, 10045);
    839     int16x8_t v750 = vaddq_s16(v750_tmp, v749);
    840     int16x8_t v751 = vaddq_s16(v748, v750);
    841     int16x8_t v752 = vsubq_s16(v162, v169);
    842     int16x8_t v753 = vqrdmulhq_n_s16(v752, 19705);
    843     int16x8_t v754 = vsubq_s16(v172, v177);
    844     int16x8_t v755 = vqrdmulhq_n_s16(v754, 25746);
    845     int16x8_t v756 = vaddq_s16(v753, v755);
    846     int16x8_t v757 = vaddq_s16(v751, v756);
    847     int16x8_t v758 = vsubq_s16(v186, v193);
    848     int16x8_t v759 = vsubq_s16(v202, v210);
    849     int16x8_t v760_tmp = vqrdmulhq_n_s16(v759, 10045);
    850     int16x8_t v760 = vaddq_s16(v760_tmp, v759);
    851     int16x8_t v761 = vaddq_s16(v758, v760);
    852     int16x8_t v762 = vsubq_s16(v215, v218);
    853     int16x8_t v763 = vsubq_s16(v221, v227);
    854     int16x8_t v764_tmp = vqrdmulhq_n_s16(v763, 10045);
    855     int16x8_t v764 = vaddq_s16(v764_tmp, v763);
    856     int16x8_t v765 = vaddq_s16(v762, v764);
    857     int16x8_t v766 = vqrdmulhq_n_s16(v765, 19705);
    858     int16x8_t v767 = vaddq_s16(v761, v766);
    859     int16x8_t v768 = vqrdmulhq_n_s16(v767, 17121);
    860     int16x8_t v769 = vaddq_s16(v757, v768);
    861     int16x8_t v770 = vsubq_s16(v236, v239);
    862     int16x8_t v771 = vsubq_s16(v244, v248);
    863     int16x8_t v772_tmp = vqrdmulhq_n_s16(v771, 10045);
    864     int16x8_t v772 = vaddq_s16(v772_tmp, v771);
    865     int16x8_t v773 = vaddq_s16(v770, v772);
    866     int16x8_t v774 = vsubq_s16(v255, v262);
    867     int16x8_t v775 = vsubq_s16(v265, v270);
    868     int16x8_t v776_tmp = vqrdmulhq_n_s16(v775, 10045);
    869     int16x8_t v776 = vaddq_s16(v776_tmp, v775);
    870     int16x8_t v777 = vaddq_s16(v774, v776);
    871     int16x8_t v778 = vqrdmulhq_n_s16(v777, 19705);
    872     int16x8_t v779 = vaddq_s16(v773, v778);
    873     int16x8_t v780 = vsubq_s16(v277, v280);
    874     int16x8_t v781 = vsubq_s16(v285, v289);
    875     int16x8_t v782_tmp = vqrdmulhq_n_s16(v781, 10045);
    876     int16x8_t v782 = vaddq_s16(v782_tmp, v781);
    877     int16x8_t v783 = vaddq_s16(v780, v782);
    878     int16x8_t v784 = vsubq_s16(v294, v297);
    879     int16x8_t v785 = vsubq_s16(v300, v306);
    880     int16x8_t v786_tmp = vqrdmulhq_n_s16(v785, 10045);
    881     int16x8_t v786 = vaddq_s16(v786_tmp, v785);
    882     int16x8_t v787 = vaddq_s16(v784, v786);
    883     int16x8_t v788 = vqrdmulhq_n_s16(v787, 19705);
    884     int16x8_t v789 = vaddq_s16(v783, v788);
    885     int16x8_t v790 = vqrdmulhq_n_s16(v789, 17121);
    886     int16x8_t v791 = vaddq_s16(v779, v790);
    887     int16x8_t v792 = vqrdmulhq_n_s16(v791, 16563);
    888     int16x8_t v793 = vaddq_s16(v769, v792);
    889     int16x8_t v794 = vqrdmulhq_n_s16(v793, 16429);
    890     int16x8_t v795 = vaddq_s16(v747, v794);
    891     int16x8_t v796 = vsubq_s16(v319, v322);
    892     int16x8_t v797 = vsubq_s16(v327, v331);
    893     int16x8_t v798_tmp = vqrdmulhq_n_s16(v797, 10045);
    894     int16x8_t v798 = vaddq_s16(v798_tmp, v797);
    895     int16x8_t v799 = vaddq_s16(v796, v798);
    896     int16x8_t v800 = vsubq_s16(v338, v345);
    897     int16x8_t v801 = vsubq_s16(v348, v353);
    898     int16x8_t v802_tmp = vqrdmulhq_n_s16(v801, 10045);
    899     int16x8_t v802 = vaddq_s16(v802_tmp, v801);
    900     int16x8_t v803 = vaddq_s16(v800, v802);
    901     int16x8_t v804 = vqrdmulhq_n_s16(v803, 19705);
    902     int16x8_t v805 = vaddq_s16(v799, v804);
    903     int16x8_t v806 = vsubq_s16(v362, v369);
    904     int16x8_t v807 = vsubq_s16(v378, v386);
    905     int16x8_t v808_tmp = vqrdmulhq_n_s16(v807, 10045);
    906     int16x8_t v808 = vaddq_s16(v808_tmp, v807);
    907     int16x8_t v809 = vaddq_s16(v806, v808);
    908     int16x8_t v810 = vsubq_s16(v391, v394);
    909     int16x8_t v811 = vsubq_s16(v397, v403);
    910     int16x8_t v812_tmp = vqrdmulhq_n_s16(v811, 10045);
    911     int16x8_t v812 = vaddq_s16(v812_tmp, v811);
    912     int16x8_t v813 = vaddq_s16(v810, v812);
    913     int16x8_t v814 = vqrdmulhq_n_s16(v813, 19705);
    914     int16x8_t v815 = vaddq_s16(v809, v814);
    915     int16x8_t v816 = vqrdmulhq_n_s16(v815, 17121);
    916     int16x8_t v817 = vaddq_s16(v805, v816);
    917     int16x8_t v818 = vsubq_s16(v414, v421);
    918     int16x8_t v819 = vsubq_s16(v430, v438);
    919     int16x8_t v820_tmp = vqrdmulhq_n_s16(v819, 10045);
    920     int16x8_t v820 = vaddq_s16(v820_tmp, v819);
    921     int16x8_t v821 = vaddq_s16(v818, v820);
    922     int16x8_t v822 = vsubq_s16(v449, v464);
    923     int16x8_t v823 = vsubq_s16(v467, v476);
    924     int16x8_t v824_tmp = vqrdmulhq_n_s16(v823, 10045);
    925     int16x8_t v824 = vaddq_s16(v824_tmp, v823);
    926     int16x8_t v825 = vaddq_s16(v822, v824);
    927     int16x8_t v826 = vqrdmulhq_n_s16(v825, 19705);
    928     int16x8_t v827 = vaddq_s16(v821, v826);
    929     int16x8_t v828 = vsubq_s16(v483, v486);
    930     int16x8_t v829 = vsubq_s16(v491, v495);
    931     int16x8_t v830_tmp = vqrdmulhq_n_s16(v829, 10045);
    932     int16x8_t v830 = vaddq_s16(v830_tmp, v829);
    933     int16x8_t v831 = vaddq_s16(v828, v830);
    934     int16x8_t v832 = vsubq_s16(v500, v503);
    935     int16x8_t v833 = vsubq_s16(v506, v513);
    936     int16x8_t v834_tmp = vqrdmulhq_n_s16(v833, 10045);
    937     int16x8_t v834 = vaddq_s16(v834_tmp, v833);
    938     int16x8_t v835 = vaddq_s16(v832, v834);
    939     int16x8_t v836 = vqrdmulhq_n_s16(v835, 19705);
    940     int16x8_t v837 = vaddq_s16(v831, v836);
    941     int16x8_t v838 = vqrdmulhq_n_s16(v837, 17121);
    942     int16x8_t v839 = vaddq_s16(v827, v838);
    943     int16x8_t v840 = vqrdmulhq_n_s16(v839, 16563);
    944     int16x8_t v841 = vaddq_s16(v817, v840);
    945     int16x8_t v842 = vsubq_s16(v524, v527);
    946     int16x8_t v843 = vsubq_s16(v532, v536);
    947     int16x8_t v844_tmp = vqrdmulhq_n_s16(v843, 10045);
    948     int16x8_t v844 = vaddq_s16(v844_tmp, v843);
    949     int16x8_t v845 = vaddq_s16(v842, v844);
    950     int16x8_t v846 = vsubq_s16(v543, v550);
    951     int16x8_t v847 = vsubq_s16(v553, v558);
    952     int16x8_t v848_tmp = vqrdmulhq_n_s16(v847, 10045);
    953     int16x8_t v848 = vaddq_s16(v848_tmp, v847);
    954     int16x8_t v849 = vaddq_s16(v846, v848);
    955     int16x8_t v850 = vqrdmulhq_n_s16(v849, 19705);
    956     int16x8_t v851 = vaddq_s16(v845, v850);
    957     int16x8_t v852 = vsubq_s16(v567, v574);
    958     int16x8_t v853 = vsubq_s16(v583, v591);
    959     int16x8_t v854_tmp = vqrdmulhq_n_s16(v853, 10045);
    960     int16x8_t v854 = vaddq_s16(v854_tmp, v853);
    961     int16x8_t v855 = vaddq_s16(v852, v854);
    962     int16x8_t v856 = vsubq_s16(v596, v599);
    963     int16x8_t v857 = vsubq_s16(v602, v608);
    964     int16x8_t v858_tmp = vqrdmulhq_n_s16(v857, 10045);
    965     int16x8_t v858 = vaddq_s16(v858_tmp, v857);
    966     int16x8_t v859 = vaddq_s16(v856, v858);
    967     int16x8_t v860 = vqrdmulhq_n_s16(v859, 19705);
    968     int16x8_t v861 = vaddq_s16(v855, v860);
    969     int16x8_t v862 = vqrdmulhq_n_s16(v861, 17121);
    970     int16x8_t v863 = vaddq_s16(v851, v862);
    971     int16x8_t v864 = vsubq_s16(v617, v620);
    972     int16x8_t v865 = vsubq_s16(v625, v629);
    973     int16x8_t v866_tmp = vqrdmulhq_n_s16(v865, 10045);
    974     int16x8_t v866 = vaddq_s16(v866_tmp, v865);
    975     int16x8_t v867 = vaddq_s16(v864, v866);
    976     int16x8_t v868 = vsubq_s16(v636, v643);
    977     int16x8_t v869 = vsubq_s16(v646, v651);
    978     int16x8_t v870_tmp = vqrdmulhq_n_s16(v869, 10045);
    979     int16x8_t v870 = vaddq_s16(v870_tmp, v869);
    980     int16x8_t v871 = vaddq_s16(v868, v870);
    981     int16x8_t v872 = vqrdmulhq_n_s16(v871, 19705);
    982     int16x8_t v873 = vaddq_s16(v867, v872);
    983     int16x8_t v874 = vsubq_s16(v658, v661);
    984     int16x8_t v875 = vsubq_s16(v666, v670);
    985     int16x8_t v876_tmp = vqrdmulhq_n_s16(v875, 10045);
    986     int16x8_t v876 = vaddq_s16(v876_tmp, v875);
    987     int16x8_t v877 = vaddq_s16(v874, v876);
    988     int16x8_t v878 = vsubq_s16(v675, v678);
    989     int16x8_t v879 = vsubq_s16(v681, v688);
    990     int16x8_t v880_tmp = vqrdmulhq_n_s16(v879, 10045);
    991     int16x8_t v880 = vaddq_s16(v880_tmp, v879);
    992     int16x8_t v881 = vaddq_s16(v878, v880);
    993     int16x8_t v882 = vqrdmulhq_n_s16(v881, 19705);
    994     int16x8_t v883 = vaddq_s16(v877, v882);
    995     int16x8_t v884 = vqrdmulhq_n_s16(v883, 17121);
    996     int16x8_t v885 = vaddq_s16(v873, v884);
    997     int16x8_t v886 = vqrdmulhq_n_s16(v885, 16563);
    998     int16x8_t v887 = vaddq_s16(v863, v886);
    999     int16x8_t v888 = vqrdmulhq_n_s16(v887, 16429);
   1000     int16x8_t v889 = vaddq_s16(v841, v888);
   1001     int16x8_t v890 = vqrdmulhq_n_s16(v889, 16395);
   1002     int16x8_t v891 = vaddq_s16(v795, v890);
   1003     int16x8_t v892 = vsubq_s16(v702, v704);
   1004     int16x8_t v893 = vsubq_s16(v706, v708);
   1005     int16x8_t v894 = vqrdmulhq_n_s16(v893, 29490);
   1006     int16x8_t v895 = vaddq_s16(v892, v894);
   1007     int16x8_t v896 = vsubq_s16(v712, v714);
   1008     int16x8_t v897 = vsubq_s16(v716, v718);
   1009     int16x8_t v898 = vqrdmulhq_n_s16(v897, 29490);
   1010     int16x8_t v899 = vaddq_s16(v896, v898);
   1011     int16x8_t v900 = vqrdmulhq_n_s16(v899, 18578);
   1012     int16x8_t v901 = vaddq_s16(v895, v900);
   1013     int16x8_t v902 = vsubq_s16(v724, v726);
   1014     int16x8_t v903 = vsubq_s16(v728, v730);
   1015     int16x8_t v904 = vqrdmulhq_n_s16(v903, 29490);
   1016     int16x8_t v905 = vaddq_s16(v902, v904);
   1017     int16x8_t v906 = vsubq_s16(v734, v736);
   1018     int16x8_t v907 = vsubq_s16(v738, v740);
   1019     int16x8_t v908 = vqrdmulhq_n_s16(v907, 29490);
   1020     int16x8_t v909 = vaddq_s16(v906, v908);
   1021     int16x8_t v910 = vqrdmulhq_n_s16(v909, 18578);
   1022     int16x8_t v911 = vaddq_s16(v905, v910);
   1023     int16x8_t v912 = vqrdmulhq_n_s16(v911, 16890);
   1024     int16x8_t v913 = vaddq_s16(v901, v912);
   1025     int16x8_t v914 = vsubq_s16(v748, v750);
   1026     int16x8_t v915_tmp = vqrdmulhq_n_s16(v754, 10045);
   1027     int16x8_t v915 = vaddq_s16(v915_tmp, v754);
   1028     int16x8_t v916 = vsubq_s16(v752, v915);
   1029     int16x8_t v917 = vqrdmulhq_n_s16(v916, 29490);
   1030     int16x8_t v918 = vaddq_s16(v914, v917);
   1031     int16x8_t v919 = vsubq_s16(v758, v760);
   1032     int16x8_t v920 = vsubq_s16(v762, v764);
   1033     int16x8_t v921 = vqrdmulhq_n_s16(v920, 29490);
   1034     int16x8_t v922 = vaddq_s16(v919, v921);
   1035     int16x8_t v923 = vqrdmulhq_n_s16(v922, 18578);
   1036     int16x8_t v924 = vaddq_s16(v918, v923);
   1037     int16x8_t v925 = vsubq_s16(v770, v772);
   1038     int16x8_t v926 = vsubq_s16(v774, v776);
   1039     int16x8_t v927 = vqrdmulhq_n_s16(v926, 29490);
   1040     int16x8_t v928 = vaddq_s16(v925, v927);
   1041     int16x8_t v929 = vsubq_s16(v780, v782);
   1042     int16x8_t v930 = vsubq_s16(v784, v786);
   1043     int16x8_t v931 = vqrdmulhq_n_s16(v930, 29490);
   1044     int16x8_t v932 = vaddq_s16(v929, v931);
   1045     int16x8_t v933 = vqrdmulhq_n_s16(v932, 18578);
   1046     int16x8_t v934 = vaddq_s16(v928, v933);
   1047     int16x8_t v935 = vqrdmulhq_n_s16(v934, 16890);
   1048     int16x8_t v936 = vaddq_s16(v924, v935);
   1049     int16x8_t v937 = vqrdmulhq_n_s16(v936, 16508);
   1050     int16x8_t v938 = vaddq_s16(v913, v937);
   1051     int16x8_t v939 = vsubq_s16(v796, v798);
   1052     int16x8_t v940 = vsubq_s16(v800, v802);
   1053     int16x8_t v941 = vqrdmulhq_n_s16(v940, 29490);
   1054     int16x8_t v942 = vaddq_s16(v939, v941);
   1055     int16x8_t v943 = vsubq_s16(v806, v808);
   1056     int16x8_t v944 = vsubq_s16(v810, v812);
   1057     int16x8_t v945 = vqrdmulhq_n_s16(v944, 29490);
   1058     int16x8_t v946 = vaddq_s16(v943, v945);
   1059     int16x8_t v947 = vqrdmulhq_n_s16(v946, 18578);
   1060     int16x8_t v948 = vaddq_s16(v942, v947);
   1061     int16x8_t v949 = vsubq_s16(v818, v820);
   1062     int16x8_t v950 = vsubq_s16(v822, v824);
   1063     int16x8_t v951 = vqrdmulhq_n_s16(v950, 29490);
   1064     int16x8_t v952 = vaddq_s16(v949, v951);
   1065     int16x8_t v953 = vsubq_s16(v828, v830);
   1066     int16x8_t v954 = vsubq_s16(v832, v834);
   1067     int16x8_t v955 = vqrdmulhq_n_s16(v954, 29490);
   1068     int16x8_t v956 = vaddq_s16(v953, v955);
   1069     int16x8_t v957 = vqrdmulhq_n_s16(v956, 18578);
   1070     int16x8_t v958 = vaddq_s16(v952, v957);
   1071     int16x8_t v959 = vqrdmulhq_n_s16(v958, 16890);
   1072     int16x8_t v960 = vaddq_s16(v948, v959);
   1073     int16x8_t v961 = vsubq_s16(v842, v844);
   1074     int16x8_t v962 = vsubq_s16(v846, v848);
   1075     int16x8_t v963 = vqrdmulhq_n_s16(v962, 29490);
   1076     int16x8_t v964 = vaddq_s16(v961, v963);
   1077     int16x8_t v965 = vsubq_s16(v852, v854);
   1078     int16x8_t v966 = vsubq_s16(v856, v858);
   1079     int16x8_t v967 = vqrdmulhq_n_s16(v966, 29490);
   1080     int16x8_t v968 = vaddq_s16(v965, v967);
   1081     int16x8_t v969 = vqrdmulhq_n_s16(v968, 18578);
   1082     int16x8_t v970 = vaddq_s16(v964, v969);
   1083     int16x8_t v971 = vsubq_s16(v864, v866);
   1084     int16x8_t v972 = vsubq_s16(v868, v870);
   1085     int16x8_t v973 = vqrdmulhq_n_s16(v972, 29490);
   1086     int16x8_t v974 = vaddq_s16(v971, v973);
   1087     int16x8_t v975 = vsubq_s16(v874, v876);
   1088     int16x8_t v976 = vsubq_s16(v878, v880);
   1089     int16x8_t v977 = vqrdmulhq_n_s16(v976, 29490);
   1090     int16x8_t v978 = vaddq_s16(v975, v977);
   1091     int16x8_t v979 = vqrdmulhq_n_s16(v978, 18578);
   1092     int16x8_t v980 = vaddq_s16(v974, v979);
   1093     int16x8_t v981 = vqrdmulhq_n_s16(v980, 16890);
   1094     int16x8_t v982 = vaddq_s16(v970, v981);
   1095     int16x8_t v983 = vqrdmulhq_n_s16(v982, 16508);
   1096     int16x8_t v984 = vaddq_s16(v960, v983);
   1097     int16x8_t v985 = vqrdmulhq_n_s16(v984, 16415);
   1098     int16x8_t v986 = vaddq_s16(v938, v985);
   1099     int16x8_t v987 = vsubq_s16(v2, v8);
   1100     int16x8_t v988 = vsubq_s16(v15, v22);
   1101     int16x8_t v989_tmp = vqrdmulhq_n_s16(v988, 18446);
   1102     int16x8_t v989 = vmlaq_n_s16(v989_tmp, v988, 2);
   1103     int16x8_t v990 = vaddq_s16(v987, v989);
   1104     int16x8_t v991 = vsubq_s16(v31, v41);
   1105     int16x8_t v992 = vsubq_s16(v48, v56);
   1106     int16x8_t v993_tmp = vqrdmulhq_n_s16(v992, 18446);
   1107     int16x8_t v993 = vmlaq_n_s16(v993_tmp, v992, 2);
   1108     int16x8_t v994 = vaddq_s16(v991, v993);
   1109     int16x8_t v995 = vqrdmulhq_n_s16(v994, 21195);
   1110     int16x8_t v996 = vaddq_s16(v990, v995);
   1111     int16x8_t v997 = vsubq_s16(v67, v77);
   1112     int16x8_t v998 = vsubq_s16(v90, v99);
   1113     int16x8_t v999_tmp = vqrdmulhq_n_s16(v998, 18446);
   1114     int16x8_t v999 = vmlaq_n_s16(v999_tmp, v998, 2);
   1115     int16x8_t v1000 = vaddq_s16(v997, v999);
   1116     int16x8_t v1001 = vsubq_s16(v108, v118);
   1117     int16x8_t v1002 = vsubq_s16(v125, v134);
   1118     int16x8_t v1003_tmp = vqrdmulhq_n_s16(v1002, 18446);
   1119     int16x8_t v1003 = vmlaq_n_s16(v1003_tmp, v1002, 2);
   1120     int16x8_t v1004 = vaddq_s16(v1001, v1003);
   1121     int16x8_t v1005 = vqrdmulhq_n_s16(v1004, 21195);
   1122     int16x8_t v1006 = vaddq_s16(v1000, v1005);
   1123     int16x8_t v1007 = vqrdmulhq_n_s16(v1006, 17401);
   1124     int16x8_t v1008 = vaddq_s16(v996, v1007);
   1125     int16x8_t v1009 = vsubq_s16(v147, v157);
   1126     int16x8_t v1010 = vsubq_s16(v170, v179);
   1127     int16x8_t v1011_tmp = vqrdmulhq_n_s16(v1010, 18446);
   1128     int16x8_t v1011 = vmlaq_n_s16(v1011_tmp, v1010, 2);
   1129     int16x8_t v1012 = vaddq_s16(v1009, v1011);
   1130     int16x8_t v1013 = vsubq_s16(v194, v212);
   1131     int16x8_t v1014 = vsubq_s16(v219, v229);
   1132     int16x8_t v1015_tmp = vqrdmulhq_n_s16(v1014, 18446);
   1133     int16x8_t v1015 = vmlaq_n_s16(v1015_tmp, v1014, 2);
   1134     int16x8_t v1016 = vaddq_s16(v1013, v1015);
   1135     int16x8_t v1017 = vqrdmulhq_n_s16(v1016, 21195);
   1136     int16x8_t v1018 = vaddq_s16(v1012, v1017);
   1137     int16x8_t v1019 = vsubq_s16(v240, v250);
   1138     int16x8_t v1020 = vsubq_s16(v263, v272);
   1139     int16x8_t v1021_tmp = vqrdmulhq_n_s16(v1020, 18446);
   1140     int16x8_t v1021 = vmlaq_n_s16(v1021_tmp, v1020, 2);
   1141     int16x8_t v1022 = vaddq_s16(v1019, v1021);
   1142     int16x8_t v1023 = vsubq_s16(v281, v291);
   1143     int16x8_t v1024 = vsubq_s16(v298, v308);
   1144     int16x8_t v1025_tmp = vqrdmulhq_n_s16(v1024, 18446);
   1145     int16x8_t v1025 = vmlaq_n_s16(v1025_tmp, v1024, 2);
   1146     int16x8_t v1026 = vaddq_s16(v1023, v1025);
   1147     int16x8_t v1027 = vqrdmulhq_n_s16(v1026, 21195);
   1148     int16x8_t v1028 = vaddq_s16(v1022, v1027);
   1149     int16x8_t v1029 = vqrdmulhq_n_s16(v1028, 17401);
   1150     int16x8_t v1030 = vaddq_s16(v1018, v1029);
   1151     int16x8_t v1031 = vqrdmulhq_n_s16(v1030, 16629);
   1152     int16x8_t v1032 = vaddq_s16(v1008, v1031);
   1153     int16x8_t v1033 = vsubq_s16(v323, v333);
   1154     int16x8_t v1034 = vsubq_s16(v346, v355);
   1155     int16x8_t v1035_tmp = vqrdmulhq_n_s16(v1034, 18446);
   1156     int16x8_t v1035 = vmlaq_n_s16(v1035_tmp, v1034, 2);
   1157     int16x8_t v1036 = vaddq_s16(v1033, v1035);
   1158     int16x8_t v1037 = vsubq_s16(v370, v388);
   1159     int16x8_t v1038 = vsubq_s16(v395, v405);
   1160     int16x8_t v1039_tmp = vqrdmulhq_n_s16(v1038, 18446);
   1161     int16x8_t v1039 = vmlaq_n_s16(v1039_tmp, v1038, 2);
   1162     int16x8_t v1040 = vaddq_s16(v1037, v1039);
   1163     int16x8_t v1041 = vqrdmulhq_n_s16(v1040, 21195);
   1164     int16x8_t v1042 = vaddq_s16(v1036, v1041);
   1165     int16x8_t v1043 = vsubq_s16(v422, v440);
   1166     int16x8_t v1044 = vsubq_s16(v465, v478);
   1167     int16x8_t v1045_tmp = vqrdmulhq_n_s16(v1044, 18446);
   1168     int16x8_t v1045 = vmlaq_n_s16(v1045_tmp, v1044, 2);
   1169     int16x8_t v1046 = vaddq_s16(v1043, v1045);
   1170     int16x8_t v1047 = vsubq_s16(v487, v497);
   1171     int16x8_t v1048 = vsubq_s16(v504, v515);
   1172     int16x8_t v1049_tmp = vqrdmulhq_n_s16(v1048, 18446);
   1173     int16x8_t v1049 = vmlaq_n_s16(v1049_tmp, v1048, 2);
   1174     int16x8_t v1050 = vaddq_s16(v1047, v1049);
   1175     int16x8_t v1051 = vqrdmulhq_n_s16(v1050, 21195);
   1176     int16x8_t v1052 = vaddq_s16(v1046, v1051);
   1177     int16x8_t v1053 = vqrdmulhq_n_s16(v1052, 17401);
   1178     int16x8_t v1054 = vaddq_s16(v1042, v1053);
   1179     int16x8_t v1055 = vsubq_s16(v528, v538);
   1180     int16x8_t v1056 = vsubq_s16(v551, v560);
   1181     int16x8_t v1057_tmp = vqrdmulhq_n_s16(v1056, 18446);
   1182     int16x8_t v1057 = vmlaq_n_s16(v1057_tmp, v1056, 2);
   1183     int16x8_t v1058 = vaddq_s16(v1055, v1057);
   1184     int16x8_t v1059 = vsubq_s16(v575, v593);
   1185     int16x8_t v1060 = vsubq_s16(v600, v610);
   1186     int16x8_t v1061_tmp = vqrdmulhq_n_s16(v1060, 18446);
   1187     int16x8_t v1061 = vmlaq_n_s16(v1061_tmp, v1060, 2);
   1188     int16x8_t v1062 = vaddq_s16(v1059, v1061);
   1189     int16x8_t v1063 = vqrdmulhq_n_s16(v1062, 21195);
   1190     int16x8_t v1064 = vaddq_s16(v1058, v1063);
   1191     int16x8_t v1065 = vsubq_s16(v621, v631);
   1192     int16x8_t v1066 = vsubq_s16(v644, v653);
   1193     int16x8_t v1067_tmp = vqrdmulhq_n_s16(v1066, 18446);
   1194     int16x8_t v1067 = vmlaq_n_s16(v1067_tmp, v1066, 2);
   1195     int16x8_t v1068 = vaddq_s16(v1065, v1067);
   1196     int16x8_t v1069 = vsubq_s16(v662, v672);
   1197     int16x8_t v1070 = vsubq_s16(v679, v690);
   1198     int16x8_t v1071_tmp = vqrdmulhq_n_s16(v1070, 18446);
   1199     int16x8_t v1071 = vmlaq_n_s16(v1071_tmp, v1070, 2);
   1200     int16x8_t v1072 = vaddq_s16(v1069, v1071);
   1201     int16x8_t v1073 = vqrdmulhq_n_s16(v1072, 21195);
   1202     int16x8_t v1074 = vaddq_s16(v1068, v1073);
   1203     int16x8_t v1075 = vqrdmulhq_n_s16(v1074, 17401);
   1204     int16x8_t v1076 = vaddq_s16(v1064, v1075);
   1205     int16x8_t v1077 = vqrdmulhq_n_s16(v1076, 16629);
   1206     int16x8_t v1078 = vaddq_s16(v1054, v1077);
   1207     int16x8_t v1079 = vqrdmulhq_n_s16(v1078, 16445);
   1208     int16x8_t v1080 = vaddq_s16(v1032, v1079);
   1209     int16x8_t v1081 = vsubq_s16(v987, v989);
   1210     int16x8_t v1082 = vsubq_s16(v991, v993);
   1211     int16x8_t v1083 = vqrdmulhq_n_s16(v1082, 25826);
   1212     int16x8_t v1084 = vaddq_s16(v1081, v1083);
   1213     int16x8_t v1085 = vsubq_s16(v997, v999);
   1214     int16x8_t v1086 = vsubq_s16(v1001, v1003);
   1215     int16x8_t v1087 = vqrdmulhq_n_s16(v1086, 25826);
   1216     int16x8_t v1088 = vaddq_s16(v1085, v1087);
   1217     int16x8_t v1089 = vqrdmulhq_n_s16(v1088, 18124);
   1218     int16x8_t v1090 = vaddq_s16(v1084, v1089);
   1219     int16x8_t v1091 = vsubq_s16(v1009, v1011);
   1220     int16x8_t v1092 = vsubq_s16(v1013, v1015);
   1221     int16x8_t v1093 = vqrdmulhq_n_s16(v1092, 25826);
   1222     int16x8_t v1094 = vaddq_s16(v1091, v1093);
   1223     int16x8_t v1095 = vsubq_s16(v1019, v1021);
   1224     int16x8_t v1096 = vsubq_s16(v1023, v1025);
   1225     int16x8_t v1097 = vqrdmulhq_n_s16(v1096, 25826);
   1226     int16x8_t v1098 = vaddq_s16(v1095, v1097);
   1227     int16x8_t v1099 = vqrdmulhq_n_s16(v1098, 18124);
   1228     int16x8_t v1100 = vaddq_s16(v1094, v1099);
   1229     int16x8_t v1101 = vqrdmulhq_n_s16(v1100, 16792);
   1230     int16x8_t v1102 = vaddq_s16(v1090, v1101);
   1231     int16x8_t v1103 = vsubq_s16(v1033, v1035);
   1232     int16x8_t v1104 = vsubq_s16(v1037, v1039);
   1233     int16x8_t v1105 = vqrdmulhq_n_s16(v1104, 25826);
   1234     int16x8_t v1106 = vaddq_s16(v1103, v1105);
   1235     int16x8_t v1107 = vsubq_s16(v1043, v1045);
   1236     int16x8_t v1108 = vsubq_s16(v1047, v1049);
   1237     int16x8_t v1109 = vqrdmulhq_n_s16(v1108, 25826);
   1238     int16x8_t v1110 = vaddq_s16(v1107, v1109);
   1239     int16x8_t v1111 = vqrdmulhq_n_s16(v1110, 18124);
   1240     int16x8_t v1112 = vaddq_s16(v1106, v1111);
   1241     int16x8_t v1113 = vsubq_s16(v1055, v1057);
   1242     int16x8_t v1114 = vsubq_s16(v1059, v1061);
   1243     int16x8_t v1115 = vqrdmulhq_n_s16(v1114, 25826);
   1244     int16x8_t v1116 = vaddq_s16(v1113, v1115);
   1245     int16x8_t v1117 = vsubq_s16(v1065, v1067);
   1246     int16x8_t v1118 = vsubq_s16(v1069, v1071);
   1247     int16x8_t v1119 = vqrdmulhq_n_s16(v1118, 25826);
   1248     int16x8_t v1120 = vaddq_s16(v1117, v1119);
   1249     int16x8_t v1121 = vqrdmulhq_n_s16(v1120, 18124);
   1250     int16x8_t v1122 = vaddq_s16(v1116, v1121);
   1251     int16x8_t v1123 = vqrdmulhq_n_s16(v1122, 16792);
   1252     int16x8_t v1124 = vaddq_s16(v1112, v1123);
   1253     int16x8_t v1125 = vqrdmulhq_n_s16(v1124, 16484);
   1254     int16x8_t v1126 = vaddq_s16(v1102, v1125);
   1255     int16x8_t v1127 = vsubq_s16(v892, v894);
   1256     int16x8_t v1128 = vsubq_s16(v896, v898);
   1257     int16x8_t v1129_tmp = vqrdmulhq_n_s16(v1128, 1988);
   1258     int16x8_t v1129 = vaddq_s16(v1129_tmp, v1128);
   1259     int16x8_t v1130 = vaddq_s16(v1127, v1129);
   1260     int16x8_t v1131 = vsubq_s16(v902, v904);
   1261     int16x8_t v1132 = vsubq_s16(v906, v908);
   1262     int16x8_t v1133_tmp = vqrdmulhq_n_s16(v1132, 1988);
   1263     int16x8_t v1133 = vaddq_s16(v1133_tmp, v1132);
   1264     int16x8_t v1134 = vaddq_s16(v1131, v1133);
   1265     int16x8_t v1135 = vqrdmulhq_n_s16(v1134, 19102);
   1266     int16x8_t v1136 = vaddq_s16(v1130, v1135);
   1267     int16x8_t v1137 = vsubq_s16(v914, v917);
   1268     int16x8_t v1138 = vsubq_s16(v919, v921);
   1269     int16x8_t v1139_tmp = vqrdmulhq_n_s16(v1138, 1988);
   1270     int16x8_t v1139 = vaddq_s16(v1139_tmp, v1138);
   1271     int16x8_t v1140 = vaddq_s16(v1137, v1139);
   1272     int16x8_t v1141 = vsubq_s16(v925, v927);
   1273     int16x8_t v1142 = vsubq_s16(v929, v931);
   1274     int16x8_t v1143_tmp = vqrdmulhq_n_s16(v1142, 1988);
   1275     int16x8_t v1143 = vaddq_s16(v1143_tmp, v1142);
   1276     int16x8_t v1144 = vaddq_s16(v1141, v1143);
   1277     int16x8_t v1145 = vqrdmulhq_n_s16(v1144, 19102);
   1278     int16x8_t v1146 = vaddq_s16(v1140, v1145);
   1279     int16x8_t v1147 = vqrdmulhq_n_s16(v1146, 17000);
   1280     int16x8_t v1148 = vaddq_s16(v1136, v1147);
   1281     int16x8_t v1149 = vsubq_s16(v939, v941);
   1282     int16x8_t v1150 = vsubq_s16(v943, v945);
   1283     int16x8_t v1151_tmp = vqrdmulhq_n_s16(v1150, 1988);
   1284     int16x8_t v1151 = vaddq_s16(v1151_tmp, v1150);
   1285     int16x8_t v1152 = vaddq_s16(v1149, v1151);
   1286     int16x8_t v1153 = vsubq_s16(v949, v951);
   1287     int16x8_t v1154 = vsubq_s16(v953, v955);
   1288     int16x8_t v1155_tmp = vqrdmulhq_n_s16(v1154, 1988);
   1289     int16x8_t v1155 = vaddq_s16(v1155_tmp, v1154);
   1290     int16x8_t v1156 = vaddq_s16(v1153, v1155);
   1291     int16x8_t v1157 = vqrdmulhq_n_s16(v1156, 19102);
   1292     int16x8_t v1158 = vaddq_s16(v1152, v1157);
   1293     int16x8_t v1159 = vsubq_s16(v961, v963);
   1294     int16x8_t v1160 = vsubq_s16(v965, v967);
   1295     int16x8_t v1161_tmp = vqrdmulhq_n_s16(v1160, 1988);
   1296     int16x8_t v1161 = vaddq_s16(v1161_tmp, v1160);
   1297     int16x8_t v1162 = vaddq_s16(v1159, v1161);
   1298     int16x8_t v1163 = vsubq_s16(v971, v973);
   1299     int16x8_t v1164 = vsubq_s16(v975, v977);
   1300     int16x8_t v1165_tmp = vqrdmulhq_n_s16(v1164, 1988);
   1301     int16x8_t v1165 = vaddq_s16(v1165_tmp, v1164);
   1302     int16x8_t v1166 = vaddq_s16(v1163, v1165);
   1303     int16x8_t v1167 = vqrdmulhq_n_s16(v1166, 19102);
   1304     int16x8_t v1168 = vaddq_s16(v1162, v1167);
   1305     int16x8_t v1169 = vqrdmulhq_n_s16(v1168, 17000);
   1306     int16x8_t v1170 = vaddq_s16(v1158, v1169);
   1307     int16x8_t v1171 = vqrdmulhq_n_s16(v1170, 16534);
   1308     int16x8_t v1172 = vaddq_s16(v1148, v1171);
   1309     int16x8_t v1173 = vsubq_s16(v705, v710);
   1310     int16x8_t v1174 = vsubq_s16(v715, v720);
   1311     int16x8_t v1175_tmp = vqrdmulhq_n_s16(v1174, 23673);
   1312     int16x8_t v1175 = vaddq_s16(v1175_tmp, v1174);
   1313     int16x8_t v1176 = vaddq_s16(v1173, v1175);
   1314     int16x8_t v1177 = vsubq_s16(v727, v732);
   1315     int16x8_t v1178 = vsubq_s16(v737, v742);
   1316     int16x8_t v1179_tmp = vqrdmulhq_n_s16(v1178, 23673);
   1317     int16x8_t v1179 = vaddq_s16(v1179_tmp, v1178);
   1318     int16x8_t v1180 = vaddq_s16(v1177, v1179);
   1319     int16x8_t v1181 = vqrdmulhq_n_s16(v1180, 20398);
   1320     int16x8_t v1182 = vaddq_s16(v1176, v1181);
   1321     int16x8_t v1183 = vsubq_s16(v751, v756);
   1322     int16x8_t v1184 = vsubq_s16(v761, v766);
   1323     int16x8_t v1185_tmp = vqrdmulhq_n_s16(v1184, 23673);
   1324     int16x8_t v1185 = vaddq_s16(v1185_tmp, v1184);
   1325     int16x8_t v1186 = vaddq_s16(v1183, v1185);
   1326     int16x8_t v1187 = vsubq_s16(v773, v778);
   1327     int16x8_t v1188 = vsubq_s16(v783, v788);
   1328     int16x8_t v1189_tmp = vqrdmulhq_n_s16(v1188, 23673);
   1329     int16x8_t v1189 = vaddq_s16(v1189_tmp, v1188);
   1330     int16x8_t v1190 = vaddq_s16(v1187, v1189);
   1331     int16x8_t v1191 = vqrdmulhq_n_s16(v1190, 20398);
   1332     int16x8_t v1192 = vaddq_s16(v1186, v1191);
   1333     int16x8_t v1193 = vqrdmulhq_n_s16(v1192, 17255);
   1334     int16x8_t v1194 = vaddq_s16(v1182, v1193);
   1335     int16x8_t v1195 = vsubq_s16(v799, v804);
   1336     int16x8_t v1196 = vsubq_s16(v809, v814);
   1337     int16x8_t v1197_tmp = vqrdmulhq_n_s16(v1196, 23673);
   1338     int16x8_t v1197 = vaddq_s16(v1197_tmp, v1196);
   1339     int16x8_t v1198 = vaddq_s16(v1195, v1197);
   1340     int16x8_t v1199 = vsubq_s16(v821, v826);
   1341     int16x8_t v1200 = vsubq_s16(v831, v836);
   1342     int16x8_t v1201_tmp = vqrdmulhq_n_s16(v1200, 23673);
   1343     int16x8_t v1201 = vaddq_s16(v1201_tmp, v1200);
   1344     int16x8_t v1202 = vaddq_s16(v1199, v1201);
   1345     int16x8_t v1203 = vqrdmulhq_n_s16(v1202, 20398);
   1346     int16x8_t v1204 = vaddq_s16(v1198, v1203);
   1347     int16x8_t v1205 = vsubq_s16(v845, v850);
   1348     int16x8_t v1206 = vsubq_s16(v855, v860);
   1349     int16x8_t v1207_tmp = vqrdmulhq_n_s16(v1206, 23673);
   1350     int16x8_t v1207 = vaddq_s16(v1207_tmp, v1206);
   1351     int16x8_t v1208 = vaddq_s16(v1205, v1207);
   1352     int16x8_t v1209 = vsubq_s16(v867, v872);
   1353     int16x8_t v1210 = vsubq_s16(v877, v882);
   1354     int16x8_t v1211_tmp = vqrdmulhq_n_s16(v1210, 23673);
   1355     int16x8_t v1211 = vaddq_s16(v1211_tmp, v1210);
   1356     int16x8_t v1212 = vaddq_s16(v1209, v1211);
   1357     int16x8_t v1213 = vqrdmulhq_n_s16(v1212, 20398);
   1358     int16x8_t v1214 = vaddq_s16(v1208, v1213);
   1359     int16x8_t v1215 = vqrdmulhq_n_s16(v1214, 17255);
   1360     int16x8_t v1216 = vaddq_s16(v1204, v1215);
   1361     int16x8_t v1217 = vqrdmulhq_n_s16(v1216, 16595);
   1362     int16x8_t v1218 = vaddq_s16(v1194, v1217);
   1363     int16x8_t v1219 = vsubq_s16(v9, v24);
   1364     int16x8_t v1220 = vsubq_s16(v42, v58);
   1365     int16x8_t v1221_tmp = vqrdmulhq_n_s16(v1220, 3314);
   1366     int16x8_t v1221 = vmlaq_n_s16(v1221_tmp, v1220, 5);
   1367     int16x8_t v1222 = vaddq_s16(v1219, v1221);
   1368     int16x8_t v1223 = vsubq_s16(v78, v101);
   1369     int16x8_t v1224 = vsubq_s16(v119, v136);
   1370     int16x8_t v1225_tmp = vqrdmulhq_n_s16(v1224, 3314);
   1371     int16x8_t v1225 = vmlaq_n_s16(v1225_tmp, v1224, 5);
   1372     int16x8_t v1226 = vaddq_s16(v1223, v1225);
   1373     int16x8_t v1227 = vqrdmulhq_n_s16(v1226, 22112);
   1374     int16x8_t v1228 = vaddq_s16(v1222, v1227);
   1375     int16x8_t v1229 = vsubq_s16(v158, v181);
   1376     int16x8_t v1230 = vsubq_s16(v213, v231);
   1377     int16x8_t v1231_tmp = vqrdmulhq_n_s16(v1230, 3314);
   1378     int16x8_t v1231 = vmlaq_n_s16(v1231_tmp, v1230, 5);
   1379     int16x8_t v1232 = vaddq_s16(v1229, v1231);
   1380     int16x8_t v1233 = vsubq_s16(v251, v274);
   1381     int16x8_t v1234 = vsubq_s16(v292, v310);
   1382     int16x8_t v1235_tmp = vqrdmulhq_n_s16(v1234, 3314);
   1383     int16x8_t v1235 = vmlaq_n_s16(v1235_tmp, v1234, 5);
   1384     int16x8_t v1236 = vaddq_s16(v1233, v1235);
   1385     int16x8_t v1237 = vqrdmulhq_n_s16(v1236, 22112);
   1386     int16x8_t v1238 = vaddq_s16(v1232, v1237);
   1387     int16x8_t v1239 = vqrdmulhq_n_s16(v1238, 17561);
   1388     int16x8_t v1240 = vaddq_s16(v1228, v1239);
   1389     int16x8_t v1241 = vsubq_s16(v334, v357);
   1390     int16x8_t v1242 = vsubq_s16(v389, v407);
   1391     int16x8_t v1243_tmp = vqrdmulhq_n_s16(v1242, 3314);
   1392     int16x8_t v1243 = vmlaq_n_s16(v1243_tmp, v1242, 5);
   1393     int16x8_t v1244 = vaddq_s16(v1241, v1243);
   1394     int16x8_t v1245 = vsubq_s16(v441, v480);
   1395     int16x8_t v1246 = vsubq_s16(v498, v517);
   1396     int16x8_t v1247_tmp = vqrdmulhq_n_s16(v1246, 3314);
   1397     int16x8_t v1247 = vmlaq_n_s16(v1247_tmp, v1246, 5);
   1398     int16x8_t v1248 = vaddq_s16(v1245, v1247);
   1399     int16x8_t v1249 = vqrdmulhq_n_s16(v1248, 22112);
   1400     int16x8_t v1250 = vaddq_s16(v1244, v1249);
   1401     int16x8_t v1251 = vsubq_s16(v539, v562);
   1402     int16x8_t v1252 = vsubq_s16(v594, v612);
   1403     int16x8_t v1253_tmp = vqrdmulhq_n_s16(v1252, 3314);
   1404     int16x8_t v1253 = vmlaq_n_s16(v1253_tmp, v1252, 5);
   1405     int16x8_t v1254 = vaddq_s16(v1251, v1253);
   1406     int16x8_t v1255 = vsubq_s16(v632, v655);
   1407     int16x8_t v1256 = vsubq_s16(v673, v692);
   1408     int16x8_t v1257_tmp = vqrdmulhq_n_s16(v1256, 3314);
   1409     int16x8_t v1257 = vmlaq_n_s16(v1257_tmp, v1256, 5);
   1410     int16x8_t v1258 = vaddq_s16(v1255, v1257);
   1411     int16x8_t v1259 = vqrdmulhq_n_s16(v1258, 22112);
   1412     int16x8_t v1260 = vaddq_s16(v1254, v1259);
   1413     int16x8_t v1261 = vqrdmulhq_n_s16(v1260, 17561);
   1414     int16x8_t v1262 = vaddq_s16(v1250, v1261);
   1415     int16x8_t v1263 = vqrdmulhq_n_s16(v1262, 16666);
   1416     int16x8_t v1264 = vaddq_s16(v1240, v1263);
   1417     int16x8_t v1265 = vsubq_s16(v1219, v1221);
   1418     int16x8_t v1266 = vsubq_s16(v1223, v1225);
   1419     int16x8_t v1267 = vqrdmulhq_n_s16(v1266, 24397);
   1420     int16x8_t v1268 = vaddq_s16(v1265, v1267);
   1421     int16x8_t v1269 = vsubq_s16(v1229, v1231);
   1422     int16x8_t v1270 = vsubq_s16(v1233, v1235);
   1423     int16x8_t v1271 = vqrdmulhq_n_s16(v1270, 24397);
   1424     int16x8_t v1272 = vaddq_s16(v1269, v1271);
   1425     int16x8_t v1273 = vqrdmulhq_n_s16(v1272, 17921);
   1426     int16x8_t v1274 = vaddq_s16(v1268, v1273);
   1427     int16x8_t v1275 = vsubq_s16(v1241, v1243);
   1428     int16x8_t v1276 = vsubq_s16(v1245, v1247);
   1429     int16x8_t v1277 = vqrdmulhq_n_s16(v1276, 24397);
   1430     int16x8_t v1278 = vaddq_s16(v1275, v1277);
   1431     int16x8_t v1279 = vsubq_s16(v1251, v1253);
   1432     int16x8_t v1280 = vsubq_s16(v1255, v1257);
   1433     int16x8_t v1281 = vqrdmulhq_n_s16(v1280, 24397);
   1434     int16x8_t v1282 = vaddq_s16(v1279, v1281);
   1435     int16x8_t v1283 = vqrdmulhq_n_s16(v1282, 17921);
   1436     int16x8_t v1284 = vaddq_s16(v1278, v1283);
   1437     int16x8_t v1285 = vqrdmulhq_n_s16(v1284, 16747);
   1438     int16x8_t v1286 = vaddq_s16(v1274, v1285);
   1439     int16x8_t v1287 = vsubq_s16(v1173, v1175);
   1440     int16x8_t v1288 = vsubq_s16(v1177, v1179);
   1441     int16x8_t v1289 = vqrdmulhq_n_s16(v1288, 27504);
   1442     int16x8_t v1290 = vaddq_s16(v1287, v1289);
   1443     int16x8_t v1291 = vsubq_s16(v1183, v1185);
   1444     int16x8_t v1292 = vsubq_s16(v1187, v1189);
   1445     int16x8_t v1293 = vqrdmulhq_n_s16(v1292, 27504);
   1446     int16x8_t v1294 = vaddq_s16(v1291, v1293);
   1447     int16x8_t v1295 = vqrdmulhq_n_s16(v1294, 18343);
   1448     int16x8_t v1296 = vaddq_s16(v1290, v1295);
   1449     int16x8_t v1297 = vsubq_s16(v1195, v1197);
   1450     int16x8_t v1298 = vsubq_s16(v1199, v1201);
   1451     int16x8_t v1299 = vqrdmulhq_n_s16(v1298, 27504);
   1452     int16x8_t v1300 = vaddq_s16(v1297, v1299);
   1453     int16x8_t v1301 = vsubq_s16(v1205, v1207);
   1454     int16x8_t v1302 = vsubq_s16(v1209, v1211);
   1455     int16x8_t v1303 = vqrdmulhq_n_s16(v1302, 27504);
   1456     int16x8_t v1304 = vaddq_s16(v1301, v1303);
   1457     int16x8_t v1305 = vqrdmulhq_n_s16(v1304, 18343);
   1458     int16x8_t v1306 = vaddq_s16(v1300, v1305);
   1459     int16x8_t v1307 = vqrdmulhq_n_s16(v1306, 16840);
   1460     int16x8_t v1308 = vaddq_s16(v1296, v1307);
   1461     int16x8_t v1309 = vsubq_s16(v1127, v1129);
   1462     int16x8_t v1310 = vsubq_s16(v1131, v1133);
   1463     int16x8_t v1311 = vqrdmulhq_n_s16(v1310, 31869);
   1464     int16x8_t v1312 = vaddq_s16(v1309, v1311);
   1465     int16x8_t v1313 = vsubq_s16(v1137, v1139);
   1466     int16x8_t v1314 = vsubq_s16(v1141, v1143);
   1467     int16x8_t v1315 = vqrdmulhq_n_s16(v1314, 31869);
   1468     int16x8_t v1316 = vaddq_s16(v1313, v1315);
   1469     int16x8_t v1317 = vqrdmulhq_n_s16(v1316, 18830);
   1470     int16x8_t v1318 = vaddq_s16(v1312, v1317);
   1471     int16x8_t v1319 = vsubq_s16(v1149, v1151);
   1472     int16x8_t v1320 = vsubq_s16(v1153, v1155);
   1473     int16x8_t v1321 = vqrdmulhq_n_s16(v1320, 31869);
   1474     int16x8_t v1322 = vaddq_s16(v1319, v1321);
   1475     int16x8_t v1323 = vsubq_s16(v1159, v1161);
   1476     int16x8_t v1324 = vsubq_s16(v1163, v1165);
   1477     int16x8_t v1325 = vqrdmulhq_n_s16(v1324, 31869);
   1478     int16x8_t v1326 = vaddq_s16(v1323, v1325);
   1479     int16x8_t v1327 = vqrdmulhq_n_s16(v1326, 18830);
   1480     int16x8_t v1328 = vaddq_s16(v1322, v1327);
   1481     int16x8_t v1329 = vqrdmulhq_n_s16(v1328, 16944);
   1482     int16x8_t v1330 = vaddq_s16(v1318, v1329);
   1483     int16x8_t v1331 = vsubq_s16(v1081, v1083);
   1484     int16x8_t v1332 = vsubq_s16(v1085, v1087);
   1485     int16x8_t v1333_tmp = vqrdmulhq_n_s16(v1332, 5552);
   1486     int16x8_t v1333 = vaddq_s16(v1333_tmp, v1332);
   1487     int16x8_t v1334 = vaddq_s16(v1331, v1333);
   1488     int16x8_t v1335 = vsubq_s16(v1091, v1093);
   1489     int16x8_t v1336 = vsubq_s16(v1095, v1097);
   1490     int16x8_t v1337_tmp = vqrdmulhq_n_s16(v1336, 5552);
   1491     int16x8_t v1337 = vaddq_s16(v1337_tmp, v1336);
   1492     int16x8_t v1338 = vaddq_s16(v1335, v1337);
   1493     int16x8_t v1339 = vqrdmulhq_n_s16(v1338, 19393);
   1494     int16x8_t v1340 = vaddq_s16(v1334, v1339);
   1495     int16x8_t v1341 = vsubq_s16(v1103, v1105);
   1496     int16x8_t v1342 = vsubq_s16(v1107, v1109);
   1497     int16x8_t v1343_tmp = vqrdmulhq_n_s16(v1342, 5552);
   1498     int16x8_t v1343 = vaddq_s16(v1343_tmp, v1342);
   1499     int16x8_t v1344 = vaddq_s16(v1341, v1343);
   1500     int16x8_t v1345 = vsubq_s16(v1113, v1115);
   1501     int16x8_t v1346 = vsubq_s16(v1117, v1119);
   1502     int16x8_t v1347_tmp = vqrdmulhq_n_s16(v1346, 5552);
   1503     int16x8_t v1347 = vaddq_s16(v1347_tmp, v1346);
   1504     int16x8_t v1348 = vaddq_s16(v1345, v1347);
   1505     int16x8_t v1349 = vqrdmulhq_n_s16(v1348, 19393);
   1506     int16x8_t v1350 = vaddq_s16(v1344, v1349);
   1507     int16x8_t v1351 = vqrdmulhq_n_s16(v1350, 17059);
   1508     int16x8_t v1352 = vaddq_s16(v1340, v1351);
   1509     int16x8_t v1353 = vsubq_s16(v990, v995);
   1510     int16x8_t v1354 = vsubq_s16(v1000, v1005);
   1511     int16x8_t v1355_tmp = vqrdmulhq_n_s16(v1354, 15865);
   1512     int16x8_t v1355 = vaddq_s16(v1355_tmp, v1354);
   1513     int16x8_t v1356 = vaddq_s16(v1353, v1355);
   1514     int16x8_t v1357 = vsubq_s16(v1012, v1017);
   1515     int16x8_t v1358 = vsubq_s16(v1022, v1027);
   1516     int16x8_t v1359_tmp = vqrdmulhq_n_s16(v1358, 15865);
   1517     int16x8_t v1359 = vaddq_s16(v1359_tmp, v1358);
   1518     int16x8_t v1360 = vaddq_s16(v1357, v1359);
   1519     int16x8_t v1361 = vqrdmulhq_n_s16(v1360, 20040);
   1520     int16x8_t v1362 = vaddq_s16(v1356, v1361);
   1521     int16x8_t v1363 = vsubq_s16(v1036, v1041);
   1522     int16x8_t v1364 = vsubq_s16(v1046, v1051);
   1523     int16x8_t v1365_tmp = vqrdmulhq_n_s16(v1364, 15865);
   1524     int16x8_t v1365 = vaddq_s16(v1365_tmp, v1364);
   1525     int16x8_t v1366 = vaddq_s16(v1363, v1365);
   1526     int16x8_t v1367 = vsubq_s16(v1058, v1063);
   1527     int16x8_t v1368 = vsubq_s16(v1068, v1073);
   1528     int16x8_t v1369_tmp = vqrdmulhq_n_s16(v1368, 15865);
   1529     int16x8_t v1369 = vaddq_s16(v1369_tmp, v1368);
   1530     int16x8_t v1370 = vaddq_s16(v1367, v1369);
   1531     int16x8_t v1371 = vqrdmulhq_n_s16(v1370, 20040);
   1532     int16x8_t v1372 = vaddq_s16(v1366, v1371);
   1533     int16x8_t v1373 = vqrdmulhq_n_s16(v1372, 17187);
   1534     int16x8_t v1374 = vaddq_s16(v1362, v1373);
   1535     int16x8_t v1375 = vsubq_s16(v895, v900);
   1536     int16x8_t v1376 = vsubq_s16(v905, v910);
   1537     int16x8_t v1377_tmp = vqrdmulhq_n_s16(v1376, 1893);
   1538     int16x8_t v1377 = vmlaq_n_s16(v1377_tmp, v1376, 2);
   1539     int16x8_t v1378 = vaddq_s16(v1375, v1377);
   1540     int16x8_t v1379 = vsubq_s16(v918, v923);
   1541     int16x8_t v1380 = vsubq_s16(v928, v933);
   1542     int16x8_t v1381_tmp = vqrdmulhq_n_s16(v1380, 1893);
   1543     int16x8_t v1381 = vmlaq_n_s16(v1381_tmp, v1380, 2);
   1544     int16x8_t v1382 = vaddq_s16(v1379, v1381);
   1545     int16x8_t v1383 = vqrdmulhq_n_s16(v1382, 20783);
   1546     int16x8_t v1384 = vaddq_s16(v1378, v1383);
   1547     int16x8_t v1385 = vsubq_s16(v942, v947);
   1548     int16x8_t v1386 = vsubq_s16(v952, v957);
   1549     int16x8_t v1387_tmp = vqrdmulhq_n_s16(v1386, 1893);
   1550     int16x8_t v1387 = vmlaq_n_s16(v1387_tmp, v1386, 2);
   1551     int16x8_t v1388 = vaddq_s16(v1385, v1387);
   1552     int16x8_t v1389 = vsubq_s16(v964, v969);
   1553     int16x8_t v1390 = vsubq_s16(v974, v979);
   1554     int16x8_t v1391_tmp = vqrdmulhq_n_s16(v1390, 1893);
   1555     int16x8_t v1391 = vmlaq_n_s16(v1391_tmp, v1390, 2);
   1556     int16x8_t v1392 = vaddq_s16(v1389, v1391);
   1557     int16x8_t v1393 = vqrdmulhq_n_s16(v1392, 20783);
   1558     int16x8_t v1394 = vaddq_s16(v1388, v1393);
   1559     int16x8_t v1395 = vqrdmulhq_n_s16(v1394, 17326);
   1560     int16x8_t v1396 = vaddq_s16(v1384, v1395);
   1561     int16x8_t v1397 = vsubq_s16(v711, v722);
   1562     int16x8_t v1398 = vsubq_s16(v733, v744);
   1563     int16x8_t v1399_tmp = vqrdmulhq_n_s16(v1398, 13357);
   1564     int16x8_t v1399 = vmlaq_n_s16(v1399_tmp, v1398, 3);
   1565     int16x8_t v1400 = vaddq_s16(v1397, v1399);
   1566     int16x8_t v1401 = vsubq_s16(v757, v768);
   1567     int16x8_t v1402 = vsubq_s16(v779, v790);
   1568     int16x8_t v1403_tmp = vqrdmulhq_n_s16(v1402, 13357);
   1569     int16x8_t v1403 = vmlaq_n_s16(v1403_tmp, v1402, 3);
   1570     int16x8_t v1404 = vaddq_s16(v1401, v1403);
   1571     int16x8_t v1405 = vqrdmulhq_n_s16(v1404, 21637);
   1572     int16x8_t v1406 = vaddq_s16(v1400, v1405);
   1573     int16x8_t v1407 = vsubq_s16(v805, v816);
   1574     int16x8_t v1408 = vsubq_s16(v827, v838);
   1575     int16x8_t v1409_tmp = vqrdmulhq_n_s16(v1408, 13357);
   1576     int16x8_t v1409 = vmlaq_n_s16(v1409_tmp, v1408, 3);
   1577     int16x8_t v1410 = vaddq_s16(v1407, v1409);
   1578     int16x8_t v1411 = vsubq_s16(v851, v862);
   1579     int16x8_t v1412 = vsubq_s16(v873, v884);
   1580     int16x8_t v1413_tmp = vqrdmulhq_n_s16(v1412, 13357);
   1581     int16x8_t v1413 = vmlaq_n_s16(v1413_tmp, v1412, 3);
   1582     int16x8_t v1414 = vaddq_s16(v1411, v1413);
   1583     int16x8_t v1415 = vqrdmulhq_n_s16(v1414, 21637);
   1584     int16x8_t v1416 = vaddq_s16(v1410, v1415);
   1585     int16x8_t v1417 = vqrdmulhq_n_s16(v1416, 17479);
   1586     int16x8_t v1418 = vaddq_s16(v1406, v1417);
   1587     int16x8_t v1419 = vsubq_s16(v25, v60);
   1588     int16x8_t v1420 = vsubq_s16(v102, v138);
   1589     int16x8_t v1421_tmp = vqrdmulhq_n_s16(v1420, 6226);
   1590     int16x8_t v1421 = vmlaq_n_s16(v1421_tmp, v1420, 10);
   1591     int16x8_t v1422 = vaddq_s16(v1419, v1421);
   1592     int16x8_t v1423 = vsubq_s16(v182, v233);
   1593     int16x8_t v1424 = vsubq_s16(v275, v312);
   1594     int16x8_t v1425_tmp = vqrdmulhq_n_s16(v1424, 6226);
   1595     int16x8_t v1425 = vmlaq_n_s16(v1425_tmp, v1424, 10);
   1596     int16x8_t v1426 = vaddq_s16(v1423, v1425);
   1597     int16x8_t v1427 = vqrdmulhq_n_s16(v1426, 22622);
   1598     int16x8_t v1428 = vaddq_s16(v1422, v1427);
   1599     int16x8_t v1429 = vsubq_s16(v358, v409);
   1600     int16x8_t v1430 = vsubq_s16(v481, v519);
   1601     int16x8_t v1431_tmp = vqrdmulhq_n_s16(v1430, 6226);
   1602     int16x8_t v1431 = vmlaq_n_s16(v1431_tmp, v1430, 10);
   1603     int16x8_t v1432 = vaddq_s16(v1429, v1431);
   1604     int16x8_t v1433 = vsubq_s16(v563, v614);
   1605     int16x8_t v1434 = vsubq_s16(v656, v694);
   1606     int16x8_t v1435_tmp = vqrdmulhq_n_s16(v1434, 6226);
   1607     int16x8_t v1435 = vmlaq_n_s16(v1435_tmp, v1434, 10);
   1608     int16x8_t v1436 = vaddq_s16(v1433, v1435);
   1609     int16x8_t v1437 = vqrdmulhq_n_s16(v1436, 22622);
   1610     int16x8_t v1438 = vaddq_s16(v1432, v1437);
   1611     int16x8_t v1439 = vqrdmulhq_n_s16(v1438, 17646);
   1612     int16x8_t v1440 = vaddq_s16(v1428, v1439);
   1613     int16x8_t v1441 = vsubq_s16(v1419, v1421);
   1614     int16x8_t v1442 = vsubq_s16(v1423, v1425);
   1615     int16x8_t v1443 = vqrdmulhq_n_s16(v1442, 23761);
   1616     int16x8_t v1444 = vaddq_s16(v1441, v1443);
   1617     int16x8_t v1445 = vsubq_s16(v1429, v1431);
   1618     int16x8_t v1446 = vsubq_s16(v1433, v1435);
   1619     int16x8_t v1447 = vqrdmulhq_n_s16(v1446, 23761);
   1620     int16x8_t v1448 = vaddq_s16(v1445, v1447);
   1621     int16x8_t v1449 = vqrdmulhq_n_s16(v1448, 17826);
   1622     int16x8_t v1450 = vaddq_s16(v1444, v1449);
   1623     int16x8_t v1451 = vsubq_s16(v1397, v1399);
   1624     int16x8_t v1452 = vsubq_s16(v1401, v1403);
   1625     int16x8_t v1453 = vqrdmulhq_n_s16(v1452, 25084);
   1626     int16x8_t v1454 = vaddq_s16(v1451, v1453);
   1627     int16x8_t v1455 = vsubq_s16(v1407, v1409);
   1628     int16x8_t v1456 = vsubq_s16(v1411, v1413);
   1629     int16x8_t v1457 = vqrdmulhq_n_s16(v1456, 25084);
   1630     int16x8_t v1458 = vaddq_s16(v1455, v1457);
   1631     int16x8_t v1459 = vqrdmulhq_n_s16(v1458, 18021);
   1632     int16x8_t v1460 = vaddq_s16(v1454, v1459);
   1633     int16x8_t v1461 = vsubq_s16(v1375, v1377);
   1634     int16x8_t v1462 = vsubq_s16(v1379, v1381);
   1635     int16x8_t v1463 = vqrdmulhq_n_s16(v1462, 26631);
   1636     int16x8_t v1464 = vaddq_s16(v1461, v1463);
   1637     int16x8_t v1465 = vsubq_s16(v1385, v1387);
   1638     int16x8_t v1466 = vsubq_s16(v1389, v1391);
   1639     int16x8_t v1467 = vqrdmulhq_n_s16(v1466, 26631);
   1640     int16x8_t v1468 = vaddq_s16(v1465, v1467);
   1641     int16x8_t v1469 = vqrdmulhq_n_s16(v1468, 18231);
   1642     int16x8_t v1470 = vaddq_s16(v1464, v1469);
   1643     int16x8_t v1471 = vsubq_s16(v1353, v1355);
   1644     int16x8_t v1472 = vsubq_s16(v1357, v1359);
   1645     int16x8_t v1473 = vqrdmulhq_n_s16(v1472, 28454);
   1646     int16x8_t v1474 = vaddq_s16(v1471, v1473);
   1647     int16x8_t v1475 = vsubq_s16(v1363, v1365);
   1648     int16x8_t v1476 = vsubq_s16(v1367, v1369);
   1649     int16x8_t v1477 = vqrdmulhq_n_s16(v1476, 28454);
   1650     int16x8_t v1478 = vaddq_s16(v1475, v1477);
   1651     int16x8_t v1479 = vqrdmulhq_n_s16(v1478, 18458);
   1652     int16x8_t v1480 = vaddq_s16(v1474, v1479);
   1653     int16x8_t v1481 = vsubq_s16(v1331, v1333);
   1654     int16x8_t v1482 = vsubq_s16(v1335, v1337);
   1655     int16x8_t v1483 = vqrdmulhq_n_s16(v1482, 30624);
   1656     int16x8_t v1484 = vaddq_s16(v1481, v1483);
   1657     int16x8_t v1485 = vsubq_s16(v1341, v1343);
   1658     int16x8_t v1486 = vsubq_s16(v1345, v1347);
   1659     int16x8_t v1487 = vqrdmulhq_n_s16(v1486, 30624);
   1660     int16x8_t v1488 = vaddq_s16(v1485, v1487);
   1661     int16x8_t v1489 = vqrdmulhq_n_s16(v1488, 18702);
   1662     int16x8_t v1490 = vaddq_s16(v1484, v1489);
   1663     int16x8_t v1491 = vsubq_s16(v1309, v1311);
   1664     int16x8_t v1492 = vsubq_s16(v1313, v1315);
   1665     int16x8_t v1493_tmp = vqrdmulhq_n_s16(v1492, 472);
   1666     int16x8_t v1493 = vaddq_s16(v1493_tmp, v1492);
   1667     int16x8_t v1494 = vaddq_s16(v1491, v1493);
   1668     int16x8_t v1495 = vsubq_s16(v1319, v1321);
   1669     int16x8_t v1496 = vsubq_s16(v1323, v1325);
   1670     int16x8_t v1497_tmp = vqrdmulhq_n_s16(v1496, 472);
   1671     int16x8_t v1497 = vaddq_s16(v1497_tmp, v1496);
   1672     int16x8_t v1498 = vaddq_s16(v1495, v1497);
   1673     int16x8_t v1499 = vqrdmulhq_n_s16(v1498, 18964);
   1674     int16x8_t v1500 = vaddq_s16(v1494, v1499);
   1675     int16x8_t v1501 = vsubq_s16(v1287, v1289);
   1676     int16x8_t v1502 = vsubq_s16(v1291, v1293);
   1677     int16x8_t v1503_tmp = vqrdmulhq_n_s16(v1502, 3672);
   1678     int16x8_t v1503 = vaddq_s16(v1503_tmp, v1502);
   1679     int16x8_t v1504 = vaddq_s16(v1501, v1503);
   1680     int16x8_t v1505 = vsubq_s16(v1297, v1299);
   1681     int16x8_t v1506 = vsubq_s16(v1301, v1303);
   1682     int16x8_t v1507_tmp = vqrdmulhq_n_s16(v1506, 3672);
   1683     int16x8_t v1507 = vaddq_s16(v1507_tmp, v1506);
   1684     int16x8_t v1508 = vaddq_s16(v1505, v1507);
   1685     int16x8_t v1509 = vqrdmulhq_n_s16(v1508, 19245);
   1686     int16x8_t v1510 = vaddq_s16(v1504, v1509);
   1687     int16x8_t v1511 = vsubq_s16(v1265, v1267);
   1688     int16x8_t v1512 = vsubq_s16(v1269, v1271);
   1689     int16x8_t v1513_tmp = vqrdmulhq_n_s16(v1512, 7662);
   1690     int16x8_t v1513 = vaddq_s16(v1513_tmp, v1512);
   1691     int16x8_t v1514 = vaddq_s16(v1511, v1513);
   1692     int16x8_t v1515 = vsubq_s16(v1275, v1277);
   1693     int16x8_t v1516 = vsubq_s16(v1279, v1281);
   1694     int16x8_t v1517_tmp = vqrdmulhq_n_s16(v1516, 7662);
   1695     int16x8_t v1517 = vaddq_s16(v1517_tmp, v1516);
   1696     int16x8_t v1518 = vaddq_s16(v1515, v1517);
   1697     int16x8_t v1519 = vqrdmulhq_n_s16(v1518, 19546);
   1698     int16x8_t v1520 = vaddq_s16(v1514, v1519);
   1699     int16x8_t v1521 = vsubq_s16(v1222, v1227);
   1700     int16x8_t v1522 = vsubq_s16(v1232, v1237);
   1701     int16x8_t v1523_tmp = vqrdmulhq_n_s16(v1522, 12756);
   1702     int16x8_t v1523 = vaddq_s16(v1523_tmp, v1522);
   1703     int16x8_t v1524 = vaddq_s16(v1521, v1523);
   1704     int16x8_t v1525 = vsubq_s16(v1244, v1249);
   1705     int16x8_t v1526 = vsubq_s16(v1254, v1259);
   1706     int16x8_t v1527_tmp = vqrdmulhq_n_s16(v1526, 12756);
   1707     int16x8_t v1527 = vaddq_s16(v1527_tmp, v1526);
   1708     int16x8_t v1528 = vaddq_s16(v1525, v1527);
   1709     int16x8_t v1529 = vqrdmulhq_n_s16(v1528, 19869);
   1710     int16x8_t v1530 = vaddq_s16(v1524, v1529);
   1711     int16x8_t v1531 = vsubq_s16(v1176, v1181);
   1712     int16x8_t v1532 = vsubq_s16(v1186, v1191);
   1713     int16x8_t v1533_tmp = vqrdmulhq_n_s16(v1532, 19463);
   1714     int16x8_t v1533 = vaddq_s16(v1533_tmp, v1532);
   1715     int16x8_t v1534 = vaddq_s16(v1531, v1533);
   1716     int16x8_t v1535 = vsubq_s16(v1198, v1203);
   1717     int16x8_t v1536 = vsubq_s16(v1208, v1213);
   1718     int16x8_t v1537_tmp = vqrdmulhq_n_s16(v1536, 19463);
   1719     int16x8_t v1537 = vaddq_s16(v1537_tmp, v1536);
   1720     int16x8_t v1538 = vaddq_s16(v1535, v1537);
   1721     int16x8_t v1539 = vqrdmulhq_n_s16(v1538, 20216);
   1722     int16x8_t v1540 = vaddq_s16(v1534, v1539);
   1723     int16x8_t v1541 = vsubq_s16(v1130, v1135);
   1724     int16x8_t v1542 = vsubq_s16(v1140, v1145);
   1725     int16x8_t v1543_tmp = vqrdmulhq_n_s16(v1542, 28661);
   1726     int16x8_t v1543 = vaddq_s16(v1543_tmp, v1542);
   1727     int16x8_t v1544 = vaddq_s16(v1541, v1543);
   1728     int16x8_t v1545 = vsubq_s16(v1152, v1157);
   1729     int16x8_t v1546 = vsubq_s16(v1162, v1167);
   1730     int16x8_t v1547_tmp = vqrdmulhq_n_s16(v1546, 28661);
   1731     int16x8_t v1547 = vaddq_s16(v1547_tmp, v1546);
   1732     int16x8_t v1548 = vaddq_s16(v1545, v1547);
   1733     int16x8_t v1549 = vqrdmulhq_n_s16(v1548, 20587);
   1734     int16x8_t v1550 = vaddq_s16(v1544, v1549);
   1735     int16x8_t v1551 = vsubq_s16(v1084, v1089);
   1736     int16x8_t v1552 = vsubq_s16(v1094, v1099);
   1737     int16x8_t v1553_tmp = vqrdmulhq_n_s16(v1552, 9242);
   1738     int16x8_t v1553 = vmlaq_n_s16(v1553_tmp, v1552, 2);
   1739     int16x8_t v1554 = vaddq_s16(v1551, v1553);
   1740     int16x8_t v1555 = vsubq_s16(v1106, v1111);
   1741     int16x8_t v1556 = vsubq_s16(v1116, v1121);
   1742     int16x8_t v1557_tmp = vqrdmulhq_n_s16(v1556, 9242);
   1743     int16x8_t v1557 = vmlaq_n_s16(v1557_tmp, v1556, 2);
   1744     int16x8_t v1558 = vaddq_s16(v1555, v1557);
   1745     int16x8_t v1559 = vqrdmulhq_n_s16(v1558, 20985);
   1746     int16x8_t v1560 = vaddq_s16(v1554, v1559);
   1747     int16x8_t v1561 = vsubq_s16(v996, v1007);
   1748     int16x8_t v1562 = vsubq_s16(v1018, v1029);
   1749     int16x8_t v1563_tmp = vqrdmulhq_n_s16(v1562, 30298);
   1750     int16x8_t v1563 = vmlaq_n_s16(v1563_tmp, v1562, 2);
   1751     int16x8_t v1564 = vaddq_s16(v1561, v1563);
   1752     int16x8_t v1565 = vsubq_s16(v1042, v1053);
   1753     int16x8_t v1566 = vsubq_s16(v1064, v1075);
   1754     int16x8_t v1567_tmp = vqrdmulhq_n_s16(v1566, 30298);
   1755     int16x8_t v1567 = vmlaq_n_s16(v1567_tmp, v1566, 2);
   1756     int16x8_t v1568 = vaddq_s16(v1565, v1567);
   1757     int16x8_t v1569 = vqrdmulhq_n_s16(v1568, 21412);
   1758     int16x8_t v1570 = vaddq_s16(v1564, v1569);
   1759     int16x8_t v1571 = vsubq_s16(v901, v912);
   1760     int16x8_t v1572 = vsubq_s16(v924, v935);
   1761     int16x8_t v1573_tmp = vqrdmulhq_n_s16(v1572, 2773);
   1762     int16x8_t v1573 = vmlaq_n_s16(v1573_tmp, v1572, 4);
   1763     int16x8_t v1574 = vaddq_s16(v1571, v1573);
   1764     int16x8_t v1575 = vsubq_s16(v948, v959);
   1765     int16x8_t v1576 = vsubq_s16(v970, v981);
   1766     int16x8_t v1577_tmp = vqrdmulhq_n_s16(v1576, 2773);
   1767     int16x8_t v1577 = vmlaq_n_s16(v1577_tmp, v1576, 4);
   1768     int16x8_t v1578 = vaddq_s16(v1575, v1577);
   1769     int16x8_t v1579 = vqrdmulhq_n_s16(v1578, 21871);
   1770     int16x8_t v1580 = vaddq_s16(v1574, v1579);
   1771     int16x8_t v1581 = vsubq_s16(v723, v746);
   1772     int16x8_t v1582 = vsubq_s16(v769, v792);
   1773     int16x8_t v1583_tmp = vqrdmulhq_n_s16(v1582, 26108);
   1774     int16x8_t v1583 = vmlaq_n_s16(v1583_tmp, v1582, 6);
   1775     int16x8_t v1584 = vaddq_s16(v1581, v1583);
   1776     int16x8_t v1585 = vsubq_s16(v817, v840);
   1777     int16x8_t v1586 = vsubq_s16(v863, v886);
   1778     int16x8_t v1587_tmp = vqrdmulhq_n_s16(v1586, 26108);
   1779     int16x8_t v1587 = vmlaq_n_s16(v1587_tmp, v1586, 6);
   1780     int16x8_t v1588 = vaddq_s16(v1585, v1587);
   1781     int16x8_t v1589 = vqrdmulhq_n_s16(v1588, 22363);
   1782     int16x8_t v1590 = vaddq_s16(v1584, v1589);
   1783     int16x8_t v1591 = vsubq_s16(v61, v140);
   1784     int16x8_t v1592 = vsubq_s16(v234, v314);
   1785     int16x8_t v1593_tmp = vqrdmulhq_n_s16(v1592, 12251);
   1786     int16x8_t v1593 = vmlaq_n_s16(v1593_tmp, v1592, 20);
   1787     int16x8_t v1594 = vaddq_s16(v1591, v1593);
   1788     int16x8_t v1595 = vsubq_s16(v410, v521);
   1789     int16x8_t v1596 = vsubq_s16(v615, v696);
   1790     int16x8_t v1597_tmp = vqrdmulhq_n_s16(v1596, 12251);
   1791     int16x8_t v1597 = vmlaq_n_s16(v1597_tmp, v1596, 20);
   1792     int16x8_t v1598 = vaddq_s16(v1595, v1597);
   1793     int16x8_t v1599 = vqrdmulhq_n_s16(v1598, 22891);
   1794     int16x8_t v1600 = vaddq_s16(v1594, v1599);
   1795     int16x8_t v1601 = vsubq_s16(v1591, v1593);
   1796     int16x8_t v1602 = vsubq_s16(v1595, v1597);
   1797     int16x8_t v1603 = vqrdmulhq_n_s16(v1602, 23460);
   1798     int16x8_t v1604 = vaddq_s16(v1601, v1603);
   1799     int16x8_t v1605 = vsubq_s16(v1581, v1583);
   1800     int16x8_t v1606 = vsubq_s16(v1585, v1587);
   1801     int16x8_t v1607 = vqrdmulhq_n_s16(v1606, 24073);
   1802     int16x8_t v1608 = vaddq_s16(v1605, v1607);
   1803     int16x8_t v1609 = vsubq_s16(v1571, v1573);
   1804     int16x8_t v1610 = vsubq_s16(v1575, v1577);
   1805     int16x8_t v1611 = vqrdmulhq_n_s16(v1610, 24734);
   1806     int16x8_t v1612 = vaddq_s16(v1609, v1611);
   1807     int16x8_t v1613 = vsubq_s16(v1561, v1563);
   1808     int16x8_t v1614 = vsubq_s16(v1565, v1567);
   1809     int16x8_t v1615 = vqrdmulhq_n_s16(v1614, 25448);
   1810     int16x8_t v1616 = vaddq_s16(v1613, v1615);
   1811     int16x8_t v1617 = vsubq_s16(v1551, v1553);
   1812     int16x8_t v1618 = vsubq_s16(v1555, v1557);
   1813     int16x8_t v1619 = vqrdmulhq_n_s16(v1618, 26220);
   1814     int16x8_t v1620 = vaddq_s16(v1617, v1619);
   1815     int16x8_t v1621 = vsubq_s16(v1541, v1543);
   1816     int16x8_t v1622 = vsubq_s16(v1545, v1547);
   1817     int16x8_t v1623 = vqrdmulhq_n_s16(v1622, 27058);
   1818     int16x8_t v1624 = vaddq_s16(v1621, v1623);
   1819     int16x8_t v1625 = vsubq_s16(v1531, v1533);
   1820     int16x8_t v1626 = vsubq_s16(v1535, v1537);
   1821     int16x8_t v1627 = vqrdmulhq_n_s16(v1626, 27969);
   1822     int16x8_t v1628 = vaddq_s16(v1625, v1627);
   1823     int16x8_t v1629 = vsubq_s16(v1521, v1523);
   1824     int16x8_t v1630 = vsubq_s16(v1525, v1527);
   1825     int16x8_t v1631 = vqrdmulhq_n_s16(v1630, 28961);
   1826     int16x8_t v1632 = vaddq_s16(v1629, v1631);
   1827     int16x8_t v1633 = vsubq_s16(v1511, v1513);
   1828     int16x8_t v1634 = vsubq_s16(v1515, v1517);
   1829     int16x8_t v1635 = vqrdmulhq_n_s16(v1634, 30044);
   1830     int16x8_t v1636 = vaddq_s16(v1633, v1635);
   1831     int16x8_t v1637 = vsubq_s16(v1501, v1503);
   1832     int16x8_t v1638 = vsubq_s16(v1505, v1507);
   1833     int16x8_t v1639 = vqrdmulhq_n_s16(v1638, 31232);
   1834     int16x8_t v1640 = vaddq_s16(v1637, v1639);
   1835     int16x8_t v1641 = vsubq_s16(v1491, v1493);
   1836     int16x8_t v1642 = vsubq_s16(v1495, v1497);
   1837     int16x8_t v1643 = vqrdmulhq_n_s16(v1642, 32538);
   1838     int16x8_t v1644 = vaddq_s16(v1641, v1643);
   1839     int16x8_t v1645 = vsubq_s16(v1481, v1483);
   1840     int16x8_t v1646 = vsubq_s16(v1485, v1487);
   1841     int16x8_t v1647_tmp = vqrdmulhq_n_s16(v1646, 1211);
   1842     int16x8_t v1647 = vaddq_s16(v1647_tmp, v1646);
   1843     int16x8_t v1648 = vaddq_s16(v1645, v1647);
   1844     int16x8_t v1649 = vsubq_s16(v1471, v1473);
   1845     int16x8_t v1650 = vsubq_s16(v1475, v1477);
   1846     int16x8_t v1651_tmp = vqrdmulhq_n_s16(v1650, 2808);
   1847     int16x8_t v1651 = vaddq_s16(v1651_tmp, v1650);
   1848     int16x8_t v1652 = vaddq_s16(v1649, v1651);
   1849     int16x8_t v1653 = vsubq_s16(v1461, v1463);
   1850     int16x8_t v1654 = vsubq_s16(v1465, v1467);
   1851     int16x8_t v1655_tmp = vqrdmulhq_n_s16(v1654, 4586);
   1852     int16x8_t v1655 = vaddq_s16(v1655_tmp, v1654);
   1853     int16x8_t v1656 = vaddq_s16(v1653, v1655);
   1854     int16x8_t v1657 = vsubq_s16(v1451, v1453);
   1855     int16x8_t v1658 = vsubq_s16(v1455, v1457);
   1856     int16x8_t v1659_tmp = vqrdmulhq_n_s16(v1658, 6576);
   1857     int16x8_t v1659 = vaddq_s16(v1659_tmp, v1658);
   1858     int16x8_t v1660 = vaddq_s16(v1657, v1659);
   1859     int16x8_t v1661 = vsubq_s16(v1441, v1443);
   1860     int16x8_t v1662 = vsubq_s16(v1445, v1447);
   1861     int16x8_t v1663_tmp = vqrdmulhq_n_s16(v1662, 8817);
   1862     int16x8_t v1663 = vaddq_s16(v1663_tmp, v1662);
   1863     int16x8_t v1664 = vaddq_s16(v1661, v1663);
   1864     int16x8_t v1665 = vsubq_s16(v1422, v1427);
   1865     int16x8_t v1666 = vsubq_s16(v1432, v1437);
   1866     int16x8_t v1667_tmp = vqrdmulhq_n_s16(v1666, 11356);
   1867     int16x8_t v1667 = vaddq_s16(v1667_tmp, v1666);
   1868     int16x8_t v1668 = vaddq_s16(v1665, v1667);
   1869     int16x8_t v1669 = vsubq_s16(v1400, v1405);
   1870     int16x8_t v1670 = vsubq_s16(v1410, v1415);
   1871     int16x8_t v1671_tmp = vqrdmulhq_n_s16(v1670, 14256);
   1872     int16x8_t v1671 = vaddq_s16(v1671_tmp, v1670);
   1873     int16x8_t v1672 = vaddq_s16(v1669, v1671);
   1874     int16x8_t v1673 = vsubq_s16(v1378, v1383);
   1875     int16x8_t v1674 = vsubq_s16(v1388, v1393);
   1876     int16x8_t v1675_tmp = vqrdmulhq_n_s16(v1674, 17596);
   1877     int16x8_t v1675 = vaddq_s16(v1675_tmp, v1674);
   1878     int16x8_t v1676 = vaddq_s16(v1673, v1675);
   1879     int16x8_t v1677 = vsubq_s16(v1356, v1361);
   1880     int16x8_t v1678 = vsubq_s16(v1366, v1371);
   1881     int16x8_t v1679_tmp = vqrdmulhq_n_s16(v1678, 21483);
   1882     int16x8_t v1679 = vaddq_s16(v1679_tmp, v1678);
   1883     int16x8_t v1680 = vaddq_s16(v1677, v1679);
   1884     int16x8_t v1681 = vsubq_s16(v1334, v1339);
   1885     int16x8_t v1682 = vsubq_s16(v1344, v1349);
   1886     int16x8_t v1683_tmp = vqrdmulhq_n_s16(v1682, 26057);
   1887     int16x8_t v1683 = vaddq_s16(v1683_tmp, v1682);
   1888     int16x8_t v1684 = vaddq_s16(v1681, v1683);
   1889     int16x8_t v1685 = vsubq_s16(v1312, v1317);
   1890     int16x8_t v1686 = vsubq_s16(v1322, v1327);
   1891     int16x8_t v1687_tmp = vqrdmulhq_n_s16(v1686, 31517);
   1892     int16x8_t v1687 = vaddq_s16(v1687_tmp, v1686);
   1893     int16x8_t v1688 = vaddq_s16(v1685, v1687);
   1894     int16x8_t v1689 = vsubq_s16(v1290, v1295);
   1895     int16x8_t v1690 = vsubq_s16(v1300, v1305);
   1896     int16x8_t v1691_tmp = vqrdmulhq_n_s16(v1690, 5373);
   1897     int16x8_t v1691 = vmlaq_n_s16(v1691_tmp, v1690, 2);
   1898     int16x8_t v1692 = vaddq_s16(v1689, v1691);
   1899     int16x8_t v1693 = vsubq_s16(v1268, v1273);
   1900     int16x8_t v1694 = vsubq_s16(v1278, v1283);
   1901     int16x8_t v1695_tmp = vqrdmulhq_n_s16(v1694, 13571);
   1902     int16x8_t v1695 = vmlaq_n_s16(v1695_tmp, v1694, 2);
   1903     int16x8_t v1696 = vaddq_s16(v1693, v1695);
   1904     int16x8_t v1697 = vsubq_s16(v1228, v1239);
   1905     int16x8_t v1698 = vsubq_s16(v1250, v1261);
   1906     int16x8_t v1699_tmp = vqrdmulhq_n_s16(v1698, 23975);
   1907     int16x8_t v1699 = vmlaq_n_s16(v1699_tmp, v1698, 2);
   1908     int16x8_t v1700 = vaddq_s16(v1697, v1699);
   1909     int16x8_t v1701 = vsubq_s16(v1182, v1193);
   1910     int16x8_t v1702 = vsubq_s16(v1204, v1215);
   1911     int16x8_t v1703_tmp = vqrdmulhq_n_s16(v1702, 4832);
   1912     int16x8_t v1703 = vmlaq_n_s16(v1703_tmp, v1702, 3);
   1913     int16x8_t v1704 = vaddq_s16(v1701, v1703);
   1914     int16x8_t v1705 = vsubq_s16(v1136, v1147);
   1915     int16x8_t v1706 = vsubq_s16(v1158, v1169);
   1916     int16x8_t v1707_tmp = vqrdmulhq_n_s16(v1706, 23437);
   1917     int16x8_t v1707 = vmlaq_n_s16(v1707_tmp, v1706, 3);
   1918     int16x8_t v1708 = vaddq_s16(v1705, v1707);
   1919     int16x8_t v1709 = vsubq_s16(v1090, v1101);
   1920     int16x8_t v1710 = vsubq_s16(v1112, v1123);
   1921     int16x8_t v1711_tmp = vqrdmulhq_n_s16(v1710, 17573);
   1922     int16x8_t v1711 = vmlaq_n_s16(v1711_tmp, v1710, 4);
   1923     int16x8_t v1712 = vaddq_s16(v1709, v1711);
   1924     int16x8_t v1713 = vsubq_s16(v1008, v1031);
   1925     int16x8_t v1714 = vsubq_s16(v1054, v1077);
   1926     int16x8_t v1715_tmp = vqrdmulhq_n_s16(v1714, 27122);
   1927     int16x8_t v1715 = vmlaq_n_s16(v1715_tmp, v1714, 5);
   1928     int16x8_t v1716 = vaddq_s16(v1713, v1715);
   1929     int16x8_t v1717 = vsubq_s16(v913, v937);
   1930     int16x8_t v1718 = vsubq_s16(v960, v983);
   1931     int16x8_t v1719_tmp = vqrdmulhq_n_s16(v1718, 5041);
   1932     int16x8_t v1719 = vmlaq_n_s16(v1719_tmp, v1718, 8);
   1933     int16x8_t v1720 = vaddq_s16(v1717, v1719);
   1934     int16x8_t v1721 = vsubq_s16(v747, v794);
   1935     int16x8_t v1722 = vsubq_s16(v841, v888);
   1936     int16x8_t v1723_tmp = vqrdmulhq_n_s16(v1722, 19146);
   1937     int16x8_t v1723 = vmlaq_n_s16(v1723_tmp, v1722, 13);
   1938     int16x8_t v1724 = vaddq_s16(v1721, v1723);
   1939     int16x8_t v1725 = vsubq_s16(v141, v316);
   1940     int16x8_t v1726 = vsubq_s16(v522, v698);
   1941     int16x8_t v1727_tmp = vqrdmulhq_n_s16(v1726, 24402);
   1942     int16x8_t v1727 = vmlaq_n_s16(v1727_tmp, v1726, 40);
   1943     int16x8_t v1728 = vaddq_s16(v1725, v1727);
   1944     int16x8_t v1729 = vsubq_s16(v1725, v1727);
   1945     int16x8_t v1730 = vsubq_s16(v1721, v1723);
   1946     int16x8_t v1731 = vsubq_s16(v1717, v1719);
   1947     int16x8_t v1732 = vsubq_s16(v1713, v1715);
   1948     int16x8_t v1733 = vsubq_s16(v1709, v1711);
   1949     int16x8_t v1734 = vsubq_s16(v1705, v1707);
   1950     int16x8_t v1735 = vsubq_s16(v1701, v1703);
   1951     int16x8_t v1736 = vsubq_s16(v1697, v1699);
   1952     int16x8_t v1737 = vsubq_s16(v1693, v1695);
   1953     int16x8_t v1738 = vsubq_s16(v1689, v1691);
   1954     int16x8_t v1739 = vsubq_s16(v1685, v1687);
   1955     int16x8_t v1740 = vsubq_s16(v1681, v1683);
   1956     int16x8_t v1741 = vsubq_s16(v1677, v1679);
   1957     int16x8_t v1742 = vsubq_s16(v1673, v1675);
   1958     int16x8_t v1743 = vsubq_s16(v1669, v1671);
   1959     int16x8_t v1744 = vsubq_s16(v1665, v1667);
   1960     int16x8_t v1745 = vsubq_s16(v1661, v1663);
   1961     int16x8_t v1746 = vsubq_s16(v1657, v1659);
   1962     int16x8_t v1747 = vsubq_s16(v1653, v1655);
   1963     int16x8_t v1748 = vsubq_s16(v1649, v1651);
   1964     int16x8_t v1749 = vsubq_s16(v1645, v1647);
   1965     int16x8_t v1750 = vsubq_s16(v1641, v1643);
   1966     int16x8_t v1751 = vsubq_s16(v1637, v1639);
   1967     int16x8_t v1752 = vsubq_s16(v1633, v1635);
   1968     int16x8_t v1753 = vsubq_s16(v1629, v1631);
   1969     int16x8_t v1754 = vsubq_s16(v1625, v1627);
   1970     int16x8_t v1755 = vsubq_s16(v1621, v1623);
   1971     int16x8_t v1756 = vsubq_s16(v1617, v1619);
   1972     int16x8_t v1757 = vsubq_s16(v1613, v1615);
   1973     int16x8_t v1758 = vsubq_s16(v1609, v1611);
   1974     int16x8_t v1759 = vsubq_s16(v1605, v1607);
   1975     int16x8_t v1760 = vsubq_s16(v1601, v1603);
   1976     int16x8_t v1761 = vsubq_s16(v1594, v1599);
   1977     int16x8_t v1762 = vsubq_s16(v1584, v1589);
   1978     int16x8_t v1763 = vsubq_s16(v1574, v1579);
   1979     int16x8_t v1764 = vsubq_s16(v1564, v1569);
   1980     int16x8_t v1765 = vsubq_s16(v1554, v1559);
   1981     int16x8_t v1766 = vsubq_s16(v1544, v1549);
   1982     int16x8_t v1767 = vsubq_s16(v1534, v1539);
   1983     int16x8_t v1768 = vsubq_s16(v1524, v1529);
   1984     int16x8_t v1769 = vsubq_s16(v1514, v1519);
   1985     int16x8_t v1770 = vsubq_s16(v1504, v1509);
   1986     int16x8_t v1771 = vsubq_s16(v1494, v1499);
   1987     int16x8_t v1772 = vsubq_s16(v1484, v1489);
   1988     int16x8_t v1773 = vsubq_s16(v1474, v1479);
   1989     int16x8_t v1774 = vsubq_s16(v1464, v1469);
   1990     int16x8_t v1775 = vsubq_s16(v1454, v1459);
   1991     int16x8_t v1776 = vsubq_s16(v1444, v1449);
   1992     int16x8_t v1777 = vsubq_s16(v1428, v1439);
   1993     int16x8_t v1778 = vsubq_s16(v1406, v1417);
   1994     int16x8_t v1779 = vsubq_s16(v1384, v1395);
   1995     int16x8_t v1780 = vsubq_s16(v1362, v1373);
   1996     int16x8_t v1781 = vsubq_s16(v1340, v1351);
   1997     int16x8_t v1782 = vsubq_s16(v1318, v1329);
   1998     int16x8_t v1783 = vsubq_s16(v1296, v1307);
   1999     int16x8_t v1784 = vsubq_s16(v1274, v1285);
   2000     int16x8_t v1785 = vsubq_s16(v1240, v1263);
   2001     int16x8_t v1786 = vsubq_s16(v1194, v1217);
   2002     int16x8_t v1787 = vsubq_s16(v1148, v1171);
   2003     int16x8_t v1788 = vsubq_s16(v1102, v1125);
   2004     int16x8_t v1789 = vsubq_s16(v1032, v1079);
   2005     int16x8_t v1790 = vsubq_s16(v938, v985);
   2006     int16x8_t v1791 = vsubq_s16(v795, v890);
   2007     int16x8_t v1792 = vsubq_s16(v317, v700);
   2008     vst1q_s16(out + out_stride * 0 + i, v701);
   2009     vst1q_s16(out + out_stride * 1 + i, v891);
   2010     vst1q_s16(out + out_stride * 2 + i, v986);
   2011     vst1q_s16(out + out_stride * 3 + i, v1080);
   2012     vst1q_s16(out + out_stride * 4 + i, v1126);
   2013     vst1q_s16(out + out_stride * 5 + i, v1172);
   2014     vst1q_s16(out + out_stride * 6 + i, v1218);
   2015     vst1q_s16(out + out_stride * 7 + i, v1264);
   2016     vst1q_s16(out + out_stride * 8 + i, v1286);
   2017     vst1q_s16(out + out_stride * 9 + i, v1308);
   2018     vst1q_s16(out + out_stride * 10 + i, v1330);
   2019     vst1q_s16(out + out_stride * 11 + i, v1352);
   2020     vst1q_s16(out + out_stride * 12 + i, v1374);
   2021     vst1q_s16(out + out_stride * 13 + i, v1396);
   2022     vst1q_s16(out + out_stride * 14 + i, v1418);
   2023     vst1q_s16(out + out_stride * 15 + i, v1440);
   2024     vst1q_s16(out + out_stride * 16 + i, v1450);
   2025     vst1q_s16(out + out_stride * 17 + i, v1460);
   2026     vst1q_s16(out + out_stride * 18 + i, v1470);
   2027     vst1q_s16(out + out_stride * 19 + i, v1480);
   2028     vst1q_s16(out + out_stride * 20 + i, v1490);
   2029     vst1q_s16(out + out_stride * 21 + i, v1500);
   2030     vst1q_s16(out + out_stride * 22 + i, v1510);
   2031     vst1q_s16(out + out_stride * 23 + i, v1520);
   2032     vst1q_s16(out + out_stride * 24 + i, v1530);
   2033     vst1q_s16(out + out_stride * 25 + i, v1540);
   2034     vst1q_s16(out + out_stride * 26 + i, v1550);
   2035     vst1q_s16(out + out_stride * 27 + i, v1560);
   2036     vst1q_s16(out + out_stride * 28 + i, v1570);
   2037     vst1q_s16(out + out_stride * 29 + i, v1580);
   2038     vst1q_s16(out + out_stride * 30 + i, v1590);
   2039     vst1q_s16(out + out_stride * 31 + i, v1600);
   2040     vst1q_s16(out + out_stride * 32 + i, v1604);
   2041     vst1q_s16(out + out_stride * 33 + i, v1608);
   2042     vst1q_s16(out + out_stride * 34 + i, v1612);
   2043     vst1q_s16(out + out_stride * 35 + i, v1616);
   2044     vst1q_s16(out + out_stride * 36 + i, v1620);
   2045     vst1q_s16(out + out_stride * 37 + i, v1624);
   2046     vst1q_s16(out + out_stride * 38 + i, v1628);
   2047     vst1q_s16(out + out_stride * 39 + i, v1632);
   2048     vst1q_s16(out + out_stride * 40 + i, v1636);
   2049     vst1q_s16(out + out_stride * 41 + i, v1640);
   2050     vst1q_s16(out + out_stride * 42 + i, v1644);
   2051     vst1q_s16(out + out_stride * 43 + i, v1648);
   2052     vst1q_s16(out + out_stride * 44 + i, v1652);
   2053     vst1q_s16(out + out_stride * 45 + i, v1656);
   2054     vst1q_s16(out + out_stride * 46 + i, v1660);
   2055     vst1q_s16(out + out_stride * 47 + i, v1664);
   2056     vst1q_s16(out + out_stride * 48 + i, v1668);
   2057     vst1q_s16(out + out_stride * 49 + i, v1672);
   2058     vst1q_s16(out + out_stride * 50 + i, v1676);
   2059     vst1q_s16(out + out_stride * 51 + i, v1680);
   2060     vst1q_s16(out + out_stride * 52 + i, v1684);
   2061     vst1q_s16(out + out_stride * 53 + i, v1688);
   2062     vst1q_s16(out + out_stride * 54 + i, v1692);
   2063     vst1q_s16(out + out_stride * 55 + i, v1696);
   2064     vst1q_s16(out + out_stride * 56 + i, v1700);
   2065     vst1q_s16(out + out_stride * 57 + i, v1704);
   2066     vst1q_s16(out + out_stride * 58 + i, v1708);
   2067     vst1q_s16(out + out_stride * 59 + i, v1712);
   2068     vst1q_s16(out + out_stride * 60 + i, v1716);
   2069     vst1q_s16(out + out_stride * 61 + i, v1720);
   2070     vst1q_s16(out + out_stride * 62 + i, v1724);
   2071     vst1q_s16(out + out_stride * 63 + i, v1728);
   2072     vst1q_s16(out + out_stride * 64 + i, v1729);
   2073     vst1q_s16(out + out_stride * 65 + i, v1730);
   2074     vst1q_s16(out + out_stride * 66 + i, v1731);
   2075     vst1q_s16(out + out_stride * 67 + i, v1732);
   2076     vst1q_s16(out + out_stride * 68 + i, v1733);
   2077     vst1q_s16(out + out_stride * 69 + i, v1734);
   2078     vst1q_s16(out + out_stride * 70 + i, v1735);
   2079     vst1q_s16(out + out_stride * 71 + i, v1736);
   2080     vst1q_s16(out + out_stride * 72 + i, v1737);
   2081     vst1q_s16(out + out_stride * 73 + i, v1738);
   2082     vst1q_s16(out + out_stride * 74 + i, v1739);
   2083     vst1q_s16(out + out_stride * 75 + i, v1740);
   2084     vst1q_s16(out + out_stride * 76 + i, v1741);
   2085     vst1q_s16(out + out_stride * 77 + i, v1742);
   2086     vst1q_s16(out + out_stride * 78 + i, v1743);
   2087     vst1q_s16(out + out_stride * 79 + i, v1744);
   2088     vst1q_s16(out + out_stride * 80 + i, v1745);
   2089     vst1q_s16(out + out_stride * 81 + i, v1746);
   2090     vst1q_s16(out + out_stride * 82 + i, v1747);
   2091     vst1q_s16(out + out_stride * 83 + i, v1748);
   2092     vst1q_s16(out + out_stride * 84 + i, v1749);
   2093     vst1q_s16(out + out_stride * 85 + i, v1750);
   2094     vst1q_s16(out + out_stride * 86 + i, v1751);
   2095     vst1q_s16(out + out_stride * 87 + i, v1752);
   2096     vst1q_s16(out + out_stride * 88 + i, v1753);
   2097     vst1q_s16(out + out_stride * 89 + i, v1754);
   2098     vst1q_s16(out + out_stride * 90 + i, v1755);
   2099     vst1q_s16(out + out_stride * 91 + i, v1756);
   2100     vst1q_s16(out + out_stride * 92 + i, v1757);
   2101     vst1q_s16(out + out_stride * 93 + i, v1758);
   2102     vst1q_s16(out + out_stride * 94 + i, v1759);
   2103     vst1q_s16(out + out_stride * 95 + i, v1760);
   2104     vst1q_s16(out + out_stride * 96 + i, v1761);
   2105     vst1q_s16(out + out_stride * 97 + i, v1762);
   2106     vst1q_s16(out + out_stride * 98 + i, v1763);
   2107     vst1q_s16(out + out_stride * 99 + i, v1764);
   2108     vst1q_s16(out + out_stride * 100 + i, v1765);
   2109     vst1q_s16(out + out_stride * 101 + i, v1766);
   2110     vst1q_s16(out + out_stride * 102 + i, v1767);
   2111     vst1q_s16(out + out_stride * 103 + i, v1768);
   2112     vst1q_s16(out + out_stride * 104 + i, v1769);
   2113     vst1q_s16(out + out_stride * 105 + i, v1770);
   2114     vst1q_s16(out + out_stride * 106 + i, v1771);
   2115     vst1q_s16(out + out_stride * 107 + i, v1772);
   2116     vst1q_s16(out + out_stride * 108 + i, v1773);
   2117     vst1q_s16(out + out_stride * 109 + i, v1774);
   2118     vst1q_s16(out + out_stride * 110 + i, v1775);
   2119     vst1q_s16(out + out_stride * 111 + i, v1776);
   2120     vst1q_s16(out + out_stride * 112 + i, v1777);
   2121     vst1q_s16(out + out_stride * 113 + i, v1778);
   2122     vst1q_s16(out + out_stride * 114 + i, v1779);
   2123     vst1q_s16(out + out_stride * 115 + i, v1780);
   2124     vst1q_s16(out + out_stride * 116 + i, v1781);
   2125     vst1q_s16(out + out_stride * 117 + i, v1782);
   2126     vst1q_s16(out + out_stride * 118 + i, v1783);
   2127     vst1q_s16(out + out_stride * 119 + i, v1784);
   2128     vst1q_s16(out + out_stride * 120 + i, v1785);
   2129     vst1q_s16(out + out_stride * 121 + i, v1786);
   2130     vst1q_s16(out + out_stride * 122 + i, v1787);
   2131     vst1q_s16(out + out_stride * 123 + i, v1788);
   2132     vst1q_s16(out + out_stride * 124 + i, v1789);
   2133     vst1q_s16(out + out_stride * 125 + i, v1790);
   2134     vst1q_s16(out + out_stride * 126 + i, v1791);
   2135     vst1q_s16(out + out_stride * 127 + i, v1792);
   2136   }
   2137 }