fast_dct128-inl.h (103005B)
1 // Copyright (c) the JPEG XL Project Authors. All rights reserved. 2 // 3 // Use of this source code is governed by a BSD-style 4 // license that can be found in the LICENSE file. 5 6 /* This file is automatically generated. Do not modify it directly. */ 7 #if HWY_TARGET != HWY_NEON 8 #error "only include this file from fast_dct-inl.h" 9 #endif 10 11 constexpr size_t FastIDCTIntegerBits(FastDCTTag<128>) { return 2; } 12 13 void FastIDCT(FastDCTTag<128>, const int16_t* in, size_t in_stride, 14 int16_t* out, size_t out_stride, size_t count) { 15 JXL_ASSERT(count % 8 == 0); 16 for (size_t i = 0; i < count; i += 8) { 17 int16x8_t v0 = vld1q_s16(in + in_stride * 0 + i); 18 int16x8_t v1 = vld1q_s16(in + in_stride * 64 + i); 19 int16x8_t v2 = vaddq_s16(v0, v1); 20 int16x8_t v3 = vld1q_s16(in + in_stride * 32 + i); 21 int16x8_t v4_tmp = vqrdmulhq_n_s16(v3, 13573); 22 int16x8_t v4 = vaddq_s16(v4_tmp, v3); 23 int16x8_t v5 = vld1q_s16(in + in_stride * 96 + i); 24 int16x8_t v6 = vaddq_s16(v5, v3); 25 int16x8_t v7 = vaddq_s16(v4, v6); 26 int16x8_t v8 = vqrdmulhq_n_s16(v7, 17734); 27 int16x8_t v9 = vaddq_s16(v2, v8); 28 int16x8_t v10 = vld1q_s16(in + in_stride * 16 + i); 29 int16x8_t v11_tmp = vqrdmulhq_n_s16(v10, 13573); 30 int16x8_t v11 = vaddq_s16(v11_tmp, v10); 31 int16x8_t v12 = vld1q_s16(in + in_stride * 80 + i); 32 int16x8_t v13 = vld1q_s16(in + in_stride * 48 + i); 33 int16x8_t v14 = vaddq_s16(v12, v13); 34 int16x8_t v15 = vaddq_s16(v11, v14); 35 int16x8_t v16 = vaddq_s16(v13, v10); 36 int16x8_t v17_tmp = vqrdmulhq_n_s16(v16, 13573); 37 int16x8_t v17 = vaddq_s16(v17_tmp, v16); 38 int16x8_t v18 = vld1q_s16(in + in_stride * 112 + i); 39 int16x8_t v19 = vaddq_s16(v18, v12); 40 int16x8_t v20 = vaddq_s16(v19, v16); 41 int16x8_t v21 = vaddq_s16(v17, v20); 42 int16x8_t v22 = vqrdmulhq_n_s16(v21, 17734); 43 int16x8_t v23 = vaddq_s16(v15, v22); 44 int16x8_t v24 = vqrdmulhq_n_s16(v23, 16705); 45 int16x8_t v25 = vaddq_s16(v9, v24); 46 int16x8_t v26 = vld1q_s16(in + in_stride * 8 + i); 47 int16x8_t v27_tmp = vqrdmulhq_n_s16(v26, 13573); 48 int16x8_t v27 = vaddq_s16(v27_tmp, v26); 49 int16x8_t v28 = vld1q_s16(in + in_stride * 72 + i); 50 int16x8_t v29 = vld1q_s16(in + in_stride * 56 + i); 51 int16x8_t v30 = vaddq_s16(v28, v29); 52 int16x8_t v31 = vaddq_s16(v27, v30); 53 int16x8_t v32 = vld1q_s16(in + in_stride * 40 + i); 54 int16x8_t v33 = vld1q_s16(in + in_stride * 24 + i); 55 int16x8_t v34 = vaddq_s16(v32, v33); 56 int16x8_t v35_tmp = vqrdmulhq_n_s16(v34, 13573); 57 int16x8_t v35 = vaddq_s16(v35_tmp, v34); 58 int16x8_t v36 = vld1q_s16(in + in_stride * 104 + i); 59 int16x8_t v37 = vld1q_s16(in + in_stride * 88 + i); 60 int16x8_t v38 = vaddq_s16(v36, v37); 61 int16x8_t v39 = vaddq_s16(v38, v34); 62 int16x8_t v40 = vaddq_s16(v35, v39); 63 int16x8_t v41 = vqrdmulhq_n_s16(v40, 17734); 64 int16x8_t v42 = vaddq_s16(v31, v41); 65 int16x8_t v43 = vaddq_s16(v33, v26); 66 int16x8_t v44_tmp = vqrdmulhq_n_s16(v43, 13573); 67 int16x8_t v44 = vaddq_s16(v44_tmp, v43); 68 int16x8_t v45 = vaddq_s16(v37, v28); 69 int16x8_t v46 = vaddq_s16(v29, v32); 70 int16x8_t v47 = vaddq_s16(v45, v46); 71 int16x8_t v48 = vaddq_s16(v44, v47); 72 int16x8_t v49 = vaddq_s16(v46, v43); 73 int16x8_t v50_tmp = vqrdmulhq_n_s16(v49, 13573); 74 int16x8_t v50 = vaddq_s16(v50_tmp, v49); 75 int16x8_t v51 = vld1q_s16(in + in_stride * 120 + i); 76 int16x8_t v52 = vaddq_s16(v51, v36); 77 int16x8_t v53 = vaddq_s16(v52, v45); 78 int16x8_t v54 = vaddq_s16(v53, v49); 79 int16x8_t v55 = vaddq_s16(v50, v54); 80 int16x8_t v56 = vqrdmulhq_n_s16(v55, 17734); 81 int16x8_t v57 = vaddq_s16(v48, v56); 82 int16x8_t v58 = vqrdmulhq_n_s16(v57, 16705); 83 int16x8_t v59 = vaddq_s16(v42, v58); 84 int16x8_t v60 = vqrdmulhq_n_s16(v59, 16463); 85 int16x8_t v61 = vaddq_s16(v25, v60); 86 int16x8_t v62 = vld1q_s16(in + in_stride * 4 + i); 87 int16x8_t v63_tmp = vqrdmulhq_n_s16(v62, 13573); 88 int16x8_t v63 = vaddq_s16(v63_tmp, v62); 89 int16x8_t v64 = vld1q_s16(in + in_stride * 68 + i); 90 int16x8_t v65 = vld1q_s16(in + in_stride * 60 + i); 91 int16x8_t v66 = vaddq_s16(v64, v65); 92 int16x8_t v67 = vaddq_s16(v63, v66); 93 int16x8_t v68 = vld1q_s16(in + in_stride * 36 + i); 94 int16x8_t v69 = vld1q_s16(in + in_stride * 28 + i); 95 int16x8_t v70 = vaddq_s16(v68, v69); 96 int16x8_t v71_tmp = vqrdmulhq_n_s16(v70, 13573); 97 int16x8_t v71 = vaddq_s16(v71_tmp, v70); 98 int16x8_t v72 = vld1q_s16(in + in_stride * 100 + i); 99 int16x8_t v73 = vld1q_s16(in + in_stride * 92 + i); 100 int16x8_t v74 = vaddq_s16(v72, v73); 101 int16x8_t v75 = vaddq_s16(v74, v70); 102 int16x8_t v76 = vaddq_s16(v71, v75); 103 int16x8_t v77 = vqrdmulhq_n_s16(v76, 17734); 104 int16x8_t v78 = vaddq_s16(v67, v77); 105 int16x8_t v79 = vld1q_s16(in + in_stride * 20 + i); 106 int16x8_t v80 = vld1q_s16(in + in_stride * 12 + i); 107 int16x8_t v81 = vaddq_s16(v79, v80); 108 int16x8_t v82_tmp = vqrdmulhq_n_s16(v81, 13573); 109 int16x8_t v82 = vaddq_s16(v82_tmp, v81); 110 int16x8_t v83 = vld1q_s16(in + in_stride * 84 + i); 111 int16x8_t v84 = vld1q_s16(in + in_stride * 76 + i); 112 int16x8_t v85 = vaddq_s16(v83, v84); 113 int16x8_t v86 = vld1q_s16(in + in_stride * 52 + i); 114 int16x8_t v87 = vld1q_s16(in + in_stride * 44 + i); 115 int16x8_t v88 = vaddq_s16(v86, v87); 116 int16x8_t v89 = vaddq_s16(v85, v88); 117 int16x8_t v90 = vaddq_s16(v82, v89); 118 int16x8_t v91 = vaddq_s16(v88, v81); 119 int16x8_t v92_tmp = vqrdmulhq_n_s16(v91, 13573); 120 int16x8_t v92 = vaddq_s16(v92_tmp, v91); 121 int16x8_t v93 = vld1q_s16(in + in_stride * 116 + i); 122 int16x8_t v94 = vld1q_s16(in + in_stride * 108 + i); 123 int16x8_t v95 = vaddq_s16(v93, v94); 124 int16x8_t v96 = vaddq_s16(v95, v85); 125 int16x8_t v97 = vaddq_s16(v96, v91); 126 int16x8_t v98 = vaddq_s16(v92, v97); 127 int16x8_t v99 = vqrdmulhq_n_s16(v98, 17734); 128 int16x8_t v100 = vaddq_s16(v90, v99); 129 int16x8_t v101 = vqrdmulhq_n_s16(v100, 16705); 130 int16x8_t v102 = vaddq_s16(v78, v101); 131 int16x8_t v103 = vaddq_s16(v80, v62); 132 int16x8_t v104_tmp = vqrdmulhq_n_s16(v103, 13573); 133 int16x8_t v104 = vaddq_s16(v104_tmp, v103); 134 int16x8_t v105 = vaddq_s16(v84, v64); 135 int16x8_t v106 = vaddq_s16(v65, v86); 136 int16x8_t v107 = vaddq_s16(v105, v106); 137 int16x8_t v108 = vaddq_s16(v104, v107); 138 int16x8_t v109 = vaddq_s16(v87, v68); 139 int16x8_t v110 = vaddq_s16(v69, v79); 140 int16x8_t v111 = vaddq_s16(v109, v110); 141 int16x8_t v112_tmp = vqrdmulhq_n_s16(v111, 13573); 142 int16x8_t v112 = vaddq_s16(v112_tmp, v111); 143 int16x8_t v113 = vaddq_s16(v94, v72); 144 int16x8_t v114 = vaddq_s16(v73, v83); 145 int16x8_t v115 = vaddq_s16(v113, v114); 146 int16x8_t v116 = vaddq_s16(v115, v111); 147 int16x8_t v117 = vaddq_s16(v112, v116); 148 int16x8_t v118 = vqrdmulhq_n_s16(v117, 17734); 149 int16x8_t v119 = vaddq_s16(v108, v118); 150 int16x8_t v120 = vaddq_s16(v110, v103); 151 int16x8_t v121_tmp = vqrdmulhq_n_s16(v120, 13573); 152 int16x8_t v121 = vaddq_s16(v121_tmp, v120); 153 int16x8_t v122 = vaddq_s16(v114, v105); 154 int16x8_t v123 = vaddq_s16(v106, v109); 155 int16x8_t v124 = vaddq_s16(v122, v123); 156 int16x8_t v125 = vaddq_s16(v121, v124); 157 int16x8_t v126 = vaddq_s16(v123, v120); 158 int16x8_t v127_tmp = vqrdmulhq_n_s16(v126, 13573); 159 int16x8_t v127 = vaddq_s16(v127_tmp, v126); 160 int16x8_t v128 = vld1q_s16(in + in_stride * 124 + i); 161 int16x8_t v129 = vaddq_s16(v128, v93); 162 int16x8_t v130 = vaddq_s16(v129, v113); 163 int16x8_t v131 = vaddq_s16(v130, v122); 164 int16x8_t v132 = vaddq_s16(v131, v126); 165 int16x8_t v133 = vaddq_s16(v127, v132); 166 int16x8_t v134 = vqrdmulhq_n_s16(v133, 17734); 167 int16x8_t v135 = vaddq_s16(v125, v134); 168 int16x8_t v136 = vqrdmulhq_n_s16(v135, 16705); 169 int16x8_t v137 = vaddq_s16(v119, v136); 170 int16x8_t v138 = vqrdmulhq_n_s16(v137, 16463); 171 int16x8_t v139 = vaddq_s16(v102, v138); 172 int16x8_t v140 = vqrdmulhq_n_s16(v139, 16404); 173 int16x8_t v141 = vaddq_s16(v61, v140); 174 int16x8_t v142 = vld1q_s16(in + in_stride * 2 + i); 175 int16x8_t v143_tmp = vqrdmulhq_n_s16(v142, 13573); 176 int16x8_t v143 = vaddq_s16(v143_tmp, v142); 177 int16x8_t v144 = vld1q_s16(in + in_stride * 66 + i); 178 int16x8_t v145 = vld1q_s16(in + in_stride * 62 + i); 179 int16x8_t v146 = vaddq_s16(v144, v145); 180 int16x8_t v147 = vaddq_s16(v143, v146); 181 int16x8_t v148 = vld1q_s16(in + in_stride * 34 + i); 182 int16x8_t v149 = vld1q_s16(in + in_stride * 30 + i); 183 int16x8_t v150 = vaddq_s16(v148, v149); 184 int16x8_t v151_tmp = vqrdmulhq_n_s16(v150, 13573); 185 int16x8_t v151 = vaddq_s16(v151_tmp, v150); 186 int16x8_t v152 = vld1q_s16(in + in_stride * 98 + i); 187 int16x8_t v153 = vld1q_s16(in + in_stride * 94 + i); 188 int16x8_t v154 = vaddq_s16(v152, v153); 189 int16x8_t v155 = vaddq_s16(v154, v150); 190 int16x8_t v156 = vaddq_s16(v151, v155); 191 int16x8_t v157 = vqrdmulhq_n_s16(v156, 17734); 192 int16x8_t v158 = vaddq_s16(v147, v157); 193 int16x8_t v159 = vld1q_s16(in + in_stride * 18 + i); 194 int16x8_t v160 = vld1q_s16(in + in_stride * 14 + i); 195 int16x8_t v161 = vaddq_s16(v159, v160); 196 int16x8_t v162_tmp = vqrdmulhq_n_s16(v161, 13573); 197 int16x8_t v162 = vaddq_s16(v162_tmp, v161); 198 int16x8_t v163 = vld1q_s16(in + in_stride * 82 + i); 199 int16x8_t v164 = vld1q_s16(in + in_stride * 78 + i); 200 int16x8_t v165 = vaddq_s16(v163, v164); 201 int16x8_t v166 = vld1q_s16(in + in_stride * 50 + i); 202 int16x8_t v167 = vld1q_s16(in + in_stride * 46 + i); 203 int16x8_t v168 = vaddq_s16(v166, v167); 204 int16x8_t v169 = vaddq_s16(v165, v168); 205 int16x8_t v170 = vaddq_s16(v162, v169); 206 int16x8_t v171 = vaddq_s16(v168, v161); 207 int16x8_t v172_tmp = vqrdmulhq_n_s16(v171, 13573); 208 int16x8_t v172 = vaddq_s16(v172_tmp, v171); 209 int16x8_t v173 = vld1q_s16(in + in_stride * 114 + i); 210 int16x8_t v174 = vld1q_s16(in + in_stride * 110 + i); 211 int16x8_t v175 = vaddq_s16(v173, v174); 212 int16x8_t v176 = vaddq_s16(v175, v165); 213 int16x8_t v177 = vaddq_s16(v176, v171); 214 int16x8_t v178 = vaddq_s16(v172, v177); 215 int16x8_t v179 = vqrdmulhq_n_s16(v178, 17734); 216 int16x8_t v180 = vaddq_s16(v170, v179); 217 int16x8_t v181 = vqrdmulhq_n_s16(v180, 16705); 218 int16x8_t v182 = vaddq_s16(v158, v181); 219 int16x8_t v183 = vld1q_s16(in + in_stride * 10 + i); 220 int16x8_t v184 = vld1q_s16(in + in_stride * 6 + i); 221 int16x8_t v185 = vaddq_s16(v183, v184); 222 int16x8_t v186_tmp = vqrdmulhq_n_s16(v185, 13573); 223 int16x8_t v186 = vaddq_s16(v186_tmp, v185); 224 int16x8_t v187 = vld1q_s16(in + in_stride * 74 + i); 225 int16x8_t v188 = vld1q_s16(in + in_stride * 70 + i); 226 int16x8_t v189 = vaddq_s16(v187, v188); 227 int16x8_t v190 = vld1q_s16(in + in_stride * 58 + i); 228 int16x8_t v191 = vld1q_s16(in + in_stride * 54 + i); 229 int16x8_t v192 = vaddq_s16(v190, v191); 230 int16x8_t v193 = vaddq_s16(v189, v192); 231 int16x8_t v194 = vaddq_s16(v186, v193); 232 int16x8_t v195 = vld1q_s16(in + in_stride * 42 + i); 233 int16x8_t v196 = vld1q_s16(in + in_stride * 38 + i); 234 int16x8_t v197 = vaddq_s16(v195, v196); 235 int16x8_t v198 = vld1q_s16(in + in_stride * 26 + i); 236 int16x8_t v199 = vld1q_s16(in + in_stride * 22 + i); 237 int16x8_t v200 = vaddq_s16(v198, v199); 238 int16x8_t v201 = vaddq_s16(v197, v200); 239 int16x8_t v202_tmp = vqrdmulhq_n_s16(v201, 13573); 240 int16x8_t v202 = vaddq_s16(v202_tmp, v201); 241 int16x8_t v203 = vld1q_s16(in + in_stride * 106 + i); 242 int16x8_t v204 = vld1q_s16(in + in_stride * 102 + i); 243 int16x8_t v205 = vaddq_s16(v203, v204); 244 int16x8_t v206 = vld1q_s16(in + in_stride * 90 + i); 245 int16x8_t v207 = vld1q_s16(in + in_stride * 86 + i); 246 int16x8_t v208 = vaddq_s16(v206, v207); 247 int16x8_t v209 = vaddq_s16(v205, v208); 248 int16x8_t v210 = vaddq_s16(v209, v201); 249 int16x8_t v211 = vaddq_s16(v202, v210); 250 int16x8_t v212 = vqrdmulhq_n_s16(v211, 17734); 251 int16x8_t v213 = vaddq_s16(v194, v212); 252 int16x8_t v214 = vaddq_s16(v200, v185); 253 int16x8_t v215_tmp = vqrdmulhq_n_s16(v214, 13573); 254 int16x8_t v215 = vaddq_s16(v215_tmp, v214); 255 int16x8_t v216 = vaddq_s16(v208, v189); 256 int16x8_t v217 = vaddq_s16(v192, v197); 257 int16x8_t v218 = vaddq_s16(v216, v217); 258 int16x8_t v219 = vaddq_s16(v215, v218); 259 int16x8_t v220 = vaddq_s16(v217, v214); 260 int16x8_t v221_tmp = vqrdmulhq_n_s16(v220, 13573); 261 int16x8_t v221 = vaddq_s16(v221_tmp, v220); 262 int16x8_t v222 = vld1q_s16(in + in_stride * 122 + i); 263 int16x8_t v223 = vld1q_s16(in + in_stride * 118 + i); 264 int16x8_t v224 = vaddq_s16(v222, v223); 265 int16x8_t v225 = vaddq_s16(v224, v205); 266 int16x8_t v226 = vaddq_s16(v225, v216); 267 int16x8_t v227 = vaddq_s16(v226, v220); 268 int16x8_t v228 = vaddq_s16(v221, v227); 269 int16x8_t v229 = vqrdmulhq_n_s16(v228, 17734); 270 int16x8_t v230 = vaddq_s16(v219, v229); 271 int16x8_t v231 = vqrdmulhq_n_s16(v230, 16705); 272 int16x8_t v232 = vaddq_s16(v213, v231); 273 int16x8_t v233 = vqrdmulhq_n_s16(v232, 16463); 274 int16x8_t v234 = vaddq_s16(v182, v233); 275 int16x8_t v235 = vaddq_s16(v184, v142); 276 int16x8_t v236_tmp = vqrdmulhq_n_s16(v235, 13573); 277 int16x8_t v236 = vaddq_s16(v236_tmp, v235); 278 int16x8_t v237 = vaddq_s16(v188, v144); 279 int16x8_t v238 = vaddq_s16(v145, v190); 280 int16x8_t v239 = vaddq_s16(v237, v238); 281 int16x8_t v240 = vaddq_s16(v236, v239); 282 int16x8_t v241 = vaddq_s16(v196, v148); 283 int16x8_t v242 = vaddq_s16(v149, v198); 284 int16x8_t v243 = vaddq_s16(v241, v242); 285 int16x8_t v244_tmp = vqrdmulhq_n_s16(v243, 13573); 286 int16x8_t v244 = vaddq_s16(v244_tmp, v243); 287 int16x8_t v245 = vaddq_s16(v204, v152); 288 int16x8_t v246 = vaddq_s16(v153, v206); 289 int16x8_t v247 = vaddq_s16(v245, v246); 290 int16x8_t v248 = vaddq_s16(v247, v243); 291 int16x8_t v249 = vaddq_s16(v244, v248); 292 int16x8_t v250 = vqrdmulhq_n_s16(v249, 17734); 293 int16x8_t v251 = vaddq_s16(v240, v250); 294 int16x8_t v252 = vaddq_s16(v199, v159); 295 int16x8_t v253 = vaddq_s16(v160, v183); 296 int16x8_t v254 = vaddq_s16(v252, v253); 297 int16x8_t v255_tmp = vqrdmulhq_n_s16(v254, 13573); 298 int16x8_t v255 = vaddq_s16(v255_tmp, v254); 299 int16x8_t v256 = vaddq_s16(v207, v163); 300 int16x8_t v257 = vaddq_s16(v164, v187); 301 int16x8_t v258 = vaddq_s16(v256, v257); 302 int16x8_t v259 = vaddq_s16(v191, v166); 303 int16x8_t v260 = vaddq_s16(v167, v195); 304 int16x8_t v261 = vaddq_s16(v259, v260); 305 int16x8_t v262 = vaddq_s16(v258, v261); 306 int16x8_t v263 = vaddq_s16(v255, v262); 307 int16x8_t v264 = vaddq_s16(v261, v254); 308 int16x8_t v265_tmp = vqrdmulhq_n_s16(v264, 13573); 309 int16x8_t v265 = vaddq_s16(v265_tmp, v264); 310 int16x8_t v266 = vaddq_s16(v223, v173); 311 int16x8_t v267 = vaddq_s16(v174, v203); 312 int16x8_t v268 = vaddq_s16(v266, v267); 313 int16x8_t v269 = vaddq_s16(v268, v258); 314 int16x8_t v270 = vaddq_s16(v269, v264); 315 int16x8_t v271 = vaddq_s16(v265, v270); 316 int16x8_t v272 = vqrdmulhq_n_s16(v271, 17734); 317 int16x8_t v273 = vaddq_s16(v263, v272); 318 int16x8_t v274 = vqrdmulhq_n_s16(v273, 16705); 319 int16x8_t v275 = vaddq_s16(v251, v274); 320 int16x8_t v276 = vaddq_s16(v253, v235); 321 int16x8_t v277_tmp = vqrdmulhq_n_s16(v276, 13573); 322 int16x8_t v277 = vaddq_s16(v277_tmp, v276); 323 int16x8_t v278 = vaddq_s16(v257, v237); 324 int16x8_t v279 = vaddq_s16(v238, v259); 325 int16x8_t v280 = vaddq_s16(v278, v279); 326 int16x8_t v281 = vaddq_s16(v277, v280); 327 int16x8_t v282 = vaddq_s16(v260, v241); 328 int16x8_t v283 = vaddq_s16(v242, v252); 329 int16x8_t v284 = vaddq_s16(v282, v283); 330 int16x8_t v285_tmp = vqrdmulhq_n_s16(v284, 13573); 331 int16x8_t v285 = vaddq_s16(v285_tmp, v284); 332 int16x8_t v286 = vaddq_s16(v267, v245); 333 int16x8_t v287 = vaddq_s16(v246, v256); 334 int16x8_t v288 = vaddq_s16(v286, v287); 335 int16x8_t v289 = vaddq_s16(v288, v284); 336 int16x8_t v290 = vaddq_s16(v285, v289); 337 int16x8_t v291 = vqrdmulhq_n_s16(v290, 17734); 338 int16x8_t v292 = vaddq_s16(v281, v291); 339 int16x8_t v293 = vaddq_s16(v283, v276); 340 int16x8_t v294_tmp = vqrdmulhq_n_s16(v293, 13573); 341 int16x8_t v294 = vaddq_s16(v294_tmp, v293); 342 int16x8_t v295 = vaddq_s16(v287, v278); 343 int16x8_t v296 = vaddq_s16(v279, v282); 344 int16x8_t v297 = vaddq_s16(v295, v296); 345 int16x8_t v298 = vaddq_s16(v294, v297); 346 int16x8_t v299 = vaddq_s16(v296, v293); 347 int16x8_t v300_tmp = vqrdmulhq_n_s16(v299, 13573); 348 int16x8_t v300 = vaddq_s16(v300_tmp, v299); 349 int16x8_t v301 = vld1q_s16(in + in_stride * 126 + i); 350 int16x8_t v302 = vaddq_s16(v301, v222); 351 int16x8_t v303 = vaddq_s16(v302, v266); 352 int16x8_t v304 = vaddq_s16(v303, v286); 353 int16x8_t v305 = vaddq_s16(v304, v295); 354 int16x8_t v306 = vaddq_s16(v305, v299); 355 int16x8_t v307 = vaddq_s16(v300, v306); 356 int16x8_t v308 = vqrdmulhq_n_s16(v307, 17734); 357 int16x8_t v309 = vaddq_s16(v298, v308); 358 int16x8_t v310 = vqrdmulhq_n_s16(v309, 16705); 359 int16x8_t v311 = vaddq_s16(v292, v310); 360 int16x8_t v312 = vqrdmulhq_n_s16(v311, 16463); 361 int16x8_t v313 = vaddq_s16(v275, v312); 362 int16x8_t v314 = vqrdmulhq_n_s16(v313, 16404); 363 int16x8_t v315 = vaddq_s16(v234, v314); 364 int16x8_t v316 = vqrdmulhq_n_s16(v315, 16389); 365 int16x8_t v317 = vaddq_s16(v141, v316); 366 int16x8_t v318 = vld1q_s16(in + in_stride * 1 + i); 367 int16x8_t v319_tmp = vqrdmulhq_n_s16(v318, 13573); 368 int16x8_t v319 = vaddq_s16(v319_tmp, v318); 369 int16x8_t v320 = vld1q_s16(in + in_stride * 65 + i); 370 int16x8_t v321 = vld1q_s16(in + in_stride * 63 + i); 371 int16x8_t v322 = vaddq_s16(v320, v321); 372 int16x8_t v323 = vaddq_s16(v319, v322); 373 int16x8_t v324 = vld1q_s16(in + in_stride * 33 + i); 374 int16x8_t v325 = vld1q_s16(in + in_stride * 31 + i); 375 int16x8_t v326 = vaddq_s16(v324, v325); 376 int16x8_t v327_tmp = vqrdmulhq_n_s16(v326, 13573); 377 int16x8_t v327 = vaddq_s16(v327_tmp, v326); 378 int16x8_t v328 = vld1q_s16(in + in_stride * 97 + i); 379 int16x8_t v329 = vld1q_s16(in + in_stride * 95 + i); 380 int16x8_t v330 = vaddq_s16(v328, v329); 381 int16x8_t v331 = vaddq_s16(v330, v326); 382 int16x8_t v332 = vaddq_s16(v327, v331); 383 int16x8_t v333 = vqrdmulhq_n_s16(v332, 17734); 384 int16x8_t v334 = vaddq_s16(v323, v333); 385 int16x8_t v335 = vld1q_s16(in + in_stride * 17 + i); 386 int16x8_t v336 = vld1q_s16(in + in_stride * 15 + i); 387 int16x8_t v337 = vaddq_s16(v335, v336); 388 int16x8_t v338_tmp = vqrdmulhq_n_s16(v337, 13573); 389 int16x8_t v338 = vaddq_s16(v338_tmp, v337); 390 int16x8_t v339 = vld1q_s16(in + in_stride * 81 + i); 391 int16x8_t v340 = vld1q_s16(in + in_stride * 79 + i); 392 int16x8_t v341 = vaddq_s16(v339, v340); 393 int16x8_t v342 = vld1q_s16(in + in_stride * 49 + i); 394 int16x8_t v343 = vld1q_s16(in + in_stride * 47 + i); 395 int16x8_t v344 = vaddq_s16(v342, v343); 396 int16x8_t v345 = vaddq_s16(v341, v344); 397 int16x8_t v346 = vaddq_s16(v338, v345); 398 int16x8_t v347 = vaddq_s16(v344, v337); 399 int16x8_t v348_tmp = vqrdmulhq_n_s16(v347, 13573); 400 int16x8_t v348 = vaddq_s16(v348_tmp, v347); 401 int16x8_t v349 = vld1q_s16(in + in_stride * 113 + i); 402 int16x8_t v350 = vld1q_s16(in + in_stride * 111 + i); 403 int16x8_t v351 = vaddq_s16(v349, v350); 404 int16x8_t v352 = vaddq_s16(v351, v341); 405 int16x8_t v353 = vaddq_s16(v352, v347); 406 int16x8_t v354 = vaddq_s16(v348, v353); 407 int16x8_t v355 = vqrdmulhq_n_s16(v354, 17734); 408 int16x8_t v356 = vaddq_s16(v346, v355); 409 int16x8_t v357 = vqrdmulhq_n_s16(v356, 16705); 410 int16x8_t v358 = vaddq_s16(v334, v357); 411 int16x8_t v359 = vld1q_s16(in + in_stride * 9 + i); 412 int16x8_t v360 = vld1q_s16(in + in_stride * 7 + i); 413 int16x8_t v361 = vaddq_s16(v359, v360); 414 int16x8_t v362_tmp = vqrdmulhq_n_s16(v361, 13573); 415 int16x8_t v362 = vaddq_s16(v362_tmp, v361); 416 int16x8_t v363 = vld1q_s16(in + in_stride * 73 + i); 417 int16x8_t v364 = vld1q_s16(in + in_stride * 71 + i); 418 int16x8_t v365 = vaddq_s16(v363, v364); 419 int16x8_t v366 = vld1q_s16(in + in_stride * 57 + i); 420 int16x8_t v367 = vld1q_s16(in + in_stride * 55 + i); 421 int16x8_t v368 = vaddq_s16(v366, v367); 422 int16x8_t v369 = vaddq_s16(v365, v368); 423 int16x8_t v370 = vaddq_s16(v362, v369); 424 int16x8_t v371 = vld1q_s16(in + in_stride * 41 + i); 425 int16x8_t v372 = vld1q_s16(in + in_stride * 39 + i); 426 int16x8_t v373 = vaddq_s16(v371, v372); 427 int16x8_t v374 = vld1q_s16(in + in_stride * 25 + i); 428 int16x8_t v375 = vld1q_s16(in + in_stride * 23 + i); 429 int16x8_t v376 = vaddq_s16(v374, v375); 430 int16x8_t v377 = vaddq_s16(v373, v376); 431 int16x8_t v378_tmp = vqrdmulhq_n_s16(v377, 13573); 432 int16x8_t v378 = vaddq_s16(v378_tmp, v377); 433 int16x8_t v379 = vld1q_s16(in + in_stride * 105 + i); 434 int16x8_t v380 = vld1q_s16(in + in_stride * 103 + i); 435 int16x8_t v381 = vaddq_s16(v379, v380); 436 int16x8_t v382 = vld1q_s16(in + in_stride * 89 + i); 437 int16x8_t v383 = vld1q_s16(in + in_stride * 87 + i); 438 int16x8_t v384 = vaddq_s16(v382, v383); 439 int16x8_t v385 = vaddq_s16(v381, v384); 440 int16x8_t v386 = vaddq_s16(v385, v377); 441 int16x8_t v387 = vaddq_s16(v378, v386); 442 int16x8_t v388 = vqrdmulhq_n_s16(v387, 17734); 443 int16x8_t v389 = vaddq_s16(v370, v388); 444 int16x8_t v390 = vaddq_s16(v376, v361); 445 int16x8_t v391_tmp = vqrdmulhq_n_s16(v390, 13573); 446 int16x8_t v391 = vaddq_s16(v391_tmp, v390); 447 int16x8_t v392 = vaddq_s16(v384, v365); 448 int16x8_t v393 = vaddq_s16(v368, v373); 449 int16x8_t v394 = vaddq_s16(v392, v393); 450 int16x8_t v395 = vaddq_s16(v391, v394); 451 int16x8_t v396 = vaddq_s16(v393, v390); 452 int16x8_t v397_tmp = vqrdmulhq_n_s16(v396, 13573); 453 int16x8_t v397 = vaddq_s16(v397_tmp, v396); 454 int16x8_t v398 = vld1q_s16(in + in_stride * 121 + i); 455 int16x8_t v399 = vld1q_s16(in + in_stride * 119 + i); 456 int16x8_t v400 = vaddq_s16(v398, v399); 457 int16x8_t v401 = vaddq_s16(v400, v381); 458 int16x8_t v402 = vaddq_s16(v401, v392); 459 int16x8_t v403 = vaddq_s16(v402, v396); 460 int16x8_t v404 = vaddq_s16(v397, v403); 461 int16x8_t v405 = vqrdmulhq_n_s16(v404, 17734); 462 int16x8_t v406 = vaddq_s16(v395, v405); 463 int16x8_t v407 = vqrdmulhq_n_s16(v406, 16705); 464 int16x8_t v408 = vaddq_s16(v389, v407); 465 int16x8_t v409 = vqrdmulhq_n_s16(v408, 16463); 466 int16x8_t v410 = vaddq_s16(v358, v409); 467 int16x8_t v411 = vld1q_s16(in + in_stride * 5 + i); 468 int16x8_t v412 = vld1q_s16(in + in_stride * 3 + i); 469 int16x8_t v413 = vaddq_s16(v411, v412); 470 int16x8_t v414_tmp = vqrdmulhq_n_s16(v413, 13573); 471 int16x8_t v414 = vaddq_s16(v414_tmp, v413); 472 int16x8_t v415 = vld1q_s16(in + in_stride * 69 + i); 473 int16x8_t v416 = vld1q_s16(in + in_stride * 67 + i); 474 int16x8_t v417 = vaddq_s16(v415, v416); 475 int16x8_t v418 = vld1q_s16(in + in_stride * 61 + i); 476 int16x8_t v419 = vld1q_s16(in + in_stride * 59 + i); 477 int16x8_t v420 = vaddq_s16(v418, v419); 478 int16x8_t v421 = vaddq_s16(v417, v420); 479 int16x8_t v422 = vaddq_s16(v414, v421); 480 int16x8_t v423 = vld1q_s16(in + in_stride * 37 + i); 481 int16x8_t v424 = vld1q_s16(in + in_stride * 35 + i); 482 int16x8_t v425 = vaddq_s16(v423, v424); 483 int16x8_t v426 = vld1q_s16(in + in_stride * 29 + i); 484 int16x8_t v427 = vld1q_s16(in + in_stride * 27 + i); 485 int16x8_t v428 = vaddq_s16(v426, v427); 486 int16x8_t v429 = vaddq_s16(v425, v428); 487 int16x8_t v430_tmp = vqrdmulhq_n_s16(v429, 13573); 488 int16x8_t v430 = vaddq_s16(v430_tmp, v429); 489 int16x8_t v431 = vld1q_s16(in + in_stride * 101 + i); 490 int16x8_t v432 = vld1q_s16(in + in_stride * 99 + i); 491 int16x8_t v433 = vaddq_s16(v431, v432); 492 int16x8_t v434 = vld1q_s16(in + in_stride * 93 + i); 493 int16x8_t v435 = vld1q_s16(in + in_stride * 91 + i); 494 int16x8_t v436 = vaddq_s16(v434, v435); 495 int16x8_t v437 = vaddq_s16(v433, v436); 496 int16x8_t v438 = vaddq_s16(v437, v429); 497 int16x8_t v439 = vaddq_s16(v430, v438); 498 int16x8_t v440 = vqrdmulhq_n_s16(v439, 17734); 499 int16x8_t v441 = vaddq_s16(v422, v440); 500 int16x8_t v442 = vld1q_s16(in + in_stride * 21 + i); 501 int16x8_t v443 = vld1q_s16(in + in_stride * 19 + i); 502 int16x8_t v444 = vaddq_s16(v442, v443); 503 int16x8_t v445 = vld1q_s16(in + in_stride * 13 + i); 504 int16x8_t v446 = vld1q_s16(in + in_stride * 11 + i); 505 int16x8_t v447 = vaddq_s16(v445, v446); 506 int16x8_t v448 = vaddq_s16(v444, v447); 507 int16x8_t v449_tmp = vqrdmulhq_n_s16(v448, 13573); 508 int16x8_t v449 = vaddq_s16(v449_tmp, v448); 509 int16x8_t v450 = vld1q_s16(in + in_stride * 85 + i); 510 int16x8_t v451 = vld1q_s16(in + in_stride * 83 + i); 511 int16x8_t v452 = vaddq_s16(v450, v451); 512 int16x8_t v453 = vld1q_s16(in + in_stride * 77 + i); 513 int16x8_t v454 = vld1q_s16(in + in_stride * 75 + i); 514 int16x8_t v455 = vaddq_s16(v453, v454); 515 int16x8_t v456 = vaddq_s16(v452, v455); 516 int16x8_t v457 = vld1q_s16(in + in_stride * 53 + i); 517 int16x8_t v458 = vld1q_s16(in + in_stride * 51 + i); 518 int16x8_t v459 = vaddq_s16(v457, v458); 519 int16x8_t v460 = vld1q_s16(in + in_stride * 45 + i); 520 int16x8_t v461 = vld1q_s16(in + in_stride * 43 + i); 521 int16x8_t v462 = vaddq_s16(v460, v461); 522 int16x8_t v463 = vaddq_s16(v459, v462); 523 int16x8_t v464 = vaddq_s16(v456, v463); 524 int16x8_t v465 = vaddq_s16(v449, v464); 525 int16x8_t v466 = vaddq_s16(v463, v448); 526 int16x8_t v467_tmp = vqrdmulhq_n_s16(v466, 13573); 527 int16x8_t v467 = vaddq_s16(v467_tmp, v466); 528 int16x8_t v468 = vld1q_s16(in + in_stride * 117 + i); 529 int16x8_t v469 = vld1q_s16(in + in_stride * 115 + i); 530 int16x8_t v470 = vaddq_s16(v468, v469); 531 int16x8_t v471 = vld1q_s16(in + in_stride * 109 + i); 532 int16x8_t v472 = vld1q_s16(in + in_stride * 107 + i); 533 int16x8_t v473 = vaddq_s16(v471, v472); 534 int16x8_t v474 = vaddq_s16(v470, v473); 535 int16x8_t v475 = vaddq_s16(v474, v456); 536 int16x8_t v476 = vaddq_s16(v475, v466); 537 int16x8_t v477 = vaddq_s16(v467, v476); 538 int16x8_t v478 = vqrdmulhq_n_s16(v477, 17734); 539 int16x8_t v479 = vaddq_s16(v465, v478); 540 int16x8_t v480 = vqrdmulhq_n_s16(v479, 16705); 541 int16x8_t v481 = vaddq_s16(v441, v480); 542 int16x8_t v482 = vaddq_s16(v447, v413); 543 int16x8_t v483_tmp = vqrdmulhq_n_s16(v482, 13573); 544 int16x8_t v483 = vaddq_s16(v483_tmp, v482); 545 int16x8_t v484 = vaddq_s16(v455, v417); 546 int16x8_t v485 = vaddq_s16(v420, v459); 547 int16x8_t v486 = vaddq_s16(v484, v485); 548 int16x8_t v487 = vaddq_s16(v483, v486); 549 int16x8_t v488 = vaddq_s16(v462, v425); 550 int16x8_t v489 = vaddq_s16(v428, v444); 551 int16x8_t v490 = vaddq_s16(v488, v489); 552 int16x8_t v491_tmp = vqrdmulhq_n_s16(v490, 13573); 553 int16x8_t v491 = vaddq_s16(v491_tmp, v490); 554 int16x8_t v492 = vaddq_s16(v473, v433); 555 int16x8_t v493 = vaddq_s16(v436, v452); 556 int16x8_t v494 = vaddq_s16(v492, v493); 557 int16x8_t v495 = vaddq_s16(v494, v490); 558 int16x8_t v496 = vaddq_s16(v491, v495); 559 int16x8_t v497 = vqrdmulhq_n_s16(v496, 17734); 560 int16x8_t v498 = vaddq_s16(v487, v497); 561 int16x8_t v499 = vaddq_s16(v489, v482); 562 int16x8_t v500_tmp = vqrdmulhq_n_s16(v499, 13573); 563 int16x8_t v500 = vaddq_s16(v500_tmp, v499); 564 int16x8_t v501 = vaddq_s16(v493, v484); 565 int16x8_t v502 = vaddq_s16(v485, v488); 566 int16x8_t v503 = vaddq_s16(v501, v502); 567 int16x8_t v504 = vaddq_s16(v500, v503); 568 int16x8_t v505 = vaddq_s16(v502, v499); 569 int16x8_t v506_tmp = vqrdmulhq_n_s16(v505, 13573); 570 int16x8_t v506 = vaddq_s16(v506_tmp, v505); 571 int16x8_t v507 = vld1q_s16(in + in_stride * 125 + i); 572 int16x8_t v508 = vld1q_s16(in + in_stride * 123 + i); 573 int16x8_t v509 = vaddq_s16(v507, v508); 574 int16x8_t v510 = vaddq_s16(v509, v470); 575 int16x8_t v511 = vaddq_s16(v510, v492); 576 int16x8_t v512 = vaddq_s16(v511, v501); 577 int16x8_t v513 = vaddq_s16(v512, v505); 578 int16x8_t v514 = vaddq_s16(v506, v513); 579 int16x8_t v515 = vqrdmulhq_n_s16(v514, 17734); 580 int16x8_t v516 = vaddq_s16(v504, v515); 581 int16x8_t v517 = vqrdmulhq_n_s16(v516, 16705); 582 int16x8_t v518 = vaddq_s16(v498, v517); 583 int16x8_t v519 = vqrdmulhq_n_s16(v518, 16463); 584 int16x8_t v520 = vaddq_s16(v481, v519); 585 int16x8_t v521 = vqrdmulhq_n_s16(v520, 16404); 586 int16x8_t v522 = vaddq_s16(v410, v521); 587 int16x8_t v523 = vaddq_s16(v412, v318); 588 int16x8_t v524_tmp = vqrdmulhq_n_s16(v523, 13573); 589 int16x8_t v524 = vaddq_s16(v524_tmp, v523); 590 int16x8_t v525 = vaddq_s16(v416, v320); 591 int16x8_t v526 = vaddq_s16(v321, v418); 592 int16x8_t v527 = vaddq_s16(v525, v526); 593 int16x8_t v528 = vaddq_s16(v524, v527); 594 int16x8_t v529 = vaddq_s16(v424, v324); 595 int16x8_t v530 = vaddq_s16(v325, v426); 596 int16x8_t v531 = vaddq_s16(v529, v530); 597 int16x8_t v532_tmp = vqrdmulhq_n_s16(v531, 13573); 598 int16x8_t v532 = vaddq_s16(v532_tmp, v531); 599 int16x8_t v533 = vaddq_s16(v432, v328); 600 int16x8_t v534 = vaddq_s16(v329, v434); 601 int16x8_t v535 = vaddq_s16(v533, v534); 602 int16x8_t v536 = vaddq_s16(v535, v531); 603 int16x8_t v537 = vaddq_s16(v532, v536); 604 int16x8_t v538 = vqrdmulhq_n_s16(v537, 17734); 605 int16x8_t v539 = vaddq_s16(v528, v538); 606 int16x8_t v540 = vaddq_s16(v443, v335); 607 int16x8_t v541 = vaddq_s16(v336, v445); 608 int16x8_t v542 = vaddq_s16(v540, v541); 609 int16x8_t v543_tmp = vqrdmulhq_n_s16(v542, 13573); 610 int16x8_t v543 = vaddq_s16(v543_tmp, v542); 611 int16x8_t v544 = vaddq_s16(v451, v339); 612 int16x8_t v545 = vaddq_s16(v340, v453); 613 int16x8_t v546 = vaddq_s16(v544, v545); 614 int16x8_t v547 = vaddq_s16(v458, v342); 615 int16x8_t v548 = vaddq_s16(v343, v460); 616 int16x8_t v549 = vaddq_s16(v547, v548); 617 int16x8_t v550 = vaddq_s16(v546, v549); 618 int16x8_t v551 = vaddq_s16(v543, v550); 619 int16x8_t v552 = vaddq_s16(v549, v542); 620 int16x8_t v553_tmp = vqrdmulhq_n_s16(v552, 13573); 621 int16x8_t v553 = vaddq_s16(v553_tmp, v552); 622 int16x8_t v554 = vaddq_s16(v469, v349); 623 int16x8_t v555 = vaddq_s16(v350, v471); 624 int16x8_t v556 = vaddq_s16(v554, v555); 625 int16x8_t v557 = vaddq_s16(v556, v546); 626 int16x8_t v558 = vaddq_s16(v557, v552); 627 int16x8_t v559 = vaddq_s16(v553, v558); 628 int16x8_t v560 = vqrdmulhq_n_s16(v559, 17734); 629 int16x8_t v561 = vaddq_s16(v551, v560); 630 int16x8_t v562 = vqrdmulhq_n_s16(v561, 16705); 631 int16x8_t v563 = vaddq_s16(v539, v562); 632 int16x8_t v564 = vaddq_s16(v446, v359); 633 int16x8_t v565 = vaddq_s16(v360, v411); 634 int16x8_t v566 = vaddq_s16(v564, v565); 635 int16x8_t v567_tmp = vqrdmulhq_n_s16(v566, 13573); 636 int16x8_t v567 = vaddq_s16(v567_tmp, v566); 637 int16x8_t v568 = vaddq_s16(v454, v363); 638 int16x8_t v569 = vaddq_s16(v364, v415); 639 int16x8_t v570 = vaddq_s16(v568, v569); 640 int16x8_t v571 = vaddq_s16(v419, v366); 641 int16x8_t v572 = vaddq_s16(v367, v457); 642 int16x8_t v573 = vaddq_s16(v571, v572); 643 int16x8_t v574 = vaddq_s16(v570, v573); 644 int16x8_t v575 = vaddq_s16(v567, v574); 645 int16x8_t v576 = vaddq_s16(v461, v371); 646 int16x8_t v577 = vaddq_s16(v372, v423); 647 int16x8_t v578 = vaddq_s16(v576, v577); 648 int16x8_t v579 = vaddq_s16(v427, v374); 649 int16x8_t v580 = vaddq_s16(v375, v442); 650 int16x8_t v581 = vaddq_s16(v579, v580); 651 int16x8_t v582 = vaddq_s16(v578, v581); 652 int16x8_t v583_tmp = vqrdmulhq_n_s16(v582, 13573); 653 int16x8_t v583 = vaddq_s16(v583_tmp, v582); 654 int16x8_t v584 = vaddq_s16(v472, v379); 655 int16x8_t v585 = vaddq_s16(v380, v431); 656 int16x8_t v586 = vaddq_s16(v584, v585); 657 int16x8_t v587 = vaddq_s16(v435, v382); 658 int16x8_t v588 = vaddq_s16(v383, v450); 659 int16x8_t v589 = vaddq_s16(v587, v588); 660 int16x8_t v590 = vaddq_s16(v586, v589); 661 int16x8_t v591 = vaddq_s16(v590, v582); 662 int16x8_t v592 = vaddq_s16(v583, v591); 663 int16x8_t v593 = vqrdmulhq_n_s16(v592, 17734); 664 int16x8_t v594 = vaddq_s16(v575, v593); 665 int16x8_t v595 = vaddq_s16(v581, v566); 666 int16x8_t v596_tmp = vqrdmulhq_n_s16(v595, 13573); 667 int16x8_t v596 = vaddq_s16(v596_tmp, v595); 668 int16x8_t v597 = vaddq_s16(v589, v570); 669 int16x8_t v598 = vaddq_s16(v573, v578); 670 int16x8_t v599 = vaddq_s16(v597, v598); 671 int16x8_t v600 = vaddq_s16(v596, v599); 672 int16x8_t v601 = vaddq_s16(v598, v595); 673 int16x8_t v602_tmp = vqrdmulhq_n_s16(v601, 13573); 674 int16x8_t v602 = vaddq_s16(v602_tmp, v601); 675 int16x8_t v603 = vaddq_s16(v508, v398); 676 int16x8_t v604 = vaddq_s16(v399, v468); 677 int16x8_t v605 = vaddq_s16(v603, v604); 678 int16x8_t v606 = vaddq_s16(v605, v586); 679 int16x8_t v607 = vaddq_s16(v606, v597); 680 int16x8_t v608 = vaddq_s16(v607, v601); 681 int16x8_t v609 = vaddq_s16(v602, v608); 682 int16x8_t v610 = vqrdmulhq_n_s16(v609, 17734); 683 int16x8_t v611 = vaddq_s16(v600, v610); 684 int16x8_t v612 = vqrdmulhq_n_s16(v611, 16705); 685 int16x8_t v613 = vaddq_s16(v594, v612); 686 int16x8_t v614 = vqrdmulhq_n_s16(v613, 16463); 687 int16x8_t v615 = vaddq_s16(v563, v614); 688 int16x8_t v616 = vaddq_s16(v565, v523); 689 int16x8_t v617_tmp = vqrdmulhq_n_s16(v616, 13573); 690 int16x8_t v617 = vaddq_s16(v617_tmp, v616); 691 int16x8_t v618 = vaddq_s16(v569, v525); 692 int16x8_t v619 = vaddq_s16(v526, v571); 693 int16x8_t v620 = vaddq_s16(v618, v619); 694 int16x8_t v621 = vaddq_s16(v617, v620); 695 int16x8_t v622 = vaddq_s16(v577, v529); 696 int16x8_t v623 = vaddq_s16(v530, v579); 697 int16x8_t v624 = vaddq_s16(v622, v623); 698 int16x8_t v625_tmp = vqrdmulhq_n_s16(v624, 13573); 699 int16x8_t v625 = vaddq_s16(v625_tmp, v624); 700 int16x8_t v626 = vaddq_s16(v585, v533); 701 int16x8_t v627 = vaddq_s16(v534, v587); 702 int16x8_t v628 = vaddq_s16(v626, v627); 703 int16x8_t v629 = vaddq_s16(v628, v624); 704 int16x8_t v630 = vaddq_s16(v625, v629); 705 int16x8_t v631 = vqrdmulhq_n_s16(v630, 17734); 706 int16x8_t v632 = vaddq_s16(v621, v631); 707 int16x8_t v633 = vaddq_s16(v580, v540); 708 int16x8_t v634 = vaddq_s16(v541, v564); 709 int16x8_t v635 = vaddq_s16(v633, v634); 710 int16x8_t v636_tmp = vqrdmulhq_n_s16(v635, 13573); 711 int16x8_t v636 = vaddq_s16(v636_tmp, v635); 712 int16x8_t v637 = vaddq_s16(v588, v544); 713 int16x8_t v638 = vaddq_s16(v545, v568); 714 int16x8_t v639 = vaddq_s16(v637, v638); 715 int16x8_t v640 = vaddq_s16(v572, v547); 716 int16x8_t v641 = vaddq_s16(v548, v576); 717 int16x8_t v642 = vaddq_s16(v640, v641); 718 int16x8_t v643 = vaddq_s16(v639, v642); 719 int16x8_t v644 = vaddq_s16(v636, v643); 720 int16x8_t v645 = vaddq_s16(v642, v635); 721 int16x8_t v646_tmp = vqrdmulhq_n_s16(v645, 13573); 722 int16x8_t v646 = vaddq_s16(v646_tmp, v645); 723 int16x8_t v647 = vaddq_s16(v604, v554); 724 int16x8_t v648 = vaddq_s16(v555, v584); 725 int16x8_t v649 = vaddq_s16(v647, v648); 726 int16x8_t v650 = vaddq_s16(v649, v639); 727 int16x8_t v651 = vaddq_s16(v650, v645); 728 int16x8_t v652 = vaddq_s16(v646, v651); 729 int16x8_t v653 = vqrdmulhq_n_s16(v652, 17734); 730 int16x8_t v654 = vaddq_s16(v644, v653); 731 int16x8_t v655 = vqrdmulhq_n_s16(v654, 16705); 732 int16x8_t v656 = vaddq_s16(v632, v655); 733 int16x8_t v657 = vaddq_s16(v634, v616); 734 int16x8_t v658_tmp = vqrdmulhq_n_s16(v657, 13573); 735 int16x8_t v658 = vaddq_s16(v658_tmp, v657); 736 int16x8_t v659 = vaddq_s16(v638, v618); 737 int16x8_t v660 = vaddq_s16(v619, v640); 738 int16x8_t v661 = vaddq_s16(v659, v660); 739 int16x8_t v662 = vaddq_s16(v658, v661); 740 int16x8_t v663 = vaddq_s16(v641, v622); 741 int16x8_t v664 = vaddq_s16(v623, v633); 742 int16x8_t v665 = vaddq_s16(v663, v664); 743 int16x8_t v666_tmp = vqrdmulhq_n_s16(v665, 13573); 744 int16x8_t v666 = vaddq_s16(v666_tmp, v665); 745 int16x8_t v667 = vaddq_s16(v648, v626); 746 int16x8_t v668 = vaddq_s16(v627, v637); 747 int16x8_t v669 = vaddq_s16(v667, v668); 748 int16x8_t v670 = vaddq_s16(v669, v665); 749 int16x8_t v671 = vaddq_s16(v666, v670); 750 int16x8_t v672 = vqrdmulhq_n_s16(v671, 17734); 751 int16x8_t v673 = vaddq_s16(v662, v672); 752 int16x8_t v674 = vaddq_s16(v664, v657); 753 int16x8_t v675_tmp = vqrdmulhq_n_s16(v674, 13573); 754 int16x8_t v675 = vaddq_s16(v675_tmp, v674); 755 int16x8_t v676 = vaddq_s16(v668, v659); 756 int16x8_t v677 = vaddq_s16(v660, v663); 757 int16x8_t v678 = vaddq_s16(v676, v677); 758 int16x8_t v679 = vaddq_s16(v675, v678); 759 int16x8_t v680 = vaddq_s16(v677, v674); 760 int16x8_t v681_tmp = vqrdmulhq_n_s16(v680, 13573); 761 int16x8_t v681 = vaddq_s16(v681_tmp, v680); 762 int16x8_t v682 = vld1q_s16(in + in_stride * 127 + i); 763 int16x8_t v683 = vaddq_s16(v682, v507); 764 int16x8_t v684 = vaddq_s16(v683, v603); 765 int16x8_t v685 = vaddq_s16(v684, v647); 766 int16x8_t v686 = vaddq_s16(v685, v667); 767 int16x8_t v687 = vaddq_s16(v686, v676); 768 int16x8_t v688 = vaddq_s16(v687, v680); 769 int16x8_t v689 = vaddq_s16(v681, v688); 770 int16x8_t v690 = vqrdmulhq_n_s16(v689, 17734); 771 int16x8_t v691 = vaddq_s16(v679, v690); 772 int16x8_t v692 = vqrdmulhq_n_s16(v691, 16705); 773 int16x8_t v693 = vaddq_s16(v673, v692); 774 int16x8_t v694 = vqrdmulhq_n_s16(v693, 16463); 775 int16x8_t v695 = vaddq_s16(v656, v694); 776 int16x8_t v696 = vqrdmulhq_n_s16(v695, 16404); 777 int16x8_t v697 = vaddq_s16(v615, v696); 778 int16x8_t v698 = vqrdmulhq_n_s16(v697, 16389); 779 int16x8_t v699 = vaddq_s16(v522, v698); 780 int16x8_t v700 = vqrdmulhq_n_s16(v699, 16385); 781 int16x8_t v701 = vaddq_s16(v317, v700); 782 int16x8_t v702 = vsubq_s16(v0, v1); 783 int16x8_t v703 = vsubq_s16(v4, v6); 784 int16x8_t v704_tmp = vqrdmulhq_n_s16(v703, 10045); 785 int16x8_t v704 = vaddq_s16(v704_tmp, v703); 786 int16x8_t v705 = vaddq_s16(v702, v704); 787 int16x8_t v706 = vsubq_s16(v11, v14); 788 int16x8_t v707 = vsubq_s16(v17, v20); 789 int16x8_t v708_tmp = vqrdmulhq_n_s16(v707, 10045); 790 int16x8_t v708 = vaddq_s16(v708_tmp, v707); 791 int16x8_t v709 = vaddq_s16(v706, v708); 792 int16x8_t v710 = vqrdmulhq_n_s16(v709, 19705); 793 int16x8_t v711 = vaddq_s16(v705, v710); 794 int16x8_t v712 = vsubq_s16(v27, v30); 795 int16x8_t v713 = vsubq_s16(v35, v39); 796 int16x8_t v714_tmp = vqrdmulhq_n_s16(v713, 10045); 797 int16x8_t v714 = vaddq_s16(v714_tmp, v713); 798 int16x8_t v715 = vaddq_s16(v712, v714); 799 int16x8_t v716 = vsubq_s16(v44, v47); 800 int16x8_t v717 = vsubq_s16(v50, v54); 801 int16x8_t v718_tmp = vqrdmulhq_n_s16(v717, 10045); 802 int16x8_t v718 = vaddq_s16(v718_tmp, v717); 803 int16x8_t v719 = vaddq_s16(v716, v718); 804 int16x8_t v720 = vqrdmulhq_n_s16(v719, 19705); 805 int16x8_t v721 = vaddq_s16(v715, v720); 806 int16x8_t v722 = vqrdmulhq_n_s16(v721, 17121); 807 int16x8_t v723 = vaddq_s16(v711, v722); 808 int16x8_t v724 = vsubq_s16(v63, v66); 809 int16x8_t v725 = vsubq_s16(v71, v75); 810 int16x8_t v726_tmp = vqrdmulhq_n_s16(v725, 10045); 811 int16x8_t v726 = vaddq_s16(v726_tmp, v725); 812 int16x8_t v727 = vaddq_s16(v724, v726); 813 int16x8_t v728 = vsubq_s16(v82, v89); 814 int16x8_t v729 = vsubq_s16(v92, v97); 815 int16x8_t v730_tmp = vqrdmulhq_n_s16(v729, 10045); 816 int16x8_t v730 = vaddq_s16(v730_tmp, v729); 817 int16x8_t v731 = vaddq_s16(v728, v730); 818 int16x8_t v732 = vqrdmulhq_n_s16(v731, 19705); 819 int16x8_t v733 = vaddq_s16(v727, v732); 820 int16x8_t v734 = vsubq_s16(v104, v107); 821 int16x8_t v735 = vsubq_s16(v112, v116); 822 int16x8_t v736_tmp = vqrdmulhq_n_s16(v735, 10045); 823 int16x8_t v736 = vaddq_s16(v736_tmp, v735); 824 int16x8_t v737 = vaddq_s16(v734, v736); 825 int16x8_t v738 = vsubq_s16(v121, v124); 826 int16x8_t v739 = vsubq_s16(v127, v132); 827 int16x8_t v740_tmp = vqrdmulhq_n_s16(v739, 10045); 828 int16x8_t v740 = vaddq_s16(v740_tmp, v739); 829 int16x8_t v741 = vaddq_s16(v738, v740); 830 int16x8_t v742 = vqrdmulhq_n_s16(v741, 19705); 831 int16x8_t v743 = vaddq_s16(v737, v742); 832 int16x8_t v744 = vqrdmulhq_n_s16(v743, 17121); 833 int16x8_t v745 = vaddq_s16(v733, v744); 834 int16x8_t v746 = vqrdmulhq_n_s16(v745, 16563); 835 int16x8_t v747 = vaddq_s16(v723, v746); 836 int16x8_t v748 = vsubq_s16(v143, v146); 837 int16x8_t v749 = vsubq_s16(v151, v155); 838 int16x8_t v750_tmp = vqrdmulhq_n_s16(v749, 10045); 839 int16x8_t v750 = vaddq_s16(v750_tmp, v749); 840 int16x8_t v751 = vaddq_s16(v748, v750); 841 int16x8_t v752 = vsubq_s16(v162, v169); 842 int16x8_t v753 = vqrdmulhq_n_s16(v752, 19705); 843 int16x8_t v754 = vsubq_s16(v172, v177); 844 int16x8_t v755 = vqrdmulhq_n_s16(v754, 25746); 845 int16x8_t v756 = vaddq_s16(v753, v755); 846 int16x8_t v757 = vaddq_s16(v751, v756); 847 int16x8_t v758 = vsubq_s16(v186, v193); 848 int16x8_t v759 = vsubq_s16(v202, v210); 849 int16x8_t v760_tmp = vqrdmulhq_n_s16(v759, 10045); 850 int16x8_t v760 = vaddq_s16(v760_tmp, v759); 851 int16x8_t v761 = vaddq_s16(v758, v760); 852 int16x8_t v762 = vsubq_s16(v215, v218); 853 int16x8_t v763 = vsubq_s16(v221, v227); 854 int16x8_t v764_tmp = vqrdmulhq_n_s16(v763, 10045); 855 int16x8_t v764 = vaddq_s16(v764_tmp, v763); 856 int16x8_t v765 = vaddq_s16(v762, v764); 857 int16x8_t v766 = vqrdmulhq_n_s16(v765, 19705); 858 int16x8_t v767 = vaddq_s16(v761, v766); 859 int16x8_t v768 = vqrdmulhq_n_s16(v767, 17121); 860 int16x8_t v769 = vaddq_s16(v757, v768); 861 int16x8_t v770 = vsubq_s16(v236, v239); 862 int16x8_t v771 = vsubq_s16(v244, v248); 863 int16x8_t v772_tmp = vqrdmulhq_n_s16(v771, 10045); 864 int16x8_t v772 = vaddq_s16(v772_tmp, v771); 865 int16x8_t v773 = vaddq_s16(v770, v772); 866 int16x8_t v774 = vsubq_s16(v255, v262); 867 int16x8_t v775 = vsubq_s16(v265, v270); 868 int16x8_t v776_tmp = vqrdmulhq_n_s16(v775, 10045); 869 int16x8_t v776 = vaddq_s16(v776_tmp, v775); 870 int16x8_t v777 = vaddq_s16(v774, v776); 871 int16x8_t v778 = vqrdmulhq_n_s16(v777, 19705); 872 int16x8_t v779 = vaddq_s16(v773, v778); 873 int16x8_t v780 = vsubq_s16(v277, v280); 874 int16x8_t v781 = vsubq_s16(v285, v289); 875 int16x8_t v782_tmp = vqrdmulhq_n_s16(v781, 10045); 876 int16x8_t v782 = vaddq_s16(v782_tmp, v781); 877 int16x8_t v783 = vaddq_s16(v780, v782); 878 int16x8_t v784 = vsubq_s16(v294, v297); 879 int16x8_t v785 = vsubq_s16(v300, v306); 880 int16x8_t v786_tmp = vqrdmulhq_n_s16(v785, 10045); 881 int16x8_t v786 = vaddq_s16(v786_tmp, v785); 882 int16x8_t v787 = vaddq_s16(v784, v786); 883 int16x8_t v788 = vqrdmulhq_n_s16(v787, 19705); 884 int16x8_t v789 = vaddq_s16(v783, v788); 885 int16x8_t v790 = vqrdmulhq_n_s16(v789, 17121); 886 int16x8_t v791 = vaddq_s16(v779, v790); 887 int16x8_t v792 = vqrdmulhq_n_s16(v791, 16563); 888 int16x8_t v793 = vaddq_s16(v769, v792); 889 int16x8_t v794 = vqrdmulhq_n_s16(v793, 16429); 890 int16x8_t v795 = vaddq_s16(v747, v794); 891 int16x8_t v796 = vsubq_s16(v319, v322); 892 int16x8_t v797 = vsubq_s16(v327, v331); 893 int16x8_t v798_tmp = vqrdmulhq_n_s16(v797, 10045); 894 int16x8_t v798 = vaddq_s16(v798_tmp, v797); 895 int16x8_t v799 = vaddq_s16(v796, v798); 896 int16x8_t v800 = vsubq_s16(v338, v345); 897 int16x8_t v801 = vsubq_s16(v348, v353); 898 int16x8_t v802_tmp = vqrdmulhq_n_s16(v801, 10045); 899 int16x8_t v802 = vaddq_s16(v802_tmp, v801); 900 int16x8_t v803 = vaddq_s16(v800, v802); 901 int16x8_t v804 = vqrdmulhq_n_s16(v803, 19705); 902 int16x8_t v805 = vaddq_s16(v799, v804); 903 int16x8_t v806 = vsubq_s16(v362, v369); 904 int16x8_t v807 = vsubq_s16(v378, v386); 905 int16x8_t v808_tmp = vqrdmulhq_n_s16(v807, 10045); 906 int16x8_t v808 = vaddq_s16(v808_tmp, v807); 907 int16x8_t v809 = vaddq_s16(v806, v808); 908 int16x8_t v810 = vsubq_s16(v391, v394); 909 int16x8_t v811 = vsubq_s16(v397, v403); 910 int16x8_t v812_tmp = vqrdmulhq_n_s16(v811, 10045); 911 int16x8_t v812 = vaddq_s16(v812_tmp, v811); 912 int16x8_t v813 = vaddq_s16(v810, v812); 913 int16x8_t v814 = vqrdmulhq_n_s16(v813, 19705); 914 int16x8_t v815 = vaddq_s16(v809, v814); 915 int16x8_t v816 = vqrdmulhq_n_s16(v815, 17121); 916 int16x8_t v817 = vaddq_s16(v805, v816); 917 int16x8_t v818 = vsubq_s16(v414, v421); 918 int16x8_t v819 = vsubq_s16(v430, v438); 919 int16x8_t v820_tmp = vqrdmulhq_n_s16(v819, 10045); 920 int16x8_t v820 = vaddq_s16(v820_tmp, v819); 921 int16x8_t v821 = vaddq_s16(v818, v820); 922 int16x8_t v822 = vsubq_s16(v449, v464); 923 int16x8_t v823 = vsubq_s16(v467, v476); 924 int16x8_t v824_tmp = vqrdmulhq_n_s16(v823, 10045); 925 int16x8_t v824 = vaddq_s16(v824_tmp, v823); 926 int16x8_t v825 = vaddq_s16(v822, v824); 927 int16x8_t v826 = vqrdmulhq_n_s16(v825, 19705); 928 int16x8_t v827 = vaddq_s16(v821, v826); 929 int16x8_t v828 = vsubq_s16(v483, v486); 930 int16x8_t v829 = vsubq_s16(v491, v495); 931 int16x8_t v830_tmp = vqrdmulhq_n_s16(v829, 10045); 932 int16x8_t v830 = vaddq_s16(v830_tmp, v829); 933 int16x8_t v831 = vaddq_s16(v828, v830); 934 int16x8_t v832 = vsubq_s16(v500, v503); 935 int16x8_t v833 = vsubq_s16(v506, v513); 936 int16x8_t v834_tmp = vqrdmulhq_n_s16(v833, 10045); 937 int16x8_t v834 = vaddq_s16(v834_tmp, v833); 938 int16x8_t v835 = vaddq_s16(v832, v834); 939 int16x8_t v836 = vqrdmulhq_n_s16(v835, 19705); 940 int16x8_t v837 = vaddq_s16(v831, v836); 941 int16x8_t v838 = vqrdmulhq_n_s16(v837, 17121); 942 int16x8_t v839 = vaddq_s16(v827, v838); 943 int16x8_t v840 = vqrdmulhq_n_s16(v839, 16563); 944 int16x8_t v841 = vaddq_s16(v817, v840); 945 int16x8_t v842 = vsubq_s16(v524, v527); 946 int16x8_t v843 = vsubq_s16(v532, v536); 947 int16x8_t v844_tmp = vqrdmulhq_n_s16(v843, 10045); 948 int16x8_t v844 = vaddq_s16(v844_tmp, v843); 949 int16x8_t v845 = vaddq_s16(v842, v844); 950 int16x8_t v846 = vsubq_s16(v543, v550); 951 int16x8_t v847 = vsubq_s16(v553, v558); 952 int16x8_t v848_tmp = vqrdmulhq_n_s16(v847, 10045); 953 int16x8_t v848 = vaddq_s16(v848_tmp, v847); 954 int16x8_t v849 = vaddq_s16(v846, v848); 955 int16x8_t v850 = vqrdmulhq_n_s16(v849, 19705); 956 int16x8_t v851 = vaddq_s16(v845, v850); 957 int16x8_t v852 = vsubq_s16(v567, v574); 958 int16x8_t v853 = vsubq_s16(v583, v591); 959 int16x8_t v854_tmp = vqrdmulhq_n_s16(v853, 10045); 960 int16x8_t v854 = vaddq_s16(v854_tmp, v853); 961 int16x8_t v855 = vaddq_s16(v852, v854); 962 int16x8_t v856 = vsubq_s16(v596, v599); 963 int16x8_t v857 = vsubq_s16(v602, v608); 964 int16x8_t v858_tmp = vqrdmulhq_n_s16(v857, 10045); 965 int16x8_t v858 = vaddq_s16(v858_tmp, v857); 966 int16x8_t v859 = vaddq_s16(v856, v858); 967 int16x8_t v860 = vqrdmulhq_n_s16(v859, 19705); 968 int16x8_t v861 = vaddq_s16(v855, v860); 969 int16x8_t v862 = vqrdmulhq_n_s16(v861, 17121); 970 int16x8_t v863 = vaddq_s16(v851, v862); 971 int16x8_t v864 = vsubq_s16(v617, v620); 972 int16x8_t v865 = vsubq_s16(v625, v629); 973 int16x8_t v866_tmp = vqrdmulhq_n_s16(v865, 10045); 974 int16x8_t v866 = vaddq_s16(v866_tmp, v865); 975 int16x8_t v867 = vaddq_s16(v864, v866); 976 int16x8_t v868 = vsubq_s16(v636, v643); 977 int16x8_t v869 = vsubq_s16(v646, v651); 978 int16x8_t v870_tmp = vqrdmulhq_n_s16(v869, 10045); 979 int16x8_t v870 = vaddq_s16(v870_tmp, v869); 980 int16x8_t v871 = vaddq_s16(v868, v870); 981 int16x8_t v872 = vqrdmulhq_n_s16(v871, 19705); 982 int16x8_t v873 = vaddq_s16(v867, v872); 983 int16x8_t v874 = vsubq_s16(v658, v661); 984 int16x8_t v875 = vsubq_s16(v666, v670); 985 int16x8_t v876_tmp = vqrdmulhq_n_s16(v875, 10045); 986 int16x8_t v876 = vaddq_s16(v876_tmp, v875); 987 int16x8_t v877 = vaddq_s16(v874, v876); 988 int16x8_t v878 = vsubq_s16(v675, v678); 989 int16x8_t v879 = vsubq_s16(v681, v688); 990 int16x8_t v880_tmp = vqrdmulhq_n_s16(v879, 10045); 991 int16x8_t v880 = vaddq_s16(v880_tmp, v879); 992 int16x8_t v881 = vaddq_s16(v878, v880); 993 int16x8_t v882 = vqrdmulhq_n_s16(v881, 19705); 994 int16x8_t v883 = vaddq_s16(v877, v882); 995 int16x8_t v884 = vqrdmulhq_n_s16(v883, 17121); 996 int16x8_t v885 = vaddq_s16(v873, v884); 997 int16x8_t v886 = vqrdmulhq_n_s16(v885, 16563); 998 int16x8_t v887 = vaddq_s16(v863, v886); 999 int16x8_t v888 = vqrdmulhq_n_s16(v887, 16429); 1000 int16x8_t v889 = vaddq_s16(v841, v888); 1001 int16x8_t v890 = vqrdmulhq_n_s16(v889, 16395); 1002 int16x8_t v891 = vaddq_s16(v795, v890); 1003 int16x8_t v892 = vsubq_s16(v702, v704); 1004 int16x8_t v893 = vsubq_s16(v706, v708); 1005 int16x8_t v894 = vqrdmulhq_n_s16(v893, 29490); 1006 int16x8_t v895 = vaddq_s16(v892, v894); 1007 int16x8_t v896 = vsubq_s16(v712, v714); 1008 int16x8_t v897 = vsubq_s16(v716, v718); 1009 int16x8_t v898 = vqrdmulhq_n_s16(v897, 29490); 1010 int16x8_t v899 = vaddq_s16(v896, v898); 1011 int16x8_t v900 = vqrdmulhq_n_s16(v899, 18578); 1012 int16x8_t v901 = vaddq_s16(v895, v900); 1013 int16x8_t v902 = vsubq_s16(v724, v726); 1014 int16x8_t v903 = vsubq_s16(v728, v730); 1015 int16x8_t v904 = vqrdmulhq_n_s16(v903, 29490); 1016 int16x8_t v905 = vaddq_s16(v902, v904); 1017 int16x8_t v906 = vsubq_s16(v734, v736); 1018 int16x8_t v907 = vsubq_s16(v738, v740); 1019 int16x8_t v908 = vqrdmulhq_n_s16(v907, 29490); 1020 int16x8_t v909 = vaddq_s16(v906, v908); 1021 int16x8_t v910 = vqrdmulhq_n_s16(v909, 18578); 1022 int16x8_t v911 = vaddq_s16(v905, v910); 1023 int16x8_t v912 = vqrdmulhq_n_s16(v911, 16890); 1024 int16x8_t v913 = vaddq_s16(v901, v912); 1025 int16x8_t v914 = vsubq_s16(v748, v750); 1026 int16x8_t v915_tmp = vqrdmulhq_n_s16(v754, 10045); 1027 int16x8_t v915 = vaddq_s16(v915_tmp, v754); 1028 int16x8_t v916 = vsubq_s16(v752, v915); 1029 int16x8_t v917 = vqrdmulhq_n_s16(v916, 29490); 1030 int16x8_t v918 = vaddq_s16(v914, v917); 1031 int16x8_t v919 = vsubq_s16(v758, v760); 1032 int16x8_t v920 = vsubq_s16(v762, v764); 1033 int16x8_t v921 = vqrdmulhq_n_s16(v920, 29490); 1034 int16x8_t v922 = vaddq_s16(v919, v921); 1035 int16x8_t v923 = vqrdmulhq_n_s16(v922, 18578); 1036 int16x8_t v924 = vaddq_s16(v918, v923); 1037 int16x8_t v925 = vsubq_s16(v770, v772); 1038 int16x8_t v926 = vsubq_s16(v774, v776); 1039 int16x8_t v927 = vqrdmulhq_n_s16(v926, 29490); 1040 int16x8_t v928 = vaddq_s16(v925, v927); 1041 int16x8_t v929 = vsubq_s16(v780, v782); 1042 int16x8_t v930 = vsubq_s16(v784, v786); 1043 int16x8_t v931 = vqrdmulhq_n_s16(v930, 29490); 1044 int16x8_t v932 = vaddq_s16(v929, v931); 1045 int16x8_t v933 = vqrdmulhq_n_s16(v932, 18578); 1046 int16x8_t v934 = vaddq_s16(v928, v933); 1047 int16x8_t v935 = vqrdmulhq_n_s16(v934, 16890); 1048 int16x8_t v936 = vaddq_s16(v924, v935); 1049 int16x8_t v937 = vqrdmulhq_n_s16(v936, 16508); 1050 int16x8_t v938 = vaddq_s16(v913, v937); 1051 int16x8_t v939 = vsubq_s16(v796, v798); 1052 int16x8_t v940 = vsubq_s16(v800, v802); 1053 int16x8_t v941 = vqrdmulhq_n_s16(v940, 29490); 1054 int16x8_t v942 = vaddq_s16(v939, v941); 1055 int16x8_t v943 = vsubq_s16(v806, v808); 1056 int16x8_t v944 = vsubq_s16(v810, v812); 1057 int16x8_t v945 = vqrdmulhq_n_s16(v944, 29490); 1058 int16x8_t v946 = vaddq_s16(v943, v945); 1059 int16x8_t v947 = vqrdmulhq_n_s16(v946, 18578); 1060 int16x8_t v948 = vaddq_s16(v942, v947); 1061 int16x8_t v949 = vsubq_s16(v818, v820); 1062 int16x8_t v950 = vsubq_s16(v822, v824); 1063 int16x8_t v951 = vqrdmulhq_n_s16(v950, 29490); 1064 int16x8_t v952 = vaddq_s16(v949, v951); 1065 int16x8_t v953 = vsubq_s16(v828, v830); 1066 int16x8_t v954 = vsubq_s16(v832, v834); 1067 int16x8_t v955 = vqrdmulhq_n_s16(v954, 29490); 1068 int16x8_t v956 = vaddq_s16(v953, v955); 1069 int16x8_t v957 = vqrdmulhq_n_s16(v956, 18578); 1070 int16x8_t v958 = vaddq_s16(v952, v957); 1071 int16x8_t v959 = vqrdmulhq_n_s16(v958, 16890); 1072 int16x8_t v960 = vaddq_s16(v948, v959); 1073 int16x8_t v961 = vsubq_s16(v842, v844); 1074 int16x8_t v962 = vsubq_s16(v846, v848); 1075 int16x8_t v963 = vqrdmulhq_n_s16(v962, 29490); 1076 int16x8_t v964 = vaddq_s16(v961, v963); 1077 int16x8_t v965 = vsubq_s16(v852, v854); 1078 int16x8_t v966 = vsubq_s16(v856, v858); 1079 int16x8_t v967 = vqrdmulhq_n_s16(v966, 29490); 1080 int16x8_t v968 = vaddq_s16(v965, v967); 1081 int16x8_t v969 = vqrdmulhq_n_s16(v968, 18578); 1082 int16x8_t v970 = vaddq_s16(v964, v969); 1083 int16x8_t v971 = vsubq_s16(v864, v866); 1084 int16x8_t v972 = vsubq_s16(v868, v870); 1085 int16x8_t v973 = vqrdmulhq_n_s16(v972, 29490); 1086 int16x8_t v974 = vaddq_s16(v971, v973); 1087 int16x8_t v975 = vsubq_s16(v874, v876); 1088 int16x8_t v976 = vsubq_s16(v878, v880); 1089 int16x8_t v977 = vqrdmulhq_n_s16(v976, 29490); 1090 int16x8_t v978 = vaddq_s16(v975, v977); 1091 int16x8_t v979 = vqrdmulhq_n_s16(v978, 18578); 1092 int16x8_t v980 = vaddq_s16(v974, v979); 1093 int16x8_t v981 = vqrdmulhq_n_s16(v980, 16890); 1094 int16x8_t v982 = vaddq_s16(v970, v981); 1095 int16x8_t v983 = vqrdmulhq_n_s16(v982, 16508); 1096 int16x8_t v984 = vaddq_s16(v960, v983); 1097 int16x8_t v985 = vqrdmulhq_n_s16(v984, 16415); 1098 int16x8_t v986 = vaddq_s16(v938, v985); 1099 int16x8_t v987 = vsubq_s16(v2, v8); 1100 int16x8_t v988 = vsubq_s16(v15, v22); 1101 int16x8_t v989_tmp = vqrdmulhq_n_s16(v988, 18446); 1102 int16x8_t v989 = vmlaq_n_s16(v989_tmp, v988, 2); 1103 int16x8_t v990 = vaddq_s16(v987, v989); 1104 int16x8_t v991 = vsubq_s16(v31, v41); 1105 int16x8_t v992 = vsubq_s16(v48, v56); 1106 int16x8_t v993_tmp = vqrdmulhq_n_s16(v992, 18446); 1107 int16x8_t v993 = vmlaq_n_s16(v993_tmp, v992, 2); 1108 int16x8_t v994 = vaddq_s16(v991, v993); 1109 int16x8_t v995 = vqrdmulhq_n_s16(v994, 21195); 1110 int16x8_t v996 = vaddq_s16(v990, v995); 1111 int16x8_t v997 = vsubq_s16(v67, v77); 1112 int16x8_t v998 = vsubq_s16(v90, v99); 1113 int16x8_t v999_tmp = vqrdmulhq_n_s16(v998, 18446); 1114 int16x8_t v999 = vmlaq_n_s16(v999_tmp, v998, 2); 1115 int16x8_t v1000 = vaddq_s16(v997, v999); 1116 int16x8_t v1001 = vsubq_s16(v108, v118); 1117 int16x8_t v1002 = vsubq_s16(v125, v134); 1118 int16x8_t v1003_tmp = vqrdmulhq_n_s16(v1002, 18446); 1119 int16x8_t v1003 = vmlaq_n_s16(v1003_tmp, v1002, 2); 1120 int16x8_t v1004 = vaddq_s16(v1001, v1003); 1121 int16x8_t v1005 = vqrdmulhq_n_s16(v1004, 21195); 1122 int16x8_t v1006 = vaddq_s16(v1000, v1005); 1123 int16x8_t v1007 = vqrdmulhq_n_s16(v1006, 17401); 1124 int16x8_t v1008 = vaddq_s16(v996, v1007); 1125 int16x8_t v1009 = vsubq_s16(v147, v157); 1126 int16x8_t v1010 = vsubq_s16(v170, v179); 1127 int16x8_t v1011_tmp = vqrdmulhq_n_s16(v1010, 18446); 1128 int16x8_t v1011 = vmlaq_n_s16(v1011_tmp, v1010, 2); 1129 int16x8_t v1012 = vaddq_s16(v1009, v1011); 1130 int16x8_t v1013 = vsubq_s16(v194, v212); 1131 int16x8_t v1014 = vsubq_s16(v219, v229); 1132 int16x8_t v1015_tmp = vqrdmulhq_n_s16(v1014, 18446); 1133 int16x8_t v1015 = vmlaq_n_s16(v1015_tmp, v1014, 2); 1134 int16x8_t v1016 = vaddq_s16(v1013, v1015); 1135 int16x8_t v1017 = vqrdmulhq_n_s16(v1016, 21195); 1136 int16x8_t v1018 = vaddq_s16(v1012, v1017); 1137 int16x8_t v1019 = vsubq_s16(v240, v250); 1138 int16x8_t v1020 = vsubq_s16(v263, v272); 1139 int16x8_t v1021_tmp = vqrdmulhq_n_s16(v1020, 18446); 1140 int16x8_t v1021 = vmlaq_n_s16(v1021_tmp, v1020, 2); 1141 int16x8_t v1022 = vaddq_s16(v1019, v1021); 1142 int16x8_t v1023 = vsubq_s16(v281, v291); 1143 int16x8_t v1024 = vsubq_s16(v298, v308); 1144 int16x8_t v1025_tmp = vqrdmulhq_n_s16(v1024, 18446); 1145 int16x8_t v1025 = vmlaq_n_s16(v1025_tmp, v1024, 2); 1146 int16x8_t v1026 = vaddq_s16(v1023, v1025); 1147 int16x8_t v1027 = vqrdmulhq_n_s16(v1026, 21195); 1148 int16x8_t v1028 = vaddq_s16(v1022, v1027); 1149 int16x8_t v1029 = vqrdmulhq_n_s16(v1028, 17401); 1150 int16x8_t v1030 = vaddq_s16(v1018, v1029); 1151 int16x8_t v1031 = vqrdmulhq_n_s16(v1030, 16629); 1152 int16x8_t v1032 = vaddq_s16(v1008, v1031); 1153 int16x8_t v1033 = vsubq_s16(v323, v333); 1154 int16x8_t v1034 = vsubq_s16(v346, v355); 1155 int16x8_t v1035_tmp = vqrdmulhq_n_s16(v1034, 18446); 1156 int16x8_t v1035 = vmlaq_n_s16(v1035_tmp, v1034, 2); 1157 int16x8_t v1036 = vaddq_s16(v1033, v1035); 1158 int16x8_t v1037 = vsubq_s16(v370, v388); 1159 int16x8_t v1038 = vsubq_s16(v395, v405); 1160 int16x8_t v1039_tmp = vqrdmulhq_n_s16(v1038, 18446); 1161 int16x8_t v1039 = vmlaq_n_s16(v1039_tmp, v1038, 2); 1162 int16x8_t v1040 = vaddq_s16(v1037, v1039); 1163 int16x8_t v1041 = vqrdmulhq_n_s16(v1040, 21195); 1164 int16x8_t v1042 = vaddq_s16(v1036, v1041); 1165 int16x8_t v1043 = vsubq_s16(v422, v440); 1166 int16x8_t v1044 = vsubq_s16(v465, v478); 1167 int16x8_t v1045_tmp = vqrdmulhq_n_s16(v1044, 18446); 1168 int16x8_t v1045 = vmlaq_n_s16(v1045_tmp, v1044, 2); 1169 int16x8_t v1046 = vaddq_s16(v1043, v1045); 1170 int16x8_t v1047 = vsubq_s16(v487, v497); 1171 int16x8_t v1048 = vsubq_s16(v504, v515); 1172 int16x8_t v1049_tmp = vqrdmulhq_n_s16(v1048, 18446); 1173 int16x8_t v1049 = vmlaq_n_s16(v1049_tmp, v1048, 2); 1174 int16x8_t v1050 = vaddq_s16(v1047, v1049); 1175 int16x8_t v1051 = vqrdmulhq_n_s16(v1050, 21195); 1176 int16x8_t v1052 = vaddq_s16(v1046, v1051); 1177 int16x8_t v1053 = vqrdmulhq_n_s16(v1052, 17401); 1178 int16x8_t v1054 = vaddq_s16(v1042, v1053); 1179 int16x8_t v1055 = vsubq_s16(v528, v538); 1180 int16x8_t v1056 = vsubq_s16(v551, v560); 1181 int16x8_t v1057_tmp = vqrdmulhq_n_s16(v1056, 18446); 1182 int16x8_t v1057 = vmlaq_n_s16(v1057_tmp, v1056, 2); 1183 int16x8_t v1058 = vaddq_s16(v1055, v1057); 1184 int16x8_t v1059 = vsubq_s16(v575, v593); 1185 int16x8_t v1060 = vsubq_s16(v600, v610); 1186 int16x8_t v1061_tmp = vqrdmulhq_n_s16(v1060, 18446); 1187 int16x8_t v1061 = vmlaq_n_s16(v1061_tmp, v1060, 2); 1188 int16x8_t v1062 = vaddq_s16(v1059, v1061); 1189 int16x8_t v1063 = vqrdmulhq_n_s16(v1062, 21195); 1190 int16x8_t v1064 = vaddq_s16(v1058, v1063); 1191 int16x8_t v1065 = vsubq_s16(v621, v631); 1192 int16x8_t v1066 = vsubq_s16(v644, v653); 1193 int16x8_t v1067_tmp = vqrdmulhq_n_s16(v1066, 18446); 1194 int16x8_t v1067 = vmlaq_n_s16(v1067_tmp, v1066, 2); 1195 int16x8_t v1068 = vaddq_s16(v1065, v1067); 1196 int16x8_t v1069 = vsubq_s16(v662, v672); 1197 int16x8_t v1070 = vsubq_s16(v679, v690); 1198 int16x8_t v1071_tmp = vqrdmulhq_n_s16(v1070, 18446); 1199 int16x8_t v1071 = vmlaq_n_s16(v1071_tmp, v1070, 2); 1200 int16x8_t v1072 = vaddq_s16(v1069, v1071); 1201 int16x8_t v1073 = vqrdmulhq_n_s16(v1072, 21195); 1202 int16x8_t v1074 = vaddq_s16(v1068, v1073); 1203 int16x8_t v1075 = vqrdmulhq_n_s16(v1074, 17401); 1204 int16x8_t v1076 = vaddq_s16(v1064, v1075); 1205 int16x8_t v1077 = vqrdmulhq_n_s16(v1076, 16629); 1206 int16x8_t v1078 = vaddq_s16(v1054, v1077); 1207 int16x8_t v1079 = vqrdmulhq_n_s16(v1078, 16445); 1208 int16x8_t v1080 = vaddq_s16(v1032, v1079); 1209 int16x8_t v1081 = vsubq_s16(v987, v989); 1210 int16x8_t v1082 = vsubq_s16(v991, v993); 1211 int16x8_t v1083 = vqrdmulhq_n_s16(v1082, 25826); 1212 int16x8_t v1084 = vaddq_s16(v1081, v1083); 1213 int16x8_t v1085 = vsubq_s16(v997, v999); 1214 int16x8_t v1086 = vsubq_s16(v1001, v1003); 1215 int16x8_t v1087 = vqrdmulhq_n_s16(v1086, 25826); 1216 int16x8_t v1088 = vaddq_s16(v1085, v1087); 1217 int16x8_t v1089 = vqrdmulhq_n_s16(v1088, 18124); 1218 int16x8_t v1090 = vaddq_s16(v1084, v1089); 1219 int16x8_t v1091 = vsubq_s16(v1009, v1011); 1220 int16x8_t v1092 = vsubq_s16(v1013, v1015); 1221 int16x8_t v1093 = vqrdmulhq_n_s16(v1092, 25826); 1222 int16x8_t v1094 = vaddq_s16(v1091, v1093); 1223 int16x8_t v1095 = vsubq_s16(v1019, v1021); 1224 int16x8_t v1096 = vsubq_s16(v1023, v1025); 1225 int16x8_t v1097 = vqrdmulhq_n_s16(v1096, 25826); 1226 int16x8_t v1098 = vaddq_s16(v1095, v1097); 1227 int16x8_t v1099 = vqrdmulhq_n_s16(v1098, 18124); 1228 int16x8_t v1100 = vaddq_s16(v1094, v1099); 1229 int16x8_t v1101 = vqrdmulhq_n_s16(v1100, 16792); 1230 int16x8_t v1102 = vaddq_s16(v1090, v1101); 1231 int16x8_t v1103 = vsubq_s16(v1033, v1035); 1232 int16x8_t v1104 = vsubq_s16(v1037, v1039); 1233 int16x8_t v1105 = vqrdmulhq_n_s16(v1104, 25826); 1234 int16x8_t v1106 = vaddq_s16(v1103, v1105); 1235 int16x8_t v1107 = vsubq_s16(v1043, v1045); 1236 int16x8_t v1108 = vsubq_s16(v1047, v1049); 1237 int16x8_t v1109 = vqrdmulhq_n_s16(v1108, 25826); 1238 int16x8_t v1110 = vaddq_s16(v1107, v1109); 1239 int16x8_t v1111 = vqrdmulhq_n_s16(v1110, 18124); 1240 int16x8_t v1112 = vaddq_s16(v1106, v1111); 1241 int16x8_t v1113 = vsubq_s16(v1055, v1057); 1242 int16x8_t v1114 = vsubq_s16(v1059, v1061); 1243 int16x8_t v1115 = vqrdmulhq_n_s16(v1114, 25826); 1244 int16x8_t v1116 = vaddq_s16(v1113, v1115); 1245 int16x8_t v1117 = vsubq_s16(v1065, v1067); 1246 int16x8_t v1118 = vsubq_s16(v1069, v1071); 1247 int16x8_t v1119 = vqrdmulhq_n_s16(v1118, 25826); 1248 int16x8_t v1120 = vaddq_s16(v1117, v1119); 1249 int16x8_t v1121 = vqrdmulhq_n_s16(v1120, 18124); 1250 int16x8_t v1122 = vaddq_s16(v1116, v1121); 1251 int16x8_t v1123 = vqrdmulhq_n_s16(v1122, 16792); 1252 int16x8_t v1124 = vaddq_s16(v1112, v1123); 1253 int16x8_t v1125 = vqrdmulhq_n_s16(v1124, 16484); 1254 int16x8_t v1126 = vaddq_s16(v1102, v1125); 1255 int16x8_t v1127 = vsubq_s16(v892, v894); 1256 int16x8_t v1128 = vsubq_s16(v896, v898); 1257 int16x8_t v1129_tmp = vqrdmulhq_n_s16(v1128, 1988); 1258 int16x8_t v1129 = vaddq_s16(v1129_tmp, v1128); 1259 int16x8_t v1130 = vaddq_s16(v1127, v1129); 1260 int16x8_t v1131 = vsubq_s16(v902, v904); 1261 int16x8_t v1132 = vsubq_s16(v906, v908); 1262 int16x8_t v1133_tmp = vqrdmulhq_n_s16(v1132, 1988); 1263 int16x8_t v1133 = vaddq_s16(v1133_tmp, v1132); 1264 int16x8_t v1134 = vaddq_s16(v1131, v1133); 1265 int16x8_t v1135 = vqrdmulhq_n_s16(v1134, 19102); 1266 int16x8_t v1136 = vaddq_s16(v1130, v1135); 1267 int16x8_t v1137 = vsubq_s16(v914, v917); 1268 int16x8_t v1138 = vsubq_s16(v919, v921); 1269 int16x8_t v1139_tmp = vqrdmulhq_n_s16(v1138, 1988); 1270 int16x8_t v1139 = vaddq_s16(v1139_tmp, v1138); 1271 int16x8_t v1140 = vaddq_s16(v1137, v1139); 1272 int16x8_t v1141 = vsubq_s16(v925, v927); 1273 int16x8_t v1142 = vsubq_s16(v929, v931); 1274 int16x8_t v1143_tmp = vqrdmulhq_n_s16(v1142, 1988); 1275 int16x8_t v1143 = vaddq_s16(v1143_tmp, v1142); 1276 int16x8_t v1144 = vaddq_s16(v1141, v1143); 1277 int16x8_t v1145 = vqrdmulhq_n_s16(v1144, 19102); 1278 int16x8_t v1146 = vaddq_s16(v1140, v1145); 1279 int16x8_t v1147 = vqrdmulhq_n_s16(v1146, 17000); 1280 int16x8_t v1148 = vaddq_s16(v1136, v1147); 1281 int16x8_t v1149 = vsubq_s16(v939, v941); 1282 int16x8_t v1150 = vsubq_s16(v943, v945); 1283 int16x8_t v1151_tmp = vqrdmulhq_n_s16(v1150, 1988); 1284 int16x8_t v1151 = vaddq_s16(v1151_tmp, v1150); 1285 int16x8_t v1152 = vaddq_s16(v1149, v1151); 1286 int16x8_t v1153 = vsubq_s16(v949, v951); 1287 int16x8_t v1154 = vsubq_s16(v953, v955); 1288 int16x8_t v1155_tmp = vqrdmulhq_n_s16(v1154, 1988); 1289 int16x8_t v1155 = vaddq_s16(v1155_tmp, v1154); 1290 int16x8_t v1156 = vaddq_s16(v1153, v1155); 1291 int16x8_t v1157 = vqrdmulhq_n_s16(v1156, 19102); 1292 int16x8_t v1158 = vaddq_s16(v1152, v1157); 1293 int16x8_t v1159 = vsubq_s16(v961, v963); 1294 int16x8_t v1160 = vsubq_s16(v965, v967); 1295 int16x8_t v1161_tmp = vqrdmulhq_n_s16(v1160, 1988); 1296 int16x8_t v1161 = vaddq_s16(v1161_tmp, v1160); 1297 int16x8_t v1162 = vaddq_s16(v1159, v1161); 1298 int16x8_t v1163 = vsubq_s16(v971, v973); 1299 int16x8_t v1164 = vsubq_s16(v975, v977); 1300 int16x8_t v1165_tmp = vqrdmulhq_n_s16(v1164, 1988); 1301 int16x8_t v1165 = vaddq_s16(v1165_tmp, v1164); 1302 int16x8_t v1166 = vaddq_s16(v1163, v1165); 1303 int16x8_t v1167 = vqrdmulhq_n_s16(v1166, 19102); 1304 int16x8_t v1168 = vaddq_s16(v1162, v1167); 1305 int16x8_t v1169 = vqrdmulhq_n_s16(v1168, 17000); 1306 int16x8_t v1170 = vaddq_s16(v1158, v1169); 1307 int16x8_t v1171 = vqrdmulhq_n_s16(v1170, 16534); 1308 int16x8_t v1172 = vaddq_s16(v1148, v1171); 1309 int16x8_t v1173 = vsubq_s16(v705, v710); 1310 int16x8_t v1174 = vsubq_s16(v715, v720); 1311 int16x8_t v1175_tmp = vqrdmulhq_n_s16(v1174, 23673); 1312 int16x8_t v1175 = vaddq_s16(v1175_tmp, v1174); 1313 int16x8_t v1176 = vaddq_s16(v1173, v1175); 1314 int16x8_t v1177 = vsubq_s16(v727, v732); 1315 int16x8_t v1178 = vsubq_s16(v737, v742); 1316 int16x8_t v1179_tmp = vqrdmulhq_n_s16(v1178, 23673); 1317 int16x8_t v1179 = vaddq_s16(v1179_tmp, v1178); 1318 int16x8_t v1180 = vaddq_s16(v1177, v1179); 1319 int16x8_t v1181 = vqrdmulhq_n_s16(v1180, 20398); 1320 int16x8_t v1182 = vaddq_s16(v1176, v1181); 1321 int16x8_t v1183 = vsubq_s16(v751, v756); 1322 int16x8_t v1184 = vsubq_s16(v761, v766); 1323 int16x8_t v1185_tmp = vqrdmulhq_n_s16(v1184, 23673); 1324 int16x8_t v1185 = vaddq_s16(v1185_tmp, v1184); 1325 int16x8_t v1186 = vaddq_s16(v1183, v1185); 1326 int16x8_t v1187 = vsubq_s16(v773, v778); 1327 int16x8_t v1188 = vsubq_s16(v783, v788); 1328 int16x8_t v1189_tmp = vqrdmulhq_n_s16(v1188, 23673); 1329 int16x8_t v1189 = vaddq_s16(v1189_tmp, v1188); 1330 int16x8_t v1190 = vaddq_s16(v1187, v1189); 1331 int16x8_t v1191 = vqrdmulhq_n_s16(v1190, 20398); 1332 int16x8_t v1192 = vaddq_s16(v1186, v1191); 1333 int16x8_t v1193 = vqrdmulhq_n_s16(v1192, 17255); 1334 int16x8_t v1194 = vaddq_s16(v1182, v1193); 1335 int16x8_t v1195 = vsubq_s16(v799, v804); 1336 int16x8_t v1196 = vsubq_s16(v809, v814); 1337 int16x8_t v1197_tmp = vqrdmulhq_n_s16(v1196, 23673); 1338 int16x8_t v1197 = vaddq_s16(v1197_tmp, v1196); 1339 int16x8_t v1198 = vaddq_s16(v1195, v1197); 1340 int16x8_t v1199 = vsubq_s16(v821, v826); 1341 int16x8_t v1200 = vsubq_s16(v831, v836); 1342 int16x8_t v1201_tmp = vqrdmulhq_n_s16(v1200, 23673); 1343 int16x8_t v1201 = vaddq_s16(v1201_tmp, v1200); 1344 int16x8_t v1202 = vaddq_s16(v1199, v1201); 1345 int16x8_t v1203 = vqrdmulhq_n_s16(v1202, 20398); 1346 int16x8_t v1204 = vaddq_s16(v1198, v1203); 1347 int16x8_t v1205 = vsubq_s16(v845, v850); 1348 int16x8_t v1206 = vsubq_s16(v855, v860); 1349 int16x8_t v1207_tmp = vqrdmulhq_n_s16(v1206, 23673); 1350 int16x8_t v1207 = vaddq_s16(v1207_tmp, v1206); 1351 int16x8_t v1208 = vaddq_s16(v1205, v1207); 1352 int16x8_t v1209 = vsubq_s16(v867, v872); 1353 int16x8_t v1210 = vsubq_s16(v877, v882); 1354 int16x8_t v1211_tmp = vqrdmulhq_n_s16(v1210, 23673); 1355 int16x8_t v1211 = vaddq_s16(v1211_tmp, v1210); 1356 int16x8_t v1212 = vaddq_s16(v1209, v1211); 1357 int16x8_t v1213 = vqrdmulhq_n_s16(v1212, 20398); 1358 int16x8_t v1214 = vaddq_s16(v1208, v1213); 1359 int16x8_t v1215 = vqrdmulhq_n_s16(v1214, 17255); 1360 int16x8_t v1216 = vaddq_s16(v1204, v1215); 1361 int16x8_t v1217 = vqrdmulhq_n_s16(v1216, 16595); 1362 int16x8_t v1218 = vaddq_s16(v1194, v1217); 1363 int16x8_t v1219 = vsubq_s16(v9, v24); 1364 int16x8_t v1220 = vsubq_s16(v42, v58); 1365 int16x8_t v1221_tmp = vqrdmulhq_n_s16(v1220, 3314); 1366 int16x8_t v1221 = vmlaq_n_s16(v1221_tmp, v1220, 5); 1367 int16x8_t v1222 = vaddq_s16(v1219, v1221); 1368 int16x8_t v1223 = vsubq_s16(v78, v101); 1369 int16x8_t v1224 = vsubq_s16(v119, v136); 1370 int16x8_t v1225_tmp = vqrdmulhq_n_s16(v1224, 3314); 1371 int16x8_t v1225 = vmlaq_n_s16(v1225_tmp, v1224, 5); 1372 int16x8_t v1226 = vaddq_s16(v1223, v1225); 1373 int16x8_t v1227 = vqrdmulhq_n_s16(v1226, 22112); 1374 int16x8_t v1228 = vaddq_s16(v1222, v1227); 1375 int16x8_t v1229 = vsubq_s16(v158, v181); 1376 int16x8_t v1230 = vsubq_s16(v213, v231); 1377 int16x8_t v1231_tmp = vqrdmulhq_n_s16(v1230, 3314); 1378 int16x8_t v1231 = vmlaq_n_s16(v1231_tmp, v1230, 5); 1379 int16x8_t v1232 = vaddq_s16(v1229, v1231); 1380 int16x8_t v1233 = vsubq_s16(v251, v274); 1381 int16x8_t v1234 = vsubq_s16(v292, v310); 1382 int16x8_t v1235_tmp = vqrdmulhq_n_s16(v1234, 3314); 1383 int16x8_t v1235 = vmlaq_n_s16(v1235_tmp, v1234, 5); 1384 int16x8_t v1236 = vaddq_s16(v1233, v1235); 1385 int16x8_t v1237 = vqrdmulhq_n_s16(v1236, 22112); 1386 int16x8_t v1238 = vaddq_s16(v1232, v1237); 1387 int16x8_t v1239 = vqrdmulhq_n_s16(v1238, 17561); 1388 int16x8_t v1240 = vaddq_s16(v1228, v1239); 1389 int16x8_t v1241 = vsubq_s16(v334, v357); 1390 int16x8_t v1242 = vsubq_s16(v389, v407); 1391 int16x8_t v1243_tmp = vqrdmulhq_n_s16(v1242, 3314); 1392 int16x8_t v1243 = vmlaq_n_s16(v1243_tmp, v1242, 5); 1393 int16x8_t v1244 = vaddq_s16(v1241, v1243); 1394 int16x8_t v1245 = vsubq_s16(v441, v480); 1395 int16x8_t v1246 = vsubq_s16(v498, v517); 1396 int16x8_t v1247_tmp = vqrdmulhq_n_s16(v1246, 3314); 1397 int16x8_t v1247 = vmlaq_n_s16(v1247_tmp, v1246, 5); 1398 int16x8_t v1248 = vaddq_s16(v1245, v1247); 1399 int16x8_t v1249 = vqrdmulhq_n_s16(v1248, 22112); 1400 int16x8_t v1250 = vaddq_s16(v1244, v1249); 1401 int16x8_t v1251 = vsubq_s16(v539, v562); 1402 int16x8_t v1252 = vsubq_s16(v594, v612); 1403 int16x8_t v1253_tmp = vqrdmulhq_n_s16(v1252, 3314); 1404 int16x8_t v1253 = vmlaq_n_s16(v1253_tmp, v1252, 5); 1405 int16x8_t v1254 = vaddq_s16(v1251, v1253); 1406 int16x8_t v1255 = vsubq_s16(v632, v655); 1407 int16x8_t v1256 = vsubq_s16(v673, v692); 1408 int16x8_t v1257_tmp = vqrdmulhq_n_s16(v1256, 3314); 1409 int16x8_t v1257 = vmlaq_n_s16(v1257_tmp, v1256, 5); 1410 int16x8_t v1258 = vaddq_s16(v1255, v1257); 1411 int16x8_t v1259 = vqrdmulhq_n_s16(v1258, 22112); 1412 int16x8_t v1260 = vaddq_s16(v1254, v1259); 1413 int16x8_t v1261 = vqrdmulhq_n_s16(v1260, 17561); 1414 int16x8_t v1262 = vaddq_s16(v1250, v1261); 1415 int16x8_t v1263 = vqrdmulhq_n_s16(v1262, 16666); 1416 int16x8_t v1264 = vaddq_s16(v1240, v1263); 1417 int16x8_t v1265 = vsubq_s16(v1219, v1221); 1418 int16x8_t v1266 = vsubq_s16(v1223, v1225); 1419 int16x8_t v1267 = vqrdmulhq_n_s16(v1266, 24397); 1420 int16x8_t v1268 = vaddq_s16(v1265, v1267); 1421 int16x8_t v1269 = vsubq_s16(v1229, v1231); 1422 int16x8_t v1270 = vsubq_s16(v1233, v1235); 1423 int16x8_t v1271 = vqrdmulhq_n_s16(v1270, 24397); 1424 int16x8_t v1272 = vaddq_s16(v1269, v1271); 1425 int16x8_t v1273 = vqrdmulhq_n_s16(v1272, 17921); 1426 int16x8_t v1274 = vaddq_s16(v1268, v1273); 1427 int16x8_t v1275 = vsubq_s16(v1241, v1243); 1428 int16x8_t v1276 = vsubq_s16(v1245, v1247); 1429 int16x8_t v1277 = vqrdmulhq_n_s16(v1276, 24397); 1430 int16x8_t v1278 = vaddq_s16(v1275, v1277); 1431 int16x8_t v1279 = vsubq_s16(v1251, v1253); 1432 int16x8_t v1280 = vsubq_s16(v1255, v1257); 1433 int16x8_t v1281 = vqrdmulhq_n_s16(v1280, 24397); 1434 int16x8_t v1282 = vaddq_s16(v1279, v1281); 1435 int16x8_t v1283 = vqrdmulhq_n_s16(v1282, 17921); 1436 int16x8_t v1284 = vaddq_s16(v1278, v1283); 1437 int16x8_t v1285 = vqrdmulhq_n_s16(v1284, 16747); 1438 int16x8_t v1286 = vaddq_s16(v1274, v1285); 1439 int16x8_t v1287 = vsubq_s16(v1173, v1175); 1440 int16x8_t v1288 = vsubq_s16(v1177, v1179); 1441 int16x8_t v1289 = vqrdmulhq_n_s16(v1288, 27504); 1442 int16x8_t v1290 = vaddq_s16(v1287, v1289); 1443 int16x8_t v1291 = vsubq_s16(v1183, v1185); 1444 int16x8_t v1292 = vsubq_s16(v1187, v1189); 1445 int16x8_t v1293 = vqrdmulhq_n_s16(v1292, 27504); 1446 int16x8_t v1294 = vaddq_s16(v1291, v1293); 1447 int16x8_t v1295 = vqrdmulhq_n_s16(v1294, 18343); 1448 int16x8_t v1296 = vaddq_s16(v1290, v1295); 1449 int16x8_t v1297 = vsubq_s16(v1195, v1197); 1450 int16x8_t v1298 = vsubq_s16(v1199, v1201); 1451 int16x8_t v1299 = vqrdmulhq_n_s16(v1298, 27504); 1452 int16x8_t v1300 = vaddq_s16(v1297, v1299); 1453 int16x8_t v1301 = vsubq_s16(v1205, v1207); 1454 int16x8_t v1302 = vsubq_s16(v1209, v1211); 1455 int16x8_t v1303 = vqrdmulhq_n_s16(v1302, 27504); 1456 int16x8_t v1304 = vaddq_s16(v1301, v1303); 1457 int16x8_t v1305 = vqrdmulhq_n_s16(v1304, 18343); 1458 int16x8_t v1306 = vaddq_s16(v1300, v1305); 1459 int16x8_t v1307 = vqrdmulhq_n_s16(v1306, 16840); 1460 int16x8_t v1308 = vaddq_s16(v1296, v1307); 1461 int16x8_t v1309 = vsubq_s16(v1127, v1129); 1462 int16x8_t v1310 = vsubq_s16(v1131, v1133); 1463 int16x8_t v1311 = vqrdmulhq_n_s16(v1310, 31869); 1464 int16x8_t v1312 = vaddq_s16(v1309, v1311); 1465 int16x8_t v1313 = vsubq_s16(v1137, v1139); 1466 int16x8_t v1314 = vsubq_s16(v1141, v1143); 1467 int16x8_t v1315 = vqrdmulhq_n_s16(v1314, 31869); 1468 int16x8_t v1316 = vaddq_s16(v1313, v1315); 1469 int16x8_t v1317 = vqrdmulhq_n_s16(v1316, 18830); 1470 int16x8_t v1318 = vaddq_s16(v1312, v1317); 1471 int16x8_t v1319 = vsubq_s16(v1149, v1151); 1472 int16x8_t v1320 = vsubq_s16(v1153, v1155); 1473 int16x8_t v1321 = vqrdmulhq_n_s16(v1320, 31869); 1474 int16x8_t v1322 = vaddq_s16(v1319, v1321); 1475 int16x8_t v1323 = vsubq_s16(v1159, v1161); 1476 int16x8_t v1324 = vsubq_s16(v1163, v1165); 1477 int16x8_t v1325 = vqrdmulhq_n_s16(v1324, 31869); 1478 int16x8_t v1326 = vaddq_s16(v1323, v1325); 1479 int16x8_t v1327 = vqrdmulhq_n_s16(v1326, 18830); 1480 int16x8_t v1328 = vaddq_s16(v1322, v1327); 1481 int16x8_t v1329 = vqrdmulhq_n_s16(v1328, 16944); 1482 int16x8_t v1330 = vaddq_s16(v1318, v1329); 1483 int16x8_t v1331 = vsubq_s16(v1081, v1083); 1484 int16x8_t v1332 = vsubq_s16(v1085, v1087); 1485 int16x8_t v1333_tmp = vqrdmulhq_n_s16(v1332, 5552); 1486 int16x8_t v1333 = vaddq_s16(v1333_tmp, v1332); 1487 int16x8_t v1334 = vaddq_s16(v1331, v1333); 1488 int16x8_t v1335 = vsubq_s16(v1091, v1093); 1489 int16x8_t v1336 = vsubq_s16(v1095, v1097); 1490 int16x8_t v1337_tmp = vqrdmulhq_n_s16(v1336, 5552); 1491 int16x8_t v1337 = vaddq_s16(v1337_tmp, v1336); 1492 int16x8_t v1338 = vaddq_s16(v1335, v1337); 1493 int16x8_t v1339 = vqrdmulhq_n_s16(v1338, 19393); 1494 int16x8_t v1340 = vaddq_s16(v1334, v1339); 1495 int16x8_t v1341 = vsubq_s16(v1103, v1105); 1496 int16x8_t v1342 = vsubq_s16(v1107, v1109); 1497 int16x8_t v1343_tmp = vqrdmulhq_n_s16(v1342, 5552); 1498 int16x8_t v1343 = vaddq_s16(v1343_tmp, v1342); 1499 int16x8_t v1344 = vaddq_s16(v1341, v1343); 1500 int16x8_t v1345 = vsubq_s16(v1113, v1115); 1501 int16x8_t v1346 = vsubq_s16(v1117, v1119); 1502 int16x8_t v1347_tmp = vqrdmulhq_n_s16(v1346, 5552); 1503 int16x8_t v1347 = vaddq_s16(v1347_tmp, v1346); 1504 int16x8_t v1348 = vaddq_s16(v1345, v1347); 1505 int16x8_t v1349 = vqrdmulhq_n_s16(v1348, 19393); 1506 int16x8_t v1350 = vaddq_s16(v1344, v1349); 1507 int16x8_t v1351 = vqrdmulhq_n_s16(v1350, 17059); 1508 int16x8_t v1352 = vaddq_s16(v1340, v1351); 1509 int16x8_t v1353 = vsubq_s16(v990, v995); 1510 int16x8_t v1354 = vsubq_s16(v1000, v1005); 1511 int16x8_t v1355_tmp = vqrdmulhq_n_s16(v1354, 15865); 1512 int16x8_t v1355 = vaddq_s16(v1355_tmp, v1354); 1513 int16x8_t v1356 = vaddq_s16(v1353, v1355); 1514 int16x8_t v1357 = vsubq_s16(v1012, v1017); 1515 int16x8_t v1358 = vsubq_s16(v1022, v1027); 1516 int16x8_t v1359_tmp = vqrdmulhq_n_s16(v1358, 15865); 1517 int16x8_t v1359 = vaddq_s16(v1359_tmp, v1358); 1518 int16x8_t v1360 = vaddq_s16(v1357, v1359); 1519 int16x8_t v1361 = vqrdmulhq_n_s16(v1360, 20040); 1520 int16x8_t v1362 = vaddq_s16(v1356, v1361); 1521 int16x8_t v1363 = vsubq_s16(v1036, v1041); 1522 int16x8_t v1364 = vsubq_s16(v1046, v1051); 1523 int16x8_t v1365_tmp = vqrdmulhq_n_s16(v1364, 15865); 1524 int16x8_t v1365 = vaddq_s16(v1365_tmp, v1364); 1525 int16x8_t v1366 = vaddq_s16(v1363, v1365); 1526 int16x8_t v1367 = vsubq_s16(v1058, v1063); 1527 int16x8_t v1368 = vsubq_s16(v1068, v1073); 1528 int16x8_t v1369_tmp = vqrdmulhq_n_s16(v1368, 15865); 1529 int16x8_t v1369 = vaddq_s16(v1369_tmp, v1368); 1530 int16x8_t v1370 = vaddq_s16(v1367, v1369); 1531 int16x8_t v1371 = vqrdmulhq_n_s16(v1370, 20040); 1532 int16x8_t v1372 = vaddq_s16(v1366, v1371); 1533 int16x8_t v1373 = vqrdmulhq_n_s16(v1372, 17187); 1534 int16x8_t v1374 = vaddq_s16(v1362, v1373); 1535 int16x8_t v1375 = vsubq_s16(v895, v900); 1536 int16x8_t v1376 = vsubq_s16(v905, v910); 1537 int16x8_t v1377_tmp = vqrdmulhq_n_s16(v1376, 1893); 1538 int16x8_t v1377 = vmlaq_n_s16(v1377_tmp, v1376, 2); 1539 int16x8_t v1378 = vaddq_s16(v1375, v1377); 1540 int16x8_t v1379 = vsubq_s16(v918, v923); 1541 int16x8_t v1380 = vsubq_s16(v928, v933); 1542 int16x8_t v1381_tmp = vqrdmulhq_n_s16(v1380, 1893); 1543 int16x8_t v1381 = vmlaq_n_s16(v1381_tmp, v1380, 2); 1544 int16x8_t v1382 = vaddq_s16(v1379, v1381); 1545 int16x8_t v1383 = vqrdmulhq_n_s16(v1382, 20783); 1546 int16x8_t v1384 = vaddq_s16(v1378, v1383); 1547 int16x8_t v1385 = vsubq_s16(v942, v947); 1548 int16x8_t v1386 = vsubq_s16(v952, v957); 1549 int16x8_t v1387_tmp = vqrdmulhq_n_s16(v1386, 1893); 1550 int16x8_t v1387 = vmlaq_n_s16(v1387_tmp, v1386, 2); 1551 int16x8_t v1388 = vaddq_s16(v1385, v1387); 1552 int16x8_t v1389 = vsubq_s16(v964, v969); 1553 int16x8_t v1390 = vsubq_s16(v974, v979); 1554 int16x8_t v1391_tmp = vqrdmulhq_n_s16(v1390, 1893); 1555 int16x8_t v1391 = vmlaq_n_s16(v1391_tmp, v1390, 2); 1556 int16x8_t v1392 = vaddq_s16(v1389, v1391); 1557 int16x8_t v1393 = vqrdmulhq_n_s16(v1392, 20783); 1558 int16x8_t v1394 = vaddq_s16(v1388, v1393); 1559 int16x8_t v1395 = vqrdmulhq_n_s16(v1394, 17326); 1560 int16x8_t v1396 = vaddq_s16(v1384, v1395); 1561 int16x8_t v1397 = vsubq_s16(v711, v722); 1562 int16x8_t v1398 = vsubq_s16(v733, v744); 1563 int16x8_t v1399_tmp = vqrdmulhq_n_s16(v1398, 13357); 1564 int16x8_t v1399 = vmlaq_n_s16(v1399_tmp, v1398, 3); 1565 int16x8_t v1400 = vaddq_s16(v1397, v1399); 1566 int16x8_t v1401 = vsubq_s16(v757, v768); 1567 int16x8_t v1402 = vsubq_s16(v779, v790); 1568 int16x8_t v1403_tmp = vqrdmulhq_n_s16(v1402, 13357); 1569 int16x8_t v1403 = vmlaq_n_s16(v1403_tmp, v1402, 3); 1570 int16x8_t v1404 = vaddq_s16(v1401, v1403); 1571 int16x8_t v1405 = vqrdmulhq_n_s16(v1404, 21637); 1572 int16x8_t v1406 = vaddq_s16(v1400, v1405); 1573 int16x8_t v1407 = vsubq_s16(v805, v816); 1574 int16x8_t v1408 = vsubq_s16(v827, v838); 1575 int16x8_t v1409_tmp = vqrdmulhq_n_s16(v1408, 13357); 1576 int16x8_t v1409 = vmlaq_n_s16(v1409_tmp, v1408, 3); 1577 int16x8_t v1410 = vaddq_s16(v1407, v1409); 1578 int16x8_t v1411 = vsubq_s16(v851, v862); 1579 int16x8_t v1412 = vsubq_s16(v873, v884); 1580 int16x8_t v1413_tmp = vqrdmulhq_n_s16(v1412, 13357); 1581 int16x8_t v1413 = vmlaq_n_s16(v1413_tmp, v1412, 3); 1582 int16x8_t v1414 = vaddq_s16(v1411, v1413); 1583 int16x8_t v1415 = vqrdmulhq_n_s16(v1414, 21637); 1584 int16x8_t v1416 = vaddq_s16(v1410, v1415); 1585 int16x8_t v1417 = vqrdmulhq_n_s16(v1416, 17479); 1586 int16x8_t v1418 = vaddq_s16(v1406, v1417); 1587 int16x8_t v1419 = vsubq_s16(v25, v60); 1588 int16x8_t v1420 = vsubq_s16(v102, v138); 1589 int16x8_t v1421_tmp = vqrdmulhq_n_s16(v1420, 6226); 1590 int16x8_t v1421 = vmlaq_n_s16(v1421_tmp, v1420, 10); 1591 int16x8_t v1422 = vaddq_s16(v1419, v1421); 1592 int16x8_t v1423 = vsubq_s16(v182, v233); 1593 int16x8_t v1424 = vsubq_s16(v275, v312); 1594 int16x8_t v1425_tmp = vqrdmulhq_n_s16(v1424, 6226); 1595 int16x8_t v1425 = vmlaq_n_s16(v1425_tmp, v1424, 10); 1596 int16x8_t v1426 = vaddq_s16(v1423, v1425); 1597 int16x8_t v1427 = vqrdmulhq_n_s16(v1426, 22622); 1598 int16x8_t v1428 = vaddq_s16(v1422, v1427); 1599 int16x8_t v1429 = vsubq_s16(v358, v409); 1600 int16x8_t v1430 = vsubq_s16(v481, v519); 1601 int16x8_t v1431_tmp = vqrdmulhq_n_s16(v1430, 6226); 1602 int16x8_t v1431 = vmlaq_n_s16(v1431_tmp, v1430, 10); 1603 int16x8_t v1432 = vaddq_s16(v1429, v1431); 1604 int16x8_t v1433 = vsubq_s16(v563, v614); 1605 int16x8_t v1434 = vsubq_s16(v656, v694); 1606 int16x8_t v1435_tmp = vqrdmulhq_n_s16(v1434, 6226); 1607 int16x8_t v1435 = vmlaq_n_s16(v1435_tmp, v1434, 10); 1608 int16x8_t v1436 = vaddq_s16(v1433, v1435); 1609 int16x8_t v1437 = vqrdmulhq_n_s16(v1436, 22622); 1610 int16x8_t v1438 = vaddq_s16(v1432, v1437); 1611 int16x8_t v1439 = vqrdmulhq_n_s16(v1438, 17646); 1612 int16x8_t v1440 = vaddq_s16(v1428, v1439); 1613 int16x8_t v1441 = vsubq_s16(v1419, v1421); 1614 int16x8_t v1442 = vsubq_s16(v1423, v1425); 1615 int16x8_t v1443 = vqrdmulhq_n_s16(v1442, 23761); 1616 int16x8_t v1444 = vaddq_s16(v1441, v1443); 1617 int16x8_t v1445 = vsubq_s16(v1429, v1431); 1618 int16x8_t v1446 = vsubq_s16(v1433, v1435); 1619 int16x8_t v1447 = vqrdmulhq_n_s16(v1446, 23761); 1620 int16x8_t v1448 = vaddq_s16(v1445, v1447); 1621 int16x8_t v1449 = vqrdmulhq_n_s16(v1448, 17826); 1622 int16x8_t v1450 = vaddq_s16(v1444, v1449); 1623 int16x8_t v1451 = vsubq_s16(v1397, v1399); 1624 int16x8_t v1452 = vsubq_s16(v1401, v1403); 1625 int16x8_t v1453 = vqrdmulhq_n_s16(v1452, 25084); 1626 int16x8_t v1454 = vaddq_s16(v1451, v1453); 1627 int16x8_t v1455 = vsubq_s16(v1407, v1409); 1628 int16x8_t v1456 = vsubq_s16(v1411, v1413); 1629 int16x8_t v1457 = vqrdmulhq_n_s16(v1456, 25084); 1630 int16x8_t v1458 = vaddq_s16(v1455, v1457); 1631 int16x8_t v1459 = vqrdmulhq_n_s16(v1458, 18021); 1632 int16x8_t v1460 = vaddq_s16(v1454, v1459); 1633 int16x8_t v1461 = vsubq_s16(v1375, v1377); 1634 int16x8_t v1462 = vsubq_s16(v1379, v1381); 1635 int16x8_t v1463 = vqrdmulhq_n_s16(v1462, 26631); 1636 int16x8_t v1464 = vaddq_s16(v1461, v1463); 1637 int16x8_t v1465 = vsubq_s16(v1385, v1387); 1638 int16x8_t v1466 = vsubq_s16(v1389, v1391); 1639 int16x8_t v1467 = vqrdmulhq_n_s16(v1466, 26631); 1640 int16x8_t v1468 = vaddq_s16(v1465, v1467); 1641 int16x8_t v1469 = vqrdmulhq_n_s16(v1468, 18231); 1642 int16x8_t v1470 = vaddq_s16(v1464, v1469); 1643 int16x8_t v1471 = vsubq_s16(v1353, v1355); 1644 int16x8_t v1472 = vsubq_s16(v1357, v1359); 1645 int16x8_t v1473 = vqrdmulhq_n_s16(v1472, 28454); 1646 int16x8_t v1474 = vaddq_s16(v1471, v1473); 1647 int16x8_t v1475 = vsubq_s16(v1363, v1365); 1648 int16x8_t v1476 = vsubq_s16(v1367, v1369); 1649 int16x8_t v1477 = vqrdmulhq_n_s16(v1476, 28454); 1650 int16x8_t v1478 = vaddq_s16(v1475, v1477); 1651 int16x8_t v1479 = vqrdmulhq_n_s16(v1478, 18458); 1652 int16x8_t v1480 = vaddq_s16(v1474, v1479); 1653 int16x8_t v1481 = vsubq_s16(v1331, v1333); 1654 int16x8_t v1482 = vsubq_s16(v1335, v1337); 1655 int16x8_t v1483 = vqrdmulhq_n_s16(v1482, 30624); 1656 int16x8_t v1484 = vaddq_s16(v1481, v1483); 1657 int16x8_t v1485 = vsubq_s16(v1341, v1343); 1658 int16x8_t v1486 = vsubq_s16(v1345, v1347); 1659 int16x8_t v1487 = vqrdmulhq_n_s16(v1486, 30624); 1660 int16x8_t v1488 = vaddq_s16(v1485, v1487); 1661 int16x8_t v1489 = vqrdmulhq_n_s16(v1488, 18702); 1662 int16x8_t v1490 = vaddq_s16(v1484, v1489); 1663 int16x8_t v1491 = vsubq_s16(v1309, v1311); 1664 int16x8_t v1492 = vsubq_s16(v1313, v1315); 1665 int16x8_t v1493_tmp = vqrdmulhq_n_s16(v1492, 472); 1666 int16x8_t v1493 = vaddq_s16(v1493_tmp, v1492); 1667 int16x8_t v1494 = vaddq_s16(v1491, v1493); 1668 int16x8_t v1495 = vsubq_s16(v1319, v1321); 1669 int16x8_t v1496 = vsubq_s16(v1323, v1325); 1670 int16x8_t v1497_tmp = vqrdmulhq_n_s16(v1496, 472); 1671 int16x8_t v1497 = vaddq_s16(v1497_tmp, v1496); 1672 int16x8_t v1498 = vaddq_s16(v1495, v1497); 1673 int16x8_t v1499 = vqrdmulhq_n_s16(v1498, 18964); 1674 int16x8_t v1500 = vaddq_s16(v1494, v1499); 1675 int16x8_t v1501 = vsubq_s16(v1287, v1289); 1676 int16x8_t v1502 = vsubq_s16(v1291, v1293); 1677 int16x8_t v1503_tmp = vqrdmulhq_n_s16(v1502, 3672); 1678 int16x8_t v1503 = vaddq_s16(v1503_tmp, v1502); 1679 int16x8_t v1504 = vaddq_s16(v1501, v1503); 1680 int16x8_t v1505 = vsubq_s16(v1297, v1299); 1681 int16x8_t v1506 = vsubq_s16(v1301, v1303); 1682 int16x8_t v1507_tmp = vqrdmulhq_n_s16(v1506, 3672); 1683 int16x8_t v1507 = vaddq_s16(v1507_tmp, v1506); 1684 int16x8_t v1508 = vaddq_s16(v1505, v1507); 1685 int16x8_t v1509 = vqrdmulhq_n_s16(v1508, 19245); 1686 int16x8_t v1510 = vaddq_s16(v1504, v1509); 1687 int16x8_t v1511 = vsubq_s16(v1265, v1267); 1688 int16x8_t v1512 = vsubq_s16(v1269, v1271); 1689 int16x8_t v1513_tmp = vqrdmulhq_n_s16(v1512, 7662); 1690 int16x8_t v1513 = vaddq_s16(v1513_tmp, v1512); 1691 int16x8_t v1514 = vaddq_s16(v1511, v1513); 1692 int16x8_t v1515 = vsubq_s16(v1275, v1277); 1693 int16x8_t v1516 = vsubq_s16(v1279, v1281); 1694 int16x8_t v1517_tmp = vqrdmulhq_n_s16(v1516, 7662); 1695 int16x8_t v1517 = vaddq_s16(v1517_tmp, v1516); 1696 int16x8_t v1518 = vaddq_s16(v1515, v1517); 1697 int16x8_t v1519 = vqrdmulhq_n_s16(v1518, 19546); 1698 int16x8_t v1520 = vaddq_s16(v1514, v1519); 1699 int16x8_t v1521 = vsubq_s16(v1222, v1227); 1700 int16x8_t v1522 = vsubq_s16(v1232, v1237); 1701 int16x8_t v1523_tmp = vqrdmulhq_n_s16(v1522, 12756); 1702 int16x8_t v1523 = vaddq_s16(v1523_tmp, v1522); 1703 int16x8_t v1524 = vaddq_s16(v1521, v1523); 1704 int16x8_t v1525 = vsubq_s16(v1244, v1249); 1705 int16x8_t v1526 = vsubq_s16(v1254, v1259); 1706 int16x8_t v1527_tmp = vqrdmulhq_n_s16(v1526, 12756); 1707 int16x8_t v1527 = vaddq_s16(v1527_tmp, v1526); 1708 int16x8_t v1528 = vaddq_s16(v1525, v1527); 1709 int16x8_t v1529 = vqrdmulhq_n_s16(v1528, 19869); 1710 int16x8_t v1530 = vaddq_s16(v1524, v1529); 1711 int16x8_t v1531 = vsubq_s16(v1176, v1181); 1712 int16x8_t v1532 = vsubq_s16(v1186, v1191); 1713 int16x8_t v1533_tmp = vqrdmulhq_n_s16(v1532, 19463); 1714 int16x8_t v1533 = vaddq_s16(v1533_tmp, v1532); 1715 int16x8_t v1534 = vaddq_s16(v1531, v1533); 1716 int16x8_t v1535 = vsubq_s16(v1198, v1203); 1717 int16x8_t v1536 = vsubq_s16(v1208, v1213); 1718 int16x8_t v1537_tmp = vqrdmulhq_n_s16(v1536, 19463); 1719 int16x8_t v1537 = vaddq_s16(v1537_tmp, v1536); 1720 int16x8_t v1538 = vaddq_s16(v1535, v1537); 1721 int16x8_t v1539 = vqrdmulhq_n_s16(v1538, 20216); 1722 int16x8_t v1540 = vaddq_s16(v1534, v1539); 1723 int16x8_t v1541 = vsubq_s16(v1130, v1135); 1724 int16x8_t v1542 = vsubq_s16(v1140, v1145); 1725 int16x8_t v1543_tmp = vqrdmulhq_n_s16(v1542, 28661); 1726 int16x8_t v1543 = vaddq_s16(v1543_tmp, v1542); 1727 int16x8_t v1544 = vaddq_s16(v1541, v1543); 1728 int16x8_t v1545 = vsubq_s16(v1152, v1157); 1729 int16x8_t v1546 = vsubq_s16(v1162, v1167); 1730 int16x8_t v1547_tmp = vqrdmulhq_n_s16(v1546, 28661); 1731 int16x8_t v1547 = vaddq_s16(v1547_tmp, v1546); 1732 int16x8_t v1548 = vaddq_s16(v1545, v1547); 1733 int16x8_t v1549 = vqrdmulhq_n_s16(v1548, 20587); 1734 int16x8_t v1550 = vaddq_s16(v1544, v1549); 1735 int16x8_t v1551 = vsubq_s16(v1084, v1089); 1736 int16x8_t v1552 = vsubq_s16(v1094, v1099); 1737 int16x8_t v1553_tmp = vqrdmulhq_n_s16(v1552, 9242); 1738 int16x8_t v1553 = vmlaq_n_s16(v1553_tmp, v1552, 2); 1739 int16x8_t v1554 = vaddq_s16(v1551, v1553); 1740 int16x8_t v1555 = vsubq_s16(v1106, v1111); 1741 int16x8_t v1556 = vsubq_s16(v1116, v1121); 1742 int16x8_t v1557_tmp = vqrdmulhq_n_s16(v1556, 9242); 1743 int16x8_t v1557 = vmlaq_n_s16(v1557_tmp, v1556, 2); 1744 int16x8_t v1558 = vaddq_s16(v1555, v1557); 1745 int16x8_t v1559 = vqrdmulhq_n_s16(v1558, 20985); 1746 int16x8_t v1560 = vaddq_s16(v1554, v1559); 1747 int16x8_t v1561 = vsubq_s16(v996, v1007); 1748 int16x8_t v1562 = vsubq_s16(v1018, v1029); 1749 int16x8_t v1563_tmp = vqrdmulhq_n_s16(v1562, 30298); 1750 int16x8_t v1563 = vmlaq_n_s16(v1563_tmp, v1562, 2); 1751 int16x8_t v1564 = vaddq_s16(v1561, v1563); 1752 int16x8_t v1565 = vsubq_s16(v1042, v1053); 1753 int16x8_t v1566 = vsubq_s16(v1064, v1075); 1754 int16x8_t v1567_tmp = vqrdmulhq_n_s16(v1566, 30298); 1755 int16x8_t v1567 = vmlaq_n_s16(v1567_tmp, v1566, 2); 1756 int16x8_t v1568 = vaddq_s16(v1565, v1567); 1757 int16x8_t v1569 = vqrdmulhq_n_s16(v1568, 21412); 1758 int16x8_t v1570 = vaddq_s16(v1564, v1569); 1759 int16x8_t v1571 = vsubq_s16(v901, v912); 1760 int16x8_t v1572 = vsubq_s16(v924, v935); 1761 int16x8_t v1573_tmp = vqrdmulhq_n_s16(v1572, 2773); 1762 int16x8_t v1573 = vmlaq_n_s16(v1573_tmp, v1572, 4); 1763 int16x8_t v1574 = vaddq_s16(v1571, v1573); 1764 int16x8_t v1575 = vsubq_s16(v948, v959); 1765 int16x8_t v1576 = vsubq_s16(v970, v981); 1766 int16x8_t v1577_tmp = vqrdmulhq_n_s16(v1576, 2773); 1767 int16x8_t v1577 = vmlaq_n_s16(v1577_tmp, v1576, 4); 1768 int16x8_t v1578 = vaddq_s16(v1575, v1577); 1769 int16x8_t v1579 = vqrdmulhq_n_s16(v1578, 21871); 1770 int16x8_t v1580 = vaddq_s16(v1574, v1579); 1771 int16x8_t v1581 = vsubq_s16(v723, v746); 1772 int16x8_t v1582 = vsubq_s16(v769, v792); 1773 int16x8_t v1583_tmp = vqrdmulhq_n_s16(v1582, 26108); 1774 int16x8_t v1583 = vmlaq_n_s16(v1583_tmp, v1582, 6); 1775 int16x8_t v1584 = vaddq_s16(v1581, v1583); 1776 int16x8_t v1585 = vsubq_s16(v817, v840); 1777 int16x8_t v1586 = vsubq_s16(v863, v886); 1778 int16x8_t v1587_tmp = vqrdmulhq_n_s16(v1586, 26108); 1779 int16x8_t v1587 = vmlaq_n_s16(v1587_tmp, v1586, 6); 1780 int16x8_t v1588 = vaddq_s16(v1585, v1587); 1781 int16x8_t v1589 = vqrdmulhq_n_s16(v1588, 22363); 1782 int16x8_t v1590 = vaddq_s16(v1584, v1589); 1783 int16x8_t v1591 = vsubq_s16(v61, v140); 1784 int16x8_t v1592 = vsubq_s16(v234, v314); 1785 int16x8_t v1593_tmp = vqrdmulhq_n_s16(v1592, 12251); 1786 int16x8_t v1593 = vmlaq_n_s16(v1593_tmp, v1592, 20); 1787 int16x8_t v1594 = vaddq_s16(v1591, v1593); 1788 int16x8_t v1595 = vsubq_s16(v410, v521); 1789 int16x8_t v1596 = vsubq_s16(v615, v696); 1790 int16x8_t v1597_tmp = vqrdmulhq_n_s16(v1596, 12251); 1791 int16x8_t v1597 = vmlaq_n_s16(v1597_tmp, v1596, 20); 1792 int16x8_t v1598 = vaddq_s16(v1595, v1597); 1793 int16x8_t v1599 = vqrdmulhq_n_s16(v1598, 22891); 1794 int16x8_t v1600 = vaddq_s16(v1594, v1599); 1795 int16x8_t v1601 = vsubq_s16(v1591, v1593); 1796 int16x8_t v1602 = vsubq_s16(v1595, v1597); 1797 int16x8_t v1603 = vqrdmulhq_n_s16(v1602, 23460); 1798 int16x8_t v1604 = vaddq_s16(v1601, v1603); 1799 int16x8_t v1605 = vsubq_s16(v1581, v1583); 1800 int16x8_t v1606 = vsubq_s16(v1585, v1587); 1801 int16x8_t v1607 = vqrdmulhq_n_s16(v1606, 24073); 1802 int16x8_t v1608 = vaddq_s16(v1605, v1607); 1803 int16x8_t v1609 = vsubq_s16(v1571, v1573); 1804 int16x8_t v1610 = vsubq_s16(v1575, v1577); 1805 int16x8_t v1611 = vqrdmulhq_n_s16(v1610, 24734); 1806 int16x8_t v1612 = vaddq_s16(v1609, v1611); 1807 int16x8_t v1613 = vsubq_s16(v1561, v1563); 1808 int16x8_t v1614 = vsubq_s16(v1565, v1567); 1809 int16x8_t v1615 = vqrdmulhq_n_s16(v1614, 25448); 1810 int16x8_t v1616 = vaddq_s16(v1613, v1615); 1811 int16x8_t v1617 = vsubq_s16(v1551, v1553); 1812 int16x8_t v1618 = vsubq_s16(v1555, v1557); 1813 int16x8_t v1619 = vqrdmulhq_n_s16(v1618, 26220); 1814 int16x8_t v1620 = vaddq_s16(v1617, v1619); 1815 int16x8_t v1621 = vsubq_s16(v1541, v1543); 1816 int16x8_t v1622 = vsubq_s16(v1545, v1547); 1817 int16x8_t v1623 = vqrdmulhq_n_s16(v1622, 27058); 1818 int16x8_t v1624 = vaddq_s16(v1621, v1623); 1819 int16x8_t v1625 = vsubq_s16(v1531, v1533); 1820 int16x8_t v1626 = vsubq_s16(v1535, v1537); 1821 int16x8_t v1627 = vqrdmulhq_n_s16(v1626, 27969); 1822 int16x8_t v1628 = vaddq_s16(v1625, v1627); 1823 int16x8_t v1629 = vsubq_s16(v1521, v1523); 1824 int16x8_t v1630 = vsubq_s16(v1525, v1527); 1825 int16x8_t v1631 = vqrdmulhq_n_s16(v1630, 28961); 1826 int16x8_t v1632 = vaddq_s16(v1629, v1631); 1827 int16x8_t v1633 = vsubq_s16(v1511, v1513); 1828 int16x8_t v1634 = vsubq_s16(v1515, v1517); 1829 int16x8_t v1635 = vqrdmulhq_n_s16(v1634, 30044); 1830 int16x8_t v1636 = vaddq_s16(v1633, v1635); 1831 int16x8_t v1637 = vsubq_s16(v1501, v1503); 1832 int16x8_t v1638 = vsubq_s16(v1505, v1507); 1833 int16x8_t v1639 = vqrdmulhq_n_s16(v1638, 31232); 1834 int16x8_t v1640 = vaddq_s16(v1637, v1639); 1835 int16x8_t v1641 = vsubq_s16(v1491, v1493); 1836 int16x8_t v1642 = vsubq_s16(v1495, v1497); 1837 int16x8_t v1643 = vqrdmulhq_n_s16(v1642, 32538); 1838 int16x8_t v1644 = vaddq_s16(v1641, v1643); 1839 int16x8_t v1645 = vsubq_s16(v1481, v1483); 1840 int16x8_t v1646 = vsubq_s16(v1485, v1487); 1841 int16x8_t v1647_tmp = vqrdmulhq_n_s16(v1646, 1211); 1842 int16x8_t v1647 = vaddq_s16(v1647_tmp, v1646); 1843 int16x8_t v1648 = vaddq_s16(v1645, v1647); 1844 int16x8_t v1649 = vsubq_s16(v1471, v1473); 1845 int16x8_t v1650 = vsubq_s16(v1475, v1477); 1846 int16x8_t v1651_tmp = vqrdmulhq_n_s16(v1650, 2808); 1847 int16x8_t v1651 = vaddq_s16(v1651_tmp, v1650); 1848 int16x8_t v1652 = vaddq_s16(v1649, v1651); 1849 int16x8_t v1653 = vsubq_s16(v1461, v1463); 1850 int16x8_t v1654 = vsubq_s16(v1465, v1467); 1851 int16x8_t v1655_tmp = vqrdmulhq_n_s16(v1654, 4586); 1852 int16x8_t v1655 = vaddq_s16(v1655_tmp, v1654); 1853 int16x8_t v1656 = vaddq_s16(v1653, v1655); 1854 int16x8_t v1657 = vsubq_s16(v1451, v1453); 1855 int16x8_t v1658 = vsubq_s16(v1455, v1457); 1856 int16x8_t v1659_tmp = vqrdmulhq_n_s16(v1658, 6576); 1857 int16x8_t v1659 = vaddq_s16(v1659_tmp, v1658); 1858 int16x8_t v1660 = vaddq_s16(v1657, v1659); 1859 int16x8_t v1661 = vsubq_s16(v1441, v1443); 1860 int16x8_t v1662 = vsubq_s16(v1445, v1447); 1861 int16x8_t v1663_tmp = vqrdmulhq_n_s16(v1662, 8817); 1862 int16x8_t v1663 = vaddq_s16(v1663_tmp, v1662); 1863 int16x8_t v1664 = vaddq_s16(v1661, v1663); 1864 int16x8_t v1665 = vsubq_s16(v1422, v1427); 1865 int16x8_t v1666 = vsubq_s16(v1432, v1437); 1866 int16x8_t v1667_tmp = vqrdmulhq_n_s16(v1666, 11356); 1867 int16x8_t v1667 = vaddq_s16(v1667_tmp, v1666); 1868 int16x8_t v1668 = vaddq_s16(v1665, v1667); 1869 int16x8_t v1669 = vsubq_s16(v1400, v1405); 1870 int16x8_t v1670 = vsubq_s16(v1410, v1415); 1871 int16x8_t v1671_tmp = vqrdmulhq_n_s16(v1670, 14256); 1872 int16x8_t v1671 = vaddq_s16(v1671_tmp, v1670); 1873 int16x8_t v1672 = vaddq_s16(v1669, v1671); 1874 int16x8_t v1673 = vsubq_s16(v1378, v1383); 1875 int16x8_t v1674 = vsubq_s16(v1388, v1393); 1876 int16x8_t v1675_tmp = vqrdmulhq_n_s16(v1674, 17596); 1877 int16x8_t v1675 = vaddq_s16(v1675_tmp, v1674); 1878 int16x8_t v1676 = vaddq_s16(v1673, v1675); 1879 int16x8_t v1677 = vsubq_s16(v1356, v1361); 1880 int16x8_t v1678 = vsubq_s16(v1366, v1371); 1881 int16x8_t v1679_tmp = vqrdmulhq_n_s16(v1678, 21483); 1882 int16x8_t v1679 = vaddq_s16(v1679_tmp, v1678); 1883 int16x8_t v1680 = vaddq_s16(v1677, v1679); 1884 int16x8_t v1681 = vsubq_s16(v1334, v1339); 1885 int16x8_t v1682 = vsubq_s16(v1344, v1349); 1886 int16x8_t v1683_tmp = vqrdmulhq_n_s16(v1682, 26057); 1887 int16x8_t v1683 = vaddq_s16(v1683_tmp, v1682); 1888 int16x8_t v1684 = vaddq_s16(v1681, v1683); 1889 int16x8_t v1685 = vsubq_s16(v1312, v1317); 1890 int16x8_t v1686 = vsubq_s16(v1322, v1327); 1891 int16x8_t v1687_tmp = vqrdmulhq_n_s16(v1686, 31517); 1892 int16x8_t v1687 = vaddq_s16(v1687_tmp, v1686); 1893 int16x8_t v1688 = vaddq_s16(v1685, v1687); 1894 int16x8_t v1689 = vsubq_s16(v1290, v1295); 1895 int16x8_t v1690 = vsubq_s16(v1300, v1305); 1896 int16x8_t v1691_tmp = vqrdmulhq_n_s16(v1690, 5373); 1897 int16x8_t v1691 = vmlaq_n_s16(v1691_tmp, v1690, 2); 1898 int16x8_t v1692 = vaddq_s16(v1689, v1691); 1899 int16x8_t v1693 = vsubq_s16(v1268, v1273); 1900 int16x8_t v1694 = vsubq_s16(v1278, v1283); 1901 int16x8_t v1695_tmp = vqrdmulhq_n_s16(v1694, 13571); 1902 int16x8_t v1695 = vmlaq_n_s16(v1695_tmp, v1694, 2); 1903 int16x8_t v1696 = vaddq_s16(v1693, v1695); 1904 int16x8_t v1697 = vsubq_s16(v1228, v1239); 1905 int16x8_t v1698 = vsubq_s16(v1250, v1261); 1906 int16x8_t v1699_tmp = vqrdmulhq_n_s16(v1698, 23975); 1907 int16x8_t v1699 = vmlaq_n_s16(v1699_tmp, v1698, 2); 1908 int16x8_t v1700 = vaddq_s16(v1697, v1699); 1909 int16x8_t v1701 = vsubq_s16(v1182, v1193); 1910 int16x8_t v1702 = vsubq_s16(v1204, v1215); 1911 int16x8_t v1703_tmp = vqrdmulhq_n_s16(v1702, 4832); 1912 int16x8_t v1703 = vmlaq_n_s16(v1703_tmp, v1702, 3); 1913 int16x8_t v1704 = vaddq_s16(v1701, v1703); 1914 int16x8_t v1705 = vsubq_s16(v1136, v1147); 1915 int16x8_t v1706 = vsubq_s16(v1158, v1169); 1916 int16x8_t v1707_tmp = vqrdmulhq_n_s16(v1706, 23437); 1917 int16x8_t v1707 = vmlaq_n_s16(v1707_tmp, v1706, 3); 1918 int16x8_t v1708 = vaddq_s16(v1705, v1707); 1919 int16x8_t v1709 = vsubq_s16(v1090, v1101); 1920 int16x8_t v1710 = vsubq_s16(v1112, v1123); 1921 int16x8_t v1711_tmp = vqrdmulhq_n_s16(v1710, 17573); 1922 int16x8_t v1711 = vmlaq_n_s16(v1711_tmp, v1710, 4); 1923 int16x8_t v1712 = vaddq_s16(v1709, v1711); 1924 int16x8_t v1713 = vsubq_s16(v1008, v1031); 1925 int16x8_t v1714 = vsubq_s16(v1054, v1077); 1926 int16x8_t v1715_tmp = vqrdmulhq_n_s16(v1714, 27122); 1927 int16x8_t v1715 = vmlaq_n_s16(v1715_tmp, v1714, 5); 1928 int16x8_t v1716 = vaddq_s16(v1713, v1715); 1929 int16x8_t v1717 = vsubq_s16(v913, v937); 1930 int16x8_t v1718 = vsubq_s16(v960, v983); 1931 int16x8_t v1719_tmp = vqrdmulhq_n_s16(v1718, 5041); 1932 int16x8_t v1719 = vmlaq_n_s16(v1719_tmp, v1718, 8); 1933 int16x8_t v1720 = vaddq_s16(v1717, v1719); 1934 int16x8_t v1721 = vsubq_s16(v747, v794); 1935 int16x8_t v1722 = vsubq_s16(v841, v888); 1936 int16x8_t v1723_tmp = vqrdmulhq_n_s16(v1722, 19146); 1937 int16x8_t v1723 = vmlaq_n_s16(v1723_tmp, v1722, 13); 1938 int16x8_t v1724 = vaddq_s16(v1721, v1723); 1939 int16x8_t v1725 = vsubq_s16(v141, v316); 1940 int16x8_t v1726 = vsubq_s16(v522, v698); 1941 int16x8_t v1727_tmp = vqrdmulhq_n_s16(v1726, 24402); 1942 int16x8_t v1727 = vmlaq_n_s16(v1727_tmp, v1726, 40); 1943 int16x8_t v1728 = vaddq_s16(v1725, v1727); 1944 int16x8_t v1729 = vsubq_s16(v1725, v1727); 1945 int16x8_t v1730 = vsubq_s16(v1721, v1723); 1946 int16x8_t v1731 = vsubq_s16(v1717, v1719); 1947 int16x8_t v1732 = vsubq_s16(v1713, v1715); 1948 int16x8_t v1733 = vsubq_s16(v1709, v1711); 1949 int16x8_t v1734 = vsubq_s16(v1705, v1707); 1950 int16x8_t v1735 = vsubq_s16(v1701, v1703); 1951 int16x8_t v1736 = vsubq_s16(v1697, v1699); 1952 int16x8_t v1737 = vsubq_s16(v1693, v1695); 1953 int16x8_t v1738 = vsubq_s16(v1689, v1691); 1954 int16x8_t v1739 = vsubq_s16(v1685, v1687); 1955 int16x8_t v1740 = vsubq_s16(v1681, v1683); 1956 int16x8_t v1741 = vsubq_s16(v1677, v1679); 1957 int16x8_t v1742 = vsubq_s16(v1673, v1675); 1958 int16x8_t v1743 = vsubq_s16(v1669, v1671); 1959 int16x8_t v1744 = vsubq_s16(v1665, v1667); 1960 int16x8_t v1745 = vsubq_s16(v1661, v1663); 1961 int16x8_t v1746 = vsubq_s16(v1657, v1659); 1962 int16x8_t v1747 = vsubq_s16(v1653, v1655); 1963 int16x8_t v1748 = vsubq_s16(v1649, v1651); 1964 int16x8_t v1749 = vsubq_s16(v1645, v1647); 1965 int16x8_t v1750 = vsubq_s16(v1641, v1643); 1966 int16x8_t v1751 = vsubq_s16(v1637, v1639); 1967 int16x8_t v1752 = vsubq_s16(v1633, v1635); 1968 int16x8_t v1753 = vsubq_s16(v1629, v1631); 1969 int16x8_t v1754 = vsubq_s16(v1625, v1627); 1970 int16x8_t v1755 = vsubq_s16(v1621, v1623); 1971 int16x8_t v1756 = vsubq_s16(v1617, v1619); 1972 int16x8_t v1757 = vsubq_s16(v1613, v1615); 1973 int16x8_t v1758 = vsubq_s16(v1609, v1611); 1974 int16x8_t v1759 = vsubq_s16(v1605, v1607); 1975 int16x8_t v1760 = vsubq_s16(v1601, v1603); 1976 int16x8_t v1761 = vsubq_s16(v1594, v1599); 1977 int16x8_t v1762 = vsubq_s16(v1584, v1589); 1978 int16x8_t v1763 = vsubq_s16(v1574, v1579); 1979 int16x8_t v1764 = vsubq_s16(v1564, v1569); 1980 int16x8_t v1765 = vsubq_s16(v1554, v1559); 1981 int16x8_t v1766 = vsubq_s16(v1544, v1549); 1982 int16x8_t v1767 = vsubq_s16(v1534, v1539); 1983 int16x8_t v1768 = vsubq_s16(v1524, v1529); 1984 int16x8_t v1769 = vsubq_s16(v1514, v1519); 1985 int16x8_t v1770 = vsubq_s16(v1504, v1509); 1986 int16x8_t v1771 = vsubq_s16(v1494, v1499); 1987 int16x8_t v1772 = vsubq_s16(v1484, v1489); 1988 int16x8_t v1773 = vsubq_s16(v1474, v1479); 1989 int16x8_t v1774 = vsubq_s16(v1464, v1469); 1990 int16x8_t v1775 = vsubq_s16(v1454, v1459); 1991 int16x8_t v1776 = vsubq_s16(v1444, v1449); 1992 int16x8_t v1777 = vsubq_s16(v1428, v1439); 1993 int16x8_t v1778 = vsubq_s16(v1406, v1417); 1994 int16x8_t v1779 = vsubq_s16(v1384, v1395); 1995 int16x8_t v1780 = vsubq_s16(v1362, v1373); 1996 int16x8_t v1781 = vsubq_s16(v1340, v1351); 1997 int16x8_t v1782 = vsubq_s16(v1318, v1329); 1998 int16x8_t v1783 = vsubq_s16(v1296, v1307); 1999 int16x8_t v1784 = vsubq_s16(v1274, v1285); 2000 int16x8_t v1785 = vsubq_s16(v1240, v1263); 2001 int16x8_t v1786 = vsubq_s16(v1194, v1217); 2002 int16x8_t v1787 = vsubq_s16(v1148, v1171); 2003 int16x8_t v1788 = vsubq_s16(v1102, v1125); 2004 int16x8_t v1789 = vsubq_s16(v1032, v1079); 2005 int16x8_t v1790 = vsubq_s16(v938, v985); 2006 int16x8_t v1791 = vsubq_s16(v795, v890); 2007 int16x8_t v1792 = vsubq_s16(v317, v700); 2008 vst1q_s16(out + out_stride * 0 + i, v701); 2009 vst1q_s16(out + out_stride * 1 + i, v891); 2010 vst1q_s16(out + out_stride * 2 + i, v986); 2011 vst1q_s16(out + out_stride * 3 + i, v1080); 2012 vst1q_s16(out + out_stride * 4 + i, v1126); 2013 vst1q_s16(out + out_stride * 5 + i, v1172); 2014 vst1q_s16(out + out_stride * 6 + i, v1218); 2015 vst1q_s16(out + out_stride * 7 + i, v1264); 2016 vst1q_s16(out + out_stride * 8 + i, v1286); 2017 vst1q_s16(out + out_stride * 9 + i, v1308); 2018 vst1q_s16(out + out_stride * 10 + i, v1330); 2019 vst1q_s16(out + out_stride * 11 + i, v1352); 2020 vst1q_s16(out + out_stride * 12 + i, v1374); 2021 vst1q_s16(out + out_stride * 13 + i, v1396); 2022 vst1q_s16(out + out_stride * 14 + i, v1418); 2023 vst1q_s16(out + out_stride * 15 + i, v1440); 2024 vst1q_s16(out + out_stride * 16 + i, v1450); 2025 vst1q_s16(out + out_stride * 17 + i, v1460); 2026 vst1q_s16(out + out_stride * 18 + i, v1470); 2027 vst1q_s16(out + out_stride * 19 + i, v1480); 2028 vst1q_s16(out + out_stride * 20 + i, v1490); 2029 vst1q_s16(out + out_stride * 21 + i, v1500); 2030 vst1q_s16(out + out_stride * 22 + i, v1510); 2031 vst1q_s16(out + out_stride * 23 + i, v1520); 2032 vst1q_s16(out + out_stride * 24 + i, v1530); 2033 vst1q_s16(out + out_stride * 25 + i, v1540); 2034 vst1q_s16(out + out_stride * 26 + i, v1550); 2035 vst1q_s16(out + out_stride * 27 + i, v1560); 2036 vst1q_s16(out + out_stride * 28 + i, v1570); 2037 vst1q_s16(out + out_stride * 29 + i, v1580); 2038 vst1q_s16(out + out_stride * 30 + i, v1590); 2039 vst1q_s16(out + out_stride * 31 + i, v1600); 2040 vst1q_s16(out + out_stride * 32 + i, v1604); 2041 vst1q_s16(out + out_stride * 33 + i, v1608); 2042 vst1q_s16(out + out_stride * 34 + i, v1612); 2043 vst1q_s16(out + out_stride * 35 + i, v1616); 2044 vst1q_s16(out + out_stride * 36 + i, v1620); 2045 vst1q_s16(out + out_stride * 37 + i, v1624); 2046 vst1q_s16(out + out_stride * 38 + i, v1628); 2047 vst1q_s16(out + out_stride * 39 + i, v1632); 2048 vst1q_s16(out + out_stride * 40 + i, v1636); 2049 vst1q_s16(out + out_stride * 41 + i, v1640); 2050 vst1q_s16(out + out_stride * 42 + i, v1644); 2051 vst1q_s16(out + out_stride * 43 + i, v1648); 2052 vst1q_s16(out + out_stride * 44 + i, v1652); 2053 vst1q_s16(out + out_stride * 45 + i, v1656); 2054 vst1q_s16(out + out_stride * 46 + i, v1660); 2055 vst1q_s16(out + out_stride * 47 + i, v1664); 2056 vst1q_s16(out + out_stride * 48 + i, v1668); 2057 vst1q_s16(out + out_stride * 49 + i, v1672); 2058 vst1q_s16(out + out_stride * 50 + i, v1676); 2059 vst1q_s16(out + out_stride * 51 + i, v1680); 2060 vst1q_s16(out + out_stride * 52 + i, v1684); 2061 vst1q_s16(out + out_stride * 53 + i, v1688); 2062 vst1q_s16(out + out_stride * 54 + i, v1692); 2063 vst1q_s16(out + out_stride * 55 + i, v1696); 2064 vst1q_s16(out + out_stride * 56 + i, v1700); 2065 vst1q_s16(out + out_stride * 57 + i, v1704); 2066 vst1q_s16(out + out_stride * 58 + i, v1708); 2067 vst1q_s16(out + out_stride * 59 + i, v1712); 2068 vst1q_s16(out + out_stride * 60 + i, v1716); 2069 vst1q_s16(out + out_stride * 61 + i, v1720); 2070 vst1q_s16(out + out_stride * 62 + i, v1724); 2071 vst1q_s16(out + out_stride * 63 + i, v1728); 2072 vst1q_s16(out + out_stride * 64 + i, v1729); 2073 vst1q_s16(out + out_stride * 65 + i, v1730); 2074 vst1q_s16(out + out_stride * 66 + i, v1731); 2075 vst1q_s16(out + out_stride * 67 + i, v1732); 2076 vst1q_s16(out + out_stride * 68 + i, v1733); 2077 vst1q_s16(out + out_stride * 69 + i, v1734); 2078 vst1q_s16(out + out_stride * 70 + i, v1735); 2079 vst1q_s16(out + out_stride * 71 + i, v1736); 2080 vst1q_s16(out + out_stride * 72 + i, v1737); 2081 vst1q_s16(out + out_stride * 73 + i, v1738); 2082 vst1q_s16(out + out_stride * 74 + i, v1739); 2083 vst1q_s16(out + out_stride * 75 + i, v1740); 2084 vst1q_s16(out + out_stride * 76 + i, v1741); 2085 vst1q_s16(out + out_stride * 77 + i, v1742); 2086 vst1q_s16(out + out_stride * 78 + i, v1743); 2087 vst1q_s16(out + out_stride * 79 + i, v1744); 2088 vst1q_s16(out + out_stride * 80 + i, v1745); 2089 vst1q_s16(out + out_stride * 81 + i, v1746); 2090 vst1q_s16(out + out_stride * 82 + i, v1747); 2091 vst1q_s16(out + out_stride * 83 + i, v1748); 2092 vst1q_s16(out + out_stride * 84 + i, v1749); 2093 vst1q_s16(out + out_stride * 85 + i, v1750); 2094 vst1q_s16(out + out_stride * 86 + i, v1751); 2095 vst1q_s16(out + out_stride * 87 + i, v1752); 2096 vst1q_s16(out + out_stride * 88 + i, v1753); 2097 vst1q_s16(out + out_stride * 89 + i, v1754); 2098 vst1q_s16(out + out_stride * 90 + i, v1755); 2099 vst1q_s16(out + out_stride * 91 + i, v1756); 2100 vst1q_s16(out + out_stride * 92 + i, v1757); 2101 vst1q_s16(out + out_stride * 93 + i, v1758); 2102 vst1q_s16(out + out_stride * 94 + i, v1759); 2103 vst1q_s16(out + out_stride * 95 + i, v1760); 2104 vst1q_s16(out + out_stride * 96 + i, v1761); 2105 vst1q_s16(out + out_stride * 97 + i, v1762); 2106 vst1q_s16(out + out_stride * 98 + i, v1763); 2107 vst1q_s16(out + out_stride * 99 + i, v1764); 2108 vst1q_s16(out + out_stride * 100 + i, v1765); 2109 vst1q_s16(out + out_stride * 101 + i, v1766); 2110 vst1q_s16(out + out_stride * 102 + i, v1767); 2111 vst1q_s16(out + out_stride * 103 + i, v1768); 2112 vst1q_s16(out + out_stride * 104 + i, v1769); 2113 vst1q_s16(out + out_stride * 105 + i, v1770); 2114 vst1q_s16(out + out_stride * 106 + i, v1771); 2115 vst1q_s16(out + out_stride * 107 + i, v1772); 2116 vst1q_s16(out + out_stride * 108 + i, v1773); 2117 vst1q_s16(out + out_stride * 109 + i, v1774); 2118 vst1q_s16(out + out_stride * 110 + i, v1775); 2119 vst1q_s16(out + out_stride * 111 + i, v1776); 2120 vst1q_s16(out + out_stride * 112 + i, v1777); 2121 vst1q_s16(out + out_stride * 113 + i, v1778); 2122 vst1q_s16(out + out_stride * 114 + i, v1779); 2123 vst1q_s16(out + out_stride * 115 + i, v1780); 2124 vst1q_s16(out + out_stride * 116 + i, v1781); 2125 vst1q_s16(out + out_stride * 117 + i, v1782); 2126 vst1q_s16(out + out_stride * 118 + i, v1783); 2127 vst1q_s16(out + out_stride * 119 + i, v1784); 2128 vst1q_s16(out + out_stride * 120 + i, v1785); 2129 vst1q_s16(out + out_stride * 121 + i, v1786); 2130 vst1q_s16(out + out_stride * 122 + i, v1787); 2131 vst1q_s16(out + out_stride * 123 + i, v1788); 2132 vst1q_s16(out + out_stride * 124 + i, v1789); 2133 vst1q_s16(out + out_stride * 125 + i, v1790); 2134 vst1q_s16(out + out_stride * 126 + i, v1791); 2135 vst1q_s16(out + out_stride * 127 + i, v1792); 2136 } 2137 }