fast_dct256-inl.h (236296B)
1 // Copyright (c) the JPEG XL Project Authors. All rights reserved. 2 // 3 // Use of this source code is governed by a BSD-style 4 // license that can be found in the LICENSE file. 5 6 /* This file is automatically generated. Do not modify it directly. */ 7 #if HWY_TARGET != HWY_NEON 8 #error "only include this file from fast_dct-inl.h" 9 #endif 10 11 constexpr size_t FastIDCTIntegerBits(FastDCTTag<256>) { return 3; } 12 13 void FastIDCT(FastDCTTag<256>, const int16_t* in, size_t in_stride, 14 int16_t* out, size_t out_stride, size_t count) { 15 JXL_ASSERT(count % 8 == 0); 16 for (size_t i = 0; i < count; i += 8) { 17 int16x8_t v0 = vld1q_s16(in + in_stride * 0 + i); 18 int16x8_t v1 = vld1q_s16(in + in_stride * 128 + i); 19 int16x8_t v2 = vaddq_s16(v0, v1); 20 int16x8_t v3 = vld1q_s16(in + in_stride * 64 + i); 21 int16x8_t v4_tmp = vqrdmulhq_n_s16(v3, 13573); 22 int16x8_t v4 = vaddq_s16(v4_tmp, v3); 23 int16x8_t v5 = vld1q_s16(in + in_stride * 192 + i); 24 int16x8_t v6 = vaddq_s16(v5, v3); 25 int16x8_t v7 = vaddq_s16(v4, v6); 26 int16x8_t v8 = vqrdmulhq_n_s16(v7, 17734); 27 int16x8_t v9 = vaddq_s16(v2, v8); 28 int16x8_t v10 = vld1q_s16(in + in_stride * 32 + i); 29 int16x8_t v11_tmp = vqrdmulhq_n_s16(v10, 13573); 30 int16x8_t v11 = vaddq_s16(v11_tmp, v10); 31 int16x8_t v12 = vld1q_s16(in + in_stride * 160 + i); 32 int16x8_t v13 = vld1q_s16(in + in_stride * 96 + i); 33 int16x8_t v14 = vaddq_s16(v12, v13); 34 int16x8_t v15 = vaddq_s16(v11, v14); 35 int16x8_t v16 = vaddq_s16(v13, v10); 36 int16x8_t v17_tmp = vqrdmulhq_n_s16(v16, 13573); 37 int16x8_t v17 = vaddq_s16(v17_tmp, v16); 38 int16x8_t v18 = vld1q_s16(in + in_stride * 224 + i); 39 int16x8_t v19 = vaddq_s16(v18, v12); 40 int16x8_t v20 = vaddq_s16(v19, v16); 41 int16x8_t v21 = vaddq_s16(v17, v20); 42 int16x8_t v22 = vqrdmulhq_n_s16(v21, 17734); 43 int16x8_t v23 = vaddq_s16(v15, v22); 44 int16x8_t v24 = vqrdmulhq_n_s16(v23, 16705); 45 int16x8_t v25 = vaddq_s16(v9, v24); 46 int16x8_t v26 = vld1q_s16(in + in_stride * 16 + i); 47 int16x8_t v27_tmp = vqrdmulhq_n_s16(v26, 13573); 48 int16x8_t v27 = vaddq_s16(v27_tmp, v26); 49 int16x8_t v28 = vld1q_s16(in + in_stride * 144 + i); 50 int16x8_t v29 = vld1q_s16(in + in_stride * 112 + i); 51 int16x8_t v30 = vaddq_s16(v28, v29); 52 int16x8_t v31 = vaddq_s16(v27, v30); 53 int16x8_t v32 = vld1q_s16(in + in_stride * 80 + i); 54 int16x8_t v33 = vld1q_s16(in + in_stride * 48 + i); 55 int16x8_t v34 = vaddq_s16(v32, v33); 56 int16x8_t v35_tmp = vqrdmulhq_n_s16(v34, 13573); 57 int16x8_t v35 = vaddq_s16(v35_tmp, v34); 58 int16x8_t v36 = vld1q_s16(in + in_stride * 208 + i); 59 int16x8_t v37 = vld1q_s16(in + in_stride * 176 + i); 60 int16x8_t v38 = vaddq_s16(v36, v37); 61 int16x8_t v39 = vaddq_s16(v38, v34); 62 int16x8_t v40 = vaddq_s16(v35, v39); 63 int16x8_t v41 = vqrdmulhq_n_s16(v40, 17734); 64 int16x8_t v42 = vaddq_s16(v31, v41); 65 int16x8_t v43 = vaddq_s16(v33, v26); 66 int16x8_t v44_tmp = vqrdmulhq_n_s16(v43, 13573); 67 int16x8_t v44 = vaddq_s16(v44_tmp, v43); 68 int16x8_t v45 = vaddq_s16(v37, v28); 69 int16x8_t v46 = vaddq_s16(v29, v32); 70 int16x8_t v47 = vaddq_s16(v45, v46); 71 int16x8_t v48 = vaddq_s16(v44, v47); 72 int16x8_t v49 = vaddq_s16(v46, v43); 73 int16x8_t v50_tmp = vqrdmulhq_n_s16(v49, 13573); 74 int16x8_t v50 = vaddq_s16(v50_tmp, v49); 75 int16x8_t v51 = vld1q_s16(in + in_stride * 240 + i); 76 int16x8_t v52 = vaddq_s16(v51, v36); 77 int16x8_t v53 = vaddq_s16(v52, v45); 78 int16x8_t v54 = vaddq_s16(v53, v49); 79 int16x8_t v55 = vaddq_s16(v50, v54); 80 int16x8_t v56 = vqrdmulhq_n_s16(v55, 17734); 81 int16x8_t v57 = vaddq_s16(v48, v56); 82 int16x8_t v58 = vqrdmulhq_n_s16(v57, 16705); 83 int16x8_t v59 = vaddq_s16(v42, v58); 84 int16x8_t v60 = vqrdmulhq_n_s16(v59, 16463); 85 int16x8_t v61 = vaddq_s16(v25, v60); 86 int16x8_t v62 = vld1q_s16(in + in_stride * 8 + i); 87 int16x8_t v63_tmp = vqrdmulhq_n_s16(v62, 13573); 88 int16x8_t v63 = vaddq_s16(v63_tmp, v62); 89 int16x8_t v64 = vld1q_s16(in + in_stride * 136 + i); 90 int16x8_t v65 = vld1q_s16(in + in_stride * 120 + i); 91 int16x8_t v66 = vaddq_s16(v64, v65); 92 int16x8_t v67 = vaddq_s16(v63, v66); 93 int16x8_t v68 = vld1q_s16(in + in_stride * 72 + i); 94 int16x8_t v69 = vld1q_s16(in + in_stride * 56 + i); 95 int16x8_t v70 = vaddq_s16(v68, v69); 96 int16x8_t v71_tmp = vqrdmulhq_n_s16(v70, 13573); 97 int16x8_t v71 = vaddq_s16(v71_tmp, v70); 98 int16x8_t v72 = vld1q_s16(in + in_stride * 200 + i); 99 int16x8_t v73 = vld1q_s16(in + in_stride * 184 + i); 100 int16x8_t v74 = vaddq_s16(v72, v73); 101 int16x8_t v75 = vaddq_s16(v74, v70); 102 int16x8_t v76 = vaddq_s16(v71, v75); 103 int16x8_t v77 = vqrdmulhq_n_s16(v76, 17734); 104 int16x8_t v78 = vaddq_s16(v67, v77); 105 int16x8_t v79 = vld1q_s16(in + in_stride * 40 + i); 106 int16x8_t v80 = vld1q_s16(in + in_stride * 24 + i); 107 int16x8_t v81 = vaddq_s16(v79, v80); 108 int16x8_t v82_tmp = vqrdmulhq_n_s16(v81, 13573); 109 int16x8_t v82 = vaddq_s16(v82_tmp, v81); 110 int16x8_t v83 = vld1q_s16(in + in_stride * 168 + i); 111 int16x8_t v84 = vld1q_s16(in + in_stride * 152 + i); 112 int16x8_t v85 = vaddq_s16(v83, v84); 113 int16x8_t v86 = vld1q_s16(in + in_stride * 104 + i); 114 int16x8_t v87 = vld1q_s16(in + in_stride * 88 + i); 115 int16x8_t v88 = vaddq_s16(v86, v87); 116 int16x8_t v89 = vaddq_s16(v85, v88); 117 int16x8_t v90 = vaddq_s16(v82, v89); 118 int16x8_t v91 = vaddq_s16(v88, v81); 119 int16x8_t v92_tmp = vqrdmulhq_n_s16(v91, 13573); 120 int16x8_t v92 = vaddq_s16(v92_tmp, v91); 121 int16x8_t v93 = vld1q_s16(in + in_stride * 232 + i); 122 int16x8_t v94 = vld1q_s16(in + in_stride * 216 + i); 123 int16x8_t v95 = vaddq_s16(v93, v94); 124 int16x8_t v96 = vaddq_s16(v95, v85); 125 int16x8_t v97 = vaddq_s16(v96, v91); 126 int16x8_t v98 = vaddq_s16(v92, v97); 127 int16x8_t v99 = vqrdmulhq_n_s16(v98, 17734); 128 int16x8_t v100 = vaddq_s16(v90, v99); 129 int16x8_t v101 = vqrdmulhq_n_s16(v100, 16705); 130 int16x8_t v102 = vaddq_s16(v78, v101); 131 int16x8_t v103 = vaddq_s16(v80, v62); 132 int16x8_t v104_tmp = vqrdmulhq_n_s16(v103, 13573); 133 int16x8_t v104 = vaddq_s16(v104_tmp, v103); 134 int16x8_t v105 = vaddq_s16(v84, v64); 135 int16x8_t v106 = vaddq_s16(v65, v86); 136 int16x8_t v107 = vaddq_s16(v105, v106); 137 int16x8_t v108 = vaddq_s16(v104, v107); 138 int16x8_t v109 = vaddq_s16(v87, v68); 139 int16x8_t v110 = vaddq_s16(v69, v79); 140 int16x8_t v111 = vaddq_s16(v109, v110); 141 int16x8_t v112_tmp = vqrdmulhq_n_s16(v111, 13573); 142 int16x8_t v112 = vaddq_s16(v112_tmp, v111); 143 int16x8_t v113 = vaddq_s16(v94, v72); 144 int16x8_t v114 = vaddq_s16(v73, v83); 145 int16x8_t v115 = vaddq_s16(v113, v114); 146 int16x8_t v116 = vaddq_s16(v115, v111); 147 int16x8_t v117 = vaddq_s16(v112, v116); 148 int16x8_t v118 = vqrdmulhq_n_s16(v117, 17734); 149 int16x8_t v119 = vaddq_s16(v108, v118); 150 int16x8_t v120 = vaddq_s16(v110, v103); 151 int16x8_t v121_tmp = vqrdmulhq_n_s16(v120, 13573); 152 int16x8_t v121 = vaddq_s16(v121_tmp, v120); 153 int16x8_t v122 = vaddq_s16(v114, v105); 154 int16x8_t v123 = vaddq_s16(v106, v109); 155 int16x8_t v124 = vaddq_s16(v122, v123); 156 int16x8_t v125 = vaddq_s16(v121, v124); 157 int16x8_t v126 = vaddq_s16(v123, v120); 158 int16x8_t v127_tmp = vqrdmulhq_n_s16(v126, 13573); 159 int16x8_t v127 = vaddq_s16(v127_tmp, v126); 160 int16x8_t v128 = vld1q_s16(in + in_stride * 248 + i); 161 int16x8_t v129 = vaddq_s16(v128, v93); 162 int16x8_t v130 = vaddq_s16(v129, v113); 163 int16x8_t v131 = vaddq_s16(v130, v122); 164 int16x8_t v132 = vaddq_s16(v131, v126); 165 int16x8_t v133 = vaddq_s16(v127, v132); 166 int16x8_t v134 = vqrdmulhq_n_s16(v133, 17734); 167 int16x8_t v135 = vaddq_s16(v125, v134); 168 int16x8_t v136 = vqrdmulhq_n_s16(v135, 16705); 169 int16x8_t v137 = vaddq_s16(v119, v136); 170 int16x8_t v138 = vqrdmulhq_n_s16(v137, 16463); 171 int16x8_t v139 = vaddq_s16(v102, v138); 172 int16x8_t v140 = vqrdmulhq_n_s16(v139, 16404); 173 int16x8_t v141 = vaddq_s16(v61, v140); 174 int16x8_t v142 = vld1q_s16(in + in_stride * 4 + i); 175 int16x8_t v143_tmp = vqrdmulhq_n_s16(v142, 13573); 176 int16x8_t v143 = vaddq_s16(v143_tmp, v142); 177 int16x8_t v144 = vld1q_s16(in + in_stride * 132 + i); 178 int16x8_t v145 = vld1q_s16(in + in_stride * 124 + i); 179 int16x8_t v146 = vaddq_s16(v144, v145); 180 int16x8_t v147 = vaddq_s16(v143, v146); 181 int16x8_t v148 = vld1q_s16(in + in_stride * 68 + i); 182 int16x8_t v149 = vld1q_s16(in + in_stride * 60 + i); 183 int16x8_t v150 = vaddq_s16(v148, v149); 184 int16x8_t v151_tmp = vqrdmulhq_n_s16(v150, 13573); 185 int16x8_t v151 = vaddq_s16(v151_tmp, v150); 186 int16x8_t v152 = vld1q_s16(in + in_stride * 196 + i); 187 int16x8_t v153 = vld1q_s16(in + in_stride * 188 + i); 188 int16x8_t v154 = vaddq_s16(v152, v153); 189 int16x8_t v155 = vaddq_s16(v154, v150); 190 int16x8_t v156 = vaddq_s16(v151, v155); 191 int16x8_t v157 = vqrdmulhq_n_s16(v156, 17734); 192 int16x8_t v158 = vaddq_s16(v147, v157); 193 int16x8_t v159 = vld1q_s16(in + in_stride * 36 + i); 194 int16x8_t v160 = vld1q_s16(in + in_stride * 28 + i); 195 int16x8_t v161 = vaddq_s16(v159, v160); 196 int16x8_t v162_tmp = vqrdmulhq_n_s16(v161, 13573); 197 int16x8_t v162 = vaddq_s16(v162_tmp, v161); 198 int16x8_t v163 = vld1q_s16(in + in_stride * 164 + i); 199 int16x8_t v164 = vld1q_s16(in + in_stride * 156 + i); 200 int16x8_t v165 = vaddq_s16(v163, v164); 201 int16x8_t v166 = vld1q_s16(in + in_stride * 100 + i); 202 int16x8_t v167 = vld1q_s16(in + in_stride * 92 + i); 203 int16x8_t v168 = vaddq_s16(v166, v167); 204 int16x8_t v169 = vaddq_s16(v165, v168); 205 int16x8_t v170 = vaddq_s16(v162, v169); 206 int16x8_t v171 = vaddq_s16(v168, v161); 207 int16x8_t v172_tmp = vqrdmulhq_n_s16(v171, 13573); 208 int16x8_t v172 = vaddq_s16(v172_tmp, v171); 209 int16x8_t v173 = vld1q_s16(in + in_stride * 228 + i); 210 int16x8_t v174 = vld1q_s16(in + in_stride * 220 + i); 211 int16x8_t v175 = vaddq_s16(v173, v174); 212 int16x8_t v176 = vaddq_s16(v175, v165); 213 int16x8_t v177 = vaddq_s16(v176, v171); 214 int16x8_t v178 = vaddq_s16(v172, v177); 215 int16x8_t v179 = vqrdmulhq_n_s16(v178, 17734); 216 int16x8_t v180 = vaddq_s16(v170, v179); 217 int16x8_t v181 = vqrdmulhq_n_s16(v180, 16705); 218 int16x8_t v182 = vaddq_s16(v158, v181); 219 int16x8_t v183 = vld1q_s16(in + in_stride * 20 + i); 220 int16x8_t v184 = vld1q_s16(in + in_stride * 12 + i); 221 int16x8_t v185 = vaddq_s16(v183, v184); 222 int16x8_t v186_tmp = vqrdmulhq_n_s16(v185, 13573); 223 int16x8_t v186 = vaddq_s16(v186_tmp, v185); 224 int16x8_t v187 = vld1q_s16(in + in_stride * 148 + i); 225 int16x8_t v188 = vld1q_s16(in + in_stride * 140 + i); 226 int16x8_t v189 = vaddq_s16(v187, v188); 227 int16x8_t v190 = vld1q_s16(in + in_stride * 116 + i); 228 int16x8_t v191 = vld1q_s16(in + in_stride * 108 + i); 229 int16x8_t v192 = vaddq_s16(v190, v191); 230 int16x8_t v193 = vaddq_s16(v189, v192); 231 int16x8_t v194 = vaddq_s16(v186, v193); 232 int16x8_t v195 = vld1q_s16(in + in_stride * 84 + i); 233 int16x8_t v196 = vld1q_s16(in + in_stride * 76 + i); 234 int16x8_t v197 = vaddq_s16(v195, v196); 235 int16x8_t v198 = vld1q_s16(in + in_stride * 52 + i); 236 int16x8_t v199 = vld1q_s16(in + in_stride * 44 + i); 237 int16x8_t v200 = vaddq_s16(v198, v199); 238 int16x8_t v201 = vaddq_s16(v197, v200); 239 int16x8_t v202_tmp = vqrdmulhq_n_s16(v201, 13573); 240 int16x8_t v202 = vaddq_s16(v202_tmp, v201); 241 int16x8_t v203 = vld1q_s16(in + in_stride * 212 + i); 242 int16x8_t v204 = vld1q_s16(in + in_stride * 204 + i); 243 int16x8_t v205 = vaddq_s16(v203, v204); 244 int16x8_t v206 = vld1q_s16(in + in_stride * 180 + i); 245 int16x8_t v207 = vld1q_s16(in + in_stride * 172 + i); 246 int16x8_t v208 = vaddq_s16(v206, v207); 247 int16x8_t v209 = vaddq_s16(v205, v208); 248 int16x8_t v210 = vaddq_s16(v209, v201); 249 int16x8_t v211 = vaddq_s16(v202, v210); 250 int16x8_t v212 = vqrdmulhq_n_s16(v211, 17734); 251 int16x8_t v213 = vaddq_s16(v194, v212); 252 int16x8_t v214 = vaddq_s16(v200, v185); 253 int16x8_t v215_tmp = vqrdmulhq_n_s16(v214, 13573); 254 int16x8_t v215 = vaddq_s16(v215_tmp, v214); 255 int16x8_t v216 = vaddq_s16(v208, v189); 256 int16x8_t v217 = vaddq_s16(v192, v197); 257 int16x8_t v218 = vaddq_s16(v216, v217); 258 int16x8_t v219 = vaddq_s16(v215, v218); 259 int16x8_t v220 = vaddq_s16(v217, v214); 260 int16x8_t v221_tmp = vqrdmulhq_n_s16(v220, 13573); 261 int16x8_t v221 = vaddq_s16(v221_tmp, v220); 262 int16x8_t v222 = vld1q_s16(in + in_stride * 244 + i); 263 int16x8_t v223 = vld1q_s16(in + in_stride * 236 + i); 264 int16x8_t v224 = vaddq_s16(v222, v223); 265 int16x8_t v225 = vaddq_s16(v224, v205); 266 int16x8_t v226 = vaddq_s16(v225, v216); 267 int16x8_t v227 = vaddq_s16(v226, v220); 268 int16x8_t v228 = vaddq_s16(v221, v227); 269 int16x8_t v229 = vqrdmulhq_n_s16(v228, 17734); 270 int16x8_t v230 = vaddq_s16(v219, v229); 271 int16x8_t v231 = vqrdmulhq_n_s16(v230, 16705); 272 int16x8_t v232 = vaddq_s16(v213, v231); 273 int16x8_t v233 = vqrdmulhq_n_s16(v232, 16463); 274 int16x8_t v234 = vaddq_s16(v182, v233); 275 int16x8_t v235 = vaddq_s16(v184, v142); 276 int16x8_t v236_tmp = vqrdmulhq_n_s16(v235, 13573); 277 int16x8_t v236 = vaddq_s16(v236_tmp, v235); 278 int16x8_t v237 = vaddq_s16(v188, v144); 279 int16x8_t v238 = vaddq_s16(v145, v190); 280 int16x8_t v239 = vaddq_s16(v237, v238); 281 int16x8_t v240 = vaddq_s16(v236, v239); 282 int16x8_t v241 = vaddq_s16(v196, v148); 283 int16x8_t v242 = vaddq_s16(v149, v198); 284 int16x8_t v243 = vaddq_s16(v241, v242); 285 int16x8_t v244_tmp = vqrdmulhq_n_s16(v243, 13573); 286 int16x8_t v244 = vaddq_s16(v244_tmp, v243); 287 int16x8_t v245 = vaddq_s16(v204, v152); 288 int16x8_t v246 = vaddq_s16(v153, v206); 289 int16x8_t v247 = vaddq_s16(v245, v246); 290 int16x8_t v248 = vaddq_s16(v247, v243); 291 int16x8_t v249 = vaddq_s16(v244, v248); 292 int16x8_t v250 = vqrdmulhq_n_s16(v249, 17734); 293 int16x8_t v251 = vaddq_s16(v240, v250); 294 int16x8_t v252 = vaddq_s16(v199, v159); 295 int16x8_t v253 = vaddq_s16(v160, v183); 296 int16x8_t v254 = vaddq_s16(v252, v253); 297 int16x8_t v255_tmp = vqrdmulhq_n_s16(v254, 13573); 298 int16x8_t v255 = vaddq_s16(v255_tmp, v254); 299 int16x8_t v256 = vaddq_s16(v207, v163); 300 int16x8_t v257 = vaddq_s16(v164, v187); 301 int16x8_t v258 = vaddq_s16(v256, v257); 302 int16x8_t v259 = vaddq_s16(v191, v166); 303 int16x8_t v260 = vaddq_s16(v167, v195); 304 int16x8_t v261 = vaddq_s16(v259, v260); 305 int16x8_t v262 = vaddq_s16(v258, v261); 306 int16x8_t v263 = vaddq_s16(v255, v262); 307 int16x8_t v264 = vaddq_s16(v261, v254); 308 int16x8_t v265_tmp = vqrdmulhq_n_s16(v264, 13573); 309 int16x8_t v265 = vaddq_s16(v265_tmp, v264); 310 int16x8_t v266 = vaddq_s16(v223, v173); 311 int16x8_t v267 = vaddq_s16(v174, v203); 312 int16x8_t v268 = vaddq_s16(v266, v267); 313 int16x8_t v269 = vaddq_s16(v268, v258); 314 int16x8_t v270 = vaddq_s16(v269, v264); 315 int16x8_t v271 = vaddq_s16(v265, v270); 316 int16x8_t v272 = vqrdmulhq_n_s16(v271, 17734); 317 int16x8_t v273 = vaddq_s16(v263, v272); 318 int16x8_t v274 = vqrdmulhq_n_s16(v273, 16705); 319 int16x8_t v275 = vaddq_s16(v251, v274); 320 int16x8_t v276 = vaddq_s16(v253, v235); 321 int16x8_t v277_tmp = vqrdmulhq_n_s16(v276, 13573); 322 int16x8_t v277 = vaddq_s16(v277_tmp, v276); 323 int16x8_t v278 = vaddq_s16(v257, v237); 324 int16x8_t v279 = vaddq_s16(v238, v259); 325 int16x8_t v280 = vaddq_s16(v278, v279); 326 int16x8_t v281 = vaddq_s16(v277, v280); 327 int16x8_t v282 = vaddq_s16(v260, v241); 328 int16x8_t v283 = vaddq_s16(v242, v252); 329 int16x8_t v284 = vaddq_s16(v282, v283); 330 int16x8_t v285_tmp = vqrdmulhq_n_s16(v284, 13573); 331 int16x8_t v285 = vaddq_s16(v285_tmp, v284); 332 int16x8_t v286 = vaddq_s16(v267, v245); 333 int16x8_t v287 = vaddq_s16(v246, v256); 334 int16x8_t v288 = vaddq_s16(v286, v287); 335 int16x8_t v289 = vaddq_s16(v288, v284); 336 int16x8_t v290 = vaddq_s16(v285, v289); 337 int16x8_t v291 = vqrdmulhq_n_s16(v290, 17734); 338 int16x8_t v292 = vaddq_s16(v281, v291); 339 int16x8_t v293 = vaddq_s16(v283, v276); 340 int16x8_t v294_tmp = vqrdmulhq_n_s16(v293, 13573); 341 int16x8_t v294 = vaddq_s16(v294_tmp, v293); 342 int16x8_t v295 = vaddq_s16(v287, v278); 343 int16x8_t v296 = vaddq_s16(v279, v282); 344 int16x8_t v297 = vaddq_s16(v295, v296); 345 int16x8_t v298 = vaddq_s16(v294, v297); 346 int16x8_t v299 = vaddq_s16(v296, v293); 347 int16x8_t v300_tmp = vqrdmulhq_n_s16(v299, 13573); 348 int16x8_t v300 = vaddq_s16(v300_tmp, v299); 349 int16x8_t v301 = vld1q_s16(in + in_stride * 252 + i); 350 int16x8_t v302 = vaddq_s16(v301, v222); 351 int16x8_t v303 = vaddq_s16(v302, v266); 352 int16x8_t v304 = vaddq_s16(v303, v286); 353 int16x8_t v305 = vaddq_s16(v304, v295); 354 int16x8_t v306 = vaddq_s16(v305, v299); 355 int16x8_t v307 = vaddq_s16(v300, v306); 356 int16x8_t v308 = vqrdmulhq_n_s16(v307, 17734); 357 int16x8_t v309 = vaddq_s16(v298, v308); 358 int16x8_t v310 = vqrdmulhq_n_s16(v309, 16705); 359 int16x8_t v311 = vaddq_s16(v292, v310); 360 int16x8_t v312 = vqrdmulhq_n_s16(v311, 16463); 361 int16x8_t v313 = vaddq_s16(v275, v312); 362 int16x8_t v314 = vqrdmulhq_n_s16(v313, 16404); 363 int16x8_t v315 = vaddq_s16(v234, v314); 364 int16x8_t v316 = vqrdmulhq_n_s16(v315, 16389); 365 int16x8_t v317 = vaddq_s16(v141, v316); 366 int16x8_t v318 = vld1q_s16(in + in_stride * 2 + i); 367 int16x8_t v319_tmp = vqrdmulhq_n_s16(v318, 13573); 368 int16x8_t v319 = vaddq_s16(v319_tmp, v318); 369 int16x8_t v320 = vld1q_s16(in + in_stride * 130 + i); 370 int16x8_t v321 = vld1q_s16(in + in_stride * 126 + i); 371 int16x8_t v322 = vaddq_s16(v320, v321); 372 int16x8_t v323 = vaddq_s16(v319, v322); 373 int16x8_t v324 = vld1q_s16(in + in_stride * 66 + i); 374 int16x8_t v325 = vld1q_s16(in + in_stride * 62 + i); 375 int16x8_t v326 = vaddq_s16(v324, v325); 376 int16x8_t v327_tmp = vqrdmulhq_n_s16(v326, 13573); 377 int16x8_t v327 = vaddq_s16(v327_tmp, v326); 378 int16x8_t v328 = vld1q_s16(in + in_stride * 194 + i); 379 int16x8_t v329 = vld1q_s16(in + in_stride * 190 + i); 380 int16x8_t v330 = vaddq_s16(v328, v329); 381 int16x8_t v331 = vaddq_s16(v330, v326); 382 int16x8_t v332 = vaddq_s16(v327, v331); 383 int16x8_t v333 = vqrdmulhq_n_s16(v332, 17734); 384 int16x8_t v334 = vaddq_s16(v323, v333); 385 int16x8_t v335 = vld1q_s16(in + in_stride * 34 + i); 386 int16x8_t v336 = vld1q_s16(in + in_stride * 30 + i); 387 int16x8_t v337 = vaddq_s16(v335, v336); 388 int16x8_t v338_tmp = vqrdmulhq_n_s16(v337, 13573); 389 int16x8_t v338 = vaddq_s16(v338_tmp, v337); 390 int16x8_t v339 = vld1q_s16(in + in_stride * 162 + i); 391 int16x8_t v340 = vld1q_s16(in + in_stride * 158 + i); 392 int16x8_t v341 = vaddq_s16(v339, v340); 393 int16x8_t v342 = vld1q_s16(in + in_stride * 98 + i); 394 int16x8_t v343 = vld1q_s16(in + in_stride * 94 + i); 395 int16x8_t v344 = vaddq_s16(v342, v343); 396 int16x8_t v345 = vaddq_s16(v341, v344); 397 int16x8_t v346 = vaddq_s16(v338, v345); 398 int16x8_t v347 = vaddq_s16(v344, v337); 399 int16x8_t v348_tmp = vqrdmulhq_n_s16(v347, 13573); 400 int16x8_t v348 = vaddq_s16(v348_tmp, v347); 401 int16x8_t v349 = vld1q_s16(in + in_stride * 226 + i); 402 int16x8_t v350 = vld1q_s16(in + in_stride * 222 + i); 403 int16x8_t v351 = vaddq_s16(v349, v350); 404 int16x8_t v352 = vaddq_s16(v351, v341); 405 int16x8_t v353 = vaddq_s16(v352, v347); 406 int16x8_t v354 = vaddq_s16(v348, v353); 407 int16x8_t v355 = vqrdmulhq_n_s16(v354, 17734); 408 int16x8_t v356 = vaddq_s16(v346, v355); 409 int16x8_t v357 = vqrdmulhq_n_s16(v356, 16705); 410 int16x8_t v358 = vaddq_s16(v334, v357); 411 int16x8_t v359 = vld1q_s16(in + in_stride * 18 + i); 412 int16x8_t v360 = vld1q_s16(in + in_stride * 14 + i); 413 int16x8_t v361 = vaddq_s16(v359, v360); 414 int16x8_t v362_tmp = vqrdmulhq_n_s16(v361, 13573); 415 int16x8_t v362 = vaddq_s16(v362_tmp, v361); 416 int16x8_t v363 = vld1q_s16(in + in_stride * 146 + i); 417 int16x8_t v364 = vld1q_s16(in + in_stride * 142 + i); 418 int16x8_t v365 = vaddq_s16(v363, v364); 419 int16x8_t v366 = vld1q_s16(in + in_stride * 114 + i); 420 int16x8_t v367 = vld1q_s16(in + in_stride * 110 + i); 421 int16x8_t v368 = vaddq_s16(v366, v367); 422 int16x8_t v369 = vaddq_s16(v365, v368); 423 int16x8_t v370 = vaddq_s16(v362, v369); 424 int16x8_t v371 = vld1q_s16(in + in_stride * 82 + i); 425 int16x8_t v372 = vld1q_s16(in + in_stride * 78 + i); 426 int16x8_t v373 = vaddq_s16(v371, v372); 427 int16x8_t v374 = vld1q_s16(in + in_stride * 50 + i); 428 int16x8_t v375 = vld1q_s16(in + in_stride * 46 + i); 429 int16x8_t v376 = vaddq_s16(v374, v375); 430 int16x8_t v377 = vaddq_s16(v373, v376); 431 int16x8_t v378_tmp = vqrdmulhq_n_s16(v377, 13573); 432 int16x8_t v378 = vaddq_s16(v378_tmp, v377); 433 int16x8_t v379 = vld1q_s16(in + in_stride * 210 + i); 434 int16x8_t v380 = vld1q_s16(in + in_stride * 206 + i); 435 int16x8_t v381 = vaddq_s16(v379, v380); 436 int16x8_t v382 = vld1q_s16(in + in_stride * 178 + i); 437 int16x8_t v383 = vld1q_s16(in + in_stride * 174 + i); 438 int16x8_t v384 = vaddq_s16(v382, v383); 439 int16x8_t v385 = vaddq_s16(v381, v384); 440 int16x8_t v386 = vaddq_s16(v385, v377); 441 int16x8_t v387 = vaddq_s16(v378, v386); 442 int16x8_t v388 = vqrdmulhq_n_s16(v387, 17734); 443 int16x8_t v389 = vaddq_s16(v370, v388); 444 int16x8_t v390 = vaddq_s16(v376, v361); 445 int16x8_t v391_tmp = vqrdmulhq_n_s16(v390, 13573); 446 int16x8_t v391 = vaddq_s16(v391_tmp, v390); 447 int16x8_t v392 = vaddq_s16(v384, v365); 448 int16x8_t v393 = vaddq_s16(v368, v373); 449 int16x8_t v394 = vaddq_s16(v392, v393); 450 int16x8_t v395 = vaddq_s16(v391, v394); 451 int16x8_t v396 = vaddq_s16(v393, v390); 452 int16x8_t v397_tmp = vqrdmulhq_n_s16(v396, 13573); 453 int16x8_t v397 = vaddq_s16(v397_tmp, v396); 454 int16x8_t v398 = vld1q_s16(in + in_stride * 242 + i); 455 int16x8_t v399 = vld1q_s16(in + in_stride * 238 + i); 456 int16x8_t v400 = vaddq_s16(v398, v399); 457 int16x8_t v401 = vaddq_s16(v400, v381); 458 int16x8_t v402 = vaddq_s16(v401, v392); 459 int16x8_t v403 = vaddq_s16(v402, v396); 460 int16x8_t v404 = vaddq_s16(v397, v403); 461 int16x8_t v405 = vqrdmulhq_n_s16(v404, 17734); 462 int16x8_t v406 = vaddq_s16(v395, v405); 463 int16x8_t v407 = vqrdmulhq_n_s16(v406, 16705); 464 int16x8_t v408 = vaddq_s16(v389, v407); 465 int16x8_t v409 = vqrdmulhq_n_s16(v408, 16463); 466 int16x8_t v410 = vaddq_s16(v358, v409); 467 int16x8_t v411 = vld1q_s16(in + in_stride * 10 + i); 468 int16x8_t v412 = vld1q_s16(in + in_stride * 6 + i); 469 int16x8_t v413 = vaddq_s16(v411, v412); 470 int16x8_t v414_tmp = vqrdmulhq_n_s16(v413, 13573); 471 int16x8_t v414 = vaddq_s16(v414_tmp, v413); 472 int16x8_t v415 = vld1q_s16(in + in_stride * 138 + i); 473 int16x8_t v416 = vld1q_s16(in + in_stride * 134 + i); 474 int16x8_t v417 = vaddq_s16(v415, v416); 475 int16x8_t v418 = vld1q_s16(in + in_stride * 122 + i); 476 int16x8_t v419 = vld1q_s16(in + in_stride * 118 + i); 477 int16x8_t v420 = vaddq_s16(v418, v419); 478 int16x8_t v421 = vaddq_s16(v417, v420); 479 int16x8_t v422 = vaddq_s16(v414, v421); 480 int16x8_t v423 = vld1q_s16(in + in_stride * 74 + i); 481 int16x8_t v424 = vld1q_s16(in + in_stride * 70 + i); 482 int16x8_t v425 = vaddq_s16(v423, v424); 483 int16x8_t v426 = vld1q_s16(in + in_stride * 58 + i); 484 int16x8_t v427 = vld1q_s16(in + in_stride * 54 + i); 485 int16x8_t v428 = vaddq_s16(v426, v427); 486 int16x8_t v429 = vaddq_s16(v425, v428); 487 int16x8_t v430_tmp = vqrdmulhq_n_s16(v429, 13573); 488 int16x8_t v430 = vaddq_s16(v430_tmp, v429); 489 int16x8_t v431 = vld1q_s16(in + in_stride * 202 + i); 490 int16x8_t v432 = vld1q_s16(in + in_stride * 198 + i); 491 int16x8_t v433 = vaddq_s16(v431, v432); 492 int16x8_t v434 = vld1q_s16(in + in_stride * 186 + i); 493 int16x8_t v435 = vld1q_s16(in + in_stride * 182 + i); 494 int16x8_t v436 = vaddq_s16(v434, v435); 495 int16x8_t v437 = vaddq_s16(v433, v436); 496 int16x8_t v438 = vaddq_s16(v437, v429); 497 int16x8_t v439 = vaddq_s16(v430, v438); 498 int16x8_t v440 = vqrdmulhq_n_s16(v439, 17734); 499 int16x8_t v441 = vaddq_s16(v422, v440); 500 int16x8_t v442 = vld1q_s16(in + in_stride * 42 + i); 501 int16x8_t v443 = vld1q_s16(in + in_stride * 38 + i); 502 int16x8_t v444 = vaddq_s16(v442, v443); 503 int16x8_t v445 = vld1q_s16(in + in_stride * 26 + i); 504 int16x8_t v446 = vld1q_s16(in + in_stride * 22 + i); 505 int16x8_t v447 = vaddq_s16(v445, v446); 506 int16x8_t v448 = vaddq_s16(v444, v447); 507 int16x8_t v449_tmp = vqrdmulhq_n_s16(v448, 13573); 508 int16x8_t v449 = vaddq_s16(v449_tmp, v448); 509 int16x8_t v450 = vld1q_s16(in + in_stride * 170 + i); 510 int16x8_t v451 = vld1q_s16(in + in_stride * 166 + i); 511 int16x8_t v452 = vaddq_s16(v450, v451); 512 int16x8_t v453 = vld1q_s16(in + in_stride * 154 + i); 513 int16x8_t v454 = vld1q_s16(in + in_stride * 150 + i); 514 int16x8_t v455 = vaddq_s16(v453, v454); 515 int16x8_t v456 = vaddq_s16(v452, v455); 516 int16x8_t v457 = vld1q_s16(in + in_stride * 106 + i); 517 int16x8_t v458 = vld1q_s16(in + in_stride * 102 + i); 518 int16x8_t v459 = vaddq_s16(v457, v458); 519 int16x8_t v460 = vld1q_s16(in + in_stride * 90 + i); 520 int16x8_t v461 = vld1q_s16(in + in_stride * 86 + i); 521 int16x8_t v462 = vaddq_s16(v460, v461); 522 int16x8_t v463 = vaddq_s16(v459, v462); 523 int16x8_t v464 = vaddq_s16(v456, v463); 524 int16x8_t v465 = vaddq_s16(v449, v464); 525 int16x8_t v466 = vaddq_s16(v463, v448); 526 int16x8_t v467_tmp = vqrdmulhq_n_s16(v466, 13573); 527 int16x8_t v467 = vaddq_s16(v467_tmp, v466); 528 int16x8_t v468 = vld1q_s16(in + in_stride * 234 + i); 529 int16x8_t v469 = vld1q_s16(in + in_stride * 230 + i); 530 int16x8_t v470 = vaddq_s16(v468, v469); 531 int16x8_t v471 = vld1q_s16(in + in_stride * 218 + i); 532 int16x8_t v472 = vld1q_s16(in + in_stride * 214 + i); 533 int16x8_t v473 = vaddq_s16(v471, v472); 534 int16x8_t v474 = vaddq_s16(v470, v473); 535 int16x8_t v475 = vaddq_s16(v474, v456); 536 int16x8_t v476 = vaddq_s16(v475, v466); 537 int16x8_t v477 = vaddq_s16(v467, v476); 538 int16x8_t v478 = vqrdmulhq_n_s16(v477, 17734); 539 int16x8_t v479 = vaddq_s16(v465, v478); 540 int16x8_t v480 = vqrdmulhq_n_s16(v479, 16705); 541 int16x8_t v481 = vaddq_s16(v441, v480); 542 int16x8_t v482 = vaddq_s16(v447, v413); 543 int16x8_t v483_tmp = vqrdmulhq_n_s16(v482, 13573); 544 int16x8_t v483 = vaddq_s16(v483_tmp, v482); 545 int16x8_t v484 = vaddq_s16(v455, v417); 546 int16x8_t v485 = vaddq_s16(v420, v459); 547 int16x8_t v486 = vaddq_s16(v484, v485); 548 int16x8_t v487 = vaddq_s16(v483, v486); 549 int16x8_t v488 = vaddq_s16(v462, v425); 550 int16x8_t v489 = vaddq_s16(v428, v444); 551 int16x8_t v490 = vaddq_s16(v488, v489); 552 int16x8_t v491_tmp = vqrdmulhq_n_s16(v490, 13573); 553 int16x8_t v491 = vaddq_s16(v491_tmp, v490); 554 int16x8_t v492 = vaddq_s16(v473, v433); 555 int16x8_t v493 = vaddq_s16(v436, v452); 556 int16x8_t v494 = vaddq_s16(v492, v493); 557 int16x8_t v495 = vaddq_s16(v494, v490); 558 int16x8_t v496 = vaddq_s16(v491, v495); 559 int16x8_t v497 = vqrdmulhq_n_s16(v496, 17734); 560 int16x8_t v498 = vaddq_s16(v487, v497); 561 int16x8_t v499 = vaddq_s16(v489, v482); 562 int16x8_t v500_tmp = vqrdmulhq_n_s16(v499, 13573); 563 int16x8_t v500 = vaddq_s16(v500_tmp, v499); 564 int16x8_t v501 = vaddq_s16(v493, v484); 565 int16x8_t v502 = vaddq_s16(v485, v488); 566 int16x8_t v503 = vaddq_s16(v501, v502); 567 int16x8_t v504 = vaddq_s16(v500, v503); 568 int16x8_t v505 = vaddq_s16(v502, v499); 569 int16x8_t v506_tmp = vqrdmulhq_n_s16(v505, 13573); 570 int16x8_t v506 = vaddq_s16(v506_tmp, v505); 571 int16x8_t v507 = vld1q_s16(in + in_stride * 250 + i); 572 int16x8_t v508 = vld1q_s16(in + in_stride * 246 + i); 573 int16x8_t v509 = vaddq_s16(v507, v508); 574 int16x8_t v510 = vaddq_s16(v509, v470); 575 int16x8_t v511 = vaddq_s16(v510, v492); 576 int16x8_t v512 = vaddq_s16(v511, v501); 577 int16x8_t v513 = vaddq_s16(v512, v505); 578 int16x8_t v514 = vaddq_s16(v506, v513); 579 int16x8_t v515 = vqrdmulhq_n_s16(v514, 17734); 580 int16x8_t v516 = vaddq_s16(v504, v515); 581 int16x8_t v517 = vqrdmulhq_n_s16(v516, 16705); 582 int16x8_t v518 = vaddq_s16(v498, v517); 583 int16x8_t v519 = vqrdmulhq_n_s16(v518, 16463); 584 int16x8_t v520 = vaddq_s16(v481, v519); 585 int16x8_t v521 = vqrdmulhq_n_s16(v520, 16404); 586 int16x8_t v522 = vaddq_s16(v410, v521); 587 int16x8_t v523 = vaddq_s16(v412, v318); 588 int16x8_t v524_tmp = vqrdmulhq_n_s16(v523, 13573); 589 int16x8_t v524 = vaddq_s16(v524_tmp, v523); 590 int16x8_t v525 = vaddq_s16(v416, v320); 591 int16x8_t v526 = vaddq_s16(v321, v418); 592 int16x8_t v527 = vaddq_s16(v525, v526); 593 int16x8_t v528 = vaddq_s16(v524, v527); 594 int16x8_t v529 = vaddq_s16(v424, v324); 595 int16x8_t v530 = vaddq_s16(v325, v426); 596 int16x8_t v531 = vaddq_s16(v529, v530); 597 int16x8_t v532_tmp = vqrdmulhq_n_s16(v531, 13573); 598 int16x8_t v532 = vaddq_s16(v532_tmp, v531); 599 int16x8_t v533 = vaddq_s16(v432, v328); 600 int16x8_t v534 = vaddq_s16(v329, v434); 601 int16x8_t v535 = vaddq_s16(v533, v534); 602 int16x8_t v536 = vaddq_s16(v535, v531); 603 int16x8_t v537 = vaddq_s16(v532, v536); 604 int16x8_t v538 = vqrdmulhq_n_s16(v537, 17734); 605 int16x8_t v539 = vaddq_s16(v528, v538); 606 int16x8_t v540 = vaddq_s16(v443, v335); 607 int16x8_t v541 = vaddq_s16(v336, v445); 608 int16x8_t v542 = vaddq_s16(v540, v541); 609 int16x8_t v543_tmp = vqrdmulhq_n_s16(v542, 13573); 610 int16x8_t v543 = vaddq_s16(v543_tmp, v542); 611 int16x8_t v544 = vaddq_s16(v451, v339); 612 int16x8_t v545 = vaddq_s16(v340, v453); 613 int16x8_t v546 = vaddq_s16(v544, v545); 614 int16x8_t v547 = vaddq_s16(v458, v342); 615 int16x8_t v548 = vaddq_s16(v343, v460); 616 int16x8_t v549 = vaddq_s16(v547, v548); 617 int16x8_t v550 = vaddq_s16(v546, v549); 618 int16x8_t v551 = vaddq_s16(v543, v550); 619 int16x8_t v552 = vaddq_s16(v549, v542); 620 int16x8_t v553_tmp = vqrdmulhq_n_s16(v552, 13573); 621 int16x8_t v553 = vaddq_s16(v553_tmp, v552); 622 int16x8_t v554 = vaddq_s16(v469, v349); 623 int16x8_t v555 = vaddq_s16(v350, v471); 624 int16x8_t v556 = vaddq_s16(v554, v555); 625 int16x8_t v557 = vaddq_s16(v556, v546); 626 int16x8_t v558 = vaddq_s16(v557, v552); 627 int16x8_t v559 = vaddq_s16(v553, v558); 628 int16x8_t v560 = vqrdmulhq_n_s16(v559, 17734); 629 int16x8_t v561 = vaddq_s16(v551, v560); 630 int16x8_t v562 = vqrdmulhq_n_s16(v561, 16705); 631 int16x8_t v563 = vaddq_s16(v539, v562); 632 int16x8_t v564 = vaddq_s16(v446, v359); 633 int16x8_t v565 = vaddq_s16(v360, v411); 634 int16x8_t v566 = vaddq_s16(v564, v565); 635 int16x8_t v567_tmp = vqrdmulhq_n_s16(v566, 13573); 636 int16x8_t v567 = vaddq_s16(v567_tmp, v566); 637 int16x8_t v568 = vaddq_s16(v454, v363); 638 int16x8_t v569 = vaddq_s16(v364, v415); 639 int16x8_t v570 = vaddq_s16(v568, v569); 640 int16x8_t v571 = vaddq_s16(v419, v366); 641 int16x8_t v572 = vaddq_s16(v367, v457); 642 int16x8_t v573 = vaddq_s16(v571, v572); 643 int16x8_t v574 = vaddq_s16(v570, v573); 644 int16x8_t v575 = vaddq_s16(v567, v574); 645 int16x8_t v576 = vaddq_s16(v461, v371); 646 int16x8_t v577 = vaddq_s16(v372, v423); 647 int16x8_t v578 = vaddq_s16(v576, v577); 648 int16x8_t v579 = vaddq_s16(v427, v374); 649 int16x8_t v580 = vaddq_s16(v375, v442); 650 int16x8_t v581 = vaddq_s16(v579, v580); 651 int16x8_t v582 = vaddq_s16(v578, v581); 652 int16x8_t v583_tmp = vqrdmulhq_n_s16(v582, 13573); 653 int16x8_t v583 = vaddq_s16(v583_tmp, v582); 654 int16x8_t v584 = vaddq_s16(v472, v379); 655 int16x8_t v585 = vaddq_s16(v380, v431); 656 int16x8_t v586 = vaddq_s16(v584, v585); 657 int16x8_t v587 = vaddq_s16(v435, v382); 658 int16x8_t v588 = vaddq_s16(v383, v450); 659 int16x8_t v589 = vaddq_s16(v587, v588); 660 int16x8_t v590 = vaddq_s16(v586, v589); 661 int16x8_t v591 = vaddq_s16(v590, v582); 662 int16x8_t v592 = vaddq_s16(v583, v591); 663 int16x8_t v593 = vqrdmulhq_n_s16(v592, 17734); 664 int16x8_t v594 = vaddq_s16(v575, v593); 665 int16x8_t v595 = vaddq_s16(v581, v566); 666 int16x8_t v596_tmp = vqrdmulhq_n_s16(v595, 13573); 667 int16x8_t v596 = vaddq_s16(v596_tmp, v595); 668 int16x8_t v597 = vaddq_s16(v589, v570); 669 int16x8_t v598 = vaddq_s16(v573, v578); 670 int16x8_t v599 = vaddq_s16(v597, v598); 671 int16x8_t v600 = vaddq_s16(v596, v599); 672 int16x8_t v601 = vaddq_s16(v598, v595); 673 int16x8_t v602_tmp = vqrdmulhq_n_s16(v601, 13573); 674 int16x8_t v602 = vaddq_s16(v602_tmp, v601); 675 int16x8_t v603 = vaddq_s16(v508, v398); 676 int16x8_t v604 = vaddq_s16(v399, v468); 677 int16x8_t v605 = vaddq_s16(v603, v604); 678 int16x8_t v606 = vaddq_s16(v605, v586); 679 int16x8_t v607 = vaddq_s16(v606, v597); 680 int16x8_t v608 = vaddq_s16(v607, v601); 681 int16x8_t v609 = vaddq_s16(v602, v608); 682 int16x8_t v610 = vqrdmulhq_n_s16(v609, 17734); 683 int16x8_t v611 = vaddq_s16(v600, v610); 684 int16x8_t v612 = vqrdmulhq_n_s16(v611, 16705); 685 int16x8_t v613 = vaddq_s16(v594, v612); 686 int16x8_t v614 = vqrdmulhq_n_s16(v613, 16463); 687 int16x8_t v615 = vaddq_s16(v563, v614); 688 int16x8_t v616 = vaddq_s16(v565, v523); 689 int16x8_t v617_tmp = vqrdmulhq_n_s16(v616, 13573); 690 int16x8_t v617 = vaddq_s16(v617_tmp, v616); 691 int16x8_t v618 = vaddq_s16(v569, v525); 692 int16x8_t v619 = vaddq_s16(v526, v571); 693 int16x8_t v620 = vaddq_s16(v618, v619); 694 int16x8_t v621 = vaddq_s16(v617, v620); 695 int16x8_t v622 = vaddq_s16(v577, v529); 696 int16x8_t v623 = vaddq_s16(v530, v579); 697 int16x8_t v624 = vaddq_s16(v622, v623); 698 int16x8_t v625_tmp = vqrdmulhq_n_s16(v624, 13573); 699 int16x8_t v625 = vaddq_s16(v625_tmp, v624); 700 int16x8_t v626 = vaddq_s16(v585, v533); 701 int16x8_t v627 = vaddq_s16(v534, v587); 702 int16x8_t v628 = vaddq_s16(v626, v627); 703 int16x8_t v629 = vaddq_s16(v628, v624); 704 int16x8_t v630 = vaddq_s16(v625, v629); 705 int16x8_t v631 = vqrdmulhq_n_s16(v630, 17734); 706 int16x8_t v632 = vaddq_s16(v621, v631); 707 int16x8_t v633 = vaddq_s16(v580, v540); 708 int16x8_t v634 = vaddq_s16(v541, v564); 709 int16x8_t v635 = vaddq_s16(v633, v634); 710 int16x8_t v636_tmp = vqrdmulhq_n_s16(v635, 13573); 711 int16x8_t v636 = vaddq_s16(v636_tmp, v635); 712 int16x8_t v637 = vaddq_s16(v588, v544); 713 int16x8_t v638 = vaddq_s16(v545, v568); 714 int16x8_t v639 = vaddq_s16(v637, v638); 715 int16x8_t v640 = vaddq_s16(v572, v547); 716 int16x8_t v641 = vaddq_s16(v548, v576); 717 int16x8_t v642 = vaddq_s16(v640, v641); 718 int16x8_t v643 = vaddq_s16(v639, v642); 719 int16x8_t v644 = vaddq_s16(v636, v643); 720 int16x8_t v645 = vaddq_s16(v642, v635); 721 int16x8_t v646_tmp = vqrdmulhq_n_s16(v645, 13573); 722 int16x8_t v646 = vaddq_s16(v646_tmp, v645); 723 int16x8_t v647 = vaddq_s16(v604, v554); 724 int16x8_t v648 = vaddq_s16(v555, v584); 725 int16x8_t v649 = vaddq_s16(v647, v648); 726 int16x8_t v650 = vaddq_s16(v649, v639); 727 int16x8_t v651 = vaddq_s16(v650, v645); 728 int16x8_t v652 = vaddq_s16(v646, v651); 729 int16x8_t v653 = vqrdmulhq_n_s16(v652, 17734); 730 int16x8_t v654 = vaddq_s16(v644, v653); 731 int16x8_t v655 = vqrdmulhq_n_s16(v654, 16705); 732 int16x8_t v656 = vaddq_s16(v632, v655); 733 int16x8_t v657 = vaddq_s16(v634, v616); 734 int16x8_t v658_tmp = vqrdmulhq_n_s16(v657, 13573); 735 int16x8_t v658 = vaddq_s16(v658_tmp, v657); 736 int16x8_t v659 = vaddq_s16(v638, v618); 737 int16x8_t v660 = vaddq_s16(v619, v640); 738 int16x8_t v661 = vaddq_s16(v659, v660); 739 int16x8_t v662 = vaddq_s16(v658, v661); 740 int16x8_t v663 = vaddq_s16(v641, v622); 741 int16x8_t v664 = vaddq_s16(v623, v633); 742 int16x8_t v665 = vaddq_s16(v663, v664); 743 int16x8_t v666_tmp = vqrdmulhq_n_s16(v665, 13573); 744 int16x8_t v666 = vaddq_s16(v666_tmp, v665); 745 int16x8_t v667 = vaddq_s16(v648, v626); 746 int16x8_t v668 = vaddq_s16(v627, v637); 747 int16x8_t v669 = vaddq_s16(v667, v668); 748 int16x8_t v670 = vaddq_s16(v669, v665); 749 int16x8_t v671 = vaddq_s16(v666, v670); 750 int16x8_t v672 = vqrdmulhq_n_s16(v671, 17734); 751 int16x8_t v673 = vaddq_s16(v662, v672); 752 int16x8_t v674 = vaddq_s16(v664, v657); 753 int16x8_t v675_tmp = vqrdmulhq_n_s16(v674, 13573); 754 int16x8_t v675 = vaddq_s16(v675_tmp, v674); 755 int16x8_t v676 = vaddq_s16(v668, v659); 756 int16x8_t v677 = vaddq_s16(v660, v663); 757 int16x8_t v678 = vaddq_s16(v676, v677); 758 int16x8_t v679 = vaddq_s16(v675, v678); 759 int16x8_t v680 = vaddq_s16(v677, v674); 760 int16x8_t v681_tmp = vqrdmulhq_n_s16(v680, 13573); 761 int16x8_t v681 = vaddq_s16(v681_tmp, v680); 762 int16x8_t v682 = vld1q_s16(in + in_stride * 254 + i); 763 int16x8_t v683 = vaddq_s16(v682, v507); 764 int16x8_t v684 = vaddq_s16(v683, v603); 765 int16x8_t v685 = vaddq_s16(v684, v647); 766 int16x8_t v686 = vaddq_s16(v685, v667); 767 int16x8_t v687 = vaddq_s16(v686, v676); 768 int16x8_t v688 = vaddq_s16(v687, v680); 769 int16x8_t v689 = vaddq_s16(v681, v688); 770 int16x8_t v690 = vqrdmulhq_n_s16(v689, 17734); 771 int16x8_t v691 = vaddq_s16(v679, v690); 772 int16x8_t v692 = vqrdmulhq_n_s16(v691, 16705); 773 int16x8_t v693 = vaddq_s16(v673, v692); 774 int16x8_t v694 = vqrdmulhq_n_s16(v693, 16463); 775 int16x8_t v695 = vaddq_s16(v656, v694); 776 int16x8_t v696 = vqrdmulhq_n_s16(v695, 16404); 777 int16x8_t v697 = vaddq_s16(v615, v696); 778 int16x8_t v698 = vqrdmulhq_n_s16(v697, 16389); 779 int16x8_t v699 = vaddq_s16(v522, v698); 780 int16x8_t v700 = vqrdmulhq_n_s16(v699, 16385); 781 int16x8_t v701 = vaddq_s16(v317, v700); 782 int16x8_t v702 = vld1q_s16(in + in_stride * 1 + i); 783 int16x8_t v703_tmp = vqrdmulhq_n_s16(v702, 13573); 784 int16x8_t v703 = vaddq_s16(v703_tmp, v702); 785 int16x8_t v704 = vld1q_s16(in + in_stride * 129 + i); 786 int16x8_t v705 = vld1q_s16(in + in_stride * 127 + i); 787 int16x8_t v706 = vaddq_s16(v704, v705); 788 int16x8_t v707 = vaddq_s16(v703, v706); 789 int16x8_t v708 = vld1q_s16(in + in_stride * 65 + i); 790 int16x8_t v709 = vld1q_s16(in + in_stride * 63 + i); 791 int16x8_t v710 = vaddq_s16(v708, v709); 792 int16x8_t v711_tmp = vqrdmulhq_n_s16(v710, 13573); 793 int16x8_t v711 = vaddq_s16(v711_tmp, v710); 794 int16x8_t v712 = vld1q_s16(in + in_stride * 193 + i); 795 int16x8_t v713 = vld1q_s16(in + in_stride * 191 + i); 796 int16x8_t v714 = vaddq_s16(v712, v713); 797 int16x8_t v715 = vaddq_s16(v714, v710); 798 int16x8_t v716 = vaddq_s16(v711, v715); 799 int16x8_t v717 = vqrdmulhq_n_s16(v716, 17734); 800 int16x8_t v718 = vaddq_s16(v707, v717); 801 int16x8_t v719 = vld1q_s16(in + in_stride * 33 + i); 802 int16x8_t v720 = vld1q_s16(in + in_stride * 31 + i); 803 int16x8_t v721 = vaddq_s16(v719, v720); 804 int16x8_t v722_tmp = vqrdmulhq_n_s16(v721, 13573); 805 int16x8_t v722 = vaddq_s16(v722_tmp, v721); 806 int16x8_t v723 = vld1q_s16(in + in_stride * 161 + i); 807 int16x8_t v724 = vld1q_s16(in + in_stride * 159 + i); 808 int16x8_t v725 = vaddq_s16(v723, v724); 809 int16x8_t v726 = vld1q_s16(in + in_stride * 97 + i); 810 int16x8_t v727 = vld1q_s16(in + in_stride * 95 + i); 811 int16x8_t v728 = vaddq_s16(v726, v727); 812 int16x8_t v729 = vaddq_s16(v725, v728); 813 int16x8_t v730 = vaddq_s16(v722, v729); 814 int16x8_t v731 = vaddq_s16(v728, v721); 815 int16x8_t v732_tmp = vqrdmulhq_n_s16(v731, 13573); 816 int16x8_t v732 = vaddq_s16(v732_tmp, v731); 817 int16x8_t v733 = vld1q_s16(in + in_stride * 225 + i); 818 int16x8_t v734 = vld1q_s16(in + in_stride * 223 + i); 819 int16x8_t v735 = vaddq_s16(v733, v734); 820 int16x8_t v736 = vaddq_s16(v735, v725); 821 int16x8_t v737 = vaddq_s16(v736, v731); 822 int16x8_t v738 = vaddq_s16(v732, v737); 823 int16x8_t v739 = vqrdmulhq_n_s16(v738, 17734); 824 int16x8_t v740 = vaddq_s16(v730, v739); 825 int16x8_t v741 = vqrdmulhq_n_s16(v740, 16705); 826 int16x8_t v742 = vaddq_s16(v718, v741); 827 int16x8_t v743 = vld1q_s16(in + in_stride * 17 + i); 828 int16x8_t v744 = vld1q_s16(in + in_stride * 15 + i); 829 int16x8_t v745 = vaddq_s16(v743, v744); 830 int16x8_t v746_tmp = vqrdmulhq_n_s16(v745, 13573); 831 int16x8_t v746 = vaddq_s16(v746_tmp, v745); 832 int16x8_t v747 = vld1q_s16(in + in_stride * 145 + i); 833 int16x8_t v748 = vld1q_s16(in + in_stride * 143 + i); 834 int16x8_t v749 = vaddq_s16(v747, v748); 835 int16x8_t v750 = vld1q_s16(in + in_stride * 113 + i); 836 int16x8_t v751 = vld1q_s16(in + in_stride * 111 + i); 837 int16x8_t v752 = vaddq_s16(v750, v751); 838 int16x8_t v753 = vaddq_s16(v749, v752); 839 int16x8_t v754 = vaddq_s16(v746, v753); 840 int16x8_t v755 = vld1q_s16(in + in_stride * 81 + i); 841 int16x8_t v756 = vld1q_s16(in + in_stride * 79 + i); 842 int16x8_t v757 = vaddq_s16(v755, v756); 843 int16x8_t v758 = vld1q_s16(in + in_stride * 49 + i); 844 int16x8_t v759 = vld1q_s16(in + in_stride * 47 + i); 845 int16x8_t v760 = vaddq_s16(v758, v759); 846 int16x8_t v761 = vaddq_s16(v757, v760); 847 int16x8_t v762_tmp = vqrdmulhq_n_s16(v761, 13573); 848 int16x8_t v762 = vaddq_s16(v762_tmp, v761); 849 int16x8_t v763 = vld1q_s16(in + in_stride * 209 + i); 850 int16x8_t v764 = vld1q_s16(in + in_stride * 207 + i); 851 int16x8_t v765 = vaddq_s16(v763, v764); 852 int16x8_t v766 = vld1q_s16(in + in_stride * 177 + i); 853 int16x8_t v767 = vld1q_s16(in + in_stride * 175 + i); 854 int16x8_t v768 = vaddq_s16(v766, v767); 855 int16x8_t v769 = vaddq_s16(v765, v768); 856 int16x8_t v770 = vaddq_s16(v769, v761); 857 int16x8_t v771 = vaddq_s16(v762, v770); 858 int16x8_t v772 = vqrdmulhq_n_s16(v771, 17734); 859 int16x8_t v773 = vaddq_s16(v754, v772); 860 int16x8_t v774 = vaddq_s16(v760, v745); 861 int16x8_t v775_tmp = vqrdmulhq_n_s16(v774, 13573); 862 int16x8_t v775 = vaddq_s16(v775_tmp, v774); 863 int16x8_t v776 = vaddq_s16(v768, v749); 864 int16x8_t v777 = vaddq_s16(v752, v757); 865 int16x8_t v778 = vaddq_s16(v776, v777); 866 int16x8_t v779 = vaddq_s16(v775, v778); 867 int16x8_t v780 = vaddq_s16(v777, v774); 868 int16x8_t v781_tmp = vqrdmulhq_n_s16(v780, 13573); 869 int16x8_t v781 = vaddq_s16(v781_tmp, v780); 870 int16x8_t v782 = vld1q_s16(in + in_stride * 241 + i); 871 int16x8_t v783 = vld1q_s16(in + in_stride * 239 + i); 872 int16x8_t v784 = vaddq_s16(v782, v783); 873 int16x8_t v785 = vaddq_s16(v784, v765); 874 int16x8_t v786 = vaddq_s16(v785, v776); 875 int16x8_t v787 = vaddq_s16(v786, v780); 876 int16x8_t v788 = vaddq_s16(v781, v787); 877 int16x8_t v789 = vqrdmulhq_n_s16(v788, 17734); 878 int16x8_t v790 = vaddq_s16(v779, v789); 879 int16x8_t v791 = vqrdmulhq_n_s16(v790, 16705); 880 int16x8_t v792 = vaddq_s16(v773, v791); 881 int16x8_t v793 = vqrdmulhq_n_s16(v792, 16463); 882 int16x8_t v794 = vaddq_s16(v742, v793); 883 int16x8_t v795 = vld1q_s16(in + in_stride * 9 + i); 884 int16x8_t v796 = vld1q_s16(in + in_stride * 7 + i); 885 int16x8_t v797 = vaddq_s16(v795, v796); 886 int16x8_t v798_tmp = vqrdmulhq_n_s16(v797, 13573); 887 int16x8_t v798 = vaddq_s16(v798_tmp, v797); 888 int16x8_t v799 = vld1q_s16(in + in_stride * 137 + i); 889 int16x8_t v800 = vld1q_s16(in + in_stride * 135 + i); 890 int16x8_t v801 = vaddq_s16(v799, v800); 891 int16x8_t v802 = vld1q_s16(in + in_stride * 121 + i); 892 int16x8_t v803 = vld1q_s16(in + in_stride * 119 + i); 893 int16x8_t v804 = vaddq_s16(v802, v803); 894 int16x8_t v805 = vaddq_s16(v801, v804); 895 int16x8_t v806 = vaddq_s16(v798, v805); 896 int16x8_t v807 = vld1q_s16(in + in_stride * 73 + i); 897 int16x8_t v808 = vld1q_s16(in + in_stride * 71 + i); 898 int16x8_t v809 = vaddq_s16(v807, v808); 899 int16x8_t v810 = vld1q_s16(in + in_stride * 57 + i); 900 int16x8_t v811 = vld1q_s16(in + in_stride * 55 + i); 901 int16x8_t v812 = vaddq_s16(v810, v811); 902 int16x8_t v813 = vaddq_s16(v809, v812); 903 int16x8_t v814_tmp = vqrdmulhq_n_s16(v813, 13573); 904 int16x8_t v814 = vaddq_s16(v814_tmp, v813); 905 int16x8_t v815 = vld1q_s16(in + in_stride * 201 + i); 906 int16x8_t v816 = vld1q_s16(in + in_stride * 199 + i); 907 int16x8_t v817 = vaddq_s16(v815, v816); 908 int16x8_t v818 = vld1q_s16(in + in_stride * 185 + i); 909 int16x8_t v819 = vld1q_s16(in + in_stride * 183 + i); 910 int16x8_t v820 = vaddq_s16(v818, v819); 911 int16x8_t v821 = vaddq_s16(v817, v820); 912 int16x8_t v822 = vaddq_s16(v821, v813); 913 int16x8_t v823 = vaddq_s16(v814, v822); 914 int16x8_t v824 = vqrdmulhq_n_s16(v823, 17734); 915 int16x8_t v825 = vaddq_s16(v806, v824); 916 int16x8_t v826 = vld1q_s16(in + in_stride * 41 + i); 917 int16x8_t v827 = vld1q_s16(in + in_stride * 39 + i); 918 int16x8_t v828 = vaddq_s16(v826, v827); 919 int16x8_t v829 = vld1q_s16(in + in_stride * 25 + i); 920 int16x8_t v830 = vld1q_s16(in + in_stride * 23 + i); 921 int16x8_t v831 = vaddq_s16(v829, v830); 922 int16x8_t v832 = vaddq_s16(v828, v831); 923 int16x8_t v833_tmp = vqrdmulhq_n_s16(v832, 13573); 924 int16x8_t v833 = vaddq_s16(v833_tmp, v832); 925 int16x8_t v834 = vld1q_s16(in + in_stride * 169 + i); 926 int16x8_t v835 = vld1q_s16(in + in_stride * 167 + i); 927 int16x8_t v836 = vaddq_s16(v834, v835); 928 int16x8_t v837 = vld1q_s16(in + in_stride * 153 + i); 929 int16x8_t v838 = vld1q_s16(in + in_stride * 151 + i); 930 int16x8_t v839 = vaddq_s16(v837, v838); 931 int16x8_t v840 = vaddq_s16(v836, v839); 932 int16x8_t v841 = vld1q_s16(in + in_stride * 105 + i); 933 int16x8_t v842 = vld1q_s16(in + in_stride * 103 + i); 934 int16x8_t v843 = vaddq_s16(v841, v842); 935 int16x8_t v844 = vld1q_s16(in + in_stride * 89 + i); 936 int16x8_t v845 = vld1q_s16(in + in_stride * 87 + i); 937 int16x8_t v846 = vaddq_s16(v844, v845); 938 int16x8_t v847 = vaddq_s16(v843, v846); 939 int16x8_t v848 = vaddq_s16(v840, v847); 940 int16x8_t v849 = vaddq_s16(v833, v848); 941 int16x8_t v850 = vaddq_s16(v847, v832); 942 int16x8_t v851_tmp = vqrdmulhq_n_s16(v850, 13573); 943 int16x8_t v851 = vaddq_s16(v851_tmp, v850); 944 int16x8_t v852 = vld1q_s16(in + in_stride * 233 + i); 945 int16x8_t v853 = vld1q_s16(in + in_stride * 231 + i); 946 int16x8_t v854 = vaddq_s16(v852, v853); 947 int16x8_t v855 = vld1q_s16(in + in_stride * 217 + i); 948 int16x8_t v856 = vld1q_s16(in + in_stride * 215 + i); 949 int16x8_t v857 = vaddq_s16(v855, v856); 950 int16x8_t v858 = vaddq_s16(v854, v857); 951 int16x8_t v859 = vaddq_s16(v858, v840); 952 int16x8_t v860 = vaddq_s16(v859, v850); 953 int16x8_t v861 = vaddq_s16(v851, v860); 954 int16x8_t v862 = vqrdmulhq_n_s16(v861, 17734); 955 int16x8_t v863 = vaddq_s16(v849, v862); 956 int16x8_t v864 = vqrdmulhq_n_s16(v863, 16705); 957 int16x8_t v865 = vaddq_s16(v825, v864); 958 int16x8_t v866 = vaddq_s16(v831, v797); 959 int16x8_t v867_tmp = vqrdmulhq_n_s16(v866, 13573); 960 int16x8_t v867 = vaddq_s16(v867_tmp, v866); 961 int16x8_t v868 = vaddq_s16(v839, v801); 962 int16x8_t v869 = vaddq_s16(v804, v843); 963 int16x8_t v870 = vaddq_s16(v868, v869); 964 int16x8_t v871 = vaddq_s16(v867, v870); 965 int16x8_t v872 = vaddq_s16(v846, v809); 966 int16x8_t v873 = vaddq_s16(v812, v828); 967 int16x8_t v874 = vaddq_s16(v872, v873); 968 int16x8_t v875_tmp = vqrdmulhq_n_s16(v874, 13573); 969 int16x8_t v875 = vaddq_s16(v875_tmp, v874); 970 int16x8_t v876 = vaddq_s16(v857, v817); 971 int16x8_t v877 = vaddq_s16(v820, v836); 972 int16x8_t v878 = vaddq_s16(v876, v877); 973 int16x8_t v879 = vaddq_s16(v878, v874); 974 int16x8_t v880 = vaddq_s16(v875, v879); 975 int16x8_t v881 = vqrdmulhq_n_s16(v880, 17734); 976 int16x8_t v882 = vaddq_s16(v871, v881); 977 int16x8_t v883 = vaddq_s16(v873, v866); 978 int16x8_t v884_tmp = vqrdmulhq_n_s16(v883, 13573); 979 int16x8_t v884 = vaddq_s16(v884_tmp, v883); 980 int16x8_t v885 = vaddq_s16(v877, v868); 981 int16x8_t v886 = vaddq_s16(v869, v872); 982 int16x8_t v887 = vaddq_s16(v885, v886); 983 int16x8_t v888 = vaddq_s16(v884, v887); 984 int16x8_t v889 = vaddq_s16(v886, v883); 985 int16x8_t v890_tmp = vqrdmulhq_n_s16(v889, 13573); 986 int16x8_t v890 = vaddq_s16(v890_tmp, v889); 987 int16x8_t v891 = vld1q_s16(in + in_stride * 249 + i); 988 int16x8_t v892 = vld1q_s16(in + in_stride * 247 + i); 989 int16x8_t v893 = vaddq_s16(v891, v892); 990 int16x8_t v894 = vaddq_s16(v893, v854); 991 int16x8_t v895 = vaddq_s16(v894, v876); 992 int16x8_t v896 = vaddq_s16(v895, v885); 993 int16x8_t v897 = vaddq_s16(v896, v889); 994 int16x8_t v898 = vaddq_s16(v890, v897); 995 int16x8_t v899 = vqrdmulhq_n_s16(v898, 17734); 996 int16x8_t v900 = vaddq_s16(v888, v899); 997 int16x8_t v901 = vqrdmulhq_n_s16(v900, 16705); 998 int16x8_t v902 = vaddq_s16(v882, v901); 999 int16x8_t v903 = vqrdmulhq_n_s16(v902, 16463); 1000 int16x8_t v904 = vaddq_s16(v865, v903); 1001 int16x8_t v905 = vqrdmulhq_n_s16(v904, 16404); 1002 int16x8_t v906 = vaddq_s16(v794, v905); 1003 int16x8_t v907 = vld1q_s16(in + in_stride * 5 + i); 1004 int16x8_t v908 = vld1q_s16(in + in_stride * 3 + i); 1005 int16x8_t v909 = vaddq_s16(v907, v908); 1006 int16x8_t v910_tmp = vqrdmulhq_n_s16(v909, 13573); 1007 int16x8_t v910 = vaddq_s16(v910_tmp, v909); 1008 int16x8_t v911 = vld1q_s16(in + in_stride * 133 + i); 1009 int16x8_t v912 = vld1q_s16(in + in_stride * 131 + i); 1010 int16x8_t v913 = vaddq_s16(v911, v912); 1011 int16x8_t v914 = vld1q_s16(in + in_stride * 125 + i); 1012 int16x8_t v915 = vld1q_s16(in + in_stride * 123 + i); 1013 int16x8_t v916 = vaddq_s16(v914, v915); 1014 int16x8_t v917 = vaddq_s16(v913, v916); 1015 int16x8_t v918 = vaddq_s16(v910, v917); 1016 int16x8_t v919 = vld1q_s16(in + in_stride * 69 + i); 1017 int16x8_t v920 = vld1q_s16(in + in_stride * 67 + i); 1018 int16x8_t v921 = vaddq_s16(v919, v920); 1019 int16x8_t v922 = vld1q_s16(in + in_stride * 61 + i); 1020 int16x8_t v923 = vld1q_s16(in + in_stride * 59 + i); 1021 int16x8_t v924 = vaddq_s16(v922, v923); 1022 int16x8_t v925 = vaddq_s16(v921, v924); 1023 int16x8_t v926_tmp = vqrdmulhq_n_s16(v925, 13573); 1024 int16x8_t v926 = vaddq_s16(v926_tmp, v925); 1025 int16x8_t v927 = vld1q_s16(in + in_stride * 197 + i); 1026 int16x8_t v928 = vld1q_s16(in + in_stride * 195 + i); 1027 int16x8_t v929 = vaddq_s16(v927, v928); 1028 int16x8_t v930 = vld1q_s16(in + in_stride * 189 + i); 1029 int16x8_t v931 = vld1q_s16(in + in_stride * 187 + i); 1030 int16x8_t v932 = vaddq_s16(v930, v931); 1031 int16x8_t v933 = vaddq_s16(v929, v932); 1032 int16x8_t v934 = vaddq_s16(v933, v925); 1033 int16x8_t v935 = vaddq_s16(v926, v934); 1034 int16x8_t v936 = vqrdmulhq_n_s16(v935, 17734); 1035 int16x8_t v937 = vaddq_s16(v918, v936); 1036 int16x8_t v938 = vld1q_s16(in + in_stride * 37 + i); 1037 int16x8_t v939 = vld1q_s16(in + in_stride * 35 + i); 1038 int16x8_t v940 = vaddq_s16(v938, v939); 1039 int16x8_t v941 = vld1q_s16(in + in_stride * 29 + i); 1040 int16x8_t v942 = vld1q_s16(in + in_stride * 27 + i); 1041 int16x8_t v943 = vaddq_s16(v941, v942); 1042 int16x8_t v944 = vaddq_s16(v940, v943); 1043 int16x8_t v945_tmp = vqrdmulhq_n_s16(v944, 13573); 1044 int16x8_t v945 = vaddq_s16(v945_tmp, v944); 1045 int16x8_t v946 = vld1q_s16(in + in_stride * 165 + i); 1046 int16x8_t v947 = vld1q_s16(in + in_stride * 163 + i); 1047 int16x8_t v948 = vaddq_s16(v946, v947); 1048 int16x8_t v949 = vld1q_s16(in + in_stride * 157 + i); 1049 int16x8_t v950 = vld1q_s16(in + in_stride * 155 + i); 1050 int16x8_t v951 = vaddq_s16(v949, v950); 1051 int16x8_t v952 = vaddq_s16(v948, v951); 1052 int16x8_t v953 = vld1q_s16(in + in_stride * 101 + i); 1053 int16x8_t v954 = vld1q_s16(in + in_stride * 99 + i); 1054 int16x8_t v955 = vaddq_s16(v953, v954); 1055 int16x8_t v956 = vld1q_s16(in + in_stride * 93 + i); 1056 int16x8_t v957 = vld1q_s16(in + in_stride * 91 + i); 1057 int16x8_t v958 = vaddq_s16(v956, v957); 1058 int16x8_t v959 = vaddq_s16(v955, v958); 1059 int16x8_t v960 = vaddq_s16(v952, v959); 1060 int16x8_t v961 = vaddq_s16(v945, v960); 1061 int16x8_t v962 = vaddq_s16(v959, v944); 1062 int16x8_t v963_tmp = vqrdmulhq_n_s16(v962, 13573); 1063 int16x8_t v963 = vaddq_s16(v963_tmp, v962); 1064 int16x8_t v964 = vld1q_s16(in + in_stride * 229 + i); 1065 int16x8_t v965 = vld1q_s16(in + in_stride * 227 + i); 1066 int16x8_t v966 = vaddq_s16(v964, v965); 1067 int16x8_t v967 = vld1q_s16(in + in_stride * 221 + i); 1068 int16x8_t v968 = vld1q_s16(in + in_stride * 219 + i); 1069 int16x8_t v969 = vaddq_s16(v967, v968); 1070 int16x8_t v970 = vaddq_s16(v966, v969); 1071 int16x8_t v971 = vaddq_s16(v970, v952); 1072 int16x8_t v972 = vaddq_s16(v971, v962); 1073 int16x8_t v973 = vaddq_s16(v963, v972); 1074 int16x8_t v974 = vqrdmulhq_n_s16(v973, 17734); 1075 int16x8_t v975 = vaddq_s16(v961, v974); 1076 int16x8_t v976 = vqrdmulhq_n_s16(v975, 16705); 1077 int16x8_t v977 = vaddq_s16(v937, v976); 1078 int16x8_t v978 = vld1q_s16(in + in_stride * 21 + i); 1079 int16x8_t v979 = vld1q_s16(in + in_stride * 19 + i); 1080 int16x8_t v980 = vaddq_s16(v978, v979); 1081 int16x8_t v981 = vld1q_s16(in + in_stride * 13 + i); 1082 int16x8_t v982 = vld1q_s16(in + in_stride * 11 + i); 1083 int16x8_t v983 = vaddq_s16(v981, v982); 1084 int16x8_t v984 = vaddq_s16(v980, v983); 1085 int16x8_t v985_tmp = vqrdmulhq_n_s16(v984, 13573); 1086 int16x8_t v985 = vaddq_s16(v985_tmp, v984); 1087 int16x8_t v986 = vld1q_s16(in + in_stride * 149 + i); 1088 int16x8_t v987 = vld1q_s16(in + in_stride * 147 + i); 1089 int16x8_t v988 = vaddq_s16(v986, v987); 1090 int16x8_t v989 = vld1q_s16(in + in_stride * 141 + i); 1091 int16x8_t v990 = vld1q_s16(in + in_stride * 139 + i); 1092 int16x8_t v991 = vaddq_s16(v989, v990); 1093 int16x8_t v992 = vaddq_s16(v988, v991); 1094 int16x8_t v993 = vld1q_s16(in + in_stride * 117 + i); 1095 int16x8_t v994 = vld1q_s16(in + in_stride * 115 + i); 1096 int16x8_t v995 = vaddq_s16(v993, v994); 1097 int16x8_t v996 = vld1q_s16(in + in_stride * 109 + i); 1098 int16x8_t v997 = vld1q_s16(in + in_stride * 107 + i); 1099 int16x8_t v998 = vaddq_s16(v996, v997); 1100 int16x8_t v999 = vaddq_s16(v995, v998); 1101 int16x8_t v1000 = vaddq_s16(v992, v999); 1102 int16x8_t v1001 = vaddq_s16(v985, v1000); 1103 int16x8_t v1002 = vld1q_s16(in + in_stride * 85 + i); 1104 int16x8_t v1003 = vld1q_s16(in + in_stride * 83 + i); 1105 int16x8_t v1004 = vaddq_s16(v1002, v1003); 1106 int16x8_t v1005 = vld1q_s16(in + in_stride * 77 + i); 1107 int16x8_t v1006 = vld1q_s16(in + in_stride * 75 + i); 1108 int16x8_t v1007 = vaddq_s16(v1005, v1006); 1109 int16x8_t v1008 = vaddq_s16(v1004, v1007); 1110 int16x8_t v1009 = vld1q_s16(in + in_stride * 53 + i); 1111 int16x8_t v1010 = vld1q_s16(in + in_stride * 51 + i); 1112 int16x8_t v1011 = vaddq_s16(v1009, v1010); 1113 int16x8_t v1012 = vld1q_s16(in + in_stride * 45 + i); 1114 int16x8_t v1013 = vld1q_s16(in + in_stride * 43 + i); 1115 int16x8_t v1014 = vaddq_s16(v1012, v1013); 1116 int16x8_t v1015 = vaddq_s16(v1011, v1014); 1117 int16x8_t v1016 = vaddq_s16(v1008, v1015); 1118 int16x8_t v1017_tmp = vqrdmulhq_n_s16(v1016, 13573); 1119 int16x8_t v1017 = vaddq_s16(v1017_tmp, v1016); 1120 int16x8_t v1018 = vld1q_s16(in + in_stride * 213 + i); 1121 int16x8_t v1019 = vld1q_s16(in + in_stride * 211 + i); 1122 int16x8_t v1020 = vaddq_s16(v1018, v1019); 1123 int16x8_t v1021 = vld1q_s16(in + in_stride * 205 + i); 1124 int16x8_t v1022 = vld1q_s16(in + in_stride * 203 + i); 1125 int16x8_t v1023 = vaddq_s16(v1021, v1022); 1126 int16x8_t v1024 = vaddq_s16(v1020, v1023); 1127 int16x8_t v1025 = vld1q_s16(in + in_stride * 181 + i); 1128 int16x8_t v1026 = vld1q_s16(in + in_stride * 179 + i); 1129 int16x8_t v1027 = vaddq_s16(v1025, v1026); 1130 int16x8_t v1028 = vld1q_s16(in + in_stride * 173 + i); 1131 int16x8_t v1029 = vld1q_s16(in + in_stride * 171 + i); 1132 int16x8_t v1030 = vaddq_s16(v1028, v1029); 1133 int16x8_t v1031 = vaddq_s16(v1027, v1030); 1134 int16x8_t v1032 = vaddq_s16(v1024, v1031); 1135 int16x8_t v1033 = vaddq_s16(v1032, v1016); 1136 int16x8_t v1034 = vaddq_s16(v1017, v1033); 1137 int16x8_t v1035 = vqrdmulhq_n_s16(v1034, 17734); 1138 int16x8_t v1036 = vaddq_s16(v1001, v1035); 1139 int16x8_t v1037 = vaddq_s16(v1015, v984); 1140 int16x8_t v1038_tmp = vqrdmulhq_n_s16(v1037, 13573); 1141 int16x8_t v1038 = vaddq_s16(v1038_tmp, v1037); 1142 int16x8_t v1039 = vaddq_s16(v1031, v992); 1143 int16x8_t v1040 = vaddq_s16(v999, v1008); 1144 int16x8_t v1041 = vaddq_s16(v1039, v1040); 1145 int16x8_t v1042 = vaddq_s16(v1038, v1041); 1146 int16x8_t v1043 = vaddq_s16(v1040, v1037); 1147 int16x8_t v1044_tmp = vqrdmulhq_n_s16(v1043, 13573); 1148 int16x8_t v1044 = vaddq_s16(v1044_tmp, v1043); 1149 int16x8_t v1045 = vld1q_s16(in + in_stride * 245 + i); 1150 int16x8_t v1046 = vld1q_s16(in + in_stride * 243 + i); 1151 int16x8_t v1047 = vaddq_s16(v1045, v1046); 1152 int16x8_t v1048 = vld1q_s16(in + in_stride * 237 + i); 1153 int16x8_t v1049 = vld1q_s16(in + in_stride * 235 + i); 1154 int16x8_t v1050 = vaddq_s16(v1048, v1049); 1155 int16x8_t v1051 = vaddq_s16(v1047, v1050); 1156 int16x8_t v1052 = vaddq_s16(v1051, v1024); 1157 int16x8_t v1053 = vaddq_s16(v1052, v1039); 1158 int16x8_t v1054 = vaddq_s16(v1053, v1043); 1159 int16x8_t v1055 = vaddq_s16(v1044, v1054); 1160 int16x8_t v1056 = vqrdmulhq_n_s16(v1055, 17734); 1161 int16x8_t v1057 = vaddq_s16(v1042, v1056); 1162 int16x8_t v1058 = vqrdmulhq_n_s16(v1057, 16705); 1163 int16x8_t v1059 = vaddq_s16(v1036, v1058); 1164 int16x8_t v1060 = vqrdmulhq_n_s16(v1059, 16463); 1165 int16x8_t v1061 = vaddq_s16(v977, v1060); 1166 int16x8_t v1062 = vaddq_s16(v983, v909); 1167 int16x8_t v1063_tmp = vqrdmulhq_n_s16(v1062, 13573); 1168 int16x8_t v1063 = vaddq_s16(v1063_tmp, v1062); 1169 int16x8_t v1064 = vaddq_s16(v991, v913); 1170 int16x8_t v1065 = vaddq_s16(v916, v995); 1171 int16x8_t v1066 = vaddq_s16(v1064, v1065); 1172 int16x8_t v1067 = vaddq_s16(v1063, v1066); 1173 int16x8_t v1068 = vaddq_s16(v1007, v921); 1174 int16x8_t v1069 = vaddq_s16(v924, v1011); 1175 int16x8_t v1070 = vaddq_s16(v1068, v1069); 1176 int16x8_t v1071_tmp = vqrdmulhq_n_s16(v1070, 13573); 1177 int16x8_t v1071 = vaddq_s16(v1071_tmp, v1070); 1178 int16x8_t v1072 = vaddq_s16(v1023, v929); 1179 int16x8_t v1073 = vaddq_s16(v932, v1027); 1180 int16x8_t v1074 = vaddq_s16(v1072, v1073); 1181 int16x8_t v1075 = vaddq_s16(v1074, v1070); 1182 int16x8_t v1076 = vaddq_s16(v1071, v1075); 1183 int16x8_t v1077 = vqrdmulhq_n_s16(v1076, 17734); 1184 int16x8_t v1078 = vaddq_s16(v1067, v1077); 1185 int16x8_t v1079 = vaddq_s16(v1014, v940); 1186 int16x8_t v1080 = vaddq_s16(v943, v980); 1187 int16x8_t v1081 = vaddq_s16(v1079, v1080); 1188 int16x8_t v1082_tmp = vqrdmulhq_n_s16(v1081, 13573); 1189 int16x8_t v1082 = vaddq_s16(v1082_tmp, v1081); 1190 int16x8_t v1083 = vaddq_s16(v1030, v948); 1191 int16x8_t v1084 = vaddq_s16(v951, v988); 1192 int16x8_t v1085 = vaddq_s16(v1083, v1084); 1193 int16x8_t v1086 = vaddq_s16(v998, v955); 1194 int16x8_t v1087 = vaddq_s16(v958, v1004); 1195 int16x8_t v1088 = vaddq_s16(v1086, v1087); 1196 int16x8_t v1089 = vaddq_s16(v1085, v1088); 1197 int16x8_t v1090 = vaddq_s16(v1082, v1089); 1198 int16x8_t v1091 = vaddq_s16(v1088, v1081); 1199 int16x8_t v1092_tmp = vqrdmulhq_n_s16(v1091, 13573); 1200 int16x8_t v1092 = vaddq_s16(v1092_tmp, v1091); 1201 int16x8_t v1093 = vaddq_s16(v1050, v966); 1202 int16x8_t v1094 = vaddq_s16(v969, v1020); 1203 int16x8_t v1095 = vaddq_s16(v1093, v1094); 1204 int16x8_t v1096 = vaddq_s16(v1095, v1085); 1205 int16x8_t v1097 = vaddq_s16(v1096, v1091); 1206 int16x8_t v1098 = vaddq_s16(v1092, v1097); 1207 int16x8_t v1099 = vqrdmulhq_n_s16(v1098, 17734); 1208 int16x8_t v1100 = vaddq_s16(v1090, v1099); 1209 int16x8_t v1101 = vqrdmulhq_n_s16(v1100, 16705); 1210 int16x8_t v1102 = vaddq_s16(v1078, v1101); 1211 int16x8_t v1103 = vaddq_s16(v1080, v1062); 1212 int16x8_t v1104_tmp = vqrdmulhq_n_s16(v1103, 13573); 1213 int16x8_t v1104 = vaddq_s16(v1104_tmp, v1103); 1214 int16x8_t v1105 = vaddq_s16(v1084, v1064); 1215 int16x8_t v1106 = vaddq_s16(v1065, v1086); 1216 int16x8_t v1107 = vaddq_s16(v1105, v1106); 1217 int16x8_t v1108 = vaddq_s16(v1104, v1107); 1218 int16x8_t v1109 = vaddq_s16(v1087, v1068); 1219 int16x8_t v1110 = vaddq_s16(v1069, v1079); 1220 int16x8_t v1111 = vaddq_s16(v1109, v1110); 1221 int16x8_t v1112_tmp = vqrdmulhq_n_s16(v1111, 13573); 1222 int16x8_t v1112 = vaddq_s16(v1112_tmp, v1111); 1223 int16x8_t v1113 = vaddq_s16(v1094, v1072); 1224 int16x8_t v1114 = vaddq_s16(v1073, v1083); 1225 int16x8_t v1115 = vaddq_s16(v1113, v1114); 1226 int16x8_t v1116 = vaddq_s16(v1115, v1111); 1227 int16x8_t v1117 = vaddq_s16(v1112, v1116); 1228 int16x8_t v1118 = vqrdmulhq_n_s16(v1117, 17734); 1229 int16x8_t v1119 = vaddq_s16(v1108, v1118); 1230 int16x8_t v1120 = vaddq_s16(v1110, v1103); 1231 int16x8_t v1121_tmp = vqrdmulhq_n_s16(v1120, 13573); 1232 int16x8_t v1121 = vaddq_s16(v1121_tmp, v1120); 1233 int16x8_t v1122 = vaddq_s16(v1114, v1105); 1234 int16x8_t v1123 = vaddq_s16(v1106, v1109); 1235 int16x8_t v1124 = vaddq_s16(v1122, v1123); 1236 int16x8_t v1125 = vaddq_s16(v1121, v1124); 1237 int16x8_t v1126 = vaddq_s16(v1123, v1120); 1238 int16x8_t v1127_tmp = vqrdmulhq_n_s16(v1126, 13573); 1239 int16x8_t v1127 = vaddq_s16(v1127_tmp, v1126); 1240 int16x8_t v1128 = vld1q_s16(in + in_stride * 253 + i); 1241 int16x8_t v1129 = vld1q_s16(in + in_stride * 251 + i); 1242 int16x8_t v1130 = vaddq_s16(v1128, v1129); 1243 int16x8_t v1131 = vaddq_s16(v1130, v1047); 1244 int16x8_t v1132 = vaddq_s16(v1131, v1093); 1245 int16x8_t v1133 = vaddq_s16(v1132, v1113); 1246 int16x8_t v1134 = vaddq_s16(v1133, v1122); 1247 int16x8_t v1135 = vaddq_s16(v1134, v1126); 1248 int16x8_t v1136 = vaddq_s16(v1127, v1135); 1249 int16x8_t v1137 = vqrdmulhq_n_s16(v1136, 17734); 1250 int16x8_t v1138 = vaddq_s16(v1125, v1137); 1251 int16x8_t v1139 = vqrdmulhq_n_s16(v1138, 16705); 1252 int16x8_t v1140 = vaddq_s16(v1119, v1139); 1253 int16x8_t v1141 = vqrdmulhq_n_s16(v1140, 16463); 1254 int16x8_t v1142 = vaddq_s16(v1102, v1141); 1255 int16x8_t v1143 = vqrdmulhq_n_s16(v1142, 16404); 1256 int16x8_t v1144 = vaddq_s16(v1061, v1143); 1257 int16x8_t v1145 = vqrdmulhq_n_s16(v1144, 16389); 1258 int16x8_t v1146 = vaddq_s16(v906, v1145); 1259 int16x8_t v1147 = vaddq_s16(v908, v702); 1260 int16x8_t v1148_tmp = vqrdmulhq_n_s16(v1147, 13573); 1261 int16x8_t v1148 = vaddq_s16(v1148_tmp, v1147); 1262 int16x8_t v1149 = vaddq_s16(v912, v704); 1263 int16x8_t v1150 = vaddq_s16(v705, v914); 1264 int16x8_t v1151 = vaddq_s16(v1149, v1150); 1265 int16x8_t v1152 = vaddq_s16(v1148, v1151); 1266 int16x8_t v1153 = vaddq_s16(v920, v708); 1267 int16x8_t v1154 = vaddq_s16(v709, v922); 1268 int16x8_t v1155 = vaddq_s16(v1153, v1154); 1269 int16x8_t v1156_tmp = vqrdmulhq_n_s16(v1155, 13573); 1270 int16x8_t v1156 = vaddq_s16(v1156_tmp, v1155); 1271 int16x8_t v1157 = vaddq_s16(v928, v712); 1272 int16x8_t v1158 = vaddq_s16(v713, v930); 1273 int16x8_t v1159 = vaddq_s16(v1157, v1158); 1274 int16x8_t v1160 = vaddq_s16(v1159, v1155); 1275 int16x8_t v1161 = vaddq_s16(v1156, v1160); 1276 int16x8_t v1162 = vqrdmulhq_n_s16(v1161, 17734); 1277 int16x8_t v1163 = vaddq_s16(v1152, v1162); 1278 int16x8_t v1164 = vaddq_s16(v939, v719); 1279 int16x8_t v1165 = vaddq_s16(v720, v941); 1280 int16x8_t v1166 = vaddq_s16(v1164, v1165); 1281 int16x8_t v1167_tmp = vqrdmulhq_n_s16(v1166, 13573); 1282 int16x8_t v1167 = vaddq_s16(v1167_tmp, v1166); 1283 int16x8_t v1168 = vaddq_s16(v947, v723); 1284 int16x8_t v1169 = vaddq_s16(v724, v949); 1285 int16x8_t v1170 = vaddq_s16(v1168, v1169); 1286 int16x8_t v1171 = vaddq_s16(v954, v726); 1287 int16x8_t v1172 = vaddq_s16(v727, v956); 1288 int16x8_t v1173 = vaddq_s16(v1171, v1172); 1289 int16x8_t v1174 = vaddq_s16(v1170, v1173); 1290 int16x8_t v1175 = vaddq_s16(v1167, v1174); 1291 int16x8_t v1176 = vaddq_s16(v1173, v1166); 1292 int16x8_t v1177_tmp = vqrdmulhq_n_s16(v1176, 13573); 1293 int16x8_t v1177 = vaddq_s16(v1177_tmp, v1176); 1294 int16x8_t v1178 = vaddq_s16(v965, v733); 1295 int16x8_t v1179 = vaddq_s16(v734, v967); 1296 int16x8_t v1180 = vaddq_s16(v1178, v1179); 1297 int16x8_t v1181 = vaddq_s16(v1180, v1170); 1298 int16x8_t v1182 = vaddq_s16(v1181, v1176); 1299 int16x8_t v1183 = vaddq_s16(v1177, v1182); 1300 int16x8_t v1184 = vqrdmulhq_n_s16(v1183, 17734); 1301 int16x8_t v1185 = vaddq_s16(v1175, v1184); 1302 int16x8_t v1186 = vqrdmulhq_n_s16(v1185, 16705); 1303 int16x8_t v1187 = vaddq_s16(v1163, v1186); 1304 int16x8_t v1188 = vaddq_s16(v979, v743); 1305 int16x8_t v1189 = vaddq_s16(v744, v981); 1306 int16x8_t v1190 = vaddq_s16(v1188, v1189); 1307 int16x8_t v1191_tmp = vqrdmulhq_n_s16(v1190, 13573); 1308 int16x8_t v1191 = vaddq_s16(v1191_tmp, v1190); 1309 int16x8_t v1192 = vaddq_s16(v987, v747); 1310 int16x8_t v1193 = vaddq_s16(v748, v989); 1311 int16x8_t v1194 = vaddq_s16(v1192, v1193); 1312 int16x8_t v1195 = vaddq_s16(v994, v750); 1313 int16x8_t v1196 = vaddq_s16(v751, v996); 1314 int16x8_t v1197 = vaddq_s16(v1195, v1196); 1315 int16x8_t v1198 = vaddq_s16(v1194, v1197); 1316 int16x8_t v1199 = vaddq_s16(v1191, v1198); 1317 int16x8_t v1200 = vaddq_s16(v1003, v755); 1318 int16x8_t v1201 = vaddq_s16(v756, v1005); 1319 int16x8_t v1202 = vaddq_s16(v1200, v1201); 1320 int16x8_t v1203 = vaddq_s16(v1010, v758); 1321 int16x8_t v1204 = vaddq_s16(v759, v1012); 1322 int16x8_t v1205 = vaddq_s16(v1203, v1204); 1323 int16x8_t v1206 = vaddq_s16(v1202, v1205); 1324 int16x8_t v1207_tmp = vqrdmulhq_n_s16(v1206, 13573); 1325 int16x8_t v1207 = vaddq_s16(v1207_tmp, v1206); 1326 int16x8_t v1208 = vaddq_s16(v1019, v763); 1327 int16x8_t v1209 = vaddq_s16(v764, v1021); 1328 int16x8_t v1210 = vaddq_s16(v1208, v1209); 1329 int16x8_t v1211 = vaddq_s16(v1026, v766); 1330 int16x8_t v1212 = vaddq_s16(v767, v1028); 1331 int16x8_t v1213 = vaddq_s16(v1211, v1212); 1332 int16x8_t v1214 = vaddq_s16(v1210, v1213); 1333 int16x8_t v1215 = vaddq_s16(v1214, v1206); 1334 int16x8_t v1216 = vaddq_s16(v1207, v1215); 1335 int16x8_t v1217 = vqrdmulhq_n_s16(v1216, 17734); 1336 int16x8_t v1218 = vaddq_s16(v1199, v1217); 1337 int16x8_t v1219 = vaddq_s16(v1205, v1190); 1338 int16x8_t v1220_tmp = vqrdmulhq_n_s16(v1219, 13573); 1339 int16x8_t v1220 = vaddq_s16(v1220_tmp, v1219); 1340 int16x8_t v1221 = vaddq_s16(v1213, v1194); 1341 int16x8_t v1222 = vaddq_s16(v1197, v1202); 1342 int16x8_t v1223 = vaddq_s16(v1221, v1222); 1343 int16x8_t v1224 = vaddq_s16(v1220, v1223); 1344 int16x8_t v1225 = vaddq_s16(v1222, v1219); 1345 int16x8_t v1226_tmp = vqrdmulhq_n_s16(v1225, 13573); 1346 int16x8_t v1226 = vaddq_s16(v1226_tmp, v1225); 1347 int16x8_t v1227 = vaddq_s16(v1046, v782); 1348 int16x8_t v1228 = vaddq_s16(v783, v1048); 1349 int16x8_t v1229 = vaddq_s16(v1227, v1228); 1350 int16x8_t v1230 = vaddq_s16(v1229, v1210); 1351 int16x8_t v1231 = vaddq_s16(v1230, v1221); 1352 int16x8_t v1232 = vaddq_s16(v1231, v1225); 1353 int16x8_t v1233 = vaddq_s16(v1226, v1232); 1354 int16x8_t v1234 = vqrdmulhq_n_s16(v1233, 17734); 1355 int16x8_t v1235 = vaddq_s16(v1224, v1234); 1356 int16x8_t v1236 = vqrdmulhq_n_s16(v1235, 16705); 1357 int16x8_t v1237 = vaddq_s16(v1218, v1236); 1358 int16x8_t v1238 = vqrdmulhq_n_s16(v1237, 16463); 1359 int16x8_t v1239 = vaddq_s16(v1187, v1238); 1360 int16x8_t v1240 = vaddq_s16(v982, v795); 1361 int16x8_t v1241 = vaddq_s16(v796, v907); 1362 int16x8_t v1242 = vaddq_s16(v1240, v1241); 1363 int16x8_t v1243_tmp = vqrdmulhq_n_s16(v1242, 13573); 1364 int16x8_t v1243 = vaddq_s16(v1243_tmp, v1242); 1365 int16x8_t v1244 = vaddq_s16(v990, v799); 1366 int16x8_t v1245 = vaddq_s16(v800, v911); 1367 int16x8_t v1246 = vaddq_s16(v1244, v1245); 1368 int16x8_t v1247 = vaddq_s16(v915, v802); 1369 int16x8_t v1248 = vaddq_s16(v803, v993); 1370 int16x8_t v1249 = vaddq_s16(v1247, v1248); 1371 int16x8_t v1250 = vaddq_s16(v1246, v1249); 1372 int16x8_t v1251 = vaddq_s16(v1243, v1250); 1373 int16x8_t v1252 = vaddq_s16(v1006, v807); 1374 int16x8_t v1253 = vaddq_s16(v808, v919); 1375 int16x8_t v1254 = vaddq_s16(v1252, v1253); 1376 int16x8_t v1255 = vaddq_s16(v923, v810); 1377 int16x8_t v1256 = vaddq_s16(v811, v1009); 1378 int16x8_t v1257 = vaddq_s16(v1255, v1256); 1379 int16x8_t v1258 = vaddq_s16(v1254, v1257); 1380 int16x8_t v1259_tmp = vqrdmulhq_n_s16(v1258, 13573); 1381 int16x8_t v1259 = vaddq_s16(v1259_tmp, v1258); 1382 int16x8_t v1260 = vaddq_s16(v1022, v815); 1383 int16x8_t v1261 = vaddq_s16(v816, v927); 1384 int16x8_t v1262 = vaddq_s16(v1260, v1261); 1385 int16x8_t v1263 = vaddq_s16(v931, v818); 1386 int16x8_t v1264 = vaddq_s16(v819, v1025); 1387 int16x8_t v1265 = vaddq_s16(v1263, v1264); 1388 int16x8_t v1266 = vaddq_s16(v1262, v1265); 1389 int16x8_t v1267 = vaddq_s16(v1266, v1258); 1390 int16x8_t v1268 = vaddq_s16(v1259, v1267); 1391 int16x8_t v1269 = vqrdmulhq_n_s16(v1268, 17734); 1392 int16x8_t v1270 = vaddq_s16(v1251, v1269); 1393 int16x8_t v1271 = vaddq_s16(v1013, v826); 1394 int16x8_t v1272 = vaddq_s16(v827, v938); 1395 int16x8_t v1273 = vaddq_s16(v1271, v1272); 1396 int16x8_t v1274 = vaddq_s16(v942, v829); 1397 int16x8_t v1275 = vaddq_s16(v830, v978); 1398 int16x8_t v1276 = vaddq_s16(v1274, v1275); 1399 int16x8_t v1277 = vaddq_s16(v1273, v1276); 1400 int16x8_t v1278_tmp = vqrdmulhq_n_s16(v1277, 13573); 1401 int16x8_t v1278 = vaddq_s16(v1278_tmp, v1277); 1402 int16x8_t v1279 = vaddq_s16(v1029, v834); 1403 int16x8_t v1280 = vaddq_s16(v835, v946); 1404 int16x8_t v1281 = vaddq_s16(v1279, v1280); 1405 int16x8_t v1282 = vaddq_s16(v950, v837); 1406 int16x8_t v1283 = vaddq_s16(v838, v986); 1407 int16x8_t v1284 = vaddq_s16(v1282, v1283); 1408 int16x8_t v1285 = vaddq_s16(v1281, v1284); 1409 int16x8_t v1286 = vaddq_s16(v997, v841); 1410 int16x8_t v1287 = vaddq_s16(v842, v953); 1411 int16x8_t v1288 = vaddq_s16(v1286, v1287); 1412 int16x8_t v1289 = vaddq_s16(v957, v844); 1413 int16x8_t v1290 = vaddq_s16(v845, v1002); 1414 int16x8_t v1291 = vaddq_s16(v1289, v1290); 1415 int16x8_t v1292 = vaddq_s16(v1288, v1291); 1416 int16x8_t v1293 = vaddq_s16(v1285, v1292); 1417 int16x8_t v1294 = vaddq_s16(v1278, v1293); 1418 int16x8_t v1295 = vaddq_s16(v1292, v1277); 1419 int16x8_t v1296_tmp = vqrdmulhq_n_s16(v1295, 13573); 1420 int16x8_t v1296 = vaddq_s16(v1296_tmp, v1295); 1421 int16x8_t v1297 = vaddq_s16(v1049, v852); 1422 int16x8_t v1298 = vaddq_s16(v853, v964); 1423 int16x8_t v1299 = vaddq_s16(v1297, v1298); 1424 int16x8_t v1300 = vaddq_s16(v968, v855); 1425 int16x8_t v1301 = vaddq_s16(v856, v1018); 1426 int16x8_t v1302 = vaddq_s16(v1300, v1301); 1427 int16x8_t v1303 = vaddq_s16(v1299, v1302); 1428 int16x8_t v1304 = vaddq_s16(v1303, v1285); 1429 int16x8_t v1305 = vaddq_s16(v1304, v1295); 1430 int16x8_t v1306 = vaddq_s16(v1296, v1305); 1431 int16x8_t v1307 = vqrdmulhq_n_s16(v1306, 17734); 1432 int16x8_t v1308 = vaddq_s16(v1294, v1307); 1433 int16x8_t v1309 = vqrdmulhq_n_s16(v1308, 16705); 1434 int16x8_t v1310 = vaddq_s16(v1270, v1309); 1435 int16x8_t v1311 = vaddq_s16(v1276, v1242); 1436 int16x8_t v1312_tmp = vqrdmulhq_n_s16(v1311, 13573); 1437 int16x8_t v1312 = vaddq_s16(v1312_tmp, v1311); 1438 int16x8_t v1313 = vaddq_s16(v1284, v1246); 1439 int16x8_t v1314 = vaddq_s16(v1249, v1288); 1440 int16x8_t v1315 = vaddq_s16(v1313, v1314); 1441 int16x8_t v1316 = vaddq_s16(v1312, v1315); 1442 int16x8_t v1317 = vaddq_s16(v1291, v1254); 1443 int16x8_t v1318 = vaddq_s16(v1257, v1273); 1444 int16x8_t v1319 = vaddq_s16(v1317, v1318); 1445 int16x8_t v1320_tmp = vqrdmulhq_n_s16(v1319, 13573); 1446 int16x8_t v1320 = vaddq_s16(v1320_tmp, v1319); 1447 int16x8_t v1321 = vaddq_s16(v1302, v1262); 1448 int16x8_t v1322 = vaddq_s16(v1265, v1281); 1449 int16x8_t v1323 = vaddq_s16(v1321, v1322); 1450 int16x8_t v1324 = vaddq_s16(v1323, v1319); 1451 int16x8_t v1325 = vaddq_s16(v1320, v1324); 1452 int16x8_t v1326 = vqrdmulhq_n_s16(v1325, 17734); 1453 int16x8_t v1327 = vaddq_s16(v1316, v1326); 1454 int16x8_t v1328 = vaddq_s16(v1318, v1311); 1455 int16x8_t v1329_tmp = vqrdmulhq_n_s16(v1328, 13573); 1456 int16x8_t v1329 = vaddq_s16(v1329_tmp, v1328); 1457 int16x8_t v1330 = vaddq_s16(v1322, v1313); 1458 int16x8_t v1331 = vaddq_s16(v1314, v1317); 1459 int16x8_t v1332 = vaddq_s16(v1330, v1331); 1460 int16x8_t v1333 = vaddq_s16(v1329, v1332); 1461 int16x8_t v1334 = vaddq_s16(v1331, v1328); 1462 int16x8_t v1335_tmp = vqrdmulhq_n_s16(v1334, 13573); 1463 int16x8_t v1335 = vaddq_s16(v1335_tmp, v1334); 1464 int16x8_t v1336 = vaddq_s16(v1129, v891); 1465 int16x8_t v1337 = vaddq_s16(v892, v1045); 1466 int16x8_t v1338 = vaddq_s16(v1336, v1337); 1467 int16x8_t v1339 = vaddq_s16(v1338, v1299); 1468 int16x8_t v1340 = vaddq_s16(v1339, v1321); 1469 int16x8_t v1341 = vaddq_s16(v1340, v1330); 1470 int16x8_t v1342 = vaddq_s16(v1341, v1334); 1471 int16x8_t v1343 = vaddq_s16(v1335, v1342); 1472 int16x8_t v1344 = vqrdmulhq_n_s16(v1343, 17734); 1473 int16x8_t v1345 = vaddq_s16(v1333, v1344); 1474 int16x8_t v1346 = vqrdmulhq_n_s16(v1345, 16705); 1475 int16x8_t v1347 = vaddq_s16(v1327, v1346); 1476 int16x8_t v1348 = vqrdmulhq_n_s16(v1347, 16463); 1477 int16x8_t v1349 = vaddq_s16(v1310, v1348); 1478 int16x8_t v1350 = vqrdmulhq_n_s16(v1349, 16404); 1479 int16x8_t v1351 = vaddq_s16(v1239, v1350); 1480 int16x8_t v1352 = vaddq_s16(v1241, v1147); 1481 int16x8_t v1353_tmp = vqrdmulhq_n_s16(v1352, 13573); 1482 int16x8_t v1353 = vaddq_s16(v1353_tmp, v1352); 1483 int16x8_t v1354 = vaddq_s16(v1245, v1149); 1484 int16x8_t v1355 = vaddq_s16(v1150, v1247); 1485 int16x8_t v1356 = vaddq_s16(v1354, v1355); 1486 int16x8_t v1357 = vaddq_s16(v1353, v1356); 1487 int16x8_t v1358 = vaddq_s16(v1253, v1153); 1488 int16x8_t v1359 = vaddq_s16(v1154, v1255); 1489 int16x8_t v1360 = vaddq_s16(v1358, v1359); 1490 int16x8_t v1361_tmp = vqrdmulhq_n_s16(v1360, 13573); 1491 int16x8_t v1361 = vaddq_s16(v1361_tmp, v1360); 1492 int16x8_t v1362 = vaddq_s16(v1261, v1157); 1493 int16x8_t v1363 = vaddq_s16(v1158, v1263); 1494 int16x8_t v1364 = vaddq_s16(v1362, v1363); 1495 int16x8_t v1365 = vaddq_s16(v1364, v1360); 1496 int16x8_t v1366 = vaddq_s16(v1361, v1365); 1497 int16x8_t v1367 = vqrdmulhq_n_s16(v1366, 17734); 1498 int16x8_t v1368 = vaddq_s16(v1357, v1367); 1499 int16x8_t v1369 = vaddq_s16(v1272, v1164); 1500 int16x8_t v1370 = vaddq_s16(v1165, v1274); 1501 int16x8_t v1371 = vaddq_s16(v1369, v1370); 1502 int16x8_t v1372_tmp = vqrdmulhq_n_s16(v1371, 13573); 1503 int16x8_t v1372 = vaddq_s16(v1372_tmp, v1371); 1504 int16x8_t v1373 = vaddq_s16(v1280, v1168); 1505 int16x8_t v1374 = vaddq_s16(v1169, v1282); 1506 int16x8_t v1375 = vaddq_s16(v1373, v1374); 1507 int16x8_t v1376 = vaddq_s16(v1287, v1171); 1508 int16x8_t v1377 = vaddq_s16(v1172, v1289); 1509 int16x8_t v1378 = vaddq_s16(v1376, v1377); 1510 int16x8_t v1379 = vaddq_s16(v1375, v1378); 1511 int16x8_t v1380 = vaddq_s16(v1372, v1379); 1512 int16x8_t v1381 = vaddq_s16(v1378, v1371); 1513 int16x8_t v1382_tmp = vqrdmulhq_n_s16(v1381, 13573); 1514 int16x8_t v1382 = vaddq_s16(v1382_tmp, v1381); 1515 int16x8_t v1383 = vaddq_s16(v1298, v1178); 1516 int16x8_t v1384 = vaddq_s16(v1179, v1300); 1517 int16x8_t v1385 = vaddq_s16(v1383, v1384); 1518 int16x8_t v1386 = vaddq_s16(v1385, v1375); 1519 int16x8_t v1387 = vaddq_s16(v1386, v1381); 1520 int16x8_t v1388 = vaddq_s16(v1382, v1387); 1521 int16x8_t v1389 = vqrdmulhq_n_s16(v1388, 17734); 1522 int16x8_t v1390 = vaddq_s16(v1380, v1389); 1523 int16x8_t v1391 = vqrdmulhq_n_s16(v1390, 16705); 1524 int16x8_t v1392 = vaddq_s16(v1368, v1391); 1525 int16x8_t v1393 = vaddq_s16(v1275, v1188); 1526 int16x8_t v1394 = vaddq_s16(v1189, v1240); 1527 int16x8_t v1395 = vaddq_s16(v1393, v1394); 1528 int16x8_t v1396_tmp = vqrdmulhq_n_s16(v1395, 13573); 1529 int16x8_t v1396 = vaddq_s16(v1396_tmp, v1395); 1530 int16x8_t v1397 = vaddq_s16(v1283, v1192); 1531 int16x8_t v1398 = vaddq_s16(v1193, v1244); 1532 int16x8_t v1399 = vaddq_s16(v1397, v1398); 1533 int16x8_t v1400 = vaddq_s16(v1248, v1195); 1534 int16x8_t v1401 = vaddq_s16(v1196, v1286); 1535 int16x8_t v1402 = vaddq_s16(v1400, v1401); 1536 int16x8_t v1403 = vaddq_s16(v1399, v1402); 1537 int16x8_t v1404 = vaddq_s16(v1396, v1403); 1538 int16x8_t v1405 = vaddq_s16(v1290, v1200); 1539 int16x8_t v1406 = vaddq_s16(v1201, v1252); 1540 int16x8_t v1407 = vaddq_s16(v1405, v1406); 1541 int16x8_t v1408 = vaddq_s16(v1256, v1203); 1542 int16x8_t v1409 = vaddq_s16(v1204, v1271); 1543 int16x8_t v1410 = vaddq_s16(v1408, v1409); 1544 int16x8_t v1411 = vaddq_s16(v1407, v1410); 1545 int16x8_t v1412_tmp = vqrdmulhq_n_s16(v1411, 13573); 1546 int16x8_t v1412 = vaddq_s16(v1412_tmp, v1411); 1547 int16x8_t v1413 = vaddq_s16(v1301, v1208); 1548 int16x8_t v1414 = vaddq_s16(v1209, v1260); 1549 int16x8_t v1415 = vaddq_s16(v1413, v1414); 1550 int16x8_t v1416 = vaddq_s16(v1264, v1211); 1551 int16x8_t v1417 = vaddq_s16(v1212, v1279); 1552 int16x8_t v1418 = vaddq_s16(v1416, v1417); 1553 int16x8_t v1419 = vaddq_s16(v1415, v1418); 1554 int16x8_t v1420 = vaddq_s16(v1419, v1411); 1555 int16x8_t v1421 = vaddq_s16(v1412, v1420); 1556 int16x8_t v1422 = vqrdmulhq_n_s16(v1421, 17734); 1557 int16x8_t v1423 = vaddq_s16(v1404, v1422); 1558 int16x8_t v1424 = vaddq_s16(v1410, v1395); 1559 int16x8_t v1425_tmp = vqrdmulhq_n_s16(v1424, 13573); 1560 int16x8_t v1425 = vaddq_s16(v1425_tmp, v1424); 1561 int16x8_t v1426 = vaddq_s16(v1418, v1399); 1562 int16x8_t v1427 = vaddq_s16(v1402, v1407); 1563 int16x8_t v1428 = vaddq_s16(v1426, v1427); 1564 int16x8_t v1429 = vaddq_s16(v1425, v1428); 1565 int16x8_t v1430 = vaddq_s16(v1427, v1424); 1566 int16x8_t v1431_tmp = vqrdmulhq_n_s16(v1430, 13573); 1567 int16x8_t v1431 = vaddq_s16(v1431_tmp, v1430); 1568 int16x8_t v1432 = vaddq_s16(v1337, v1227); 1569 int16x8_t v1433 = vaddq_s16(v1228, v1297); 1570 int16x8_t v1434 = vaddq_s16(v1432, v1433); 1571 int16x8_t v1435 = vaddq_s16(v1434, v1415); 1572 int16x8_t v1436 = vaddq_s16(v1435, v1426); 1573 int16x8_t v1437 = vaddq_s16(v1436, v1430); 1574 int16x8_t v1438 = vaddq_s16(v1431, v1437); 1575 int16x8_t v1439 = vqrdmulhq_n_s16(v1438, 17734); 1576 int16x8_t v1440 = vaddq_s16(v1429, v1439); 1577 int16x8_t v1441 = vqrdmulhq_n_s16(v1440, 16705); 1578 int16x8_t v1442 = vaddq_s16(v1423, v1441); 1579 int16x8_t v1443 = vqrdmulhq_n_s16(v1442, 16463); 1580 int16x8_t v1444 = vaddq_s16(v1392, v1443); 1581 int16x8_t v1445 = vaddq_s16(v1394, v1352); 1582 int16x8_t v1446_tmp = vqrdmulhq_n_s16(v1445, 13573); 1583 int16x8_t v1446 = vaddq_s16(v1446_tmp, v1445); 1584 int16x8_t v1447 = vaddq_s16(v1398, v1354); 1585 int16x8_t v1448 = vaddq_s16(v1355, v1400); 1586 int16x8_t v1449 = vaddq_s16(v1447, v1448); 1587 int16x8_t v1450 = vaddq_s16(v1446, v1449); 1588 int16x8_t v1451 = vaddq_s16(v1406, v1358); 1589 int16x8_t v1452 = vaddq_s16(v1359, v1408); 1590 int16x8_t v1453 = vaddq_s16(v1451, v1452); 1591 int16x8_t v1454_tmp = vqrdmulhq_n_s16(v1453, 13573); 1592 int16x8_t v1454 = vaddq_s16(v1454_tmp, v1453); 1593 int16x8_t v1455 = vaddq_s16(v1414, v1362); 1594 int16x8_t v1456 = vaddq_s16(v1363, v1416); 1595 int16x8_t v1457 = vaddq_s16(v1455, v1456); 1596 int16x8_t v1458 = vaddq_s16(v1457, v1453); 1597 int16x8_t v1459 = vaddq_s16(v1454, v1458); 1598 int16x8_t v1460 = vqrdmulhq_n_s16(v1459, 17734); 1599 int16x8_t v1461 = vaddq_s16(v1450, v1460); 1600 int16x8_t v1462 = vaddq_s16(v1409, v1369); 1601 int16x8_t v1463 = vaddq_s16(v1370, v1393); 1602 int16x8_t v1464 = vaddq_s16(v1462, v1463); 1603 int16x8_t v1465_tmp = vqrdmulhq_n_s16(v1464, 13573); 1604 int16x8_t v1465 = vaddq_s16(v1465_tmp, v1464); 1605 int16x8_t v1466 = vaddq_s16(v1417, v1373); 1606 int16x8_t v1467 = vaddq_s16(v1374, v1397); 1607 int16x8_t v1468 = vaddq_s16(v1466, v1467); 1608 int16x8_t v1469 = vaddq_s16(v1401, v1376); 1609 int16x8_t v1470 = vaddq_s16(v1377, v1405); 1610 int16x8_t v1471 = vaddq_s16(v1469, v1470); 1611 int16x8_t v1472 = vaddq_s16(v1468, v1471); 1612 int16x8_t v1473 = vaddq_s16(v1465, v1472); 1613 int16x8_t v1474 = vaddq_s16(v1471, v1464); 1614 int16x8_t v1475_tmp = vqrdmulhq_n_s16(v1474, 13573); 1615 int16x8_t v1475 = vaddq_s16(v1475_tmp, v1474); 1616 int16x8_t v1476 = vaddq_s16(v1433, v1383); 1617 int16x8_t v1477 = vaddq_s16(v1384, v1413); 1618 int16x8_t v1478 = vaddq_s16(v1476, v1477); 1619 int16x8_t v1479 = vaddq_s16(v1478, v1468); 1620 int16x8_t v1480 = vaddq_s16(v1479, v1474); 1621 int16x8_t v1481 = vaddq_s16(v1475, v1480); 1622 int16x8_t v1482 = vqrdmulhq_n_s16(v1481, 17734); 1623 int16x8_t v1483 = vaddq_s16(v1473, v1482); 1624 int16x8_t v1484 = vqrdmulhq_n_s16(v1483, 16705); 1625 int16x8_t v1485 = vaddq_s16(v1461, v1484); 1626 int16x8_t v1486 = vaddq_s16(v1463, v1445); 1627 int16x8_t v1487_tmp = vqrdmulhq_n_s16(v1486, 13573); 1628 int16x8_t v1487 = vaddq_s16(v1487_tmp, v1486); 1629 int16x8_t v1488 = vaddq_s16(v1467, v1447); 1630 int16x8_t v1489 = vaddq_s16(v1448, v1469); 1631 int16x8_t v1490 = vaddq_s16(v1488, v1489); 1632 int16x8_t v1491 = vaddq_s16(v1487, v1490); 1633 int16x8_t v1492 = vaddq_s16(v1470, v1451); 1634 int16x8_t v1493 = vaddq_s16(v1452, v1462); 1635 int16x8_t v1494 = vaddq_s16(v1492, v1493); 1636 int16x8_t v1495_tmp = vqrdmulhq_n_s16(v1494, 13573); 1637 int16x8_t v1495 = vaddq_s16(v1495_tmp, v1494); 1638 int16x8_t v1496 = vaddq_s16(v1477, v1455); 1639 int16x8_t v1497 = vaddq_s16(v1456, v1466); 1640 int16x8_t v1498 = vaddq_s16(v1496, v1497); 1641 int16x8_t v1499 = vaddq_s16(v1498, v1494); 1642 int16x8_t v1500 = vaddq_s16(v1495, v1499); 1643 int16x8_t v1501 = vqrdmulhq_n_s16(v1500, 17734); 1644 int16x8_t v1502 = vaddq_s16(v1491, v1501); 1645 int16x8_t v1503 = vaddq_s16(v1493, v1486); 1646 int16x8_t v1504_tmp = vqrdmulhq_n_s16(v1503, 13573); 1647 int16x8_t v1504 = vaddq_s16(v1504_tmp, v1503); 1648 int16x8_t v1505 = vaddq_s16(v1497, v1488); 1649 int16x8_t v1506 = vaddq_s16(v1489, v1492); 1650 int16x8_t v1507 = vaddq_s16(v1505, v1506); 1651 int16x8_t v1508 = vaddq_s16(v1504, v1507); 1652 int16x8_t v1509 = vaddq_s16(v1506, v1503); 1653 int16x8_t v1510_tmp = vqrdmulhq_n_s16(v1509, 13573); 1654 int16x8_t v1510 = vaddq_s16(v1510_tmp, v1509); 1655 int16x8_t v1511 = vld1q_s16(in + in_stride * 255 + i); 1656 int16x8_t v1512 = vaddq_s16(v1511, v1128); 1657 int16x8_t v1513 = vaddq_s16(v1512, v1336); 1658 int16x8_t v1514 = vaddq_s16(v1513, v1432); 1659 int16x8_t v1515 = vaddq_s16(v1514, v1476); 1660 int16x8_t v1516 = vaddq_s16(v1515, v1496); 1661 int16x8_t v1517 = vaddq_s16(v1516, v1505); 1662 int16x8_t v1518 = vaddq_s16(v1517, v1509); 1663 int16x8_t v1519 = vaddq_s16(v1510, v1518); 1664 int16x8_t v1520 = vqrdmulhq_n_s16(v1519, 17734); 1665 int16x8_t v1521 = vaddq_s16(v1508, v1520); 1666 int16x8_t v1522 = vqrdmulhq_n_s16(v1521, 16705); 1667 int16x8_t v1523 = vaddq_s16(v1502, v1522); 1668 int16x8_t v1524 = vqrdmulhq_n_s16(v1523, 16463); 1669 int16x8_t v1525 = vaddq_s16(v1485, v1524); 1670 int16x8_t v1526 = vqrdmulhq_n_s16(v1525, 16404); 1671 int16x8_t v1527 = vaddq_s16(v1444, v1526); 1672 int16x8_t v1528 = vqrdmulhq_n_s16(v1527, 16389); 1673 int16x8_t v1529 = vaddq_s16(v1351, v1528); 1674 int16x8_t v1530 = vqrdmulhq_n_s16(v1529, 16385); 1675 int16x8_t v1531 = vaddq_s16(v1146, v1530); 1676 int16x8_t v1532 = vqrdmulhq_n_s16(v1531, 16384); 1677 int16x8_t v1533 = vaddq_s16(v701, v1532); 1678 int16x8_t v1534 = vsubq_s16(v0, v1); 1679 int16x8_t v1535 = vsubq_s16(v4, v6); 1680 int16x8_t v1536_tmp = vqrdmulhq_n_s16(v1535, 10045); 1681 int16x8_t v1536 = vaddq_s16(v1536_tmp, v1535); 1682 int16x8_t v1537 = vaddq_s16(v1534, v1536); 1683 int16x8_t v1538 = vsubq_s16(v11, v14); 1684 int16x8_t v1539 = vsubq_s16(v17, v20); 1685 int16x8_t v1540_tmp = vqrdmulhq_n_s16(v1539, 10045); 1686 int16x8_t v1540 = vaddq_s16(v1540_tmp, v1539); 1687 int16x8_t v1541 = vaddq_s16(v1538, v1540); 1688 int16x8_t v1542 = vqrdmulhq_n_s16(v1541, 19705); 1689 int16x8_t v1543 = vaddq_s16(v1537, v1542); 1690 int16x8_t v1544 = vsubq_s16(v27, v30); 1691 int16x8_t v1545 = vsubq_s16(v35, v39); 1692 int16x8_t v1546_tmp = vqrdmulhq_n_s16(v1545, 10045); 1693 int16x8_t v1546 = vaddq_s16(v1546_tmp, v1545); 1694 int16x8_t v1547 = vaddq_s16(v1544, v1546); 1695 int16x8_t v1548 = vsubq_s16(v44, v47); 1696 int16x8_t v1549 = vsubq_s16(v50, v54); 1697 int16x8_t v1550_tmp = vqrdmulhq_n_s16(v1549, 10045); 1698 int16x8_t v1550 = vaddq_s16(v1550_tmp, v1549); 1699 int16x8_t v1551 = vaddq_s16(v1548, v1550); 1700 int16x8_t v1552 = vqrdmulhq_n_s16(v1551, 19705); 1701 int16x8_t v1553 = vaddq_s16(v1547, v1552); 1702 int16x8_t v1554 = vqrdmulhq_n_s16(v1553, 17121); 1703 int16x8_t v1555 = vaddq_s16(v1543, v1554); 1704 int16x8_t v1556 = vsubq_s16(v63, v66); 1705 int16x8_t v1557 = vsubq_s16(v71, v75); 1706 int16x8_t v1558_tmp = vqrdmulhq_n_s16(v1557, 10045); 1707 int16x8_t v1558 = vaddq_s16(v1558_tmp, v1557); 1708 int16x8_t v1559 = vaddq_s16(v1556, v1558); 1709 int16x8_t v1560 = vsubq_s16(v82, v89); 1710 int16x8_t v1561 = vsubq_s16(v92, v97); 1711 int16x8_t v1562_tmp = vqrdmulhq_n_s16(v1561, 10045); 1712 int16x8_t v1562 = vaddq_s16(v1562_tmp, v1561); 1713 int16x8_t v1563 = vaddq_s16(v1560, v1562); 1714 int16x8_t v1564 = vqrdmulhq_n_s16(v1563, 19705); 1715 int16x8_t v1565 = vaddq_s16(v1559, v1564); 1716 int16x8_t v1566 = vsubq_s16(v104, v107); 1717 int16x8_t v1567 = vsubq_s16(v112, v116); 1718 int16x8_t v1568_tmp = vqrdmulhq_n_s16(v1567, 10045); 1719 int16x8_t v1568 = vaddq_s16(v1568_tmp, v1567); 1720 int16x8_t v1569 = vaddq_s16(v1566, v1568); 1721 int16x8_t v1570 = vsubq_s16(v121, v124); 1722 int16x8_t v1571 = vsubq_s16(v127, v132); 1723 int16x8_t v1572_tmp = vqrdmulhq_n_s16(v1571, 10045); 1724 int16x8_t v1572 = vaddq_s16(v1572_tmp, v1571); 1725 int16x8_t v1573 = vaddq_s16(v1570, v1572); 1726 int16x8_t v1574 = vqrdmulhq_n_s16(v1573, 19705); 1727 int16x8_t v1575 = vaddq_s16(v1569, v1574); 1728 int16x8_t v1576 = vqrdmulhq_n_s16(v1575, 17121); 1729 int16x8_t v1577 = vaddq_s16(v1565, v1576); 1730 int16x8_t v1578 = vqrdmulhq_n_s16(v1577, 16563); 1731 int16x8_t v1579 = vaddq_s16(v1555, v1578); 1732 int16x8_t v1580 = vsubq_s16(v143, v146); 1733 int16x8_t v1581 = vsubq_s16(v151, v155); 1734 int16x8_t v1582_tmp = vqrdmulhq_n_s16(v1581, 10045); 1735 int16x8_t v1582 = vaddq_s16(v1582_tmp, v1581); 1736 int16x8_t v1583 = vaddq_s16(v1580, v1582); 1737 int16x8_t v1584 = vsubq_s16(v162, v169); 1738 int16x8_t v1585 = vsubq_s16(v172, v177); 1739 int16x8_t v1586_tmp = vqrdmulhq_n_s16(v1585, 10045); 1740 int16x8_t v1586 = vaddq_s16(v1586_tmp, v1585); 1741 int16x8_t v1587 = vaddq_s16(v1584, v1586); 1742 int16x8_t v1588 = vqrdmulhq_n_s16(v1587, 19705); 1743 int16x8_t v1589 = vaddq_s16(v1583, v1588); 1744 int16x8_t v1590 = vsubq_s16(v186, v193); 1745 int16x8_t v1591 = vsubq_s16(v202, v210); 1746 int16x8_t v1592_tmp = vqrdmulhq_n_s16(v1591, 10045); 1747 int16x8_t v1592 = vaddq_s16(v1592_tmp, v1591); 1748 int16x8_t v1593 = vaddq_s16(v1590, v1592); 1749 int16x8_t v1594 = vsubq_s16(v215, v218); 1750 int16x8_t v1595 = vsubq_s16(v221, v227); 1751 int16x8_t v1596_tmp = vqrdmulhq_n_s16(v1595, 10045); 1752 int16x8_t v1596 = vaddq_s16(v1596_tmp, v1595); 1753 int16x8_t v1597 = vaddq_s16(v1594, v1596); 1754 int16x8_t v1598 = vqrdmulhq_n_s16(v1597, 19705); 1755 int16x8_t v1599 = vaddq_s16(v1593, v1598); 1756 int16x8_t v1600 = vqrdmulhq_n_s16(v1599, 17121); 1757 int16x8_t v1601 = vaddq_s16(v1589, v1600); 1758 int16x8_t v1602 = vsubq_s16(v236, v239); 1759 int16x8_t v1603 = vsubq_s16(v244, v248); 1760 int16x8_t v1604_tmp = vqrdmulhq_n_s16(v1603, 10045); 1761 int16x8_t v1604 = vaddq_s16(v1604_tmp, v1603); 1762 int16x8_t v1605 = vaddq_s16(v1602, v1604); 1763 int16x8_t v1606 = vsubq_s16(v255, v262); 1764 int16x8_t v1607 = vsubq_s16(v265, v270); 1765 int16x8_t v1608_tmp = vqrdmulhq_n_s16(v1607, 10045); 1766 int16x8_t v1608 = vaddq_s16(v1608_tmp, v1607); 1767 int16x8_t v1609 = vaddq_s16(v1606, v1608); 1768 int16x8_t v1610 = vqrdmulhq_n_s16(v1609, 19705); 1769 int16x8_t v1611 = vaddq_s16(v1605, v1610); 1770 int16x8_t v1612 = vsubq_s16(v277, v280); 1771 int16x8_t v1613 = vsubq_s16(v285, v289); 1772 int16x8_t v1614_tmp = vqrdmulhq_n_s16(v1613, 10045); 1773 int16x8_t v1614 = vaddq_s16(v1614_tmp, v1613); 1774 int16x8_t v1615 = vaddq_s16(v1612, v1614); 1775 int16x8_t v1616 = vsubq_s16(v294, v297); 1776 int16x8_t v1617 = vsubq_s16(v300, v306); 1777 int16x8_t v1618_tmp = vqrdmulhq_n_s16(v1617, 10045); 1778 int16x8_t v1618 = vaddq_s16(v1618_tmp, v1617); 1779 int16x8_t v1619 = vaddq_s16(v1616, v1618); 1780 int16x8_t v1620 = vqrdmulhq_n_s16(v1619, 19705); 1781 int16x8_t v1621 = vaddq_s16(v1615, v1620); 1782 int16x8_t v1622 = vqrdmulhq_n_s16(v1621, 17121); 1783 int16x8_t v1623 = vaddq_s16(v1611, v1622); 1784 int16x8_t v1624 = vqrdmulhq_n_s16(v1623, 16563); 1785 int16x8_t v1625 = vaddq_s16(v1601, v1624); 1786 int16x8_t v1626 = vqrdmulhq_n_s16(v1625, 16429); 1787 int16x8_t v1627 = vaddq_s16(v1579, v1626); 1788 int16x8_t v1628 = vsubq_s16(v319, v322); 1789 int16x8_t v1629 = vsubq_s16(v327, v331); 1790 int16x8_t v1630_tmp = vqrdmulhq_n_s16(v1629, 10045); 1791 int16x8_t v1630 = vaddq_s16(v1630_tmp, v1629); 1792 int16x8_t v1631 = vaddq_s16(v1628, v1630); 1793 int16x8_t v1632 = vsubq_s16(v338, v345); 1794 int16x8_t v1633 = vsubq_s16(v348, v353); 1795 int16x8_t v1634_tmp = vqrdmulhq_n_s16(v1633, 10045); 1796 int16x8_t v1634 = vaddq_s16(v1634_tmp, v1633); 1797 int16x8_t v1635 = vaddq_s16(v1632, v1634); 1798 int16x8_t v1636 = vqrdmulhq_n_s16(v1635, 19705); 1799 int16x8_t v1637 = vaddq_s16(v1631, v1636); 1800 int16x8_t v1638 = vsubq_s16(v362, v369); 1801 int16x8_t v1639 = vsubq_s16(v378, v386); 1802 int16x8_t v1640_tmp = vqrdmulhq_n_s16(v1639, 10045); 1803 int16x8_t v1640 = vaddq_s16(v1640_tmp, v1639); 1804 int16x8_t v1641 = vaddq_s16(v1638, v1640); 1805 int16x8_t v1642 = vsubq_s16(v391, v394); 1806 int16x8_t v1643 = vsubq_s16(v397, v403); 1807 int16x8_t v1644_tmp = vqrdmulhq_n_s16(v1643, 10045); 1808 int16x8_t v1644 = vaddq_s16(v1644_tmp, v1643); 1809 int16x8_t v1645 = vaddq_s16(v1642, v1644); 1810 int16x8_t v1646 = vqrdmulhq_n_s16(v1645, 19705); 1811 int16x8_t v1647 = vaddq_s16(v1641, v1646); 1812 int16x8_t v1648 = vqrdmulhq_n_s16(v1647, 17121); 1813 int16x8_t v1649 = vaddq_s16(v1637, v1648); 1814 int16x8_t v1650 = vsubq_s16(v414, v421); 1815 int16x8_t v1651 = vsubq_s16(v430, v438); 1816 int16x8_t v1652_tmp = vqrdmulhq_n_s16(v1651, 10045); 1817 int16x8_t v1652 = vaddq_s16(v1652_tmp, v1651); 1818 int16x8_t v1653 = vaddq_s16(v1650, v1652); 1819 int16x8_t v1654 = vsubq_s16(v449, v464); 1820 int16x8_t v1655 = vsubq_s16(v467, v476); 1821 int16x8_t v1656_tmp = vqrdmulhq_n_s16(v1655, 10045); 1822 int16x8_t v1656 = vaddq_s16(v1656_tmp, v1655); 1823 int16x8_t v1657 = vaddq_s16(v1654, v1656); 1824 int16x8_t v1658 = vqrdmulhq_n_s16(v1657, 19705); 1825 int16x8_t v1659 = vaddq_s16(v1653, v1658); 1826 int16x8_t v1660 = vsubq_s16(v483, v486); 1827 int16x8_t v1661 = vsubq_s16(v491, v495); 1828 int16x8_t v1662_tmp = vqrdmulhq_n_s16(v1661, 10045); 1829 int16x8_t v1662 = vaddq_s16(v1662_tmp, v1661); 1830 int16x8_t v1663 = vaddq_s16(v1660, v1662); 1831 int16x8_t v1664 = vsubq_s16(v500, v503); 1832 int16x8_t v1665 = vsubq_s16(v506, v513); 1833 int16x8_t v1666_tmp = vqrdmulhq_n_s16(v1665, 10045); 1834 int16x8_t v1666 = vaddq_s16(v1666_tmp, v1665); 1835 int16x8_t v1667 = vaddq_s16(v1664, v1666); 1836 int16x8_t v1668 = vqrdmulhq_n_s16(v1667, 19705); 1837 int16x8_t v1669 = vaddq_s16(v1663, v1668); 1838 int16x8_t v1670 = vqrdmulhq_n_s16(v1669, 17121); 1839 int16x8_t v1671 = vaddq_s16(v1659, v1670); 1840 int16x8_t v1672 = vqrdmulhq_n_s16(v1671, 16563); 1841 int16x8_t v1673 = vaddq_s16(v1649, v1672); 1842 int16x8_t v1674 = vsubq_s16(v524, v527); 1843 int16x8_t v1675 = vsubq_s16(v532, v536); 1844 int16x8_t v1676_tmp = vqrdmulhq_n_s16(v1675, 10045); 1845 int16x8_t v1676 = vaddq_s16(v1676_tmp, v1675); 1846 int16x8_t v1677 = vaddq_s16(v1674, v1676); 1847 int16x8_t v1678 = vsubq_s16(v543, v550); 1848 int16x8_t v1679 = vsubq_s16(v553, v558); 1849 int16x8_t v1680_tmp = vqrdmulhq_n_s16(v1679, 10045); 1850 int16x8_t v1680 = vaddq_s16(v1680_tmp, v1679); 1851 int16x8_t v1681 = vaddq_s16(v1678, v1680); 1852 int16x8_t v1682 = vqrdmulhq_n_s16(v1681, 19705); 1853 int16x8_t v1683 = vaddq_s16(v1677, v1682); 1854 int16x8_t v1684 = vsubq_s16(v567, v574); 1855 int16x8_t v1685 = vsubq_s16(v583, v591); 1856 int16x8_t v1686_tmp = vqrdmulhq_n_s16(v1685, 10045); 1857 int16x8_t v1686 = vaddq_s16(v1686_tmp, v1685); 1858 int16x8_t v1687 = vaddq_s16(v1684, v1686); 1859 int16x8_t v1688 = vsubq_s16(v596, v599); 1860 int16x8_t v1689 = vsubq_s16(v602, v608); 1861 int16x8_t v1690_tmp = vqrdmulhq_n_s16(v1689, 10045); 1862 int16x8_t v1690 = vaddq_s16(v1690_tmp, v1689); 1863 int16x8_t v1691 = vaddq_s16(v1688, v1690); 1864 int16x8_t v1692 = vqrdmulhq_n_s16(v1691, 19705); 1865 int16x8_t v1693 = vaddq_s16(v1687, v1692); 1866 int16x8_t v1694 = vqrdmulhq_n_s16(v1693, 17121); 1867 int16x8_t v1695 = vaddq_s16(v1683, v1694); 1868 int16x8_t v1696 = vsubq_s16(v617, v620); 1869 int16x8_t v1697 = vsubq_s16(v625, v629); 1870 int16x8_t v1698_tmp = vqrdmulhq_n_s16(v1697, 10045); 1871 int16x8_t v1698 = vaddq_s16(v1698_tmp, v1697); 1872 int16x8_t v1699 = vaddq_s16(v1696, v1698); 1873 int16x8_t v1700 = vsubq_s16(v636, v643); 1874 int16x8_t v1701 = vsubq_s16(v646, v651); 1875 int16x8_t v1702_tmp = vqrdmulhq_n_s16(v1701, 10045); 1876 int16x8_t v1702 = vaddq_s16(v1702_tmp, v1701); 1877 int16x8_t v1703 = vaddq_s16(v1700, v1702); 1878 int16x8_t v1704 = vqrdmulhq_n_s16(v1703, 19705); 1879 int16x8_t v1705 = vaddq_s16(v1699, v1704); 1880 int16x8_t v1706 = vsubq_s16(v658, v661); 1881 int16x8_t v1707 = vsubq_s16(v666, v670); 1882 int16x8_t v1708_tmp = vqrdmulhq_n_s16(v1707, 10045); 1883 int16x8_t v1708 = vaddq_s16(v1708_tmp, v1707); 1884 int16x8_t v1709 = vaddq_s16(v1706, v1708); 1885 int16x8_t v1710 = vsubq_s16(v675, v678); 1886 int16x8_t v1711 = vsubq_s16(v681, v688); 1887 int16x8_t v1712_tmp = vqrdmulhq_n_s16(v1711, 10045); 1888 int16x8_t v1712 = vaddq_s16(v1712_tmp, v1711); 1889 int16x8_t v1713 = vaddq_s16(v1710, v1712); 1890 int16x8_t v1714 = vqrdmulhq_n_s16(v1713, 19705); 1891 int16x8_t v1715 = vaddq_s16(v1709, v1714); 1892 int16x8_t v1716 = vqrdmulhq_n_s16(v1715, 17121); 1893 int16x8_t v1717 = vaddq_s16(v1705, v1716); 1894 int16x8_t v1718 = vqrdmulhq_n_s16(v1717, 16563); 1895 int16x8_t v1719 = vaddq_s16(v1695, v1718); 1896 int16x8_t v1720 = vqrdmulhq_n_s16(v1719, 16429); 1897 int16x8_t v1721 = vaddq_s16(v1673, v1720); 1898 int16x8_t v1722 = vqrdmulhq_n_s16(v1721, 16395); 1899 int16x8_t v1723 = vaddq_s16(v1627, v1722); 1900 int16x8_t v1724 = vsubq_s16(v703, v706); 1901 int16x8_t v1725 = vsubq_s16(v711, v715); 1902 int16x8_t v1726_tmp = vqrdmulhq_n_s16(v1725, 10045); 1903 int16x8_t v1726 = vaddq_s16(v1726_tmp, v1725); 1904 int16x8_t v1727 = vaddq_s16(v1724, v1726); 1905 int16x8_t v1728 = vsubq_s16(v722, v729); 1906 int16x8_t v1729 = vsubq_s16(v732, v737); 1907 int16x8_t v1730_tmp = vqrdmulhq_n_s16(v1729, 10045); 1908 int16x8_t v1730 = vaddq_s16(v1730_tmp, v1729); 1909 int16x8_t v1731 = vaddq_s16(v1728, v1730); 1910 int16x8_t v1732 = vqrdmulhq_n_s16(v1731, 19705); 1911 int16x8_t v1733 = vaddq_s16(v1727, v1732); 1912 int16x8_t v1734 = vsubq_s16(v746, v753); 1913 int16x8_t v1735 = vsubq_s16(v762, v770); 1914 int16x8_t v1736_tmp = vqrdmulhq_n_s16(v1735, 10045); 1915 int16x8_t v1736 = vaddq_s16(v1736_tmp, v1735); 1916 int16x8_t v1737 = vaddq_s16(v1734, v1736); 1917 int16x8_t v1738 = vsubq_s16(v775, v778); 1918 int16x8_t v1739 = vsubq_s16(v781, v787); 1919 int16x8_t v1740_tmp = vqrdmulhq_n_s16(v1739, 10045); 1920 int16x8_t v1740 = vaddq_s16(v1740_tmp, v1739); 1921 int16x8_t v1741 = vaddq_s16(v1738, v1740); 1922 int16x8_t v1742 = vqrdmulhq_n_s16(v1741, 19705); 1923 int16x8_t v1743 = vaddq_s16(v1737, v1742); 1924 int16x8_t v1744 = vqrdmulhq_n_s16(v1743, 17121); 1925 int16x8_t v1745 = vaddq_s16(v1733, v1744); 1926 int16x8_t v1746 = vsubq_s16(v798, v805); 1927 int16x8_t v1747 = vsubq_s16(v814, v822); 1928 int16x8_t v1748_tmp = vqrdmulhq_n_s16(v1747, 10045); 1929 int16x8_t v1748 = vaddq_s16(v1748_tmp, v1747); 1930 int16x8_t v1749 = vaddq_s16(v1746, v1748); 1931 int16x8_t v1750 = vsubq_s16(v833, v848); 1932 int16x8_t v1751 = vsubq_s16(v851, v860); 1933 int16x8_t v1752_tmp = vqrdmulhq_n_s16(v1751, 10045); 1934 int16x8_t v1752 = vaddq_s16(v1752_tmp, v1751); 1935 int16x8_t v1753 = vaddq_s16(v1750, v1752); 1936 int16x8_t v1754 = vqrdmulhq_n_s16(v1753, 19705); 1937 int16x8_t v1755 = vaddq_s16(v1749, v1754); 1938 int16x8_t v1756 = vsubq_s16(v867, v870); 1939 int16x8_t v1757 = vsubq_s16(v875, v879); 1940 int16x8_t v1758_tmp = vqrdmulhq_n_s16(v1757, 10045); 1941 int16x8_t v1758 = vaddq_s16(v1758_tmp, v1757); 1942 int16x8_t v1759 = vaddq_s16(v1756, v1758); 1943 int16x8_t v1760 = vsubq_s16(v884, v887); 1944 int16x8_t v1761 = vsubq_s16(v890, v897); 1945 int16x8_t v1762_tmp = vqrdmulhq_n_s16(v1761, 10045); 1946 int16x8_t v1762 = vaddq_s16(v1762_tmp, v1761); 1947 int16x8_t v1763 = vaddq_s16(v1760, v1762); 1948 int16x8_t v1764 = vqrdmulhq_n_s16(v1763, 19705); 1949 int16x8_t v1765 = vaddq_s16(v1759, v1764); 1950 int16x8_t v1766 = vqrdmulhq_n_s16(v1765, 17121); 1951 int16x8_t v1767 = vaddq_s16(v1755, v1766); 1952 int16x8_t v1768 = vqrdmulhq_n_s16(v1767, 16563); 1953 int16x8_t v1769 = vaddq_s16(v1745, v1768); 1954 int16x8_t v1770 = vsubq_s16(v910, v917); 1955 int16x8_t v1771 = vsubq_s16(v926, v934); 1956 int16x8_t v1772_tmp = vqrdmulhq_n_s16(v1771, 10045); 1957 int16x8_t v1772 = vaddq_s16(v1772_tmp, v1771); 1958 int16x8_t v1773 = vaddq_s16(v1770, v1772); 1959 int16x8_t v1774 = vsubq_s16(v945, v960); 1960 int16x8_t v1775 = vsubq_s16(v963, v972); 1961 int16x8_t v1776_tmp = vqrdmulhq_n_s16(v1775, 10045); 1962 int16x8_t v1776 = vaddq_s16(v1776_tmp, v1775); 1963 int16x8_t v1777 = vaddq_s16(v1774, v1776); 1964 int16x8_t v1778 = vqrdmulhq_n_s16(v1777, 19705); 1965 int16x8_t v1779 = vaddq_s16(v1773, v1778); 1966 int16x8_t v1780 = vsubq_s16(v985, v1000); 1967 int16x8_t v1781 = vsubq_s16(v1017, v1033); 1968 int16x8_t v1782_tmp = vqrdmulhq_n_s16(v1781, 10045); 1969 int16x8_t v1782 = vaddq_s16(v1782_tmp, v1781); 1970 int16x8_t v1783 = vaddq_s16(v1780, v1782); 1971 int16x8_t v1784 = vsubq_s16(v1038, v1041); 1972 int16x8_t v1785 = vsubq_s16(v1044, v1054); 1973 int16x8_t v1786_tmp = vqrdmulhq_n_s16(v1785, 10045); 1974 int16x8_t v1786 = vaddq_s16(v1786_tmp, v1785); 1975 int16x8_t v1787 = vaddq_s16(v1784, v1786); 1976 int16x8_t v1788 = vqrdmulhq_n_s16(v1787, 19705); 1977 int16x8_t v1789 = vaddq_s16(v1783, v1788); 1978 int16x8_t v1790 = vqrdmulhq_n_s16(v1789, 17121); 1979 int16x8_t v1791 = vaddq_s16(v1779, v1790); 1980 int16x8_t v1792 = vsubq_s16(v1063, v1066); 1981 int16x8_t v1793 = vsubq_s16(v1071, v1075); 1982 int16x8_t v1794_tmp = vqrdmulhq_n_s16(v1793, 10045); 1983 int16x8_t v1794 = vaddq_s16(v1794_tmp, v1793); 1984 int16x8_t v1795 = vaddq_s16(v1792, v1794); 1985 int16x8_t v1796 = vsubq_s16(v1082, v1089); 1986 int16x8_t v1797 = vsubq_s16(v1092, v1097); 1987 int16x8_t v1798_tmp = vqrdmulhq_n_s16(v1797, 10045); 1988 int16x8_t v1798 = vaddq_s16(v1798_tmp, v1797); 1989 int16x8_t v1799 = vaddq_s16(v1796, v1798); 1990 int16x8_t v1800 = vqrdmulhq_n_s16(v1799, 19705); 1991 int16x8_t v1801 = vaddq_s16(v1795, v1800); 1992 int16x8_t v1802 = vsubq_s16(v1104, v1107); 1993 int16x8_t v1803 = vsubq_s16(v1112, v1116); 1994 int16x8_t v1804_tmp = vqrdmulhq_n_s16(v1803, 10045); 1995 int16x8_t v1804 = vaddq_s16(v1804_tmp, v1803); 1996 int16x8_t v1805 = vaddq_s16(v1802, v1804); 1997 int16x8_t v1806 = vsubq_s16(v1121, v1124); 1998 int16x8_t v1807 = vsubq_s16(v1127, v1135); 1999 int16x8_t v1808_tmp = vqrdmulhq_n_s16(v1807, 10045); 2000 int16x8_t v1808 = vaddq_s16(v1808_tmp, v1807); 2001 int16x8_t v1809 = vaddq_s16(v1806, v1808); 2002 int16x8_t v1810 = vqrdmulhq_n_s16(v1809, 19705); 2003 int16x8_t v1811 = vaddq_s16(v1805, v1810); 2004 int16x8_t v1812 = vqrdmulhq_n_s16(v1811, 17121); 2005 int16x8_t v1813 = vaddq_s16(v1801, v1812); 2006 int16x8_t v1814 = vqrdmulhq_n_s16(v1813, 16563); 2007 int16x8_t v1815 = vaddq_s16(v1791, v1814); 2008 int16x8_t v1816 = vqrdmulhq_n_s16(v1815, 16429); 2009 int16x8_t v1817 = vaddq_s16(v1769, v1816); 2010 int16x8_t v1818 = vsubq_s16(v1148, v1151); 2011 int16x8_t v1819 = vsubq_s16(v1156, v1160); 2012 int16x8_t v1820_tmp = vqrdmulhq_n_s16(v1819, 10045); 2013 int16x8_t v1820 = vaddq_s16(v1820_tmp, v1819); 2014 int16x8_t v1821 = vaddq_s16(v1818, v1820); 2015 int16x8_t v1822 = vsubq_s16(v1167, v1174); 2016 int16x8_t v1823 = vsubq_s16(v1177, v1182); 2017 int16x8_t v1824_tmp = vqrdmulhq_n_s16(v1823, 10045); 2018 int16x8_t v1824 = vaddq_s16(v1824_tmp, v1823); 2019 int16x8_t v1825 = vaddq_s16(v1822, v1824); 2020 int16x8_t v1826 = vqrdmulhq_n_s16(v1825, 19705); 2021 int16x8_t v1827 = vaddq_s16(v1821, v1826); 2022 int16x8_t v1828 = vsubq_s16(v1191, v1198); 2023 int16x8_t v1829 = vsubq_s16(v1207, v1215); 2024 int16x8_t v1830_tmp = vqrdmulhq_n_s16(v1829, 10045); 2025 int16x8_t v1830 = vaddq_s16(v1830_tmp, v1829); 2026 int16x8_t v1831 = vaddq_s16(v1828, v1830); 2027 int16x8_t v1832 = vsubq_s16(v1220, v1223); 2028 int16x8_t v1833 = vsubq_s16(v1226, v1232); 2029 int16x8_t v1834_tmp = vqrdmulhq_n_s16(v1833, 10045); 2030 int16x8_t v1834 = vaddq_s16(v1834_tmp, v1833); 2031 int16x8_t v1835 = vaddq_s16(v1832, v1834); 2032 int16x8_t v1836 = vqrdmulhq_n_s16(v1835, 19705); 2033 int16x8_t v1837 = vaddq_s16(v1831, v1836); 2034 int16x8_t v1838 = vqrdmulhq_n_s16(v1837, 17121); 2035 int16x8_t v1839 = vaddq_s16(v1827, v1838); 2036 int16x8_t v1840 = vsubq_s16(v1243, v1250); 2037 int16x8_t v1841 = vsubq_s16(v1259, v1267); 2038 int16x8_t v1842_tmp = vqrdmulhq_n_s16(v1841, 10045); 2039 int16x8_t v1842 = vaddq_s16(v1842_tmp, v1841); 2040 int16x8_t v1843 = vaddq_s16(v1840, v1842); 2041 int16x8_t v1844 = vsubq_s16(v1278, v1293); 2042 int16x8_t v1845 = vsubq_s16(v1296, v1305); 2043 int16x8_t v1846_tmp = vqrdmulhq_n_s16(v1845, 10045); 2044 int16x8_t v1846 = vaddq_s16(v1846_tmp, v1845); 2045 int16x8_t v1847 = vaddq_s16(v1844, v1846); 2046 int16x8_t v1848 = vqrdmulhq_n_s16(v1847, 19705); 2047 int16x8_t v1849 = vaddq_s16(v1843, v1848); 2048 int16x8_t v1850 = vsubq_s16(v1312, v1315); 2049 int16x8_t v1851 = vsubq_s16(v1320, v1324); 2050 int16x8_t v1852_tmp = vqrdmulhq_n_s16(v1851, 10045); 2051 int16x8_t v1852 = vaddq_s16(v1852_tmp, v1851); 2052 int16x8_t v1853 = vaddq_s16(v1850, v1852); 2053 int16x8_t v1854 = vsubq_s16(v1329, v1332); 2054 int16x8_t v1855 = vsubq_s16(v1335, v1342); 2055 int16x8_t v1856_tmp = vqrdmulhq_n_s16(v1855, 10045); 2056 int16x8_t v1856 = vaddq_s16(v1856_tmp, v1855); 2057 int16x8_t v1857 = vaddq_s16(v1854, v1856); 2058 int16x8_t v1858 = vqrdmulhq_n_s16(v1857, 19705); 2059 int16x8_t v1859 = vaddq_s16(v1853, v1858); 2060 int16x8_t v1860 = vqrdmulhq_n_s16(v1859, 17121); 2061 int16x8_t v1861 = vaddq_s16(v1849, v1860); 2062 int16x8_t v1862 = vqrdmulhq_n_s16(v1861, 16563); 2063 int16x8_t v1863 = vaddq_s16(v1839, v1862); 2064 int16x8_t v1864 = vsubq_s16(v1353, v1356); 2065 int16x8_t v1865 = vsubq_s16(v1361, v1365); 2066 int16x8_t v1866_tmp = vqrdmulhq_n_s16(v1865, 10045); 2067 int16x8_t v1866 = vaddq_s16(v1866_tmp, v1865); 2068 int16x8_t v1867 = vaddq_s16(v1864, v1866); 2069 int16x8_t v1868 = vsubq_s16(v1372, v1379); 2070 int16x8_t v1869 = vsubq_s16(v1382, v1387); 2071 int16x8_t v1870_tmp = vqrdmulhq_n_s16(v1869, 10045); 2072 int16x8_t v1870 = vaddq_s16(v1870_tmp, v1869); 2073 int16x8_t v1871 = vaddq_s16(v1868, v1870); 2074 int16x8_t v1872 = vqrdmulhq_n_s16(v1871, 19705); 2075 int16x8_t v1873 = vaddq_s16(v1867, v1872); 2076 int16x8_t v1874 = vsubq_s16(v1396, v1403); 2077 int16x8_t v1875 = vsubq_s16(v1412, v1420); 2078 int16x8_t v1876_tmp = vqrdmulhq_n_s16(v1875, 10045); 2079 int16x8_t v1876 = vaddq_s16(v1876_tmp, v1875); 2080 int16x8_t v1877 = vaddq_s16(v1874, v1876); 2081 int16x8_t v1878 = vsubq_s16(v1425, v1428); 2082 int16x8_t v1879 = vsubq_s16(v1431, v1437); 2083 int16x8_t v1880_tmp = vqrdmulhq_n_s16(v1879, 10045); 2084 int16x8_t v1880 = vaddq_s16(v1880_tmp, v1879); 2085 int16x8_t v1881 = vaddq_s16(v1878, v1880); 2086 int16x8_t v1882 = vqrdmulhq_n_s16(v1881, 19705); 2087 int16x8_t v1883 = vaddq_s16(v1877, v1882); 2088 int16x8_t v1884 = vqrdmulhq_n_s16(v1883, 17121); 2089 int16x8_t v1885 = vaddq_s16(v1873, v1884); 2090 int16x8_t v1886 = vsubq_s16(v1446, v1449); 2091 int16x8_t v1887 = vsubq_s16(v1454, v1458); 2092 int16x8_t v1888_tmp = vqrdmulhq_n_s16(v1887, 10045); 2093 int16x8_t v1888 = vaddq_s16(v1888_tmp, v1887); 2094 int16x8_t v1889 = vaddq_s16(v1886, v1888); 2095 int16x8_t v1890 = vsubq_s16(v1465, v1472); 2096 int16x8_t v1891 = vsubq_s16(v1475, v1480); 2097 int16x8_t v1892_tmp = vqrdmulhq_n_s16(v1891, 10045); 2098 int16x8_t v1892 = vaddq_s16(v1892_tmp, v1891); 2099 int16x8_t v1893 = vaddq_s16(v1890, v1892); 2100 int16x8_t v1894 = vqrdmulhq_n_s16(v1893, 19705); 2101 int16x8_t v1895 = vaddq_s16(v1889, v1894); 2102 int16x8_t v1896 = vsubq_s16(v1487, v1490); 2103 int16x8_t v1897 = vsubq_s16(v1495, v1499); 2104 int16x8_t v1898_tmp = vqrdmulhq_n_s16(v1897, 10045); 2105 int16x8_t v1898 = vaddq_s16(v1898_tmp, v1897); 2106 int16x8_t v1899 = vaddq_s16(v1896, v1898); 2107 int16x8_t v1900 = vsubq_s16(v1504, v1507); 2108 int16x8_t v1901 = vsubq_s16(v1510, v1518); 2109 int16x8_t v1902_tmp = vqrdmulhq_n_s16(v1901, 10045); 2110 int16x8_t v1902 = vaddq_s16(v1902_tmp, v1901); 2111 int16x8_t v1903 = vaddq_s16(v1900, v1902); 2112 int16x8_t v1904 = vqrdmulhq_n_s16(v1903, 19705); 2113 int16x8_t v1905 = vaddq_s16(v1899, v1904); 2114 int16x8_t v1906 = vqrdmulhq_n_s16(v1905, 17121); 2115 int16x8_t v1907 = vaddq_s16(v1895, v1906); 2116 int16x8_t v1908 = vqrdmulhq_n_s16(v1907, 16563); 2117 int16x8_t v1909 = vaddq_s16(v1885, v1908); 2118 int16x8_t v1910 = vqrdmulhq_n_s16(v1909, 16429); 2119 int16x8_t v1911 = vaddq_s16(v1863, v1910); 2120 int16x8_t v1912 = vqrdmulhq_n_s16(v1911, 16395); 2121 int16x8_t v1913 = vaddq_s16(v1817, v1912); 2122 int16x8_t v1914 = vqrdmulhq_n_s16(v1913, 16387); 2123 int16x8_t v1915 = vaddq_s16(v1723, v1914); 2124 int16x8_t v1916 = vsubq_s16(v1534, v1536); 2125 int16x8_t v1917 = vsubq_s16(v1538, v1540); 2126 int16x8_t v1918 = vqrdmulhq_n_s16(v1917, 29490); 2127 int16x8_t v1919 = vaddq_s16(v1916, v1918); 2128 int16x8_t v1920 = vsubq_s16(v1544, v1546); 2129 int16x8_t v1921 = vsubq_s16(v1548, v1550); 2130 int16x8_t v1922 = vqrdmulhq_n_s16(v1921, 29490); 2131 int16x8_t v1923 = vaddq_s16(v1920, v1922); 2132 int16x8_t v1924 = vqrdmulhq_n_s16(v1923, 18578); 2133 int16x8_t v1925 = vaddq_s16(v1919, v1924); 2134 int16x8_t v1926 = vsubq_s16(v1556, v1558); 2135 int16x8_t v1927 = vsubq_s16(v1560, v1562); 2136 int16x8_t v1928 = vqrdmulhq_n_s16(v1927, 29490); 2137 int16x8_t v1929 = vaddq_s16(v1926, v1928); 2138 int16x8_t v1930 = vsubq_s16(v1566, v1568); 2139 int16x8_t v1931 = vsubq_s16(v1570, v1572); 2140 int16x8_t v1932 = vqrdmulhq_n_s16(v1931, 29490); 2141 int16x8_t v1933 = vaddq_s16(v1930, v1932); 2142 int16x8_t v1934 = vqrdmulhq_n_s16(v1933, 18578); 2143 int16x8_t v1935 = vaddq_s16(v1929, v1934); 2144 int16x8_t v1936 = vqrdmulhq_n_s16(v1935, 16890); 2145 int16x8_t v1937 = vaddq_s16(v1925, v1936); 2146 int16x8_t v1938 = vsubq_s16(v1580, v1582); 2147 int16x8_t v1939 = vsubq_s16(v1584, v1586); 2148 int16x8_t v1940 = vqrdmulhq_n_s16(v1939, 29490); 2149 int16x8_t v1941 = vaddq_s16(v1938, v1940); 2150 int16x8_t v1942 = vsubq_s16(v1590, v1592); 2151 int16x8_t v1943 = vsubq_s16(v1594, v1596); 2152 int16x8_t v1944 = vqrdmulhq_n_s16(v1943, 29490); 2153 int16x8_t v1945 = vaddq_s16(v1942, v1944); 2154 int16x8_t v1946 = vqrdmulhq_n_s16(v1945, 18578); 2155 int16x8_t v1947 = vaddq_s16(v1941, v1946); 2156 int16x8_t v1948 = vsubq_s16(v1602, v1604); 2157 int16x8_t v1949 = vsubq_s16(v1606, v1608); 2158 int16x8_t v1950 = vqrdmulhq_n_s16(v1949, 29490); 2159 int16x8_t v1951 = vaddq_s16(v1948, v1950); 2160 int16x8_t v1952 = vsubq_s16(v1612, v1614); 2161 int16x8_t v1953 = vsubq_s16(v1616, v1618); 2162 int16x8_t v1954 = vqrdmulhq_n_s16(v1953, 29490); 2163 int16x8_t v1955 = vaddq_s16(v1952, v1954); 2164 int16x8_t v1956 = vqrdmulhq_n_s16(v1955, 18578); 2165 int16x8_t v1957 = vaddq_s16(v1951, v1956); 2166 int16x8_t v1958 = vqrdmulhq_n_s16(v1957, 16890); 2167 int16x8_t v1959 = vaddq_s16(v1947, v1958); 2168 int16x8_t v1960 = vqrdmulhq_n_s16(v1959, 16508); 2169 int16x8_t v1961 = vaddq_s16(v1937, v1960); 2170 int16x8_t v1962 = vsubq_s16(v1628, v1630); 2171 int16x8_t v1963 = vsubq_s16(v1632, v1634); 2172 int16x8_t v1964 = vqrdmulhq_n_s16(v1963, 29490); 2173 int16x8_t v1965 = vaddq_s16(v1962, v1964); 2174 int16x8_t v1966 = vsubq_s16(v1638, v1640); 2175 int16x8_t v1967 = vsubq_s16(v1642, v1644); 2176 int16x8_t v1968 = vqrdmulhq_n_s16(v1967, 29490); 2177 int16x8_t v1969 = vaddq_s16(v1966, v1968); 2178 int16x8_t v1970 = vqrdmulhq_n_s16(v1969, 18578); 2179 int16x8_t v1971 = vaddq_s16(v1965, v1970); 2180 int16x8_t v1972 = vsubq_s16(v1650, v1652); 2181 int16x8_t v1973 = vsubq_s16(v1654, v1656); 2182 int16x8_t v1974 = vqrdmulhq_n_s16(v1973, 29490); 2183 int16x8_t v1975 = vaddq_s16(v1972, v1974); 2184 int16x8_t v1976 = vsubq_s16(v1660, v1662); 2185 int16x8_t v1977 = vsubq_s16(v1664, v1666); 2186 int16x8_t v1978 = vqrdmulhq_n_s16(v1977, 29490); 2187 int16x8_t v1979 = vaddq_s16(v1976, v1978); 2188 int16x8_t v1980 = vqrdmulhq_n_s16(v1979, 18578); 2189 int16x8_t v1981 = vaddq_s16(v1975, v1980); 2190 int16x8_t v1982 = vqrdmulhq_n_s16(v1981, 16890); 2191 int16x8_t v1983 = vaddq_s16(v1971, v1982); 2192 int16x8_t v1984 = vsubq_s16(v1674, v1676); 2193 int16x8_t v1985 = vsubq_s16(v1678, v1680); 2194 int16x8_t v1986 = vqrdmulhq_n_s16(v1985, 29490); 2195 int16x8_t v1987 = vaddq_s16(v1984, v1986); 2196 int16x8_t v1988 = vsubq_s16(v1684, v1686); 2197 int16x8_t v1989 = vsubq_s16(v1688, v1690); 2198 int16x8_t v1990 = vqrdmulhq_n_s16(v1989, 29490); 2199 int16x8_t v1991 = vaddq_s16(v1988, v1990); 2200 int16x8_t v1992 = vqrdmulhq_n_s16(v1991, 18578); 2201 int16x8_t v1993 = vaddq_s16(v1987, v1992); 2202 int16x8_t v1994 = vsubq_s16(v1696, v1698); 2203 int16x8_t v1995 = vsubq_s16(v1700, v1702); 2204 int16x8_t v1996 = vqrdmulhq_n_s16(v1995, 29490); 2205 int16x8_t v1997 = vaddq_s16(v1994, v1996); 2206 int16x8_t v1998 = vsubq_s16(v1706, v1708); 2207 int16x8_t v1999 = vsubq_s16(v1710, v1712); 2208 int16x8_t v2000 = vqrdmulhq_n_s16(v1999, 29490); 2209 int16x8_t v2001 = vaddq_s16(v1998, v2000); 2210 int16x8_t v2002 = vqrdmulhq_n_s16(v2001, 18578); 2211 int16x8_t v2003 = vaddq_s16(v1997, v2002); 2212 int16x8_t v2004 = vqrdmulhq_n_s16(v2003, 16890); 2213 int16x8_t v2005 = vaddq_s16(v1993, v2004); 2214 int16x8_t v2006 = vqrdmulhq_n_s16(v2005, 16508); 2215 int16x8_t v2007 = vaddq_s16(v1983, v2006); 2216 int16x8_t v2008 = vqrdmulhq_n_s16(v2007, 16415); 2217 int16x8_t v2009 = vaddq_s16(v1961, v2008); 2218 int16x8_t v2010 = vsubq_s16(v1724, v1726); 2219 int16x8_t v2011 = vsubq_s16(v1728, v1730); 2220 int16x8_t v2012 = vqrdmulhq_n_s16(v2011, 29490); 2221 int16x8_t v2013 = vaddq_s16(v2010, v2012); 2222 int16x8_t v2014 = vsubq_s16(v1734, v1736); 2223 int16x8_t v2015 = vsubq_s16(v1738, v1740); 2224 int16x8_t v2016 = vqrdmulhq_n_s16(v2015, 29490); 2225 int16x8_t v2017 = vaddq_s16(v2014, v2016); 2226 int16x8_t v2018 = vqrdmulhq_n_s16(v2017, 18578); 2227 int16x8_t v2019 = vaddq_s16(v2013, v2018); 2228 int16x8_t v2020 = vsubq_s16(v1746, v1748); 2229 int16x8_t v2021 = vsubq_s16(v1750, v1752); 2230 int16x8_t v2022 = vqrdmulhq_n_s16(v2021, 29490); 2231 int16x8_t v2023 = vaddq_s16(v2020, v2022); 2232 int16x8_t v2024 = vsubq_s16(v1756, v1758); 2233 int16x8_t v2025 = vsubq_s16(v1760, v1762); 2234 int16x8_t v2026 = vqrdmulhq_n_s16(v2025, 29490); 2235 int16x8_t v2027 = vaddq_s16(v2024, v2026); 2236 int16x8_t v2028 = vqrdmulhq_n_s16(v2027, 18578); 2237 int16x8_t v2029 = vaddq_s16(v2023, v2028); 2238 int16x8_t v2030 = vqrdmulhq_n_s16(v2029, 16890); 2239 int16x8_t v2031 = vaddq_s16(v2019, v2030); 2240 int16x8_t v2032 = vsubq_s16(v1770, v1772); 2241 int16x8_t v2033 = vsubq_s16(v1774, v1776); 2242 int16x8_t v2034 = vqrdmulhq_n_s16(v2033, 29490); 2243 int16x8_t v2035 = vaddq_s16(v2032, v2034); 2244 int16x8_t v2036 = vsubq_s16(v1780, v1782); 2245 int16x8_t v2037 = vsubq_s16(v1784, v1786); 2246 int16x8_t v2038 = vqrdmulhq_n_s16(v2037, 29490); 2247 int16x8_t v2039 = vaddq_s16(v2036, v2038); 2248 int16x8_t v2040 = vqrdmulhq_n_s16(v2039, 18578); 2249 int16x8_t v2041 = vaddq_s16(v2035, v2040); 2250 int16x8_t v2042 = vsubq_s16(v1792, v1794); 2251 int16x8_t v2043 = vsubq_s16(v1796, v1798); 2252 int16x8_t v2044 = vqrdmulhq_n_s16(v2043, 29490); 2253 int16x8_t v2045 = vaddq_s16(v2042, v2044); 2254 int16x8_t v2046 = vsubq_s16(v1802, v1804); 2255 int16x8_t v2047 = vsubq_s16(v1806, v1808); 2256 int16x8_t v2048 = vqrdmulhq_n_s16(v2047, 29490); 2257 int16x8_t v2049 = vaddq_s16(v2046, v2048); 2258 int16x8_t v2050 = vqrdmulhq_n_s16(v2049, 18578); 2259 int16x8_t v2051 = vaddq_s16(v2045, v2050); 2260 int16x8_t v2052 = vqrdmulhq_n_s16(v2051, 16890); 2261 int16x8_t v2053 = vaddq_s16(v2041, v2052); 2262 int16x8_t v2054 = vqrdmulhq_n_s16(v2053, 16508); 2263 int16x8_t v2055 = vaddq_s16(v2031, v2054); 2264 int16x8_t v2056 = vsubq_s16(v1818, v1820); 2265 int16x8_t v2057 = vsubq_s16(v1822, v1824); 2266 int16x8_t v2058 = vqrdmulhq_n_s16(v2057, 29490); 2267 int16x8_t v2059 = vaddq_s16(v2056, v2058); 2268 int16x8_t v2060 = vsubq_s16(v1828, v1830); 2269 int16x8_t v2061 = vsubq_s16(v1832, v1834); 2270 int16x8_t v2062 = vqrdmulhq_n_s16(v2061, 29490); 2271 int16x8_t v2063 = vaddq_s16(v2060, v2062); 2272 int16x8_t v2064 = vqrdmulhq_n_s16(v2063, 18578); 2273 int16x8_t v2065 = vaddq_s16(v2059, v2064); 2274 int16x8_t v2066 = vsubq_s16(v1840, v1842); 2275 int16x8_t v2067 = vsubq_s16(v1844, v1846); 2276 int16x8_t v2068 = vqrdmulhq_n_s16(v2067, 29490); 2277 int16x8_t v2069 = vaddq_s16(v2066, v2068); 2278 int16x8_t v2070 = vsubq_s16(v1850, v1852); 2279 int16x8_t v2071 = vqrdmulhq_n_s16(v2070, 18578); 2280 int16x8_t v2072 = vsubq_s16(v1854, v1856); 2281 int16x8_t v2073 = vqrdmulhq_n_s16(v2072, 16719); 2282 int16x8_t v2074 = vaddq_s16(v2071, v2073); 2283 int16x8_t v2075 = vaddq_s16(v2069, v2074); 2284 int16x8_t v2076 = vqrdmulhq_n_s16(v2075, 16890); 2285 int16x8_t v2077 = vaddq_s16(v2065, v2076); 2286 int16x8_t v2078 = vsubq_s16(v1864, v1866); 2287 int16x8_t v2079 = vsubq_s16(v1868, v1870); 2288 int16x8_t v2080 = vqrdmulhq_n_s16(v2079, 29490); 2289 int16x8_t v2081 = vaddq_s16(v2078, v2080); 2290 int16x8_t v2082 = vsubq_s16(v1874, v1876); 2291 int16x8_t v2083 = vsubq_s16(v1878, v1880); 2292 int16x8_t v2084 = vqrdmulhq_n_s16(v2083, 29490); 2293 int16x8_t v2085 = vaddq_s16(v2082, v2084); 2294 int16x8_t v2086 = vqrdmulhq_n_s16(v2085, 18578); 2295 int16x8_t v2087 = vaddq_s16(v2081, v2086); 2296 int16x8_t v2088 = vsubq_s16(v1886, v1888); 2297 int16x8_t v2089 = vsubq_s16(v1890, v1892); 2298 int16x8_t v2090 = vqrdmulhq_n_s16(v2089, 29490); 2299 int16x8_t v2091 = vaddq_s16(v2088, v2090); 2300 int16x8_t v2092 = vsubq_s16(v1896, v1898); 2301 int16x8_t v2093 = vsubq_s16(v1900, v1902); 2302 int16x8_t v2094 = vqrdmulhq_n_s16(v2093, 29490); 2303 int16x8_t v2095 = vaddq_s16(v2092, v2094); 2304 int16x8_t v2096 = vqrdmulhq_n_s16(v2095, 18578); 2305 int16x8_t v2097 = vaddq_s16(v2091, v2096); 2306 int16x8_t v2098 = vqrdmulhq_n_s16(v2097, 16890); 2307 int16x8_t v2099 = vaddq_s16(v2087, v2098); 2308 int16x8_t v2100 = vqrdmulhq_n_s16(v2099, 16508); 2309 int16x8_t v2101 = vaddq_s16(v2077, v2100); 2310 int16x8_t v2102 = vqrdmulhq_n_s16(v2101, 16415); 2311 int16x8_t v2103 = vaddq_s16(v2055, v2102); 2312 int16x8_t v2104 = vqrdmulhq_n_s16(v2103, 16392); 2313 int16x8_t v2105 = vaddq_s16(v2009, v2104); 2314 int16x8_t v2106 = vsubq_s16(v2, v8); 2315 int16x8_t v2107 = vsubq_s16(v15, v22); 2316 int16x8_t v2108_tmp = vqrdmulhq_n_s16(v2107, 18446); 2317 int16x8_t v2108 = vmlaq_n_s16(v2108_tmp, v2107, 2); 2318 int16x8_t v2109 = vaddq_s16(v2106, v2108); 2319 int16x8_t v2110 = vsubq_s16(v31, v41); 2320 int16x8_t v2111 = vsubq_s16(v48, v56); 2321 int16x8_t v2112_tmp = vqrdmulhq_n_s16(v2111, 18446); 2322 int16x8_t v2112 = vmlaq_n_s16(v2112_tmp, v2111, 2); 2323 int16x8_t v2113 = vaddq_s16(v2110, v2112); 2324 int16x8_t v2114 = vqrdmulhq_n_s16(v2113, 21195); 2325 int16x8_t v2115 = vaddq_s16(v2109, v2114); 2326 int16x8_t v2116 = vsubq_s16(v67, v77); 2327 int16x8_t v2117 = vsubq_s16(v90, v99); 2328 int16x8_t v2118_tmp = vqrdmulhq_n_s16(v2117, 18446); 2329 int16x8_t v2118 = vmlaq_n_s16(v2118_tmp, v2117, 2); 2330 int16x8_t v2119 = vaddq_s16(v2116, v2118); 2331 int16x8_t v2120 = vsubq_s16(v108, v118); 2332 int16x8_t v2121 = vsubq_s16(v125, v134); 2333 int16x8_t v2122_tmp = vqrdmulhq_n_s16(v2121, 18446); 2334 int16x8_t v2122 = vmlaq_n_s16(v2122_tmp, v2121, 2); 2335 int16x8_t v2123 = vaddq_s16(v2120, v2122); 2336 int16x8_t v2124 = vqrdmulhq_n_s16(v2123, 21195); 2337 int16x8_t v2125 = vaddq_s16(v2119, v2124); 2338 int16x8_t v2126 = vqrdmulhq_n_s16(v2125, 17401); 2339 int16x8_t v2127 = vaddq_s16(v2115, v2126); 2340 int16x8_t v2128 = vsubq_s16(v147, v157); 2341 int16x8_t v2129 = vsubq_s16(v170, v179); 2342 int16x8_t v2130_tmp = vqrdmulhq_n_s16(v2129, 18446); 2343 int16x8_t v2130 = vmlaq_n_s16(v2130_tmp, v2129, 2); 2344 int16x8_t v2131 = vaddq_s16(v2128, v2130); 2345 int16x8_t v2132 = vsubq_s16(v194, v212); 2346 int16x8_t v2133 = vsubq_s16(v219, v229); 2347 int16x8_t v2134_tmp = vqrdmulhq_n_s16(v2133, 18446); 2348 int16x8_t v2134 = vmlaq_n_s16(v2134_tmp, v2133, 2); 2349 int16x8_t v2135 = vaddq_s16(v2132, v2134); 2350 int16x8_t v2136 = vqrdmulhq_n_s16(v2135, 21195); 2351 int16x8_t v2137 = vaddq_s16(v2131, v2136); 2352 int16x8_t v2138 = vsubq_s16(v240, v250); 2353 int16x8_t v2139 = vsubq_s16(v263, v272); 2354 int16x8_t v2140_tmp = vqrdmulhq_n_s16(v2139, 18446); 2355 int16x8_t v2140 = vmlaq_n_s16(v2140_tmp, v2139, 2); 2356 int16x8_t v2141 = vaddq_s16(v2138, v2140); 2357 int16x8_t v2142 = vsubq_s16(v281, v291); 2358 int16x8_t v2143 = vsubq_s16(v298, v308); 2359 int16x8_t v2144_tmp = vqrdmulhq_n_s16(v2143, 18446); 2360 int16x8_t v2144 = vmlaq_n_s16(v2144_tmp, v2143, 2); 2361 int16x8_t v2145 = vaddq_s16(v2142, v2144); 2362 int16x8_t v2146 = vqrdmulhq_n_s16(v2145, 21195); 2363 int16x8_t v2147 = vaddq_s16(v2141, v2146); 2364 int16x8_t v2148 = vqrdmulhq_n_s16(v2147, 17401); 2365 int16x8_t v2149 = vaddq_s16(v2137, v2148); 2366 int16x8_t v2150 = vqrdmulhq_n_s16(v2149, 16629); 2367 int16x8_t v2151 = vaddq_s16(v2127, v2150); 2368 int16x8_t v2152 = vsubq_s16(v323, v333); 2369 int16x8_t v2153 = vsubq_s16(v346, v355); 2370 int16x8_t v2154_tmp = vqrdmulhq_n_s16(v2153, 18446); 2371 int16x8_t v2154 = vmlaq_n_s16(v2154_tmp, v2153, 2); 2372 int16x8_t v2155 = vaddq_s16(v2152, v2154); 2373 int16x8_t v2156 = vsubq_s16(v370, v388); 2374 int16x8_t v2157 = vsubq_s16(v395, v405); 2375 int16x8_t v2158_tmp = vqrdmulhq_n_s16(v2157, 18446); 2376 int16x8_t v2158 = vmlaq_n_s16(v2158_tmp, v2157, 2); 2377 int16x8_t v2159 = vaddq_s16(v2156, v2158); 2378 int16x8_t v2160 = vqrdmulhq_n_s16(v2159, 21195); 2379 int16x8_t v2161 = vaddq_s16(v2155, v2160); 2380 int16x8_t v2162 = vsubq_s16(v422, v440); 2381 int16x8_t v2163 = vsubq_s16(v465, v478); 2382 int16x8_t v2164_tmp = vqrdmulhq_n_s16(v2163, 18446); 2383 int16x8_t v2164 = vmlaq_n_s16(v2164_tmp, v2163, 2); 2384 int16x8_t v2165 = vaddq_s16(v2162, v2164); 2385 int16x8_t v2166 = vsubq_s16(v487, v497); 2386 int16x8_t v2167 = vsubq_s16(v504, v515); 2387 int16x8_t v2168_tmp = vqrdmulhq_n_s16(v2167, 18446); 2388 int16x8_t v2168 = vmlaq_n_s16(v2168_tmp, v2167, 2); 2389 int16x8_t v2169 = vaddq_s16(v2166, v2168); 2390 int16x8_t v2170 = vqrdmulhq_n_s16(v2169, 21195); 2391 int16x8_t v2171 = vaddq_s16(v2165, v2170); 2392 int16x8_t v2172 = vqrdmulhq_n_s16(v2171, 17401); 2393 int16x8_t v2173 = vaddq_s16(v2161, v2172); 2394 int16x8_t v2174 = vsubq_s16(v528, v538); 2395 int16x8_t v2175 = vsubq_s16(v551, v560); 2396 int16x8_t v2176_tmp = vqrdmulhq_n_s16(v2175, 18446); 2397 int16x8_t v2176 = vmlaq_n_s16(v2176_tmp, v2175, 2); 2398 int16x8_t v2177 = vaddq_s16(v2174, v2176); 2399 int16x8_t v2178 = vsubq_s16(v575, v593); 2400 int16x8_t v2179 = vsubq_s16(v600, v610); 2401 int16x8_t v2180_tmp = vqrdmulhq_n_s16(v2179, 18446); 2402 int16x8_t v2180 = vmlaq_n_s16(v2180_tmp, v2179, 2); 2403 int16x8_t v2181 = vaddq_s16(v2178, v2180); 2404 int16x8_t v2182 = vqrdmulhq_n_s16(v2181, 21195); 2405 int16x8_t v2183 = vaddq_s16(v2177, v2182); 2406 int16x8_t v2184 = vsubq_s16(v621, v631); 2407 int16x8_t v2185 = vsubq_s16(v644, v653); 2408 int16x8_t v2186_tmp = vqrdmulhq_n_s16(v2185, 18446); 2409 int16x8_t v2186 = vmlaq_n_s16(v2186_tmp, v2185, 2); 2410 int16x8_t v2187 = vaddq_s16(v2184, v2186); 2411 int16x8_t v2188 = vsubq_s16(v662, v672); 2412 int16x8_t v2189 = vsubq_s16(v679, v690); 2413 int16x8_t v2190_tmp = vqrdmulhq_n_s16(v2189, 18446); 2414 int16x8_t v2190 = vmlaq_n_s16(v2190_tmp, v2189, 2); 2415 int16x8_t v2191 = vaddq_s16(v2188, v2190); 2416 int16x8_t v2192 = vqrdmulhq_n_s16(v2191, 21195); 2417 int16x8_t v2193 = vaddq_s16(v2187, v2192); 2418 int16x8_t v2194 = vqrdmulhq_n_s16(v2193, 17401); 2419 int16x8_t v2195 = vaddq_s16(v2183, v2194); 2420 int16x8_t v2196 = vqrdmulhq_n_s16(v2195, 16629); 2421 int16x8_t v2197 = vaddq_s16(v2173, v2196); 2422 int16x8_t v2198 = vqrdmulhq_n_s16(v2197, 16445); 2423 int16x8_t v2199 = vaddq_s16(v2151, v2198); 2424 int16x8_t v2200 = vsubq_s16(v707, v717); 2425 int16x8_t v2201 = vsubq_s16(v730, v739); 2426 int16x8_t v2202_tmp = vqrdmulhq_n_s16(v2201, 18446); 2427 int16x8_t v2202 = vmlaq_n_s16(v2202_tmp, v2201, 2); 2428 int16x8_t v2203 = vaddq_s16(v2200, v2202); 2429 int16x8_t v2204 = vsubq_s16(v754, v772); 2430 int16x8_t v2205 = vsubq_s16(v779, v789); 2431 int16x8_t v2206_tmp = vqrdmulhq_n_s16(v2205, 18446); 2432 int16x8_t v2206 = vmlaq_n_s16(v2206_tmp, v2205, 2); 2433 int16x8_t v2207 = vaddq_s16(v2204, v2206); 2434 int16x8_t v2208 = vqrdmulhq_n_s16(v2207, 21195); 2435 int16x8_t v2209 = vaddq_s16(v2203, v2208); 2436 int16x8_t v2210 = vsubq_s16(v806, v824); 2437 int16x8_t v2211 = vsubq_s16(v849, v862); 2438 int16x8_t v2212_tmp = vqrdmulhq_n_s16(v2211, 18446); 2439 int16x8_t v2212 = vmlaq_n_s16(v2212_tmp, v2211, 2); 2440 int16x8_t v2213 = vaddq_s16(v2210, v2212); 2441 int16x8_t v2214 = vsubq_s16(v871, v881); 2442 int16x8_t v2215 = vsubq_s16(v888, v899); 2443 int16x8_t v2216_tmp = vqrdmulhq_n_s16(v2215, 18446); 2444 int16x8_t v2216 = vmlaq_n_s16(v2216_tmp, v2215, 2); 2445 int16x8_t v2217 = vaddq_s16(v2214, v2216); 2446 int16x8_t v2218 = vqrdmulhq_n_s16(v2217, 21195); 2447 int16x8_t v2219 = vaddq_s16(v2213, v2218); 2448 int16x8_t v2220 = vqrdmulhq_n_s16(v2219, 17401); 2449 int16x8_t v2221 = vaddq_s16(v2209, v2220); 2450 int16x8_t v2222 = vsubq_s16(v918, v936); 2451 int16x8_t v2223 = vsubq_s16(v961, v974); 2452 int16x8_t v2224_tmp = vqrdmulhq_n_s16(v2223, 18446); 2453 int16x8_t v2224 = vmlaq_n_s16(v2224_tmp, v2223, 2); 2454 int16x8_t v2225 = vaddq_s16(v2222, v2224); 2455 int16x8_t v2226 = vsubq_s16(v1001, v1035); 2456 int16x8_t v2227 = vsubq_s16(v1042, v1056); 2457 int16x8_t v2228_tmp = vqrdmulhq_n_s16(v2227, 18446); 2458 int16x8_t v2228 = vmlaq_n_s16(v2228_tmp, v2227, 2); 2459 int16x8_t v2229 = vaddq_s16(v2226, v2228); 2460 int16x8_t v2230 = vqrdmulhq_n_s16(v2229, 21195); 2461 int16x8_t v2231 = vaddq_s16(v2225, v2230); 2462 int16x8_t v2232 = vsubq_s16(v1067, v1077); 2463 int16x8_t v2233 = vsubq_s16(v1090, v1099); 2464 int16x8_t v2234_tmp = vqrdmulhq_n_s16(v2233, 18446); 2465 int16x8_t v2234 = vmlaq_n_s16(v2234_tmp, v2233, 2); 2466 int16x8_t v2235 = vaddq_s16(v2232, v2234); 2467 int16x8_t v2236 = vsubq_s16(v1108, v1118); 2468 int16x8_t v2237 = vsubq_s16(v1125, v1137); 2469 int16x8_t v2238_tmp = vqrdmulhq_n_s16(v2237, 18446); 2470 int16x8_t v2238 = vmlaq_n_s16(v2238_tmp, v2237, 2); 2471 int16x8_t v2239 = vaddq_s16(v2236, v2238); 2472 int16x8_t v2240 = vqrdmulhq_n_s16(v2239, 21195); 2473 int16x8_t v2241 = vaddq_s16(v2235, v2240); 2474 int16x8_t v2242 = vqrdmulhq_n_s16(v2241, 17401); 2475 int16x8_t v2243 = vaddq_s16(v2231, v2242); 2476 int16x8_t v2244 = vqrdmulhq_n_s16(v2243, 16629); 2477 int16x8_t v2245 = vaddq_s16(v2221, v2244); 2478 int16x8_t v2246 = vsubq_s16(v1152, v1162); 2479 int16x8_t v2247 = vsubq_s16(v1175, v1184); 2480 int16x8_t v2248_tmp = vqrdmulhq_n_s16(v2247, 18446); 2481 int16x8_t v2248 = vmlaq_n_s16(v2248_tmp, v2247, 2); 2482 int16x8_t v2249 = vaddq_s16(v2246, v2248); 2483 int16x8_t v2250 = vsubq_s16(v1199, v1217); 2484 int16x8_t v2251 = vsubq_s16(v1224, v1234); 2485 int16x8_t v2252_tmp = vqrdmulhq_n_s16(v2251, 18446); 2486 int16x8_t v2252 = vmlaq_n_s16(v2252_tmp, v2251, 2); 2487 int16x8_t v2253 = vaddq_s16(v2250, v2252); 2488 int16x8_t v2254 = vqrdmulhq_n_s16(v2253, 21195); 2489 int16x8_t v2255 = vaddq_s16(v2249, v2254); 2490 int16x8_t v2256 = vsubq_s16(v1251, v1269); 2491 int16x8_t v2257 = vsubq_s16(v1294, v1307); 2492 int16x8_t v2258_tmp = vqrdmulhq_n_s16(v2257, 18446); 2493 int16x8_t v2258 = vmlaq_n_s16(v2258_tmp, v2257, 2); 2494 int16x8_t v2259 = vaddq_s16(v2256, v2258); 2495 int16x8_t v2260 = vsubq_s16(v1316, v1326); 2496 int16x8_t v2261 = vsubq_s16(v1333, v1344); 2497 int16x8_t v2262_tmp = vqrdmulhq_n_s16(v2261, 18446); 2498 int16x8_t v2262 = vmlaq_n_s16(v2262_tmp, v2261, 2); 2499 int16x8_t v2263 = vaddq_s16(v2260, v2262); 2500 int16x8_t v2264 = vqrdmulhq_n_s16(v2263, 21195); 2501 int16x8_t v2265 = vaddq_s16(v2259, v2264); 2502 int16x8_t v2266 = vqrdmulhq_n_s16(v2265, 17401); 2503 int16x8_t v2267 = vaddq_s16(v2255, v2266); 2504 int16x8_t v2268 = vsubq_s16(v1357, v1367); 2505 int16x8_t v2269 = vsubq_s16(v1380, v1389); 2506 int16x8_t v2270_tmp = vqrdmulhq_n_s16(v2269, 18446); 2507 int16x8_t v2270 = vmlaq_n_s16(v2270_tmp, v2269, 2); 2508 int16x8_t v2271 = vaddq_s16(v2268, v2270); 2509 int16x8_t v2272 = vsubq_s16(v1404, v1422); 2510 int16x8_t v2273 = vsubq_s16(v1429, v1439); 2511 int16x8_t v2274_tmp = vqrdmulhq_n_s16(v2273, 18446); 2512 int16x8_t v2274 = vmlaq_n_s16(v2274_tmp, v2273, 2); 2513 int16x8_t v2275 = vaddq_s16(v2272, v2274); 2514 int16x8_t v2276 = vqrdmulhq_n_s16(v2275, 21195); 2515 int16x8_t v2277 = vaddq_s16(v2271, v2276); 2516 int16x8_t v2278 = vsubq_s16(v1450, v1460); 2517 int16x8_t v2279 = vsubq_s16(v1473, v1482); 2518 int16x8_t v2280_tmp = vqrdmulhq_n_s16(v2279, 18446); 2519 int16x8_t v2280 = vmlaq_n_s16(v2280_tmp, v2279, 2); 2520 int16x8_t v2281 = vaddq_s16(v2278, v2280); 2521 int16x8_t v2282 = vsubq_s16(v1491, v1501); 2522 int16x8_t v2283 = vsubq_s16(v1508, v1520); 2523 int16x8_t v2284_tmp = vqrdmulhq_n_s16(v2283, 18446); 2524 int16x8_t v2284 = vmlaq_n_s16(v2284_tmp, v2283, 2); 2525 int16x8_t v2285 = vaddq_s16(v2282, v2284); 2526 int16x8_t v2286 = vqrdmulhq_n_s16(v2285, 21195); 2527 int16x8_t v2287 = vaddq_s16(v2281, v2286); 2528 int16x8_t v2288 = vqrdmulhq_n_s16(v2287, 17401); 2529 int16x8_t v2289 = vaddq_s16(v2277, v2288); 2530 int16x8_t v2290 = vqrdmulhq_n_s16(v2289, 16629); 2531 int16x8_t v2291 = vaddq_s16(v2267, v2290); 2532 int16x8_t v2292 = vqrdmulhq_n_s16(v2291, 16445); 2533 int16x8_t v2293 = vaddq_s16(v2245, v2292); 2534 int16x8_t v2294 = vqrdmulhq_n_s16(v2293, 16399); 2535 int16x8_t v2295 = vaddq_s16(v2199, v2294); 2536 int16x8_t v2296 = vsubq_s16(v2106, v2108); 2537 int16x8_t v2297 = vsubq_s16(v2110, v2112); 2538 int16x8_t v2298 = vqrdmulhq_n_s16(v2297, 25826); 2539 int16x8_t v2299 = vaddq_s16(v2296, v2298); 2540 int16x8_t v2300 = vsubq_s16(v2116, v2118); 2541 int16x8_t v2301 = vsubq_s16(v2120, v2122); 2542 int16x8_t v2302 = vqrdmulhq_n_s16(v2301, 25826); 2543 int16x8_t v2303 = vaddq_s16(v2300, v2302); 2544 int16x8_t v2304 = vqrdmulhq_n_s16(v2303, 18124); 2545 int16x8_t v2305 = vaddq_s16(v2299, v2304); 2546 int16x8_t v2306 = vsubq_s16(v2128, v2130); 2547 int16x8_t v2307 = vsubq_s16(v2132, v2134); 2548 int16x8_t v2308 = vqrdmulhq_n_s16(v2307, 25826); 2549 int16x8_t v2309 = vaddq_s16(v2306, v2308); 2550 int16x8_t v2310 = vsubq_s16(v2138, v2140); 2551 int16x8_t v2311 = vsubq_s16(v2142, v2144); 2552 int16x8_t v2312 = vqrdmulhq_n_s16(v2311, 25826); 2553 int16x8_t v2313 = vaddq_s16(v2310, v2312); 2554 int16x8_t v2314 = vqrdmulhq_n_s16(v2313, 18124); 2555 int16x8_t v2315 = vaddq_s16(v2309, v2314); 2556 int16x8_t v2316 = vqrdmulhq_n_s16(v2315, 16792); 2557 int16x8_t v2317 = vaddq_s16(v2305, v2316); 2558 int16x8_t v2318 = vsubq_s16(v2152, v2154); 2559 int16x8_t v2319 = vsubq_s16(v2156, v2158); 2560 int16x8_t v2320 = vqrdmulhq_n_s16(v2319, 25826); 2561 int16x8_t v2321 = vaddq_s16(v2318, v2320); 2562 int16x8_t v2322 = vsubq_s16(v2162, v2164); 2563 int16x8_t v2323 = vsubq_s16(v2166, v2168); 2564 int16x8_t v2324 = vqrdmulhq_n_s16(v2323, 25826); 2565 int16x8_t v2325 = vaddq_s16(v2322, v2324); 2566 int16x8_t v2326 = vqrdmulhq_n_s16(v2325, 18124); 2567 int16x8_t v2327 = vaddq_s16(v2321, v2326); 2568 int16x8_t v2328 = vsubq_s16(v2174, v2176); 2569 int16x8_t v2329 = vsubq_s16(v2178, v2180); 2570 int16x8_t v2330 = vqrdmulhq_n_s16(v2329, 25826); 2571 int16x8_t v2331 = vaddq_s16(v2328, v2330); 2572 int16x8_t v2332 = vsubq_s16(v2184, v2186); 2573 int16x8_t v2333 = vsubq_s16(v2188, v2190); 2574 int16x8_t v2334 = vqrdmulhq_n_s16(v2333, 25826); 2575 int16x8_t v2335 = vaddq_s16(v2332, v2334); 2576 int16x8_t v2336 = vqrdmulhq_n_s16(v2335, 18124); 2577 int16x8_t v2337 = vaddq_s16(v2331, v2336); 2578 int16x8_t v2338 = vqrdmulhq_n_s16(v2337, 16792); 2579 int16x8_t v2339 = vaddq_s16(v2327, v2338); 2580 int16x8_t v2340 = vqrdmulhq_n_s16(v2339, 16484); 2581 int16x8_t v2341 = vaddq_s16(v2317, v2340); 2582 int16x8_t v2342 = vsubq_s16(v2200, v2202); 2583 int16x8_t v2343 = vsubq_s16(v2204, v2206); 2584 int16x8_t v2344 = vqrdmulhq_n_s16(v2343, 25826); 2585 int16x8_t v2345 = vaddq_s16(v2342, v2344); 2586 int16x8_t v2346 = vsubq_s16(v2210, v2212); 2587 int16x8_t v2347 = vsubq_s16(v2214, v2216); 2588 int16x8_t v2348 = vqrdmulhq_n_s16(v2347, 25826); 2589 int16x8_t v2349 = vaddq_s16(v2346, v2348); 2590 int16x8_t v2350 = vqrdmulhq_n_s16(v2349, 18124); 2591 int16x8_t v2351 = vaddq_s16(v2345, v2350); 2592 int16x8_t v2352 = vsubq_s16(v2222, v2224); 2593 int16x8_t v2353 = vsubq_s16(v2226, v2228); 2594 int16x8_t v2354 = vqrdmulhq_n_s16(v2353, 25826); 2595 int16x8_t v2355 = vaddq_s16(v2352, v2354); 2596 int16x8_t v2356 = vsubq_s16(v2232, v2234); 2597 int16x8_t v2357 = vsubq_s16(v2236, v2238); 2598 int16x8_t v2358 = vqrdmulhq_n_s16(v2357, 25826); 2599 int16x8_t v2359 = vaddq_s16(v2356, v2358); 2600 int16x8_t v2360 = vqrdmulhq_n_s16(v2359, 18124); 2601 int16x8_t v2361 = vaddq_s16(v2355, v2360); 2602 int16x8_t v2362 = vqrdmulhq_n_s16(v2361, 16792); 2603 int16x8_t v2363 = vaddq_s16(v2351, v2362); 2604 int16x8_t v2364 = vsubq_s16(v2246, v2248); 2605 int16x8_t v2365 = vsubq_s16(v2250, v2252); 2606 int16x8_t v2366 = vqrdmulhq_n_s16(v2365, 25826); 2607 int16x8_t v2367 = vaddq_s16(v2364, v2366); 2608 int16x8_t v2368 = vsubq_s16(v2256, v2258); 2609 int16x8_t v2369 = vsubq_s16(v2260, v2262); 2610 int16x8_t v2370 = vqrdmulhq_n_s16(v2369, 25826); 2611 int16x8_t v2371 = vaddq_s16(v2368, v2370); 2612 int16x8_t v2372 = vqrdmulhq_n_s16(v2371, 18124); 2613 int16x8_t v2373 = vaddq_s16(v2367, v2372); 2614 int16x8_t v2374 = vsubq_s16(v2268, v2270); 2615 int16x8_t v2375 = vsubq_s16(v2272, v2274); 2616 int16x8_t v2376 = vqrdmulhq_n_s16(v2375, 25826); 2617 int16x8_t v2377 = vaddq_s16(v2374, v2376); 2618 int16x8_t v2378 = vsubq_s16(v2278, v2280); 2619 int16x8_t v2379 = vsubq_s16(v2282, v2284); 2620 int16x8_t v2380 = vqrdmulhq_n_s16(v2379, 25826); 2621 int16x8_t v2381 = vaddq_s16(v2378, v2380); 2622 int16x8_t v2382 = vqrdmulhq_n_s16(v2381, 18124); 2623 int16x8_t v2383 = vaddq_s16(v2377, v2382); 2624 int16x8_t v2384 = vqrdmulhq_n_s16(v2383, 16792); 2625 int16x8_t v2385 = vaddq_s16(v2373, v2384); 2626 int16x8_t v2386 = vqrdmulhq_n_s16(v2385, 16484); 2627 int16x8_t v2387 = vaddq_s16(v2363, v2386); 2628 int16x8_t v2388 = vqrdmulhq_n_s16(v2387, 16409); 2629 int16x8_t v2389 = vaddq_s16(v2341, v2388); 2630 int16x8_t v2390 = vsubq_s16(v1916, v1918); 2631 int16x8_t v2391 = vsubq_s16(v1920, v1922); 2632 int16x8_t v2392_tmp = vqrdmulhq_n_s16(v2391, 1988); 2633 int16x8_t v2392 = vaddq_s16(v2392_tmp, v2391); 2634 int16x8_t v2393 = vaddq_s16(v2390, v2392); 2635 int16x8_t v2394 = vsubq_s16(v1926, v1928); 2636 int16x8_t v2395 = vsubq_s16(v1930, v1932); 2637 int16x8_t v2396_tmp = vqrdmulhq_n_s16(v2395, 1988); 2638 int16x8_t v2396 = vaddq_s16(v2396_tmp, v2395); 2639 int16x8_t v2397 = vaddq_s16(v2394, v2396); 2640 int16x8_t v2398 = vqrdmulhq_n_s16(v2397, 19102); 2641 int16x8_t v2399 = vaddq_s16(v2393, v2398); 2642 int16x8_t v2400 = vsubq_s16(v1938, v1940); 2643 int16x8_t v2401 = vsubq_s16(v1942, v1944); 2644 int16x8_t v2402_tmp = vqrdmulhq_n_s16(v2401, 1988); 2645 int16x8_t v2402 = vaddq_s16(v2402_tmp, v2401); 2646 int16x8_t v2403 = vaddq_s16(v2400, v2402); 2647 int16x8_t v2404 = vsubq_s16(v1948, v1950); 2648 int16x8_t v2405 = vsubq_s16(v1952, v1954); 2649 int16x8_t v2406_tmp = vqrdmulhq_n_s16(v2405, 1988); 2650 int16x8_t v2406 = vaddq_s16(v2406_tmp, v2405); 2651 int16x8_t v2407 = vaddq_s16(v2404, v2406); 2652 int16x8_t v2408 = vqrdmulhq_n_s16(v2407, 19102); 2653 int16x8_t v2409 = vaddq_s16(v2403, v2408); 2654 int16x8_t v2410 = vqrdmulhq_n_s16(v2409, 17000); 2655 int16x8_t v2411 = vaddq_s16(v2399, v2410); 2656 int16x8_t v2412 = vsubq_s16(v1962, v1964); 2657 int16x8_t v2413 = vsubq_s16(v1966, v1968); 2658 int16x8_t v2414_tmp = vqrdmulhq_n_s16(v2413, 1988); 2659 int16x8_t v2414 = vaddq_s16(v2414_tmp, v2413); 2660 int16x8_t v2415 = vaddq_s16(v2412, v2414); 2661 int16x8_t v2416 = vsubq_s16(v1972, v1974); 2662 int16x8_t v2417 = vsubq_s16(v1976, v1978); 2663 int16x8_t v2418_tmp = vqrdmulhq_n_s16(v2417, 1988); 2664 int16x8_t v2418 = vaddq_s16(v2418_tmp, v2417); 2665 int16x8_t v2419 = vaddq_s16(v2416, v2418); 2666 int16x8_t v2420 = vqrdmulhq_n_s16(v2419, 19102); 2667 int16x8_t v2421 = vaddq_s16(v2415, v2420); 2668 int16x8_t v2422 = vsubq_s16(v1984, v1986); 2669 int16x8_t v2423 = vsubq_s16(v1988, v1990); 2670 int16x8_t v2424_tmp = vqrdmulhq_n_s16(v2423, 1988); 2671 int16x8_t v2424 = vaddq_s16(v2424_tmp, v2423); 2672 int16x8_t v2425 = vaddq_s16(v2422, v2424); 2673 int16x8_t v2426 = vsubq_s16(v1994, v1996); 2674 int16x8_t v2427 = vsubq_s16(v1998, v2000); 2675 int16x8_t v2428_tmp = vqrdmulhq_n_s16(v2427, 1988); 2676 int16x8_t v2428 = vaddq_s16(v2428_tmp, v2427); 2677 int16x8_t v2429 = vaddq_s16(v2426, v2428); 2678 int16x8_t v2430 = vqrdmulhq_n_s16(v2429, 19102); 2679 int16x8_t v2431 = vaddq_s16(v2425, v2430); 2680 int16x8_t v2432 = vqrdmulhq_n_s16(v2431, 17000); 2681 int16x8_t v2433 = vaddq_s16(v2421, v2432); 2682 int16x8_t v2434 = vqrdmulhq_n_s16(v2433, 16534); 2683 int16x8_t v2435 = vaddq_s16(v2411, v2434); 2684 int16x8_t v2436 = vsubq_s16(v2010, v2012); 2685 int16x8_t v2437 = vsubq_s16(v2014, v2016); 2686 int16x8_t v2438_tmp = vqrdmulhq_n_s16(v2437, 1988); 2687 int16x8_t v2438 = vaddq_s16(v2438_tmp, v2437); 2688 int16x8_t v2439 = vaddq_s16(v2436, v2438); 2689 int16x8_t v2440 = vsubq_s16(v2020, v2022); 2690 int16x8_t v2441 = vsubq_s16(v2024, v2026); 2691 int16x8_t v2442_tmp = vqrdmulhq_n_s16(v2441, 1988); 2692 int16x8_t v2442 = vaddq_s16(v2442_tmp, v2441); 2693 int16x8_t v2443 = vaddq_s16(v2440, v2442); 2694 int16x8_t v2444 = vqrdmulhq_n_s16(v2443, 19102); 2695 int16x8_t v2445 = vaddq_s16(v2439, v2444); 2696 int16x8_t v2446 = vsubq_s16(v2032, v2034); 2697 int16x8_t v2447 = vsubq_s16(v2036, v2038); 2698 int16x8_t v2448_tmp = vqrdmulhq_n_s16(v2447, 1988); 2699 int16x8_t v2448 = vaddq_s16(v2448_tmp, v2447); 2700 int16x8_t v2449 = vaddq_s16(v2446, v2448); 2701 int16x8_t v2450 = vsubq_s16(v2042, v2044); 2702 int16x8_t v2451 = vsubq_s16(v2046, v2048); 2703 int16x8_t v2452_tmp = vqrdmulhq_n_s16(v2451, 1988); 2704 int16x8_t v2452 = vaddq_s16(v2452_tmp, v2451); 2705 int16x8_t v2453 = vaddq_s16(v2450, v2452); 2706 int16x8_t v2454 = vqrdmulhq_n_s16(v2453, 19102); 2707 int16x8_t v2455 = vaddq_s16(v2449, v2454); 2708 int16x8_t v2456 = vqrdmulhq_n_s16(v2455, 17000); 2709 int16x8_t v2457 = vaddq_s16(v2445, v2456); 2710 int16x8_t v2458 = vsubq_s16(v2056, v2058); 2711 int16x8_t v2459 = vsubq_s16(v2060, v2062); 2712 int16x8_t v2460_tmp = vqrdmulhq_n_s16(v2459, 1988); 2713 int16x8_t v2460 = vaddq_s16(v2460_tmp, v2459); 2714 int16x8_t v2461 = vaddq_s16(v2458, v2460); 2715 int16x8_t v2462 = vsubq_s16(v2066, v2068); 2716 int16x8_t v2463 = vqrdmulhq_n_s16(v2072, 29490); 2717 int16x8_t v2464 = vsubq_s16(v2070, v2463); 2718 int16x8_t v2465_tmp = vqrdmulhq_n_s16(v2464, 1988); 2719 int16x8_t v2465 = vaddq_s16(v2465_tmp, v2464); 2720 int16x8_t v2466 = vaddq_s16(v2462, v2465); 2721 int16x8_t v2467 = vqrdmulhq_n_s16(v2466, 19102); 2722 int16x8_t v2468 = vaddq_s16(v2461, v2467); 2723 int16x8_t v2469 = vsubq_s16(v2078, v2080); 2724 int16x8_t v2470 = vsubq_s16(v2082, v2084); 2725 int16x8_t v2471_tmp = vqrdmulhq_n_s16(v2470, 1988); 2726 int16x8_t v2471 = vaddq_s16(v2471_tmp, v2470); 2727 int16x8_t v2472 = vaddq_s16(v2469, v2471); 2728 int16x8_t v2473 = vsubq_s16(v2088, v2090); 2729 int16x8_t v2474 = vsubq_s16(v2092, v2094); 2730 int16x8_t v2475_tmp = vqrdmulhq_n_s16(v2474, 1988); 2731 int16x8_t v2475 = vaddq_s16(v2475_tmp, v2474); 2732 int16x8_t v2476 = vaddq_s16(v2473, v2475); 2733 int16x8_t v2477 = vqrdmulhq_n_s16(v2476, 19102); 2734 int16x8_t v2478 = vaddq_s16(v2472, v2477); 2735 int16x8_t v2479 = vqrdmulhq_n_s16(v2478, 17000); 2736 int16x8_t v2480 = vaddq_s16(v2468, v2479); 2737 int16x8_t v2481 = vqrdmulhq_n_s16(v2480, 16534); 2738 int16x8_t v2482 = vaddq_s16(v2457, v2481); 2739 int16x8_t v2483 = vqrdmulhq_n_s16(v2482, 16421); 2740 int16x8_t v2484 = vaddq_s16(v2435, v2483); 2741 int16x8_t v2485 = vsubq_s16(v1537, v1542); 2742 int16x8_t v2486 = vsubq_s16(v1547, v1552); 2743 int16x8_t v2487_tmp = vqrdmulhq_n_s16(v2486, 23673); 2744 int16x8_t v2487 = vaddq_s16(v2487_tmp, v2486); 2745 int16x8_t v2488 = vaddq_s16(v2485, v2487); 2746 int16x8_t v2489 = vsubq_s16(v1559, v1564); 2747 int16x8_t v2490 = vsubq_s16(v1569, v1574); 2748 int16x8_t v2491_tmp = vqrdmulhq_n_s16(v2490, 23673); 2749 int16x8_t v2491 = vaddq_s16(v2491_tmp, v2490); 2750 int16x8_t v2492 = vaddq_s16(v2489, v2491); 2751 int16x8_t v2493 = vqrdmulhq_n_s16(v2492, 20398); 2752 int16x8_t v2494 = vaddq_s16(v2488, v2493); 2753 int16x8_t v2495 = vsubq_s16(v1583, v1588); 2754 int16x8_t v2496 = vsubq_s16(v1593, v1598); 2755 int16x8_t v2497_tmp = vqrdmulhq_n_s16(v2496, 23673); 2756 int16x8_t v2497 = vaddq_s16(v2497_tmp, v2496); 2757 int16x8_t v2498 = vaddq_s16(v2495, v2497); 2758 int16x8_t v2499 = vsubq_s16(v1605, v1610); 2759 int16x8_t v2500 = vsubq_s16(v1615, v1620); 2760 int16x8_t v2501_tmp = vqrdmulhq_n_s16(v2500, 23673); 2761 int16x8_t v2501 = vaddq_s16(v2501_tmp, v2500); 2762 int16x8_t v2502 = vaddq_s16(v2499, v2501); 2763 int16x8_t v2503 = vqrdmulhq_n_s16(v2502, 20398); 2764 int16x8_t v2504 = vaddq_s16(v2498, v2503); 2765 int16x8_t v2505 = vqrdmulhq_n_s16(v2504, 17255); 2766 int16x8_t v2506 = vaddq_s16(v2494, v2505); 2767 int16x8_t v2507 = vsubq_s16(v1631, v1636); 2768 int16x8_t v2508 = vsubq_s16(v1641, v1646); 2769 int16x8_t v2509_tmp = vqrdmulhq_n_s16(v2508, 23673); 2770 int16x8_t v2509 = vaddq_s16(v2509_tmp, v2508); 2771 int16x8_t v2510 = vaddq_s16(v2507, v2509); 2772 int16x8_t v2511 = vsubq_s16(v1653, v1658); 2773 int16x8_t v2512 = vsubq_s16(v1663, v1668); 2774 int16x8_t v2513_tmp = vqrdmulhq_n_s16(v2512, 23673); 2775 int16x8_t v2513 = vaddq_s16(v2513_tmp, v2512); 2776 int16x8_t v2514 = vaddq_s16(v2511, v2513); 2777 int16x8_t v2515 = vqrdmulhq_n_s16(v2514, 20398); 2778 int16x8_t v2516 = vaddq_s16(v2510, v2515); 2779 int16x8_t v2517 = vsubq_s16(v1677, v1682); 2780 int16x8_t v2518 = vsubq_s16(v1687, v1692); 2781 int16x8_t v2519_tmp = vqrdmulhq_n_s16(v2518, 23673); 2782 int16x8_t v2519 = vaddq_s16(v2519_tmp, v2518); 2783 int16x8_t v2520 = vaddq_s16(v2517, v2519); 2784 int16x8_t v2521 = vsubq_s16(v1699, v1704); 2785 int16x8_t v2522 = vsubq_s16(v1709, v1714); 2786 int16x8_t v2523_tmp = vqrdmulhq_n_s16(v2522, 23673); 2787 int16x8_t v2523 = vaddq_s16(v2523_tmp, v2522); 2788 int16x8_t v2524 = vaddq_s16(v2521, v2523); 2789 int16x8_t v2525 = vqrdmulhq_n_s16(v2524, 20398); 2790 int16x8_t v2526 = vaddq_s16(v2520, v2525); 2791 int16x8_t v2527 = vqrdmulhq_n_s16(v2526, 17255); 2792 int16x8_t v2528 = vaddq_s16(v2516, v2527); 2793 int16x8_t v2529 = vqrdmulhq_n_s16(v2528, 16595); 2794 int16x8_t v2530 = vaddq_s16(v2506, v2529); 2795 int16x8_t v2531 = vsubq_s16(v1727, v1732); 2796 int16x8_t v2532 = vsubq_s16(v1737, v1742); 2797 int16x8_t v2533_tmp = vqrdmulhq_n_s16(v2532, 23673); 2798 int16x8_t v2533 = vaddq_s16(v2533_tmp, v2532); 2799 int16x8_t v2534 = vaddq_s16(v2531, v2533); 2800 int16x8_t v2535 = vsubq_s16(v1749, v1754); 2801 int16x8_t v2536 = vsubq_s16(v1759, v1764); 2802 int16x8_t v2537_tmp = vqrdmulhq_n_s16(v2536, 23673); 2803 int16x8_t v2537 = vaddq_s16(v2537_tmp, v2536); 2804 int16x8_t v2538 = vaddq_s16(v2535, v2537); 2805 int16x8_t v2539 = vqrdmulhq_n_s16(v2538, 20398); 2806 int16x8_t v2540 = vaddq_s16(v2534, v2539); 2807 int16x8_t v2541 = vsubq_s16(v1773, v1778); 2808 int16x8_t v2542 = vsubq_s16(v1783, v1788); 2809 int16x8_t v2543_tmp = vqrdmulhq_n_s16(v2542, 23673); 2810 int16x8_t v2543 = vaddq_s16(v2543_tmp, v2542); 2811 int16x8_t v2544 = vaddq_s16(v2541, v2543); 2812 int16x8_t v2545 = vsubq_s16(v1795, v1800); 2813 int16x8_t v2546 = vsubq_s16(v1805, v1810); 2814 int16x8_t v2547_tmp = vqrdmulhq_n_s16(v2546, 23673); 2815 int16x8_t v2547 = vaddq_s16(v2547_tmp, v2546); 2816 int16x8_t v2548 = vaddq_s16(v2545, v2547); 2817 int16x8_t v2549 = vqrdmulhq_n_s16(v2548, 20398); 2818 int16x8_t v2550 = vaddq_s16(v2544, v2549); 2819 int16x8_t v2551 = vqrdmulhq_n_s16(v2550, 17255); 2820 int16x8_t v2552 = vaddq_s16(v2540, v2551); 2821 int16x8_t v2553 = vsubq_s16(v1821, v1826); 2822 int16x8_t v2554 = vsubq_s16(v1831, v1836); 2823 int16x8_t v2555_tmp = vqrdmulhq_n_s16(v2554, 23673); 2824 int16x8_t v2555 = vaddq_s16(v2555_tmp, v2554); 2825 int16x8_t v2556 = vaddq_s16(v2553, v2555); 2826 int16x8_t v2557 = vsubq_s16(v1843, v1848); 2827 int16x8_t v2558 = vsubq_s16(v1853, v1858); 2828 int16x8_t v2559_tmp = vqrdmulhq_n_s16(v2558, 23673); 2829 int16x8_t v2559 = vaddq_s16(v2559_tmp, v2558); 2830 int16x8_t v2560 = vaddq_s16(v2557, v2559); 2831 int16x8_t v2561 = vqrdmulhq_n_s16(v2560, 20398); 2832 int16x8_t v2562 = vaddq_s16(v2556, v2561); 2833 int16x8_t v2563 = vsubq_s16(v1867, v1872); 2834 int16x8_t v2564 = vsubq_s16(v1877, v1882); 2835 int16x8_t v2565_tmp = vqrdmulhq_n_s16(v2564, 23673); 2836 int16x8_t v2565 = vaddq_s16(v2565_tmp, v2564); 2837 int16x8_t v2566 = vaddq_s16(v2563, v2565); 2838 int16x8_t v2567 = vsubq_s16(v1889, v1894); 2839 int16x8_t v2568 = vsubq_s16(v1899, v1904); 2840 int16x8_t v2569_tmp = vqrdmulhq_n_s16(v2568, 23673); 2841 int16x8_t v2569 = vaddq_s16(v2569_tmp, v2568); 2842 int16x8_t v2570 = vaddq_s16(v2567, v2569); 2843 int16x8_t v2571 = vqrdmulhq_n_s16(v2570, 20398); 2844 int16x8_t v2572 = vaddq_s16(v2566, v2571); 2845 int16x8_t v2573 = vqrdmulhq_n_s16(v2572, 17255); 2846 int16x8_t v2574 = vaddq_s16(v2562, v2573); 2847 int16x8_t v2575 = vqrdmulhq_n_s16(v2574, 16595); 2848 int16x8_t v2576 = vaddq_s16(v2552, v2575); 2849 int16x8_t v2577 = vqrdmulhq_n_s16(v2576, 16436); 2850 int16x8_t v2578 = vaddq_s16(v2530, v2577); 2851 int16x8_t v2579 = vsubq_s16(v9, v24); 2852 int16x8_t v2580 = vsubq_s16(v42, v58); 2853 int16x8_t v2581_tmp = vqrdmulhq_n_s16(v2580, 3314); 2854 int16x8_t v2581 = vmlaq_n_s16(v2581_tmp, v2580, 5); 2855 int16x8_t v2582 = vaddq_s16(v2579, v2581); 2856 int16x8_t v2583 = vsubq_s16(v78, v101); 2857 int16x8_t v2584 = vsubq_s16(v119, v136); 2858 int16x8_t v2585_tmp = vqrdmulhq_n_s16(v2584, 3314); 2859 int16x8_t v2585 = vmlaq_n_s16(v2585_tmp, v2584, 5); 2860 int16x8_t v2586 = vaddq_s16(v2583, v2585); 2861 int16x8_t v2587 = vqrdmulhq_n_s16(v2586, 22112); 2862 int16x8_t v2588 = vaddq_s16(v2582, v2587); 2863 int16x8_t v2589 = vsubq_s16(v158, v181); 2864 int16x8_t v2590 = vsubq_s16(v213, v231); 2865 int16x8_t v2591_tmp = vqrdmulhq_n_s16(v2590, 3314); 2866 int16x8_t v2591 = vmlaq_n_s16(v2591_tmp, v2590, 5); 2867 int16x8_t v2592 = vaddq_s16(v2589, v2591); 2868 int16x8_t v2593 = vsubq_s16(v251, v274); 2869 int16x8_t v2594 = vsubq_s16(v292, v310); 2870 int16x8_t v2595_tmp = vqrdmulhq_n_s16(v2594, 3314); 2871 int16x8_t v2595 = vmlaq_n_s16(v2595_tmp, v2594, 5); 2872 int16x8_t v2596 = vaddq_s16(v2593, v2595); 2873 int16x8_t v2597 = vqrdmulhq_n_s16(v2596, 22112); 2874 int16x8_t v2598 = vaddq_s16(v2592, v2597); 2875 int16x8_t v2599 = vqrdmulhq_n_s16(v2598, 17561); 2876 int16x8_t v2600 = vaddq_s16(v2588, v2599); 2877 int16x8_t v2601 = vsubq_s16(v334, v357); 2878 int16x8_t v2602 = vsubq_s16(v389, v407); 2879 int16x8_t v2603_tmp = vqrdmulhq_n_s16(v2602, 3314); 2880 int16x8_t v2603 = vmlaq_n_s16(v2603_tmp, v2602, 5); 2881 int16x8_t v2604 = vaddq_s16(v2601, v2603); 2882 int16x8_t v2605 = vsubq_s16(v441, v480); 2883 int16x8_t v2606 = vsubq_s16(v498, v517); 2884 int16x8_t v2607_tmp = vqrdmulhq_n_s16(v2606, 3314); 2885 int16x8_t v2607 = vmlaq_n_s16(v2607_tmp, v2606, 5); 2886 int16x8_t v2608 = vaddq_s16(v2605, v2607); 2887 int16x8_t v2609 = vqrdmulhq_n_s16(v2608, 22112); 2888 int16x8_t v2610 = vaddq_s16(v2604, v2609); 2889 int16x8_t v2611 = vsubq_s16(v539, v562); 2890 int16x8_t v2612 = vsubq_s16(v594, v612); 2891 int16x8_t v2613_tmp = vqrdmulhq_n_s16(v2612, 3314); 2892 int16x8_t v2613 = vmlaq_n_s16(v2613_tmp, v2612, 5); 2893 int16x8_t v2614 = vaddq_s16(v2611, v2613); 2894 int16x8_t v2615 = vsubq_s16(v632, v655); 2895 int16x8_t v2616 = vsubq_s16(v673, v692); 2896 int16x8_t v2617_tmp = vqrdmulhq_n_s16(v2616, 3314); 2897 int16x8_t v2617 = vmlaq_n_s16(v2617_tmp, v2616, 5); 2898 int16x8_t v2618 = vaddq_s16(v2615, v2617); 2899 int16x8_t v2619 = vqrdmulhq_n_s16(v2618, 22112); 2900 int16x8_t v2620 = vaddq_s16(v2614, v2619); 2901 int16x8_t v2621 = vqrdmulhq_n_s16(v2620, 17561); 2902 int16x8_t v2622 = vaddq_s16(v2610, v2621); 2903 int16x8_t v2623 = vqrdmulhq_n_s16(v2622, 16666); 2904 int16x8_t v2624 = vaddq_s16(v2600, v2623); 2905 int16x8_t v2625 = vsubq_s16(v718, v741); 2906 int16x8_t v2626 = vsubq_s16(v773, v791); 2907 int16x8_t v2627_tmp = vqrdmulhq_n_s16(v2626, 3314); 2908 int16x8_t v2627 = vmlaq_n_s16(v2627_tmp, v2626, 5); 2909 int16x8_t v2628 = vaddq_s16(v2625, v2627); 2910 int16x8_t v2629 = vsubq_s16(v825, v864); 2911 int16x8_t v2630 = vsubq_s16(v882, v901); 2912 int16x8_t v2631_tmp = vqrdmulhq_n_s16(v2630, 3314); 2913 int16x8_t v2631 = vmlaq_n_s16(v2631_tmp, v2630, 5); 2914 int16x8_t v2632 = vaddq_s16(v2629, v2631); 2915 int16x8_t v2633 = vqrdmulhq_n_s16(v2632, 22112); 2916 int16x8_t v2634 = vaddq_s16(v2628, v2633); 2917 int16x8_t v2635 = vsubq_s16(v937, v976); 2918 int16x8_t v2636 = vsubq_s16(v1036, v1058); 2919 int16x8_t v2637_tmp = vqrdmulhq_n_s16(v2636, 3314); 2920 int16x8_t v2637 = vmlaq_n_s16(v2637_tmp, v2636, 5); 2921 int16x8_t v2638 = vaddq_s16(v2635, v2637); 2922 int16x8_t v2639 = vsubq_s16(v1078, v1101); 2923 int16x8_t v2640 = vsubq_s16(v1119, v1139); 2924 int16x8_t v2641_tmp = vqrdmulhq_n_s16(v2640, 3314); 2925 int16x8_t v2641 = vmlaq_n_s16(v2641_tmp, v2640, 5); 2926 int16x8_t v2642 = vaddq_s16(v2639, v2641); 2927 int16x8_t v2643 = vqrdmulhq_n_s16(v2642, 22112); 2928 int16x8_t v2644 = vaddq_s16(v2638, v2643); 2929 int16x8_t v2645 = vqrdmulhq_n_s16(v2644, 17561); 2930 int16x8_t v2646 = vaddq_s16(v2634, v2645); 2931 int16x8_t v2647 = vsubq_s16(v1163, v1186); 2932 int16x8_t v2648 = vsubq_s16(v1218, v1236); 2933 int16x8_t v2649_tmp = vqrdmulhq_n_s16(v2648, 3314); 2934 int16x8_t v2649 = vmlaq_n_s16(v2649_tmp, v2648, 5); 2935 int16x8_t v2650 = vaddq_s16(v2647, v2649); 2936 int16x8_t v2651 = vsubq_s16(v1270, v1309); 2937 int16x8_t v2652 = vsubq_s16(v1327, v1346); 2938 int16x8_t v2653_tmp = vqrdmulhq_n_s16(v2652, 3314); 2939 int16x8_t v2653 = vmlaq_n_s16(v2653_tmp, v2652, 5); 2940 int16x8_t v2654 = vaddq_s16(v2651, v2653); 2941 int16x8_t v2655 = vqrdmulhq_n_s16(v2654, 22112); 2942 int16x8_t v2656 = vaddq_s16(v2650, v2655); 2943 int16x8_t v2657 = vsubq_s16(v1368, v1391); 2944 int16x8_t v2658 = vsubq_s16(v1423, v1441); 2945 int16x8_t v2659_tmp = vqrdmulhq_n_s16(v2658, 3314); 2946 int16x8_t v2659 = vmlaq_n_s16(v2659_tmp, v2658, 5); 2947 int16x8_t v2660 = vaddq_s16(v2657, v2659); 2948 int16x8_t v2661 = vsubq_s16(v1461, v1484); 2949 int16x8_t v2662 = vsubq_s16(v1502, v1522); 2950 int16x8_t v2663_tmp = vqrdmulhq_n_s16(v2662, 3314); 2951 int16x8_t v2663 = vmlaq_n_s16(v2663_tmp, v2662, 5); 2952 int16x8_t v2664 = vaddq_s16(v2661, v2663); 2953 int16x8_t v2665 = vqrdmulhq_n_s16(v2664, 22112); 2954 int16x8_t v2666 = vaddq_s16(v2660, v2665); 2955 int16x8_t v2667 = vqrdmulhq_n_s16(v2666, 17561); 2956 int16x8_t v2668 = vaddq_s16(v2656, v2667); 2957 int16x8_t v2669 = vqrdmulhq_n_s16(v2668, 16666); 2958 int16x8_t v2670 = vaddq_s16(v2646, v2669); 2959 int16x8_t v2671 = vqrdmulhq_n_s16(v2670, 16454); 2960 int16x8_t v2672 = vaddq_s16(v2624, v2671); 2961 int16x8_t v2673 = vsubq_s16(v2579, v2581); 2962 int16x8_t v2674 = vsubq_s16(v2583, v2585); 2963 int16x8_t v2675 = vqrdmulhq_n_s16(v2674, 24397); 2964 int16x8_t v2676 = vaddq_s16(v2673, v2675); 2965 int16x8_t v2677 = vsubq_s16(v2589, v2591); 2966 int16x8_t v2678 = vsubq_s16(v2593, v2595); 2967 int16x8_t v2679 = vqrdmulhq_n_s16(v2678, 24397); 2968 int16x8_t v2680 = vaddq_s16(v2677, v2679); 2969 int16x8_t v2681 = vqrdmulhq_n_s16(v2680, 17921); 2970 int16x8_t v2682 = vaddq_s16(v2676, v2681); 2971 int16x8_t v2683 = vsubq_s16(v2601, v2603); 2972 int16x8_t v2684 = vsubq_s16(v2605, v2607); 2973 int16x8_t v2685 = vqrdmulhq_n_s16(v2684, 24397); 2974 int16x8_t v2686 = vaddq_s16(v2683, v2685); 2975 int16x8_t v2687 = vsubq_s16(v2611, v2613); 2976 int16x8_t v2688 = vsubq_s16(v2615, v2617); 2977 int16x8_t v2689 = vqrdmulhq_n_s16(v2688, 24397); 2978 int16x8_t v2690 = vaddq_s16(v2687, v2689); 2979 int16x8_t v2691 = vqrdmulhq_n_s16(v2690, 17921); 2980 int16x8_t v2692 = vaddq_s16(v2686, v2691); 2981 int16x8_t v2693 = vqrdmulhq_n_s16(v2692, 16747); 2982 int16x8_t v2694 = vaddq_s16(v2682, v2693); 2983 int16x8_t v2695 = vsubq_s16(v2625, v2627); 2984 int16x8_t v2696 = vsubq_s16(v2629, v2631); 2985 int16x8_t v2697 = vqrdmulhq_n_s16(v2696, 24397); 2986 int16x8_t v2698 = vaddq_s16(v2695, v2697); 2987 int16x8_t v2699 = vsubq_s16(v2635, v2637); 2988 int16x8_t v2700 = vsubq_s16(v2639, v2641); 2989 int16x8_t v2701 = vqrdmulhq_n_s16(v2700, 24397); 2990 int16x8_t v2702 = vaddq_s16(v2699, v2701); 2991 int16x8_t v2703 = vqrdmulhq_n_s16(v2702, 17921); 2992 int16x8_t v2704 = vaddq_s16(v2698, v2703); 2993 int16x8_t v2705 = vsubq_s16(v2647, v2649); 2994 int16x8_t v2706 = vsubq_s16(v2651, v2653); 2995 int16x8_t v2707 = vqrdmulhq_n_s16(v2706, 24397); 2996 int16x8_t v2708 = vaddq_s16(v2705, v2707); 2997 int16x8_t v2709 = vsubq_s16(v2657, v2659); 2998 int16x8_t v2710 = vsubq_s16(v2661, v2663); 2999 int16x8_t v2711 = vqrdmulhq_n_s16(v2710, 24397); 3000 int16x8_t v2712 = vaddq_s16(v2709, v2711); 3001 int16x8_t v2713 = vqrdmulhq_n_s16(v2712, 17921); 3002 int16x8_t v2714 = vaddq_s16(v2708, v2713); 3003 int16x8_t v2715 = vqrdmulhq_n_s16(v2714, 16747); 3004 int16x8_t v2716 = vaddq_s16(v2704, v2715); 3005 int16x8_t v2717 = vqrdmulhq_n_s16(v2716, 16474); 3006 int16x8_t v2718 = vaddq_s16(v2694, v2717); 3007 int16x8_t v2719 = vsubq_s16(v2485, v2487); 3008 int16x8_t v2720 = vsubq_s16(v2489, v2491); 3009 int16x8_t v2721 = vqrdmulhq_n_s16(v2720, 27504); 3010 int16x8_t v2722 = vaddq_s16(v2719, v2721); 3011 int16x8_t v2723 = vsubq_s16(v2495, v2497); 3012 int16x8_t v2724 = vsubq_s16(v2499, v2501); 3013 int16x8_t v2725 = vqrdmulhq_n_s16(v2724, 27504); 3014 int16x8_t v2726 = vaddq_s16(v2723, v2725); 3015 int16x8_t v2727 = vqrdmulhq_n_s16(v2726, 18343); 3016 int16x8_t v2728 = vaddq_s16(v2722, v2727); 3017 int16x8_t v2729 = vsubq_s16(v2507, v2509); 3018 int16x8_t v2730 = vsubq_s16(v2511, v2513); 3019 int16x8_t v2731 = vqrdmulhq_n_s16(v2730, 27504); 3020 int16x8_t v2732 = vaddq_s16(v2729, v2731); 3021 int16x8_t v2733 = vsubq_s16(v2517, v2519); 3022 int16x8_t v2734 = vsubq_s16(v2521, v2523); 3023 int16x8_t v2735 = vqrdmulhq_n_s16(v2734, 27504); 3024 int16x8_t v2736 = vaddq_s16(v2733, v2735); 3025 int16x8_t v2737 = vqrdmulhq_n_s16(v2736, 18343); 3026 int16x8_t v2738 = vaddq_s16(v2732, v2737); 3027 int16x8_t v2739 = vqrdmulhq_n_s16(v2738, 16840); 3028 int16x8_t v2740 = vaddq_s16(v2728, v2739); 3029 int16x8_t v2741 = vsubq_s16(v2531, v2533); 3030 int16x8_t v2742 = vsubq_s16(v2535, v2537); 3031 int16x8_t v2743 = vqrdmulhq_n_s16(v2742, 27504); 3032 int16x8_t v2744 = vaddq_s16(v2741, v2743); 3033 int16x8_t v2745 = vsubq_s16(v2541, v2543); 3034 int16x8_t v2746 = vsubq_s16(v2545, v2547); 3035 int16x8_t v2747 = vqrdmulhq_n_s16(v2746, 27504); 3036 int16x8_t v2748 = vaddq_s16(v2745, v2747); 3037 int16x8_t v2749 = vqrdmulhq_n_s16(v2748, 18343); 3038 int16x8_t v2750 = vaddq_s16(v2744, v2749); 3039 int16x8_t v2751 = vsubq_s16(v2553, v2555); 3040 int16x8_t v2752 = vsubq_s16(v2557, v2559); 3041 int16x8_t v2753 = vqrdmulhq_n_s16(v2752, 27504); 3042 int16x8_t v2754 = vaddq_s16(v2751, v2753); 3043 int16x8_t v2755 = vsubq_s16(v2563, v2565); 3044 int16x8_t v2756 = vsubq_s16(v2567, v2569); 3045 int16x8_t v2757 = vqrdmulhq_n_s16(v2756, 27504); 3046 int16x8_t v2758 = vaddq_s16(v2755, v2757); 3047 int16x8_t v2759 = vqrdmulhq_n_s16(v2758, 18343); 3048 int16x8_t v2760 = vaddq_s16(v2754, v2759); 3049 int16x8_t v2761 = vqrdmulhq_n_s16(v2760, 16840); 3050 int16x8_t v2762 = vaddq_s16(v2750, v2761); 3051 int16x8_t v2763 = vqrdmulhq_n_s16(v2762, 16496); 3052 int16x8_t v2764 = vaddq_s16(v2740, v2763); 3053 int16x8_t v2765 = vsubq_s16(v2390, v2392); 3054 int16x8_t v2766 = vsubq_s16(v2394, v2396); 3055 int16x8_t v2767 = vqrdmulhq_n_s16(v2766, 31869); 3056 int16x8_t v2768 = vaddq_s16(v2765, v2767); 3057 int16x8_t v2769 = vsubq_s16(v2400, v2402); 3058 int16x8_t v2770 = vsubq_s16(v2404, v2406); 3059 int16x8_t v2771 = vqrdmulhq_n_s16(v2770, 31869); 3060 int16x8_t v2772 = vaddq_s16(v2769, v2771); 3061 int16x8_t v2773 = vqrdmulhq_n_s16(v2772, 18830); 3062 int16x8_t v2774 = vaddq_s16(v2768, v2773); 3063 int16x8_t v2775 = vsubq_s16(v2412, v2414); 3064 int16x8_t v2776 = vsubq_s16(v2416, v2418); 3065 int16x8_t v2777 = vqrdmulhq_n_s16(v2776, 31869); 3066 int16x8_t v2778 = vaddq_s16(v2775, v2777); 3067 int16x8_t v2779 = vsubq_s16(v2422, v2424); 3068 int16x8_t v2780 = vsubq_s16(v2426, v2428); 3069 int16x8_t v2781 = vqrdmulhq_n_s16(v2780, 31869); 3070 int16x8_t v2782 = vaddq_s16(v2779, v2781); 3071 int16x8_t v2783 = vqrdmulhq_n_s16(v2782, 18830); 3072 int16x8_t v2784 = vaddq_s16(v2778, v2783); 3073 int16x8_t v2785 = vqrdmulhq_n_s16(v2784, 16944); 3074 int16x8_t v2786 = vaddq_s16(v2774, v2785); 3075 int16x8_t v2787 = vsubq_s16(v2436, v2438); 3076 int16x8_t v2788 = vsubq_s16(v2440, v2442); 3077 int16x8_t v2789 = vqrdmulhq_n_s16(v2788, 31869); 3078 int16x8_t v2790 = vaddq_s16(v2787, v2789); 3079 int16x8_t v2791 = vsubq_s16(v2446, v2448); 3080 int16x8_t v2792 = vsubq_s16(v2450, v2452); 3081 int16x8_t v2793 = vqrdmulhq_n_s16(v2792, 31869); 3082 int16x8_t v2794 = vaddq_s16(v2791, v2793); 3083 int16x8_t v2795 = vqrdmulhq_n_s16(v2794, 18830); 3084 int16x8_t v2796 = vaddq_s16(v2790, v2795); 3085 int16x8_t v2797 = vsubq_s16(v2458, v2460); 3086 int16x8_t v2798 = vsubq_s16(v2462, v2465); 3087 int16x8_t v2799 = vqrdmulhq_n_s16(v2798, 31869); 3088 int16x8_t v2800 = vaddq_s16(v2797, v2799); 3089 int16x8_t v2801 = vsubq_s16(v2469, v2471); 3090 int16x8_t v2802 = vsubq_s16(v2473, v2475); 3091 int16x8_t v2803 = vqrdmulhq_n_s16(v2802, 31869); 3092 int16x8_t v2804 = vaddq_s16(v2801, v2803); 3093 int16x8_t v2805 = vqrdmulhq_n_s16(v2804, 18830); 3094 int16x8_t v2806 = vaddq_s16(v2800, v2805); 3095 int16x8_t v2807 = vqrdmulhq_n_s16(v2806, 16944); 3096 int16x8_t v2808 = vaddq_s16(v2796, v2807); 3097 int16x8_t v2809 = vqrdmulhq_n_s16(v2808, 16521); 3098 int16x8_t v2810 = vaddq_s16(v2786, v2809); 3099 int16x8_t v2811 = vsubq_s16(v2296, v2298); 3100 int16x8_t v2812 = vsubq_s16(v2300, v2302); 3101 int16x8_t v2813_tmp = vqrdmulhq_n_s16(v2812, 5552); 3102 int16x8_t v2813 = vaddq_s16(v2813_tmp, v2812); 3103 int16x8_t v2814 = vaddq_s16(v2811, v2813); 3104 int16x8_t v2815 = vsubq_s16(v2306, v2308); 3105 int16x8_t v2816 = vsubq_s16(v2310, v2312); 3106 int16x8_t v2817_tmp = vqrdmulhq_n_s16(v2816, 5552); 3107 int16x8_t v2817 = vaddq_s16(v2817_tmp, v2816); 3108 int16x8_t v2818 = vaddq_s16(v2815, v2817); 3109 int16x8_t v2819 = vqrdmulhq_n_s16(v2818, 19393); 3110 int16x8_t v2820 = vaddq_s16(v2814, v2819); 3111 int16x8_t v2821 = vsubq_s16(v2318, v2320); 3112 int16x8_t v2822 = vsubq_s16(v2322, v2324); 3113 int16x8_t v2823_tmp = vqrdmulhq_n_s16(v2822, 5552); 3114 int16x8_t v2823 = vaddq_s16(v2823_tmp, v2822); 3115 int16x8_t v2824 = vaddq_s16(v2821, v2823); 3116 int16x8_t v2825 = vsubq_s16(v2328, v2330); 3117 int16x8_t v2826 = vsubq_s16(v2332, v2334); 3118 int16x8_t v2827_tmp = vqrdmulhq_n_s16(v2826, 5552); 3119 int16x8_t v2827 = vaddq_s16(v2827_tmp, v2826); 3120 int16x8_t v2828 = vaddq_s16(v2825, v2827); 3121 int16x8_t v2829 = vqrdmulhq_n_s16(v2828, 19393); 3122 int16x8_t v2830 = vaddq_s16(v2824, v2829); 3123 int16x8_t v2831 = vqrdmulhq_n_s16(v2830, 17059); 3124 int16x8_t v2832 = vaddq_s16(v2820, v2831); 3125 int16x8_t v2833 = vsubq_s16(v2342, v2344); 3126 int16x8_t v2834 = vsubq_s16(v2346, v2348); 3127 int16x8_t v2835_tmp = vqrdmulhq_n_s16(v2834, 5552); 3128 int16x8_t v2835 = vaddq_s16(v2835_tmp, v2834); 3129 int16x8_t v2836 = vaddq_s16(v2833, v2835); 3130 int16x8_t v2837 = vsubq_s16(v2352, v2354); 3131 int16x8_t v2838 = vsubq_s16(v2356, v2358); 3132 int16x8_t v2839_tmp = vqrdmulhq_n_s16(v2838, 5552); 3133 int16x8_t v2839 = vaddq_s16(v2839_tmp, v2838); 3134 int16x8_t v2840 = vaddq_s16(v2837, v2839); 3135 int16x8_t v2841 = vqrdmulhq_n_s16(v2840, 19393); 3136 int16x8_t v2842 = vaddq_s16(v2836, v2841); 3137 int16x8_t v2843 = vsubq_s16(v2364, v2366); 3138 int16x8_t v2844 = vsubq_s16(v2368, v2370); 3139 int16x8_t v2845_tmp = vqrdmulhq_n_s16(v2844, 5552); 3140 int16x8_t v2845 = vaddq_s16(v2845_tmp, v2844); 3141 int16x8_t v2846 = vaddq_s16(v2843, v2845); 3142 int16x8_t v2847 = vsubq_s16(v2374, v2376); 3143 int16x8_t v2848 = vsubq_s16(v2378, v2380); 3144 int16x8_t v2849_tmp = vqrdmulhq_n_s16(v2848, 5552); 3145 int16x8_t v2849 = vaddq_s16(v2849_tmp, v2848); 3146 int16x8_t v2850 = vaddq_s16(v2847, v2849); 3147 int16x8_t v2851 = vqrdmulhq_n_s16(v2850, 19393); 3148 int16x8_t v2852 = vaddq_s16(v2846, v2851); 3149 int16x8_t v2853 = vqrdmulhq_n_s16(v2852, 17059); 3150 int16x8_t v2854 = vaddq_s16(v2842, v2853); 3151 int16x8_t v2855 = vqrdmulhq_n_s16(v2854, 16549); 3152 int16x8_t v2856 = vaddq_s16(v2832, v2855); 3153 int16x8_t v2857 = vsubq_s16(v2109, v2114); 3154 int16x8_t v2858 = vsubq_s16(v2119, v2124); 3155 int16x8_t v2859_tmp = vqrdmulhq_n_s16(v2858, 15865); 3156 int16x8_t v2859 = vaddq_s16(v2859_tmp, v2858); 3157 int16x8_t v2860 = vaddq_s16(v2857, v2859); 3158 int16x8_t v2861 = vsubq_s16(v2131, v2136); 3159 int16x8_t v2862 = vsubq_s16(v2141, v2146); 3160 int16x8_t v2863_tmp = vqrdmulhq_n_s16(v2862, 15865); 3161 int16x8_t v2863 = vaddq_s16(v2863_tmp, v2862); 3162 int16x8_t v2864 = vaddq_s16(v2861, v2863); 3163 int16x8_t v2865 = vqrdmulhq_n_s16(v2864, 20040); 3164 int16x8_t v2866 = vaddq_s16(v2860, v2865); 3165 int16x8_t v2867 = vsubq_s16(v2155, v2160); 3166 int16x8_t v2868 = vsubq_s16(v2165, v2170); 3167 int16x8_t v2869_tmp = vqrdmulhq_n_s16(v2868, 15865); 3168 int16x8_t v2869 = vaddq_s16(v2869_tmp, v2868); 3169 int16x8_t v2870 = vaddq_s16(v2867, v2869); 3170 int16x8_t v2871 = vsubq_s16(v2177, v2182); 3171 int16x8_t v2872 = vsubq_s16(v2187, v2192); 3172 int16x8_t v2873_tmp = vqrdmulhq_n_s16(v2872, 15865); 3173 int16x8_t v2873 = vaddq_s16(v2873_tmp, v2872); 3174 int16x8_t v2874 = vaddq_s16(v2871, v2873); 3175 int16x8_t v2875 = vqrdmulhq_n_s16(v2874, 20040); 3176 int16x8_t v2876 = vaddq_s16(v2870, v2875); 3177 int16x8_t v2877 = vqrdmulhq_n_s16(v2876, 17187); 3178 int16x8_t v2878 = vaddq_s16(v2866, v2877); 3179 int16x8_t v2879 = vsubq_s16(v2203, v2208); 3180 int16x8_t v2880 = vsubq_s16(v2213, v2218); 3181 int16x8_t v2881_tmp = vqrdmulhq_n_s16(v2880, 15865); 3182 int16x8_t v2881 = vaddq_s16(v2881_tmp, v2880); 3183 int16x8_t v2882 = vaddq_s16(v2879, v2881); 3184 int16x8_t v2883 = vsubq_s16(v2225, v2230); 3185 int16x8_t v2884 = vsubq_s16(v2235, v2240); 3186 int16x8_t v2885_tmp = vqrdmulhq_n_s16(v2884, 15865); 3187 int16x8_t v2885 = vaddq_s16(v2885_tmp, v2884); 3188 int16x8_t v2886 = vaddq_s16(v2883, v2885); 3189 int16x8_t v2887 = vqrdmulhq_n_s16(v2886, 20040); 3190 int16x8_t v2888 = vaddq_s16(v2882, v2887); 3191 int16x8_t v2889 = vsubq_s16(v2249, v2254); 3192 int16x8_t v2890 = vsubq_s16(v2259, v2264); 3193 int16x8_t v2891_tmp = vqrdmulhq_n_s16(v2890, 15865); 3194 int16x8_t v2891 = vaddq_s16(v2891_tmp, v2890); 3195 int16x8_t v2892 = vaddq_s16(v2889, v2891); 3196 int16x8_t v2893 = vsubq_s16(v2271, v2276); 3197 int16x8_t v2894 = vsubq_s16(v2281, v2286); 3198 int16x8_t v2895_tmp = vqrdmulhq_n_s16(v2894, 15865); 3199 int16x8_t v2895 = vaddq_s16(v2895_tmp, v2894); 3200 int16x8_t v2896 = vaddq_s16(v2893, v2895); 3201 int16x8_t v2897 = vqrdmulhq_n_s16(v2896, 20040); 3202 int16x8_t v2898 = vaddq_s16(v2892, v2897); 3203 int16x8_t v2899 = vqrdmulhq_n_s16(v2898, 17187); 3204 int16x8_t v2900 = vaddq_s16(v2888, v2899); 3205 int16x8_t v2901 = vqrdmulhq_n_s16(v2900, 16579); 3206 int16x8_t v2902 = vaddq_s16(v2878, v2901); 3207 int16x8_t v2903 = vsubq_s16(v1919, v1924); 3208 int16x8_t v2904 = vsubq_s16(v1929, v1934); 3209 int16x8_t v2905_tmp = vqrdmulhq_n_s16(v2904, 1893); 3210 int16x8_t v2905 = vmlaq_n_s16(v2905_tmp, v2904, 2); 3211 int16x8_t v2906 = vaddq_s16(v2903, v2905); 3212 int16x8_t v2907 = vsubq_s16(v1941, v1946); 3213 int16x8_t v2908 = vsubq_s16(v1951, v1956); 3214 int16x8_t v2909_tmp = vqrdmulhq_n_s16(v2908, 1893); 3215 int16x8_t v2909 = vmlaq_n_s16(v2909_tmp, v2908, 2); 3216 int16x8_t v2910 = vaddq_s16(v2907, v2909); 3217 int16x8_t v2911 = vqrdmulhq_n_s16(v2910, 20783); 3218 int16x8_t v2912 = vaddq_s16(v2906, v2911); 3219 int16x8_t v2913 = vsubq_s16(v1965, v1970); 3220 int16x8_t v2914 = vsubq_s16(v1975, v1980); 3221 int16x8_t v2915_tmp = vqrdmulhq_n_s16(v2914, 1893); 3222 int16x8_t v2915 = vmlaq_n_s16(v2915_tmp, v2914, 2); 3223 int16x8_t v2916 = vaddq_s16(v2913, v2915); 3224 int16x8_t v2917 = vsubq_s16(v1987, v1992); 3225 int16x8_t v2918 = vsubq_s16(v1997, v2002); 3226 int16x8_t v2919_tmp = vqrdmulhq_n_s16(v2918, 1893); 3227 int16x8_t v2919 = vmlaq_n_s16(v2919_tmp, v2918, 2); 3228 int16x8_t v2920 = vaddq_s16(v2917, v2919); 3229 int16x8_t v2921 = vqrdmulhq_n_s16(v2920, 20783); 3230 int16x8_t v2922 = vaddq_s16(v2916, v2921); 3231 int16x8_t v2923 = vqrdmulhq_n_s16(v2922, 17326); 3232 int16x8_t v2924 = vaddq_s16(v2912, v2923); 3233 int16x8_t v2925 = vsubq_s16(v2013, v2018); 3234 int16x8_t v2926 = vsubq_s16(v2023, v2028); 3235 int16x8_t v2927_tmp = vqrdmulhq_n_s16(v2926, 1893); 3236 int16x8_t v2927 = vmlaq_n_s16(v2927_tmp, v2926, 2); 3237 int16x8_t v2928 = vaddq_s16(v2925, v2927); 3238 int16x8_t v2929 = vsubq_s16(v2035, v2040); 3239 int16x8_t v2930 = vsubq_s16(v2045, v2050); 3240 int16x8_t v2931_tmp = vqrdmulhq_n_s16(v2930, 1893); 3241 int16x8_t v2931 = vmlaq_n_s16(v2931_tmp, v2930, 2); 3242 int16x8_t v2932 = vaddq_s16(v2929, v2931); 3243 int16x8_t v2933 = vqrdmulhq_n_s16(v2932, 20783); 3244 int16x8_t v2934 = vaddq_s16(v2928, v2933); 3245 int16x8_t v2935 = vsubq_s16(v2059, v2064); 3246 int16x8_t v2936 = vsubq_s16(v2069, v2074); 3247 int16x8_t v2937_tmp = vqrdmulhq_n_s16(v2936, 1893); 3248 int16x8_t v2937 = vmlaq_n_s16(v2937_tmp, v2936, 2); 3249 int16x8_t v2938 = vaddq_s16(v2935, v2937); 3250 int16x8_t v2939 = vsubq_s16(v2081, v2086); 3251 int16x8_t v2940 = vsubq_s16(v2091, v2096); 3252 int16x8_t v2941_tmp = vqrdmulhq_n_s16(v2940, 1893); 3253 int16x8_t v2941 = vmlaq_n_s16(v2941_tmp, v2940, 2); 3254 int16x8_t v2942 = vaddq_s16(v2939, v2941); 3255 int16x8_t v2943 = vqrdmulhq_n_s16(v2942, 20783); 3256 int16x8_t v2944 = vaddq_s16(v2938, v2943); 3257 int16x8_t v2945 = vqrdmulhq_n_s16(v2944, 17326); 3258 int16x8_t v2946 = vaddq_s16(v2934, v2945); 3259 int16x8_t v2947 = vqrdmulhq_n_s16(v2946, 16611); 3260 int16x8_t v2948 = vaddq_s16(v2924, v2947); 3261 int16x8_t v2949 = vsubq_s16(v1543, v1554); 3262 int16x8_t v2950 = vsubq_s16(v1565, v1576); 3263 int16x8_t v2951_tmp = vqrdmulhq_n_s16(v2950, 13357); 3264 int16x8_t v2951 = vmlaq_n_s16(v2951_tmp, v2950, 3); 3265 int16x8_t v2952 = vaddq_s16(v2949, v2951); 3266 int16x8_t v2953 = vsubq_s16(v1589, v1600); 3267 int16x8_t v2954 = vsubq_s16(v1611, v1622); 3268 int16x8_t v2955_tmp = vqrdmulhq_n_s16(v2954, 13357); 3269 int16x8_t v2955 = vmlaq_n_s16(v2955_tmp, v2954, 3); 3270 int16x8_t v2956 = vaddq_s16(v2953, v2955); 3271 int16x8_t v2957 = vqrdmulhq_n_s16(v2956, 21637); 3272 int16x8_t v2958 = vaddq_s16(v2952, v2957); 3273 int16x8_t v2959 = vsubq_s16(v1637, v1648); 3274 int16x8_t v2960 = vsubq_s16(v1659, v1670); 3275 int16x8_t v2961_tmp = vqrdmulhq_n_s16(v2960, 13357); 3276 int16x8_t v2961 = vmlaq_n_s16(v2961_tmp, v2960, 3); 3277 int16x8_t v2962 = vaddq_s16(v2959, v2961); 3278 int16x8_t v2963 = vsubq_s16(v1683, v1694); 3279 int16x8_t v2964 = vsubq_s16(v1705, v1716); 3280 int16x8_t v2965_tmp = vqrdmulhq_n_s16(v2964, 13357); 3281 int16x8_t v2965 = vmlaq_n_s16(v2965_tmp, v2964, 3); 3282 int16x8_t v2966 = vaddq_s16(v2963, v2965); 3283 int16x8_t v2967 = vqrdmulhq_n_s16(v2966, 21637); 3284 int16x8_t v2968 = vaddq_s16(v2962, v2967); 3285 int16x8_t v2969 = vqrdmulhq_n_s16(v2968, 17479); 3286 int16x8_t v2970 = vaddq_s16(v2958, v2969); 3287 int16x8_t v2971 = vsubq_s16(v1733, v1744); 3288 int16x8_t v2972 = vsubq_s16(v1755, v1766); 3289 int16x8_t v2973_tmp = vqrdmulhq_n_s16(v2972, 13357); 3290 int16x8_t v2973 = vmlaq_n_s16(v2973_tmp, v2972, 3); 3291 int16x8_t v2974 = vaddq_s16(v2971, v2973); 3292 int16x8_t v2975 = vsubq_s16(v1779, v1790); 3293 int16x8_t v2976 = vsubq_s16(v1801, v1812); 3294 int16x8_t v2977_tmp = vqrdmulhq_n_s16(v2976, 13357); 3295 int16x8_t v2977 = vmlaq_n_s16(v2977_tmp, v2976, 3); 3296 int16x8_t v2978 = vaddq_s16(v2975, v2977); 3297 int16x8_t v2979 = vqrdmulhq_n_s16(v2978, 21637); 3298 int16x8_t v2980 = vaddq_s16(v2974, v2979); 3299 int16x8_t v2981 = vsubq_s16(v1827, v1838); 3300 int16x8_t v2982 = vsubq_s16(v1849, v1860); 3301 int16x8_t v2983_tmp = vqrdmulhq_n_s16(v2982, 13357); 3302 int16x8_t v2983 = vmlaq_n_s16(v2983_tmp, v2982, 3); 3303 int16x8_t v2984 = vaddq_s16(v2981, v2983); 3304 int16x8_t v2985 = vsubq_s16(v1873, v1884); 3305 int16x8_t v2986 = vsubq_s16(v1895, v1906); 3306 int16x8_t v2987_tmp = vqrdmulhq_n_s16(v2986, 13357); 3307 int16x8_t v2987 = vmlaq_n_s16(v2987_tmp, v2986, 3); 3308 int16x8_t v2988 = vaddq_s16(v2985, v2987); 3309 int16x8_t v2989 = vqrdmulhq_n_s16(v2988, 21637); 3310 int16x8_t v2990 = vaddq_s16(v2984, v2989); 3311 int16x8_t v2991 = vqrdmulhq_n_s16(v2990, 17479); 3312 int16x8_t v2992 = vaddq_s16(v2980, v2991); 3313 int16x8_t v2993 = vqrdmulhq_n_s16(v2992, 16647); 3314 int16x8_t v2994 = vaddq_s16(v2970, v2993); 3315 int16x8_t v2995 = vsubq_s16(v25, v60); 3316 int16x8_t v2996 = vsubq_s16(v102, v138); 3317 int16x8_t v2997_tmp = vqrdmulhq_n_s16(v2996, 6226); 3318 int16x8_t v2997 = vmlaq_n_s16(v2997_tmp, v2996, 10); 3319 int16x8_t v2998 = vaddq_s16(v2995, v2997); 3320 int16x8_t v2999 = vsubq_s16(v182, v233); 3321 int16x8_t v3000 = vsubq_s16(v275, v312); 3322 int16x8_t v3001_tmp = vqrdmulhq_n_s16(v3000, 6226); 3323 int16x8_t v3001 = vmlaq_n_s16(v3001_tmp, v3000, 10); 3324 int16x8_t v3002 = vaddq_s16(v2999, v3001); 3325 int16x8_t v3003 = vqrdmulhq_n_s16(v3002, 22622); 3326 int16x8_t v3004 = vaddq_s16(v2998, v3003); 3327 int16x8_t v3005 = vsubq_s16(v358, v409); 3328 int16x8_t v3006 = vsubq_s16(v481, v519); 3329 int16x8_t v3007_tmp = vqrdmulhq_n_s16(v3006, 6226); 3330 int16x8_t v3007 = vmlaq_n_s16(v3007_tmp, v3006, 10); 3331 int16x8_t v3008 = vaddq_s16(v3005, v3007); 3332 int16x8_t v3009 = vsubq_s16(v563, v614); 3333 int16x8_t v3010 = vsubq_s16(v656, v694); 3334 int16x8_t v3011_tmp = vqrdmulhq_n_s16(v3010, 6226); 3335 int16x8_t v3011 = vmlaq_n_s16(v3011_tmp, v3010, 10); 3336 int16x8_t v3012 = vaddq_s16(v3009, v3011); 3337 int16x8_t v3013 = vqrdmulhq_n_s16(v3012, 22622); 3338 int16x8_t v3014 = vaddq_s16(v3008, v3013); 3339 int16x8_t v3015 = vqrdmulhq_n_s16(v3014, 17646); 3340 int16x8_t v3016 = vaddq_s16(v3004, v3015); 3341 int16x8_t v3017 = vsubq_s16(v742, v793); 3342 int16x8_t v3018 = vsubq_s16(v865, v903); 3343 int16x8_t v3019_tmp = vqrdmulhq_n_s16(v3018, 6226); 3344 int16x8_t v3019 = vmlaq_n_s16(v3019_tmp, v3018, 10); 3345 int16x8_t v3020 = vaddq_s16(v3017, v3019); 3346 int16x8_t v3021 = vsubq_s16(v977, v1060); 3347 int16x8_t v3022 = vsubq_s16(v1102, v1141); 3348 int16x8_t v3023_tmp = vqrdmulhq_n_s16(v3022, 6226); 3349 int16x8_t v3023 = vmlaq_n_s16(v3023_tmp, v3022, 10); 3350 int16x8_t v3024 = vaddq_s16(v3021, v3023); 3351 int16x8_t v3025 = vqrdmulhq_n_s16(v3024, 22622); 3352 int16x8_t v3026 = vaddq_s16(v3020, v3025); 3353 int16x8_t v3027 = vsubq_s16(v1187, v1238); 3354 int16x8_t v3028 = vsubq_s16(v1310, v1348); 3355 int16x8_t v3029_tmp = vqrdmulhq_n_s16(v3028, 6226); 3356 int16x8_t v3029 = vmlaq_n_s16(v3029_tmp, v3028, 10); 3357 int16x8_t v3030 = vaddq_s16(v3027, v3029); 3358 int16x8_t v3031 = vsubq_s16(v1392, v1443); 3359 int16x8_t v3032 = vsubq_s16(v1485, v1524); 3360 int16x8_t v3033_tmp = vqrdmulhq_n_s16(v3032, 6226); 3361 int16x8_t v3033 = vmlaq_n_s16(v3033_tmp, v3032, 10); 3362 int16x8_t v3034 = vaddq_s16(v3031, v3033); 3363 int16x8_t v3035 = vqrdmulhq_n_s16(v3034, 22622); 3364 int16x8_t v3036 = vaddq_s16(v3030, v3035); 3365 int16x8_t v3037 = vqrdmulhq_n_s16(v3036, 17646); 3366 int16x8_t v3038 = vaddq_s16(v3026, v3037); 3367 int16x8_t v3039 = vqrdmulhq_n_s16(v3038, 16685); 3368 int16x8_t v3040 = vaddq_s16(v3016, v3039); 3369 int16x8_t v3041 = vsubq_s16(v2995, v2997); 3370 int16x8_t v3042 = vsubq_s16(v2999, v3001); 3371 int16x8_t v3043 = vqrdmulhq_n_s16(v3042, 23761); 3372 int16x8_t v3044 = vaddq_s16(v3041, v3043); 3373 int16x8_t v3045 = vsubq_s16(v3005, v3007); 3374 int16x8_t v3046 = vsubq_s16(v3009, v3011); 3375 int16x8_t v3047 = vqrdmulhq_n_s16(v3046, 23761); 3376 int16x8_t v3048 = vaddq_s16(v3045, v3047); 3377 int16x8_t v3049 = vqrdmulhq_n_s16(v3048, 17826); 3378 int16x8_t v3050 = vaddq_s16(v3044, v3049); 3379 int16x8_t v3051 = vsubq_s16(v3017, v3019); 3380 int16x8_t v3052 = vsubq_s16(v3021, v3023); 3381 int16x8_t v3053 = vqrdmulhq_n_s16(v3052, 23761); 3382 int16x8_t v3054 = vaddq_s16(v3051, v3053); 3383 int16x8_t v3055 = vsubq_s16(v3027, v3029); 3384 int16x8_t v3056 = vsubq_s16(v3031, v3033); 3385 int16x8_t v3057 = vqrdmulhq_n_s16(v3056, 23761); 3386 int16x8_t v3058 = vaddq_s16(v3055, v3057); 3387 int16x8_t v3059 = vqrdmulhq_n_s16(v3058, 17826); 3388 int16x8_t v3060 = vaddq_s16(v3054, v3059); 3389 int16x8_t v3061 = vqrdmulhq_n_s16(v3060, 16726); 3390 int16x8_t v3062 = vaddq_s16(v3050, v3061); 3391 int16x8_t v3063 = vsubq_s16(v2949, v2951); 3392 int16x8_t v3064 = vsubq_s16(v2953, v2955); 3393 int16x8_t v3065 = vqrdmulhq_n_s16(v3064, 25084); 3394 int16x8_t v3066 = vaddq_s16(v3063, v3065); 3395 int16x8_t v3067 = vsubq_s16(v2959, v2961); 3396 int16x8_t v3068 = vsubq_s16(v2963, v2965); 3397 int16x8_t v3069 = vqrdmulhq_n_s16(v3068, 25084); 3398 int16x8_t v3070 = vaddq_s16(v3067, v3069); 3399 int16x8_t v3071 = vqrdmulhq_n_s16(v3070, 18021); 3400 int16x8_t v3072 = vaddq_s16(v3066, v3071); 3401 int16x8_t v3073 = vsubq_s16(v2971, v2973); 3402 int16x8_t v3074 = vsubq_s16(v2975, v2977); 3403 int16x8_t v3075 = vqrdmulhq_n_s16(v3074, 25084); 3404 int16x8_t v3076 = vaddq_s16(v3073, v3075); 3405 int16x8_t v3077 = vsubq_s16(v2981, v2983); 3406 int16x8_t v3078 = vsubq_s16(v2985, v2987); 3407 int16x8_t v3079 = vqrdmulhq_n_s16(v3078, 25084); 3408 int16x8_t v3080 = vaddq_s16(v3077, v3079); 3409 int16x8_t v3081 = vqrdmulhq_n_s16(v3080, 18021); 3410 int16x8_t v3082 = vaddq_s16(v3076, v3081); 3411 int16x8_t v3083 = vqrdmulhq_n_s16(v3082, 16769); 3412 int16x8_t v3084 = vaddq_s16(v3072, v3083); 3413 int16x8_t v3085 = vsubq_s16(v2903, v2905); 3414 int16x8_t v3086 = vsubq_s16(v2907, v2909); 3415 int16x8_t v3087 = vqrdmulhq_n_s16(v3086, 26631); 3416 int16x8_t v3088 = vaddq_s16(v3085, v3087); 3417 int16x8_t v3089 = vsubq_s16(v2913, v2915); 3418 int16x8_t v3090 = vsubq_s16(v2917, v2919); 3419 int16x8_t v3091 = vqrdmulhq_n_s16(v3090, 26631); 3420 int16x8_t v3092 = vaddq_s16(v3089, v3091); 3421 int16x8_t v3093 = vqrdmulhq_n_s16(v3092, 18231); 3422 int16x8_t v3094 = vaddq_s16(v3088, v3093); 3423 int16x8_t v3095 = vsubq_s16(v2925, v2927); 3424 int16x8_t v3096 = vsubq_s16(v2929, v2931); 3425 int16x8_t v3097 = vqrdmulhq_n_s16(v3096, 26631); 3426 int16x8_t v3098 = vaddq_s16(v3095, v3097); 3427 int16x8_t v3099 = vsubq_s16(v2935, v2937); 3428 int16x8_t v3100 = vsubq_s16(v2939, v2941); 3429 int16x8_t v3101 = vqrdmulhq_n_s16(v3100, 26631); 3430 int16x8_t v3102 = vaddq_s16(v3099, v3101); 3431 int16x8_t v3103 = vqrdmulhq_n_s16(v3102, 18231); 3432 int16x8_t v3104 = vaddq_s16(v3098, v3103); 3433 int16x8_t v3105 = vqrdmulhq_n_s16(v3104, 16815); 3434 int16x8_t v3106 = vaddq_s16(v3094, v3105); 3435 int16x8_t v3107 = vsubq_s16(v2857, v2859); 3436 int16x8_t v3108 = vsubq_s16(v2861, v2863); 3437 int16x8_t v3109 = vqrdmulhq_n_s16(v3108, 28454); 3438 int16x8_t v3110 = vaddq_s16(v3107, v3109); 3439 int16x8_t v3111 = vsubq_s16(v2867, v2869); 3440 int16x8_t v3112 = vsubq_s16(v2871, v2873); 3441 int16x8_t v3113 = vqrdmulhq_n_s16(v3112, 28454); 3442 int16x8_t v3114 = vaddq_s16(v3111, v3113); 3443 int16x8_t v3115 = vqrdmulhq_n_s16(v3114, 18458); 3444 int16x8_t v3116 = vaddq_s16(v3110, v3115); 3445 int16x8_t v3117 = vsubq_s16(v2879, v2881); 3446 int16x8_t v3118 = vsubq_s16(v2883, v2885); 3447 int16x8_t v3119 = vqrdmulhq_n_s16(v3118, 28454); 3448 int16x8_t v3120 = vaddq_s16(v3117, v3119); 3449 int16x8_t v3121 = vsubq_s16(v2889, v2891); 3450 int16x8_t v3122 = vsubq_s16(v2893, v2895); 3451 int16x8_t v3123 = vqrdmulhq_n_s16(v3122, 28454); 3452 int16x8_t v3124 = vaddq_s16(v3121, v3123); 3453 int16x8_t v3125 = vqrdmulhq_n_s16(v3124, 18458); 3454 int16x8_t v3126 = vaddq_s16(v3120, v3125); 3455 int16x8_t v3127 = vqrdmulhq_n_s16(v3126, 16865); 3456 int16x8_t v3128 = vaddq_s16(v3116, v3127); 3457 int16x8_t v3129 = vsubq_s16(v2811, v2813); 3458 int16x8_t v3130 = vsubq_s16(v2815, v2817); 3459 int16x8_t v3131 = vqrdmulhq_n_s16(v3130, 30624); 3460 int16x8_t v3132 = vaddq_s16(v3129, v3131); 3461 int16x8_t v3133 = vsubq_s16(v2821, v2823); 3462 int16x8_t v3134 = vsubq_s16(v2825, v2827); 3463 int16x8_t v3135 = vqrdmulhq_n_s16(v3134, 30624); 3464 int16x8_t v3136 = vaddq_s16(v3133, v3135); 3465 int16x8_t v3137 = vqrdmulhq_n_s16(v3136, 18702); 3466 int16x8_t v3138 = vaddq_s16(v3132, v3137); 3467 int16x8_t v3139 = vsubq_s16(v2833, v2835); 3468 int16x8_t v3140 = vsubq_s16(v2837, v2839); 3469 int16x8_t v3141 = vqrdmulhq_n_s16(v3140, 30624); 3470 int16x8_t v3142 = vaddq_s16(v3139, v3141); 3471 int16x8_t v3143 = vsubq_s16(v2843, v2845); 3472 int16x8_t v3144 = vsubq_s16(v2847, v2849); 3473 int16x8_t v3145 = vqrdmulhq_n_s16(v3144, 30624); 3474 int16x8_t v3146 = vaddq_s16(v3143, v3145); 3475 int16x8_t v3147 = vqrdmulhq_n_s16(v3146, 18702); 3476 int16x8_t v3148 = vaddq_s16(v3142, v3147); 3477 int16x8_t v3149 = vqrdmulhq_n_s16(v3148, 16916); 3478 int16x8_t v3150 = vaddq_s16(v3138, v3149); 3479 int16x8_t v3151 = vsubq_s16(v2765, v2767); 3480 int16x8_t v3152 = vsubq_s16(v2769, v2771); 3481 int16x8_t v3153_tmp = vqrdmulhq_n_s16(v3152, 472); 3482 int16x8_t v3153 = vaddq_s16(v3153_tmp, v3152); 3483 int16x8_t v3154 = vaddq_s16(v3151, v3153); 3484 int16x8_t v3155 = vsubq_s16(v2775, v2777); 3485 int16x8_t v3156 = vsubq_s16(v2779, v2781); 3486 int16x8_t v3157_tmp = vqrdmulhq_n_s16(v3156, 472); 3487 int16x8_t v3157 = vaddq_s16(v3157_tmp, v3156); 3488 int16x8_t v3158 = vaddq_s16(v3155, v3157); 3489 int16x8_t v3159 = vqrdmulhq_n_s16(v3158, 18964); 3490 int16x8_t v3160 = vaddq_s16(v3154, v3159); 3491 int16x8_t v3161 = vsubq_s16(v2787, v2789); 3492 int16x8_t v3162 = vsubq_s16(v2791, v2793); 3493 int16x8_t v3163_tmp = vqrdmulhq_n_s16(v3162, 472); 3494 int16x8_t v3163 = vaddq_s16(v3163_tmp, v3162); 3495 int16x8_t v3164 = vaddq_s16(v3161, v3163); 3496 int16x8_t v3165 = vsubq_s16(v2797, v2799); 3497 int16x8_t v3166 = vsubq_s16(v2801, v2803); 3498 int16x8_t v3167_tmp = vqrdmulhq_n_s16(v3166, 472); 3499 int16x8_t v3167 = vaddq_s16(v3167_tmp, v3166); 3500 int16x8_t v3168 = vaddq_s16(v3165, v3167); 3501 int16x8_t v3169 = vqrdmulhq_n_s16(v3168, 18964); 3502 int16x8_t v3170 = vaddq_s16(v3164, v3169); 3503 int16x8_t v3171 = vqrdmulhq_n_s16(v3170, 16971); 3504 int16x8_t v3172 = vaddq_s16(v3160, v3171); 3505 int16x8_t v3173 = vsubq_s16(v2719, v2721); 3506 int16x8_t v3174 = vsubq_s16(v2723, v2725); 3507 int16x8_t v3175_tmp = vqrdmulhq_n_s16(v3174, 3672); 3508 int16x8_t v3175 = vaddq_s16(v3175_tmp, v3174); 3509 int16x8_t v3176 = vaddq_s16(v3173, v3175); 3510 int16x8_t v3177 = vsubq_s16(v2729, v2731); 3511 int16x8_t v3178 = vsubq_s16(v2733, v2735); 3512 int16x8_t v3179_tmp = vqrdmulhq_n_s16(v3178, 3672); 3513 int16x8_t v3179 = vaddq_s16(v3179_tmp, v3178); 3514 int16x8_t v3180 = vaddq_s16(v3177, v3179); 3515 int16x8_t v3181 = vqrdmulhq_n_s16(v3180, 19245); 3516 int16x8_t v3182 = vaddq_s16(v3176, v3181); 3517 int16x8_t v3183 = vsubq_s16(v2741, v2743); 3518 int16x8_t v3184 = vsubq_s16(v2745, v2747); 3519 int16x8_t v3185_tmp = vqrdmulhq_n_s16(v3184, 3672); 3520 int16x8_t v3185 = vaddq_s16(v3185_tmp, v3184); 3521 int16x8_t v3186 = vaddq_s16(v3183, v3185); 3522 int16x8_t v3187 = vsubq_s16(v2751, v2753); 3523 int16x8_t v3188 = vsubq_s16(v2755, v2757); 3524 int16x8_t v3189_tmp = vqrdmulhq_n_s16(v3188, 3672); 3525 int16x8_t v3189 = vaddq_s16(v3189_tmp, v3188); 3526 int16x8_t v3190 = vaddq_s16(v3187, v3189); 3527 int16x8_t v3191 = vqrdmulhq_n_s16(v3190, 19245); 3528 int16x8_t v3192 = vaddq_s16(v3186, v3191); 3529 int16x8_t v3193 = vqrdmulhq_n_s16(v3192, 17029); 3530 int16x8_t v3194 = vaddq_s16(v3182, v3193); 3531 int16x8_t v3195 = vsubq_s16(v2673, v2675); 3532 int16x8_t v3196 = vsubq_s16(v2677, v2679); 3533 int16x8_t v3197_tmp = vqrdmulhq_n_s16(v3196, 7662); 3534 int16x8_t v3197 = vaddq_s16(v3197_tmp, v3196); 3535 int16x8_t v3198 = vaddq_s16(v3195, v3197); 3536 int16x8_t v3199 = vsubq_s16(v2683, v2685); 3537 int16x8_t v3200 = vsubq_s16(v2687, v2689); 3538 int16x8_t v3201_tmp = vqrdmulhq_n_s16(v3200, 7662); 3539 int16x8_t v3201 = vaddq_s16(v3201_tmp, v3200); 3540 int16x8_t v3202 = vaddq_s16(v3199, v3201); 3541 int16x8_t v3203 = vqrdmulhq_n_s16(v3202, 19546); 3542 int16x8_t v3204 = vaddq_s16(v3198, v3203); 3543 int16x8_t v3205 = vsubq_s16(v2695, v2697); 3544 int16x8_t v3206 = vsubq_s16(v2699, v2701); 3545 int16x8_t v3207_tmp = vqrdmulhq_n_s16(v3206, 7662); 3546 int16x8_t v3207 = vaddq_s16(v3207_tmp, v3206); 3547 int16x8_t v3208 = vaddq_s16(v3205, v3207); 3548 int16x8_t v3209 = vsubq_s16(v2705, v2707); 3549 int16x8_t v3210 = vsubq_s16(v2709, v2711); 3550 int16x8_t v3211_tmp = vqrdmulhq_n_s16(v3210, 7662); 3551 int16x8_t v3211 = vaddq_s16(v3211_tmp, v3210); 3552 int16x8_t v3212 = vaddq_s16(v3209, v3211); 3553 int16x8_t v3213 = vqrdmulhq_n_s16(v3212, 19546); 3554 int16x8_t v3214 = vaddq_s16(v3208, v3213); 3555 int16x8_t v3215 = vqrdmulhq_n_s16(v3214, 17090); 3556 int16x8_t v3216 = vaddq_s16(v3204, v3215); 3557 int16x8_t v3217 = vsubq_s16(v2582, v2587); 3558 int16x8_t v3218 = vsubq_s16(v2592, v2597); 3559 int16x8_t v3219_tmp = vqrdmulhq_n_s16(v3218, 12756); 3560 int16x8_t v3219 = vaddq_s16(v3219_tmp, v3218); 3561 int16x8_t v3220 = vaddq_s16(v3217, v3219); 3562 int16x8_t v3221 = vsubq_s16(v2604, v2609); 3563 int16x8_t v3222 = vsubq_s16(v2614, v2619); 3564 int16x8_t v3223_tmp = vqrdmulhq_n_s16(v3222, 12756); 3565 int16x8_t v3223 = vaddq_s16(v3223_tmp, v3222); 3566 int16x8_t v3224 = vaddq_s16(v3221, v3223); 3567 int16x8_t v3225 = vqrdmulhq_n_s16(v3224, 19869); 3568 int16x8_t v3226 = vaddq_s16(v3220, v3225); 3569 int16x8_t v3227 = vsubq_s16(v2628, v2633); 3570 int16x8_t v3228 = vsubq_s16(v2638, v2643); 3571 int16x8_t v3229_tmp = vqrdmulhq_n_s16(v3228, 12756); 3572 int16x8_t v3229 = vaddq_s16(v3229_tmp, v3228); 3573 int16x8_t v3230 = vaddq_s16(v3227, v3229); 3574 int16x8_t v3231 = vsubq_s16(v2650, v2655); 3575 int16x8_t v3232 = vsubq_s16(v2660, v2665); 3576 int16x8_t v3233_tmp = vqrdmulhq_n_s16(v3232, 12756); 3577 int16x8_t v3233 = vaddq_s16(v3233_tmp, v3232); 3578 int16x8_t v3234 = vaddq_s16(v3231, v3233); 3579 int16x8_t v3235 = vqrdmulhq_n_s16(v3234, 19869); 3580 int16x8_t v3236 = vaddq_s16(v3230, v3235); 3581 int16x8_t v3237 = vqrdmulhq_n_s16(v3236, 17153); 3582 int16x8_t v3238 = vaddq_s16(v3226, v3237); 3583 int16x8_t v3239 = vsubq_s16(v2488, v2493); 3584 int16x8_t v3240 = vsubq_s16(v2498, v2503); 3585 int16x8_t v3241_tmp = vqrdmulhq_n_s16(v3240, 19463); 3586 int16x8_t v3241 = vaddq_s16(v3241_tmp, v3240); 3587 int16x8_t v3242 = vaddq_s16(v3239, v3241); 3588 int16x8_t v3243 = vsubq_s16(v2510, v2515); 3589 int16x8_t v3244 = vsubq_s16(v2520, v2525); 3590 int16x8_t v3245_tmp = vqrdmulhq_n_s16(v3244, 19463); 3591 int16x8_t v3245 = vaddq_s16(v3245_tmp, v3244); 3592 int16x8_t v3246 = vaddq_s16(v3243, v3245); 3593 int16x8_t v3247 = vqrdmulhq_n_s16(v3246, 20216); 3594 int16x8_t v3248 = vaddq_s16(v3242, v3247); 3595 int16x8_t v3249 = vsubq_s16(v2534, v2539); 3596 int16x8_t v3250 = vsubq_s16(v2544, v2549); 3597 int16x8_t v3251_tmp = vqrdmulhq_n_s16(v3250, 19463); 3598 int16x8_t v3251 = vaddq_s16(v3251_tmp, v3250); 3599 int16x8_t v3252 = vaddq_s16(v3249, v3251); 3600 int16x8_t v3253 = vsubq_s16(v2556, v2561); 3601 int16x8_t v3254 = vsubq_s16(v2566, v2571); 3602 int16x8_t v3255_tmp = vqrdmulhq_n_s16(v3254, 19463); 3603 int16x8_t v3255 = vaddq_s16(v3255_tmp, v3254); 3604 int16x8_t v3256 = vaddq_s16(v3253, v3255); 3605 int16x8_t v3257 = vqrdmulhq_n_s16(v3256, 20216); 3606 int16x8_t v3258 = vaddq_s16(v3252, v3257); 3607 int16x8_t v3259 = vqrdmulhq_n_s16(v3258, 17220); 3608 int16x8_t v3260 = vaddq_s16(v3248, v3259); 3609 int16x8_t v3261 = vsubq_s16(v2393, v2398); 3610 int16x8_t v3262 = vsubq_s16(v2403, v2408); 3611 int16x8_t v3263_tmp = vqrdmulhq_n_s16(v3262, 28661); 3612 int16x8_t v3263 = vaddq_s16(v3263_tmp, v3262); 3613 int16x8_t v3264 = vaddq_s16(v3261, v3263); 3614 int16x8_t v3265 = vsubq_s16(v2415, v2420); 3615 int16x8_t v3266 = vsubq_s16(v2425, v2430); 3616 int16x8_t v3267_tmp = vqrdmulhq_n_s16(v3266, 28661); 3617 int16x8_t v3267 = vaddq_s16(v3267_tmp, v3266); 3618 int16x8_t v3268 = vaddq_s16(v3265, v3267); 3619 int16x8_t v3269 = vqrdmulhq_n_s16(v3268, 20587); 3620 int16x8_t v3270 = vaddq_s16(v3264, v3269); 3621 int16x8_t v3271 = vsubq_s16(v2439, v2444); 3622 int16x8_t v3272 = vsubq_s16(v2449, v2454); 3623 int16x8_t v3273_tmp = vqrdmulhq_n_s16(v3272, 28661); 3624 int16x8_t v3273 = vaddq_s16(v3273_tmp, v3272); 3625 int16x8_t v3274 = vaddq_s16(v3271, v3273); 3626 int16x8_t v3275 = vsubq_s16(v2461, v2467); 3627 int16x8_t v3276 = vsubq_s16(v2472, v2477); 3628 int16x8_t v3277_tmp = vqrdmulhq_n_s16(v3276, 28661); 3629 int16x8_t v3277 = vaddq_s16(v3277_tmp, v3276); 3630 int16x8_t v3278 = vaddq_s16(v3275, v3277); 3631 int16x8_t v3279 = vqrdmulhq_n_s16(v3278, 20587); 3632 int16x8_t v3280 = vaddq_s16(v3274, v3279); 3633 int16x8_t v3281 = vqrdmulhq_n_s16(v3280, 17290); 3634 int16x8_t v3282 = vaddq_s16(v3270, v3281); 3635 int16x8_t v3283 = vsubq_s16(v2299, v2304); 3636 int16x8_t v3284 = vsubq_s16(v2309, v2314); 3637 int16x8_t v3285_tmp = vqrdmulhq_n_s16(v3284, 9242); 3638 int16x8_t v3285 = vmlaq_n_s16(v3285_tmp, v3284, 2); 3639 int16x8_t v3286 = vaddq_s16(v3283, v3285); 3640 int16x8_t v3287 = vsubq_s16(v2321, v2326); 3641 int16x8_t v3288 = vsubq_s16(v2331, v2336); 3642 int16x8_t v3289_tmp = vqrdmulhq_n_s16(v3288, 9242); 3643 int16x8_t v3289 = vmlaq_n_s16(v3289_tmp, v3288, 2); 3644 int16x8_t v3290 = vaddq_s16(v3287, v3289); 3645 int16x8_t v3291 = vqrdmulhq_n_s16(v3290, 20985); 3646 int16x8_t v3292 = vaddq_s16(v3286, v3291); 3647 int16x8_t v3293 = vsubq_s16(v2345, v2350); 3648 int16x8_t v3294 = vsubq_s16(v2355, v2360); 3649 int16x8_t v3295_tmp = vqrdmulhq_n_s16(v3294, 9242); 3650 int16x8_t v3295 = vmlaq_n_s16(v3295_tmp, v3294, 2); 3651 int16x8_t v3296 = vaddq_s16(v3293, v3295); 3652 int16x8_t v3297 = vsubq_s16(v2367, v2372); 3653 int16x8_t v3298 = vsubq_s16(v2377, v2382); 3654 int16x8_t v3299_tmp = vqrdmulhq_n_s16(v3298, 9242); 3655 int16x8_t v3299 = vmlaq_n_s16(v3299_tmp, v3298, 2); 3656 int16x8_t v3300 = vaddq_s16(v3297, v3299); 3657 int16x8_t v3301 = vqrdmulhq_n_s16(v3300, 20985); 3658 int16x8_t v3302 = vaddq_s16(v3296, v3301); 3659 int16x8_t v3303 = vqrdmulhq_n_s16(v3302, 17363); 3660 int16x8_t v3304 = vaddq_s16(v3292, v3303); 3661 int16x8_t v3305 = vsubq_s16(v2115, v2126); 3662 int16x8_t v3306 = vsubq_s16(v2137, v2148); 3663 int16x8_t v3307_tmp = vqrdmulhq_n_s16(v3306, 30298); 3664 int16x8_t v3307 = vmlaq_n_s16(v3307_tmp, v3306, 2); 3665 int16x8_t v3308 = vaddq_s16(v3305, v3307); 3666 int16x8_t v3309 = vsubq_s16(v2161, v2172); 3667 int16x8_t v3310 = vsubq_s16(v2183, v2194); 3668 int16x8_t v3311_tmp = vqrdmulhq_n_s16(v3310, 30298); 3669 int16x8_t v3311 = vmlaq_n_s16(v3311_tmp, v3310, 2); 3670 int16x8_t v3312 = vaddq_s16(v3309, v3311); 3671 int16x8_t v3313 = vqrdmulhq_n_s16(v3312, 21412); 3672 int16x8_t v3314 = vaddq_s16(v3308, v3313); 3673 int16x8_t v3315 = vsubq_s16(v2209, v2220); 3674 int16x8_t v3316 = vsubq_s16(v2231, v2242); 3675 int16x8_t v3317_tmp = vqrdmulhq_n_s16(v3316, 30298); 3676 int16x8_t v3317 = vmlaq_n_s16(v3317_tmp, v3316, 2); 3677 int16x8_t v3318 = vaddq_s16(v3315, v3317); 3678 int16x8_t v3319 = vsubq_s16(v2255, v2266); 3679 int16x8_t v3320 = vsubq_s16(v2277, v2288); 3680 int16x8_t v3321_tmp = vqrdmulhq_n_s16(v3320, 30298); 3681 int16x8_t v3321 = vmlaq_n_s16(v3321_tmp, v3320, 2); 3682 int16x8_t v3322 = vaddq_s16(v3319, v3321); 3683 int16x8_t v3323 = vqrdmulhq_n_s16(v3322, 21412); 3684 int16x8_t v3324 = vaddq_s16(v3318, v3323); 3685 int16x8_t v3325 = vqrdmulhq_n_s16(v3324, 17440); 3686 int16x8_t v3326 = vaddq_s16(v3314, v3325); 3687 int16x8_t v3327 = vsubq_s16(v1925, v1936); 3688 int16x8_t v3328 = vsubq_s16(v1947, v1958); 3689 int16x8_t v3329_tmp = vqrdmulhq_n_s16(v3328, 2773); 3690 int16x8_t v3329 = vmlaq_n_s16(v3329_tmp, v3328, 4); 3691 int16x8_t v3330 = vaddq_s16(v3327, v3329); 3692 int16x8_t v3331 = vsubq_s16(v1971, v1982); 3693 int16x8_t v3332 = vsubq_s16(v1993, v2004); 3694 int16x8_t v3333_tmp = vqrdmulhq_n_s16(v3332, 2773); 3695 int16x8_t v3333 = vmlaq_n_s16(v3333_tmp, v3332, 4); 3696 int16x8_t v3334 = vaddq_s16(v3331, v3333); 3697 int16x8_t v3335 = vqrdmulhq_n_s16(v3334, 21871); 3698 int16x8_t v3336 = vaddq_s16(v3330, v3335); 3699 int16x8_t v3337 = vsubq_s16(v2019, v2030); 3700 int16x8_t v3338 = vsubq_s16(v2041, v2052); 3701 int16x8_t v3339_tmp = vqrdmulhq_n_s16(v3338, 2773); 3702 int16x8_t v3339 = vmlaq_n_s16(v3339_tmp, v3338, 4); 3703 int16x8_t v3340 = vaddq_s16(v3337, v3339); 3704 int16x8_t v3341 = vsubq_s16(v2065, v2076); 3705 int16x8_t v3342 = vsubq_s16(v2087, v2098); 3706 int16x8_t v3343_tmp = vqrdmulhq_n_s16(v3342, 2773); 3707 int16x8_t v3343 = vmlaq_n_s16(v3343_tmp, v3342, 4); 3708 int16x8_t v3344 = vaddq_s16(v3341, v3343); 3709 int16x8_t v3345 = vqrdmulhq_n_s16(v3344, 21871); 3710 int16x8_t v3346 = vaddq_s16(v3340, v3345); 3711 int16x8_t v3347 = vqrdmulhq_n_s16(v3346, 17520); 3712 int16x8_t v3348 = vaddq_s16(v3336, v3347); 3713 int16x8_t v3349 = vsubq_s16(v1555, v1578); 3714 int16x8_t v3350 = vsubq_s16(v1601, v1624); 3715 int16x8_t v3351_tmp = vqrdmulhq_n_s16(v3350, 26108); 3716 int16x8_t v3351 = vmlaq_n_s16(v3351_tmp, v3350, 6); 3717 int16x8_t v3352 = vaddq_s16(v3349, v3351); 3718 int16x8_t v3353 = vsubq_s16(v1649, v1672); 3719 int16x8_t v3354 = vsubq_s16(v1695, v1718); 3720 int16x8_t v3355_tmp = vqrdmulhq_n_s16(v3354, 26108); 3721 int16x8_t v3355 = vmlaq_n_s16(v3355_tmp, v3354, 6); 3722 int16x8_t v3356 = vaddq_s16(v3353, v3355); 3723 int16x8_t v3357 = vqrdmulhq_n_s16(v3356, 22363); 3724 int16x8_t v3358 = vaddq_s16(v3352, v3357); 3725 int16x8_t v3359 = vsubq_s16(v1745, v1768); 3726 int16x8_t v3360 = vsubq_s16(v1791, v1814); 3727 int16x8_t v3361_tmp = vqrdmulhq_n_s16(v3360, 26108); 3728 int16x8_t v3361 = vmlaq_n_s16(v3361_tmp, v3360, 6); 3729 int16x8_t v3362 = vaddq_s16(v3359, v3361); 3730 int16x8_t v3363 = vsubq_s16(v1839, v1862); 3731 int16x8_t v3364 = vsubq_s16(v1885, v1908); 3732 int16x8_t v3365_tmp = vqrdmulhq_n_s16(v3364, 26108); 3733 int16x8_t v3365 = vmlaq_n_s16(v3365_tmp, v3364, 6); 3734 int16x8_t v3366 = vaddq_s16(v3363, v3365); 3735 int16x8_t v3367 = vqrdmulhq_n_s16(v3366, 22363); 3736 int16x8_t v3368 = vaddq_s16(v3362, v3367); 3737 int16x8_t v3369 = vqrdmulhq_n_s16(v3368, 17603); 3738 int16x8_t v3370 = vaddq_s16(v3358, v3369); 3739 int16x8_t v3371 = vsubq_s16(v61, v140); 3740 int16x8_t v3372 = vsubq_s16(v234, v314); 3741 int16x8_t v3373_tmp = vqrdmulhq_n_s16(v3372, 12251); 3742 int16x8_t v3373 = vmlaq_n_s16(v3373_tmp, v3372, 20); 3743 int16x8_t v3374 = vaddq_s16(v3371, v3373); 3744 int16x8_t v3375 = vsubq_s16(v410, v521); 3745 int16x8_t v3376 = vsubq_s16(v615, v696); 3746 int16x8_t v3377_tmp = vqrdmulhq_n_s16(v3376, 12251); 3747 int16x8_t v3377 = vmlaq_n_s16(v3377_tmp, v3376, 20); 3748 int16x8_t v3378 = vaddq_s16(v3375, v3377); 3749 int16x8_t v3379 = vqrdmulhq_n_s16(v3378, 22891); 3750 int16x8_t v3380 = vaddq_s16(v3374, v3379); 3751 int16x8_t v3381 = vsubq_s16(v794, v905); 3752 int16x8_t v3382 = vsubq_s16(v1061, v1143); 3753 int16x8_t v3383_tmp = vqrdmulhq_n_s16(v3382, 12251); 3754 int16x8_t v3383 = vmlaq_n_s16(v3383_tmp, v3382, 20); 3755 int16x8_t v3384 = vaddq_s16(v3381, v3383); 3756 int16x8_t v3385 = vsubq_s16(v1239, v1350); 3757 int16x8_t v3386 = vsubq_s16(v1444, v1526); 3758 int16x8_t v3387_tmp = vqrdmulhq_n_s16(v3386, 12251); 3759 int16x8_t v3387 = vmlaq_n_s16(v3387_tmp, v3386, 20); 3760 int16x8_t v3388 = vaddq_s16(v3385, v3387); 3761 int16x8_t v3389 = vqrdmulhq_n_s16(v3388, 22891); 3762 int16x8_t v3390 = vaddq_s16(v3384, v3389); 3763 int16x8_t v3391 = vqrdmulhq_n_s16(v3390, 17689); 3764 int16x8_t v3392 = vaddq_s16(v3380, v3391); 3765 int16x8_t v3393 = vsubq_s16(v3371, v3373); 3766 int16x8_t v3394 = vsubq_s16(v3375, v3377); 3767 int16x8_t v3395 = vqrdmulhq_n_s16(v3394, 23460); 3768 int16x8_t v3396 = vaddq_s16(v3393, v3395); 3769 int16x8_t v3397 = vsubq_s16(v3381, v3383); 3770 int16x8_t v3398 = vsubq_s16(v3385, v3387); 3771 int16x8_t v3399 = vqrdmulhq_n_s16(v3398, 23460); 3772 int16x8_t v3400 = vaddq_s16(v3397, v3399); 3773 int16x8_t v3401 = vqrdmulhq_n_s16(v3400, 17779); 3774 int16x8_t v3402 = vaddq_s16(v3396, v3401); 3775 int16x8_t v3403 = vsubq_s16(v3349, v3351); 3776 int16x8_t v3404 = vsubq_s16(v3353, v3355); 3777 int16x8_t v3405 = vqrdmulhq_n_s16(v3404, 24073); 3778 int16x8_t v3406 = vaddq_s16(v3403, v3405); 3779 int16x8_t v3407 = vsubq_s16(v3359, v3361); 3780 int16x8_t v3408 = vsubq_s16(v3363, v3365); 3781 int16x8_t v3409 = vqrdmulhq_n_s16(v3408, 24073); 3782 int16x8_t v3410 = vaddq_s16(v3407, v3409); 3783 int16x8_t v3411 = vqrdmulhq_n_s16(v3410, 17873); 3784 int16x8_t v3412 = vaddq_s16(v3406, v3411); 3785 int16x8_t v3413 = vsubq_s16(v3327, v3329); 3786 int16x8_t v3414 = vsubq_s16(v3331, v3333); 3787 int16x8_t v3415 = vqrdmulhq_n_s16(v3414, 24734); 3788 int16x8_t v3416 = vaddq_s16(v3413, v3415); 3789 int16x8_t v3417 = vsubq_s16(v3337, v3339); 3790 int16x8_t v3418 = vsubq_s16(v3341, v3343); 3791 int16x8_t v3419 = vqrdmulhq_n_s16(v3418, 24734); 3792 int16x8_t v3420 = vaddq_s16(v3417, v3419); 3793 int16x8_t v3421 = vqrdmulhq_n_s16(v3420, 17971); 3794 int16x8_t v3422 = vaddq_s16(v3416, v3421); 3795 int16x8_t v3423 = vsubq_s16(v3305, v3307); 3796 int16x8_t v3424 = vsubq_s16(v3309, v3311); 3797 int16x8_t v3425 = vqrdmulhq_n_s16(v3424, 25448); 3798 int16x8_t v3426 = vaddq_s16(v3423, v3425); 3799 int16x8_t v3427 = vsubq_s16(v3315, v3317); 3800 int16x8_t v3428 = vsubq_s16(v3319, v3321); 3801 int16x8_t v3429 = vqrdmulhq_n_s16(v3428, 25448); 3802 int16x8_t v3430 = vaddq_s16(v3427, v3429); 3803 int16x8_t v3431 = vqrdmulhq_n_s16(v3430, 18072); 3804 int16x8_t v3432 = vaddq_s16(v3426, v3431); 3805 int16x8_t v3433 = vsubq_s16(v3283, v3285); 3806 int16x8_t v3434 = vsubq_s16(v3287, v3289); 3807 int16x8_t v3435 = vqrdmulhq_n_s16(v3434, 26220); 3808 int16x8_t v3436 = vaddq_s16(v3433, v3435); 3809 int16x8_t v3437 = vsubq_s16(v3293, v3295); 3810 int16x8_t v3438 = vsubq_s16(v3297, v3299); 3811 int16x8_t v3439 = vqrdmulhq_n_s16(v3438, 26220); 3812 int16x8_t v3440 = vaddq_s16(v3437, v3439); 3813 int16x8_t v3441 = vqrdmulhq_n_s16(v3440, 18177); 3814 int16x8_t v3442 = vaddq_s16(v3436, v3441); 3815 int16x8_t v3443 = vsubq_s16(v3261, v3263); 3816 int16x8_t v3444 = vsubq_s16(v3265, v3267); 3817 int16x8_t v3445 = vqrdmulhq_n_s16(v3444, 27058); 3818 int16x8_t v3446 = vaddq_s16(v3443, v3445); 3819 int16x8_t v3447 = vsubq_s16(v3271, v3273); 3820 int16x8_t v3448 = vsubq_s16(v3275, v3277); 3821 int16x8_t v3449 = vqrdmulhq_n_s16(v3448, 27058); 3822 int16x8_t v3450 = vaddq_s16(v3447, v3449); 3823 int16x8_t v3451 = vqrdmulhq_n_s16(v3450, 18286); 3824 int16x8_t v3452 = vaddq_s16(v3446, v3451); 3825 int16x8_t v3453 = vsubq_s16(v3239, v3241); 3826 int16x8_t v3454 = vsubq_s16(v3243, v3245); 3827 int16x8_t v3455 = vqrdmulhq_n_s16(v3454, 27969); 3828 int16x8_t v3456 = vaddq_s16(v3453, v3455); 3829 int16x8_t v3457 = vsubq_s16(v3249, v3251); 3830 int16x8_t v3458 = vsubq_s16(v3253, v3255); 3831 int16x8_t v3459 = vqrdmulhq_n_s16(v3458, 27969); 3832 int16x8_t v3460 = vaddq_s16(v3457, v3459); 3833 int16x8_t v3461 = vqrdmulhq_n_s16(v3460, 18400); 3834 int16x8_t v3462 = vaddq_s16(v3456, v3461); 3835 int16x8_t v3463 = vsubq_s16(v3217, v3219); 3836 int16x8_t v3464 = vsubq_s16(v3221, v3223); 3837 int16x8_t v3465 = vqrdmulhq_n_s16(v3464, 28961); 3838 int16x8_t v3466 = vaddq_s16(v3463, v3465); 3839 int16x8_t v3467 = vsubq_s16(v3227, v3229); 3840 int16x8_t v3468 = vsubq_s16(v3231, v3233); 3841 int16x8_t v3469 = vqrdmulhq_n_s16(v3468, 28961); 3842 int16x8_t v3470 = vaddq_s16(v3467, v3469); 3843 int16x8_t v3471 = vqrdmulhq_n_s16(v3470, 18517); 3844 int16x8_t v3472 = vaddq_s16(v3466, v3471); 3845 int16x8_t v3473 = vsubq_s16(v3195, v3197); 3846 int16x8_t v3474 = vsubq_s16(v3199, v3201); 3847 int16x8_t v3475 = vqrdmulhq_n_s16(v3474, 30044); 3848 int16x8_t v3476 = vaddq_s16(v3473, v3475); 3849 int16x8_t v3477 = vsubq_s16(v3205, v3207); 3850 int16x8_t v3478 = vsubq_s16(v3209, v3211); 3851 int16x8_t v3479 = vqrdmulhq_n_s16(v3478, 30044); 3852 int16x8_t v3480 = vaddq_s16(v3477, v3479); 3853 int16x8_t v3481 = vqrdmulhq_n_s16(v3480, 18639); 3854 int16x8_t v3482 = vaddq_s16(v3476, v3481); 3855 int16x8_t v3483 = vsubq_s16(v3173, v3175); 3856 int16x8_t v3484 = vsubq_s16(v3177, v3179); 3857 int16x8_t v3485 = vqrdmulhq_n_s16(v3484, 31232); 3858 int16x8_t v3486 = vaddq_s16(v3483, v3485); 3859 int16x8_t v3487 = vsubq_s16(v3183, v3185); 3860 int16x8_t v3488 = vsubq_s16(v3187, v3189); 3861 int16x8_t v3489 = vqrdmulhq_n_s16(v3488, 31232); 3862 int16x8_t v3490 = vaddq_s16(v3487, v3489); 3863 int16x8_t v3491 = vqrdmulhq_n_s16(v3490, 18765); 3864 int16x8_t v3492 = vaddq_s16(v3486, v3491); 3865 int16x8_t v3493 = vsubq_s16(v3151, v3153); 3866 int16x8_t v3494 = vsubq_s16(v3155, v3157); 3867 int16x8_t v3495 = vqrdmulhq_n_s16(v3494, 32538); 3868 int16x8_t v3496 = vaddq_s16(v3493, v3495); 3869 int16x8_t v3497 = vsubq_s16(v3161, v3163); 3870 int16x8_t v3498 = vsubq_s16(v3165, v3167); 3871 int16x8_t v3499 = vqrdmulhq_n_s16(v3498, 32538); 3872 int16x8_t v3500 = vaddq_s16(v3497, v3499); 3873 int16x8_t v3501 = vqrdmulhq_n_s16(v3500, 18896); 3874 int16x8_t v3502 = vaddq_s16(v3496, v3501); 3875 int16x8_t v3503 = vsubq_s16(v3129, v3131); 3876 int16x8_t v3504 = vsubq_s16(v3133, v3135); 3877 int16x8_t v3505_tmp = vqrdmulhq_n_s16(v3504, 1211); 3878 int16x8_t v3505 = vaddq_s16(v3505_tmp, v3504); 3879 int16x8_t v3506 = vaddq_s16(v3503, v3505); 3880 int16x8_t v3507 = vsubq_s16(v3139, v3141); 3881 int16x8_t v3508 = vsubq_s16(v3143, v3145); 3882 int16x8_t v3509_tmp = vqrdmulhq_n_s16(v3508, 1211); 3883 int16x8_t v3509 = vaddq_s16(v3509_tmp, v3508); 3884 int16x8_t v3510 = vaddq_s16(v3507, v3509); 3885 int16x8_t v3511 = vqrdmulhq_n_s16(v3510, 19032); 3886 int16x8_t v3512 = vaddq_s16(v3506, v3511); 3887 int16x8_t v3513 = vsubq_s16(v3107, v3109); 3888 int16x8_t v3514 = vsubq_s16(v3111, v3113); 3889 int16x8_t v3515_tmp = vqrdmulhq_n_s16(v3514, 2808); 3890 int16x8_t v3515 = vaddq_s16(v3515_tmp, v3514); 3891 int16x8_t v3516 = vaddq_s16(v3513, v3515); 3892 int16x8_t v3517 = vsubq_s16(v3117, v3119); 3893 int16x8_t v3518 = vsubq_s16(v3121, v3123); 3894 int16x8_t v3519_tmp = vqrdmulhq_n_s16(v3518, 2808); 3895 int16x8_t v3519 = vaddq_s16(v3519_tmp, v3518); 3896 int16x8_t v3520 = vaddq_s16(v3517, v3519); 3897 int16x8_t v3521 = vqrdmulhq_n_s16(v3520, 19172); 3898 int16x8_t v3522 = vaddq_s16(v3516, v3521); 3899 int16x8_t v3523 = vsubq_s16(v3085, v3087); 3900 int16x8_t v3524 = vsubq_s16(v3089, v3091); 3901 int16x8_t v3525_tmp = vqrdmulhq_n_s16(v3524, 4586); 3902 int16x8_t v3525 = vaddq_s16(v3525_tmp, v3524); 3903 int16x8_t v3526 = vaddq_s16(v3523, v3525); 3904 int16x8_t v3527 = vsubq_s16(v3095, v3097); 3905 int16x8_t v3528 = vsubq_s16(v3099, v3101); 3906 int16x8_t v3529_tmp = vqrdmulhq_n_s16(v3528, 4586); 3907 int16x8_t v3529 = vaddq_s16(v3529_tmp, v3528); 3908 int16x8_t v3530 = vaddq_s16(v3527, v3529); 3909 int16x8_t v3531 = vqrdmulhq_n_s16(v3530, 19318); 3910 int16x8_t v3532 = vaddq_s16(v3526, v3531); 3911 int16x8_t v3533 = vsubq_s16(v3063, v3065); 3912 int16x8_t v3534 = vsubq_s16(v3067, v3069); 3913 int16x8_t v3535_tmp = vqrdmulhq_n_s16(v3534, 6576); 3914 int16x8_t v3535 = vaddq_s16(v3535_tmp, v3534); 3915 int16x8_t v3536 = vaddq_s16(v3533, v3535); 3916 int16x8_t v3537 = vsubq_s16(v3073, v3075); 3917 int16x8_t v3538 = vsubq_s16(v3077, v3079); 3918 int16x8_t v3539_tmp = vqrdmulhq_n_s16(v3538, 6576); 3919 int16x8_t v3539 = vaddq_s16(v3539_tmp, v3538); 3920 int16x8_t v3540 = vaddq_s16(v3537, v3539); 3921 int16x8_t v3541 = vqrdmulhq_n_s16(v3540, 19469); 3922 int16x8_t v3542 = vaddq_s16(v3536, v3541); 3923 int16x8_t v3543 = vsubq_s16(v3041, v3043); 3924 int16x8_t v3544 = vsubq_s16(v3045, v3047); 3925 int16x8_t v3545_tmp = vqrdmulhq_n_s16(v3544, 8817); 3926 int16x8_t v3545 = vaddq_s16(v3545_tmp, v3544); 3927 int16x8_t v3546 = vaddq_s16(v3543, v3545); 3928 int16x8_t v3547 = vsubq_s16(v3051, v3053); 3929 int16x8_t v3548 = vsubq_s16(v3055, v3057); 3930 int16x8_t v3549_tmp = vqrdmulhq_n_s16(v3548, 8817); 3931 int16x8_t v3549 = vaddq_s16(v3549_tmp, v3548); 3932 int16x8_t v3550 = vaddq_s16(v3547, v3549); 3933 int16x8_t v3551 = vqrdmulhq_n_s16(v3550, 19625); 3934 int16x8_t v3552 = vaddq_s16(v3546, v3551); 3935 int16x8_t v3553 = vsubq_s16(v2998, v3003); 3936 int16x8_t v3554 = vsubq_s16(v3008, v3013); 3937 int16x8_t v3555_tmp = vqrdmulhq_n_s16(v3554, 11356); 3938 int16x8_t v3555 = vaddq_s16(v3555_tmp, v3554); 3939 int16x8_t v3556 = vaddq_s16(v3553, v3555); 3940 int16x8_t v3557 = vsubq_s16(v3020, v3025); 3941 int16x8_t v3558 = vsubq_s16(v3030, v3035); 3942 int16x8_t v3559_tmp = vqrdmulhq_n_s16(v3558, 11356); 3943 int16x8_t v3559 = vaddq_s16(v3559_tmp, v3558); 3944 int16x8_t v3560 = vaddq_s16(v3557, v3559); 3945 int16x8_t v3561 = vqrdmulhq_n_s16(v3560, 19786); 3946 int16x8_t v3562 = vaddq_s16(v3556, v3561); 3947 int16x8_t v3563 = vsubq_s16(v2952, v2957); 3948 int16x8_t v3564 = vsubq_s16(v2962, v2967); 3949 int16x8_t v3565_tmp = vqrdmulhq_n_s16(v3564, 14256); 3950 int16x8_t v3565 = vaddq_s16(v3565_tmp, v3564); 3951 int16x8_t v3566 = vaddq_s16(v3563, v3565); 3952 int16x8_t v3567 = vsubq_s16(v2974, v2979); 3953 int16x8_t v3568 = vsubq_s16(v2984, v2989); 3954 int16x8_t v3569_tmp = vqrdmulhq_n_s16(v3568, 14256); 3955 int16x8_t v3569 = vaddq_s16(v3569_tmp, v3568); 3956 int16x8_t v3570 = vaddq_s16(v3567, v3569); 3957 int16x8_t v3571 = vqrdmulhq_n_s16(v3570, 19954); 3958 int16x8_t v3572 = vaddq_s16(v3566, v3571); 3959 int16x8_t v3573 = vsubq_s16(v2906, v2911); 3960 int16x8_t v3574 = vsubq_s16(v2916, v2921); 3961 int16x8_t v3575_tmp = vqrdmulhq_n_s16(v3574, 17596); 3962 int16x8_t v3575 = vaddq_s16(v3575_tmp, v3574); 3963 int16x8_t v3576 = vaddq_s16(v3573, v3575); 3964 int16x8_t v3577 = vsubq_s16(v2928, v2933); 3965 int16x8_t v3578 = vsubq_s16(v2938, v2943); 3966 int16x8_t v3579_tmp = vqrdmulhq_n_s16(v3578, 17596); 3967 int16x8_t v3579 = vaddq_s16(v3579_tmp, v3578); 3968 int16x8_t v3580 = vaddq_s16(v3577, v3579); 3969 int16x8_t v3581 = vqrdmulhq_n_s16(v3580, 20127); 3970 int16x8_t v3582 = vaddq_s16(v3576, v3581); 3971 int16x8_t v3583 = vsubq_s16(v2860, v2865); 3972 int16x8_t v3584 = vsubq_s16(v2870, v2875); 3973 int16x8_t v3585_tmp = vqrdmulhq_n_s16(v3584, 21483); 3974 int16x8_t v3585 = vaddq_s16(v3585_tmp, v3584); 3975 int16x8_t v3586 = vaddq_s16(v3583, v3585); 3976 int16x8_t v3587 = vsubq_s16(v2882, v2887); 3977 int16x8_t v3588 = vsubq_s16(v2892, v2897); 3978 int16x8_t v3589_tmp = vqrdmulhq_n_s16(v3588, 21483); 3979 int16x8_t v3589 = vaddq_s16(v3589_tmp, v3588); 3980 int16x8_t v3590 = vaddq_s16(v3587, v3589); 3981 int16x8_t v3591 = vqrdmulhq_n_s16(v3590, 20306); 3982 int16x8_t v3592 = vaddq_s16(v3586, v3591); 3983 int16x8_t v3593 = vsubq_s16(v2814, v2819); 3984 int16x8_t v3594 = vsubq_s16(v2824, v2829); 3985 int16x8_t v3595_tmp = vqrdmulhq_n_s16(v3594, 26057); 3986 int16x8_t v3595 = vaddq_s16(v3595_tmp, v3594); 3987 int16x8_t v3596 = vaddq_s16(v3593, v3595); 3988 int16x8_t v3597 = vsubq_s16(v2836, v2841); 3989 int16x8_t v3598 = vsubq_s16(v2846, v2851); 3990 int16x8_t v3599_tmp = vqrdmulhq_n_s16(v3598, 26057); 3991 int16x8_t v3599 = vaddq_s16(v3599_tmp, v3598); 3992 int16x8_t v3600 = vaddq_s16(v3597, v3599); 3993 int16x8_t v3601 = vqrdmulhq_n_s16(v3600, 20492); 3994 int16x8_t v3602 = vaddq_s16(v3596, v3601); 3995 int16x8_t v3603 = vsubq_s16(v2768, v2773); 3996 int16x8_t v3604 = vsubq_s16(v2778, v2783); 3997 int16x8_t v3605_tmp = vqrdmulhq_n_s16(v3604, 31517); 3998 int16x8_t v3605 = vaddq_s16(v3605_tmp, v3604); 3999 int16x8_t v3606 = vaddq_s16(v3603, v3605); 4000 int16x8_t v3607 = vsubq_s16(v2790, v2795); 4001 int16x8_t v3608 = vsubq_s16(v2800, v2805); 4002 int16x8_t v3609_tmp = vqrdmulhq_n_s16(v3608, 31517); 4003 int16x8_t v3609 = vaddq_s16(v3609_tmp, v3608); 4004 int16x8_t v3610 = vaddq_s16(v3607, v3609); 4005 int16x8_t v3611 = vqrdmulhq_n_s16(v3610, 20684); 4006 int16x8_t v3612 = vaddq_s16(v3606, v3611); 4007 int16x8_t v3613 = vsubq_s16(v2722, v2727); 4008 int16x8_t v3614 = vsubq_s16(v2732, v2737); 4009 int16x8_t v3615_tmp = vqrdmulhq_n_s16(v3614, 5373); 4010 int16x8_t v3615 = vmlaq_n_s16(v3615_tmp, v3614, 2); 4011 int16x8_t v3616 = vaddq_s16(v3613, v3615); 4012 int16x8_t v3617 = vsubq_s16(v2744, v2749); 4013 int16x8_t v3618 = vsubq_s16(v2754, v2759); 4014 int16x8_t v3619_tmp = vqrdmulhq_n_s16(v3618, 5373); 4015 int16x8_t v3619 = vmlaq_n_s16(v3619_tmp, v3618, 2); 4016 int16x8_t v3620 = vaddq_s16(v3617, v3619); 4017 int16x8_t v3621 = vqrdmulhq_n_s16(v3620, 20883); 4018 int16x8_t v3622 = vaddq_s16(v3616, v3621); 4019 int16x8_t v3623 = vsubq_s16(v2676, v2681); 4020 int16x8_t v3624 = vsubq_s16(v2686, v2691); 4021 int16x8_t v3625_tmp = vqrdmulhq_n_s16(v3624, 13571); 4022 int16x8_t v3625 = vmlaq_n_s16(v3625_tmp, v3624, 2); 4023 int16x8_t v3626 = vaddq_s16(v3623, v3625); 4024 int16x8_t v3627 = vsubq_s16(v2698, v2703); 4025 int16x8_t v3628 = vsubq_s16(v2708, v2713); 4026 int16x8_t v3629_tmp = vqrdmulhq_n_s16(v3628, 13571); 4027 int16x8_t v3629 = vmlaq_n_s16(v3629_tmp, v3628, 2); 4028 int16x8_t v3630 = vaddq_s16(v3627, v3629); 4029 int16x8_t v3631 = vqrdmulhq_n_s16(v3630, 21089); 4030 int16x8_t v3632 = vaddq_s16(v3626, v3631); 4031 int16x8_t v3633 = vsubq_s16(v2588, v2599); 4032 int16x8_t v3634 = vsubq_s16(v2610, v2621); 4033 int16x8_t v3635_tmp = vqrdmulhq_n_s16(v3634, 23975); 4034 int16x8_t v3635 = vmlaq_n_s16(v3635_tmp, v3634, 2); 4035 int16x8_t v3636 = vaddq_s16(v3633, v3635); 4036 int16x8_t v3637 = vsubq_s16(v2634, v2645); 4037 int16x8_t v3638 = vsubq_s16(v2656, v2667); 4038 int16x8_t v3639_tmp = vqrdmulhq_n_s16(v3638, 23975); 4039 int16x8_t v3639 = vmlaq_n_s16(v3639_tmp, v3638, 2); 4040 int16x8_t v3640 = vaddq_s16(v3637, v3639); 4041 int16x8_t v3641 = vqrdmulhq_n_s16(v3640, 21303); 4042 int16x8_t v3642 = vaddq_s16(v3636, v3641); 4043 int16x8_t v3643 = vsubq_s16(v2494, v2505); 4044 int16x8_t v3644 = vsubq_s16(v2516, v2527); 4045 int16x8_t v3645_tmp = vqrdmulhq_n_s16(v3644, 4832); 4046 int16x8_t v3645 = vmlaq_n_s16(v3645_tmp, v3644, 3); 4047 int16x8_t v3646 = vaddq_s16(v3643, v3645); 4048 int16x8_t v3647 = vsubq_s16(v2540, v2551); 4049 int16x8_t v3648 = vsubq_s16(v2562, v2573); 4050 int16x8_t v3649_tmp = vqrdmulhq_n_s16(v3648, 4832); 4051 int16x8_t v3649 = vmlaq_n_s16(v3649_tmp, v3648, 3); 4052 int16x8_t v3650 = vaddq_s16(v3647, v3649); 4053 int16x8_t v3651 = vqrdmulhq_n_s16(v3650, 21524); 4054 int16x8_t v3652 = vaddq_s16(v3646, v3651); 4055 int16x8_t v3653 = vsubq_s16(v2399, v2410); 4056 int16x8_t v3654 = vsubq_s16(v2421, v2432); 4057 int16x8_t v3655_tmp = vqrdmulhq_n_s16(v3654, 23437); 4058 int16x8_t v3655 = vmlaq_n_s16(v3655_tmp, v3654, 3); 4059 int16x8_t v3656 = vaddq_s16(v3653, v3655); 4060 int16x8_t v3657 = vsubq_s16(v2445, v2456); 4061 int16x8_t v3658 = vsubq_s16(v2468, v2479); 4062 int16x8_t v3659_tmp = vqrdmulhq_n_s16(v3658, 23437); 4063 int16x8_t v3659 = vmlaq_n_s16(v3659_tmp, v3658, 3); 4064 int16x8_t v3660 = vaddq_s16(v3657, v3659); 4065 int16x8_t v3661 = vqrdmulhq_n_s16(v3660, 21753); 4066 int16x8_t v3662 = vaddq_s16(v3656, v3661); 4067 int16x8_t v3663 = vsubq_s16(v2305, v2316); 4068 int16x8_t v3664 = vsubq_s16(v2327, v2338); 4069 int16x8_t v3665_tmp = vqrdmulhq_n_s16(v3664, 17573); 4070 int16x8_t v3665 = vmlaq_n_s16(v3665_tmp, v3664, 4); 4071 int16x8_t v3666 = vaddq_s16(v3663, v3665); 4072 int16x8_t v3667 = vsubq_s16(v2351, v2362); 4073 int16x8_t v3668 = vsubq_s16(v2373, v2384); 4074 int16x8_t v3669_tmp = vqrdmulhq_n_s16(v3668, 17573); 4075 int16x8_t v3669 = vmlaq_n_s16(v3669_tmp, v3668, 4); 4076 int16x8_t v3670 = vaddq_s16(v3667, v3669); 4077 int16x8_t v3671 = vqrdmulhq_n_s16(v3670, 21990); 4078 int16x8_t v3672 = vaddq_s16(v3666, v3671); 4079 int16x8_t v3673 = vsubq_s16(v2127, v2150); 4080 int16x8_t v3674 = vsubq_s16(v2173, v2196); 4081 int16x8_t v3675_tmp = vqrdmulhq_n_s16(v3674, 27122); 4082 int16x8_t v3675 = vmlaq_n_s16(v3675_tmp, v3674, 5); 4083 int16x8_t v3676 = vaddq_s16(v3673, v3675); 4084 int16x8_t v3677 = vsubq_s16(v2221, v2244); 4085 int16x8_t v3678 = vsubq_s16(v2267, v2290); 4086 int16x8_t v3679_tmp = vqrdmulhq_n_s16(v3678, 27122); 4087 int16x8_t v3679 = vmlaq_n_s16(v3679_tmp, v3678, 5); 4088 int16x8_t v3680 = vaddq_s16(v3677, v3679); 4089 int16x8_t v3681 = vqrdmulhq_n_s16(v3680, 22236); 4090 int16x8_t v3682 = vaddq_s16(v3676, v3681); 4091 int16x8_t v3683 = vsubq_s16(v1937, v1960); 4092 int16x8_t v3684 = vsubq_s16(v1983, v2006); 4093 int16x8_t v3685_tmp = vqrdmulhq_n_s16(v3684, 5041); 4094 int16x8_t v3685 = vmlaq_n_s16(v3685_tmp, v3684, 8); 4095 int16x8_t v3686 = vaddq_s16(v3683, v3685); 4096 int16x8_t v3687 = vsubq_s16(v2031, v2054); 4097 int16x8_t v3688 = vsubq_s16(v2077, v2100); 4098 int16x8_t v3689_tmp = vqrdmulhq_n_s16(v3688, 5041); 4099 int16x8_t v3689 = vmlaq_n_s16(v3689_tmp, v3688, 8); 4100 int16x8_t v3690 = vaddq_s16(v3687, v3689); 4101 int16x8_t v3691 = vqrdmulhq_n_s16(v3690, 22491); 4102 int16x8_t v3692 = vaddq_s16(v3686, v3691); 4103 int16x8_t v3693 = vsubq_s16(v1579, v1626); 4104 int16x8_t v3694 = vsubq_s16(v1673, v1720); 4105 int16x8_t v3695_tmp = vqrdmulhq_n_s16(v3694, 19146); 4106 int16x8_t v3695 = vmlaq_n_s16(v3695_tmp, v3694, 13); 4107 int16x8_t v3696 = vaddq_s16(v3693, v3695); 4108 int16x8_t v3697 = vsubq_s16(v1769, v1816); 4109 int16x8_t v3698 = vsubq_s16(v1863, v1910); 4110 int16x8_t v3699_tmp = vqrdmulhq_n_s16(v3698, 19146); 4111 int16x8_t v3699 = vmlaq_n_s16(v3699_tmp, v3698, 13); 4112 int16x8_t v3700 = vaddq_s16(v3697, v3699); 4113 int16x8_t v3701 = vqrdmulhq_n_s16(v3700, 22755); 4114 int16x8_t v3702 = vaddq_s16(v3696, v3701); 4115 int16x8_t v3703 = vsubq_s16(v141, v316); 4116 int16x8_t v3704 = vsubq_s16(v522, v698); 4117 int16x8_t v3705_tmp = vqrdmulhq_n_s16(v3704, 24402); 4118 int16x8_t v3705 = vmlaq_n_s16(v3705_tmp, v3704, 40); 4119 int16x8_t v3706 = vaddq_s16(v3703, v3705); 4120 int16x8_t v3707 = vsubq_s16(v906, v1145); 4121 int16x8_t v3708 = vsubq_s16(v1351, v1528); 4122 int16x8_t v3709_tmp = vqrdmulhq_n_s16(v3708, 24402); 4123 int16x8_t v3709 = vmlaq_n_s16(v3709_tmp, v3708, 40); 4124 int16x8_t v3710 = vaddq_s16(v3707, v3709); 4125 int16x8_t v3711 = vqrdmulhq_n_s16(v3710, 23030); 4126 int16x8_t v3712 = vaddq_s16(v3706, v3711); 4127 int16x8_t v3713 = vsubq_s16(v3703, v3705); 4128 int16x8_t v3714 = vsubq_s16(v3707, v3709); 4129 int16x8_t v3715 = vqrdmulhq_n_s16(v3714, 23314); 4130 int16x8_t v3716 = vaddq_s16(v3713, v3715); 4131 int16x8_t v3717 = vsubq_s16(v3693, v3695); 4132 int16x8_t v3718 = vsubq_s16(v3697, v3699); 4133 int16x8_t v3719 = vqrdmulhq_n_s16(v3718, 23609); 4134 int16x8_t v3720 = vaddq_s16(v3717, v3719); 4135 int16x8_t v3721 = vsubq_s16(v3683, v3685); 4136 int16x8_t v3722 = vsubq_s16(v3687, v3689); 4137 int16x8_t v3723 = vqrdmulhq_n_s16(v3722, 23915); 4138 int16x8_t v3724 = vaddq_s16(v3721, v3723); 4139 int16x8_t v3725 = vsubq_s16(v3673, v3675); 4140 int16x8_t v3726 = vsubq_s16(v3677, v3679); 4141 int16x8_t v3727 = vqrdmulhq_n_s16(v3726, 24233); 4142 int16x8_t v3728 = vaddq_s16(v3725, v3727); 4143 int16x8_t v3729 = vsubq_s16(v3663, v3665); 4144 int16x8_t v3730 = vsubq_s16(v3667, v3669); 4145 int16x8_t v3731 = vqrdmulhq_n_s16(v3730, 24564); 4146 int16x8_t v3732 = vaddq_s16(v3729, v3731); 4147 int16x8_t v3733 = vsubq_s16(v3653, v3655); 4148 int16x8_t v3734 = vsubq_s16(v3657, v3659); 4149 int16x8_t v3735 = vqrdmulhq_n_s16(v3734, 24907); 4150 int16x8_t v3736 = vaddq_s16(v3733, v3735); 4151 int16x8_t v3737 = vsubq_s16(v3643, v3645); 4152 int16x8_t v3738 = vsubq_s16(v3647, v3649); 4153 int16x8_t v3739 = vqrdmulhq_n_s16(v3738, 25264); 4154 int16x8_t v3740 = vaddq_s16(v3737, v3739); 4155 int16x8_t v3741 = vsubq_s16(v3633, v3635); 4156 int16x8_t v3742 = vsubq_s16(v3637, v3639); 4157 int16x8_t v3743 = vqrdmulhq_n_s16(v3742, 25635); 4158 int16x8_t v3744 = vaddq_s16(v3741, v3743); 4159 int16x8_t v3745 = vsubq_s16(v3623, v3625); 4160 int16x8_t v3746 = vsubq_s16(v3627, v3629); 4161 int16x8_t v3747 = vqrdmulhq_n_s16(v3746, 26021); 4162 int16x8_t v3748 = vaddq_s16(v3745, v3747); 4163 int16x8_t v3749 = vsubq_s16(v3613, v3615); 4164 int16x8_t v3750 = vsubq_s16(v3617, v3619); 4165 int16x8_t v3751 = vqrdmulhq_n_s16(v3750, 26423); 4166 int16x8_t v3752 = vaddq_s16(v3749, v3751); 4167 int16x8_t v3753 = vsubq_s16(v3603, v3605); 4168 int16x8_t v3754 = vsubq_s16(v3607, v3609); 4169 int16x8_t v3755 = vqrdmulhq_n_s16(v3754, 26842); 4170 int16x8_t v3756 = vaddq_s16(v3753, v3755); 4171 int16x8_t v3757 = vsubq_s16(v3593, v3595); 4172 int16x8_t v3758 = vsubq_s16(v3597, v3599); 4173 int16x8_t v3759 = vqrdmulhq_n_s16(v3758, 27279); 4174 int16x8_t v3760 = vaddq_s16(v3757, v3759); 4175 int16x8_t v3761 = vsubq_s16(v3583, v3585); 4176 int16x8_t v3762 = vsubq_s16(v3587, v3589); 4177 int16x8_t v3763 = vqrdmulhq_n_s16(v3762, 27734); 4178 int16x8_t v3764 = vaddq_s16(v3761, v3763); 4179 int16x8_t v3765 = vsubq_s16(v3573, v3575); 4180 int16x8_t v3766 = vsubq_s16(v3577, v3579); 4181 int16x8_t v3767 = vqrdmulhq_n_s16(v3766, 28209); 4182 int16x8_t v3768 = vaddq_s16(v3765, v3767); 4183 int16x8_t v3769 = vsubq_s16(v3563, v3565); 4184 int16x8_t v3770 = vsubq_s16(v3567, v3569); 4185 int16x8_t v3771 = vqrdmulhq_n_s16(v3770, 28705); 4186 int16x8_t v3772 = vaddq_s16(v3769, v3771); 4187 int16x8_t v3773 = vsubq_s16(v3553, v3555); 4188 int16x8_t v3774 = vsubq_s16(v3557, v3559); 4189 int16x8_t v3775 = vqrdmulhq_n_s16(v3774, 29223); 4190 int16x8_t v3776 = vaddq_s16(v3773, v3775); 4191 int16x8_t v3777 = vsubq_s16(v3543, v3545); 4192 int16x8_t v3778 = vsubq_s16(v3547, v3549); 4193 int16x8_t v3779 = vqrdmulhq_n_s16(v3778, 29764); 4194 int16x8_t v3780 = vaddq_s16(v3777, v3779); 4195 int16x8_t v3781 = vsubq_s16(v3533, v3535); 4196 int16x8_t v3782 = vsubq_s16(v3537, v3539); 4197 int16x8_t v3783 = vqrdmulhq_n_s16(v3782, 30331); 4198 int16x8_t v3784 = vaddq_s16(v3781, v3783); 4199 int16x8_t v3785 = vsubq_s16(v3523, v3525); 4200 int16x8_t v3786 = vsubq_s16(v3527, v3529); 4201 int16x8_t v3787 = vqrdmulhq_n_s16(v3786, 30925); 4202 int16x8_t v3788 = vaddq_s16(v3785, v3787); 4203 int16x8_t v3789 = vsubq_s16(v3513, v3515); 4204 int16x8_t v3790 = vsubq_s16(v3517, v3519); 4205 int16x8_t v3791 = vqrdmulhq_n_s16(v3790, 31547); 4206 int16x8_t v3792 = vaddq_s16(v3789, v3791); 4207 int16x8_t v3793 = vsubq_s16(v3503, v3505); 4208 int16x8_t v3794 = vsubq_s16(v3507, v3509); 4209 int16x8_t v3795 = vqrdmulhq_n_s16(v3794, 32199); 4210 int16x8_t v3796 = vaddq_s16(v3793, v3795); 4211 int16x8_t v3797 = vsubq_s16(v3493, v3495); 4212 int16x8_t v3798 = vsubq_s16(v3497, v3499); 4213 int16x8_t v3799_tmp = vqrdmulhq_n_s16(v3798, 117); 4214 int16x8_t v3799 = vaddq_s16(v3799_tmp, v3798); 4215 int16x8_t v3800 = vaddq_s16(v3797, v3799); 4216 int16x8_t v3801 = vsubq_s16(v3483, v3485); 4217 int16x8_t v3802 = vsubq_s16(v3487, v3489); 4218 int16x8_t v3803_tmp = vqrdmulhq_n_s16(v3802, 837); 4219 int16x8_t v3803 = vaddq_s16(v3803_tmp, v3802); 4220 int16x8_t v3804 = vaddq_s16(v3801, v3803); 4221 int16x8_t v3805 = vsubq_s16(v3473, v3475); 4222 int16x8_t v3806 = vsubq_s16(v3477, v3479); 4223 int16x8_t v3807_tmp = vqrdmulhq_n_s16(v3806, 1594); 4224 int16x8_t v3807 = vaddq_s16(v3807_tmp, v3806); 4225 int16x8_t v3808 = vaddq_s16(v3805, v3807); 4226 int16x8_t v3809 = vsubq_s16(v3463, v3465); 4227 int16x8_t v3810 = vsubq_s16(v3467, v3469); 4228 int16x8_t v3811_tmp = vqrdmulhq_n_s16(v3810, 2393); 4229 int16x8_t v3811 = vaddq_s16(v3811_tmp, v3810); 4230 int16x8_t v3812 = vaddq_s16(v3809, v3811); 4231 int16x8_t v3813 = vsubq_s16(v3453, v3455); 4232 int16x8_t v3814 = vsubq_s16(v3457, v3459); 4233 int16x8_t v3815_tmp = vqrdmulhq_n_s16(v3814, 3234); 4234 int16x8_t v3815 = vaddq_s16(v3815_tmp, v3814); 4235 int16x8_t v3816 = vaddq_s16(v3813, v3815); 4236 int16x8_t v3817 = vsubq_s16(v3443, v3445); 4237 int16x8_t v3818 = vsubq_s16(v3447, v3449); 4238 int16x8_t v3819_tmp = vqrdmulhq_n_s16(v3818, 4123); 4239 int16x8_t v3819 = vaddq_s16(v3819_tmp, v3818); 4240 int16x8_t v3820 = vaddq_s16(v3817, v3819); 4241 int16x8_t v3821 = vsubq_s16(v3433, v3435); 4242 int16x8_t v3822 = vsubq_s16(v3437, v3439); 4243 int16x8_t v3823_tmp = vqrdmulhq_n_s16(v3822, 5062); 4244 int16x8_t v3823 = vaddq_s16(v3823_tmp, v3822); 4245 int16x8_t v3824 = vaddq_s16(v3821, v3823); 4246 int16x8_t v3825 = vsubq_s16(v3423, v3425); 4247 int16x8_t v3826 = vsubq_s16(v3427, v3429); 4248 int16x8_t v3827_tmp = vqrdmulhq_n_s16(v3826, 6057); 4249 int16x8_t v3827 = vaddq_s16(v3827_tmp, v3826); 4250 int16x8_t v3828 = vaddq_s16(v3825, v3827); 4251 int16x8_t v3829 = vsubq_s16(v3413, v3415); 4252 int16x8_t v3830 = vsubq_s16(v3417, v3419); 4253 int16x8_t v3831_tmp = vqrdmulhq_n_s16(v3830, 7111); 4254 int16x8_t v3831 = vaddq_s16(v3831_tmp, v3830); 4255 int16x8_t v3832 = vaddq_s16(v3829, v3831); 4256 int16x8_t v3833 = vsubq_s16(v3403, v3405); 4257 int16x8_t v3834 = vsubq_s16(v3407, v3409); 4258 int16x8_t v3835_tmp = vqrdmulhq_n_s16(v3834, 8231); 4259 int16x8_t v3835 = vaddq_s16(v3835_tmp, v3834); 4260 int16x8_t v3836 = vaddq_s16(v3833, v3835); 4261 int16x8_t v3837 = vsubq_s16(v3393, v3395); 4262 int16x8_t v3838 = vsubq_s16(v3397, v3399); 4263 int16x8_t v3839_tmp = vqrdmulhq_n_s16(v3838, 9421); 4264 int16x8_t v3839 = vaddq_s16(v3839_tmp, v3838); 4265 int16x8_t v3840 = vaddq_s16(v3837, v3839); 4266 int16x8_t v3841 = vsubq_s16(v3374, v3379); 4267 int16x8_t v3842 = vsubq_s16(v3384, v3389); 4268 int16x8_t v3843_tmp = vqrdmulhq_n_s16(v3842, 10690); 4269 int16x8_t v3843 = vaddq_s16(v3843_tmp, v3842); 4270 int16x8_t v3844 = vaddq_s16(v3841, v3843); 4271 int16x8_t v3845 = vsubq_s16(v3352, v3357); 4272 int16x8_t v3846 = vsubq_s16(v3362, v3367); 4273 int16x8_t v3847_tmp = vqrdmulhq_n_s16(v3846, 12044); 4274 int16x8_t v3847 = vaddq_s16(v3847_tmp, v3846); 4275 int16x8_t v3848 = vaddq_s16(v3845, v3847); 4276 int16x8_t v3849 = vsubq_s16(v3330, v3335); 4277 int16x8_t v3850 = vsubq_s16(v3340, v3345); 4278 int16x8_t v3851_tmp = vqrdmulhq_n_s16(v3850, 13493); 4279 int16x8_t v3851 = vaddq_s16(v3851_tmp, v3850); 4280 int16x8_t v3852 = vaddq_s16(v3849, v3851); 4281 int16x8_t v3853 = vsubq_s16(v3308, v3313); 4282 int16x8_t v3854 = vsubq_s16(v3318, v3323); 4283 int16x8_t v3855_tmp = vqrdmulhq_n_s16(v3854, 15046); 4284 int16x8_t v3855 = vaddq_s16(v3855_tmp, v3854); 4285 int16x8_t v3856 = vaddq_s16(v3853, v3855); 4286 int16x8_t v3857 = vsubq_s16(v3286, v3291); 4287 int16x8_t v3858 = vsubq_s16(v3296, v3301); 4288 int16x8_t v3859_tmp = vqrdmulhq_n_s16(v3858, 16715); 4289 int16x8_t v3859 = vaddq_s16(v3859_tmp, v3858); 4290 int16x8_t v3860 = vaddq_s16(v3857, v3859); 4291 int16x8_t v3861 = vsubq_s16(v3264, v3269); 4292 int16x8_t v3862 = vsubq_s16(v3274, v3279); 4293 int16x8_t v3863_tmp = vqrdmulhq_n_s16(v3862, 18512); 4294 int16x8_t v3863 = vaddq_s16(v3863_tmp, v3862); 4295 int16x8_t v3864 = vaddq_s16(v3861, v3863); 4296 int16x8_t v3865 = vsubq_s16(v3242, v3247); 4297 int16x8_t v3866 = vsubq_s16(v3252, v3257); 4298 int16x8_t v3867_tmp = vqrdmulhq_n_s16(v3866, 20453); 4299 int16x8_t v3867 = vaddq_s16(v3867_tmp, v3866); 4300 int16x8_t v3868 = vaddq_s16(v3865, v3867); 4301 int16x8_t v3869 = vsubq_s16(v3220, v3225); 4302 int16x8_t v3870 = vsubq_s16(v3230, v3235); 4303 int16x8_t v3871_tmp = vqrdmulhq_n_s16(v3870, 22555); 4304 int16x8_t v3871 = vaddq_s16(v3871_tmp, v3870); 4305 int16x8_t v3872 = vaddq_s16(v3869, v3871); 4306 int16x8_t v3873 = vsubq_s16(v3198, v3203); 4307 int16x8_t v3874 = vsubq_s16(v3208, v3213); 4308 int16x8_t v3875_tmp = vqrdmulhq_n_s16(v3874, 24839); 4309 int16x8_t v3875 = vaddq_s16(v3875_tmp, v3874); 4310 int16x8_t v3876 = vaddq_s16(v3873, v3875); 4311 int16x8_t v3877 = vsubq_s16(v3176, v3181); 4312 int16x8_t v3878 = vsubq_s16(v3186, v3191); 4313 int16x8_t v3879_tmp = vqrdmulhq_n_s16(v3878, 27330); 4314 int16x8_t v3879 = vaddq_s16(v3879_tmp, v3878); 4315 int16x8_t v3880 = vaddq_s16(v3877, v3879); 4316 int16x8_t v3881 = vsubq_s16(v3154, v3159); 4317 int16x8_t v3882 = vsubq_s16(v3164, v3169); 4318 int16x8_t v3883_tmp = vqrdmulhq_n_s16(v3882, 30056); 4319 int16x8_t v3883 = vaddq_s16(v3883_tmp, v3882); 4320 int16x8_t v3884 = vaddq_s16(v3881, v3883); 4321 int16x8_t v3885 = vsubq_s16(v3132, v3137); 4322 int16x8_t v3886 = vsubq_s16(v3142, v3147); 4323 int16x8_t v3887_tmp = vqrdmulhq_n_s16(v3886, 282); 4324 int16x8_t v3887 = vmlaq_n_s16(v3887_tmp, v3886, 2); 4325 int16x8_t v3888 = vaddq_s16(v3885, v3887); 4326 int16x8_t v3889 = vsubq_s16(v3110, v3115); 4327 int16x8_t v3890 = vsubq_s16(v3120, v3125); 4328 int16x8_t v3891_tmp = vqrdmulhq_n_s16(v3890, 3588); 4329 int16x8_t v3891 = vmlaq_n_s16(v3891_tmp, v3890, 2); 4330 int16x8_t v3892 = vaddq_s16(v3889, v3891); 4331 int16x8_t v3893 = vsubq_s16(v3088, v3093); 4332 int16x8_t v3894 = vsubq_s16(v3098, v3103); 4333 int16x8_t v3895_tmp = vqrdmulhq_n_s16(v3894, 7255); 4334 int16x8_t v3895 = vmlaq_n_s16(v3895_tmp, v3894, 2); 4335 int16x8_t v3896 = vaddq_s16(v3893, v3895); 4336 int16x8_t v3897 = vsubq_s16(v3066, v3071); 4337 int16x8_t v3898 = vsubq_s16(v3076, v3081); 4338 int16x8_t v3899_tmp = vqrdmulhq_n_s16(v3898, 11344); 4339 int16x8_t v3899 = vmlaq_n_s16(v3899_tmp, v3898, 2); 4340 int16x8_t v3900 = vaddq_s16(v3897, v3899); 4341 int16x8_t v3901 = vsubq_s16(v3044, v3049); 4342 int16x8_t v3902 = vsubq_s16(v3054, v3059); 4343 int16x8_t v3903_tmp = vqrdmulhq_n_s16(v3902, 15934); 4344 int16x8_t v3903 = vmlaq_n_s16(v3903_tmp, v3902, 2); 4345 int16x8_t v3904 = vaddq_s16(v3901, v3903); 4346 int16x8_t v3905 = vsubq_s16(v3004, v3015); 4347 int16x8_t v3906 = vsubq_s16(v3026, v3037); 4348 int16x8_t v3907_tmp = vqrdmulhq_n_s16(v3906, 21120); 4349 int16x8_t v3907 = vmlaq_n_s16(v3907_tmp, v3906, 2); 4350 int16x8_t v3908 = vaddq_s16(v3905, v3907); 4351 int16x8_t v3909 = vsubq_s16(v2958, v2969); 4352 int16x8_t v3910 = vsubq_s16(v2980, v2991); 4353 int16x8_t v3911_tmp = vqrdmulhq_n_s16(v3910, 27027); 4354 int16x8_t v3911 = vmlaq_n_s16(v3911_tmp, v3910, 2); 4355 int16x8_t v3912 = vaddq_s16(v3909, v3911); 4356 int16x8_t v3913 = vsubq_s16(v2912, v2923); 4357 int16x8_t v3914 = vsubq_s16(v2934, v2945); 4358 int16x8_t v3915_tmp = vqrdmulhq_n_s16(v3914, 1045); 4359 int16x8_t v3915 = vmlaq_n_s16(v3915_tmp, v3914, 3); 4360 int16x8_t v3916 = vaddq_s16(v3913, v3915); 4361 int16x8_t v3917 = vsubq_s16(v2866, v2877); 4362 int16x8_t v3918 = vsubq_s16(v2888, v2899); 4363 int16x8_t v3919_tmp = vqrdmulhq_n_s16(v3918, 8923); 4364 int16x8_t v3919 = vmlaq_n_s16(v3919_tmp, v3918, 3); 4365 int16x8_t v3920 = vaddq_s16(v3917, v3919); 4366 int16x8_t v3921 = vsubq_s16(v2820, v2831); 4367 int16x8_t v3922 = vsubq_s16(v2842, v2853); 4368 int16x8_t v3923_tmp = vqrdmulhq_n_s16(v3922, 18177); 4369 int16x8_t v3923 = vmlaq_n_s16(v3923_tmp, v3922, 3); 4370 int16x8_t v3924 = vaddq_s16(v3921, v3923); 4371 int16x8_t v3925 = vsubq_s16(v2774, v2785); 4372 int16x8_t v3926 = vsubq_s16(v2796, v2807); 4373 int16x8_t v3927_tmp = vqrdmulhq_n_s16(v3926, 29200); 4374 int16x8_t v3927 = vmlaq_n_s16(v3927_tmp, v3926, 3); 4375 int16x8_t v3928 = vaddq_s16(v3925, v3927); 4376 int16x8_t v3929 = vsubq_s16(v2728, v2739); 4377 int16x8_t v3930 = vsubq_s16(v2750, v2761); 4378 int16x8_t v3931_tmp = vqrdmulhq_n_s16(v3930, 9782); 4379 int16x8_t v3931 = vmlaq_n_s16(v3931_tmp, v3930, 4); 4380 int16x8_t v3932 = vaddq_s16(v3929, v3931); 4381 int16x8_t v3933 = vsubq_s16(v2682, v2693); 4382 int16x8_t v3934 = vsubq_s16(v2704, v2715); 4383 int16x8_t v3935_tmp = vqrdmulhq_n_s16(v3934, 26282); 4384 int16x8_t v3935 = vmlaq_n_s16(v3935_tmp, v3934, 4); 4385 int16x8_t v3936 = vaddq_s16(v3933, v3935); 4386 int16x8_t v3937 = vsubq_s16(v2600, v2623); 4387 int16x8_t v3938 = vsubq_s16(v2646, v2669); 4388 int16x8_t v3939_tmp = vqrdmulhq_n_s16(v3938, 14423); 4389 int16x8_t v3939 = vmlaq_n_s16(v3939_tmp, v3938, 5); 4390 int16x8_t v3940 = vaddq_s16(v3937, v3939); 4391 int16x8_t v3941 = vsubq_s16(v2506, v2529); 4392 int16x8_t v3942 = vsubq_s16(v2552, v2575); 4393 int16x8_t v3943_tmp = vqrdmulhq_n_s16(v3942, 9008); 4394 int16x8_t v3943 = vmlaq_n_s16(v3943_tmp, v3942, 6); 4395 int16x8_t v3944 = vaddq_s16(v3941, v3943); 4396 int16x8_t v3945 = vsubq_s16(v2411, v2434); 4397 int16x8_t v3946 = vsubq_s16(v2457, v2481); 4398 int16x8_t v3947_tmp = vqrdmulhq_n_s16(v3946, 13552); 4399 int16x8_t v3947 = vmlaq_n_s16(v3947_tmp, v3946, 7); 4400 int16x8_t v3948 = vaddq_s16(v3945, v3947); 4401 int16x8_t v3949 = vsubq_s16(v2317, v2340); 4402 int16x8_t v3950 = vsubq_s16(v2363, v2386); 4403 int16x8_t v3951_tmp = vqrdmulhq_n_s16(v3950, 1925); 4404 int16x8_t v3951 = vmlaq_n_s16(v3951_tmp, v3950, 9); 4405 int16x8_t v3952 = vaddq_s16(v3949, v3951); 4406 int16x8_t v3953 = vsubq_s16(v2151, v2198); 4407 int16x8_t v3954 = vsubq_s16(v2245, v2292); 4408 int16x8_t v3955_tmp = vqrdmulhq_n_s16(v3954, 21123); 4409 int16x8_t v3955 = vmlaq_n_s16(v3955_tmp, v3954, 11); 4410 int16x8_t v3956 = vaddq_s16(v3953, v3955); 4411 int16x8_t v3957 = vsubq_s16(v1961, v2008); 4412 int16x8_t v3958 = vsubq_s16(v2055, v2102); 4413 int16x8_t v3959_tmp = vqrdmulhq_n_s16(v3958, 9831); 4414 int16x8_t v3959 = vmlaq_n_s16(v3959_tmp, v3958, 16); 4415 int16x8_t v3960 = vaddq_s16(v3957, v3959); 4416 int16x8_t v3961 = vsubq_s16(v1627, v1722); 4417 int16x8_t v3962 = vsubq_s16(v1817, v1912); 4418 int16x8_t v3963_tmp = vqrdmulhq_n_s16(v3962, 5373); 4419 int16x8_t v3963 = vmlaq_n_s16(v3963_tmp, v3962, 27); 4420 int16x8_t v3964 = vaddq_s16(v3961, v3963); 4421 int16x8_t v3965 = vsubq_s16(v317, v700); 4422 int16x8_t v3966 = vsubq_s16(v1146, v1530); 4423 int16x8_t v3967_tmp = vqrdmulhq_n_s16(v3966, 15986); 4424 int16x8_t v3967 = vmlaq_n_s16(v3967_tmp, v3966, 81); 4425 int16x8_t v3968 = vaddq_s16(v3965, v3967); 4426 int16x8_t v3969 = vsubq_s16(v3965, v3967); 4427 int16x8_t v3970 = vsubq_s16(v3961, v3963); 4428 int16x8_t v3971 = vsubq_s16(v3957, v3959); 4429 int16x8_t v3972 = vsubq_s16(v3953, v3955); 4430 int16x8_t v3973 = vsubq_s16(v3949, v3951); 4431 int16x8_t v3974 = vsubq_s16(v3945, v3947); 4432 int16x8_t v3975 = vsubq_s16(v3941, v3943); 4433 int16x8_t v3976 = vsubq_s16(v3937, v3939); 4434 int16x8_t v3977 = vsubq_s16(v3933, v3935); 4435 int16x8_t v3978 = vsubq_s16(v3929, v3931); 4436 int16x8_t v3979 = vsubq_s16(v3925, v3927); 4437 int16x8_t v3980 = vsubq_s16(v3921, v3923); 4438 int16x8_t v3981 = vsubq_s16(v3917, v3919); 4439 int16x8_t v3982 = vsubq_s16(v3913, v3915); 4440 int16x8_t v3983 = vsubq_s16(v3909, v3911); 4441 int16x8_t v3984 = vsubq_s16(v3905, v3907); 4442 int16x8_t v3985 = vsubq_s16(v3901, v3903); 4443 int16x8_t v3986 = vsubq_s16(v3897, v3899); 4444 int16x8_t v3987 = vsubq_s16(v3893, v3895); 4445 int16x8_t v3988 = vsubq_s16(v3889, v3891); 4446 int16x8_t v3989 = vsubq_s16(v3885, v3887); 4447 int16x8_t v3990 = vsubq_s16(v3881, v3883); 4448 int16x8_t v3991 = vsubq_s16(v3877, v3879); 4449 int16x8_t v3992 = vsubq_s16(v3873, v3875); 4450 int16x8_t v3993 = vsubq_s16(v3869, v3871); 4451 int16x8_t v3994 = vsubq_s16(v3865, v3867); 4452 int16x8_t v3995 = vsubq_s16(v3861, v3863); 4453 int16x8_t v3996 = vsubq_s16(v3857, v3859); 4454 int16x8_t v3997 = vsubq_s16(v3853, v3855); 4455 int16x8_t v3998 = vsubq_s16(v3849, v3851); 4456 int16x8_t v3999 = vsubq_s16(v3845, v3847); 4457 int16x8_t v4000 = vsubq_s16(v3841, v3843); 4458 int16x8_t v4001 = vsubq_s16(v3837, v3839); 4459 int16x8_t v4002 = vsubq_s16(v3833, v3835); 4460 int16x8_t v4003 = vsubq_s16(v3829, v3831); 4461 int16x8_t v4004 = vsubq_s16(v3825, v3827); 4462 int16x8_t v4005 = vsubq_s16(v3821, v3823); 4463 int16x8_t v4006 = vsubq_s16(v3817, v3819); 4464 int16x8_t v4007 = vsubq_s16(v3813, v3815); 4465 int16x8_t v4008 = vsubq_s16(v3809, v3811); 4466 int16x8_t v4009 = vsubq_s16(v3805, v3807); 4467 int16x8_t v4010 = vsubq_s16(v3801, v3803); 4468 int16x8_t v4011 = vsubq_s16(v3797, v3799); 4469 int16x8_t v4012 = vsubq_s16(v3793, v3795); 4470 int16x8_t v4013 = vsubq_s16(v3789, v3791); 4471 int16x8_t v4014 = vsubq_s16(v3785, v3787); 4472 int16x8_t v4015 = vsubq_s16(v3781, v3783); 4473 int16x8_t v4016 = vsubq_s16(v3777, v3779); 4474 int16x8_t v4017 = vsubq_s16(v3773, v3775); 4475 int16x8_t v4018 = vsubq_s16(v3769, v3771); 4476 int16x8_t v4019 = vsubq_s16(v3765, v3767); 4477 int16x8_t v4020 = vsubq_s16(v3761, v3763); 4478 int16x8_t v4021 = vsubq_s16(v3757, v3759); 4479 int16x8_t v4022 = vsubq_s16(v3753, v3755); 4480 int16x8_t v4023 = vsubq_s16(v3749, v3751); 4481 int16x8_t v4024 = vsubq_s16(v3745, v3747); 4482 int16x8_t v4025 = vsubq_s16(v3741, v3743); 4483 int16x8_t v4026 = vsubq_s16(v3737, v3739); 4484 int16x8_t v4027 = vsubq_s16(v3733, v3735); 4485 int16x8_t v4028 = vsubq_s16(v3729, v3731); 4486 int16x8_t v4029 = vsubq_s16(v3725, v3727); 4487 int16x8_t v4030 = vsubq_s16(v3721, v3723); 4488 int16x8_t v4031 = vsubq_s16(v3717, v3719); 4489 int16x8_t v4032 = vsubq_s16(v3713, v3715); 4490 int16x8_t v4033 = vsubq_s16(v3706, v3711); 4491 int16x8_t v4034 = vsubq_s16(v3696, v3701); 4492 int16x8_t v4035 = vsubq_s16(v3686, v3691); 4493 int16x8_t v4036 = vsubq_s16(v3676, v3681); 4494 int16x8_t v4037 = vsubq_s16(v3666, v3671); 4495 int16x8_t v4038 = vsubq_s16(v3656, v3661); 4496 int16x8_t v4039 = vsubq_s16(v3646, v3651); 4497 int16x8_t v4040 = vsubq_s16(v3636, v3641); 4498 int16x8_t v4041 = vsubq_s16(v3626, v3631); 4499 int16x8_t v4042 = vsubq_s16(v3616, v3621); 4500 int16x8_t v4043 = vsubq_s16(v3606, v3611); 4501 int16x8_t v4044 = vsubq_s16(v3596, v3601); 4502 int16x8_t v4045 = vsubq_s16(v3586, v3591); 4503 int16x8_t v4046 = vsubq_s16(v3576, v3581); 4504 int16x8_t v4047 = vsubq_s16(v3566, v3571); 4505 int16x8_t v4048 = vsubq_s16(v3556, v3561); 4506 int16x8_t v4049 = vsubq_s16(v3546, v3551); 4507 int16x8_t v4050 = vsubq_s16(v3536, v3541); 4508 int16x8_t v4051 = vsubq_s16(v3526, v3531); 4509 int16x8_t v4052 = vsubq_s16(v3516, v3521); 4510 int16x8_t v4053 = vsubq_s16(v3506, v3511); 4511 int16x8_t v4054 = vsubq_s16(v3496, v3501); 4512 int16x8_t v4055 = vsubq_s16(v3486, v3491); 4513 int16x8_t v4056 = vsubq_s16(v3476, v3481); 4514 int16x8_t v4057 = vsubq_s16(v3466, v3471); 4515 int16x8_t v4058 = vsubq_s16(v3456, v3461); 4516 int16x8_t v4059 = vsubq_s16(v3446, v3451); 4517 int16x8_t v4060 = vsubq_s16(v3436, v3441); 4518 int16x8_t v4061 = vsubq_s16(v3426, v3431); 4519 int16x8_t v4062 = vsubq_s16(v3416, v3421); 4520 int16x8_t v4063 = vsubq_s16(v3406, v3411); 4521 int16x8_t v4064 = vsubq_s16(v3396, v3401); 4522 int16x8_t v4065 = vsubq_s16(v3380, v3391); 4523 int16x8_t v4066 = vsubq_s16(v3358, v3369); 4524 int16x8_t v4067 = vsubq_s16(v3336, v3347); 4525 int16x8_t v4068 = vsubq_s16(v3314, v3325); 4526 int16x8_t v4069 = vsubq_s16(v3292, v3303); 4527 int16x8_t v4070 = vsubq_s16(v3270, v3281); 4528 int16x8_t v4071 = vsubq_s16(v3248, v3259); 4529 int16x8_t v4072 = vsubq_s16(v3226, v3237); 4530 int16x8_t v4073 = vsubq_s16(v3204, v3215); 4531 int16x8_t v4074 = vsubq_s16(v3182, v3193); 4532 int16x8_t v4075 = vsubq_s16(v3160, v3171); 4533 int16x8_t v4076 = vsubq_s16(v3138, v3149); 4534 int16x8_t v4077 = vsubq_s16(v3116, v3127); 4535 int16x8_t v4078 = vsubq_s16(v3094, v3105); 4536 int16x8_t v4079 = vsubq_s16(v3072, v3083); 4537 int16x8_t v4080 = vsubq_s16(v3050, v3061); 4538 int16x8_t v4081 = vsubq_s16(v3016, v3039); 4539 int16x8_t v4082 = vsubq_s16(v2970, v2993); 4540 int16x8_t v4083 = vsubq_s16(v2924, v2947); 4541 int16x8_t v4084 = vsubq_s16(v2878, v2901); 4542 int16x8_t v4085 = vsubq_s16(v2832, v2855); 4543 int16x8_t v4086 = vsubq_s16(v2786, v2809); 4544 int16x8_t v4087 = vsubq_s16(v2740, v2763); 4545 int16x8_t v4088 = vsubq_s16(v2694, v2717); 4546 int16x8_t v4089 = vsubq_s16(v2624, v2671); 4547 int16x8_t v4090 = vsubq_s16(v2530, v2577); 4548 int16x8_t v4091 = vsubq_s16(v2435, v2483); 4549 int16x8_t v4092 = vsubq_s16(v2341, v2388); 4550 int16x8_t v4093 = vsubq_s16(v2199, v2294); 4551 int16x8_t v4094 = vsubq_s16(v2009, v2104); 4552 int16x8_t v4095 = vsubq_s16(v1723, v1914); 4553 int16x8_t v4096 = vsubq_s16(v701, v1532); 4554 vst1q_s16(out + out_stride * 0 + i, v1533); 4555 vst1q_s16(out + out_stride * 1 + i, v1915); 4556 vst1q_s16(out + out_stride * 2 + i, v2105); 4557 vst1q_s16(out + out_stride * 3 + i, v2295); 4558 vst1q_s16(out + out_stride * 4 + i, v2389); 4559 vst1q_s16(out + out_stride * 5 + i, v2484); 4560 vst1q_s16(out + out_stride * 6 + i, v2578); 4561 vst1q_s16(out + out_stride * 7 + i, v2672); 4562 vst1q_s16(out + out_stride * 8 + i, v2718); 4563 vst1q_s16(out + out_stride * 9 + i, v2764); 4564 vst1q_s16(out + out_stride * 10 + i, v2810); 4565 vst1q_s16(out + out_stride * 11 + i, v2856); 4566 vst1q_s16(out + out_stride * 12 + i, v2902); 4567 vst1q_s16(out + out_stride * 13 + i, v2948); 4568 vst1q_s16(out + out_stride * 14 + i, v2994); 4569 vst1q_s16(out + out_stride * 15 + i, v3040); 4570 vst1q_s16(out + out_stride * 16 + i, v3062); 4571 vst1q_s16(out + out_stride * 17 + i, v3084); 4572 vst1q_s16(out + out_stride * 18 + i, v3106); 4573 vst1q_s16(out + out_stride * 19 + i, v3128); 4574 vst1q_s16(out + out_stride * 20 + i, v3150); 4575 vst1q_s16(out + out_stride * 21 + i, v3172); 4576 vst1q_s16(out + out_stride * 22 + i, v3194); 4577 vst1q_s16(out + out_stride * 23 + i, v3216); 4578 vst1q_s16(out + out_stride * 24 + i, v3238); 4579 vst1q_s16(out + out_stride * 25 + i, v3260); 4580 vst1q_s16(out + out_stride * 26 + i, v3282); 4581 vst1q_s16(out + out_stride * 27 + i, v3304); 4582 vst1q_s16(out + out_stride * 28 + i, v3326); 4583 vst1q_s16(out + out_stride * 29 + i, v3348); 4584 vst1q_s16(out + out_stride * 30 + i, v3370); 4585 vst1q_s16(out + out_stride * 31 + i, v3392); 4586 vst1q_s16(out + out_stride * 32 + i, v3402); 4587 vst1q_s16(out + out_stride * 33 + i, v3412); 4588 vst1q_s16(out + out_stride * 34 + i, v3422); 4589 vst1q_s16(out + out_stride * 35 + i, v3432); 4590 vst1q_s16(out + out_stride * 36 + i, v3442); 4591 vst1q_s16(out + out_stride * 37 + i, v3452); 4592 vst1q_s16(out + out_stride * 38 + i, v3462); 4593 vst1q_s16(out + out_stride * 39 + i, v3472); 4594 vst1q_s16(out + out_stride * 40 + i, v3482); 4595 vst1q_s16(out + out_stride * 41 + i, v3492); 4596 vst1q_s16(out + out_stride * 42 + i, v3502); 4597 vst1q_s16(out + out_stride * 43 + i, v3512); 4598 vst1q_s16(out + out_stride * 44 + i, v3522); 4599 vst1q_s16(out + out_stride * 45 + i, v3532); 4600 vst1q_s16(out + out_stride * 46 + i, v3542); 4601 vst1q_s16(out + out_stride * 47 + i, v3552); 4602 vst1q_s16(out + out_stride * 48 + i, v3562); 4603 vst1q_s16(out + out_stride * 49 + i, v3572); 4604 vst1q_s16(out + out_stride * 50 + i, v3582); 4605 vst1q_s16(out + out_stride * 51 + i, v3592); 4606 vst1q_s16(out + out_stride * 52 + i, v3602); 4607 vst1q_s16(out + out_stride * 53 + i, v3612); 4608 vst1q_s16(out + out_stride * 54 + i, v3622); 4609 vst1q_s16(out + out_stride * 55 + i, v3632); 4610 vst1q_s16(out + out_stride * 56 + i, v3642); 4611 vst1q_s16(out + out_stride * 57 + i, v3652); 4612 vst1q_s16(out + out_stride * 58 + i, v3662); 4613 vst1q_s16(out + out_stride * 59 + i, v3672); 4614 vst1q_s16(out + out_stride * 60 + i, v3682); 4615 vst1q_s16(out + out_stride * 61 + i, v3692); 4616 vst1q_s16(out + out_stride * 62 + i, v3702); 4617 vst1q_s16(out + out_stride * 63 + i, v3712); 4618 vst1q_s16(out + out_stride * 64 + i, v3716); 4619 vst1q_s16(out + out_stride * 65 + i, v3720); 4620 vst1q_s16(out + out_stride * 66 + i, v3724); 4621 vst1q_s16(out + out_stride * 67 + i, v3728); 4622 vst1q_s16(out + out_stride * 68 + i, v3732); 4623 vst1q_s16(out + out_stride * 69 + i, v3736); 4624 vst1q_s16(out + out_stride * 70 + i, v3740); 4625 vst1q_s16(out + out_stride * 71 + i, v3744); 4626 vst1q_s16(out + out_stride * 72 + i, v3748); 4627 vst1q_s16(out + out_stride * 73 + i, v3752); 4628 vst1q_s16(out + out_stride * 74 + i, v3756); 4629 vst1q_s16(out + out_stride * 75 + i, v3760); 4630 vst1q_s16(out + out_stride * 76 + i, v3764); 4631 vst1q_s16(out + out_stride * 77 + i, v3768); 4632 vst1q_s16(out + out_stride * 78 + i, v3772); 4633 vst1q_s16(out + out_stride * 79 + i, v3776); 4634 vst1q_s16(out + out_stride * 80 + i, v3780); 4635 vst1q_s16(out + out_stride * 81 + i, v3784); 4636 vst1q_s16(out + out_stride * 82 + i, v3788); 4637 vst1q_s16(out + out_stride * 83 + i, v3792); 4638 vst1q_s16(out + out_stride * 84 + i, v3796); 4639 vst1q_s16(out + out_stride * 85 + i, v3800); 4640 vst1q_s16(out + out_stride * 86 + i, v3804); 4641 vst1q_s16(out + out_stride * 87 + i, v3808); 4642 vst1q_s16(out + out_stride * 88 + i, v3812); 4643 vst1q_s16(out + out_stride * 89 + i, v3816); 4644 vst1q_s16(out + out_stride * 90 + i, v3820); 4645 vst1q_s16(out + out_stride * 91 + i, v3824); 4646 vst1q_s16(out + out_stride * 92 + i, v3828); 4647 vst1q_s16(out + out_stride * 93 + i, v3832); 4648 vst1q_s16(out + out_stride * 94 + i, v3836); 4649 vst1q_s16(out + out_stride * 95 + i, v3840); 4650 vst1q_s16(out + out_stride * 96 + i, v3844); 4651 vst1q_s16(out + out_stride * 97 + i, v3848); 4652 vst1q_s16(out + out_stride * 98 + i, v3852); 4653 vst1q_s16(out + out_stride * 99 + i, v3856); 4654 vst1q_s16(out + out_stride * 100 + i, v3860); 4655 vst1q_s16(out + out_stride * 101 + i, v3864); 4656 vst1q_s16(out + out_stride * 102 + i, v3868); 4657 vst1q_s16(out + out_stride * 103 + i, v3872); 4658 vst1q_s16(out + out_stride * 104 + i, v3876); 4659 vst1q_s16(out + out_stride * 105 + i, v3880); 4660 vst1q_s16(out + out_stride * 106 + i, v3884); 4661 vst1q_s16(out + out_stride * 107 + i, v3888); 4662 vst1q_s16(out + out_stride * 108 + i, v3892); 4663 vst1q_s16(out + out_stride * 109 + i, v3896); 4664 vst1q_s16(out + out_stride * 110 + i, v3900); 4665 vst1q_s16(out + out_stride * 111 + i, v3904); 4666 vst1q_s16(out + out_stride * 112 + i, v3908); 4667 vst1q_s16(out + out_stride * 113 + i, v3912); 4668 vst1q_s16(out + out_stride * 114 + i, v3916); 4669 vst1q_s16(out + out_stride * 115 + i, v3920); 4670 vst1q_s16(out + out_stride * 116 + i, v3924); 4671 vst1q_s16(out + out_stride * 117 + i, v3928); 4672 vst1q_s16(out + out_stride * 118 + i, v3932); 4673 vst1q_s16(out + out_stride * 119 + i, v3936); 4674 vst1q_s16(out + out_stride * 120 + i, v3940); 4675 vst1q_s16(out + out_stride * 121 + i, v3944); 4676 vst1q_s16(out + out_stride * 122 + i, v3948); 4677 vst1q_s16(out + out_stride * 123 + i, v3952); 4678 vst1q_s16(out + out_stride * 124 + i, v3956); 4679 vst1q_s16(out + out_stride * 125 + i, v3960); 4680 vst1q_s16(out + out_stride * 126 + i, v3964); 4681 vst1q_s16(out + out_stride * 127 + i, v3968); 4682 vst1q_s16(out + out_stride * 128 + i, v3969); 4683 vst1q_s16(out + out_stride * 129 + i, v3970); 4684 vst1q_s16(out + out_stride * 130 + i, v3971); 4685 vst1q_s16(out + out_stride * 131 + i, v3972); 4686 vst1q_s16(out + out_stride * 132 + i, v3973); 4687 vst1q_s16(out + out_stride * 133 + i, v3974); 4688 vst1q_s16(out + out_stride * 134 + i, v3975); 4689 vst1q_s16(out + out_stride * 135 + i, v3976); 4690 vst1q_s16(out + out_stride * 136 + i, v3977); 4691 vst1q_s16(out + out_stride * 137 + i, v3978); 4692 vst1q_s16(out + out_stride * 138 + i, v3979); 4693 vst1q_s16(out + out_stride * 139 + i, v3980); 4694 vst1q_s16(out + out_stride * 140 + i, v3981); 4695 vst1q_s16(out + out_stride * 141 + i, v3982); 4696 vst1q_s16(out + out_stride * 142 + i, v3983); 4697 vst1q_s16(out + out_stride * 143 + i, v3984); 4698 vst1q_s16(out + out_stride * 144 + i, v3985); 4699 vst1q_s16(out + out_stride * 145 + i, v3986); 4700 vst1q_s16(out + out_stride * 146 + i, v3987); 4701 vst1q_s16(out + out_stride * 147 + i, v3988); 4702 vst1q_s16(out + out_stride * 148 + i, v3989); 4703 vst1q_s16(out + out_stride * 149 + i, v3990); 4704 vst1q_s16(out + out_stride * 150 + i, v3991); 4705 vst1q_s16(out + out_stride * 151 + i, v3992); 4706 vst1q_s16(out + out_stride * 152 + i, v3993); 4707 vst1q_s16(out + out_stride * 153 + i, v3994); 4708 vst1q_s16(out + out_stride * 154 + i, v3995); 4709 vst1q_s16(out + out_stride * 155 + i, v3996); 4710 vst1q_s16(out + out_stride * 156 + i, v3997); 4711 vst1q_s16(out + out_stride * 157 + i, v3998); 4712 vst1q_s16(out + out_stride * 158 + i, v3999); 4713 vst1q_s16(out + out_stride * 159 + i, v4000); 4714 vst1q_s16(out + out_stride * 160 + i, v4001); 4715 vst1q_s16(out + out_stride * 161 + i, v4002); 4716 vst1q_s16(out + out_stride * 162 + i, v4003); 4717 vst1q_s16(out + out_stride * 163 + i, v4004); 4718 vst1q_s16(out + out_stride * 164 + i, v4005); 4719 vst1q_s16(out + out_stride * 165 + i, v4006); 4720 vst1q_s16(out + out_stride * 166 + i, v4007); 4721 vst1q_s16(out + out_stride * 167 + i, v4008); 4722 vst1q_s16(out + out_stride * 168 + i, v4009); 4723 vst1q_s16(out + out_stride * 169 + i, v4010); 4724 vst1q_s16(out + out_stride * 170 + i, v4011); 4725 vst1q_s16(out + out_stride * 171 + i, v4012); 4726 vst1q_s16(out + out_stride * 172 + i, v4013); 4727 vst1q_s16(out + out_stride * 173 + i, v4014); 4728 vst1q_s16(out + out_stride * 174 + i, v4015); 4729 vst1q_s16(out + out_stride * 175 + i, v4016); 4730 vst1q_s16(out + out_stride * 176 + i, v4017); 4731 vst1q_s16(out + out_stride * 177 + i, v4018); 4732 vst1q_s16(out + out_stride * 178 + i, v4019); 4733 vst1q_s16(out + out_stride * 179 + i, v4020); 4734 vst1q_s16(out + out_stride * 180 + i, v4021); 4735 vst1q_s16(out + out_stride * 181 + i, v4022); 4736 vst1q_s16(out + out_stride * 182 + i, v4023); 4737 vst1q_s16(out + out_stride * 183 + i, v4024); 4738 vst1q_s16(out + out_stride * 184 + i, v4025); 4739 vst1q_s16(out + out_stride * 185 + i, v4026); 4740 vst1q_s16(out + out_stride * 186 + i, v4027); 4741 vst1q_s16(out + out_stride * 187 + i, v4028); 4742 vst1q_s16(out + out_stride * 188 + i, v4029); 4743 vst1q_s16(out + out_stride * 189 + i, v4030); 4744 vst1q_s16(out + out_stride * 190 + i, v4031); 4745 vst1q_s16(out + out_stride * 191 + i, v4032); 4746 vst1q_s16(out + out_stride * 192 + i, v4033); 4747 vst1q_s16(out + out_stride * 193 + i, v4034); 4748 vst1q_s16(out + out_stride * 194 + i, v4035); 4749 vst1q_s16(out + out_stride * 195 + i, v4036); 4750 vst1q_s16(out + out_stride * 196 + i, v4037); 4751 vst1q_s16(out + out_stride * 197 + i, v4038); 4752 vst1q_s16(out + out_stride * 198 + i, v4039); 4753 vst1q_s16(out + out_stride * 199 + i, v4040); 4754 vst1q_s16(out + out_stride * 200 + i, v4041); 4755 vst1q_s16(out + out_stride * 201 + i, v4042); 4756 vst1q_s16(out + out_stride * 202 + i, v4043); 4757 vst1q_s16(out + out_stride * 203 + i, v4044); 4758 vst1q_s16(out + out_stride * 204 + i, v4045); 4759 vst1q_s16(out + out_stride * 205 + i, v4046); 4760 vst1q_s16(out + out_stride * 206 + i, v4047); 4761 vst1q_s16(out + out_stride * 207 + i, v4048); 4762 vst1q_s16(out + out_stride * 208 + i, v4049); 4763 vst1q_s16(out + out_stride * 209 + i, v4050); 4764 vst1q_s16(out + out_stride * 210 + i, v4051); 4765 vst1q_s16(out + out_stride * 211 + i, v4052); 4766 vst1q_s16(out + out_stride * 212 + i, v4053); 4767 vst1q_s16(out + out_stride * 213 + i, v4054); 4768 vst1q_s16(out + out_stride * 214 + i, v4055); 4769 vst1q_s16(out + out_stride * 215 + i, v4056); 4770 vst1q_s16(out + out_stride * 216 + i, v4057); 4771 vst1q_s16(out + out_stride * 217 + i, v4058); 4772 vst1q_s16(out + out_stride * 218 + i, v4059); 4773 vst1q_s16(out + out_stride * 219 + i, v4060); 4774 vst1q_s16(out + out_stride * 220 + i, v4061); 4775 vst1q_s16(out + out_stride * 221 + i, v4062); 4776 vst1q_s16(out + out_stride * 222 + i, v4063); 4777 vst1q_s16(out + out_stride * 223 + i, v4064); 4778 vst1q_s16(out + out_stride * 224 + i, v4065); 4779 vst1q_s16(out + out_stride * 225 + i, v4066); 4780 vst1q_s16(out + out_stride * 226 + i, v4067); 4781 vst1q_s16(out + out_stride * 227 + i, v4068); 4782 vst1q_s16(out + out_stride * 228 + i, v4069); 4783 vst1q_s16(out + out_stride * 229 + i, v4070); 4784 vst1q_s16(out + out_stride * 230 + i, v4071); 4785 vst1q_s16(out + out_stride * 231 + i, v4072); 4786 vst1q_s16(out + out_stride * 232 + i, v4073); 4787 vst1q_s16(out + out_stride * 233 + i, v4074); 4788 vst1q_s16(out + out_stride * 234 + i, v4075); 4789 vst1q_s16(out + out_stride * 235 + i, v4076); 4790 vst1q_s16(out + out_stride * 236 + i, v4077); 4791 vst1q_s16(out + out_stride * 237 + i, v4078); 4792 vst1q_s16(out + out_stride * 238 + i, v4079); 4793 vst1q_s16(out + out_stride * 239 + i, v4080); 4794 vst1q_s16(out + out_stride * 240 + i, v4081); 4795 vst1q_s16(out + out_stride * 241 + i, v4082); 4796 vst1q_s16(out + out_stride * 242 + i, v4083); 4797 vst1q_s16(out + out_stride * 243 + i, v4084); 4798 vst1q_s16(out + out_stride * 244 + i, v4085); 4799 vst1q_s16(out + out_stride * 245 + i, v4086); 4800 vst1q_s16(out + out_stride * 246 + i, v4087); 4801 vst1q_s16(out + out_stride * 247 + i, v4088); 4802 vst1q_s16(out + out_stride * 248 + i, v4089); 4803 vst1q_s16(out + out_stride * 249 + i, v4090); 4804 vst1q_s16(out + out_stride * 250 + i, v4091); 4805 vst1q_s16(out + out_stride * 251 + i, v4092); 4806 vst1q_s16(out + out_stride * 252 + i, v4093); 4807 vst1q_s16(out + out_stride * 253 + i, v4094); 4808 vst1q_s16(out + out_stride * 254 + i, v4095); 4809 vst1q_s16(out + out_stride * 255 + i, v4096); 4810 } 4811 }