fast_dct32-inl.h (19429B)
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

/* This file is automatically generated. Do not modify it directly. */
#if HWY_TARGET != HWY_NEON
#error "only include this file from fast_dct-inl.h"
#endif

// Returns the constant 1 for the 32-point tag.
// NOTE(review): presumably the number of integer (headroom) bits callers must
// account for with this kernel -- confirm against the users in fast_dct-inl.h.
constexpr size_t FastIDCTIntegerBits(FastDCTTag<32>) { return 1; }

// Straight-line NEON int16 kernel selected by FastDCTTag<32>. Per its name
// this is the 32-point inverse DCT; the notes below describe only what the
// code visibly does.
//
// For each group of 8 consecutive columns starting at i, the 32 input rows
// in[in_stride * k + i] (k = 0..31) are loaded as int16x8_t vectors, combined
// through a fixed network of lane-wise adds, subtracts and constant
// multiplies, and the 32 results are stored to out[out_stride * k + i].
//
//   in, in_stride   - source; row k is read starting at in + in_stride * k.
//   out, out_stride - destination; row k is written at out + out_stride * k.
//   count           - number of columns; must be a multiple of 8 (asserted),
//                     because every iteration handles exactly 8 int16 lanes.
//
// Multiplication idioms used throughout (Q15 fixed point):
//   vqrdmulhq_n_s16(x, c)              ~ x * c / 32768, rounded and saturated.
//   t = vqrdmulhq_n_s16(x, c); t + x   ~ x * (1 + c / 32768), i.e. a factor in
//                                        [1, 2) that does not fit in Q15.
//   vmlaq_n_s16(t, x, k)               = t + x * k, i.e. a factor k + c / 32768.
void FastIDCT(FastDCTTag<32>, const int16_t* in, size_t in_stride, int16_t* out,
              size_t out_stride, size_t count) {
  JXL_ASSERT(count % 8 == 0);
  for (size_t i = 0; i < count; i += 8) {
    // v0..v61 are built exclusively from the even-indexed input rows
    // (0, 2, ..., 30).
    int16x8_t v0 = vld1q_s16(in + in_stride * 0 + i);
    int16x8_t v1 = vld1q_s16(in + in_stride * 16 + i);
    int16x8_t v2 = vaddq_s16(v0, v1);
    int16x8_t v3 = vld1q_s16(in + in_stride * 8 + i);
    int16x8_t v4_tmp = vqrdmulhq_n_s16(v3, 13573);
    int16x8_t v4 = vaddq_s16(v4_tmp, v3);
    int16x8_t v5 = vld1q_s16(in + in_stride * 24 + i);
    int16x8_t v6 = vaddq_s16(v5, v3);
    int16x8_t v7 = vaddq_s16(v4, v6);
    int16x8_t v8 = vqrdmulhq_n_s16(v7, 17734);
    int16x8_t v9 = vaddq_s16(v2, v8);
    int16x8_t v10 = vld1q_s16(in + in_stride * 4 + i);
    int16x8_t v11_tmp = vqrdmulhq_n_s16(v10, 13573);
    int16x8_t v11 = vaddq_s16(v11_tmp, v10);
    int16x8_t v12 = vld1q_s16(in + in_stride * 20 + i);
    int16x8_t v13 = vld1q_s16(in + in_stride * 12 + i);
    int16x8_t v14 = vaddq_s16(v12, v13);
    int16x8_t v15 = vaddq_s16(v11, v14);
    int16x8_t v16 = vld1q_s16(in + in_stride * 28 + i);
    int16x8_t v17 = vaddq_s16(v16, v12);
    int16x8_t v18 = vaddq_s16(v13, v10);
    int16x8_t v19 = vaddq_s16(v17, v18);
    int16x8_t v20 = vqrdmulhq_n_s16(v19, 17734);
    int16x8_t v21 = vqrdmulhq_n_s16(v18, 25080);
    int16x8_t v22 = vaddq_s16(v20, v21);
    int16x8_t v23 = vaddq_s16(v15, v22);
    int16x8_t v24 = vqrdmulhq_n_s16(v23, 16705);
    int16x8_t v25 = vaddq_s16(v9, v24);
    int16x8_t v26 = vld1q_s16(in + in_stride * 2 + i);
    int16x8_t v27_tmp = vqrdmulhq_n_s16(v26, 13573);
    int16x8_t v27 = vaddq_s16(v27_tmp, v26);
    int16x8_t v28 = vld1q_s16(in + in_stride * 18 + i);
    int16x8_t v29 = vld1q_s16(in + in_stride * 14 + i);
    int16x8_t v30 = vaddq_s16(v28, v29);
    int16x8_t v31 = vaddq_s16(v27, v30);
    int16x8_t v32 = vld1q_s16(in + in_stride * 10 + i);
    int16x8_t v33 = vld1q_s16(in + in_stride * 6 + i);
    int16x8_t v34 = vaddq_s16(v32, v33);
    int16x8_t v35 = vqrdmulhq_n_s16(v34, 25080);
    int16x8_t v36 = vld1q_s16(in + in_stride * 26 + i);
    int16x8_t v37 = vld1q_s16(in + in_stride * 22 + i);
    int16x8_t v38 = vaddq_s16(v36, v37);
    int16x8_t v39 = vaddq_s16(v38, v34);
    int16x8_t v40 = vqrdmulhq_n_s16(v39, 17734);
    int16x8_t v41 = vaddq_s16(v35, v40);
    int16x8_t v42 = vaddq_s16(v31, v41);
    int16x8_t v43 = vaddq_s16(v33, v26);
    int16x8_t v44_tmp = vqrdmulhq_n_s16(v43, 13573);
    int16x8_t v44 = vaddq_s16(v44_tmp, v43);
    int16x8_t v45 = vaddq_s16(v29, v32);
    int16x8_t v46 = vaddq_s16(v37, v28);
    int16x8_t v47 = vaddq_s16(v45, v46);
    int16x8_t v48 = vaddq_s16(v44, v47);
    int16x8_t v49 = vqrdmulhq_n_s16(v48, 16705);
    int16x8_t v50 = vld1q_s16(in + in_stride * 30 + i);
    int16x8_t v51 = vaddq_s16(v50, v36);
    int16x8_t v52 = vaddq_s16(v51, v46);
    int16x8_t v53 = vqrdmulhq_n_s16(v52, 17734);
    int16x8_t v54 = vaddq_s16(v45, v43);
    int16x8_t v55_tmp = vqrdmulhq_n_s16(v54, 10045);
    int16x8_t v55 = vaddq_s16(v55_tmp, v54);
    int16x8_t v56 = vaddq_s16(v53, v55);
    int16x8_t v57 = vqrdmulhq_n_s16(v56, 16705);
    int16x8_t v58 = vaddq_s16(v49, v57);
    int16x8_t v59 = vaddq_s16(v42, v58);
    int16x8_t v60 = vqrdmulhq_n_s16(v59, 16463);
    int16x8_t v61 = vaddq_s16(v25, v60);
    // v62..v142 are built exclusively from the odd-indexed input rows
    // (1, 3, ..., 31); v143 then merges them with the even-row sum v61.
    int16x8_t v62 = vld1q_s16(in + in_stride * 13 + i);
    int16x8_t v63 = vld1q_s16(in + in_stride * 11 + i);
    int16x8_t v64 = vaddq_s16(v62, v63);
    int16x8_t v65 = vld1q_s16(in + in_stride * 5 + i);
    int16x8_t v66 = vld1q_s16(in + in_stride * 3 + i);
    int16x8_t v67 = vaddq_s16(v65, v66);
    int16x8_t v68 = vaddq_s16(v64, v67);
    int16x8_t v69_tmp = vqrdmulhq_n_s16(v68, 10045);
    int16x8_t v69 = vaddq_s16(v69_tmp, v68);
    int16x8_t v70 = vld1q_s16(in + in_stride * 21 + i);
    int16x8_t v71 = vld1q_s16(in + in_stride * 19 + i);
    int16x8_t v72 = vaddq_s16(v70, v71);
    int16x8_t v73 = vld1q_s16(in + in_stride * 29 + i);
    int16x8_t v74 = vld1q_s16(in + in_stride * 27 + i);
    int16x8_t v75 = vaddq_s16(v73, v74);
    int16x8_t v76 = vaddq_s16(v72, v75);
    int16x8_t v77 = vqrdmulhq_n_s16(v76, 17734);
    int16x8_t v78 = vaddq_s16(v69, v77);
    int16x8_t v79 = vqrdmulhq_n_s16(v78, 16705);
    int16x8_t v80_tmp = vqrdmulhq_n_s16(v67, 13573);
    int16x8_t v80 = vaddq_s16(v80_tmp, v67);
    int16x8_t v81 = vaddq_s16(v64, v72);
    int16x8_t v82 = vaddq_s16(v80, v81);
    int16x8_t v83 = vqrdmulhq_n_s16(v82, 16705);
    int16x8_t v84 = vaddq_s16(v79, v83);
    int16x8_t v85 = vld1q_s16(in + in_stride * 1 + i);
    int16x8_t v86_tmp = vqrdmulhq_n_s16(v85, 13573);
    int16x8_t v86 = vaddq_s16(v86_tmp, v85);
    int16x8_t v87 = vld1q_s16(in + in_stride * 17 + i);
    int16x8_t v88 = vld1q_s16(in + in_stride * 15 + i);
    int16x8_t v89 = vaddq_s16(v87, v88);
    int16x8_t v90 = vaddq_s16(v86, v89);
    int16x8_t v91 = vld1q_s16(in + in_stride * 9 + i);
    int16x8_t v92 = vld1q_s16(in + in_stride * 7 + i);
    int16x8_t v93 = vaddq_s16(v91, v92);
    int16x8_t v94 = vqrdmulhq_n_s16(v93, 25080);
    int16x8_t v95 = vld1q_s16(in + in_stride * 25 + i);
    int16x8_t v96 = vld1q_s16(in + in_stride * 23 + i);
    int16x8_t v97 = vaddq_s16(v95, v96);
    int16x8_t v98 = vaddq_s16(v97, v93);
    int16x8_t v99 = vqrdmulhq_n_s16(v98, 17734);
    int16x8_t v100 = vaddq_s16(v94, v99);
    int16x8_t v101 = vaddq_s16(v90, v100);
    int16x8_t v102 = vaddq_s16(v84, v101);
    int16x8_t v103 = vaddq_s16(v92, v65);
    int16x8_t v104 = vaddq_s16(v66, v85);
    int16x8_t v105 = vaddq_s16(v103, v104);
    int16x8_t v106_tmp = vqrdmulhq_n_s16(v105, 13573);
    int16x8_t v106 = vaddq_s16(v106_tmp, v105);
    int16x8_t v107 = vaddq_s16(v96, v70);
    int16x8_t v108 = vaddq_s16(v71, v87);
    int16x8_t v109 = vaddq_s16(v107, v108);
    int16x8_t v110 = vaddq_s16(v63, v91);
    int16x8_t v111 = vaddq_s16(v88, v62);
    int16x8_t v112 = vaddq_s16(v110, v111);
    int16x8_t v113 = vaddq_s16(v109, v112);
    int16x8_t v114 = vaddq_s16(v106, v113);
    int16x8_t v115 = vqrdmulhq_n_s16(v114, 16705);
    int16x8_t v116 = vaddq_s16(v112, v105);
    int16x8_t v117 = vqrdmulhq_n_s16(v116, 25080);
    int16x8_t v118 = vqrdmulhq_n_s16(v116, 17734);
    int16x8_t v119 = vaddq_s16(v74, v95);
    int16x8_t v120 = vld1q_s16(in + in_stride * 31 + i);
    int16x8_t v121 = vaddq_s16(v120, v73);
    int16x8_t v122 = vaddq_s16(v119, v121);
    int16x8_t v123 = vaddq_s16(v122, v109);
    int16x8_t v124 = vqrdmulhq_n_s16(v123, 17734);
    int16x8_t v125 = vaddq_s16(v118, v124);
    int16x8_t v126 = vaddq_s16(v117, v125);
    int16x8_t v127 = vqrdmulhq_n_s16(v126, 16705);
    int16x8_t v128 = vaddq_s16(v115, v127);
    int16x8_t v129 = vqrdmulhq_n_s16(v128, 16463);
    int16x8_t v130_tmp = vqrdmulhq_n_s16(v104, 13573);
    int16x8_t v130 = vaddq_s16(v130_tmp, v104);
    int16x8_t v131 = vaddq_s16(v108, v111);
    int16x8_t v132 = vaddq_s16(v130, v131);
    int16x8_t v133 = vaddq_s16(v119, v107);
    int16x8_t v134 = vqrdmulhq_n_s16(v133, 17734);
    int16x8_t v135 = vaddq_s16(v110, v103);
    int16x8_t v136_tmp = vqrdmulhq_n_s16(v135, 10045);
    int16x8_t v136 = vaddq_s16(v136_tmp, v135);
    int16x8_t v137 = vaddq_s16(v134, v136);
    int16x8_t v138 = vaddq_s16(v132, v137);
    int16x8_t v139 = vqrdmulhq_n_s16(v138, 16463);
    int16x8_t v140 = vaddq_s16(v129, v139);
    int16x8_t v141 = vaddq_s16(v102, v140);
    int16x8_t v142 = vqrdmulhq_n_s16(v141, 16404);
    int16x8_t v143 = vaddq_s16(v61, v142);
    // No further loads from here on: every remaining value is derived from the
    // intermediates above, mostly by taking the difference where the first
    // pass took the sum and rescaling it.
    int16x8_t v144 = vsubq_s16(v0, v1);
    int16x8_t v145 = vsubq_s16(v4, v6);
    int16x8_t v146_tmp = vqrdmulhq_n_s16(v145, 10045);
    int16x8_t v146 = vaddq_s16(v146_tmp, v145);
    int16x8_t v147 = vaddq_s16(v144, v146);
    int16x8_t v148 = vsubq_s16(v11, v14);
    int16x8_t v149 = vqrdmulhq_n_s16(v18, 17734);
    int16x8_t v150_tmp = vqrdmulhq_n_s16(v17, 10045);
    int16x8_t v150 = vaddq_s16(v150_tmp, v17);
    int16x8_t v151 = vsubq_s16(v149, v150);
    int16x8_t v152 = vaddq_s16(v148, v151);
    int16x8_t v153 = vqrdmulhq_n_s16(v152, 19705);
    int16x8_t v154 = vaddq_s16(v147, v153);
    int16x8_t v155 = vsubq_s16(v27, v30);
    int16x8_t v156 = vqrdmulhq_n_s16(v34, 17734);
    int16x8_t v157_tmp = vqrdmulhq_n_s16(v38, 10045);
    int16x8_t v157 = vaddq_s16(v157_tmp, v38);
    int16x8_t v158 = vsubq_s16(v156, v157);
    int16x8_t v159 = vaddq_s16(v155, v158);
    int16x8_t v160 = vqrdmulhq_n_s16(v54, 13573);
    int16x8_t v161 = vsubq_s16(v160, v52);
    int16x8_t v162 = vqrdmulhq_n_s16(v161, 25746);
    int16x8_t v163 = vsubq_s16(v44, v47);
    int16x8_t v164 = vqrdmulhq_n_s16(v163, 19705);
    int16x8_t v165 = vaddq_s16(v162, v164);
    int16x8_t v166 = vaddq_s16(v159, v165);
    int16x8_t v167 = vqrdmulhq_n_s16(v166, 17121);
    int16x8_t v168 = vaddq_s16(v154, v167);
    int16x8_t v169 = vsubq_s16(v86, v89);
    int16x8_t v170 = vqrdmulhq_n_s16(v93, 17734);
    int16x8_t v171_tmp = vqrdmulhq_n_s16(v97, 10045);
    int16x8_t v171 = vaddq_s16(v171_tmp, v97);
    int16x8_t v172 = vsubq_s16(v170, v171);
    int16x8_t v173 = vaddq_s16(v169, v172);
    int16x8_t v174 = vsubq_s16(v80, v81);
    int16x8_t v175 = vqrdmulhq_n_s16(v174, 19705);
    int16x8_t v176 = vqrdmulhq_n_s16(v68, 13573);
    int16x8_t v177 = vsubq_s16(v176, v76);
    int16x8_t v178 = vqrdmulhq_n_s16(v177, 25746);
    int16x8_t v179 = vaddq_s16(v175, v178);
    int16x8_t v180 = vaddq_s16(v173, v179);
    int16x8_t v181 = vsubq_s16(v130, v131);
    int16x8_t v182 = vqrdmulhq_n_s16(v135, 13573);
    int16x8_t v183 = vsubq_s16(v182, v133);
    int16x8_t v184_tmp = vqrdmulhq_n_s16(v183, 10045);
    int16x8_t v184 = vaddq_s16(v184_tmp, v183);
    int16x8_t v185 = vaddq_s16(v181, v184);
    int16x8_t v186 = vqrdmulhq_n_s16(v185, 17121);
    int16x8_t v187 = vqrdmulhq_n_s16(v105, 27867);
    int16x8_t v188 = vqrdmulhq_n_s16(v113, 19705);
    int16x8_t v189 = vsubq_s16(v187, v188);
    int16x8_t v190 = vqrdmulhq_n_s16(v116, 13573);
    int16x8_t v191 = vsubq_s16(v190, v123);
    int16x8_t v192 = vqrdmulhq_n_s16(v191, 25746);
    int16x8_t v193 = vaddq_s16(v189, v192);
    int16x8_t v194 = vqrdmulhq_n_s16(v193, 17121);
    int16x8_t v195 = vaddq_s16(v186, v194);
    int16x8_t v196 = vaddq_s16(v180, v195);
    int16x8_t v197 = vqrdmulhq_n_s16(v196, 16563);
    int16x8_t v198 = vaddq_s16(v168, v197);
    int16x8_t v199 = vsubq_s16(v144, v146);
    int16x8_t v200 = vsubq_s16(v148, v151);
    int16x8_t v201 = vqrdmulhq_n_s16(v200, 29490);
    int16x8_t v202 = vaddq_s16(v199, v201);
    int16x8_t v203 = vsubq_s16(v155, v158);
    int16x8_t v204 = vqrdmulhq_n_s16(v163, 29490);
    int16x8_t v205_tmp = vqrdmulhq_n_s16(v161, 5763);
    int16x8_t v205 = vaddq_s16(v205_tmp, v161);
    int16x8_t v206 = vsubq_s16(v204, v205);
    int16x8_t v207 = vaddq_s16(v203, v206);
    int16x8_t v208 = vqrdmulhq_n_s16(v207, 18578);
    int16x8_t v209 = vaddq_s16(v202, v208);
    int16x8_t v210 = vsubq_s16(v169, v172);
    int16x8_t v211 = vqrdmulhq_n_s16(v174, 29490);
    int16x8_t v212_tmp = vqrdmulhq_n_s16(v177, 5763);
    int16x8_t v212 = vaddq_s16(v212_tmp, v177);
    int16x8_t v213 = vsubq_s16(v211, v212);
    int16x8_t v214 = vaddq_s16(v210, v213);
    int16x8_t v215 = vsubq_s16(v181, v184);
    int16x8_t v216 = vqrdmulhq_n_s16(v215, 18578);
    int16x8_t v217 = vqrdmulhq_n_s16(v189, 27803);
    int16x8_t v218 = vqrdmulhq_n_s16(v191, 21845);
    int16x8_t v219 = vsubq_s16(v217, v218);
    int16x8_t v220 = vaddq_s16(v216, v219);
    int16x8_t v221 = vaddq_s16(v214, v220);
    int16x8_t v222 = vqrdmulhq_n_s16(v221, 16890);
    int16x8_t v223 = vaddq_s16(v209, v222);
    int16x8_t v224 = vsubq_s16(v2, v8);
    int16x8_t v225 = vsubq_s16(v15, v22);
    int16x8_t v226_tmp = vqrdmulhq_n_s16(v225, 18446);
    int16x8_t v226 = vmlaq_n_s16(v226_tmp, v225, 2);
    int16x8_t v227 = vaddq_s16(v224, v226);
    int16x8_t v228 = vsubq_s16(v31, v41);
    int16x8_t v229 = vsubq_s16(v48, v56);
    int16x8_t v230_tmp = vqrdmulhq_n_s16(v229, 18446);
    int16x8_t v230 = vmlaq_n_s16(v230_tmp, v229, 2);
    int16x8_t v231 = vaddq_s16(v228, v230);
    int16x8_t v232 = vqrdmulhq_n_s16(v231, 21195);
    int16x8_t v233 = vaddq_s16(v227, v232);
    int16x8_t v234 = vsubq_s16(v82, v78);
    int16x8_t v235_tmp = vqrdmulhq_n_s16(v234, 18446);
    int16x8_t v235 = vmlaq_n_s16(v235_tmp, v234, 2);
    int16x8_t v236 = vsubq_s16(v90, v100);
    int16x8_t v237 = vaddq_s16(v235, v236);
    int16x8_t v238 = vsubq_s16(v132, v137);
    int16x8_t v239 = vsubq_s16(v114, v126);
    int16x8_t v240_tmp = vqrdmulhq_n_s16(v239, 18446);
    int16x8_t v240 = vmlaq_n_s16(v240_tmp, v239, 2);
    int16x8_t v241 = vaddq_s16(v238, v240);
    int16x8_t v242 = vqrdmulhq_n_s16(v241, 21195);
    int16x8_t v243 = vaddq_s16(v237, v242);
    int16x8_t v244 = vqrdmulhq_n_s16(v243, 17401);
    int16x8_t v245 = vaddq_s16(v233, v244);
    int16x8_t v246 = vsubq_s16(v228, v230);
    int16x8_t v247 = vqrdmulhq_n_s16(v246, 25826);
    int16x8_t v248 = vsubq_s16(v224, v226);
    int16x8_t v249 = vaddq_s16(v247, v248);
    int16x8_t v250 = vsubq_s16(v238, v240);
    int16x8_t v251 = vqrdmulhq_n_s16(v250, 25826);
    int16x8_t v252 = vsubq_s16(v236, v235);
    int16x8_t v253 = vaddq_s16(v251, v252);
    int16x8_t v254 = vqrdmulhq_n_s16(v253, 18124);
    int16x8_t v255 = vaddq_s16(v249, v254);
    int16x8_t v256 = vsubq_s16(v199, v201);
    int16x8_t v257 = vsubq_s16(v203, v206);
    int16x8_t v258_tmp = vqrdmulhq_n_s16(v257, 1988);
    int16x8_t v258 = vaddq_s16(v258_tmp, v257);
    int16x8_t v259 = vaddq_s16(v256, v258);
    int16x8_t v260 = vsubq_s16(v210, v213);
    int16x8_t v261_tmp = vqrdmulhq_n_s16(v219, 25030);
    int16x8_t v261 = vaddq_s16(v261_tmp, v219);
    int16x8_t v262 = vsubq_s16(v215, v261);
    int16x8_t v263_tmp = vqrdmulhq_n_s16(v262, 1988);
    int16x8_t v263 = vaddq_s16(v263_tmp, v262);
    int16x8_t v264 = vaddq_s16(v260, v263);
    int16x8_t v265 = vqrdmulhq_n_s16(v264, 19102);
    int16x8_t v266 = vaddq_s16(v259, v265);
    int16x8_t v267 = vsubq_s16(v147, v153);
    int16x8_t v268 = vsubq_s16(v159, v165);
    int16x8_t v269_tmp = vqrdmulhq_n_s16(v268, 23673);
    int16x8_t v269 = vaddq_s16(v269_tmp, v268);
    int16x8_t v270 = vaddq_s16(v267, v269);
    int16x8_t v271 = vsubq_s16(v173, v179);
    int16x8_t v272 = vsubq_s16(v185, v193);
    int16x8_t v273_tmp = vqrdmulhq_n_s16(v272, 23673);
    int16x8_t v273 = vaddq_s16(v273_tmp, v272);
    int16x8_t v274 = vaddq_s16(v271, v273);
    int16x8_t v275 = vqrdmulhq_n_s16(v274, 20398);
    int16x8_t v276 = vaddq_s16(v270, v275);
    int16x8_t v277 = vsubq_s16(v9, v24);
    int16x8_t v278 = vsubq_s16(v42, v58);
    int16x8_t v279_tmp = vqrdmulhq_n_s16(v278, 3314);
    int16x8_t v279 = vmlaq_n_s16(v279_tmp, v278, 5);
    int16x8_t v280 = vaddq_s16(v277, v279);
    int16x8_t v281 = vsubq_s16(v138, v128);
    int16x8_t v282_tmp = vqrdmulhq_n_s16(v281, 3314);
    int16x8_t v282 = vmlaq_n_s16(v282_tmp, v281, 5);
    int16x8_t v283 = vsubq_s16(v101, v84);
    int16x8_t v284 = vaddq_s16(v282, v283);
    int16x8_t v285 = vqrdmulhq_n_s16(v284, 22112);
    int16x8_t v286 = vaddq_s16(v280, v285);
    int16x8_t v287 = vsubq_s16(v277, v279);
    int16x8_t v288 = vsubq_s16(v283, v282);
    int16x8_t v289 = vqrdmulhq_n_s16(v288, 24397);
    int16x8_t v290 = vaddq_s16(v287, v289);
    int16x8_t v291 = vsubq_s16(v267, v269);
    int16x8_t v292 = vsubq_s16(v271, v273);
    int16x8_t v293 = vqrdmulhq_n_s16(v292, 27504);
    int16x8_t v294 = vaddq_s16(v291, v293);
    int16x8_t v295 = vsubq_s16(v260, v263);
    int16x8_t v296 = vqrdmulhq_n_s16(v295, 31869);
    int16x8_t v297 = vsubq_s16(v256, v258);
    int16x8_t v298 = vaddq_s16(v296, v297);
    int16x8_t v299 = vsubq_s16(v248, v247);
    int16x8_t v300 = vsubq_s16(v252, v251);
    int16x8_t v301_tmp = vqrdmulhq_n_s16(v300, 5552);
    int16x8_t v301 = vaddq_s16(v301_tmp, v300);
    int16x8_t v302 = vaddq_s16(v299, v301);
    int16x8_t v303 = vsubq_s16(v227, v232);
    int16x8_t v304 = vsubq_s16(v237, v242);
    int16x8_t v305_tmp = vqrdmulhq_n_s16(v304, 15865);
    int16x8_t v305 = vaddq_s16(v305_tmp, v304);
    int16x8_t v306 = vaddq_s16(v303, v305);
    int16x8_t v307 = vsubq_s16(v202, v208);
    int16x8_t v308 = vsubq_s16(v214, v220);
    int16x8_t v309_tmp = vqrdmulhq_n_s16(v308, 1893);
    int16x8_t v309 = vmlaq_n_s16(v309_tmp, v308, 2);
    int16x8_t v310 = vaddq_s16(v307, v309);
    int16x8_t v311 = vsubq_s16(v154, v167);
    int16x8_t v312 = vsubq_s16(v180, v195);
    int16x8_t v313_tmp = vqrdmulhq_n_s16(v312, 13357);
    int16x8_t v313 = vmlaq_n_s16(v313_tmp, v312, 3);
    int16x8_t v314 = vaddq_s16(v311, v313);
    int16x8_t v315 = vsubq_s16(v102, v140);
    int16x8_t v316_tmp = vqrdmulhq_n_s16(v315, 6226);
    int16x8_t v316 = vmlaq_n_s16(v316_tmp, v315, 10);
    int16x8_t v317 = vsubq_s16(v25, v60);
    int16x8_t v318 = vaddq_s16(v316, v317);
    // Output rows 16..31 mirror rows 15..0: row (31 - k) uses the same two
    // operands as row k, subtracted instead of added.
    int16x8_t v319 = vsubq_s16(v317, v316);
    int16x8_t v320 = vsubq_s16(v311, v313);
    int16x8_t v321 = vsubq_s16(v307, v309);
    int16x8_t v322 = vsubq_s16(v303, v305);
    int16x8_t v323 = vsubq_s16(v299, v301);
    int16x8_t v324 = vsubq_s16(v297, v296);
    int16x8_t v325 = vsubq_s16(v291, v293);
    int16x8_t v326 = vsubq_s16(v287, v289);
    int16x8_t v327 = vsubq_s16(v280, v285);
    int16x8_t v328 = vsubq_s16(v270, v275);
    int16x8_t v329 = vsubq_s16(v259, v265);
    int16x8_t v330 = vsubq_s16(v249, v254);
    int16x8_t v331 = vsubq_s16(v233, v244);
    int16x8_t v332 = vsubq_s16(v209, v222);
    int16x8_t v333 = vsubq_s16(v168, v197);
    int16x8_t v334 = vsubq_s16(v61, v142);
    // Store the 32 result rows for this group of 8 columns.
    vst1q_s16(out + out_stride * 0 + i, v143);
    vst1q_s16(out + out_stride * 1 + i, v198);
    vst1q_s16(out + out_stride * 2 + i, v223);
    vst1q_s16(out + out_stride * 3 + i, v245);
    vst1q_s16(out + out_stride * 4 + i, v255);
    vst1q_s16(out + out_stride * 5 + i, v266);
    vst1q_s16(out + out_stride * 6 + i, v276);
    vst1q_s16(out + out_stride * 7 + i, v286);
    vst1q_s16(out + out_stride * 8 + i, v290);
    vst1q_s16(out + out_stride * 9 + i, v294);
    vst1q_s16(out + out_stride * 10 + i, v298);
    vst1q_s16(out + out_stride * 11 + i, v302);
    vst1q_s16(out + out_stride * 12 + i, v306);
    vst1q_s16(out + out_stride * 13 + i, v310);
    vst1q_s16(out + out_stride * 14 + i, v314);
    vst1q_s16(out + out_stride * 15 + i, v318);
    vst1q_s16(out + out_stride * 16 + i, v319);
    vst1q_s16(out + out_stride * 17 + i, v320);
    vst1q_s16(out + out_stride * 18 + i, v321);
    vst1q_s16(out + out_stride * 19 + i, v322);
    vst1q_s16(out + out_stride * 20 + i, v323);
    vst1q_s16(out + out_stride * 21 + i, v324);
    vst1q_s16(out + out_stride * 22 + i, v325);
    vst1q_s16(out + out_stride * 23 + i, v326);
    vst1q_s16(out + out_stride * 24 + i, v327);
    vst1q_s16(out + out_stride * 25 + i, v328);
    vst1q_s16(out + out_stride * 26 + i, v329);
    vst1q_s16(out + out_stride * 27 + i, v330);
    vst1q_s16(out + out_stride * 28 + i, v331);
    vst1q_s16(out + out_stride * 29 + i, v332);
    vst1q_s16(out + out_stride * 30 + i, v333);
    vst1q_s16(out + out_stride * 31 + i, v334);
  }
}