sve_helper.c
1 /* 2 * ARM SVE Operations 3 * 4 * Copyright (c) 2018 Linaro, Ltd. 5 * 6 * This library is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU Lesser General Public 8 * License as published by the Free Software Foundation; either 9 * version 2.1 of the License, or (at your option) any later version. 10 * 11 * This library is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 * Lesser General Public License for more details. 15 * 16 * You should have received a copy of the GNU Lesser General Public 17 * License along with this library; if not, see <http://www.gnu.org/licenses/>. 18 */ 19 20 #include "qemu/osdep.h" 21 #include "cpu.h" 22 #include "internals.h" 23 #include "exec/exec-all.h" 24 #include "exec/helper-proto.h" 25 #include "tcg/tcg-gvec-desc.h" 26 #include "fpu/softfloat.h" 27 #include "tcg/tcg.h" 28 #include "vec_internal.h" 29 #include "sve_ldst_internal.h" 30 31 32 /* Return a value for NZCV as per the ARM PredTest pseudofunction. 33 * 34 * The return value has bit 31 set if N is set, bit 1 set if Z is clear, 35 * and bit 0 set if C is set. Compare the definitions of these variables 36 * within CPUARMState. 37 */ 38 39 /* For no G bits set, NZCV = C. */ 40 #define PREDTEST_INIT 1 41 42 /* This is an iterative function, called for each Pd and Pg word 43 * moving forward. 44 */ 45 static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags) 46 { 47 if (likely(g)) { 48 /* Compute N from first D & G. 49 Use bit 2 to signal first G bit seen. */ 50 if (!(flags & 4)) { 51 flags |= ((d & (g & -g)) != 0) << 31; 52 flags |= 4; 53 } 54 55 /* Accumulate Z from each D & G. */ 56 flags |= ((d & g) != 0) << 1; 57 58 /* Compute C from last !(D & G). Replace previous. */ 59 flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0); 60 } 61 return flags; 62 } 63 64 /* This is an iterative function, called for each Pd and Pg word 65 * moving backward. 66 */ 67 static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags) 68 { 69 if (likely(g)) { 70 /* Compute C from first (i.e last) !(D & G). 71 Use bit 2 to signal first G bit seen. */ 72 if (!(flags & 4)) { 73 flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */ 74 flags |= (d & pow2floor(g)) == 0; 75 } 76 77 /* Accumulate Z from each D & G. */ 78 flags |= ((d & g) != 0) << 1; 79 80 /* Compute N from last (i.e first) D & G. Replace previous. */ 81 flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0); 82 } 83 return flags; 84 } 85 86 /* The same for a single word predicate. */ 87 uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g) 88 { 89 return iter_predtest_fwd(d, g, PREDTEST_INIT); 90 } 91 92 /* The same for a multi-word predicate. */ 93 uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words) 94 { 95 uint32_t flags = PREDTEST_INIT; 96 uint64_t *d = vd, *g = vg; 97 uintptr_t i = 0; 98 99 do { 100 flags = iter_predtest_fwd(d[i], g[i], flags); 101 } while (++i < words); 102 103 return flags; 104 } 105 106 /* Similarly for single word elements. 
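 *
 * Only predicate flag bits 0 and 4 of the byte are significant for
 * 32-bit elements, hence the (byte & 0x11) index below.  A branchy
 * scalar sketch of the same lookup (illustrative only, not part of
 * the original file) would be:
 *
 *   uint64_t lo = (byte & 0x01) ? 0x00000000ffffffffull : 0;
 *   uint64_t hi = (byte & 0x10) ? 0xffffffff00000000ull : 0;
 *   return lo | hi;
 *
 * The sparse table below trades a few unused entries for a
 * branch-free load.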
*/ 107 static inline uint64_t expand_pred_s(uint8_t byte) 108 { 109 static const uint64_t word[] = { 110 [0x01] = 0x00000000ffffffffull, 111 [0x10] = 0xffffffff00000000ull, 112 [0x11] = 0xffffffffffffffffull, 113 }; 114 return word[byte & 0x11]; 115 } 116 117 #define LOGICAL_PPPP(NAME, FUNC) \ 118 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 119 { \ 120 uintptr_t opr_sz = simd_oprsz(desc); \ 121 uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \ 122 uintptr_t i; \ 123 for (i = 0; i < opr_sz / 8; ++i) { \ 124 d[i] = FUNC(n[i], m[i], g[i]); \ 125 } \ 126 } 127 128 #define DO_AND(N, M, G) (((N) & (M)) & (G)) 129 #define DO_BIC(N, M, G) (((N) & ~(M)) & (G)) 130 #define DO_EOR(N, M, G) (((N) ^ (M)) & (G)) 131 #define DO_ORR(N, M, G) (((N) | (M)) & (G)) 132 #define DO_ORN(N, M, G) (((N) | ~(M)) & (G)) 133 #define DO_NOR(N, M, G) (~((N) | (M)) & (G)) 134 #define DO_NAND(N, M, G) (~((N) & (M)) & (G)) 135 #define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G))) 136 137 LOGICAL_PPPP(sve_and_pppp, DO_AND) 138 LOGICAL_PPPP(sve_bic_pppp, DO_BIC) 139 LOGICAL_PPPP(sve_eor_pppp, DO_EOR) 140 LOGICAL_PPPP(sve_sel_pppp, DO_SEL) 141 LOGICAL_PPPP(sve_orr_pppp, DO_ORR) 142 LOGICAL_PPPP(sve_orn_pppp, DO_ORN) 143 LOGICAL_PPPP(sve_nor_pppp, DO_NOR) 144 LOGICAL_PPPP(sve_nand_pppp, DO_NAND) 145 146 #undef DO_AND 147 #undef DO_BIC 148 #undef DO_EOR 149 #undef DO_ORR 150 #undef DO_ORN 151 #undef DO_NOR 152 #undef DO_NAND 153 #undef DO_SEL 154 #undef LOGICAL_PPPP 155 156 /* Fully general three-operand expander, controlled by a predicate. 157 * This is complicated by the host-endian storage of the register file. 158 */ 159 /* ??? I don't expect the compiler could ever vectorize this itself. 160 * With some tables we can convert bit masks to byte masks, and with 161 * extra care wrt byte/word ordering we could use gcc generic vectors 162 * and do 16 bytes at a time. 163 */ 164 #define DO_ZPZZ(NAME, TYPE, H, OP) \ 165 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 166 { \ 167 intptr_t i, opr_sz = simd_oprsz(desc); \ 168 for (i = 0; i < opr_sz; ) { \ 169 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 170 do { \ 171 if (pg & 1) { \ 172 TYPE nn = *(TYPE *)(vn + H(i)); \ 173 TYPE mm = *(TYPE *)(vm + H(i)); \ 174 *(TYPE *)(vd + H(i)) = OP(nn, mm); \ 175 } \ 176 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 177 } while (i & 15); \ 178 } \ 179 } 180 181 /* Similarly, specialized for 64-bit operands. */ 182 #define DO_ZPZZ_D(NAME, TYPE, OP) \ 183 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 184 { \ 185 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 186 TYPE *d = vd, *n = vn, *m = vm; \ 187 uint8_t *pg = vg; \ 188 for (i = 0; i < opr_sz; i += 1) { \ 189 if (pg[H1(i)] & 1) { \ 190 TYPE nn = n[i], mm = m[i]; \ 191 d[i] = OP(nn, mm); \ 192 } \ 193 } \ 194 } 195 196 #define DO_AND(N, M) (N & M) 197 #define DO_EOR(N, M) (N ^ M) 198 #define DO_ORR(N, M) (N | M) 199 #define DO_BIC(N, M) (N & ~M) 200 #define DO_ADD(N, M) (N + M) 201 #define DO_SUB(N, M) (N - M) 202 #define DO_MAX(N, M) ((N) >= (M) ? (N) : (M)) 203 #define DO_MIN(N, M) ((N) >= (M) ? (M) : (N)) 204 #define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N)) 205 #define DO_MUL(N, M) (N * M) 206 207 208 /* 209 * We must avoid the C undefined behaviour cases: division by 210 * zero and signed division of INT_MIN by -1. Both of these 211 * have architecturally defined required results for Arm. 
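 * (Division by zero is required to return 0, and INT_MIN / -1 is
 * required to wrap to INT_MIN.)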
212 * We special case all signed divisions by -1 to avoid having 213 * to deduce the minimum integer for the type involved. 214 */ 215 #define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M) 216 #define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M) 217 218 DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND) 219 DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND) 220 DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND) 221 DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND) 222 223 DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR) 224 DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR) 225 DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR) 226 DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR) 227 228 DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR) 229 DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR) 230 DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR) 231 DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR) 232 233 DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC) 234 DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC) 235 DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC) 236 DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC) 237 238 DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD) 239 DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD) 240 DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD) 241 DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD) 242 243 DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB) 244 DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB) 245 DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB) 246 DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB) 247 248 DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX) 249 DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX) 250 DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX) 251 DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX) 252 253 DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX) 254 DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX) 255 DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX) 256 DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX) 257 258 DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN) 259 DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN) 260 DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN) 261 DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN) 262 263 DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN) 264 DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN) 265 DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN) 266 DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN) 267 268 DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD) 269 DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD) 270 DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD) 271 DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD) 272 273 DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD) 274 DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD) 275 DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD) 276 DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD) 277 278 /* Because the computation type is at least twice as large as required, 279 these work for both signed and unsigned source types. 
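   E.g. do_mulh_b() below forms the exact 16-bit product in int32_t and
   returns bits [15:8]; whether the element was loaded as int8_t or
   uint8_t only changes the value promoted into the int32_t argument,
   not the arithmetic that follows.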
*/ 280 static inline uint8_t do_mulh_b(int32_t n, int32_t m) 281 { 282 return (n * m) >> 8; 283 } 284 285 static inline uint16_t do_mulh_h(int32_t n, int32_t m) 286 { 287 return (n * m) >> 16; 288 } 289 290 static inline uint32_t do_mulh_s(int64_t n, int64_t m) 291 { 292 return (n * m) >> 32; 293 } 294 295 static inline uint64_t do_smulh_d(uint64_t n, uint64_t m) 296 { 297 uint64_t lo, hi; 298 muls64(&lo, &hi, n, m); 299 return hi; 300 } 301 302 static inline uint64_t do_umulh_d(uint64_t n, uint64_t m) 303 { 304 uint64_t lo, hi; 305 mulu64(&lo, &hi, n, m); 306 return hi; 307 } 308 309 DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL) 310 DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL) 311 DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL) 312 DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL) 313 314 DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b) 315 DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h) 316 DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s) 317 DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d) 318 319 DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b) 320 DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h) 321 DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s) 322 DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d) 323 324 DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV) 325 DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV) 326 327 DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV) 328 DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV) 329 330 /* Note that all bits of the shift are significant 331 and not modulo the element size. */ 332 #define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1)) 333 #define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0) 334 #define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0) 335 336 DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR) 337 DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1_2, DO_LSR) 338 DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1_4, DO_LSL) 339 340 DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1, DO_ASR) 341 DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR) 342 DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_4, DO_LSL) 343 344 DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1, DO_ASR) 345 DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_2, DO_LSR) 346 DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL) 347 348 DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR) 349 DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR) 350 DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL) 351 352 static inline uint16_t do_sadalp_h(int16_t n, int16_t m) 353 { 354 int8_t n1 = n, n2 = n >> 8; 355 return m + n1 + n2; 356 } 357 358 static inline uint32_t do_sadalp_s(int32_t n, int32_t m) 359 { 360 int16_t n1 = n, n2 = n >> 16; 361 return m + n1 + n2; 362 } 363 364 static inline uint64_t do_sadalp_d(int64_t n, int64_t m) 365 { 366 int32_t n1 = n, n2 = n >> 32; 367 return m + n1 + n2; 368 } 369 370 DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h) 371 DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s) 372 DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d) 373 374 static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m) 375 { 376 uint8_t n1 = n, n2 = n >> 8; 377 return m + n1 + n2; 378 } 379 380 static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m) 381 { 382 uint16_t n1 = n, n2 = n >> 16; 383 return m + n1 + n2; 384 } 385 386 static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m) 387 { 388 uint32_t n1 = n, n2 = n >> 32; 389 return m + n1 + n2; 390 } 391 392 DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h) 393 DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s) 394 DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d) 395 396 #define do_srshl_b(n, m) 
do_sqrshl_bhs(n, m, 8, true, NULL) 397 #define do_srshl_h(n, m) do_sqrshl_bhs(n, m, 16, true, NULL) 398 #define do_srshl_s(n, m) do_sqrshl_bhs(n, m, 32, true, NULL) 399 #define do_srshl_d(n, m) do_sqrshl_d(n, m, true, NULL) 400 401 DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b) 402 DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, H1_2, do_srshl_h) 403 DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s) 404 DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d) 405 406 #define do_urshl_b(n, m) do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL) 407 #define do_urshl_h(n, m) do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL) 408 #define do_urshl_s(n, m) do_uqrshl_bhs(n, m, 32, true, NULL) 409 #define do_urshl_d(n, m) do_uqrshl_d(n, m, true, NULL) 410 411 DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b) 412 DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h) 413 DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s) 414 DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d) 415 416 /* 417 * Unlike the NEON and AdvSIMD versions, there is no QC bit to set. 418 * We pass in a pointer to a dummy saturation field to trigger 419 * the saturating arithmetic but discard the information about 420 * whether it has occurred. 421 */ 422 #define do_sqshl_b(n, m) \ 423 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); }) 424 #define do_sqshl_h(n, m) \ 425 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); }) 426 #define do_sqshl_s(n, m) \ 427 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); }) 428 #define do_sqshl_d(n, m) \ 429 ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); }) 430 431 DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1_2, do_sqshl_b) 432 DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h) 433 DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s) 434 DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d) 435 436 #define do_uqshl_b(n, m) \ 437 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); }) 438 #define do_uqshl_h(n, m) \ 439 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); }) 440 #define do_uqshl_s(n, m) \ 441 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); }) 442 #define do_uqshl_d(n, m) \ 443 ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); }) 444 445 DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1_2, do_uqshl_b) 446 DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h) 447 DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s) 448 DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d) 449 450 #define do_sqrshl_b(n, m) \ 451 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); }) 452 #define do_sqrshl_h(n, m) \ 453 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); }) 454 #define do_sqrshl_s(n, m) \ 455 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); }) 456 #define do_sqrshl_d(n, m) \ 457 ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); }) 458 459 DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1_2, do_sqrshl_b) 460 DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h) 461 DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s) 462 DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d) 463 464 #undef do_sqrshl_d 465 466 #define do_uqrshl_b(n, m) \ 467 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); }) 468 #define do_uqrshl_h(n, m) \ 469 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); }) 470 #define do_uqrshl_s(n, m) \ 471 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); }) 472 #define do_uqrshl_d(n, m) \ 473 ({ uint32_t discard; do_uqrshl_d(n, 
m, true, &discard); }) 474 475 DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1_2, do_uqrshl_b) 476 DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h) 477 DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s) 478 DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d) 479 480 #undef do_uqrshl_d 481 482 #define DO_HADD_BHS(n, m) (((int64_t)n + m) >> 1) 483 #define DO_HADD_D(n, m) ((n >> 1) + (m >> 1) + (n & m & 1)) 484 485 DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS) 486 DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS) 487 DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS) 488 DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D) 489 490 DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS) 491 DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS) 492 DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS) 493 DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D) 494 495 #define DO_RHADD_BHS(n, m) (((int64_t)n + m + 1) >> 1) 496 #define DO_RHADD_D(n, m) ((n >> 1) + (m >> 1) + ((n | m) & 1)) 497 498 DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS) 499 DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS) 500 DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS) 501 DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D) 502 503 DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS) 504 DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS) 505 DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS) 506 DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D) 507 508 #define DO_HSUB_BHS(n, m) (((int64_t)n - m) >> 1) 509 #define DO_HSUB_D(n, m) ((n >> 1) - (m >> 1) - (~n & m & 1)) 510 511 DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS) 512 DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS) 513 DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS) 514 DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D) 515 516 DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS) 517 DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS) 518 DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS) 519 DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D) 520 521 static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max) 522 { 523 return val >= max ? max : val <= min ? min : val; 524 } 525 526 #define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX) 527 #define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX) 528 #define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX) 529 530 static inline int64_t do_sqadd_d(int64_t n, int64_t m) 531 { 532 int64_t r = n + m; 533 if (((r ^ n) & ~(n ^ m)) < 0) { 534 /* Signed overflow. */ 535 return r < 0 ? INT64_MAX : INT64_MIN; 536 } 537 return r; 538 } 539 540 DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B) 541 DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H) 542 DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S) 543 DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d) 544 545 #define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX) 546 #define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX) 547 #define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX) 548 549 static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m) 550 { 551 uint64_t r = n + m; 552 return r < n ? 
UINT64_MAX : r; 553 } 554 555 DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B) 556 DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H) 557 DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S) 558 DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d) 559 560 #define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX) 561 #define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX) 562 #define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX) 563 564 static inline int64_t do_sqsub_d(int64_t n, int64_t m) 565 { 566 int64_t r = n - m; 567 if (((r ^ n) & (n ^ m)) < 0) { 568 /* Signed overflow. */ 569 return r < 0 ? INT64_MAX : INT64_MIN; 570 } 571 return r; 572 } 573 574 DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B) 575 DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H) 576 DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S) 577 DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d) 578 579 #define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)n - m, 0, UINT8_MAX) 580 #define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX) 581 #define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX) 582 583 static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m) 584 { 585 return n > m ? n - m : 0; 586 } 587 588 DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B) 589 DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H) 590 DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S) 591 DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d) 592 593 #define DO_SUQADD_B(n, m) \ 594 do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX) 595 #define DO_SUQADD_H(n, m) \ 596 do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX) 597 #define DO_SUQADD_S(n, m) \ 598 do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX) 599 600 static inline int64_t do_suqadd_d(int64_t n, uint64_t m) 601 { 602 uint64_t r = n + m; 603 604 if (n < 0) { 605 /* Note that m - abs(n) cannot underflow. */ 606 if (r > INT64_MAX) { 607 /* Result is either very large positive or negative. */ 608 if (m > -n) { 609 /* m > abs(n), so r is a very large positive. */ 610 return INT64_MAX; 611 } 612 /* Result is negative. */ 613 } 614 } else { 615 /* Both inputs are positive: check for overflow. */ 616 if (r < m || r > INT64_MAX) { 617 return INT64_MAX; 618 } 619 } 620 return r; 621 } 622 623 DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B) 624 DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H) 625 DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S) 626 DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d) 627 628 #define DO_USQADD_B(n, m) \ 629 do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX) 630 #define DO_USQADD_H(n, m) \ 631 do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX) 632 #define DO_USQADD_S(n, m) \ 633 do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX) 634 635 static inline uint64_t do_usqadd_d(uint64_t n, int64_t m) 636 { 637 uint64_t r = n + m; 638 639 if (m < 0) { 640 return n < -m ? 0 : r; 641 } 642 return r < n ? UINT64_MAX : r; 643 } 644 645 DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B) 646 DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H) 647 DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S) 648 DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d) 649 650 #undef DO_ZPZZ 651 #undef DO_ZPZZ_D 652 653 /* 654 * Three operand expander, operating on element pairs. 655 * If the slot I is even, the elements from from VN {I, I+1}. 656 * If the slot I is odd, the elements from from VM {I-1, I}. 
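 * E.g. for sve2_addp_zpzz_h the active lanes become
 *   d[0] = n[0] + n[1],  d[1] = m[0] + m[1],
 *   d[2] = n[2] + n[3],  d[3] = m[2] + m[3],  ...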
657 * Load all of the input elements in each pair before overwriting output. 658 */ 659 #define DO_ZPZZ_PAIR(NAME, TYPE, H, OP) \ 660 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 661 { \ 662 intptr_t i, opr_sz = simd_oprsz(desc); \ 663 for (i = 0; i < opr_sz; ) { \ 664 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 665 do { \ 666 TYPE n0 = *(TYPE *)(vn + H(i)); \ 667 TYPE m0 = *(TYPE *)(vm + H(i)); \ 668 TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ 669 TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ 670 if (pg & 1) { \ 671 *(TYPE *)(vd + H(i)) = OP(n0, n1); \ 672 } \ 673 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 674 if (pg & 1) { \ 675 *(TYPE *)(vd + H(i)) = OP(m0, m1); \ 676 } \ 677 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 678 } while (i & 15); \ 679 } \ 680 } 681 682 /* Similarly, specialized for 64-bit operands. */ 683 #define DO_ZPZZ_PAIR_D(NAME, TYPE, OP) \ 684 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 685 { \ 686 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 687 TYPE *d = vd, *n = vn, *m = vm; \ 688 uint8_t *pg = vg; \ 689 for (i = 0; i < opr_sz; i += 2) { \ 690 TYPE n0 = n[i], n1 = n[i + 1]; \ 691 TYPE m0 = m[i], m1 = m[i + 1]; \ 692 if (pg[H1(i)] & 1) { \ 693 d[i] = OP(n0, n1); \ 694 } \ 695 if (pg[H1(i + 1)] & 1) { \ 696 d[i + 1] = OP(m0, m1); \ 697 } \ 698 } \ 699 } 700 701 DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD) 702 DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD) 703 DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD) 704 DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD) 705 706 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX) 707 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX) 708 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX) 709 DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX) 710 711 DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN) 712 DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN) 713 DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN) 714 DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN) 715 716 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX) 717 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX) 718 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX) 719 DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX) 720 721 DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN) 722 DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN) 723 DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN) 724 DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN) 725 726 #undef DO_ZPZZ_PAIR 727 #undef DO_ZPZZ_PAIR_D 728 729 #define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP) \ 730 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \ 731 void *status, uint32_t desc) \ 732 { \ 733 intptr_t i, opr_sz = simd_oprsz(desc); \ 734 for (i = 0; i < opr_sz; ) { \ 735 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 736 do { \ 737 TYPE n0 = *(TYPE *)(vn + H(i)); \ 738 TYPE m0 = *(TYPE *)(vm + H(i)); \ 739 TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ 740 TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ 741 if (pg & 1) { \ 742 *(TYPE *)(vd + H(i)) = OP(n0, n1, status); \ 743 } \ 744 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 745 if (pg & 1) { \ 746 *(TYPE *)(vd + H(i)) = OP(m0, m1, status); \ 747 } \ 748 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 749 } while (i & 15); \ 750 } \ 751 } 752 753 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add) 754 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add) 755 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64, 
H1_8, float64_add) 756 757 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, H1_2, float16_maxnum) 758 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum) 759 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, H1_8, float64_maxnum) 760 761 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum) 762 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum) 763 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, H1_8, float64_minnum) 764 765 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max) 766 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max) 767 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, H1_8, float64_max) 768 769 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min) 770 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min) 771 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, H1_8, float64_min) 772 773 #undef DO_ZPZZ_PAIR_FP 774 775 /* Three-operand expander, controlled by a predicate, in which the 776 * third operand is "wide". That is, for D = N op M, the same 64-bit 777 * value of M is used with all of the narrower values of N. 778 */ 779 #define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \ 780 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 781 { \ 782 intptr_t i, opr_sz = simd_oprsz(desc); \ 783 for (i = 0; i < opr_sz; ) { \ 784 uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \ 785 TYPEW mm = *(TYPEW *)(vm + i); \ 786 do { \ 787 if (pg & 1) { \ 788 TYPE nn = *(TYPE *)(vn + H(i)); \ 789 *(TYPE *)(vd + H(i)) = OP(nn, mm); \ 790 } \ 791 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 792 } while (i & 7); \ 793 } \ 794 } 795 796 DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR) 797 DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR) 798 DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL) 799 800 DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR) 801 DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR) 802 DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL) 803 804 DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR) 805 DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR) 806 DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL) 807 808 #undef DO_ZPZW 809 810 /* Fully general two-operand expander, controlled by a predicate. 811 */ 812 #define DO_ZPZ(NAME, TYPE, H, OP) \ 813 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 814 { \ 815 intptr_t i, opr_sz = simd_oprsz(desc); \ 816 for (i = 0; i < opr_sz; ) { \ 817 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 818 do { \ 819 if (pg & 1) { \ 820 TYPE nn = *(TYPE *)(vn + H(i)); \ 821 *(TYPE *)(vd + H(i)) = OP(nn); \ 822 } \ 823 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 824 } while (i & 15); \ 825 } \ 826 } 827 828 /* Similarly, specialized for 64-bit operands. 
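   Every 8th predicate bit governs a 64-bit element, i.e. bit 0 of that
   element's own predicate byte, so the data loop can use plain array
   indexing; only the predicate byte lookup needs the H1() swizzle.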
*/ 829 #define DO_ZPZ_D(NAME, TYPE, OP) \ 830 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 831 { \ 832 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 833 TYPE *d = vd, *n = vn; \ 834 uint8_t *pg = vg; \ 835 for (i = 0; i < opr_sz; i += 1) { \ 836 if (pg[H1(i)] & 1) { \ 837 TYPE nn = n[i]; \ 838 d[i] = OP(nn); \ 839 } \ 840 } \ 841 } 842 843 #define DO_CLS_B(N) (clrsb32(N) - 24) 844 #define DO_CLS_H(N) (clrsb32(N) - 16) 845 846 DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B) 847 DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H) 848 DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32) 849 DO_ZPZ_D(sve_cls_d, int64_t, clrsb64) 850 851 #define DO_CLZ_B(N) (clz32(N) - 24) 852 #define DO_CLZ_H(N) (clz32(N) - 16) 853 854 DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B) 855 DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H) 856 DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32) 857 DO_ZPZ_D(sve_clz_d, uint64_t, clz64) 858 859 DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8) 860 DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16) 861 DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32) 862 DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64) 863 864 #define DO_CNOT(N) (N == 0) 865 866 DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT) 867 DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT) 868 DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT) 869 DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT) 870 871 #define DO_FABS(N) (N & ((__typeof(N))-1 >> 1)) 872 873 DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS) 874 DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS) 875 DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS) 876 877 #define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1)) 878 879 DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG) 880 DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG) 881 DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG) 882 883 #define DO_NOT(N) (~N) 884 885 DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT) 886 DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT) 887 DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT) 888 DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT) 889 890 #define DO_SXTB(N) ((int8_t)N) 891 #define DO_SXTH(N) ((int16_t)N) 892 #define DO_SXTS(N) ((int32_t)N) 893 #define DO_UXTB(N) ((uint8_t)N) 894 #define DO_UXTH(N) ((uint16_t)N) 895 #define DO_UXTS(N) ((uint32_t)N) 896 897 DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB) 898 DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB) 899 DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH) 900 DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB) 901 DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH) 902 DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS) 903 904 DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB) 905 DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB) 906 DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH) 907 DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB) 908 DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH) 909 DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS) 910 911 #define DO_ABS(N) (N < 0 ? 
-N : N) 912 913 DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS) 914 DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS) 915 DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS) 916 DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS) 917 918 #define DO_NEG(N) (-N) 919 920 DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG) 921 DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG) 922 DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG) 923 DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG) 924 925 DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16) 926 DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32) 927 DO_ZPZ_D(sve_revb_d, uint64_t, bswap64) 928 929 DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32) 930 DO_ZPZ_D(sve_revh_d, uint64_t, hswap64) 931 932 DO_ZPZ_D(sve_revw_d, uint64_t, wswap64) 933 934 void HELPER(sme_revd_q)(void *vd, void *vn, void *vg, uint32_t desc) 935 { 936 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 937 uint64_t *d = vd, *n = vn; 938 uint8_t *pg = vg; 939 940 for (i = 0; i < opr_sz; i += 2) { 941 if (pg[H1(i)] & 1) { 942 uint64_t n0 = n[i + 0]; 943 uint64_t n1 = n[i + 1]; 944 d[i + 0] = n1; 945 d[i + 1] = n0; 946 } 947 } 948 } 949 950 DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8) 951 DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16) 952 DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32) 953 DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64) 954 955 #define DO_SQABS(X) \ 956 ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \ 957 x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; }) 958 959 DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS) 960 DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS) 961 DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS) 962 DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS) 963 964 #define DO_SQNEG(X) \ 965 ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \ 966 x_ == min_ ? -min_ - 1 : -x_; }) 967 968 DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG) 969 DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG) 970 DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG) 971 DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG) 972 973 DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32) 974 DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32) 975 976 /* Three-operand expander, unpredicated, in which the third operand is "wide". 977 */ 978 #define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \ 979 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 980 { \ 981 intptr_t i, opr_sz = simd_oprsz(desc); \ 982 for (i = 0; i < opr_sz; ) { \ 983 TYPEW mm = *(TYPEW *)(vm + i); \ 984 do { \ 985 TYPE nn = *(TYPE *)(vn + H(i)); \ 986 *(TYPE *)(vd + H(i)) = OP(nn, mm); \ 987 i += sizeof(TYPE); \ 988 } while (i & 7); \ 989 } \ 990 } 991 992 DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR) 993 DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR) 994 DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL) 995 996 DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR) 997 DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR) 998 DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL) 999 1000 DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR) 1001 DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR) 1002 DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL) 1003 1004 #undef DO_ZZW 1005 1006 #undef DO_CLS_B 1007 #undef DO_CLS_H 1008 #undef DO_CLZ_B 1009 #undef DO_CLZ_H 1010 #undef DO_CNOT 1011 #undef DO_FABS 1012 #undef DO_FNEG 1013 #undef DO_ABS 1014 #undef DO_NEG 1015 #undef DO_ZPZ 1016 #undef DO_ZPZ_D 1017 1018 /* 1019 * Three-operand expander, unpredicated, in which the two inputs are 1020 * selected from the top or bottom half of the wide column. 
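 *
 * The two desc bits become byte offsets sel1/sel2, so e.g. the _h
 * forms read either the even (bottom, offset 0) or the odd (top,
 * offset sizeof(TYPEN)) narrow element of each wide column, covering
 * the SVE2 bottom/top pairings such as SADDLB and SADDLT with a
 * single helper.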
1021 */ 1022 #define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP) \ 1023 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1024 { \ 1025 intptr_t i, opr_sz = simd_oprsz(desc); \ 1026 int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ 1027 int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \ 1028 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 1029 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \ 1030 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \ 1031 *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \ 1032 } \ 1033 } 1034 1035 DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD) 1036 DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD) 1037 DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, H1_8, H1_4, DO_ADD) 1038 1039 DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB) 1040 DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB) 1041 DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, H1_8, H1_4, DO_SUB) 1042 1043 DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD) 1044 DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD) 1045 DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, H1_8, H1_4, DO_ABD) 1046 1047 DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD) 1048 DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD) 1049 DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD) 1050 1051 DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB) 1052 DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB) 1053 DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB) 1054 1055 DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD) 1056 DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD) 1057 DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD) 1058 1059 DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL) 1060 DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL) 1061 DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, H1_8, H1_4, DO_MUL) 1062 1063 DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL) 1064 DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL) 1065 DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL) 1066 1067 /* Note that the multiply cannot overflow, but the doubling can. 
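   With int8_t source elements the only pair that saturates in
   do_sqdmull_h is -128 * -128: the product 16384 doubles to 32768,
   which DO_SQADD_H clamps to INT16_MAX.  The _s and _d forms behave
   the same way for INT16_MIN * INT16_MIN and INT32_MIN * INT32_MIN.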
*/ 1068 static inline int16_t do_sqdmull_h(int16_t n, int16_t m) 1069 { 1070 int16_t val = n * m; 1071 return DO_SQADD_H(val, val); 1072 } 1073 1074 static inline int32_t do_sqdmull_s(int32_t n, int32_t m) 1075 { 1076 int32_t val = n * m; 1077 return DO_SQADD_S(val, val); 1078 } 1079 1080 static inline int64_t do_sqdmull_d(int64_t n, int64_t m) 1081 { 1082 int64_t val = n * m; 1083 return do_sqadd_d(val, val); 1084 } 1085 1086 DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h) 1087 DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s) 1088 DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d) 1089 1090 #undef DO_ZZZ_TB 1091 1092 #define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP) \ 1093 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1094 { \ 1095 intptr_t i, opr_sz = simd_oprsz(desc); \ 1096 int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ 1097 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 1098 TYPEW nn = *(TYPEW *)(vn + HW(i)); \ 1099 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \ 1100 *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \ 1101 } \ 1102 } 1103 1104 DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD) 1105 DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD) 1106 DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, H1_8, H1_4, DO_ADD) 1107 1108 DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB) 1109 DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB) 1110 DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, H1_8, H1_4, DO_SUB) 1111 1112 DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD) 1113 DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD) 1114 DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD) 1115 1116 DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB) 1117 DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB) 1118 DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB) 1119 1120 #undef DO_ZZZ_WTB 1121 1122 #define DO_ZZZ_NTB(NAME, TYPE, H, OP) \ 1123 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1124 { \ 1125 intptr_t i, opr_sz = simd_oprsz(desc); \ 1126 intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \ 1127 intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \ 1128 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \ 1129 TYPE nn = *(TYPE *)(vn + H(i + sel1)); \ 1130 TYPE mm = *(TYPE *)(vm + H(i + sel2)); \ 1131 *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm); \ 1132 } \ 1133 } 1134 1135 DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR) 1136 DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR) 1137 DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR) 1138 DO_ZZZ_NTB(sve2_eoril_d, uint64_t, H1_8, DO_EOR) 1139 1140 #undef DO_ZZZ_NTB 1141 1142 #define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP) \ 1143 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1144 { \ 1145 intptr_t i, opr_sz = simd_oprsz(desc); \ 1146 intptr_t sel1 = simd_data(desc) * sizeof(TYPEN); \ 1147 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 1148 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \ 1149 TYPEW mm = *(TYPEN *)(vm + HN(i + sel1)); \ 1150 TYPEW aa = *(TYPEW *)(va + HW(i)); \ 1151 *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa; \ 1152 } \ 1153 } 1154 1155 DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD) 1156 DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD) 1157 DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, H1_8, H1_4, DO_ABD) 1158 1159 
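/*
 * Illustrative sketch, not part of the original file: one lane of the
 * widening absolute-difference-and-accumulate expansion above
 * (sve2_sabal_h), with a hypothetical name.  DO_ABD is applied to the
 * sign-extended narrow elements and the result added to the wide
 * accumulator.
 */
static inline int16_t sabal_h_lane_ref(int16_t acc, int8_t n, int8_t m)
{
    int16_t nn = n, mm = m;   /* widen, as TYPEW nn = *(TYPEN *)(vn + ...) */
    return acc + (nn >= mm ? nn - mm : mm - nn);   /* aa + DO_ABD(nn, mm) */
}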
DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD) 1160 DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD) 1161 DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD) 1162 1163 DO_ZZZW_ACC(sve2_smlal_zzzw_h, int16_t, int8_t, H1_2, H1, DO_MUL) 1164 DO_ZZZW_ACC(sve2_smlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_MUL) 1165 DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_MUL) 1166 1167 DO_ZZZW_ACC(sve2_umlal_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_MUL) 1168 DO_ZZZW_ACC(sve2_umlal_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL) 1169 DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL) 1170 1171 #define DO_NMUL(N, M) -(N * M) 1172 1173 DO_ZZZW_ACC(sve2_smlsl_zzzw_h, int16_t, int8_t, H1_2, H1, DO_NMUL) 1174 DO_ZZZW_ACC(sve2_smlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_NMUL) 1175 DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_NMUL) 1176 1177 DO_ZZZW_ACC(sve2_umlsl_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_NMUL) 1178 DO_ZZZW_ACC(sve2_umlsl_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_NMUL) 1179 DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_NMUL) 1180 1181 #undef DO_ZZZW_ACC 1182 1183 #define DO_XTNB(NAME, TYPE, OP) \ 1184 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1185 { \ 1186 intptr_t i, opr_sz = simd_oprsz(desc); \ 1187 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 1188 TYPE nn = *(TYPE *)(vn + i); \ 1189 nn = OP(nn) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4); \ 1190 *(TYPE *)(vd + i) = nn; \ 1191 } \ 1192 } 1193 1194 #define DO_XTNT(NAME, TYPE, TYPEN, H, OP) \ 1195 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1196 { \ 1197 intptr_t i, opr_sz = simd_oprsz(desc), odd = H(sizeof(TYPEN)); \ 1198 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 1199 TYPE nn = *(TYPE *)(vn + i); \ 1200 *(TYPEN *)(vd + i + odd) = OP(nn); \ 1201 } \ 1202 } 1203 1204 #define DO_SQXTN_H(n) do_sat_bhs(n, INT8_MIN, INT8_MAX) 1205 #define DO_SQXTN_S(n) do_sat_bhs(n, INT16_MIN, INT16_MAX) 1206 #define DO_SQXTN_D(n) do_sat_bhs(n, INT32_MIN, INT32_MAX) 1207 1208 DO_XTNB(sve2_sqxtnb_h, int16_t, DO_SQXTN_H) 1209 DO_XTNB(sve2_sqxtnb_s, int32_t, DO_SQXTN_S) 1210 DO_XTNB(sve2_sqxtnb_d, int64_t, DO_SQXTN_D) 1211 1212 DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, DO_SQXTN_H) 1213 DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, DO_SQXTN_S) 1214 DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, DO_SQXTN_D) 1215 1216 #define DO_UQXTN_H(n) do_sat_bhs(n, 0, UINT8_MAX) 1217 #define DO_UQXTN_S(n) do_sat_bhs(n, 0, UINT16_MAX) 1218 #define DO_UQXTN_D(n) do_sat_bhs(n, 0, UINT32_MAX) 1219 1220 DO_XTNB(sve2_uqxtnb_h, uint16_t, DO_UQXTN_H) 1221 DO_XTNB(sve2_uqxtnb_s, uint32_t, DO_UQXTN_S) 1222 DO_XTNB(sve2_uqxtnb_d, uint64_t, DO_UQXTN_D) 1223 1224 DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, DO_UQXTN_H) 1225 DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, DO_UQXTN_S) 1226 DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, DO_UQXTN_D) 1227 1228 DO_XTNB(sve2_sqxtunb_h, int16_t, DO_UQXTN_H) 1229 DO_XTNB(sve2_sqxtunb_s, int32_t, DO_UQXTN_S) 1230 DO_XTNB(sve2_sqxtunb_d, int64_t, DO_UQXTN_D) 1231 1232 DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, DO_UQXTN_H) 1233 DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, DO_UQXTN_S) 1234 DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, DO_UQXTN_D) 1235 1236 #undef DO_XTNB 1237 #undef DO_XTNT 1238 1239 void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc) 1240 { 1241 intptr_t i, opr_sz = simd_oprsz(desc); 1242 int sel = H4(extract32(desc, SIMD_DATA_SHIFT, 1)); 1243 uint32_t 
inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1); 1244 uint32_t *a = va, *n = vn; 1245 uint64_t *d = vd, *m = vm; 1246 1247 for (i = 0; i < opr_sz / 8; ++i) { 1248 uint32_t e1 = a[2 * i + H4(0)]; 1249 uint32_t e2 = n[2 * i + sel] ^ inv; 1250 uint64_t c = extract64(m[i], 32, 1); 1251 /* Compute and store the entire 33-bit result at once. */ 1252 d[i] = c + e1 + e2; 1253 } 1254 } 1255 1256 void HELPER(sve2_adcl_d)(void *vd, void *vn, void *vm, void *va, uint32_t desc) 1257 { 1258 intptr_t i, opr_sz = simd_oprsz(desc); 1259 int sel = extract32(desc, SIMD_DATA_SHIFT, 1); 1260 uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1); 1261 uint64_t *d = vd, *a = va, *n = vn, *m = vm; 1262 1263 for (i = 0; i < opr_sz / 8; i += 2) { 1264 Int128 e1 = int128_make64(a[i]); 1265 Int128 e2 = int128_make64(n[i + sel] ^ inv); 1266 Int128 c = int128_make64(m[i + 1] & 1); 1267 Int128 r = int128_add(int128_add(e1, e2), c); 1268 d[i + 0] = int128_getlo(r); 1269 d[i + 1] = int128_gethi(r); 1270 } 1271 } 1272 1273 #define DO_SQDMLAL(NAME, TYPEW, TYPEN, HW, HN, DMUL_OP, SUM_OP) \ 1274 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1275 { \ 1276 intptr_t i, opr_sz = simd_oprsz(desc); \ 1277 int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ 1278 int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \ 1279 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 1280 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \ 1281 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \ 1282 TYPEW aa = *(TYPEW *)(va + HW(i)); \ 1283 *(TYPEW *)(vd + HW(i)) = SUM_OP(aa, DMUL_OP(nn, mm)); \ 1284 } \ 1285 } 1286 1287 DO_SQDMLAL(sve2_sqdmlal_zzzw_h, int16_t, int8_t, H1_2, H1, 1288 do_sqdmull_h, DO_SQADD_H) 1289 DO_SQDMLAL(sve2_sqdmlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, 1290 do_sqdmull_s, DO_SQADD_S) 1291 DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, 1292 do_sqdmull_d, do_sqadd_d) 1293 1294 DO_SQDMLAL(sve2_sqdmlsl_zzzw_h, int16_t, int8_t, H1_2, H1, 1295 do_sqdmull_h, DO_SQSUB_H) 1296 DO_SQDMLAL(sve2_sqdmlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, 1297 do_sqdmull_s, DO_SQSUB_S) 1298 DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, 1299 do_sqdmull_d, do_sqsub_d) 1300 1301 #undef DO_SQDMLAL 1302 1303 #define DO_CMLA_FUNC(NAME, TYPE, H, OP) \ 1304 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1305 { \ 1306 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \ 1307 int rot = simd_data(desc); \ 1308 int sel_a = rot & 1, sel_b = sel_a ^ 1; \ 1309 bool sub_r = rot == 1 || rot == 2; \ 1310 bool sub_i = rot >= 2; \ 1311 TYPE *d = vd, *n = vn, *m = vm, *a = va; \ 1312 for (i = 0; i < opr_sz; i += 2) { \ 1313 TYPE elt1_a = n[H(i + sel_a)]; \ 1314 TYPE elt2_a = m[H(i + sel_a)]; \ 1315 TYPE elt2_b = m[H(i + sel_b)]; \ 1316 d[H(i)] = OP(elt1_a, elt2_a, a[H(i)], sub_r); \ 1317 d[H(i + 1)] = OP(elt1_a, elt2_b, a[H(i + 1)], sub_i); \ 1318 } \ 1319 } 1320 1321 #define DO_CMLA(N, M, A, S) (A + (N * M) * (S ? 
-1 : 1)) 1322 1323 DO_CMLA_FUNC(sve2_cmla_zzzz_b, uint8_t, H1, DO_CMLA) 1324 DO_CMLA_FUNC(sve2_cmla_zzzz_h, uint16_t, H2, DO_CMLA) 1325 DO_CMLA_FUNC(sve2_cmla_zzzz_s, uint32_t, H4, DO_CMLA) 1326 DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, H8, DO_CMLA) 1327 1328 #define DO_SQRDMLAH_B(N, M, A, S) \ 1329 do_sqrdmlah_b(N, M, A, S, true) 1330 #define DO_SQRDMLAH_H(N, M, A, S) \ 1331 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, S, true, &discard); }) 1332 #define DO_SQRDMLAH_S(N, M, A, S) \ 1333 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, S, true, &discard); }) 1334 #define DO_SQRDMLAH_D(N, M, A, S) \ 1335 do_sqrdmlah_d(N, M, A, S, true) 1336 1337 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b, int8_t, H1, DO_SQRDMLAH_B) 1338 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h, int16_t, H2, DO_SQRDMLAH_H) 1339 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s, int32_t, H4, DO_SQRDMLAH_S) 1340 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t, H8, DO_SQRDMLAH_D) 1341 1342 #define DO_CMLA_IDX_FUNC(NAME, TYPE, H, OP) \ 1343 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1344 { \ 1345 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1346 int rot = extract32(desc, SIMD_DATA_SHIFT, 2); \ 1347 int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2) * 2; \ 1348 int sel_a = rot & 1, sel_b = sel_a ^ 1; \ 1349 bool sub_r = rot == 1 || rot == 2; \ 1350 bool sub_i = rot >= 2; \ 1351 TYPE *d = vd, *n = vn, *m = vm, *a = va; \ 1352 for (i = 0; i < oprsz / sizeof(TYPE); i += 16 / sizeof(TYPE)) { \ 1353 TYPE elt2_a = m[H(i + idx + sel_a)]; \ 1354 TYPE elt2_b = m[H(i + idx + sel_b)]; \ 1355 for (j = 0; j < 16 / sizeof(TYPE); j += 2) { \ 1356 TYPE elt1_a = n[H(i + j + sel_a)]; \ 1357 d[H2(i + j)] = OP(elt1_a, elt2_a, a[H(i + j)], sub_r); \ 1358 d[H2(i + j + 1)] = OP(elt1_a, elt2_b, a[H(i + j + 1)], sub_i); \ 1359 } \ 1360 } \ 1361 } 1362 1363 DO_CMLA_IDX_FUNC(sve2_cmla_idx_h, int16_t, H2, DO_CMLA) 1364 DO_CMLA_IDX_FUNC(sve2_cmla_idx_s, int32_t, H4, DO_CMLA) 1365 1366 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H) 1367 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S) 1368 1369 #undef DO_CMLA 1370 #undef DO_CMLA_FUNC 1371 #undef DO_CMLA_IDX_FUNC 1372 #undef DO_SQRDMLAH_B 1373 #undef DO_SQRDMLAH_H 1374 #undef DO_SQRDMLAH_S 1375 #undef DO_SQRDMLAH_D 1376 1377 /* Note N and M are 4 elements bundled into one unit. */ 1378 static int32_t do_cdot_s(uint32_t n, uint32_t m, int32_t a, 1379 int sel_a, int sel_b, int sub_i) 1380 { 1381 for (int i = 0; i <= 1; i++) { 1382 int32_t elt1_r = (int8_t)(n >> (16 * i)); 1383 int32_t elt1_i = (int8_t)(n >> (16 * i + 8)); 1384 int32_t elt2_a = (int8_t)(m >> (16 * i + 8 * sel_a)); 1385 int32_t elt2_b = (int8_t)(m >> (16 * i + 8 * sel_b)); 1386 1387 a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i; 1388 } 1389 return a; 1390 } 1391 1392 static int64_t do_cdot_d(uint64_t n, uint64_t m, int64_t a, 1393 int sel_a, int sel_b, int sub_i) 1394 { 1395 for (int i = 0; i <= 1; i++) { 1396 int64_t elt1_r = (int16_t)(n >> (32 * i + 0)); 1397 int64_t elt1_i = (int16_t)(n >> (32 * i + 16)); 1398 int64_t elt2_a = (int16_t)(m >> (32 * i + 16 * sel_a)); 1399 int64_t elt2_b = (int16_t)(m >> (32 * i + 16 * sel_b)); 1400 1401 a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i; 1402 } 1403 return a; 1404 } 1405 1406 void HELPER(sve2_cdot_zzzz_s)(void *vd, void *vn, void *vm, 1407 void *va, uint32_t desc) 1408 { 1409 int opr_sz = simd_oprsz(desc); 1410 int rot = simd_data(desc); 1411 int sel_a = rot & 1; 1412 int sel_b = sel_a ^ 1; 1413 int sub_i = (rot == 0 || rot == 3 ? 
-1 : 1); 1414 uint32_t *d = vd, *n = vn, *m = vm, *a = va; 1415 1416 for (int e = 0; e < opr_sz / 4; e++) { 1417 d[e] = do_cdot_s(n[e], m[e], a[e], sel_a, sel_b, sub_i); 1418 } 1419 } 1420 1421 void HELPER(sve2_cdot_zzzz_d)(void *vd, void *vn, void *vm, 1422 void *va, uint32_t desc) 1423 { 1424 int opr_sz = simd_oprsz(desc); 1425 int rot = simd_data(desc); 1426 int sel_a = rot & 1; 1427 int sel_b = sel_a ^ 1; 1428 int sub_i = (rot == 0 || rot == 3 ? -1 : 1); 1429 uint64_t *d = vd, *n = vn, *m = vm, *a = va; 1430 1431 for (int e = 0; e < opr_sz / 8; e++) { 1432 d[e] = do_cdot_d(n[e], m[e], a[e], sel_a, sel_b, sub_i); 1433 } 1434 } 1435 1436 void HELPER(sve2_cdot_idx_s)(void *vd, void *vn, void *vm, 1437 void *va, uint32_t desc) 1438 { 1439 int opr_sz = simd_oprsz(desc); 1440 int rot = extract32(desc, SIMD_DATA_SHIFT, 2); 1441 int idx = H4(extract32(desc, SIMD_DATA_SHIFT + 2, 2)); 1442 int sel_a = rot & 1; 1443 int sel_b = sel_a ^ 1; 1444 int sub_i = (rot == 0 || rot == 3 ? -1 : 1); 1445 uint32_t *d = vd, *n = vn, *m = vm, *a = va; 1446 1447 for (int seg = 0; seg < opr_sz / 4; seg += 4) { 1448 uint32_t seg_m = m[seg + idx]; 1449 for (int e = 0; e < 4; e++) { 1450 d[seg + e] = do_cdot_s(n[seg + e], seg_m, a[seg + e], 1451 sel_a, sel_b, sub_i); 1452 } 1453 } 1454 } 1455 1456 void HELPER(sve2_cdot_idx_d)(void *vd, void *vn, void *vm, 1457 void *va, uint32_t desc) 1458 { 1459 int seg, opr_sz = simd_oprsz(desc); 1460 int rot = extract32(desc, SIMD_DATA_SHIFT, 2); 1461 int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2); 1462 int sel_a = rot & 1; 1463 int sel_b = sel_a ^ 1; 1464 int sub_i = (rot == 0 || rot == 3 ? -1 : 1); 1465 uint64_t *d = vd, *n = vn, *m = vm, *a = va; 1466 1467 for (seg = 0; seg < opr_sz / 8; seg += 2) { 1468 uint64_t seg_m = m[seg + idx]; 1469 for (int e = 0; e < 2; e++) { 1470 d[seg + e] = do_cdot_d(n[seg + e], seg_m, a[seg + e], 1471 sel_a, sel_b, sub_i); 1472 } 1473 } 1474 } 1475 1476 #define DO_ZZXZ(NAME, TYPE, H, OP) \ 1477 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1478 { \ 1479 intptr_t oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE); \ 1480 intptr_t i, j, idx = simd_data(desc); \ 1481 TYPE *d = vd, *a = va, *n = vn, *m = (TYPE *)vm + H(idx); \ 1482 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ 1483 TYPE mm = m[i]; \ 1484 for (j = 0; j < segment; j++) { \ 1485 d[i + j] = OP(n[i + j], mm, a[i + j]); \ 1486 } \ 1487 } \ 1488 } 1489 1490 #define DO_SQRDMLAH_H(N, M, A) \ 1491 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, false, true, &discard); }) 1492 #define DO_SQRDMLAH_S(N, M, A) \ 1493 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, false, true, &discard); }) 1494 #define DO_SQRDMLAH_D(N, M, A) do_sqrdmlah_d(N, M, A, false, true) 1495 1496 DO_ZZXZ(sve2_sqrdmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H) 1497 DO_ZZXZ(sve2_sqrdmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S) 1498 DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t, H8, DO_SQRDMLAH_D) 1499 1500 #define DO_SQRDMLSH_H(N, M, A) \ 1501 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, true, true, &discard); }) 1502 #define DO_SQRDMLSH_S(N, M, A) \ 1503 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, true, true, &discard); }) 1504 #define DO_SQRDMLSH_D(N, M, A) do_sqrdmlah_d(N, M, A, true, true) 1505 1506 DO_ZZXZ(sve2_sqrdmlsh_idx_h, int16_t, H2, DO_SQRDMLSH_H) 1507 DO_ZZXZ(sve2_sqrdmlsh_idx_s, int32_t, H4, DO_SQRDMLSH_S) 1508 DO_ZZXZ(sve2_sqrdmlsh_idx_d, int64_t, H8, DO_SQRDMLSH_D) 1509 1510 #undef DO_ZZXZ 1511 1512 #define DO_ZZXW(NAME, TYPEW, TYPEN, HW, HN, OP) \ 1513 void HELPER(NAME)(void *vd, void *vn, 
void *vm, void *va, uint32_t desc) \ 1514 { \ 1515 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1516 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ 1517 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \ 1518 for (i = 0; i < oprsz; i += 16) { \ 1519 TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \ 1520 for (j = 0; j < 16; j += sizeof(TYPEW)) { \ 1521 TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \ 1522 TYPEW aa = *(TYPEW *)(va + HW(i + j)); \ 1523 *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm, aa); \ 1524 } \ 1525 } \ 1526 } 1527 1528 #define DO_MLA(N, M, A) (A + N * M) 1529 1530 DO_ZZXW(sve2_smlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLA) 1531 DO_ZZXW(sve2_smlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLA) 1532 DO_ZZXW(sve2_umlal_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLA) 1533 DO_ZZXW(sve2_umlal_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLA) 1534 1535 #define DO_MLS(N, M, A) (A - N * M) 1536 1537 DO_ZZXW(sve2_smlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLS) 1538 DO_ZZXW(sve2_smlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLS) 1539 DO_ZZXW(sve2_umlsl_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLS) 1540 DO_ZZXW(sve2_umlsl_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLS) 1541 1542 #define DO_SQDMLAL_S(N, M, A) DO_SQADD_S(A, do_sqdmull_s(N, M)) 1543 #define DO_SQDMLAL_D(N, M, A) do_sqadd_d(A, do_sqdmull_d(N, M)) 1544 1545 DO_ZZXW(sve2_sqdmlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLAL_S) 1546 DO_ZZXW(sve2_sqdmlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLAL_D) 1547 1548 #define DO_SQDMLSL_S(N, M, A) DO_SQSUB_S(A, do_sqdmull_s(N, M)) 1549 #define DO_SQDMLSL_D(N, M, A) do_sqsub_d(A, do_sqdmull_d(N, M)) 1550 1551 DO_ZZXW(sve2_sqdmlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLSL_S) 1552 DO_ZZXW(sve2_sqdmlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLSL_D) 1553 1554 #undef DO_MLA 1555 #undef DO_MLS 1556 #undef DO_ZZXW 1557 1558 #define DO_ZZX(NAME, TYPEW, TYPEN, HW, HN, OP) \ 1559 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1560 { \ 1561 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1562 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ 1563 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \ 1564 for (i = 0; i < oprsz; i += 16) { \ 1565 TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \ 1566 for (j = 0; j < 16; j += sizeof(TYPEW)) { \ 1567 TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \ 1568 *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm); \ 1569 } \ 1570 } \ 1571 } 1572 1573 DO_ZZX(sve2_sqdmull_idx_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s) 1574 DO_ZZX(sve2_sqdmull_idx_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d) 1575 1576 DO_ZZX(sve2_smull_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MUL) 1577 DO_ZZX(sve2_smull_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MUL) 1578 1579 DO_ZZX(sve2_umull_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL) 1580 DO_ZZX(sve2_umull_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL) 1581 1582 #undef DO_ZZX 1583 1584 #define DO_BITPERM(NAME, TYPE, OP) \ 1585 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1586 { \ 1587 intptr_t i, opr_sz = simd_oprsz(desc); \ 1588 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 1589 TYPE nn = *(TYPE *)(vn + i); \ 1590 TYPE mm = *(TYPE *)(vm + i); \ 1591 *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8); \ 1592 } \ 1593 } 1594 1595 static uint64_t bitextract(uint64_t data, uint64_t mask, int n) 1596 { 1597 uint64_t res = 0; 1598 int db, rb = 0; 1599 1600 for (db = 0; db < n; ++db) { 1601 if ((mask >> db) & 1) { 1602 res |= ((data >> db) & 1) 
<< rb; 1603 ++rb; 1604 } 1605 } 1606 return res; 1607 } 1608 1609 DO_BITPERM(sve2_bext_b, uint8_t, bitextract) 1610 DO_BITPERM(sve2_bext_h, uint16_t, bitextract) 1611 DO_BITPERM(sve2_bext_s, uint32_t, bitextract) 1612 DO_BITPERM(sve2_bext_d, uint64_t, bitextract) 1613 1614 static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n) 1615 { 1616 uint64_t res = 0; 1617 int rb, db = 0; 1618 1619 for (rb = 0; rb < n; ++rb) { 1620 if ((mask >> rb) & 1) { 1621 res |= ((data >> db) & 1) << rb; 1622 ++db; 1623 } 1624 } 1625 return res; 1626 } 1627 1628 DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit) 1629 DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit) 1630 DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit) 1631 DO_BITPERM(sve2_bdep_d, uint64_t, bitdeposit) 1632 1633 static uint64_t bitgroup(uint64_t data, uint64_t mask, int n) 1634 { 1635 uint64_t resm = 0, resu = 0; 1636 int db, rbm = 0, rbu = 0; 1637 1638 for (db = 0; db < n; ++db) { 1639 uint64_t val = (data >> db) & 1; 1640 if ((mask >> db) & 1) { 1641 resm |= val << rbm++; 1642 } else { 1643 resu |= val << rbu++; 1644 } 1645 } 1646 1647 return resm | (resu << rbm); 1648 } 1649 1650 DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup) 1651 DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup) 1652 DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup) 1653 DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup) 1654 1655 #undef DO_BITPERM 1656 1657 #define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP) \ 1658 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1659 { \ 1660 intptr_t i, opr_sz = simd_oprsz(desc); \ 1661 int sub_r = simd_data(desc); \ 1662 if (sub_r) { \ 1663 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \ 1664 TYPE acc_r = *(TYPE *)(vn + H(i)); \ 1665 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ 1666 TYPE el2_r = *(TYPE *)(vm + H(i)); \ 1667 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ 1668 acc_r = ADD_OP(acc_r, el2_i); \ 1669 acc_i = SUB_OP(acc_i, el2_r); \ 1670 *(TYPE *)(vd + H(i)) = acc_r; \ 1671 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \ 1672 } \ 1673 } else { \ 1674 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \ 1675 TYPE acc_r = *(TYPE *)(vn + H(i)); \ 1676 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ 1677 TYPE el2_r = *(TYPE *)(vm + H(i)); \ 1678 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ 1679 acc_r = SUB_OP(acc_r, el2_i); \ 1680 acc_i = ADD_OP(acc_i, el2_r); \ 1681 *(TYPE *)(vd + H(i)) = acc_r; \ 1682 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \ 1683 } \ 1684 } \ 1685 } 1686 1687 DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB) 1688 DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB) 1689 DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB) 1690 DO_CADD(sve2_cadd_d, int64_t, H1_8, DO_ADD, DO_SUB) 1691 1692 DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B) 1693 DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H) 1694 DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S) 1695 DO_CADD(sve2_sqcadd_d, int64_t, H1_8, do_sqadd_d, do_sqsub_d) 1696 1697 #undef DO_CADD 1698 1699 #define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN) \ 1700 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1701 { \ 1702 intptr_t i, opr_sz = simd_oprsz(desc); \ 1703 intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN); \ 1704 int shift = simd_data(desc) >> 1; \ 1705 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 1706 TYPEW nn = *(TYPEN *)(vn + HN(i + sel)); \ 1707 *(TYPEW *)(vd + HW(i)) = nn << shift; \ 1708 } \ 1709 } 1710 1711 DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1) 1712 DO_ZZI_SHLL(sve2_sshll_s, int32_t, 
int16_t, H1_4, H1_2) 1713 DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, H1_8, H1_4) 1714 1715 DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1) 1716 DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2) 1717 DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, H1_8, H1_4) 1718 1719 #undef DO_ZZI_SHLL 1720 1721 /* Two-operand reduction expander, controlled by a predicate. 1722 * The difference between TYPERED and TYPERET has to do with 1723 * sign-extension. E.g. for SMAX, TYPERED must be signed, 1724 * but TYPERET must be unsigned so that e.g. a 32-bit value 1725 * is not sign-extended to the ABI uint64_t return type. 1726 */ 1727 /* ??? If we were to vectorize this by hand the reduction ordering 1728 * would change. For integer operands, this is perfectly fine. 1729 */ 1730 #define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \ 1731 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \ 1732 { \ 1733 intptr_t i, opr_sz = simd_oprsz(desc); \ 1734 TYPERED ret = INIT; \ 1735 for (i = 0; i < opr_sz; ) { \ 1736 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 1737 do { \ 1738 if (pg & 1) { \ 1739 TYPEELT nn = *(TYPEELT *)(vn + H(i)); \ 1740 ret = OP(ret, nn); \ 1741 } \ 1742 i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \ 1743 } while (i & 15); \ 1744 } \ 1745 return (TYPERET)ret; \ 1746 } 1747 1748 #define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \ 1749 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \ 1750 { \ 1751 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 1752 TYPEE *n = vn; \ 1753 uint8_t *pg = vg; \ 1754 TYPER ret = INIT; \ 1755 for (i = 0; i < opr_sz; i += 1) { \ 1756 if (pg[H1(i)] & 1) { \ 1757 TYPEE nn = n[i]; \ 1758 ret = OP(ret, nn); \ 1759 } \ 1760 } \ 1761 return ret; \ 1762 } 1763 1764 DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR) 1765 DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR) 1766 DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR) 1767 DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR) 1768 1769 DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR) 1770 DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR) 1771 DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR) 1772 DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR) 1773 1774 DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND) 1775 DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND) 1776 DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND) 1777 DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND) 1778 1779 DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD) 1780 DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD) 1781 DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD) 1782 1783 DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD) 1784 DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD) 1785 DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD) 1786 DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD) 1787 1788 DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX) 1789 DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX) 1790 DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX) 1791 DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX) 1792 1793 DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX) 1794 DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX) 1795 DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX) 
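/*
 * Illustrative aside, not part of the generated helpers: a minimal scalar
 * sketch of what the DO_VPZ expansion above computes, using sve_smaxv_s as
 * the example.  The names smaxv_s_reference and "active" are invented for
 * this sketch; "active" stands for the per-element predicate bit that the
 * real helper extracts from vg.  Note how the accumulator uses the signed
 * TYPERED (int32_t) while the result is converted through the unsigned
 * TYPERET (uint32_t), so that e.g. a reduction result of -1 comes back as
 * 0x00000000ffffffff in the uint64_t ABI return slot rather than being
 * sign-extended.
 */
static inline uint64_t smaxv_s_reference(const int32_t *n, const bool *active,
                                         intptr_t nelem)
{
    int32_t ret = INT32_MIN;                  /* INIT for signed max */
    intptr_t i;

    for (i = 0; i < nelem; i++) {
        if (active[i]) {
            ret = ret >= n[i] ? ret : n[i];   /* DO_MAX */
        }
    }
    return (uint32_t)ret;                     /* TYPERET conversion */
}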
1796 DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX) 1797 1798 DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN) 1799 DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN) 1800 DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN) 1801 DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN) 1802 1803 DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN) 1804 DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN) 1805 DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN) 1806 DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN) 1807 1808 #undef DO_VPZ 1809 #undef DO_VPZ_D 1810 1811 /* Two vector operand, one scalar operand, unpredicated. */ 1812 #define DO_ZZI(NAME, TYPE, OP) \ 1813 void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \ 1814 { \ 1815 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \ 1816 TYPE s = s64, *d = vd, *n = vn; \ 1817 for (i = 0; i < opr_sz; ++i) { \ 1818 d[i] = OP(n[i], s); \ 1819 } \ 1820 } 1821 1822 #define DO_SUBR(X, Y) (Y - X) 1823 1824 DO_ZZI(sve_subri_b, uint8_t, DO_SUBR) 1825 DO_ZZI(sve_subri_h, uint16_t, DO_SUBR) 1826 DO_ZZI(sve_subri_s, uint32_t, DO_SUBR) 1827 DO_ZZI(sve_subri_d, uint64_t, DO_SUBR) 1828 1829 DO_ZZI(sve_smaxi_b, int8_t, DO_MAX) 1830 DO_ZZI(sve_smaxi_h, int16_t, DO_MAX) 1831 DO_ZZI(sve_smaxi_s, int32_t, DO_MAX) 1832 DO_ZZI(sve_smaxi_d, int64_t, DO_MAX) 1833 1834 DO_ZZI(sve_smini_b, int8_t, DO_MIN) 1835 DO_ZZI(sve_smini_h, int16_t, DO_MIN) 1836 DO_ZZI(sve_smini_s, int32_t, DO_MIN) 1837 DO_ZZI(sve_smini_d, int64_t, DO_MIN) 1838 1839 DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX) 1840 DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX) 1841 DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX) 1842 DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX) 1843 1844 DO_ZZI(sve_umini_b, uint8_t, DO_MIN) 1845 DO_ZZI(sve_umini_h, uint16_t, DO_MIN) 1846 DO_ZZI(sve_umini_s, uint32_t, DO_MIN) 1847 DO_ZZI(sve_umini_d, uint64_t, DO_MIN) 1848 1849 #undef DO_ZZI 1850 1851 #undef DO_AND 1852 #undef DO_ORR 1853 #undef DO_EOR 1854 #undef DO_BIC 1855 #undef DO_ADD 1856 #undef DO_SUB 1857 #undef DO_MAX 1858 #undef DO_MIN 1859 #undef DO_ABD 1860 #undef DO_MUL 1861 #undef DO_DIV 1862 #undef DO_ASR 1863 #undef DO_LSR 1864 #undef DO_LSL 1865 #undef DO_SUBR 1866 1867 /* Similar to the ARM LastActiveElement pseudocode function, except the 1868 result is multiplied by the element size. This includes the not found 1869 indication; e.g. not found for esz=3 is -8. */ 1870 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz) 1871 { 1872 uint64_t mask = pred_esz_masks[esz]; 1873 intptr_t i = words; 1874 1875 do { 1876 uint64_t this_g = g[--i] & mask; 1877 if (this_g) { 1878 return i * 64 + (63 - clz64(this_g)); 1879 } 1880 } while (i > 0); 1881 return (intptr_t)-1 << esz; 1882 } 1883 1884 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc) 1885 { 1886 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); 1887 uint32_t flags = PREDTEST_INIT; 1888 uint64_t *d = vd, *g = vg; 1889 intptr_t i = 0; 1890 1891 do { 1892 uint64_t this_d = d[i]; 1893 uint64_t this_g = g[i]; 1894 1895 if (this_g) { 1896 if (!(flags & 4)) { 1897 /* Set in D the first bit of G. 
*/ 1898 this_d |= this_g & -this_g; 1899 d[i] = this_d; 1900 } 1901 flags = iter_predtest_fwd(this_d, this_g, flags); 1902 } 1903 } while (++i < words); 1904 1905 return flags; 1906 } 1907 1908 uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc) 1909 { 1910 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); 1911 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 1912 uint32_t flags = PREDTEST_INIT; 1913 uint64_t *d = vd, *g = vg, esz_mask; 1914 intptr_t i, next; 1915 1916 next = last_active_element(vd, words, esz) + (1 << esz); 1917 esz_mask = pred_esz_masks[esz]; 1918 1919 /* Similar to the pseudocode for pnext, but scaled by ESZ 1920 so that we find the correct bit. */ 1921 if (next < words * 64) { 1922 uint64_t mask = -1; 1923 1924 if (next & 63) { 1925 mask = ~((1ull << (next & 63)) - 1); 1926 next &= -64; 1927 } 1928 do { 1929 uint64_t this_g = g[next / 64] & esz_mask & mask; 1930 if (this_g != 0) { 1931 next = (next & -64) + ctz64(this_g); 1932 break; 1933 } 1934 next += 64; 1935 mask = -1; 1936 } while (next < words * 64); 1937 } 1938 1939 i = 0; 1940 do { 1941 uint64_t this_d = 0; 1942 if (i == next / 64) { 1943 this_d = 1ull << (next & 63); 1944 } 1945 d[i] = this_d; 1946 flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags); 1947 } while (++i < words); 1948 1949 return flags; 1950 } 1951 1952 /* 1953 * Copy Zn into Zd, and store zero into inactive elements. 1954 * If inv, store zeros into the active elements. 1955 */ 1956 void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc) 1957 { 1958 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 1959 uint64_t inv = -(uint64_t)(simd_data(desc) & 1); 1960 uint64_t *d = vd, *n = vn; 1961 uint8_t *pg = vg; 1962 1963 for (i = 0; i < opr_sz; i += 1) { 1964 d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv); 1965 } 1966 } 1967 1968 void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc) 1969 { 1970 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 1971 uint64_t inv = -(uint64_t)(simd_data(desc) & 1); 1972 uint64_t *d = vd, *n = vn; 1973 uint8_t *pg = vg; 1974 1975 for (i = 0; i < opr_sz; i += 1) { 1976 d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv); 1977 } 1978 } 1979 1980 void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc) 1981 { 1982 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 1983 uint64_t inv = -(uint64_t)(simd_data(desc) & 1); 1984 uint64_t *d = vd, *n = vn; 1985 uint8_t *pg = vg; 1986 1987 for (i = 0; i < opr_sz; i += 1) { 1988 d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv); 1989 } 1990 } 1991 1992 void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc) 1993 { 1994 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 1995 uint64_t *d = vd, *n = vn; 1996 uint8_t *pg = vg; 1997 uint8_t inv = simd_data(desc); 1998 1999 for (i = 0; i < opr_sz; i += 1) { 2000 d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1); 2001 } 2002 } 2003 2004 /* Three-operand expander, immediate operand, controlled by a predicate. 2005 */ 2006 #define DO_ZPZI(NAME, TYPE, H, OP) \ 2007 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 2008 { \ 2009 intptr_t i, opr_sz = simd_oprsz(desc); \ 2010 TYPE imm = simd_data(desc); \ 2011 for (i = 0; i < opr_sz; ) { \ 2012 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 2013 do { \ 2014 if (pg & 1) { \ 2015 TYPE nn = *(TYPE *)(vn + H(i)); \ 2016 *(TYPE *)(vd + H(i)) = OP(nn, imm); \ 2017 } \ 2018 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 2019 } while (i & 15); \ 2020 } \ 2021 } 2022 2023 /* Similarly, specialized for 64-bit operands. 
*/ 2024 #define DO_ZPZI_D(NAME, TYPE, OP) \ 2025 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 2026 { \ 2027 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 2028 TYPE *d = vd, *n = vn; \ 2029 TYPE imm = simd_data(desc); \ 2030 uint8_t *pg = vg; \ 2031 for (i = 0; i < opr_sz; i += 1) { \ 2032 if (pg[H1(i)] & 1) { \ 2033 TYPE nn = n[i]; \ 2034 d[i] = OP(nn, imm); \ 2035 } \ 2036 } \ 2037 } 2038 2039 #define DO_SHR(N, M) (N >> M) 2040 #define DO_SHL(N, M) (N << M) 2041 2042 /* Arithmetic shift right for division. This rounds negative numbers 2043 toward zero as per signed division. Therefore before shifting, 2044 when N is negative, add 2**M-1. */ 2045 #define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M) 2046 2047 static inline uint64_t do_urshr(uint64_t x, unsigned sh) 2048 { 2049 if (likely(sh < 64)) { 2050 return (x >> sh) + ((x >> (sh - 1)) & 1); 2051 } else if (sh == 64) { 2052 return x >> 63; 2053 } else { 2054 return 0; 2055 } 2056 } 2057 2058 static inline int64_t do_srshr(int64_t x, unsigned sh) 2059 { 2060 if (likely(sh < 64)) { 2061 return (x >> sh) + ((x >> (sh - 1)) & 1); 2062 } else { 2063 /* Rounding the sign bit always produces 0. */ 2064 return 0; 2065 } 2066 } 2067 2068 DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR) 2069 DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR) 2070 DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR) 2071 DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR) 2072 2073 DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR) 2074 DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR) 2075 DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR) 2076 DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR) 2077 2078 DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL) 2079 DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL) 2080 DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL) 2081 DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL) 2082 2083 DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD) 2084 DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD) 2085 DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD) 2086 DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD) 2087 2088 /* SVE2 bitwise shift by immediate */ 2089 DO_ZPZI(sve2_sqshl_zpzi_b, int8_t, H1, do_sqshl_b) 2090 DO_ZPZI(sve2_sqshl_zpzi_h, int16_t, H1_2, do_sqshl_h) 2091 DO_ZPZI(sve2_sqshl_zpzi_s, int32_t, H1_4, do_sqshl_s) 2092 DO_ZPZI_D(sve2_sqshl_zpzi_d, int64_t, do_sqshl_d) 2093 2094 DO_ZPZI(sve2_uqshl_zpzi_b, uint8_t, H1, do_uqshl_b) 2095 DO_ZPZI(sve2_uqshl_zpzi_h, uint16_t, H1_2, do_uqshl_h) 2096 DO_ZPZI(sve2_uqshl_zpzi_s, uint32_t, H1_4, do_uqshl_s) 2097 DO_ZPZI_D(sve2_uqshl_zpzi_d, uint64_t, do_uqshl_d) 2098 2099 DO_ZPZI(sve2_srshr_b, int8_t, H1, do_srshr) 2100 DO_ZPZI(sve2_srshr_h, int16_t, H1_2, do_srshr) 2101 DO_ZPZI(sve2_srshr_s, int32_t, H1_4, do_srshr) 2102 DO_ZPZI_D(sve2_srshr_d, int64_t, do_srshr) 2103 2104 DO_ZPZI(sve2_urshr_b, uint8_t, H1, do_urshr) 2105 DO_ZPZI(sve2_urshr_h, uint16_t, H1_2, do_urshr) 2106 DO_ZPZI(sve2_urshr_s, uint32_t, H1_4, do_urshr) 2107 DO_ZPZI_D(sve2_urshr_d, uint64_t, do_urshr) 2108 2109 #define do_suqrshl_b(n, m) \ 2110 ({ uint32_t discard; do_suqrshl_bhs(n, (int8_t)m, 8, false, &discard); }) 2111 #define do_suqrshl_h(n, m) \ 2112 ({ uint32_t discard; do_suqrshl_bhs(n, (int16_t)m, 16, false, &discard); }) 2113 #define do_suqrshl_s(n, m) \ 2114 ({ uint32_t discard; do_suqrshl_bhs(n, m, 32, false, &discard); }) 2115 #define do_suqrshl_d(n, m) \ 2116 ({ uint32_t discard; do_suqrshl_d(n, m, false, &discard); }) 2117 2118 DO_ZPZI(sve2_sqshlu_b, int8_t, H1, do_suqrshl_b) 2119 DO_ZPZI(sve2_sqshlu_h, int16_t, H1_2, do_suqrshl_h) 2120 
DO_ZPZI(sve2_sqshlu_s, int32_t, H1_4, do_suqrshl_s) 2121 DO_ZPZI_D(sve2_sqshlu_d, int64_t, do_suqrshl_d) 2122 2123 #undef DO_ASRD 2124 #undef DO_ZPZI 2125 #undef DO_ZPZI_D 2126 2127 #define DO_SHRNB(NAME, TYPEW, TYPEN, OP) \ 2128 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2129 { \ 2130 intptr_t i, opr_sz = simd_oprsz(desc); \ 2131 int shift = simd_data(desc); \ 2132 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 2133 TYPEW nn = *(TYPEW *)(vn + i); \ 2134 *(TYPEW *)(vd + i) = (TYPEN)OP(nn, shift); \ 2135 } \ 2136 } 2137 2138 #define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP) \ 2139 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2140 { \ 2141 intptr_t i, opr_sz = simd_oprsz(desc); \ 2142 int shift = simd_data(desc); \ 2143 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 2144 TYPEW nn = *(TYPEW *)(vn + HW(i)); \ 2145 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, shift); \ 2146 } \ 2147 } 2148 2149 DO_SHRNB(sve2_shrnb_h, uint16_t, uint8_t, DO_SHR) 2150 DO_SHRNB(sve2_shrnb_s, uint32_t, uint16_t, DO_SHR) 2151 DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR) 2152 2153 DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR) 2154 DO_SHRNT(sve2_shrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_SHR) 2155 DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_SHR) 2156 2157 DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, do_urshr) 2158 DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr) 2159 DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr) 2160 2161 DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr) 2162 DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr) 2163 DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, H1_8, H1_4, do_urshr) 2164 2165 #define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX) 2166 #define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX) 2167 #define DO_SQSHRUN_D(x, sh) \ 2168 do_sat_bhs((int64_t)(x) >> (sh < 64 ? 
sh : 63), 0, UINT32_MAX) 2169 2170 DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H) 2171 DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S) 2172 DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D) 2173 2174 DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H) 2175 DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S) 2176 DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRUN_D) 2177 2178 #define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX) 2179 #define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX) 2180 #define DO_SQRSHRUN_D(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT32_MAX) 2181 2182 DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H) 2183 DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S) 2184 DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D) 2185 2186 DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H) 2187 DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S) 2188 DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRUN_D) 2189 2190 #define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX) 2191 #define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX) 2192 #define DO_SQSHRN_D(x, sh) do_sat_bhs(x >> sh, INT32_MIN, INT32_MAX) 2193 2194 DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H) 2195 DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S) 2196 DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D) 2197 2198 DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H) 2199 DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S) 2200 DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRN_D) 2201 2202 #define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX) 2203 #define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX) 2204 #define DO_SQRSHRN_D(x, sh) do_sat_bhs(do_srshr(x, sh), INT32_MIN, INT32_MAX) 2205 2206 DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H) 2207 DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S) 2208 DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D) 2209 2210 DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H) 2211 DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S) 2212 DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRN_D) 2213 2214 #define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX) 2215 #define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX) 2216 #define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX) 2217 2218 DO_SHRNB(sve2_uqshrnb_h, uint16_t, uint8_t, DO_UQSHRN_H) 2219 DO_SHRNB(sve2_uqshrnb_s, uint32_t, uint16_t, DO_UQSHRN_S) 2220 DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D) 2221 2222 DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H) 2223 DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S) 2224 DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQSHRN_D) 2225 2226 #define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX) 2227 #define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX) 2228 #define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX) 2229 2230 DO_SHRNB(sve2_uqrshrnb_h, uint16_t, uint8_t, DO_UQRSHRN_H) 2231 DO_SHRNB(sve2_uqrshrnb_s, uint32_t, uint16_t, DO_UQRSHRN_S) 2232 DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D) 2233 2234 DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H) 2235 DO_SHRNT(sve2_uqrshrnt_s, 
uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S) 2236 DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQRSHRN_D) 2237 2238 #undef DO_SHRNB 2239 #undef DO_SHRNT 2240 2241 #define DO_BINOPNB(NAME, TYPEW, TYPEN, SHIFT, OP) \ 2242 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2243 { \ 2244 intptr_t i, opr_sz = simd_oprsz(desc); \ 2245 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 2246 TYPEW nn = *(TYPEW *)(vn + i); \ 2247 TYPEW mm = *(TYPEW *)(vm + i); \ 2248 *(TYPEW *)(vd + i) = (TYPEN)OP(nn, mm, SHIFT); \ 2249 } \ 2250 } 2251 2252 #define DO_BINOPNT(NAME, TYPEW, TYPEN, SHIFT, HW, HN, OP) \ 2253 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2254 { \ 2255 intptr_t i, opr_sz = simd_oprsz(desc); \ 2256 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 2257 TYPEW nn = *(TYPEW *)(vn + HW(i)); \ 2258 TYPEW mm = *(TYPEW *)(vm + HW(i)); \ 2259 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, mm, SHIFT); \ 2260 } \ 2261 } 2262 2263 #define DO_ADDHN(N, M, SH) ((N + M) >> SH) 2264 #define DO_RADDHN(N, M, SH) ((N + M + ((__typeof(N))1 << (SH - 1))) >> SH) 2265 #define DO_SUBHN(N, M, SH) ((N - M) >> SH) 2266 #define DO_RSUBHN(N, M, SH) ((N - M + ((__typeof(N))1 << (SH - 1))) >> SH) 2267 2268 DO_BINOPNB(sve2_addhnb_h, uint16_t, uint8_t, 8, DO_ADDHN) 2269 DO_BINOPNB(sve2_addhnb_s, uint32_t, uint16_t, 16, DO_ADDHN) 2270 DO_BINOPNB(sve2_addhnb_d, uint64_t, uint32_t, 32, DO_ADDHN) 2271 2272 DO_BINOPNT(sve2_addhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_ADDHN) 2273 DO_BINOPNT(sve2_addhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_ADDHN) 2274 DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_ADDHN) 2275 2276 DO_BINOPNB(sve2_raddhnb_h, uint16_t, uint8_t, 8, DO_RADDHN) 2277 DO_BINOPNB(sve2_raddhnb_s, uint32_t, uint16_t, 16, DO_RADDHN) 2278 DO_BINOPNB(sve2_raddhnb_d, uint64_t, uint32_t, 32, DO_RADDHN) 2279 2280 DO_BINOPNT(sve2_raddhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RADDHN) 2281 DO_BINOPNT(sve2_raddhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RADDHN) 2282 DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RADDHN) 2283 2284 DO_BINOPNB(sve2_subhnb_h, uint16_t, uint8_t, 8, DO_SUBHN) 2285 DO_BINOPNB(sve2_subhnb_s, uint32_t, uint16_t, 16, DO_SUBHN) 2286 DO_BINOPNB(sve2_subhnb_d, uint64_t, uint32_t, 32, DO_SUBHN) 2287 2288 DO_BINOPNT(sve2_subhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_SUBHN) 2289 DO_BINOPNT(sve2_subhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_SUBHN) 2290 DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_SUBHN) 2291 2292 DO_BINOPNB(sve2_rsubhnb_h, uint16_t, uint8_t, 8, DO_RSUBHN) 2293 DO_BINOPNB(sve2_rsubhnb_s, uint32_t, uint16_t, 16, DO_RSUBHN) 2294 DO_BINOPNB(sve2_rsubhnb_d, uint64_t, uint32_t, 32, DO_RSUBHN) 2295 2296 DO_BINOPNT(sve2_rsubhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RSUBHN) 2297 DO_BINOPNT(sve2_rsubhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RSUBHN) 2298 DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RSUBHN) 2299 2300 #undef DO_RSUBHN 2301 #undef DO_SUBHN 2302 #undef DO_RADDHN 2303 #undef DO_ADDHN 2304 2305 #undef DO_BINOPNB 2306 2307 /* Fully general four-operand expander, controlled by a predicate. 
2308 */ 2309 #define DO_ZPZZZ(NAME, TYPE, H, OP) \ 2310 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \ 2311 void *vg, uint32_t desc) \ 2312 { \ 2313 intptr_t i, opr_sz = simd_oprsz(desc); \ 2314 for (i = 0; i < opr_sz; ) { \ 2315 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 2316 do { \ 2317 if (pg & 1) { \ 2318 TYPE nn = *(TYPE *)(vn + H(i)); \ 2319 TYPE mm = *(TYPE *)(vm + H(i)); \ 2320 TYPE aa = *(TYPE *)(va + H(i)); \ 2321 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \ 2322 } \ 2323 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 2324 } while (i & 15); \ 2325 } \ 2326 } 2327 2328 /* Similarly, specialized for 64-bit operands. */ 2329 #define DO_ZPZZZ_D(NAME, TYPE, OP) \ 2330 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \ 2331 void *vg, uint32_t desc) \ 2332 { \ 2333 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 2334 TYPE *d = vd, *a = va, *n = vn, *m = vm; \ 2335 uint8_t *pg = vg; \ 2336 for (i = 0; i < opr_sz; i += 1) { \ 2337 if (pg[H1(i)] & 1) { \ 2338 TYPE aa = a[i], nn = n[i], mm = m[i]; \ 2339 d[i] = OP(aa, nn, mm); \ 2340 } \ 2341 } \ 2342 } 2343 2344 #define DO_MLA(A, N, M) (A + N * M) 2345 #define DO_MLS(A, N, M) (A - N * M) 2346 2347 DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA) 2348 DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS) 2349 2350 DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA) 2351 DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS) 2352 2353 DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA) 2354 DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS) 2355 2356 DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA) 2357 DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS) 2358 2359 #undef DO_MLA 2360 #undef DO_MLS 2361 #undef DO_ZPZZZ 2362 #undef DO_ZPZZZ_D 2363 2364 void HELPER(sve_index_b)(void *vd, uint32_t start, 2365 uint32_t incr, uint32_t desc) 2366 { 2367 intptr_t i, opr_sz = simd_oprsz(desc); 2368 uint8_t *d = vd; 2369 for (i = 0; i < opr_sz; i += 1) { 2370 d[H1(i)] = start + i * incr; 2371 } 2372 } 2373 2374 void HELPER(sve_index_h)(void *vd, uint32_t start, 2375 uint32_t incr, uint32_t desc) 2376 { 2377 intptr_t i, opr_sz = simd_oprsz(desc) / 2; 2378 uint16_t *d = vd; 2379 for (i = 0; i < opr_sz; i += 1) { 2380 d[H2(i)] = start + i * incr; 2381 } 2382 } 2383 2384 void HELPER(sve_index_s)(void *vd, uint32_t start, 2385 uint32_t incr, uint32_t desc) 2386 { 2387 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 2388 uint32_t *d = vd; 2389 for (i = 0; i < opr_sz; i += 1) { 2390 d[H4(i)] = start + i * incr; 2391 } 2392 } 2393 2394 void HELPER(sve_index_d)(void *vd, uint64_t start, 2395 uint64_t incr, uint32_t desc) 2396 { 2397 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2398 uint64_t *d = vd; 2399 for (i = 0; i < opr_sz; i += 1) { 2400 d[i] = start + i * incr; 2401 } 2402 } 2403 2404 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc) 2405 { 2406 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 2407 uint32_t sh = simd_data(desc); 2408 uint32_t *d = vd, *n = vn, *m = vm; 2409 for (i = 0; i < opr_sz; i += 1) { 2410 d[i] = n[i] + (m[i] << sh); 2411 } 2412 } 2413 2414 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc) 2415 { 2416 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2417 uint64_t sh = simd_data(desc); 2418 uint64_t *d = vd, *n = vn, *m = vm; 2419 for (i = 0; i < opr_sz; i += 1) { 2420 d[i] = n[i] + (m[i] << sh); 2421 } 2422 } 2423 2424 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc) 2425 { 2426 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2427 uint64_t sh = simd_data(desc); 2428 uint64_t *d = vd, *n = vn, *m = vm; 2429 for (i = 0; i < opr_sz; i += 1) { 2430 d[i] = 
n[i] + ((uint64_t)(int32_t)m[i] << sh); 2431 } 2432 } 2433 2434 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc) 2435 { 2436 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2437 uint64_t sh = simd_data(desc); 2438 uint64_t *d = vd, *n = vn, *m = vm; 2439 for (i = 0; i < opr_sz; i += 1) { 2440 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh); 2441 } 2442 } 2443 2444 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc) 2445 { 2446 /* These constants are cut-and-paste directly from the ARM pseudocode. */ 2447 static const uint16_t coeff[] = { 2448 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8, 2449 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189, 2450 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295, 2451 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4, 2452 }; 2453 intptr_t i, opr_sz = simd_oprsz(desc) / 2; 2454 uint16_t *d = vd, *n = vn; 2455 2456 for (i = 0; i < opr_sz; i++) { 2457 uint16_t nn = n[i]; 2458 intptr_t idx = extract32(nn, 0, 5); 2459 uint16_t exp = extract32(nn, 5, 5); 2460 d[i] = coeff[idx] | (exp << 10); 2461 } 2462 } 2463 2464 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc) 2465 { 2466 /* These constants are cut-and-paste directly from the ARM pseudocode. */ 2467 static const uint32_t coeff[] = { 2468 0x000000, 0x0164d2, 0x02cd87, 0x043a29, 2469 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5, 2470 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc, 2471 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d, 2472 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda, 2473 0x1ef532, 0x20b051, 0x227043, 0x243516, 2474 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a, 2475 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4, 2476 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b, 2477 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd, 2478 0x45672a, 0x478d75, 0x49b9be, 0x4bec15, 2479 0x4e248c, 0x506334, 0x52a81e, 0x54f35b, 2480 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5, 2481 0x60ccdf, 0x633f89, 0x65b907, 0x68396a, 2482 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177, 2483 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c, 2484 }; 2485 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 2486 uint32_t *d = vd, *n = vn; 2487 2488 for (i = 0; i < opr_sz; i++) { 2489 uint32_t nn = n[i]; 2490 intptr_t idx = extract32(nn, 0, 6); 2491 uint32_t exp = extract32(nn, 6, 8); 2492 d[i] = coeff[idx] | (exp << 23); 2493 } 2494 } 2495 2496 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc) 2497 { 2498 /* These constants are cut-and-paste directly from the ARM pseudocode. 
*/ 2499 static const uint64_t coeff[] = { 2500 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull, 2501 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull, 2502 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull, 2503 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull, 2504 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull, 2505 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull, 2506 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull, 2507 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull, 2508 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull, 2509 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull, 2510 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull, 2511 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull, 2512 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull, 2513 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull, 2514 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull, 2515 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull, 2516 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull, 2517 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull, 2518 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull, 2519 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull, 2520 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull, 2521 0xFA7C1819E90D8ull, 2522 }; 2523 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2524 uint64_t *d = vd, *n = vn; 2525 2526 for (i = 0; i < opr_sz; i++) { 2527 uint64_t nn = n[i]; 2528 intptr_t idx = extract32(nn, 0, 6); 2529 uint64_t exp = extract32(nn, 6, 11); 2530 d[i] = coeff[idx] | (exp << 52); 2531 } 2532 } 2533 2534 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc) 2535 { 2536 intptr_t i, opr_sz = simd_oprsz(desc) / 2; 2537 uint16_t *d = vd, *n = vn, *m = vm; 2538 for (i = 0; i < opr_sz; i += 1) { 2539 uint16_t nn = n[i]; 2540 uint16_t mm = m[i]; 2541 if (mm & 1) { 2542 nn = float16_one; 2543 } 2544 d[i] = nn ^ (mm & 2) << 14; 2545 } 2546 } 2547 2548 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc) 2549 { 2550 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 2551 uint32_t *d = vd, *n = vn, *m = vm; 2552 for (i = 0; i < opr_sz; i += 1) { 2553 uint32_t nn = n[i]; 2554 uint32_t mm = m[i]; 2555 if (mm & 1) { 2556 nn = float32_one; 2557 } 2558 d[i] = nn ^ (mm & 2) << 30; 2559 } 2560 } 2561 2562 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc) 2563 { 2564 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2565 uint64_t *d = vd, *n = vn, *m = vm; 2566 for (i = 0; i < opr_sz; i += 1) { 2567 uint64_t nn = n[i]; 2568 uint64_t mm = m[i]; 2569 if (mm & 1) { 2570 nn = float64_one; 2571 } 2572 d[i] = nn ^ (mm & 2) << 62; 2573 } 2574 } 2575 2576 /* 2577 * Signed saturating addition with scalar operand. 
2578 */ 2579 2580 void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc) 2581 { 2582 intptr_t i, oprsz = simd_oprsz(desc); 2583 2584 for (i = 0; i < oprsz; i += sizeof(int8_t)) { 2585 *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i)); 2586 } 2587 } 2588 2589 void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc) 2590 { 2591 intptr_t i, oprsz = simd_oprsz(desc); 2592 2593 for (i = 0; i < oprsz; i += sizeof(int16_t)) { 2594 *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i)); 2595 } 2596 } 2597 2598 void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc) 2599 { 2600 intptr_t i, oprsz = simd_oprsz(desc); 2601 2602 for (i = 0; i < oprsz; i += sizeof(int32_t)) { 2603 *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i)); 2604 } 2605 } 2606 2607 void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc) 2608 { 2609 intptr_t i, oprsz = simd_oprsz(desc); 2610 2611 for (i = 0; i < oprsz; i += sizeof(int64_t)) { 2612 *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i)); 2613 } 2614 } 2615 2616 /* 2617 * Unsigned saturating addition with scalar operand. 2618 */ 2619 2620 void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc) 2621 { 2622 intptr_t i, oprsz = simd_oprsz(desc); 2623 2624 for (i = 0; i < oprsz; i += sizeof(uint8_t)) { 2625 *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i)); 2626 } 2627 } 2628 2629 void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc) 2630 { 2631 intptr_t i, oprsz = simd_oprsz(desc); 2632 2633 for (i = 0; i < oprsz; i += sizeof(uint16_t)) { 2634 *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i)); 2635 } 2636 } 2637 2638 void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc) 2639 { 2640 intptr_t i, oprsz = simd_oprsz(desc); 2641 2642 for (i = 0; i < oprsz; i += sizeof(uint32_t)) { 2643 *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i)); 2644 } 2645 } 2646 2647 void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc) 2648 { 2649 intptr_t i, oprsz = simd_oprsz(desc); 2650 2651 for (i = 0; i < oprsz; i += sizeof(uint64_t)) { 2652 *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i)); 2653 } 2654 } 2655 2656 void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc) 2657 { 2658 intptr_t i, oprsz = simd_oprsz(desc); 2659 2660 for (i = 0; i < oprsz; i += sizeof(uint64_t)) { 2661 *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b); 2662 } 2663 } 2664 2665 /* Two operand predicated copy immediate with merge. All valid immediates 2666 * can fit within 17 signed bits in the simd_data field. 
2667 */ 2668 void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg, 2669 uint64_t mm, uint32_t desc) 2670 { 2671 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2672 uint64_t *d = vd, *n = vn; 2673 uint8_t *pg = vg; 2674 2675 mm = dup_const(MO_8, mm); 2676 for (i = 0; i < opr_sz; i += 1) { 2677 uint64_t nn = n[i]; 2678 uint64_t pp = expand_pred_b(pg[H1(i)]); 2679 d[i] = (mm & pp) | (nn & ~pp); 2680 } 2681 } 2682 2683 void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg, 2684 uint64_t mm, uint32_t desc) 2685 { 2686 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2687 uint64_t *d = vd, *n = vn; 2688 uint8_t *pg = vg; 2689 2690 mm = dup_const(MO_16, mm); 2691 for (i = 0; i < opr_sz; i += 1) { 2692 uint64_t nn = n[i]; 2693 uint64_t pp = expand_pred_h(pg[H1(i)]); 2694 d[i] = (mm & pp) | (nn & ~pp); 2695 } 2696 } 2697 2698 void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg, 2699 uint64_t mm, uint32_t desc) 2700 { 2701 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2702 uint64_t *d = vd, *n = vn; 2703 uint8_t *pg = vg; 2704 2705 mm = dup_const(MO_32, mm); 2706 for (i = 0; i < opr_sz; i += 1) { 2707 uint64_t nn = n[i]; 2708 uint64_t pp = expand_pred_s(pg[H1(i)]); 2709 d[i] = (mm & pp) | (nn & ~pp); 2710 } 2711 } 2712 2713 void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg, 2714 uint64_t mm, uint32_t desc) 2715 { 2716 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2717 uint64_t *d = vd, *n = vn; 2718 uint8_t *pg = vg; 2719 2720 for (i = 0; i < opr_sz; i += 1) { 2721 uint64_t nn = n[i]; 2722 d[i] = (pg[H1(i)] & 1 ? mm : nn); 2723 } 2724 } 2725 2726 void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc) 2727 { 2728 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2729 uint64_t *d = vd; 2730 uint8_t *pg = vg; 2731 2732 val = dup_const(MO_8, val); 2733 for (i = 0; i < opr_sz; i += 1) { 2734 d[i] = val & expand_pred_b(pg[H1(i)]); 2735 } 2736 } 2737 2738 void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc) 2739 { 2740 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2741 uint64_t *d = vd; 2742 uint8_t *pg = vg; 2743 2744 val = dup_const(MO_16, val); 2745 for (i = 0; i < opr_sz; i += 1) { 2746 d[i] = val & expand_pred_h(pg[H1(i)]); 2747 } 2748 } 2749 2750 void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc) 2751 { 2752 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2753 uint64_t *d = vd; 2754 uint8_t *pg = vg; 2755 2756 val = dup_const(MO_32, val); 2757 for (i = 0; i < opr_sz; i += 1) { 2758 d[i] = val & expand_pred_s(pg[H1(i)]); 2759 } 2760 } 2761 2762 void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc) 2763 { 2764 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2765 uint64_t *d = vd; 2766 uint8_t *pg = vg; 2767 2768 for (i = 0; i < opr_sz; i += 1) { 2769 d[i] = (pg[H1(i)] & 1 ? val : 0); 2770 } 2771 } 2772 2773 /* Big-endian hosts need to frob the byte indices. If the copy 2774 * happens to be 8-byte aligned, then no frobbing necessary. 
2775 */ 2776 static void swap_memmove(void *vd, void *vs, size_t n) 2777 { 2778 uintptr_t d = (uintptr_t)vd; 2779 uintptr_t s = (uintptr_t)vs; 2780 uintptr_t o = (d | s | n) & 7; 2781 size_t i; 2782 2783 #if !HOST_BIG_ENDIAN 2784 o = 0; 2785 #endif 2786 switch (o) { 2787 case 0: 2788 memmove(vd, vs, n); 2789 break; 2790 2791 case 4: 2792 if (d < s || d >= s + n) { 2793 for (i = 0; i < n; i += 4) { 2794 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i); 2795 } 2796 } else { 2797 for (i = n; i > 0; ) { 2798 i -= 4; 2799 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i); 2800 } 2801 } 2802 break; 2803 2804 case 2: 2805 case 6: 2806 if (d < s || d >= s + n) { 2807 for (i = 0; i < n; i += 2) { 2808 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i); 2809 } 2810 } else { 2811 for (i = n; i > 0; ) { 2812 i -= 2; 2813 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i); 2814 } 2815 } 2816 break; 2817 2818 default: 2819 if (d < s || d >= s + n) { 2820 for (i = 0; i < n; i++) { 2821 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i); 2822 } 2823 } else { 2824 for (i = n; i > 0; ) { 2825 i -= 1; 2826 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i); 2827 } 2828 } 2829 break; 2830 } 2831 } 2832 2833 /* Similarly for memset of 0. */ 2834 static void swap_memzero(void *vd, size_t n) 2835 { 2836 uintptr_t d = (uintptr_t)vd; 2837 uintptr_t o = (d | n) & 7; 2838 size_t i; 2839 2840 /* Usually, the first bit of a predicate is set, so N is 0. */ 2841 if (likely(n == 0)) { 2842 return; 2843 } 2844 2845 #if !HOST_BIG_ENDIAN 2846 o = 0; 2847 #endif 2848 switch (o) { 2849 case 0: 2850 memset(vd, 0, n); 2851 break; 2852 2853 case 4: 2854 for (i = 0; i < n; i += 4) { 2855 *(uint32_t *)H1_4(d + i) = 0; 2856 } 2857 break; 2858 2859 case 2: 2860 case 6: 2861 for (i = 0; i < n; i += 2) { 2862 *(uint16_t *)H1_2(d + i) = 0; 2863 } 2864 break; 2865 2866 default: 2867 for (i = 0; i < n; i++) { 2868 *(uint8_t *)H1(d + i) = 0; 2869 } 2870 break; 2871 } 2872 } 2873 2874 void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc) 2875 { 2876 intptr_t opr_sz = simd_oprsz(desc); 2877 size_t n_ofs = simd_data(desc); 2878 size_t n_siz = opr_sz - n_ofs; 2879 2880 if (vd != vm) { 2881 swap_memmove(vd, vn + n_ofs, n_siz); 2882 swap_memmove(vd + n_siz, vm, n_ofs); 2883 } else if (vd != vn) { 2884 swap_memmove(vd + n_siz, vd, n_ofs); 2885 swap_memmove(vd, vn + n_ofs, n_siz); 2886 } else { 2887 /* vd == vn == vm. Need temp space. 
*/ 2888 ARMVectorReg tmp; 2889 swap_memmove(&tmp, vm, n_ofs); 2890 swap_memmove(vd, vd + n_ofs, n_siz); 2891 memcpy(vd + n_siz, &tmp, n_ofs); 2892 } 2893 } 2894 2895 #define DO_INSR(NAME, TYPE, H) \ 2896 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \ 2897 { \ 2898 intptr_t opr_sz = simd_oprsz(desc); \ 2899 swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \ 2900 *(TYPE *)(vd + H(0)) = val; \ 2901 } 2902 2903 DO_INSR(sve_insr_b, uint8_t, H1) 2904 DO_INSR(sve_insr_h, uint16_t, H1_2) 2905 DO_INSR(sve_insr_s, uint32_t, H1_4) 2906 DO_INSR(sve_insr_d, uint64_t, H1_8) 2907 2908 #undef DO_INSR 2909 2910 void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc) 2911 { 2912 intptr_t i, j, opr_sz = simd_oprsz(desc); 2913 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { 2914 uint64_t f = *(uint64_t *)(vn + i); 2915 uint64_t b = *(uint64_t *)(vn + j); 2916 *(uint64_t *)(vd + i) = bswap64(b); 2917 *(uint64_t *)(vd + j) = bswap64(f); 2918 } 2919 } 2920 2921 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc) 2922 { 2923 intptr_t i, j, opr_sz = simd_oprsz(desc); 2924 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { 2925 uint64_t f = *(uint64_t *)(vn + i); 2926 uint64_t b = *(uint64_t *)(vn + j); 2927 *(uint64_t *)(vd + i) = hswap64(b); 2928 *(uint64_t *)(vd + j) = hswap64(f); 2929 } 2930 } 2931 2932 void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc) 2933 { 2934 intptr_t i, j, opr_sz = simd_oprsz(desc); 2935 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { 2936 uint64_t f = *(uint64_t *)(vn + i); 2937 uint64_t b = *(uint64_t *)(vn + j); 2938 *(uint64_t *)(vd + i) = rol64(b, 32); 2939 *(uint64_t *)(vd + j) = rol64(f, 32); 2940 } 2941 } 2942 2943 void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc) 2944 { 2945 intptr_t i, j, opr_sz = simd_oprsz(desc); 2946 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { 2947 uint64_t f = *(uint64_t *)(vn + i); 2948 uint64_t b = *(uint64_t *)(vn + j); 2949 *(uint64_t *)(vd + i) = b; 2950 *(uint64_t *)(vd + j) = f; 2951 } 2952 } 2953 2954 typedef void tb_impl_fn(void *, void *, void *, void *, uintptr_t, bool); 2955 2956 static inline void do_tbl1(void *vd, void *vn, void *vm, uint32_t desc, 2957 bool is_tbx, tb_impl_fn *fn) 2958 { 2959 ARMVectorReg scratch; 2960 uintptr_t oprsz = simd_oprsz(desc); 2961 2962 if (unlikely(vd == vn)) { 2963 vn = memcpy(&scratch, vn, oprsz); 2964 } 2965 2966 fn(vd, vn, NULL, vm, oprsz, is_tbx); 2967 } 2968 2969 static inline void do_tbl2(void *vd, void *vn0, void *vn1, void *vm, 2970 uint32_t desc, bool is_tbx, tb_impl_fn *fn) 2971 { 2972 ARMVectorReg scratch; 2973 uintptr_t oprsz = simd_oprsz(desc); 2974 2975 if (unlikely(vd == vn0)) { 2976 vn0 = memcpy(&scratch, vn0, oprsz); 2977 if (vd == vn1) { 2978 vn1 = vn0; 2979 } 2980 } else if (unlikely(vd == vn1)) { 2981 vn1 = memcpy(&scratch, vn1, oprsz); 2982 } 2983 2984 fn(vd, vn0, vn1, vm, oprsz, is_tbx); 2985 } 2986 2987 #define DO_TB(SUFF, TYPE, H) \ 2988 static inline void do_tb_##SUFF(void *vd, void *vt0, void *vt1, \ 2989 void *vm, uintptr_t oprsz, bool is_tbx) \ 2990 { \ 2991 TYPE *d = vd, *tbl0 = vt0, *tbl1 = vt1, *indexes = vm; \ 2992 uintptr_t i, nelem = oprsz / sizeof(TYPE); \ 2993 for (i = 0; i < nelem; ++i) { \ 2994 TYPE index = indexes[H1(i)], val = 0; \ 2995 if (index < nelem) { \ 2996 val = tbl0[H(index)]; \ 2997 } else { \ 2998 index -= nelem; \ 2999 if (tbl1 && index < nelem) { \ 3000 val = tbl1[H(index)]; \ 3001 } else if (is_tbx) { \ 3002 continue; \ 3003 } \ 3004 } \ 3005 
d[H(i)] = val; \ 3006 } \ 3007 } \ 3008 void HELPER(sve_tbl_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \ 3009 { \ 3010 do_tbl1(vd, vn, vm, desc, false, do_tb_##SUFF); \ 3011 } \ 3012 void HELPER(sve2_tbl_##SUFF)(void *vd, void *vn0, void *vn1, \ 3013 void *vm, uint32_t desc) \ 3014 { \ 3015 do_tbl2(vd, vn0, vn1, vm, desc, false, do_tb_##SUFF); \ 3016 } \ 3017 void HELPER(sve2_tbx_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \ 3018 { \ 3019 do_tbl1(vd, vn, vm, desc, true, do_tb_##SUFF); \ 3020 } 3021 3022 DO_TB(b, uint8_t, H1) 3023 DO_TB(h, uint16_t, H2) 3024 DO_TB(s, uint32_t, H4) 3025 DO_TB(d, uint64_t, H8) 3026 3027 #undef DO_TB 3028 3029 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \ 3030 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 3031 { \ 3032 intptr_t i, opr_sz = simd_oprsz(desc); \ 3033 TYPED *d = vd; \ 3034 TYPES *n = vn; \ 3035 ARMVectorReg tmp; \ 3036 if (unlikely(vn - vd < opr_sz)) { \ 3037 n = memcpy(&tmp, n, opr_sz / 2); \ 3038 } \ 3039 for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \ 3040 d[HD(i)] = n[HS(i)]; \ 3041 } \ 3042 } 3043 3044 DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1) 3045 DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2) 3046 DO_UNPK(sve_sunpk_d, int64_t, int32_t, H8, H4) 3047 3048 DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1) 3049 DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2) 3050 DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, H8, H4) 3051 3052 #undef DO_UNPK 3053 3054 /* Mask of bits included in the even numbered predicates of width esz. 3055 * We also use this for expand_bits/compress_bits, and so extend the 3056 * same pattern out to 16-bit units. 3057 */ 3058 static const uint64_t even_bit_esz_masks[5] = { 3059 0x5555555555555555ull, 3060 0x3333333333333333ull, 3061 0x0f0f0f0f0f0f0f0full, 3062 0x00ff00ff00ff00ffull, 3063 0x0000ffff0000ffffull, 3064 }; 3065 3066 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits. 3067 * For N==0, this corresponds to the operation that in qemu/bitops.h 3068 * we call half_shuffle64; this algorithm is from Hacker's Delight, 3069 * section 7-2 Shuffling Bits. 3070 */ 3071 static uint64_t expand_bits(uint64_t x, int n) 3072 { 3073 int i; 3074 3075 x &= 0xffffffffu; 3076 for (i = 4; i >= n; i--) { 3077 int sh = 1 << i; 3078 x = ((x << sh) | x) & even_bit_esz_masks[i]; 3079 } 3080 return x; 3081 } 3082 3083 /* Compress units of 2**(N+1) bits to units of 2**N bits. 3084 * For N==0, this corresponds to the operation that in qemu/bitops.h 3085 * we call half_unshuffle64; this algorithm is from Hacker's Delight, 3086 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle. 
3087 */ 3088 static uint64_t compress_bits(uint64_t x, int n) 3089 { 3090 int i; 3091 3092 for (i = n; i <= 4; i++) { 3093 int sh = 1 << i; 3094 x &= even_bit_esz_masks[i]; 3095 x = (x >> sh) | x; 3096 } 3097 return x & 0xffffffffu; 3098 } 3099 3100 void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc) 3101 { 3102 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3103 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3104 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA); 3105 int esize = 1 << esz; 3106 uint64_t *d = vd; 3107 intptr_t i; 3108 3109 if (oprsz <= 8) { 3110 uint64_t nn = *(uint64_t *)vn; 3111 uint64_t mm = *(uint64_t *)vm; 3112 int half = 4 * oprsz; 3113 3114 nn = extract64(nn, high * half, half); 3115 mm = extract64(mm, high * half, half); 3116 nn = expand_bits(nn, esz); 3117 mm = expand_bits(mm, esz); 3118 d[0] = nn | (mm << esize); 3119 } else { 3120 ARMPredicateReg tmp; 3121 3122 /* We produce output faster than we consume input. 3123 Therefore we must be mindful of possible overlap. */ 3124 if (vd == vn) { 3125 vn = memcpy(&tmp, vn, oprsz); 3126 if (vd == vm) { 3127 vm = vn; 3128 } 3129 } else if (vd == vm) { 3130 vm = memcpy(&tmp, vm, oprsz); 3131 } 3132 if (high) { 3133 high = oprsz >> 1; 3134 } 3135 3136 if ((oprsz & 7) == 0) { 3137 uint32_t *n = vn, *m = vm; 3138 high >>= 2; 3139 3140 for (i = 0; i < oprsz / 8; i++) { 3141 uint64_t nn = n[H4(high + i)]; 3142 uint64_t mm = m[H4(high + i)]; 3143 3144 nn = expand_bits(nn, esz); 3145 mm = expand_bits(mm, esz); 3146 d[i] = nn | (mm << esize); 3147 } 3148 } else { 3149 uint8_t *n = vn, *m = vm; 3150 uint16_t *d16 = vd; 3151 3152 for (i = 0; i < oprsz / 2; i++) { 3153 uint16_t nn = n[H1(high + i)]; 3154 uint16_t mm = m[H1(high + i)]; 3155 3156 nn = expand_bits(nn, esz); 3157 mm = expand_bits(mm, esz); 3158 d16[H2(i)] = nn | (mm << esize); 3159 } 3160 } 3161 } 3162 } 3163 3164 void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc) 3165 { 3166 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3167 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3168 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz; 3169 uint64_t *d = vd, *n = vn, *m = vm; 3170 uint64_t l, h; 3171 intptr_t i; 3172 3173 if (oprsz <= 8) { 3174 l = compress_bits(n[0] >> odd, esz); 3175 h = compress_bits(m[0] >> odd, esz); 3176 d[0] = l | (h << (4 * oprsz)); 3177 } else { 3178 ARMPredicateReg tmp_m; 3179 intptr_t oprsz_16 = oprsz / 16; 3180 3181 if ((vm - vd) < (uintptr_t)oprsz) { 3182 m = memcpy(&tmp_m, vm, oprsz); 3183 } 3184 3185 for (i = 0; i < oprsz_16; i++) { 3186 l = n[2 * i + 0]; 3187 h = n[2 * i + 1]; 3188 l = compress_bits(l >> odd, esz); 3189 h = compress_bits(h >> odd, esz); 3190 d[i] = l | (h << 32); 3191 } 3192 3193 /* 3194 * For VL which is not a multiple of 512, the results from M do not 3195 * align nicely with the uint64_t for D. Put the aligned results 3196 * from M into TMP_M and then copy it into place afterward. 
3197 */ 3198 if (oprsz & 15) { 3199 int final_shift = (oprsz & 15) * 2; 3200 3201 l = n[2 * i + 0]; 3202 h = n[2 * i + 1]; 3203 l = compress_bits(l >> odd, esz); 3204 h = compress_bits(h >> odd, esz); 3205 d[i] = l | (h << final_shift); 3206 3207 for (i = 0; i < oprsz_16; i++) { 3208 l = m[2 * i + 0]; 3209 h = m[2 * i + 1]; 3210 l = compress_bits(l >> odd, esz); 3211 h = compress_bits(h >> odd, esz); 3212 tmp_m.p[i] = l | (h << 32); 3213 } 3214 l = m[2 * i + 0]; 3215 h = m[2 * i + 1]; 3216 l = compress_bits(l >> odd, esz); 3217 h = compress_bits(h >> odd, esz); 3218 tmp_m.p[i] = l | (h << final_shift); 3219 3220 swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2); 3221 } else { 3222 for (i = 0; i < oprsz_16; i++) { 3223 l = m[2 * i + 0]; 3224 h = m[2 * i + 1]; 3225 l = compress_bits(l >> odd, esz); 3226 h = compress_bits(h >> odd, esz); 3227 d[oprsz_16 + i] = l | (h << 32); 3228 } 3229 } 3230 } 3231 } 3232 3233 void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc) 3234 { 3235 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3236 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3237 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA); 3238 uint64_t *d = vd, *n = vn, *m = vm; 3239 uint64_t mask; 3240 int shr, shl; 3241 intptr_t i; 3242 3243 shl = 1 << esz; 3244 shr = 0; 3245 mask = even_bit_esz_masks[esz]; 3246 if (odd) { 3247 mask <<= shl; 3248 shr = shl; 3249 shl = 0; 3250 } 3251 3252 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) { 3253 uint64_t nn = (n[i] & mask) >> shr; 3254 uint64_t mm = (m[i] & mask) << shl; 3255 d[i] = nn + mm; 3256 } 3257 } 3258 3259 /* Reverse units of 2**N bits. */ 3260 static uint64_t reverse_bits_64(uint64_t x, int n) 3261 { 3262 int i, sh; 3263 3264 x = bswap64(x); 3265 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) { 3266 uint64_t mask = even_bit_esz_masks[i]; 3267 x = ((x & mask) << sh) | ((x >> sh) & mask); 3268 } 3269 return x; 3270 } 3271 3272 static uint8_t reverse_bits_8(uint8_t x, int n) 3273 { 3274 static const uint8_t mask[3] = { 0x55, 0x33, 0x0f }; 3275 int i, sh; 3276 3277 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) { 3278 x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]); 3279 } 3280 return x; 3281 } 3282 3283 void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc) 3284 { 3285 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3286 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3287 intptr_t i, oprsz_2 = oprsz / 2; 3288 3289 if (oprsz <= 8) { 3290 uint64_t l = *(uint64_t *)vn; 3291 l = reverse_bits_64(l << (64 - 8 * oprsz), esz); 3292 *(uint64_t *)vd = l; 3293 } else if ((oprsz & 15) == 0) { 3294 for (i = 0; i < oprsz_2; i += 8) { 3295 intptr_t ih = oprsz - 8 - i; 3296 uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz); 3297 uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz); 3298 *(uint64_t *)(vd + i) = h; 3299 *(uint64_t *)(vd + ih) = l; 3300 } 3301 } else { 3302 for (i = 0; i < oprsz_2; i += 1) { 3303 intptr_t il = H1(i); 3304 intptr_t ih = H1(oprsz - 1 - i); 3305 uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz); 3306 uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz); 3307 *(uint8_t *)(vd + il) = h; 3308 *(uint8_t *)(vd + ih) = l; 3309 } 3310 } 3311 } 3312 3313 void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc) 3314 { 3315 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3316 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA); 3317 uint64_t *d = vd; 3318 intptr_t i; 3319 3320 if (oprsz <= 8) { 3321 uint64_t nn = *(uint64_t *)vn; 3322 int half = 4 * oprsz; 3323 3324 nn = 
extract64(nn, high * half, half); 3325 nn = expand_bits(nn, 0); 3326 d[0] = nn; 3327 } else { 3328 ARMPredicateReg tmp_n; 3329 3330 /* We produce output faster than we consume input. 3331 Therefore we must be mindful of possible overlap. */ 3332 if ((vn - vd) < (uintptr_t)oprsz) { 3333 vn = memcpy(&tmp_n, vn, oprsz); 3334 } 3335 if (high) { 3336 high = oprsz >> 1; 3337 } 3338 3339 if ((oprsz & 7) == 0) { 3340 uint32_t *n = vn; 3341 high >>= 2; 3342 3343 for (i = 0; i < oprsz / 8; i++) { 3344 uint64_t nn = n[H4(high + i)]; 3345 d[i] = expand_bits(nn, 0); 3346 } 3347 } else { 3348 uint16_t *d16 = vd; 3349 uint8_t *n = vn; 3350 3351 for (i = 0; i < oprsz / 2; i++) { 3352 uint16_t nn = n[H1(high + i)]; 3353 d16[H2(i)] = expand_bits(nn, 0); 3354 } 3355 } 3356 } 3357 } 3358 3359 #define DO_ZIP(NAME, TYPE, H) \ 3360 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 3361 { \ 3362 intptr_t oprsz = simd_oprsz(desc); \ 3363 intptr_t odd_ofs = simd_data(desc); \ 3364 intptr_t i, oprsz_2 = oprsz / 2; \ 3365 ARMVectorReg tmp_n, tmp_m; \ 3366 /* We produce output faster than we consume input. \ 3367 Therefore we must be mindful of possible overlap. */ \ 3368 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \ 3369 vn = memcpy(&tmp_n, vn, oprsz); \ 3370 } \ 3371 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \ 3372 vm = memcpy(&tmp_m, vm, oprsz); \ 3373 } \ 3374 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \ 3375 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + odd_ofs + H(i)); \ 3376 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = \ 3377 *(TYPE *)(vm + odd_ofs + H(i)); \ 3378 } \ 3379 if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \ 3380 memset(vd + oprsz - 16, 0, 16); \ 3381 } \ 3382 } 3383 3384 DO_ZIP(sve_zip_b, uint8_t, H1) 3385 DO_ZIP(sve_zip_h, uint16_t, H1_2) 3386 DO_ZIP(sve_zip_s, uint32_t, H1_4) 3387 DO_ZIP(sve_zip_d, uint64_t, H1_8) 3388 DO_ZIP(sve2_zip_q, Int128, ) 3389 3390 #define DO_UZP(NAME, TYPE, H) \ 3391 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 3392 { \ 3393 intptr_t oprsz = simd_oprsz(desc); \ 3394 intptr_t odd_ofs = simd_data(desc); \ 3395 intptr_t i, p; \ 3396 ARMVectorReg tmp_m; \ 3397 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \ 3398 vm = memcpy(&tmp_m, vm, oprsz); \ 3399 } \ 3400 i = 0, p = odd_ofs; \ 3401 do { \ 3402 *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(p)); \ 3403 i += sizeof(TYPE), p += 2 * sizeof(TYPE); \ 3404 } while (p < oprsz); \ 3405 p -= oprsz; \ 3406 do { \ 3407 *(TYPE *)(vd + H(i)) = *(TYPE *)(vm + H(p)); \ 3408 i += sizeof(TYPE), p += 2 * sizeof(TYPE); \ 3409 } while (p < oprsz); \ 3410 tcg_debug_assert(i == oprsz); \ 3411 } 3412 3413 DO_UZP(sve_uzp_b, uint8_t, H1) 3414 DO_UZP(sve_uzp_h, uint16_t, H1_2) 3415 DO_UZP(sve_uzp_s, uint32_t, H1_4) 3416 DO_UZP(sve_uzp_d, uint64_t, H1_8) 3417 DO_UZP(sve2_uzp_q, Int128, ) 3418 3419 #define DO_TRN(NAME, TYPE, H) \ 3420 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 3421 { \ 3422 intptr_t oprsz = simd_oprsz(desc); \ 3423 intptr_t odd_ofs = simd_data(desc); \ 3424 intptr_t i; \ 3425 for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \ 3426 TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \ 3427 TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \ 3428 *(TYPE *)(vd + H(i + 0)) = ae; \ 3429 *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \ 3430 } \ 3431 if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \ 3432 memset(vd + oprsz - 16, 0, 16); \ 3433 } \ 3434 } 3435 3436 DO_TRN(sve_trn_b, uint8_t, H1) 3437 DO_TRN(sve_trn_h, uint16_t, H1_2) 3438 DO_TRN(sve_trn_s, uint32_t, H1_4) 3439 DO_TRN(sve_trn_d, 
uint64_t, H1_8) 3440 DO_TRN(sve2_trn_q, Int128, ) 3441 3442 #undef DO_ZIP 3443 #undef DO_UZP 3444 #undef DO_TRN 3445 3446 void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc) 3447 { 3448 intptr_t i, j, opr_sz = simd_oprsz(desc) / 4; 3449 uint32_t *d = vd, *n = vn; 3450 uint8_t *pg = vg; 3451 3452 for (i = j = 0; i < opr_sz; i++) { 3453 if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) { 3454 d[H4(j)] = n[H4(i)]; 3455 j++; 3456 } 3457 } 3458 for (; j < opr_sz; j++) { 3459 d[H4(j)] = 0; 3460 } 3461 } 3462 3463 void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc) 3464 { 3465 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8; 3466 uint64_t *d = vd, *n = vn; 3467 uint8_t *pg = vg; 3468 3469 for (i = j = 0; i < opr_sz; i++) { 3470 if (pg[H1(i)] & 1) { 3471 d[j] = n[i]; 3472 j++; 3473 } 3474 } 3475 for (; j < opr_sz; j++) { 3476 d[j] = 0; 3477 } 3478 } 3479 3480 /* Similar to the ARM LastActiveElement pseudocode function, except the 3481 * result is multiplied by the element size. This includes the not found 3482 * indication; e.g. not found for esz=3 is -8. 3483 */ 3484 int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc) 3485 { 3486 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); 3487 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3488 3489 return last_active_element(vg, words, esz); 3490 } 3491 3492 void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) 3493 { 3494 intptr_t opr_sz = simd_oprsz(desc) / 8; 3495 int esz = simd_data(desc); 3496 uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz]; 3497 intptr_t i, first_i, last_i; 3498 ARMVectorReg tmp; 3499 3500 first_i = last_i = 0; 3501 first_g = last_g = 0; 3502 3503 /* Find the extent of the active elements within VG. 
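     * The predicate bit for the element at byte offset K of the vector is
     * bit K, so the bit indices computed below double as byte offsets into
     * ZN.  For example, with esz == 2 and only elements 1 and 3 active, the
     * guard bits sit at bit positions 4 and 12: first_i == 4, last_i == 12,
     * len == 12 - 4 + 4 == 12, and bytes 4..15 of ZN (elements 1..3,
     * including the inactive element 2) are copied before the rest of the
     * destination is filled from ZM.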
*/ 3504 for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) { 3505 pg = *(uint64_t *)(vg + i) & mask; 3506 if (pg) { 3507 if (last_g == 0) { 3508 last_g = pg; 3509 last_i = i; 3510 } 3511 first_g = pg; 3512 first_i = i; 3513 } 3514 } 3515 3516 len = 0; 3517 if (first_g != 0) { 3518 first_i = first_i * 8 + ctz64(first_g); 3519 last_i = last_i * 8 + 63 - clz64(last_g); 3520 len = last_i - first_i + (1 << esz); 3521 if (vd == vm) { 3522 vm = memcpy(&tmp, vm, opr_sz * 8); 3523 } 3524 swap_memmove(vd, vn + first_i, len); 3525 } 3526 swap_memmove(vd + len, vm, opr_sz * 8 - len); 3527 } 3528 3529 void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm, 3530 void *vg, uint32_t desc) 3531 { 3532 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 3533 uint64_t *d = vd, *n = vn, *m = vm; 3534 uint8_t *pg = vg; 3535 3536 for (i = 0; i < opr_sz; i += 1) { 3537 uint64_t nn = n[i], mm = m[i]; 3538 uint64_t pp = expand_pred_b(pg[H1(i)]); 3539 d[i] = (nn & pp) | (mm & ~pp); 3540 } 3541 } 3542 3543 void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm, 3544 void *vg, uint32_t desc) 3545 { 3546 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 3547 uint64_t *d = vd, *n = vn, *m = vm; 3548 uint8_t *pg = vg; 3549 3550 for (i = 0; i < opr_sz; i += 1) { 3551 uint64_t nn = n[i], mm = m[i]; 3552 uint64_t pp = expand_pred_h(pg[H1(i)]); 3553 d[i] = (nn & pp) | (mm & ~pp); 3554 } 3555 } 3556 3557 void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm, 3558 void *vg, uint32_t desc) 3559 { 3560 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 3561 uint64_t *d = vd, *n = vn, *m = vm; 3562 uint8_t *pg = vg; 3563 3564 for (i = 0; i < opr_sz; i += 1) { 3565 uint64_t nn = n[i], mm = m[i]; 3566 uint64_t pp = expand_pred_s(pg[H1(i)]); 3567 d[i] = (nn & pp) | (mm & ~pp); 3568 } 3569 } 3570 3571 void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm, 3572 void *vg, uint32_t desc) 3573 { 3574 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 3575 uint64_t *d = vd, *n = vn, *m = vm; 3576 uint8_t *pg = vg; 3577 3578 for (i = 0; i < opr_sz; i += 1) { 3579 uint64_t nn = n[i], mm = m[i]; 3580 d[i] = (pg[H1(i)] & 1 ? nn : mm); 3581 } 3582 } 3583 3584 void HELPER(sve_sel_zpzz_q)(void *vd, void *vn, void *vm, 3585 void *vg, uint32_t desc) 3586 { 3587 intptr_t i, opr_sz = simd_oprsz(desc) / 16; 3588 Int128 *d = vd, *n = vn, *m = vm; 3589 uint16_t *pg = vg; 3590 3591 for (i = 0; i < opr_sz; i += 1) { 3592 d[i] = (pg[H2(i)] & 1 ? n : m)[i]; 3593 } 3594 } 3595 3596 /* Two operand comparison controlled by a predicate. 3597 * ??? It is very tempting to want to be able to expand this inline 3598 * with x86 instructions, e.g. 3599 * 3600 * vcmpeqw zm, zn, %ymm0 3601 * vpmovmskb %ymm0, %eax 3602 * and $0x5555, %eax 3603 * and pg, %eax 3604 * 3605 * or even aarch64, e.g. 3606 * 3607 * // mask = 4000 1000 0400 0100 0040 0010 0004 0001 3608 * cmeq v0.8h, zn, zm 3609 * and v0.8h, v0.8h, mask 3610 * addv h0, v0.8h 3611 * and v0.8b, pg 3612 * 3613 * However, coming up with an abstraction that allows vector inputs and 3614 * a scalar output, and also handles the byte-ordering of sub-uint64_t 3615 * scalar outputs, is tricky. 
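 *
 * The portable code below instead walks the vector backward in 64-byte
 * blocks: each block yields one uint64_t of the destination predicate
 * (one result bit per byte position, masked down to the element size),
 * and each completed word is fed to iter_predtest_bwd so the NZCV flags
 * are accumulated in the same pass.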
3616 */ 3617 #define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \ 3618 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 3619 { \ 3620 intptr_t opr_sz = simd_oprsz(desc); \ 3621 uint32_t flags = PREDTEST_INIT; \ 3622 intptr_t i = opr_sz; \ 3623 do { \ 3624 uint64_t out = 0, pg; \ 3625 do { \ 3626 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 3627 TYPE nn = *(TYPE *)(vn + H(i)); \ 3628 TYPE mm = *(TYPE *)(vm + H(i)); \ 3629 out |= nn OP mm; \ 3630 } while (i & 63); \ 3631 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \ 3632 out &= pg; \ 3633 *(uint64_t *)(vd + (i >> 3)) = out; \ 3634 flags = iter_predtest_bwd(out, pg, flags); \ 3635 } while (i > 0); \ 3636 return flags; \ 3637 } 3638 3639 #define DO_CMP_PPZZ_B(NAME, TYPE, OP) \ 3640 DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull) 3641 #define DO_CMP_PPZZ_H(NAME, TYPE, OP) \ 3642 DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull) 3643 #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \ 3644 DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull) 3645 #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \ 3646 DO_CMP_PPZZ(NAME, TYPE, OP, H1_8, 0x0101010101010101ull) 3647 3648 DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==) 3649 DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==) 3650 DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==) 3651 DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==) 3652 3653 DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=) 3654 DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=) 3655 DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=) 3656 DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=) 3657 3658 DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >) 3659 DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >) 3660 DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >) 3661 DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >) 3662 3663 DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=) 3664 DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=) 3665 DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=) 3666 DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=) 3667 3668 DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >) 3669 DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >) 3670 DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >) 3671 DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >) 3672 3673 DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=) 3674 DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=) 3675 DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=) 3676 DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=) 3677 3678 #undef DO_CMP_PPZZ_B 3679 #undef DO_CMP_PPZZ_H 3680 #undef DO_CMP_PPZZ_S 3681 #undef DO_CMP_PPZZ_D 3682 #undef DO_CMP_PPZZ 3683 3684 /* Similar, but the second source is "wide". 
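 * Each 64-bit element of ZM is compared against every narrow element of
 * ZN that lies within the same 64-bit span.  Ignoring the host-endian
 * H() adjustment and the flags computation, the expansion behaves
 * roughly like
 *
 *    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {
 *        TYPEW mm = *(TYPEW *)(vm + (i & -8));
 *        TYPE nn = *(TYPE *)(vn + i);
 *        <predicate bit i of PD> = <predicate bit i of PG> && (nn OP mm);
 *    }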
*/ 3685 #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \ 3686 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 3687 { \ 3688 intptr_t opr_sz = simd_oprsz(desc); \ 3689 uint32_t flags = PREDTEST_INIT; \ 3690 intptr_t i = opr_sz; \ 3691 do { \ 3692 uint64_t out = 0, pg; \ 3693 do { \ 3694 TYPEW mm = *(TYPEW *)(vm + i - 8); \ 3695 do { \ 3696 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 3697 TYPE nn = *(TYPE *)(vn + H(i)); \ 3698 out |= nn OP mm; \ 3699 } while (i & 7); \ 3700 } while (i & 63); \ 3701 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \ 3702 out &= pg; \ 3703 *(uint64_t *)(vd + (i >> 3)) = out; \ 3704 flags = iter_predtest_bwd(out, pg, flags); \ 3705 } while (i > 0); \ 3706 return flags; \ 3707 } 3708 3709 #define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \ 3710 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull) 3711 #define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \ 3712 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull) 3713 #define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \ 3714 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull) 3715 3716 DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t, uint64_t, ==) 3717 DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==) 3718 DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==) 3719 3720 DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t, uint64_t, !=) 3721 DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=) 3722 DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=) 3723 3724 DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >) 3725 DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >) 3726 DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >) 3727 3728 DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=) 3729 DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=) 3730 DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=) 3731 3732 DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >) 3733 DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >) 3734 DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >) 3735 3736 DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=) 3737 DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=) 3738 DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=) 3739 3740 DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <) 3741 DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <) 3742 DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <) 3743 3744 DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=) 3745 DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=) 3746 DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=) 3747 3748 DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <) 3749 DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <) 3750 DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <) 3751 3752 DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=) 3753 DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=) 3754 DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=) 3755 3756 #undef DO_CMP_PPZW_B 3757 #undef DO_CMP_PPZW_H 3758 #undef DO_CMP_PPZW_S 3759 #undef DO_CMP_PPZW 3760 3761 /* Similar, but the second source is immediate. 
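 * The immediate is carried in the simd_data field of DESC and is
 * converted to the element type by the initialization of MM, so the
 * same expander serves both the signed and unsigned forms.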
*/ 3762 #define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \ 3763 uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 3764 { \ 3765 intptr_t opr_sz = simd_oprsz(desc); \ 3766 uint32_t flags = PREDTEST_INIT; \ 3767 TYPE mm = simd_data(desc); \ 3768 intptr_t i = opr_sz; \ 3769 do { \ 3770 uint64_t out = 0, pg; \ 3771 do { \ 3772 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 3773 TYPE nn = *(TYPE *)(vn + H(i)); \ 3774 out |= nn OP mm; \ 3775 } while (i & 63); \ 3776 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \ 3777 out &= pg; \ 3778 *(uint64_t *)(vd + (i >> 3)) = out; \ 3779 flags = iter_predtest_bwd(out, pg, flags); \ 3780 } while (i > 0); \ 3781 return flags; \ 3782 } 3783 3784 #define DO_CMP_PPZI_B(NAME, TYPE, OP) \ 3785 DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull) 3786 #define DO_CMP_PPZI_H(NAME, TYPE, OP) \ 3787 DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull) 3788 #define DO_CMP_PPZI_S(NAME, TYPE, OP) \ 3789 DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull) 3790 #define DO_CMP_PPZI_D(NAME, TYPE, OP) \ 3791 DO_CMP_PPZI(NAME, TYPE, OP, H1_8, 0x0101010101010101ull) 3792 3793 DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==) 3794 DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==) 3795 DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==) 3796 DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==) 3797 3798 DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=) 3799 DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=) 3800 DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=) 3801 DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=) 3802 3803 DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >) 3804 DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >) 3805 DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >) 3806 DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >) 3807 3808 DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=) 3809 DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=) 3810 DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=) 3811 DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=) 3812 3813 DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >) 3814 DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >) 3815 DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >) 3816 DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >) 3817 3818 DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=) 3819 DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=) 3820 DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=) 3821 DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=) 3822 3823 DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <) 3824 DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <) 3825 DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <) 3826 DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <) 3827 3828 DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=) 3829 DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=) 3830 DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=) 3831 DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=) 3832 3833 DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <) 3834 DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <) 3835 DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <) 3836 DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <) 3837 3838 DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=) 3839 DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=) 3840 DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=) 3841 DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=) 3842 3843 #undef DO_CMP_PPZI_B 3844 #undef DO_CMP_PPZI_H 3845 #undef DO_CMP_PPZI_S 3846 #undef DO_CMP_PPZI_D 3847 #undef DO_CMP_PPZI 3848 3849 /* Similar to the ARM LastActive pseudocode function. 
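 * We scan downward for the highest predicate word with any guard bit
 * set; pow2floor(pg) isolates the most significant guard bit, and the
 * result is whether the corresponding bit of VD is set.  E.g. with a
 * single word and pg == 0x14, pow2floor(pg) == 0x10, so the return
 * value is bit 4 of VD.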
*/ 3850 static bool last_active_pred(void *vd, void *vg, intptr_t oprsz) 3851 { 3852 intptr_t i; 3853 3854 for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) { 3855 uint64_t pg = *(uint64_t *)(vg + i); 3856 if (pg) { 3857 return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0; 3858 } 3859 } 3860 return 0; 3861 } 3862 3863 /* Compute a mask into RETB that is true for all G, up to and including 3864 * (if after) or excluding (if !after) the first G & N. 3865 * Return true if BRK found. 3866 */ 3867 static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g, 3868 bool brk, bool after) 3869 { 3870 uint64_t b; 3871 3872 if (brk) { 3873 b = 0; 3874 } else if ((g & n) == 0) { 3875 /* For all G, no N are set; break not found. */ 3876 b = g; 3877 } else { 3878 /* Break somewhere in N. Locate it. */ 3879 b = g & n; /* guard true, pred true */ 3880 b = b & -b; /* first such */ 3881 if (after) { 3882 b = b | (b - 1); /* break after same */ 3883 } else { 3884 b = b - 1; /* break before same */ 3885 } 3886 brk = true; 3887 } 3888 3889 *retb = b; 3890 return brk; 3891 } 3892 3893 /* Compute a zeroing BRK. */ 3894 static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g, 3895 intptr_t oprsz, bool after) 3896 { 3897 bool brk = false; 3898 intptr_t i; 3899 3900 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) { 3901 uint64_t this_b, this_g = g[i]; 3902 3903 brk = compute_brk(&this_b, n[i], this_g, brk, after); 3904 d[i] = this_b & this_g; 3905 } 3906 } 3907 3908 /* Likewise, but also compute flags. */ 3909 static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g, 3910 intptr_t oprsz, bool after) 3911 { 3912 uint32_t flags = PREDTEST_INIT; 3913 bool brk = false; 3914 intptr_t i; 3915 3916 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) { 3917 uint64_t this_b, this_d, this_g = g[i]; 3918 3919 brk = compute_brk(&this_b, n[i], this_g, brk, after); 3920 d[i] = this_d = this_b & this_g; 3921 flags = iter_predtest_fwd(this_d, this_g, flags); 3922 } 3923 return flags; 3924 } 3925 3926 /* Compute a merging BRK. */ 3927 static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g, 3928 intptr_t oprsz, bool after) 3929 { 3930 bool brk = false; 3931 intptr_t i; 3932 3933 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) { 3934 uint64_t this_b, this_g = g[i]; 3935 3936 brk = compute_brk(&this_b, n[i], this_g, brk, after); 3937 d[i] = (this_b & this_g) | (d[i] & ~this_g); 3938 } 3939 } 3940 3941 /* Likewise, but also compute flags. */ 3942 static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g, 3943 intptr_t oprsz, bool after) 3944 { 3945 uint32_t flags = PREDTEST_INIT; 3946 bool brk = false; 3947 intptr_t i; 3948 3949 for (i = 0; i < oprsz / 8; ++i) { 3950 uint64_t this_b, this_d = d[i], this_g = g[i]; 3951 3952 brk = compute_brk(&this_b, n[i], this_g, brk, after); 3953 d[i] = this_d = (this_b & this_g) | (this_d & ~this_g); 3954 flags = iter_predtest_fwd(this_d, this_g, flags); 3955 } 3956 return flags; 3957 } 3958 3959 static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz) 3960 { 3961 /* It is quicker to zero the whole predicate than loop on OPRSZ. 3962 * The compiler should turn this into 4 64-bit integer stores. 
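     * Returning PREDTEST_INIT is also the correct PredTest result for an
     * all-false destination: N clear, Z set, C set.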
3963 */ 3964 memset(d, 0, sizeof(ARMPredicateReg)); 3965 return PREDTEST_INIT; 3966 } 3967 3968 void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg, 3969 uint32_t pred_desc) 3970 { 3971 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3972 if (last_active_pred(vn, vg, oprsz)) { 3973 compute_brk_z(vd, vm, vg, oprsz, true); 3974 } else { 3975 do_zero(vd, oprsz); 3976 } 3977 } 3978 3979 uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg, 3980 uint32_t pred_desc) 3981 { 3982 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3983 if (last_active_pred(vn, vg, oprsz)) { 3984 return compute_brks_z(vd, vm, vg, oprsz, true); 3985 } else { 3986 return do_zero(vd, oprsz); 3987 } 3988 } 3989 3990 void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg, 3991 uint32_t pred_desc) 3992 { 3993 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3994 if (last_active_pred(vn, vg, oprsz)) { 3995 compute_brk_z(vd, vm, vg, oprsz, false); 3996 } else { 3997 do_zero(vd, oprsz); 3998 } 3999 } 4000 4001 uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg, 4002 uint32_t pred_desc) 4003 { 4004 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4005 if (last_active_pred(vn, vg, oprsz)) { 4006 return compute_brks_z(vd, vm, vg, oprsz, false); 4007 } else { 4008 return do_zero(vd, oprsz); 4009 } 4010 } 4011 4012 void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4013 { 4014 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4015 compute_brk_z(vd, vn, vg, oprsz, true); 4016 } 4017 4018 uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4019 { 4020 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4021 return compute_brks_z(vd, vn, vg, oprsz, true); 4022 } 4023 4024 void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4025 { 4026 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4027 compute_brk_z(vd, vn, vg, oprsz, false); 4028 } 4029 4030 uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4031 { 4032 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4033 return compute_brks_z(vd, vn, vg, oprsz, false); 4034 } 4035 4036 void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4037 { 4038 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4039 compute_brk_m(vd, vn, vg, oprsz, true); 4040 } 4041 4042 uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4043 { 4044 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4045 return compute_brks_m(vd, vn, vg, oprsz, true); 4046 } 4047 4048 void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4049 { 4050 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4051 compute_brk_m(vd, vn, vg, oprsz, false); 4052 } 4053 4054 uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4055 { 4056 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4057 return compute_brks_m(vd, vn, vg, oprsz, false); 4058 } 4059 4060 void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4061 { 4062 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4063 if (!last_active_pred(vn, vg, oprsz)) { 4064 do_zero(vd, oprsz); 4065 } 4066 } 4067 4068 /* As if PredTest(Ones(PL), D, esz). 
*/ 4069 static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz, 4070 uint64_t esz_mask) 4071 { 4072 uint32_t flags = PREDTEST_INIT; 4073 intptr_t i; 4074 4075 for (i = 0; i < oprsz / 8; i++) { 4076 flags = iter_predtest_fwd(d->p[i], esz_mask, flags); 4077 } 4078 if (oprsz & 7) { 4079 uint64_t mask = ~(-1ULL << (8 * (oprsz & 7))); 4080 flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags); 4081 } 4082 return flags; 4083 } 4084 4085 uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4086 { 4087 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4088 if (last_active_pred(vn, vg, oprsz)) { 4089 return predtest_ones(vd, oprsz, -1); 4090 } else { 4091 return do_zero(vd, oprsz); 4092 } 4093 } 4094 4095 uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc) 4096 { 4097 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); 4098 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 4099 uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz]; 4100 intptr_t i; 4101 4102 for (i = 0; i < words; ++i) { 4103 uint64_t t = n[i] & g[i] & mask; 4104 sum += ctpop64(t); 4105 } 4106 return sum; 4107 } 4108 4109 uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc) 4110 { 4111 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4112 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 4113 uint64_t esz_mask = pred_esz_masks[esz]; 4114 ARMPredicateReg *d = vd; 4115 uint32_t flags; 4116 intptr_t i; 4117 4118 /* Begin with a zero predicate register. */ 4119 flags = do_zero(d, oprsz); 4120 if (count == 0) { 4121 return flags; 4122 } 4123 4124 /* Set all of the requested bits. */ 4125 for (i = 0; i < count / 64; ++i) { 4126 d->p[i] = esz_mask; 4127 } 4128 if (count & 63) { 4129 d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask; 4130 } 4131 4132 return predtest_ones(d, oprsz, esz_mask); 4133 } 4134 4135 uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc) 4136 { 4137 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4138 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 4139 uint64_t esz_mask = pred_esz_masks[esz]; 4140 ARMPredicateReg *d = vd; 4141 intptr_t i, invcount, oprbits; 4142 uint64_t bits; 4143 4144 if (count == 0) { 4145 return do_zero(d, oprsz); 4146 } 4147 4148 oprbits = oprsz * 8; 4149 tcg_debug_assert(count <= oprbits); 4150 4151 bits = esz_mask; 4152 if (oprbits & 63) { 4153 bits &= MAKE_64BIT_MASK(0, oprbits & 63); 4154 } 4155 4156 invcount = oprbits - count; 4157 for (i = (oprsz - 1) / 8; i > invcount / 64; --i) { 4158 d->p[i] = bits; 4159 bits = esz_mask; 4160 } 4161 4162 d->p[i] = bits & MAKE_64BIT_MASK(invcount & 63, 64); 4163 4164 while (--i >= 0) { 4165 d->p[i] = 0; 4166 } 4167 4168 return predtest_ones(d, oprsz, esz_mask); 4169 } 4170 4171 /* Recursive reduction on a function; 4172 * C.f. the ARM ARM function ReducePredicated. 4173 * 4174 * While it would be possible to write this without the DATA temporary, 4175 * it is much simpler to process the predicate register this way. 4176 * The recursion is bounded to depth 7 (128 fp16 elements), so there's 4177 * little to gain with a more complex non-recursive form. 
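 *
 * For example, with eight elements the recursion evaluates
 *   ((e0 + e1) + (e2 + e3)) + ((e4 + e5) + (e6 + e7)),
 * a balanced pairwise tree; inactive and trailing elements have already
 * been replaced by IDENT in DATA, so every leaf is populated.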
4178 */ 4179 #define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) \ 4180 static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \ 4181 { \ 4182 if (n == 1) { \ 4183 return *data; \ 4184 } else { \ 4185 uintptr_t half = n / 2; \ 4186 TYPE lo = NAME##_reduce(data, status, half); \ 4187 TYPE hi = NAME##_reduce(data + half, status, half); \ 4188 return TYPE##_##FUNC(lo, hi, status); \ 4189 } \ 4190 } \ 4191 uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc) \ 4192 { \ 4193 uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc); \ 4194 TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \ 4195 for (i = 0; i < oprsz; ) { \ 4196 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 4197 do { \ 4198 TYPE nn = *(TYPE *)(vn + H(i)); \ 4199 *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT); \ 4200 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 4201 } while (i & 15); \ 4202 } \ 4203 for (; i < maxsz; i += sizeof(TYPE)) { \ 4204 *(TYPE *)((void *)data + i) = IDENT; \ 4205 } \ 4206 return NAME##_reduce(data, vs, maxsz / sizeof(TYPE)); \ 4207 } 4208 4209 DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero) 4210 DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero) 4211 DO_REDUCE(sve_faddv_d, float64, H1_8, add, float64_zero) 4212 4213 /* Identity is floatN_default_nan, without the function call. */ 4214 DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00) 4215 DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000) 4216 DO_REDUCE(sve_fminnmv_d, float64, H1_8, minnum, 0x7FF8000000000000ULL) 4217 4218 DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00) 4219 DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000) 4220 DO_REDUCE(sve_fmaxnmv_d, float64, H1_8, maxnum, 0x7FF8000000000000ULL) 4221 4222 DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity) 4223 DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity) 4224 DO_REDUCE(sve_fminv_d, float64, H1_8, min, float64_infinity) 4225 4226 DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity)) 4227 DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity)) 4228 DO_REDUCE(sve_fmaxv_d, float64, H1_8, max, float64_chs(float64_infinity)) 4229 4230 #undef DO_REDUCE 4231 4232 uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg, 4233 void *status, uint32_t desc) 4234 { 4235 intptr_t i = 0, opr_sz = simd_oprsz(desc); 4236 float16 result = nn; 4237 4238 do { 4239 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); 4240 do { 4241 if (pg & 1) { 4242 float16 mm = *(float16 *)(vm + H1_2(i)); 4243 result = float16_add(result, mm, status); 4244 } 4245 i += sizeof(float16), pg >>= sizeof(float16); 4246 } while (i & 15); 4247 } while (i < opr_sz); 4248 4249 return result; 4250 } 4251 4252 uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg, 4253 void *status, uint32_t desc) 4254 { 4255 intptr_t i = 0, opr_sz = simd_oprsz(desc); 4256 float32 result = nn; 4257 4258 do { 4259 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); 4260 do { 4261 if (pg & 1) { 4262 float32 mm = *(float32 *)(vm + H1_2(i)); 4263 result = float32_add(result, mm, status); 4264 } 4265 i += sizeof(float32), pg >>= sizeof(float32); 4266 } while (i & 15); 4267 } while (i < opr_sz); 4268 4269 return result; 4270 } 4271 4272 uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg, 4273 void *status, uint32_t desc) 4274 { 4275 intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8; 4276 uint64_t *m = vm; 4277 uint8_t *pg = vg; 4278 4279 for (i = 0; i < opr_sz; i++) { 4280 if (pg[H1(i)] & 1) { 4281 nn = float64_add(nn, 
m[i], status); 4282 } 4283 } 4284 4285 return nn; 4286 } 4287 4288 /* Fully general three-operand expander, controlled by a predicate, 4289 * With the extra float_status parameter. 4290 */ 4291 #define DO_ZPZZ_FP(NAME, TYPE, H, OP) \ 4292 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \ 4293 void *status, uint32_t desc) \ 4294 { \ 4295 intptr_t i = simd_oprsz(desc); \ 4296 uint64_t *g = vg; \ 4297 do { \ 4298 uint64_t pg = g[(i - 1) >> 6]; \ 4299 do { \ 4300 i -= sizeof(TYPE); \ 4301 if (likely((pg >> (i & 63)) & 1)) { \ 4302 TYPE nn = *(TYPE *)(vn + H(i)); \ 4303 TYPE mm = *(TYPE *)(vm + H(i)); \ 4304 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \ 4305 } \ 4306 } while (i & 63); \ 4307 } while (i != 0); \ 4308 } 4309 4310 DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add) 4311 DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add) 4312 DO_ZPZZ_FP(sve_fadd_d, uint64_t, H1_8, float64_add) 4313 4314 DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub) 4315 DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub) 4316 DO_ZPZZ_FP(sve_fsub_d, uint64_t, H1_8, float64_sub) 4317 4318 DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul) 4319 DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul) 4320 DO_ZPZZ_FP(sve_fmul_d, uint64_t, H1_8, float64_mul) 4321 4322 DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div) 4323 DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div) 4324 DO_ZPZZ_FP(sve_fdiv_d, uint64_t, H1_8, float64_div) 4325 4326 DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min) 4327 DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min) 4328 DO_ZPZZ_FP(sve_fmin_d, uint64_t, H1_8, float64_min) 4329 4330 DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max) 4331 DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max) 4332 DO_ZPZZ_FP(sve_fmax_d, uint64_t, H1_8, float64_max) 4333 4334 DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum) 4335 DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum) 4336 DO_ZPZZ_FP(sve_fminnum_d, uint64_t, H1_8, float64_minnum) 4337 4338 DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum) 4339 DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum) 4340 DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, H1_8, float64_maxnum) 4341 4342 static inline float16 abd_h(float16 a, float16 b, float_status *s) 4343 { 4344 return float16_abs(float16_sub(a, b, s)); 4345 } 4346 4347 static inline float32 abd_s(float32 a, float32 b, float_status *s) 4348 { 4349 return float32_abs(float32_sub(a, b, s)); 4350 } 4351 4352 static inline float64 abd_d(float64 a, float64 b, float_status *s) 4353 { 4354 return float64_abs(float64_sub(a, b, s)); 4355 } 4356 4357 DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h) 4358 DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s) 4359 DO_ZPZZ_FP(sve_fabd_d, uint64_t, H1_8, abd_d) 4360 4361 static inline float64 scalbn_d(float64 a, int64_t b, float_status *s) 4362 { 4363 int b_int = MIN(MAX(b, INT_MIN), INT_MAX); 4364 return float64_scalbn(a, b_int, s); 4365 } 4366 4367 DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn) 4368 DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn) 4369 DO_ZPZZ_FP(sve_fscalbn_d, int64_t, H1_8, scalbn_d) 4370 4371 DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh) 4372 DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs) 4373 DO_ZPZZ_FP(sve_fmulx_d, uint64_t, H1_8, helper_vfp_mulxd) 4374 4375 #undef DO_ZPZZ_FP 4376 4377 /* Three-operand expander, with one scalar operand, controlled by 4378 * a predicate, with the extra float_status parameter. 
4379 */ 4380 #define DO_ZPZS_FP(NAME, TYPE, H, OP) \ 4381 void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar, \ 4382 void *status, uint32_t desc) \ 4383 { \ 4384 intptr_t i = simd_oprsz(desc); \ 4385 uint64_t *g = vg; \ 4386 TYPE mm = scalar; \ 4387 do { \ 4388 uint64_t pg = g[(i - 1) >> 6]; \ 4389 do { \ 4390 i -= sizeof(TYPE); \ 4391 if (likely((pg >> (i & 63)) & 1)) { \ 4392 TYPE nn = *(TYPE *)(vn + H(i)); \ 4393 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \ 4394 } \ 4395 } while (i & 63); \ 4396 } while (i != 0); \ 4397 } 4398 4399 DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add) 4400 DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add) 4401 DO_ZPZS_FP(sve_fadds_d, float64, H1_8, float64_add) 4402 4403 DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub) 4404 DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub) 4405 DO_ZPZS_FP(sve_fsubs_d, float64, H1_8, float64_sub) 4406 4407 DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul) 4408 DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul) 4409 DO_ZPZS_FP(sve_fmuls_d, float64, H1_8, float64_mul) 4410 4411 static inline float16 subr_h(float16 a, float16 b, float_status *s) 4412 { 4413 return float16_sub(b, a, s); 4414 } 4415 4416 static inline float32 subr_s(float32 a, float32 b, float_status *s) 4417 { 4418 return float32_sub(b, a, s); 4419 } 4420 4421 static inline float64 subr_d(float64 a, float64 b, float_status *s) 4422 { 4423 return float64_sub(b, a, s); 4424 } 4425 4426 DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h) 4427 DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s) 4428 DO_ZPZS_FP(sve_fsubrs_d, float64, H1_8, subr_d) 4429 4430 DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum) 4431 DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum) 4432 DO_ZPZS_FP(sve_fmaxnms_d, float64, H1_8, float64_maxnum) 4433 4434 DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum) 4435 DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum) 4436 DO_ZPZS_FP(sve_fminnms_d, float64, H1_8, float64_minnum) 4437 4438 DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max) 4439 DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max) 4440 DO_ZPZS_FP(sve_fmaxs_d, float64, H1_8, float64_max) 4441 4442 DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min) 4443 DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min) 4444 DO_ZPZS_FP(sve_fmins_d, float64, H1_8, float64_min) 4445 4446 /* Fully general two-operand expander, controlled by a predicate, 4447 * With the extra float_status parameter. 4448 */ 4449 #define DO_ZPZ_FP(NAME, TYPE, H, OP) \ 4450 void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \ 4451 { \ 4452 intptr_t i = simd_oprsz(desc); \ 4453 uint64_t *g = vg; \ 4454 do { \ 4455 uint64_t pg = g[(i - 1) >> 6]; \ 4456 do { \ 4457 i -= sizeof(TYPE); \ 4458 if (likely((pg >> (i & 63)) & 1)) { \ 4459 TYPE nn = *(TYPE *)(vn + H(i)); \ 4460 *(TYPE *)(vd + H(i)) = OP(nn, status); \ 4461 } \ 4462 } while (i & 63); \ 4463 } while (i != 0); \ 4464 } 4465 4466 /* SVE fp16 conversions always use IEEE mode. Like AdvSIMD, they ignore 4467 * FZ16. When converting from fp16, this affects flushing input denormals; 4468 * when converting to fp16, this affects flushing output denormals. 
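 * Hence each helper below saves the relevant flush flag, clears it
 * around the softfloat conversion, and restores the saved value.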
4469 */ 4470 static inline float32 sve_f16_to_f32(float16 f, float_status *fpst) 4471 { 4472 bool save = get_flush_inputs_to_zero(fpst); 4473 float32 ret; 4474 4475 set_flush_inputs_to_zero(false, fpst); 4476 ret = float16_to_float32(f, true, fpst); 4477 set_flush_inputs_to_zero(save, fpst); 4478 return ret; 4479 } 4480 4481 static inline float64 sve_f16_to_f64(float16 f, float_status *fpst) 4482 { 4483 bool save = get_flush_inputs_to_zero(fpst); 4484 float64 ret; 4485 4486 set_flush_inputs_to_zero(false, fpst); 4487 ret = float16_to_float64(f, true, fpst); 4488 set_flush_inputs_to_zero(save, fpst); 4489 return ret; 4490 } 4491 4492 static inline float16 sve_f32_to_f16(float32 f, float_status *fpst) 4493 { 4494 bool save = get_flush_to_zero(fpst); 4495 float16 ret; 4496 4497 set_flush_to_zero(false, fpst); 4498 ret = float32_to_float16(f, true, fpst); 4499 set_flush_to_zero(save, fpst); 4500 return ret; 4501 } 4502 4503 static inline float16 sve_f64_to_f16(float64 f, float_status *fpst) 4504 { 4505 bool save = get_flush_to_zero(fpst); 4506 float16 ret; 4507 4508 set_flush_to_zero(false, fpst); 4509 ret = float64_to_float16(f, true, fpst); 4510 set_flush_to_zero(save, fpst); 4511 return ret; 4512 } 4513 4514 static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s) 4515 { 4516 if (float16_is_any_nan(f)) { 4517 float_raise(float_flag_invalid, s); 4518 return 0; 4519 } 4520 return float16_to_int16_round_to_zero(f, s); 4521 } 4522 4523 static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s) 4524 { 4525 if (float16_is_any_nan(f)) { 4526 float_raise(float_flag_invalid, s); 4527 return 0; 4528 } 4529 return float16_to_int64_round_to_zero(f, s); 4530 } 4531 4532 static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s) 4533 { 4534 if (float32_is_any_nan(f)) { 4535 float_raise(float_flag_invalid, s); 4536 return 0; 4537 } 4538 return float32_to_int64_round_to_zero(f, s); 4539 } 4540 4541 static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s) 4542 { 4543 if (float64_is_any_nan(f)) { 4544 float_raise(float_flag_invalid, s); 4545 return 0; 4546 } 4547 return float64_to_int64_round_to_zero(f, s); 4548 } 4549 4550 static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s) 4551 { 4552 if (float16_is_any_nan(f)) { 4553 float_raise(float_flag_invalid, s); 4554 return 0; 4555 } 4556 return float16_to_uint16_round_to_zero(f, s); 4557 } 4558 4559 static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s) 4560 { 4561 if (float16_is_any_nan(f)) { 4562 float_raise(float_flag_invalid, s); 4563 return 0; 4564 } 4565 return float16_to_uint64_round_to_zero(f, s); 4566 } 4567 4568 static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s) 4569 { 4570 if (float32_is_any_nan(f)) { 4571 float_raise(float_flag_invalid, s); 4572 return 0; 4573 } 4574 return float32_to_uint64_round_to_zero(f, s); 4575 } 4576 4577 static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s) 4578 { 4579 if (float64_is_any_nan(f)) { 4580 float_raise(float_flag_invalid, s); 4581 return 0; 4582 } 4583 return float64_to_uint64_round_to_zero(f, s); 4584 } 4585 4586 DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16) 4587 DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32) 4588 DO_ZPZ_FP(sve_bfcvt, uint32_t, H1_4, float32_to_bfloat16) 4589 DO_ZPZ_FP(sve_fcvt_dh, uint64_t, H1_8, sve_f64_to_f16) 4590 DO_ZPZ_FP(sve_fcvt_hd, uint64_t, H1_8, sve_f16_to_f64) 4591 DO_ZPZ_FP(sve_fcvt_ds, uint64_t, H1_8, 
float64_to_float32) 4592 DO_ZPZ_FP(sve_fcvt_sd, uint64_t, H1_8, float32_to_float64) 4593 4594 DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz) 4595 DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh) 4596 DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs) 4597 DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, H1_8, vfp_float16_to_int64_rtz) 4598 DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, H1_8, vfp_float32_to_int64_rtz) 4599 DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, H1_8, helper_vfp_tosizd) 4600 DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, H1_8, vfp_float64_to_int64_rtz) 4601 4602 DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz) 4603 DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh) 4604 DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs) 4605 DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, H1_8, vfp_float16_to_uint64_rtz) 4606 DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, H1_8, vfp_float32_to_uint64_rtz) 4607 DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, H1_8, helper_vfp_touizd) 4608 DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, H1_8, vfp_float64_to_uint64_rtz) 4609 4610 DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth) 4611 DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints) 4612 DO_ZPZ_FP(sve_frint_d, uint64_t, H1_8, helper_rintd) 4613 4614 DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int) 4615 DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int) 4616 DO_ZPZ_FP(sve_frintx_d, uint64_t, H1_8, float64_round_to_int) 4617 4618 DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16) 4619 DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32) 4620 DO_ZPZ_FP(sve_frecpx_d, uint64_t, H1_8, helper_frecpx_f64) 4621 4622 DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt) 4623 DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt) 4624 DO_ZPZ_FP(sve_fsqrt_d, uint64_t, H1_8, float64_sqrt) 4625 4626 DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16) 4627 DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16) 4628 DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32) 4629 DO_ZPZ_FP(sve_scvt_sd, uint64_t, H1_8, int32_to_float64) 4630 DO_ZPZ_FP(sve_scvt_dh, uint64_t, H1_8, int64_to_float16) 4631 DO_ZPZ_FP(sve_scvt_ds, uint64_t, H1_8, int64_to_float32) 4632 DO_ZPZ_FP(sve_scvt_dd, uint64_t, H1_8, int64_to_float64) 4633 4634 DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16) 4635 DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16) 4636 DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32) 4637 DO_ZPZ_FP(sve_ucvt_sd, uint64_t, H1_8, uint32_to_float64) 4638 DO_ZPZ_FP(sve_ucvt_dh, uint64_t, H1_8, uint64_to_float16) 4639 DO_ZPZ_FP(sve_ucvt_ds, uint64_t, H1_8, uint64_to_float32) 4640 DO_ZPZ_FP(sve_ucvt_dd, uint64_t, H1_8, uint64_to_float64) 4641 4642 static int16_t do_float16_logb_as_int(float16 a, float_status *s) 4643 { 4644 /* Extract frac to the top of the uint32_t. 
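     * A float16 is 1 sign + 5 exponent + 10 fraction bits; shifting left
     * by 16 + 6 pushes the sign and exponent out of the uint32_t and
     * leaves the fraction left-aligned, so clz32(frac) below counts the
     * leading zero fraction bits of a denormal directly.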
*/ 4645 uint32_t frac = (uint32_t)a << (16 + 6); 4646 int16_t exp = extract32(a, 10, 5); 4647 4648 if (unlikely(exp == 0)) { 4649 if (frac != 0) { 4650 if (!get_flush_inputs_to_zero(s)) { 4651 /* denormal: bias - fractional_zeros */ 4652 return -15 - clz32(frac); 4653 } 4654 /* flush to zero */ 4655 float_raise(float_flag_input_denormal, s); 4656 } 4657 } else if (unlikely(exp == 0x1f)) { 4658 if (frac == 0) { 4659 return INT16_MAX; /* infinity */ 4660 } 4661 } else { 4662 /* normal: exp - bias */ 4663 return exp - 15; 4664 } 4665 /* nan or zero */ 4666 float_raise(float_flag_invalid, s); 4667 return INT16_MIN; 4668 } 4669 4670 static int32_t do_float32_logb_as_int(float32 a, float_status *s) 4671 { 4672 /* Extract frac to the top of the uint32_t. */ 4673 uint32_t frac = a << 9; 4674 int32_t exp = extract32(a, 23, 8); 4675 4676 if (unlikely(exp == 0)) { 4677 if (frac != 0) { 4678 if (!get_flush_inputs_to_zero(s)) { 4679 /* denormal: bias - fractional_zeros */ 4680 return -127 - clz32(frac); 4681 } 4682 /* flush to zero */ 4683 float_raise(float_flag_input_denormal, s); 4684 } 4685 } else if (unlikely(exp == 0xff)) { 4686 if (frac == 0) { 4687 return INT32_MAX; /* infinity */ 4688 } 4689 } else { 4690 /* normal: exp - bias */ 4691 return exp - 127; 4692 } 4693 /* nan or zero */ 4694 float_raise(float_flag_invalid, s); 4695 return INT32_MIN; 4696 } 4697 4698 static int64_t do_float64_logb_as_int(float64 a, float_status *s) 4699 { 4700 /* Extract frac to the top of the uint64_t. */ 4701 uint64_t frac = a << 12; 4702 int64_t exp = extract64(a, 52, 11); 4703 4704 if (unlikely(exp == 0)) { 4705 if (frac != 0) { 4706 if (!get_flush_inputs_to_zero(s)) { 4707 /* denormal: bias - fractional_zeros */ 4708 return -1023 - clz64(frac); 4709 } 4710 /* flush to zero */ 4711 float_raise(float_flag_input_denormal, s); 4712 } 4713 } else if (unlikely(exp == 0x7ff)) { 4714 if (frac == 0) { 4715 return INT64_MAX; /* infinity */ 4716 } 4717 } else { 4718 /* normal: exp - bias */ 4719 return exp - 1023; 4720 } 4721 /* nan or zero */ 4722 float_raise(float_flag_invalid, s); 4723 return INT64_MIN; 4724 } 4725 4726 DO_ZPZ_FP(flogb_h, float16, H1_2, do_float16_logb_as_int) 4727 DO_ZPZ_FP(flogb_s, float32, H1_4, do_float32_logb_as_int) 4728 DO_ZPZ_FP(flogb_d, float64, H1_8, do_float64_logb_as_int) 4729 4730 #undef DO_ZPZ_FP 4731 4732 static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg, 4733 float_status *status, uint32_t desc, 4734 uint16_t neg1, uint16_t neg3) 4735 { 4736 intptr_t i = simd_oprsz(desc); 4737 uint64_t *g = vg; 4738 4739 do { 4740 uint64_t pg = g[(i - 1) >> 6]; 4741 do { 4742 i -= 2; 4743 if (likely((pg >> (i & 63)) & 1)) { 4744 float16 e1, e2, e3, r; 4745 4746 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1; 4747 e2 = *(uint16_t *)(vm + H1_2(i)); 4748 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3; 4749 r = float16_muladd(e1, e2, e3, 0, status); 4750 *(uint16_t *)(vd + H1_2(i)) = r; 4751 } 4752 } while (i & 63); 4753 } while (i != 0); 4754 } 4755 4756 void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 4757 void *vg, void *status, uint32_t desc) 4758 { 4759 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0); 4760 } 4761 4762 void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 4763 void *vg, void *status, uint32_t desc) 4764 { 4765 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0); 4766 } 4767 4768 void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 4769 void *vg, void *status, uint32_t desc) 4770 { 4771 
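    /*
     * Flipping the sign bit of both Zn and Za before the fused
     * multiply-add yields -(Zn * Zm) - Za without a separate negation
     * of the result.
     */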
do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000); 4772 } 4773 4774 void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 4775 void *vg, void *status, uint32_t desc) 4776 { 4777 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000); 4778 } 4779 4780 static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg, 4781 float_status *status, uint32_t desc, 4782 uint32_t neg1, uint32_t neg3) 4783 { 4784 intptr_t i = simd_oprsz(desc); 4785 uint64_t *g = vg; 4786 4787 do { 4788 uint64_t pg = g[(i - 1) >> 6]; 4789 do { 4790 i -= 4; 4791 if (likely((pg >> (i & 63)) & 1)) { 4792 float32 e1, e2, e3, r; 4793 4794 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1; 4795 e2 = *(uint32_t *)(vm + H1_4(i)); 4796 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3; 4797 r = float32_muladd(e1, e2, e3, 0, status); 4798 *(uint32_t *)(vd + H1_4(i)) = r; 4799 } 4800 } while (i & 63); 4801 } while (i != 0); 4802 } 4803 4804 void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 4805 void *vg, void *status, uint32_t desc) 4806 { 4807 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0); 4808 } 4809 4810 void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 4811 void *vg, void *status, uint32_t desc) 4812 { 4813 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0); 4814 } 4815 4816 void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 4817 void *vg, void *status, uint32_t desc) 4818 { 4819 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000); 4820 } 4821 4822 void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 4823 void *vg, void *status, uint32_t desc) 4824 { 4825 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000); 4826 } 4827 4828 static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg, 4829 float_status *status, uint32_t desc, 4830 uint64_t neg1, uint64_t neg3) 4831 { 4832 intptr_t i = simd_oprsz(desc); 4833 uint64_t *g = vg; 4834 4835 do { 4836 uint64_t pg = g[(i - 1) >> 6]; 4837 do { 4838 i -= 8; 4839 if (likely((pg >> (i & 63)) & 1)) { 4840 float64 e1, e2, e3, r; 4841 4842 e1 = *(uint64_t *)(vn + i) ^ neg1; 4843 e2 = *(uint64_t *)(vm + i); 4844 e3 = *(uint64_t *)(va + i) ^ neg3; 4845 r = float64_muladd(e1, e2, e3, 0, status); 4846 *(uint64_t *)(vd + i) = r; 4847 } 4848 } while (i & 63); 4849 } while (i != 0); 4850 } 4851 4852 void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 4853 void *vg, void *status, uint32_t desc) 4854 { 4855 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0); 4856 } 4857 4858 void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 4859 void *vg, void *status, uint32_t desc) 4860 { 4861 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0); 4862 } 4863 4864 void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 4865 void *vg, void *status, uint32_t desc) 4866 { 4867 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN); 4868 } 4869 4870 void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 4871 void *vg, void *status, uint32_t desc) 4872 { 4873 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN); 4874 } 4875 4876 /* Two operand floating-point comparison controlled by a predicate. 4877 * Unlike the integer version, we are not allowed to optimistically 4878 * compare operands, since the comparison may have side effects wrt 4879 * the FPSR. 
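 * In particular, a NaN in an inactive element must not raise Invalid,
 * so the predicate is tested before each element is compared.  Note
 * that FCMEQ, FCMNE and FCMUO use the quiet compare while the ordering
 * comparisons use the signalling compare, so that quiet NaN inputs
 * raise Invalid only for the latter group.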
4880 */ 4881 #define DO_FPCMP_PPZZ(NAME, TYPE, H, OP) \ 4882 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \ 4883 void *status, uint32_t desc) \ 4884 { \ 4885 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \ 4886 uint64_t *d = vd, *g = vg; \ 4887 do { \ 4888 uint64_t out = 0, pg = g[j]; \ 4889 do { \ 4890 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 4891 if (likely((pg >> (i & 63)) & 1)) { \ 4892 TYPE nn = *(TYPE *)(vn + H(i)); \ 4893 TYPE mm = *(TYPE *)(vm + H(i)); \ 4894 out |= OP(TYPE, nn, mm, status); \ 4895 } \ 4896 } while (i & 63); \ 4897 d[j--] = out; \ 4898 } while (i > 0); \ 4899 } 4900 4901 #define DO_FPCMP_PPZZ_H(NAME, OP) \ 4902 DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP) 4903 #define DO_FPCMP_PPZZ_S(NAME, OP) \ 4904 DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP) 4905 #define DO_FPCMP_PPZZ_D(NAME, OP) \ 4906 DO_FPCMP_PPZZ(NAME##_d, float64, H1_8, OP) 4907 4908 #define DO_FPCMP_PPZZ_ALL(NAME, OP) \ 4909 DO_FPCMP_PPZZ_H(NAME, OP) \ 4910 DO_FPCMP_PPZZ_S(NAME, OP) \ 4911 DO_FPCMP_PPZZ_D(NAME, OP) 4912 4913 #define DO_FCMGE(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) <= 0 4914 #define DO_FCMGT(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) < 0 4915 #define DO_FCMLE(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) <= 0 4916 #define DO_FCMLT(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) < 0 4917 #define DO_FCMEQ(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) == 0 4918 #define DO_FCMNE(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) != 0 4919 #define DO_FCMUO(TYPE, X, Y, ST) \ 4920 TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered 4921 #define DO_FACGE(TYPE, X, Y, ST) \ 4922 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0 4923 #define DO_FACGT(TYPE, X, Y, ST) \ 4924 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0 4925 4926 DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE) 4927 DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT) 4928 DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ) 4929 DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE) 4930 DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO) 4931 DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE) 4932 DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT) 4933 4934 #undef DO_FPCMP_PPZZ_ALL 4935 #undef DO_FPCMP_PPZZ_D 4936 #undef DO_FPCMP_PPZZ_S 4937 #undef DO_FPCMP_PPZZ_H 4938 #undef DO_FPCMP_PPZZ 4939 4940 /* One operand floating-point comparison against zero, controlled 4941 * by a predicate. 
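 * Passing the integer constant 0 as the second operand relies on +0.0
 * being the all-zeroes encoding for each of float16, float32, float64.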
4942 */ 4943 #define DO_FPCMP_PPZ0(NAME, TYPE, H, OP) \ 4944 void HELPER(NAME)(void *vd, void *vn, void *vg, \ 4945 void *status, uint32_t desc) \ 4946 { \ 4947 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \ 4948 uint64_t *d = vd, *g = vg; \ 4949 do { \ 4950 uint64_t out = 0, pg = g[j]; \ 4951 do { \ 4952 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 4953 if ((pg >> (i & 63)) & 1) { \ 4954 TYPE nn = *(TYPE *)(vn + H(i)); \ 4955 out |= OP(TYPE, nn, 0, status); \ 4956 } \ 4957 } while (i & 63); \ 4958 d[j--] = out; \ 4959 } while (i > 0); \ 4960 } 4961 4962 #define DO_FPCMP_PPZ0_H(NAME, OP) \ 4963 DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP) 4964 #define DO_FPCMP_PPZ0_S(NAME, OP) \ 4965 DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP) 4966 #define DO_FPCMP_PPZ0_D(NAME, OP) \ 4967 DO_FPCMP_PPZ0(NAME##_d, float64, H1_8, OP) 4968 4969 #define DO_FPCMP_PPZ0_ALL(NAME, OP) \ 4970 DO_FPCMP_PPZ0_H(NAME, OP) \ 4971 DO_FPCMP_PPZ0_S(NAME, OP) \ 4972 DO_FPCMP_PPZ0_D(NAME, OP) 4973 4974 DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE) 4975 DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT) 4976 DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE) 4977 DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT) 4978 DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ) 4979 DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE) 4980 4981 /* FP Trig Multiply-Add. */ 4982 4983 void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc) 4984 { 4985 static const float16 coeff[16] = { 4986 0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 4987 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 4988 }; 4989 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16); 4990 intptr_t x = simd_data(desc); 4991 float16 *d = vd, *n = vn, *m = vm; 4992 for (i = 0; i < opr_sz; i++) { 4993 float16 mm = m[i]; 4994 intptr_t xx = x; 4995 if (float16_is_neg(mm)) { 4996 mm = float16_abs(mm); 4997 xx += 8; 4998 } 4999 d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs); 5000 } 5001 } 5002 5003 void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc) 5004 { 5005 static const float32 coeff[16] = { 5006 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9, 5007 0x36369d6d, 0x00000000, 0x00000000, 0x00000000, 5008 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705, 5009 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000, 5010 }; 5011 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32); 5012 intptr_t x = simd_data(desc); 5013 float32 *d = vd, *n = vn, *m = vm; 5014 for (i = 0; i < opr_sz; i++) { 5015 float32 mm = m[i]; 5016 intptr_t xx = x; 5017 if (float32_is_neg(mm)) { 5018 mm = float32_abs(mm); 5019 xx += 8; 5020 } 5021 d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs); 5022 } 5023 } 5024 5025 void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc) 5026 { 5027 static const float64 coeff[16] = { 5028 0x3ff0000000000000ull, 0xbfc5555555555543ull, 5029 0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull, 5030 0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull, 5031 0x3de5d8408868552full, 0x0000000000000000ull, 5032 0x3ff0000000000000ull, 0xbfe0000000000000ull, 5033 0x3fa5555555555536ull, 0xbf56c16c16c13a0bull, 5034 0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull, 5035 0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull, 5036 }; 5037 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64); 5038 intptr_t x = simd_data(desc); 5039 float64 *d = vd, *n = vn, *m = vm; 5040 for (i = 0; i < opr_sz; i++) { 5041 float64 mm = m[i]; 5042 intptr_t xx = x; 5043 if (float64_is_neg(mm)) { 5044 mm = float64_abs(mm); 5045 xx += 8; 5046 } 5047 d[i] = float64_muladd(n[i], mm, 
coeff[xx], 0, vs); 5048 } 5049 } 5050 5051 /* 5052 * FP Complex Add 5053 */ 5054 5055 void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg, 5056 void *vs, uint32_t desc) 5057 { 5058 intptr_t j, i = simd_oprsz(desc); 5059 uint64_t *g = vg; 5060 float16 neg_imag = float16_set_sign(0, simd_data(desc)); 5061 float16 neg_real = float16_chs(neg_imag); 5062 5063 do { 5064 uint64_t pg = g[(i - 1) >> 6]; 5065 do { 5066 float16 e0, e1, e2, e3; 5067 5068 /* I holds the real index; J holds the imag index. */ 5069 j = i - sizeof(float16); 5070 i -= 2 * sizeof(float16); 5071 5072 e0 = *(float16 *)(vn + H1_2(i)); 5073 e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real; 5074 e2 = *(float16 *)(vn + H1_2(j)); 5075 e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag; 5076 5077 if (likely((pg >> (i & 63)) & 1)) { 5078 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs); 5079 } 5080 if (likely((pg >> (j & 63)) & 1)) { 5081 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs); 5082 } 5083 } while (i & 63); 5084 } while (i != 0); 5085 } 5086 5087 void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg, 5088 void *vs, uint32_t desc) 5089 { 5090 intptr_t j, i = simd_oprsz(desc); 5091 uint64_t *g = vg; 5092 float32 neg_imag = float32_set_sign(0, simd_data(desc)); 5093 float32 neg_real = float32_chs(neg_imag); 5094 5095 do { 5096 uint64_t pg = g[(i - 1) >> 6]; 5097 do { 5098 float32 e0, e1, e2, e3; 5099 5100 /* I holds the real index; J holds the imag index. */ 5101 j = i - sizeof(float32); 5102 i -= 2 * sizeof(float32); 5103 5104 e0 = *(float32 *)(vn + H1_2(i)); 5105 e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real; 5106 e2 = *(float32 *)(vn + H1_2(j)); 5107 e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag; 5108 5109 if (likely((pg >> (i & 63)) & 1)) { 5110 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, vs); 5111 } 5112 if (likely((pg >> (j & 63)) & 1)) { 5113 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, vs); 5114 } 5115 } while (i & 63); 5116 } while (i != 0); 5117 } 5118 5119 void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg, 5120 void *vs, uint32_t desc) 5121 { 5122 intptr_t j, i = simd_oprsz(desc); 5123 uint64_t *g = vg; 5124 float64 neg_imag = float64_set_sign(0, simd_data(desc)); 5125 float64 neg_real = float64_chs(neg_imag); 5126 5127 do { 5128 uint64_t pg = g[(i - 1) >> 6]; 5129 do { 5130 float64 e0, e1, e2, e3; 5131 5132 /* I holds the real index; J holds the imag index. */ 5133 j = i - sizeof(float64); 5134 i -= 2 * sizeof(float64); 5135 5136 e0 = *(float64 *)(vn + H1_2(i)); 5137 e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real; 5138 e2 = *(float64 *)(vn + H1_2(j)); 5139 e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag; 5140 5141 if (likely((pg >> (i & 63)) & 1)) { 5142 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, vs); 5143 } 5144 if (likely((pg >> (j & 63)) & 1)) { 5145 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, vs); 5146 } 5147 } while (i & 63); 5148 } while (i != 0); 5149 } 5150 5151 /* 5152 * FP Complex Multiply 5153 */ 5154 5155 void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 5156 void *vg, void *status, uint32_t desc) 5157 { 5158 intptr_t j, i = simd_oprsz(desc); 5159 unsigned rot = simd_data(desc); 5160 bool flip = rot & 1; 5161 float16 neg_imag, neg_real; 5162 uint64_t *g = vg; 5163 5164 neg_imag = float16_set_sign(0, (rot & 2) != 0); 5165 neg_real = float16_set_sign(0, rot == 1 || rot == 2); 5166 5167 do { 5168 uint64_t pg = g[(i - 1) >> 6]; 5169 do { 5170 float16 e1, e2, e3, e4, nr, ni, mr, mi, d; 5171 5172 /* I holds the real index; J holds the imag index. 
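             * Ignoring the sign-of-NaN details of the ^ negations, the
             * four rotations compute:
             *   rot 0:  d_r += n_r * m_r;   d_i += n_r * m_i
             *   rot 1:  d_r -= n_i * m_i;   d_i += n_i * m_r
             *   rot 2:  d_r -= n_r * m_r;   d_i -= n_r * m_i
             *   rot 3:  d_r += n_i * m_i;   d_i -= n_i * m_r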
*/ 5173 j = i - sizeof(float16); 5174 i -= 2 * sizeof(float16); 5175 5176 nr = *(float16 *)(vn + H1_2(i)); 5177 ni = *(float16 *)(vn + H1_2(j)); 5178 mr = *(float16 *)(vm + H1_2(i)); 5179 mi = *(float16 *)(vm + H1_2(j)); 5180 5181 e2 = (flip ? ni : nr); 5182 e1 = (flip ? mi : mr) ^ neg_real; 5183 e4 = e2; 5184 e3 = (flip ? mr : mi) ^ neg_imag; 5185 5186 if (likely((pg >> (i & 63)) & 1)) { 5187 d = *(float16 *)(va + H1_2(i)); 5188 d = float16_muladd(e2, e1, d, 0, status); 5189 *(float16 *)(vd + H1_2(i)) = d; 5190 } 5191 if (likely((pg >> (j & 63)) & 1)) { 5192 d = *(float16 *)(va + H1_2(j)); 5193 d = float16_muladd(e4, e3, d, 0, status); 5194 *(float16 *)(vd + H1_2(j)) = d; 5195 } 5196 } while (i & 63); 5197 } while (i != 0); 5198 } 5199 5200 void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 5201 void *vg, void *status, uint32_t desc) 5202 { 5203 intptr_t j, i = simd_oprsz(desc); 5204 unsigned rot = simd_data(desc); 5205 bool flip = rot & 1; 5206 float32 neg_imag, neg_real; 5207 uint64_t *g = vg; 5208 5209 neg_imag = float32_set_sign(0, (rot & 2) != 0); 5210 neg_real = float32_set_sign(0, rot == 1 || rot == 2); 5211 5212 do { 5213 uint64_t pg = g[(i - 1) >> 6]; 5214 do { 5215 float32 e1, e2, e3, e4, nr, ni, mr, mi, d; 5216 5217 /* I holds the real index; J holds the imag index. */ 5218 j = i - sizeof(float32); 5219 i -= 2 * sizeof(float32); 5220 5221 nr = *(float32 *)(vn + H1_2(i)); 5222 ni = *(float32 *)(vn + H1_2(j)); 5223 mr = *(float32 *)(vm + H1_2(i)); 5224 mi = *(float32 *)(vm + H1_2(j)); 5225 5226 e2 = (flip ? ni : nr); 5227 e1 = (flip ? mi : mr) ^ neg_real; 5228 e4 = e2; 5229 e3 = (flip ? mr : mi) ^ neg_imag; 5230 5231 if (likely((pg >> (i & 63)) & 1)) { 5232 d = *(float32 *)(va + H1_2(i)); 5233 d = float32_muladd(e2, e1, d, 0, status); 5234 *(float32 *)(vd + H1_2(i)) = d; 5235 } 5236 if (likely((pg >> (j & 63)) & 1)) { 5237 d = *(float32 *)(va + H1_2(j)); 5238 d = float32_muladd(e4, e3, d, 0, status); 5239 *(float32 *)(vd + H1_2(j)) = d; 5240 } 5241 } while (i & 63); 5242 } while (i != 0); 5243 } 5244 5245 void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 5246 void *vg, void *status, uint32_t desc) 5247 { 5248 intptr_t j, i = simd_oprsz(desc); 5249 unsigned rot = simd_data(desc); 5250 bool flip = rot & 1; 5251 float64 neg_imag, neg_real; 5252 uint64_t *g = vg; 5253 5254 neg_imag = float64_set_sign(0, (rot & 2) != 0); 5255 neg_real = float64_set_sign(0, rot == 1 || rot == 2); 5256 5257 do { 5258 uint64_t pg = g[(i - 1) >> 6]; 5259 do { 5260 float64 e1, e2, e3, e4, nr, ni, mr, mi, d; 5261 5262 /* I holds the real index; J holds the imag index. */ 5263 j = i - sizeof(float64); 5264 i -= 2 * sizeof(float64); 5265 5266 nr = *(float64 *)(vn + H1_2(i)); 5267 ni = *(float64 *)(vn + H1_2(j)); 5268 mr = *(float64 *)(vm + H1_2(i)); 5269 mi = *(float64 *)(vm + H1_2(j)); 5270 5271 e2 = (flip ? ni : nr); 5272 e1 = (flip ? mi : mr) ^ neg_real; 5273 e4 = e2; 5274 e3 = (flip ? mr : mi) ^ neg_imag; 5275 5276 if (likely((pg >> (i & 63)) & 1)) { 5277 d = *(float64 *)(va + H1_2(i)); 5278 d = float64_muladd(e2, e1, d, 0, status); 5279 *(float64 *)(vd + H1_2(i)) = d; 5280 } 5281 if (likely((pg >> (j & 63)) & 1)) { 5282 d = *(float64 *)(va + H1_2(j)); 5283 d = float64_muladd(e4, e3, d, 0, status); 5284 *(float64 *)(vd + H1_2(j)) = d; 5285 } 5286 } while (i & 63); 5287 } while (i != 0); 5288 } 5289 5290 /* 5291 * Load contiguous data, protected by a governing predicate. 
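 *
 * The helpers are built from a few pieces: find_next_active() skips
 * runs of inactive elements, sve_probe_page() resolves one page of the
 * access to a host pointer and its TLB flags, and
 * sve_cont_ldst_elements() / sve_cont_ldst_pages() work out how an
 * access that may straddle a page boundary splits into per-page work.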
5292 */ 5293 5294 /* 5295 * Skip through a sequence of inactive elements in the guarding predicate @vg, 5296 * beginning at @reg_off bounded by @reg_max. Return the offset of the active 5297 * element >= @reg_off, or @reg_max if there were no active elements at all. 5298 */ 5299 static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off, 5300 intptr_t reg_max, int esz) 5301 { 5302 uint64_t pg_mask = pred_esz_masks[esz]; 5303 uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63); 5304 5305 /* In normal usage, the first element is active. */ 5306 if (likely(pg & 1)) { 5307 return reg_off; 5308 } 5309 5310 if (pg == 0) { 5311 reg_off &= -64; 5312 do { 5313 reg_off += 64; 5314 if (unlikely(reg_off >= reg_max)) { 5315 /* The entire predicate was false. */ 5316 return reg_max; 5317 } 5318 pg = vg[reg_off >> 6] & pg_mask; 5319 } while (pg == 0); 5320 } 5321 reg_off += ctz64(pg); 5322 5323 /* We should never see an out of range predicate bit set. */ 5324 tcg_debug_assert(reg_off < reg_max); 5325 return reg_off; 5326 } 5327 5328 /* 5329 * Resolve the guest virtual address to info->host and info->flags. 5330 * If @nofault, return false if the page is invalid, otherwise 5331 * exit via page fault exception. 5332 */ 5333 5334 bool sve_probe_page(SVEHostPage *info, bool nofault, CPUARMState *env, 5335 target_ulong addr, int mem_off, MMUAccessType access_type, 5336 int mmu_idx, uintptr_t retaddr) 5337 { 5338 int flags; 5339 5340 addr += mem_off; 5341 5342 /* 5343 * User-only currently always issues with TBI. See the comment 5344 * above useronly_clean_ptr. Usually we clean this top byte away 5345 * during translation, but we can't do that for e.g. vector + imm 5346 * addressing modes. 5347 * 5348 * We currently always enable TBI for user-only, and do not provide 5349 * a way to turn it off. So clean the pointer unconditionally here, 5350 * rather than look it up here, or pass it down from above. 5351 */ 5352 addr = useronly_clean_ptr(addr); 5353 5354 #ifdef CONFIG_USER_ONLY 5355 flags = probe_access_flags(env, addr, access_type, mmu_idx, nofault, 5356 &info->host, retaddr); 5357 memset(&info->attrs, 0, sizeof(info->attrs)); 5358 /* Require both ANON and MTE; see allocation_tag_mem(). */ 5359 info->tagged = (flags & PAGE_ANON) && (flags & PAGE_MTE); 5360 #else 5361 CPUTLBEntryFull *full; 5362 flags = probe_access_full(env, addr, access_type, mmu_idx, nofault, 5363 &info->host, &full, retaddr); 5364 info->attrs = full->attrs; 5365 info->tagged = full->pte_attrs == 0xf0; 5366 #endif 5367 info->flags = flags; 5368 5369 if (flags & TLB_INVALID_MASK) { 5370 g_assert(nofault); 5371 return false; 5372 } 5373 5374 /* Ensure that info->host[] is relative to addr, not addr + mem_off. */ 5375 info->host -= mem_off; 5376 return true; 5377 } 5378 5379 /* 5380 * Find first active element on each page, and a loose bound for the 5381 * final element on each page. Identify any single element that spans 5382 * the page boundary. Return true if there are any active elements. 5383 */ 5384 bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr, uint64_t *vg, 5385 intptr_t reg_max, int esz, int msize) 5386 { 5387 const int esize = 1 << esz; 5388 const uint64_t pg_mask = pred_esz_masks[esz]; 5389 intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split; 5390 intptr_t mem_off_last, mem_off_split; 5391 intptr_t page_split, elt_split; 5392 intptr_t i; 5393 5394 /* Set all of the element indices to -1, and the TLB data to 0. 
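 * (memset with -1 fills every byte with 0xff, so each of the intptr_t
 * reg_off/mem_off fields reads back as -1, meaning "no such element",
 * while the page[] data is separately zeroed just below.)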
*/ 5395 memset(info, -1, offsetof(SVEContLdSt, page)); 5396 memset(info->page, 0, sizeof(info->page)); 5397 5398 /* Gross scan over the entire predicate to find bounds. */ 5399 i = 0; 5400 do { 5401 uint64_t pg = vg[i] & pg_mask; 5402 if (pg) { 5403 reg_off_last = i * 64 + 63 - clz64(pg); 5404 if (reg_off_first < 0) { 5405 reg_off_first = i * 64 + ctz64(pg); 5406 } 5407 } 5408 } while (++i * 64 < reg_max); 5409 5410 if (unlikely(reg_off_first < 0)) { 5411 /* No active elements, no pages touched. */ 5412 return false; 5413 } 5414 tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max); 5415 5416 info->reg_off_first[0] = reg_off_first; 5417 info->mem_off_first[0] = (reg_off_first >> esz) * msize; 5418 mem_off_last = (reg_off_last >> esz) * msize; 5419 5420 page_split = -(addr | TARGET_PAGE_MASK); 5421 if (likely(mem_off_last + msize <= page_split)) { 5422 /* The entire operation fits within a single page. */ 5423 info->reg_off_last[0] = reg_off_last; 5424 return true; 5425 } 5426 5427 info->page_split = page_split; 5428 elt_split = page_split / msize; 5429 reg_off_split = elt_split << esz; 5430 mem_off_split = elt_split * msize; 5431 5432 /* 5433 * This is the last full element on the first page, but it is not 5434 * necessarily active. If there is no full element, i.e. the first 5435 * active element is the one that's split, this value remains -1. 5436 * It is useful as iteration bounds. 5437 */ 5438 if (elt_split != 0) { 5439 info->reg_off_last[0] = reg_off_split - esize; 5440 } 5441 5442 /* Determine if an unaligned element spans the pages. */ 5443 if (page_split % msize != 0) { 5444 /* It is helpful to know if the split element is active. */ 5445 if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) { 5446 info->reg_off_split = reg_off_split; 5447 info->mem_off_split = mem_off_split; 5448 5449 if (reg_off_split == reg_off_last) { 5450 /* The page crossing element is last. */ 5451 return true; 5452 } 5453 } 5454 reg_off_split += esize; 5455 mem_off_split += msize; 5456 } 5457 5458 /* 5459 * We do want the first active element on the second page, because 5460 * this may affect the address reported in an exception. 5461 */ 5462 reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz); 5463 tcg_debug_assert(reg_off_split <= reg_off_last); 5464 info->reg_off_first[1] = reg_off_split; 5465 info->mem_off_first[1] = (reg_off_split >> esz) * msize; 5466 info->reg_off_last[1] = reg_off_last; 5467 return true; 5468 } 5469 5470 /* 5471 * Resolve the guest virtual addresses to info->page[]. 5472 * Control the generation of page faults with @fault. Return false if 5473 * there is no work to do, which can only happen with @fault == FAULT_NO. 5474 */ 5475 bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault, 5476 CPUARMState *env, target_ulong addr, 5477 MMUAccessType access_type, uintptr_t retaddr) 5478 { 5479 int mmu_idx = cpu_mmu_index(env, false); 5480 int mem_off = info->mem_off_first[0]; 5481 bool nofault = fault == FAULT_NO; 5482 bool have_work = true; 5483 5484 if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off, 5485 access_type, mmu_idx, retaddr)) { 5486 /* No work to be done. */ 5487 return false; 5488 } 5489 5490 if (likely(info->page_split < 0)) { 5491 /* The entire operation was on the one page. */ 5492 return true; 5493 } 5494 5495 /* 5496 * If the second page is invalid, then we want the fault address to be 5497 * the first byte on that page which is accessed. 
5498 */ 5499 if (info->mem_off_split >= 0) { 5500 /* 5501 * There is an element split across the pages. The fault address 5502 * should be the first byte of the second page. 5503 */ 5504 mem_off = info->page_split; 5505 /* 5506 * If the split element is also the first active element 5507 * of the vector, then: For first-fault we should continue 5508 * to generate faults for the second page. For no-fault, 5509 * we have work only if the second page is valid. 5510 */ 5511 if (info->mem_off_first[0] < info->mem_off_split) { 5512 nofault = FAULT_FIRST; 5513 have_work = false; 5514 } 5515 } else { 5516 /* 5517 * There is no element split across the pages. The fault address 5518 * should be the first active element on the second page. 5519 */ 5520 mem_off = info->mem_off_first[1]; 5521 /* 5522 * There must have been one active element on the first page, 5523 * so we're out of first-fault territory. 5524 */ 5525 nofault = fault != FAULT_ALL; 5526 } 5527 5528 have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off, 5529 access_type, mmu_idx, retaddr); 5530 return have_work; 5531 } 5532 5533 #ifndef CONFIG_USER_ONLY 5534 void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env, 5535 uint64_t *vg, target_ulong addr, 5536 int esize, int msize, int wp_access, 5537 uintptr_t retaddr) 5538 { 5539 intptr_t mem_off, reg_off, reg_last; 5540 int flags0 = info->page[0].flags; 5541 int flags1 = info->page[1].flags; 5542 5543 if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) { 5544 return; 5545 } 5546 5547 /* Indicate that watchpoints are handled. */ 5548 info->page[0].flags = flags0 & ~TLB_WATCHPOINT; 5549 info->page[1].flags = flags1 & ~TLB_WATCHPOINT; 5550 5551 if (flags0 & TLB_WATCHPOINT) { 5552 mem_off = info->mem_off_first[0]; 5553 reg_off = info->reg_off_first[0]; 5554 reg_last = info->reg_off_last[0]; 5555 5556 while (reg_off <= reg_last) { 5557 uint64_t pg = vg[reg_off >> 6]; 5558 do { 5559 if ((pg >> (reg_off & 63)) & 1) { 5560 cpu_check_watchpoint(env_cpu(env), addr + mem_off, 5561 msize, info->page[0].attrs, 5562 wp_access, retaddr); 5563 } 5564 reg_off += esize; 5565 mem_off += msize; 5566 } while (reg_off <= reg_last && (reg_off & 63)); 5567 } 5568 } 5569 5570 mem_off = info->mem_off_split; 5571 if (mem_off >= 0) { 5572 cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize, 5573 info->page[0].attrs, wp_access, retaddr); 5574 } 5575 5576 mem_off = info->mem_off_first[1]; 5577 if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) { 5578 reg_off = info->reg_off_first[1]; 5579 reg_last = info->reg_off_last[1]; 5580 5581 do { 5582 uint64_t pg = vg[reg_off >> 6]; 5583 do { 5584 if ((pg >> (reg_off & 63)) & 1) { 5585 cpu_check_watchpoint(env_cpu(env), addr + mem_off, 5586 msize, info->page[1].attrs, 5587 wp_access, retaddr); 5588 } 5589 reg_off += esize; 5590 mem_off += msize; 5591 } while (reg_off & 63); 5592 } while (reg_off <= reg_last); 5593 } 5594 } 5595 #endif 5596 5597 void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env, 5598 uint64_t *vg, target_ulong addr, int esize, 5599 int msize, uint32_t mtedesc, uintptr_t ra) 5600 { 5601 intptr_t mem_off, reg_off, reg_last; 5602 5603 /* Process the page only if MemAttr == Tagged. 
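 * The tagged flag was filled in by sve_probe_page(): for user-only it
 * requires both PAGE_ANON and PAGE_MTE, and for system emulation it is
 * set when the page attributes (0xf0) indicate Tagged memory. Untagged
 * pages are skipped entirely, so no MTE check is made for their
 * elements.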
*/ 5604 if (info->page[0].tagged) { 5605 mem_off = info->mem_off_first[0]; 5606 reg_off = info->reg_off_first[0]; 5607 reg_last = info->reg_off_split; 5608 if (reg_last < 0) { 5609 reg_last = info->reg_off_last[0]; 5610 } 5611 5612 do { 5613 uint64_t pg = vg[reg_off >> 6]; 5614 do { 5615 if ((pg >> (reg_off & 63)) & 1) { 5616 mte_check(env, mtedesc, addr, ra); 5617 } 5618 reg_off += esize; 5619 mem_off += msize; 5620 } while (reg_off <= reg_last && (reg_off & 63)); 5621 } while (reg_off <= reg_last); 5622 } 5623 5624 mem_off = info->mem_off_first[1]; 5625 if (mem_off >= 0 && info->page[1].tagged) { 5626 reg_off = info->reg_off_first[1]; 5627 reg_last = info->reg_off_last[1]; 5628 5629 do { 5630 uint64_t pg = vg[reg_off >> 6]; 5631 do { 5632 if ((pg >> (reg_off & 63)) & 1) { 5633 mte_check(env, mtedesc, addr, ra); 5634 } 5635 reg_off += esize; 5636 mem_off += msize; 5637 } while (reg_off & 63); 5638 } while (reg_off <= reg_last); 5639 } 5640 } 5641 5642 /* 5643 * Common helper for all contiguous 1,2,3,4-register predicated loads. 5644 */ 5645 static inline QEMU_ALWAYS_INLINE 5646 void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr, 5647 uint32_t desc, const uintptr_t retaddr, 5648 const int esz, const int msz, const int N, uint32_t mtedesc, 5649 sve_ldst1_host_fn *host_fn, 5650 sve_ldst1_tlb_fn *tlb_fn) 5651 { 5652 const unsigned rd = simd_data(desc); 5653 const intptr_t reg_max = simd_oprsz(desc); 5654 intptr_t reg_off, reg_last, mem_off; 5655 SVEContLdSt info; 5656 void *host; 5657 int flags, i; 5658 5659 /* Find the active elements. */ 5660 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) { 5661 /* The entire predicate was false; no load occurs. */ 5662 for (i = 0; i < N; ++i) { 5663 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max); 5664 } 5665 return; 5666 } 5667 5668 /* Probe the page(s). Exit with exception for any invalid page. */ 5669 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr); 5670 5671 /* Handle watchpoints for all active elements. */ 5672 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz, 5673 BP_MEM_READ, retaddr); 5674 5675 /* 5676 * Handle mte checks for all active elements. 5677 * Since TBI must be set for MTE, !mtedesc => !mte_active. 5678 */ 5679 if (mtedesc) { 5680 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz, 5681 mtedesc, retaddr); 5682 } 5683 5684 flags = info.page[0].flags | info.page[1].flags; 5685 if (unlikely(flags != 0)) { 5686 #ifdef CONFIG_USER_ONLY 5687 g_assert_not_reached(); 5688 #else 5689 /* 5690 * At least one page includes MMIO. 5691 * Any bus operation can fail with cpu_transaction_failed, 5692 * which for ARM will raise SyncExternal. Perform the load 5693 * into scratch memory to preserve register state until the end.
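 * The loads below therefore target a local scratch array; the
 * architectural zregs are only overwritten by the memcpy at the end,
 * after every element access has completed, so an external abort taken
 * part-way through leaves the destination registers unchanged.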
5694 */ 5695 ARMVectorReg scratch[4] = { }; 5696 5697 mem_off = info.mem_off_first[0]; 5698 reg_off = info.reg_off_first[0]; 5699 reg_last = info.reg_off_last[1]; 5700 if (reg_last < 0) { 5701 reg_last = info.reg_off_split; 5702 if (reg_last < 0) { 5703 reg_last = info.reg_off_last[0]; 5704 } 5705 } 5706 5707 do { 5708 uint64_t pg = vg[reg_off >> 6]; 5709 do { 5710 if ((pg >> (reg_off & 63)) & 1) { 5711 for (i = 0; i < N; ++i) { 5712 tlb_fn(env, &scratch[i], reg_off, 5713 addr + mem_off + (i << msz), retaddr); 5714 } 5715 } 5716 reg_off += 1 << esz; 5717 mem_off += N << msz; 5718 } while (reg_off & 63); 5719 } while (reg_off <= reg_last); 5720 5721 for (i = 0; i < N; ++i) { 5722 memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max); 5723 } 5724 return; 5725 #endif 5726 } 5727 5728 /* The entire operation is in RAM, on valid pages. */ 5729 5730 for (i = 0; i < N; ++i) { 5731 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max); 5732 } 5733 5734 mem_off = info.mem_off_first[0]; 5735 reg_off = info.reg_off_first[0]; 5736 reg_last = info.reg_off_last[0]; 5737 host = info.page[0].host; 5738 5739 while (reg_off <= reg_last) { 5740 uint64_t pg = vg[reg_off >> 6]; 5741 do { 5742 if ((pg >> (reg_off & 63)) & 1) { 5743 for (i = 0; i < N; ++i) { 5744 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, 5745 host + mem_off + (i << msz)); 5746 } 5747 } 5748 reg_off += 1 << esz; 5749 mem_off += N << msz; 5750 } while (reg_off <= reg_last && (reg_off & 63)); 5751 } 5752 5753 /* 5754 * Use the slow path to manage the cross-page misalignment. 5755 * But we know this is RAM and cannot trap. 5756 */ 5757 mem_off = info.mem_off_split; 5758 if (unlikely(mem_off >= 0)) { 5759 reg_off = info.reg_off_split; 5760 for (i = 0; i < N; ++i) { 5761 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off, 5762 addr + mem_off + (i << msz), retaddr); 5763 } 5764 } 5765 5766 mem_off = info.mem_off_first[1]; 5767 if (unlikely(mem_off >= 0)) { 5768 reg_off = info.reg_off_first[1]; 5769 reg_last = info.reg_off_last[1]; 5770 host = info.page[1].host; 5771 5772 do { 5773 uint64_t pg = vg[reg_off >> 6]; 5774 do { 5775 if ((pg >> (reg_off & 63)) & 1) { 5776 for (i = 0; i < N; ++i) { 5777 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, 5778 host + mem_off + (i << msz)); 5779 } 5780 } 5781 reg_off += 1 << esz; 5782 mem_off += N << msz; 5783 } while (reg_off & 63); 5784 } while (reg_off <= reg_last); 5785 } 5786 } 5787 5788 static inline QEMU_ALWAYS_INLINE 5789 void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr, 5790 uint32_t desc, const uintptr_t ra, 5791 const int esz, const int msz, const int N, 5792 sve_ldst1_host_fn *host_fn, 5793 sve_ldst1_tlb_fn *tlb_fn) 5794 { 5795 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 5796 int bit55 = extract64(addr, 55, 1); 5797 5798 /* Remove mtedesc from the normal sve descriptor. */ 5799 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 5800 5801 /* Perform gross MTE suppression early. 
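 * Roughly: if TBI is not enabled for this half of the address space
 * (selected by bit 55), tag checking cannot apply to the access at all;
 * and if TCMA matches the pointer's allocation tag, the access is
 * unchecked. In both cases mtedesc is cleared so that the plain
 * non-MTE path is taken below.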
*/ 5802 if (!tbi_check(desc, bit55) || 5803 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) { 5804 mtedesc = 0; 5805 } 5806 5807 sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn); 5808 } 5809 5810 #define DO_LD1_1(NAME, ESZ) \ 5811 void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg, \ 5812 target_ulong addr, uint32_t desc) \ 5813 { \ 5814 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0, \ 5815 sve_##NAME##_host, sve_##NAME##_tlb); \ 5816 } \ 5817 void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg, \ 5818 target_ulong addr, uint32_t desc) \ 5819 { \ 5820 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, \ 5821 sve_##NAME##_host, sve_##NAME##_tlb); \ 5822 } 5823 5824 #define DO_LD1_2(NAME, ESZ, MSZ) \ 5825 void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg, \ 5826 target_ulong addr, uint32_t desc) \ 5827 { \ 5828 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \ 5829 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \ 5830 } \ 5831 void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg, \ 5832 target_ulong addr, uint32_t desc) \ 5833 { \ 5834 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \ 5835 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \ 5836 } \ 5837 void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg, \ 5838 target_ulong addr, uint32_t desc) \ 5839 { \ 5840 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \ 5841 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \ 5842 } \ 5843 void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg, \ 5844 target_ulong addr, uint32_t desc) \ 5845 { \ 5846 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \ 5847 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \ 5848 } 5849 5850 DO_LD1_1(ld1bb, MO_8) 5851 DO_LD1_1(ld1bhu, MO_16) 5852 DO_LD1_1(ld1bhs, MO_16) 5853 DO_LD1_1(ld1bsu, MO_32) 5854 DO_LD1_1(ld1bss, MO_32) 5855 DO_LD1_1(ld1bdu, MO_64) 5856 DO_LD1_1(ld1bds, MO_64) 5857 5858 DO_LD1_2(ld1hh, MO_16, MO_16) 5859 DO_LD1_2(ld1hsu, MO_32, MO_16) 5860 DO_LD1_2(ld1hss, MO_32, MO_16) 5861 DO_LD1_2(ld1hdu, MO_64, MO_16) 5862 DO_LD1_2(ld1hds, MO_64, MO_16) 5863 5864 DO_LD1_2(ld1ss, MO_32, MO_32) 5865 DO_LD1_2(ld1sdu, MO_64, MO_32) 5866 DO_LD1_2(ld1sds, MO_64, MO_32) 5867 5868 DO_LD1_2(ld1dd, MO_64, MO_64) 5869 5870 #undef DO_LD1_1 5871 #undef DO_LD1_2 5872 5873 #define DO_LDN_1(N) \ 5874 void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg, \ 5875 target_ulong addr, uint32_t desc) \ 5876 { \ 5877 sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0, \ 5878 sve_ld1bb_host, sve_ld1bb_tlb); \ 5879 } \ 5880 void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg, \ 5881 target_ulong addr, uint32_t desc) \ 5882 { \ 5883 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, \ 5884 sve_ld1bb_host, sve_ld1bb_tlb); \ 5885 } 5886 5887 #define DO_LDN_2(N, SUFF, ESZ) \ 5888 void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg, \ 5889 target_ulong addr, uint32_t desc) \ 5890 { \ 5891 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \ 5892 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \ 5893 } \ 5894 void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg, \ 5895 target_ulong addr, uint32_t desc) \ 5896 { \ 5897 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \ 5898 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \ 5899 } \ 5900 void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg, \ 5901 target_ulong addr, uint32_t desc) \ 5902 { \ 5903 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \ 5904 
sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \ 5905 } \ 5906 void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg, \ 5907 target_ulong addr, uint32_t desc) \ 5908 { \ 5909 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \ 5910 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \ 5911 } 5912 5913 DO_LDN_1(2) 5914 DO_LDN_1(3) 5915 DO_LDN_1(4) 5916 5917 DO_LDN_2(2, hh, MO_16) 5918 DO_LDN_2(3, hh, MO_16) 5919 DO_LDN_2(4, hh, MO_16) 5920 5921 DO_LDN_2(2, ss, MO_32) 5922 DO_LDN_2(3, ss, MO_32) 5923 DO_LDN_2(4, ss, MO_32) 5924 5925 DO_LDN_2(2, dd, MO_64) 5926 DO_LDN_2(3, dd, MO_64) 5927 DO_LDN_2(4, dd, MO_64) 5928 5929 #undef DO_LDN_1 5930 #undef DO_LDN_2 5931 5932 /* 5933 * Load contiguous data, first-fault and no-fault. 5934 * 5935 * For user-only, one could argue that we should hold the mmap_lock during 5936 * the operation so that there is no race between page_check_range and the 5937 * load operation. However, unmapping pages out from under a running thread 5938 * is extraordinarily unlikely. This theoretical race condition also affects 5939 * linux-user/ in its get_user/put_user macros. 5940 * 5941 * TODO: Construct some helpers, written in assembly, that interact with 5942 * host_signal_handler to produce memory ops which can properly report errors 5943 * without racing. 5944 */ 5945 5946 /* Fault on byte I. All bits in FFR from I are cleared. The vector 5947 * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE 5948 * option, which leaves subsequent data unchanged. 5949 */ 5950 static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz) 5951 { 5952 uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p; 5953 5954 if (i & 63) { 5955 ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63); 5956 i = ROUND_UP(i, 64); 5957 } 5958 for (; i < oprsz; i += 64) { 5959 ffr[i / 64] = 0; 5960 } 5961 } 5962 5963 /* 5964 * Common helper for all contiguous no-fault and first-fault loads. 5965 */ 5966 static inline QEMU_ALWAYS_INLINE 5967 void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr, 5968 uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc, 5969 const int esz, const int msz, const SVEContFault fault, 5970 sve_ldst1_host_fn *host_fn, 5971 sve_ldst1_tlb_fn *tlb_fn) 5972 { 5973 const unsigned rd = simd_data(desc); 5974 void *vd = &env->vfp.zregs[rd]; 5975 const intptr_t reg_max = simd_oprsz(desc); 5976 intptr_t reg_off, mem_off, reg_last; 5977 SVEContLdSt info; 5978 int flags; 5979 void *host; 5980 5981 /* Find the active elements. */ 5982 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) { 5983 /* The entire predicate was false; no load occurs. */ 5984 memset(vd, 0, reg_max); 5985 return; 5986 } 5987 reg_off = info.reg_off_first[0]; 5988 5989 /* Probe the page(s). */ 5990 if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) { 5991 /* Fault on first element. */ 5992 tcg_debug_assert(fault == FAULT_NO); 5993 memset(vd, 0, reg_max); 5994 goto do_fault; 5995 } 5996 5997 mem_off = info.mem_off_first[0]; 5998 flags = info.page[0].flags; 5999 6000 /* 6001 * Disable MTE checking if the Tagged bit is not set. Since TBI must 6002 * be set within MTEDESC for MTE, !mtedesc => !mte_active. 6003 */ 6004 if (!info.page[0].tagged) { 6005 mtedesc = 0; 6006 } 6007 6008 if (fault == FAULT_FIRST) { 6009 /* Trapping mte check for the first-fault element. 
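 * For FAULT_FIRST the first active element behaves as a normal load,
 * so a tag-check failure here must trap -- hence mte_check() rather
 * than the non-trapping mte_probe() used for the remaining elements,
 * whose failures merely truncate FFR via record_fault().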
*/ 6010 if (mtedesc) { 6011 mte_check(env, mtedesc, addr + mem_off, retaddr); 6012 } 6013 6014 /* 6015 * Special handling of the first active element, 6016 * if it crosses a page boundary or is MMIO. 6017 */ 6018 bool is_split = mem_off == info.mem_off_split; 6019 if (unlikely(flags != 0) || unlikely(is_split)) { 6020 /* 6021 * Use the slow path for cross-page handling. 6022 * Might trap for MMIO or watchpoints. 6023 */ 6024 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr); 6025 6026 /* After any fault, zero the other elements. */ 6027 swap_memzero(vd, reg_off); 6028 reg_off += 1 << esz; 6029 mem_off += 1 << msz; 6030 swap_memzero(vd + reg_off, reg_max - reg_off); 6031 6032 if (is_split) { 6033 goto second_page; 6034 } 6035 } else { 6036 memset(vd, 0, reg_max); 6037 } 6038 } else { 6039 memset(vd, 0, reg_max); 6040 if (unlikely(mem_off == info.mem_off_split)) { 6041 /* The first active element crosses a page boundary. */ 6042 flags |= info.page[1].flags; 6043 if (unlikely(flags & TLB_MMIO)) { 6044 /* Some page is MMIO, see below. */ 6045 goto do_fault; 6046 } 6047 if (unlikely(flags & TLB_WATCHPOINT) && 6048 (cpu_watchpoint_address_matches 6049 (env_cpu(env), addr + mem_off, 1 << msz) 6050 & BP_MEM_READ)) { 6051 /* Watchpoint hit, see below. */ 6052 goto do_fault; 6053 } 6054 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) { 6055 goto do_fault; 6056 } 6057 /* 6058 * Use the slow path for cross-page handling. 6059 * This is RAM, without a watchpoint, and will not trap. 6060 */ 6061 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr); 6062 goto second_page; 6063 } 6064 } 6065 6066 /* 6067 * From this point on, all memory operations are MemSingleNF. 6068 * 6069 * Per the MemSingleNF pseudocode, a no-fault load from Device memory 6070 * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead. 6071 * 6072 * Unfortunately we do not have access to the memory attributes from the 6073 * PTE to tell Device memory from Normal memory. So we make a mostly 6074 * correct check, and indicate (UNKNOWN, FAULT) for any MMIO. 6075 * This gives the right answer for the common cases of "Normal memory, 6076 * backed by host RAM" and "Device memory, backed by MMIO". 6077 * The architecture allows us to suppress an NF load and return 6078 * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner 6079 * case of "Normal memory, backed by MMIO" is permitted. The case we 6080 * get wrong is "Device memory, backed by host RAM", for which we 6081 * should return (UNKNOWN, FAULT) but do not. 6082 * 6083 * Similarly, CPU_BP breakpoints would raise exceptions, and so 6084 * return (UNKNOWN, FAULT). For simplicity, we consider gdb and 6085 * architectural breakpoints the same. 6086 */ 6087 if (unlikely(flags & TLB_MMIO)) { 6088 goto do_fault; 6089 } 6090 6091 reg_last = info.reg_off_last[0]; 6092 host = info.page[0].host; 6093 6094 do { 6095 uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3)); 6096 do { 6097 if ((pg >> (reg_off & 63)) & 1) { 6098 if (unlikely(flags & TLB_WATCHPOINT) && 6099 (cpu_watchpoint_address_matches 6100 (env_cpu(env), addr + mem_off, 1 << msz) 6101 & BP_MEM_READ)) { 6102 goto do_fault; 6103 } 6104 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) { 6105 goto do_fault; 6106 } 6107 host_fn(vd, reg_off, host + mem_off); 6108 } 6109 reg_off += 1 << esz; 6110 mem_off += 1 << msz; 6111 } while (reg_off <= reg_last && (reg_off & 63)); 6112 } while (reg_off <= reg_last); 6113 6114 /* 6115 * MemSingleNF is allowed to fail for any reason.
We have special 6116 * code above to handle the first element crossing a page boundary. 6117 * As an implementation choice, decline to handle a cross-page element 6118 * in any other position. 6119 */ 6120 reg_off = info.reg_off_split; 6121 if (reg_off >= 0) { 6122 goto do_fault; 6123 } 6124 6125 second_page: 6126 reg_off = info.reg_off_first[1]; 6127 if (likely(reg_off < 0)) { 6128 /* No active elements on the second page. All done. */ 6129 return; 6130 } 6131 6132 /* 6133 * MemSingleNF is allowed to fail for any reason. As an implementation 6134 * choice, decline to handle elements on the second page. This should 6135 * be low frequency as the guest walks through memory -- the next 6136 * iteration of the guest's loop should be aligned on the page boundary, 6137 * and then all following iterations will stay aligned. 6138 */ 6139 6140 do_fault: 6141 record_fault(env, reg_off, reg_max); 6142 } 6143 6144 static inline QEMU_ALWAYS_INLINE 6145 void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr, 6146 uint32_t desc, const uintptr_t retaddr, 6147 const int esz, const int msz, const SVEContFault fault, 6148 sve_ldst1_host_fn *host_fn, 6149 sve_ldst1_tlb_fn *tlb_fn) 6150 { 6151 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6152 int bit55 = extract64(addr, 55, 1); 6153 6154 /* Remove mtedesc from the normal sve descriptor. */ 6155 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6156 6157 /* Perform gross MTE suppression early. */ 6158 if (!tbi_check(desc, bit55) || 6159 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) { 6160 mtedesc = 0; 6161 } 6162 6163 sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc, 6164 esz, msz, fault, host_fn, tlb_fn); 6165 } 6166 6167 #define DO_LDFF1_LDNF1_1(PART, ESZ) \ 6168 void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg, \ 6169 target_ulong addr, uint32_t desc) \ 6170 { \ 6171 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \ 6172 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ 6173 } \ 6174 void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg, \ 6175 target_ulong addr, uint32_t desc) \ 6176 { \ 6177 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \ 6178 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ 6179 } \ 6180 void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg, \ 6181 target_ulong addr, uint32_t desc) \ 6182 { \ 6183 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \ 6184 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ 6185 } \ 6186 void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg, \ 6187 target_ulong addr, uint32_t desc) \ 6188 { \ 6189 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \ 6190 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ 6191 } 6192 6193 #define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \ 6194 void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg, \ 6195 target_ulong addr, uint32_t desc) \ 6196 { \ 6197 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \ 6198 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ 6199 } \ 6200 void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg, \ 6201 target_ulong addr, uint32_t desc) \ 6202 { \ 6203 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \ 6204 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ 6205 } \ 6206 void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg, \ 6207 target_ulong addr, uint32_t desc) \ 6208 { \ 6209 sve_ldnfff1_r(env, vg, addr, desc, 
GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \ 6210 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ 6211 } \ 6212 void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg, \ 6213 target_ulong addr, uint32_t desc) \ 6214 { \ 6215 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \ 6216 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ 6217 } \ 6218 void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg, \ 6219 target_ulong addr, uint32_t desc) \ 6220 { \ 6221 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \ 6222 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ 6223 } \ 6224 void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg, \ 6225 target_ulong addr, uint32_t desc) \ 6226 { \ 6227 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \ 6228 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ 6229 } \ 6230 void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg, \ 6231 target_ulong addr, uint32_t desc) \ 6232 { \ 6233 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \ 6234 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ 6235 } \ 6236 void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg, \ 6237 target_ulong addr, uint32_t desc) \ 6238 { \ 6239 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \ 6240 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ 6241 } 6242 6243 DO_LDFF1_LDNF1_1(bb, MO_8) 6244 DO_LDFF1_LDNF1_1(bhu, MO_16) 6245 DO_LDFF1_LDNF1_1(bhs, MO_16) 6246 DO_LDFF1_LDNF1_1(bsu, MO_32) 6247 DO_LDFF1_LDNF1_1(bss, MO_32) 6248 DO_LDFF1_LDNF1_1(bdu, MO_64) 6249 DO_LDFF1_LDNF1_1(bds, MO_64) 6250 6251 DO_LDFF1_LDNF1_2(hh, MO_16, MO_16) 6252 DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16) 6253 DO_LDFF1_LDNF1_2(hss, MO_32, MO_16) 6254 DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16) 6255 DO_LDFF1_LDNF1_2(hds, MO_64, MO_16) 6256 6257 DO_LDFF1_LDNF1_2(ss, MO_32, MO_32) 6258 DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32) 6259 DO_LDFF1_LDNF1_2(sds, MO_64, MO_32) 6260 6261 DO_LDFF1_LDNF1_2(dd, MO_64, MO_64) 6262 6263 #undef DO_LDFF1_LDNF1_1 6264 #undef DO_LDFF1_LDNF1_2 6265 6266 /* 6267 * Common helper for all contiguous 1,2,3,4-register predicated stores. 6268 */ 6269 6270 static inline QEMU_ALWAYS_INLINE 6271 void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr, 6272 uint32_t desc, const uintptr_t retaddr, 6273 const int esz, const int msz, const int N, uint32_t mtedesc, 6274 sve_ldst1_host_fn *host_fn, 6275 sve_ldst1_tlb_fn *tlb_fn) 6276 { 6277 const unsigned rd = simd_data(desc); 6278 const intptr_t reg_max = simd_oprsz(desc); 6279 intptr_t reg_off, reg_last, mem_off; 6280 SVEContLdSt info; 6281 void *host; 6282 int i, flags; 6283 6284 /* Find the active elements. */ 6285 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) { 6286 /* The entire predicate was false; no store occurs. */ 6287 return; 6288 } 6289 6290 /* Probe the page(s). Exit with exception for any invalid page. */ 6291 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr); 6292 6293 /* Handle watchpoints for all active elements. */ 6294 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz, 6295 BP_MEM_WRITE, retaddr); 6296 6297 /* 6298 * Handle mte checks for all active elements. 6299 * Since TBI must be set for MTE, !mtedesc => !mte_active. 
6300 */ 6301 if (mtedesc) { 6302 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz, 6303 mtedesc, retaddr); 6304 } 6305 6306 flags = info.page[0].flags | info.page[1].flags; 6307 if (unlikely(flags != 0)) { 6308 #ifdef CONFIG_USER_ONLY 6309 g_assert_not_reached(); 6310 #else 6311 /* 6312 * At least one page includes MMIO. 6313 * Any bus operation can fail with cpu_transaction_failed, 6314 * which for ARM will raise SyncExternal. We cannot avoid 6315 * this fault and will leave with the store incomplete. 6316 */ 6317 mem_off = info.mem_off_first[0]; 6318 reg_off = info.reg_off_first[0]; 6319 reg_last = info.reg_off_last[1]; 6320 if (reg_last < 0) { 6321 reg_last = info.reg_off_split; 6322 if (reg_last < 0) { 6323 reg_last = info.reg_off_last[0]; 6324 } 6325 } 6326 6327 do { 6328 uint64_t pg = vg[reg_off >> 6]; 6329 do { 6330 if ((pg >> (reg_off & 63)) & 1) { 6331 for (i = 0; i < N; ++i) { 6332 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off, 6333 addr + mem_off + (i << msz), retaddr); 6334 } 6335 } 6336 reg_off += 1 << esz; 6337 mem_off += N << msz; 6338 } while (reg_off & 63); 6339 } while (reg_off <= reg_last); 6340 return; 6341 #endif 6342 } 6343 6344 mem_off = info.mem_off_first[0]; 6345 reg_off = info.reg_off_first[0]; 6346 reg_last = info.reg_off_last[0]; 6347 host = info.page[0].host; 6348 6349 while (reg_off <= reg_last) { 6350 uint64_t pg = vg[reg_off >> 6]; 6351 do { 6352 if ((pg >> (reg_off & 63)) & 1) { 6353 for (i = 0; i < N; ++i) { 6354 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, 6355 host + mem_off + (i << msz)); 6356 } 6357 } 6358 reg_off += 1 << esz; 6359 mem_off += N << msz; 6360 } while (reg_off <= reg_last && (reg_off & 63)); 6361 } 6362 6363 /* 6364 * Use the slow path to manage the cross-page misalignment. 6365 * But we know this is RAM and cannot trap. 6366 */ 6367 mem_off = info.mem_off_split; 6368 if (unlikely(mem_off >= 0)) { 6369 reg_off = info.reg_off_split; 6370 for (i = 0; i < N; ++i) { 6371 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off, 6372 addr + mem_off + (i << msz), retaddr); 6373 } 6374 } 6375 6376 mem_off = info.mem_off_first[1]; 6377 if (unlikely(mem_off >= 0)) { 6378 reg_off = info.reg_off_first[1]; 6379 reg_last = info.reg_off_last[1]; 6380 host = info.page[1].host; 6381 6382 do { 6383 uint64_t pg = vg[reg_off >> 6]; 6384 do { 6385 if ((pg >> (reg_off & 63)) & 1) { 6386 for (i = 0; i < N; ++i) { 6387 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, 6388 host + mem_off + (i << msz)); 6389 } 6390 } 6391 reg_off += 1 << esz; 6392 mem_off += N << msz; 6393 } while (reg_off & 63); 6394 } while (reg_off <= reg_last); 6395 } 6396 } 6397 6398 static inline QEMU_ALWAYS_INLINE 6399 void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr, 6400 uint32_t desc, const uintptr_t ra, 6401 const int esz, const int msz, const int N, 6402 sve_ldst1_host_fn *host_fn, 6403 sve_ldst1_tlb_fn *tlb_fn) 6404 { 6405 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6406 int bit55 = extract64(addr, 55, 1); 6407 6408 /* Remove mtedesc from the normal sve descriptor. */ 6409 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6410 6411 /* Perform gross MTE suppression early. 
*/ 6412 if (!tbi_check(desc, bit55) || 6413 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) { 6414 mtedesc = 0; 6415 } 6416 6417 sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn); 6418 } 6419 6420 #define DO_STN_1(N, NAME, ESZ) \ 6421 void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg, \ 6422 target_ulong addr, uint32_t desc) \ 6423 { \ 6424 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0, \ 6425 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \ 6426 } \ 6427 void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg, \ 6428 target_ulong addr, uint32_t desc) \ 6429 { \ 6430 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, \ 6431 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \ 6432 } 6433 6434 #define DO_STN_2(N, NAME, ESZ, MSZ) \ 6435 void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg, \ 6436 target_ulong addr, uint32_t desc) \ 6437 { \ 6438 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \ 6439 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \ 6440 } \ 6441 void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg, \ 6442 target_ulong addr, uint32_t desc) \ 6443 { \ 6444 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \ 6445 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \ 6446 } \ 6447 void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg, \ 6448 target_ulong addr, uint32_t desc) \ 6449 { \ 6450 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \ 6451 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \ 6452 } \ 6453 void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg, \ 6454 target_ulong addr, uint32_t desc) \ 6455 { \ 6456 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \ 6457 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \ 6458 } 6459 6460 DO_STN_1(1, bb, MO_8) 6461 DO_STN_1(1, bh, MO_16) 6462 DO_STN_1(1, bs, MO_32) 6463 DO_STN_1(1, bd, MO_64) 6464 DO_STN_1(2, bb, MO_8) 6465 DO_STN_1(3, bb, MO_8) 6466 DO_STN_1(4, bb, MO_8) 6467 6468 DO_STN_2(1, hh, MO_16, MO_16) 6469 DO_STN_2(1, hs, MO_32, MO_16) 6470 DO_STN_2(1, hd, MO_64, MO_16) 6471 DO_STN_2(2, hh, MO_16, MO_16) 6472 DO_STN_2(3, hh, MO_16, MO_16) 6473 DO_STN_2(4, hh, MO_16, MO_16) 6474 6475 DO_STN_2(1, ss, MO_32, MO_32) 6476 DO_STN_2(1, sd, MO_64, MO_32) 6477 DO_STN_2(2, ss, MO_32, MO_32) 6478 DO_STN_2(3, ss, MO_32, MO_32) 6479 DO_STN_2(4, ss, MO_32, MO_32) 6480 6481 DO_STN_2(1, dd, MO_64, MO_64) 6482 DO_STN_2(2, dd, MO_64, MO_64) 6483 DO_STN_2(3, dd, MO_64, MO_64) 6484 DO_STN_2(4, dd, MO_64, MO_64) 6485 6486 #undef DO_STN_1 6487 #undef DO_STN_2 6488 6489 /* 6490 * Loads with a vector index. 6491 */ 6492 6493 /* 6494 * Load the element at @reg + @reg_ofs, sign or zero-extend as needed. 
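 * The off_* helpers below decode one element of the index vector:
 * zsu extracts an unsigned 32-bit offset, zss a sign-extended 32-bit
 * offset, and zd a full 64-bit offset; the _s/_d suffix gives the
 * size of the vector element holding the offset.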
6495 */ 6496 typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs); 6497 6498 static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs) 6499 { 6500 return *(uint32_t *)(reg + H1_4(reg_ofs)); 6501 } 6502 6503 static target_ulong off_zss_s(void *reg, intptr_t reg_ofs) 6504 { 6505 return *(int32_t *)(reg + H1_4(reg_ofs)); 6506 } 6507 6508 static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs) 6509 { 6510 return (uint32_t)*(uint64_t *)(reg + reg_ofs); 6511 } 6512 6513 static target_ulong off_zss_d(void *reg, intptr_t reg_ofs) 6514 { 6515 return (int32_t)*(uint64_t *)(reg + reg_ofs); 6516 } 6517 6518 static target_ulong off_zd_d(void *reg, intptr_t reg_ofs) 6519 { 6520 return *(uint64_t *)(reg + reg_ofs); 6521 } 6522 6523 static inline QEMU_ALWAYS_INLINE 6524 void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 6525 target_ulong base, uint32_t desc, uintptr_t retaddr, 6526 uint32_t mtedesc, int esize, int msize, 6527 zreg_off_fn *off_fn, 6528 sve_ldst1_host_fn *host_fn, 6529 sve_ldst1_tlb_fn *tlb_fn) 6530 { 6531 const int mmu_idx = cpu_mmu_index(env, false); 6532 const intptr_t reg_max = simd_oprsz(desc); 6533 const int scale = simd_data(desc); 6534 ARMVectorReg scratch; 6535 intptr_t reg_off; 6536 SVEHostPage info, info2; 6537 6538 memset(&scratch, 0, reg_max); 6539 reg_off = 0; 6540 do { 6541 uint64_t pg = vg[reg_off >> 6]; 6542 do { 6543 if (likely(pg & 1)) { 6544 target_ulong addr = base + (off_fn(vm, reg_off) << scale); 6545 target_ulong in_page = -(addr | TARGET_PAGE_MASK); 6546 6547 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD, 6548 mmu_idx, retaddr); 6549 6550 if (likely(in_page >= msize)) { 6551 if (unlikely(info.flags & TLB_WATCHPOINT)) { 6552 cpu_check_watchpoint(env_cpu(env), addr, msize, 6553 info.attrs, BP_MEM_READ, retaddr); 6554 } 6555 if (mtedesc && info.tagged) { 6556 mte_check(env, mtedesc, addr, retaddr); 6557 } 6558 if (unlikely(info.flags & TLB_MMIO)) { 6559 tlb_fn(env, &scratch, reg_off, addr, retaddr); 6560 } else { 6561 host_fn(&scratch, reg_off, info.host); 6562 } 6563 } else { 6564 /* Element crosses the page boundary. */ 6565 sve_probe_page(&info2, false, env, addr + in_page, 0, 6566 MMU_DATA_LOAD, mmu_idx, retaddr); 6567 if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) { 6568 cpu_check_watchpoint(env_cpu(env), addr, 6569 msize, info.attrs, 6570 BP_MEM_READ, retaddr); 6571 } 6572 if (mtedesc && info.tagged) { 6573 mte_check(env, mtedesc, addr, retaddr); 6574 } 6575 tlb_fn(env, &scratch, reg_off, addr, retaddr); 6576 } 6577 } 6578 reg_off += esize; 6579 pg >>= esize; 6580 } while (reg_off & 63); 6581 } while (reg_off < reg_max); 6582 6583 /* Wait until all exceptions have been raised to write back. */ 6584 memcpy(vd, &scratch, reg_max); 6585 } 6586 6587 static inline QEMU_ALWAYS_INLINE 6588 void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 6589 target_ulong base, uint32_t desc, uintptr_t retaddr, 6590 int esize, int msize, zreg_off_fn *off_fn, 6591 sve_ldst1_host_fn *host_fn, 6592 sve_ldst1_tlb_fn *tlb_fn) 6593 { 6594 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6595 /* Remove mtedesc from the normal sve descriptor. */ 6596 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6597 6598 /* 6599 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot 6600 * offset base entirely over the address space hole to change the 6601 * pointer tag, or change the bit55 selector. So we could here 6602 * examine TBI + TCMA like we do for sve_ldN_r_mte(). 
6603 */ 6604 sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc, 6605 esize, msize, off_fn, host_fn, tlb_fn); 6606 } 6607 6608 #define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \ 6609 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ 6610 void *vm, target_ulong base, uint32_t desc) \ 6611 { \ 6612 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \ 6613 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6614 } \ 6615 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ 6616 void *vm, target_ulong base, uint32_t desc) \ 6617 { \ 6618 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \ 6619 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6620 } 6621 6622 #define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \ 6623 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ 6624 void *vm, target_ulong base, uint32_t desc) \ 6625 { \ 6626 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \ 6627 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6628 } \ 6629 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ 6630 void *vm, target_ulong base, uint32_t desc) \ 6631 { \ 6632 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \ 6633 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6634 } 6635 6636 DO_LD1_ZPZ_S(bsu, zsu, MO_8) 6637 DO_LD1_ZPZ_S(bsu, zss, MO_8) 6638 DO_LD1_ZPZ_D(bdu, zsu, MO_8) 6639 DO_LD1_ZPZ_D(bdu, zss, MO_8) 6640 DO_LD1_ZPZ_D(bdu, zd, MO_8) 6641 6642 DO_LD1_ZPZ_S(bss, zsu, MO_8) 6643 DO_LD1_ZPZ_S(bss, zss, MO_8) 6644 DO_LD1_ZPZ_D(bds, zsu, MO_8) 6645 DO_LD1_ZPZ_D(bds, zss, MO_8) 6646 DO_LD1_ZPZ_D(bds, zd, MO_8) 6647 6648 DO_LD1_ZPZ_S(hsu_le, zsu, MO_16) 6649 DO_LD1_ZPZ_S(hsu_le, zss, MO_16) 6650 DO_LD1_ZPZ_D(hdu_le, zsu, MO_16) 6651 DO_LD1_ZPZ_D(hdu_le, zss, MO_16) 6652 DO_LD1_ZPZ_D(hdu_le, zd, MO_16) 6653 6654 DO_LD1_ZPZ_S(hsu_be, zsu, MO_16) 6655 DO_LD1_ZPZ_S(hsu_be, zss, MO_16) 6656 DO_LD1_ZPZ_D(hdu_be, zsu, MO_16) 6657 DO_LD1_ZPZ_D(hdu_be, zss, MO_16) 6658 DO_LD1_ZPZ_D(hdu_be, zd, MO_16) 6659 6660 DO_LD1_ZPZ_S(hss_le, zsu, MO_16) 6661 DO_LD1_ZPZ_S(hss_le, zss, MO_16) 6662 DO_LD1_ZPZ_D(hds_le, zsu, MO_16) 6663 DO_LD1_ZPZ_D(hds_le, zss, MO_16) 6664 DO_LD1_ZPZ_D(hds_le, zd, MO_16) 6665 6666 DO_LD1_ZPZ_S(hss_be, zsu, MO_16) 6667 DO_LD1_ZPZ_S(hss_be, zss, MO_16) 6668 DO_LD1_ZPZ_D(hds_be, zsu, MO_16) 6669 DO_LD1_ZPZ_D(hds_be, zss, MO_16) 6670 DO_LD1_ZPZ_D(hds_be, zd, MO_16) 6671 6672 DO_LD1_ZPZ_S(ss_le, zsu, MO_32) 6673 DO_LD1_ZPZ_S(ss_le, zss, MO_32) 6674 DO_LD1_ZPZ_D(sdu_le, zsu, MO_32) 6675 DO_LD1_ZPZ_D(sdu_le, zss, MO_32) 6676 DO_LD1_ZPZ_D(sdu_le, zd, MO_32) 6677 6678 DO_LD1_ZPZ_S(ss_be, zsu, MO_32) 6679 DO_LD1_ZPZ_S(ss_be, zss, MO_32) 6680 DO_LD1_ZPZ_D(sdu_be, zsu, MO_32) 6681 DO_LD1_ZPZ_D(sdu_be, zss, MO_32) 6682 DO_LD1_ZPZ_D(sdu_be, zd, MO_32) 6683 6684 DO_LD1_ZPZ_D(sds_le, zsu, MO_32) 6685 DO_LD1_ZPZ_D(sds_le, zss, MO_32) 6686 DO_LD1_ZPZ_D(sds_le, zd, MO_32) 6687 6688 DO_LD1_ZPZ_D(sds_be, zsu, MO_32) 6689 DO_LD1_ZPZ_D(sds_be, zss, MO_32) 6690 DO_LD1_ZPZ_D(sds_be, zd, MO_32) 6691 6692 DO_LD1_ZPZ_D(dd_le, zsu, MO_64) 6693 DO_LD1_ZPZ_D(dd_le, zss, MO_64) 6694 DO_LD1_ZPZ_D(dd_le, zd, MO_64) 6695 6696 DO_LD1_ZPZ_D(dd_be, zsu, MO_64) 6697 DO_LD1_ZPZ_D(dd_be, zss, MO_64) 6698 DO_LD1_ZPZ_D(dd_be, zd, MO_64) 6699 6700 #undef DO_LD1_ZPZ_S 6701 #undef DO_LD1_ZPZ_D 6702 6703 /* First fault loads with a vector index. */ 6704 6705 /* 6706 * Common helpers for all gather first-faulting loads. 
6707 */ 6708 6709 static inline QEMU_ALWAYS_INLINE 6710 void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 6711 target_ulong base, uint32_t desc, uintptr_t retaddr, 6712 uint32_t mtedesc, const int esz, const int msz, 6713 zreg_off_fn *off_fn, 6714 sve_ldst1_host_fn *host_fn, 6715 sve_ldst1_tlb_fn *tlb_fn) 6716 { 6717 const int mmu_idx = cpu_mmu_index(env, false); 6718 const intptr_t reg_max = simd_oprsz(desc); 6719 const int scale = simd_data(desc); 6720 const int esize = 1 << esz; 6721 const int msize = 1 << msz; 6722 intptr_t reg_off; 6723 SVEHostPage info; 6724 target_ulong addr, in_page; 6725 6726 /* Skip to the first true predicate. */ 6727 reg_off = find_next_active(vg, 0, reg_max, esz); 6728 if (unlikely(reg_off >= reg_max)) { 6729 /* The entire predicate was false; no load occurs. */ 6730 memset(vd, 0, reg_max); 6731 return; 6732 } 6733 6734 /* 6735 * Probe the first element, allowing faults. 6736 */ 6737 addr = base + (off_fn(vm, reg_off) << scale); 6738 if (mtedesc) { 6739 mte_check(env, mtedesc, addr, retaddr); 6740 } 6741 tlb_fn(env, vd, reg_off, addr, retaddr); 6742 6743 /* After any fault, zero the other elements. */ 6744 swap_memzero(vd, reg_off); 6745 reg_off += esize; 6746 swap_memzero(vd + reg_off, reg_max - reg_off); 6747 6748 /* 6749 * Probe the remaining elements, not allowing faults. 6750 */ 6751 while (reg_off < reg_max) { 6752 uint64_t pg = vg[reg_off >> 6]; 6753 do { 6754 if (likely((pg >> (reg_off & 63)) & 1)) { 6755 addr = base + (off_fn(vm, reg_off) << scale); 6756 in_page = -(addr | TARGET_PAGE_MASK); 6757 6758 if (unlikely(in_page < msize)) { 6759 /* Stop if the element crosses a page boundary. */ 6760 goto fault; 6761 } 6762 6763 sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD, 6764 mmu_idx, retaddr); 6765 if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) { 6766 goto fault; 6767 } 6768 if (unlikely(info.flags & TLB_WATCHPOINT) && 6769 (cpu_watchpoint_address_matches 6770 (env_cpu(env), addr, msize) & BP_MEM_READ)) { 6771 goto fault; 6772 } 6773 if (mtedesc && info.tagged && !mte_probe(env, mtedesc, addr)) { 6774 goto fault; 6775 } 6776 6777 host_fn(vd, reg_off, info.host); 6778 } 6779 reg_off += esize; 6780 } while (reg_off & 63); 6781 } 6782 return; 6783 6784 fault: 6785 record_fault(env, reg_off, reg_max); 6786 } 6787 6788 static inline QEMU_ALWAYS_INLINE 6789 void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 6790 target_ulong base, uint32_t desc, uintptr_t retaddr, 6791 const int esz, const int msz, 6792 zreg_off_fn *off_fn, 6793 sve_ldst1_host_fn *host_fn, 6794 sve_ldst1_tlb_fn *tlb_fn) 6795 { 6796 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6797 /* Remove mtedesc from the normal sve descriptor. */ 6798 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6799 6800 /* 6801 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot 6802 * offset base entirely over the address space hole to change the 6803 * pointer tag, or change the bit55 selector. So we could here 6804 * examine TBI + TCMA like we do for sve_ldN_r_mte(). 
6805 */ 6806 sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc, 6807 esz, msz, off_fn, host_fn, tlb_fn); 6808 } 6809 6810 #define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ) \ 6811 void HELPER(sve_ldff##MEM##_##OFS) \ 6812 (CPUARMState *env, void *vd, void *vg, \ 6813 void *vm, target_ulong base, uint32_t desc) \ 6814 { \ 6815 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ, \ 6816 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6817 } \ 6818 void HELPER(sve_ldff##MEM##_##OFS##_mte) \ 6819 (CPUARMState *env, void *vd, void *vg, \ 6820 void *vm, target_ulong base, uint32_t desc) \ 6821 { \ 6822 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ, \ 6823 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6824 } 6825 6826 #define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ) \ 6827 void HELPER(sve_ldff##MEM##_##OFS) \ 6828 (CPUARMState *env, void *vd, void *vg, \ 6829 void *vm, target_ulong base, uint32_t desc) \ 6830 { \ 6831 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ, \ 6832 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6833 } \ 6834 void HELPER(sve_ldff##MEM##_##OFS##_mte) \ 6835 (CPUARMState *env, void *vd, void *vg, \ 6836 void *vm, target_ulong base, uint32_t desc) \ 6837 { \ 6838 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ, \ 6839 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6840 } 6841 6842 DO_LDFF1_ZPZ_S(bsu, zsu, MO_8) 6843 DO_LDFF1_ZPZ_S(bsu, zss, MO_8) 6844 DO_LDFF1_ZPZ_D(bdu, zsu, MO_8) 6845 DO_LDFF1_ZPZ_D(bdu, zss, MO_8) 6846 DO_LDFF1_ZPZ_D(bdu, zd, MO_8) 6847 6848 DO_LDFF1_ZPZ_S(bss, zsu, MO_8) 6849 DO_LDFF1_ZPZ_S(bss, zss, MO_8) 6850 DO_LDFF1_ZPZ_D(bds, zsu, MO_8) 6851 DO_LDFF1_ZPZ_D(bds, zss, MO_8) 6852 DO_LDFF1_ZPZ_D(bds, zd, MO_8) 6853 6854 DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16) 6855 DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16) 6856 DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16) 6857 DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16) 6858 DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16) 6859 6860 DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16) 6861 DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16) 6862 DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16) 6863 DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16) 6864 DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16) 6865 6866 DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16) 6867 DO_LDFF1_ZPZ_S(hss_le, zss, MO_16) 6868 DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16) 6869 DO_LDFF1_ZPZ_D(hds_le, zss, MO_16) 6870 DO_LDFF1_ZPZ_D(hds_le, zd, MO_16) 6871 6872 DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16) 6873 DO_LDFF1_ZPZ_S(hss_be, zss, MO_16) 6874 DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16) 6875 DO_LDFF1_ZPZ_D(hds_be, zss, MO_16) 6876 DO_LDFF1_ZPZ_D(hds_be, zd, MO_16) 6877 6878 DO_LDFF1_ZPZ_S(ss_le, zsu, MO_32) 6879 DO_LDFF1_ZPZ_S(ss_le, zss, MO_32) 6880 DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32) 6881 DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32) 6882 DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32) 6883 6884 DO_LDFF1_ZPZ_S(ss_be, zsu, MO_32) 6885 DO_LDFF1_ZPZ_S(ss_be, zss, MO_32) 6886 DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32) 6887 DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32) 6888 DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32) 6889 6890 DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32) 6891 DO_LDFF1_ZPZ_D(sds_le, zss, MO_32) 6892 DO_LDFF1_ZPZ_D(sds_le, zd, MO_32) 6893 6894 DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32) 6895 DO_LDFF1_ZPZ_D(sds_be, zss, MO_32) 6896 DO_LDFF1_ZPZ_D(sds_be, zd, MO_32) 6897 6898 DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64) 6899 DO_LDFF1_ZPZ_D(dd_le, zss, MO_64) 6900 DO_LDFF1_ZPZ_D(dd_le, zd, MO_64) 6901 6902 DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64) 6903 DO_LDFF1_ZPZ_D(dd_be, zss, MO_64) 6904 DO_LDFF1_ZPZ_D(dd_be, zd, MO_64) 6905 6906 /* Stores with a vector index. 
*/ 6907 6908 static inline QEMU_ALWAYS_INLINE 6909 void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 6910 target_ulong base, uint32_t desc, uintptr_t retaddr, 6911 uint32_t mtedesc, int esize, int msize, 6912 zreg_off_fn *off_fn, 6913 sve_ldst1_host_fn *host_fn, 6914 sve_ldst1_tlb_fn *tlb_fn) 6915 { 6916 const int mmu_idx = cpu_mmu_index(env, false); 6917 const intptr_t reg_max = simd_oprsz(desc); 6918 const int scale = simd_data(desc); 6919 void *host[ARM_MAX_VQ * 4]; 6920 intptr_t reg_off, i; 6921 SVEHostPage info, info2; 6922 6923 /* 6924 * Probe all of the elements for host addresses and flags. 6925 */ 6926 i = reg_off = 0; 6927 do { 6928 uint64_t pg = vg[reg_off >> 6]; 6929 do { 6930 target_ulong addr = base + (off_fn(vm, reg_off) << scale); 6931 target_ulong in_page = -(addr | TARGET_PAGE_MASK); 6932 6933 host[i] = NULL; 6934 if (likely((pg >> (reg_off & 63)) & 1)) { 6935 if (likely(in_page >= msize)) { 6936 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE, 6937 mmu_idx, retaddr); 6938 if (!(info.flags & TLB_MMIO)) { 6939 host[i] = info.host; 6940 } 6941 } else { 6942 /* 6943 * Element crosses the page boundary. 6944 * Probe both pages, but do not record the host address, 6945 * so that we use the slow path. 6946 */ 6947 sve_probe_page(&info, false, env, addr, 0, 6948 MMU_DATA_STORE, mmu_idx, retaddr); 6949 sve_probe_page(&info2, false, env, addr + in_page, 0, 6950 MMU_DATA_STORE, mmu_idx, retaddr); 6951 info.flags |= info2.flags; 6952 } 6953 6954 if (unlikely(info.flags & TLB_WATCHPOINT)) { 6955 cpu_check_watchpoint(env_cpu(env), addr, msize, 6956 info.attrs, BP_MEM_WRITE, retaddr); 6957 } 6958 6959 if (mtedesc && info.tagged) { 6960 mte_check(env, mtedesc, addr, retaddr); 6961 } 6962 } 6963 i += 1; 6964 reg_off += esize; 6965 } while (reg_off & 63); 6966 } while (reg_off < reg_max); 6967 6968 /* 6969 * Now that we have recognized all exceptions except SyncExternal 6970 * (from TLB_MMIO), which we cannot avoid, perform all of the stores. 6971 * 6972 * Note for the common case of an element in RAM, not crossing a page 6973 * boundary, we have stored the host address in host[]. This doubles 6974 * as a first-level check against the predicate, since only enabled 6975 * elements have non-null host addresses. 6976 */ 6977 i = reg_off = 0; 6978 do { 6979 void *h = host[i]; 6980 if (likely(h != NULL)) { 6981 host_fn(vd, reg_off, h); 6982 } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) { 6983 target_ulong addr = base + (off_fn(vm, reg_off) << scale); 6984 tlb_fn(env, vd, reg_off, addr, retaddr); 6985 } 6986 i += 1; 6987 reg_off += esize; 6988 } while (reg_off < reg_max); 6989 } 6990 6991 static inline QEMU_ALWAYS_INLINE 6992 void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 6993 target_ulong base, uint32_t desc, uintptr_t retaddr, 6994 int esize, int msize, zreg_off_fn *off_fn, 6995 sve_ldst1_host_fn *host_fn, 6996 sve_ldst1_tlb_fn *tlb_fn) 6997 { 6998 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6999 /* Remove mtedesc from the normal sve descriptor. */ 7000 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 7001 7002 /* 7003 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot 7004 * offset base entirely over the address space hole to change the 7005 * pointer tag, or change the bit55 selector. So we could here 7006 * examine TBI + TCMA like we do for sve_ldN_r_mte(). 
7007 */ 7008 sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc, 7009 esize, msize, off_fn, host_fn, tlb_fn); 7010 } 7011 7012 #define DO_ST1_ZPZ_S(MEM, OFS, MSZ) \ 7013 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ 7014 void *vm, target_ulong base, uint32_t desc) \ 7015 { \ 7016 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \ 7017 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ 7018 } \ 7019 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ 7020 void *vm, target_ulong base, uint32_t desc) \ 7021 { \ 7022 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \ 7023 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ 7024 } 7025 7026 #define DO_ST1_ZPZ_D(MEM, OFS, MSZ) \ 7027 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ 7028 void *vm, target_ulong base, uint32_t desc) \ 7029 { \ 7030 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \ 7031 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ 7032 } \ 7033 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ 7034 void *vm, target_ulong base, uint32_t desc) \ 7035 { \ 7036 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \ 7037 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ 7038 } 7039 7040 DO_ST1_ZPZ_S(bs, zsu, MO_8) 7041 DO_ST1_ZPZ_S(hs_le, zsu, MO_16) 7042 DO_ST1_ZPZ_S(hs_be, zsu, MO_16) 7043 DO_ST1_ZPZ_S(ss_le, zsu, MO_32) 7044 DO_ST1_ZPZ_S(ss_be, zsu, MO_32) 7045 7046 DO_ST1_ZPZ_S(bs, zss, MO_8) 7047 DO_ST1_ZPZ_S(hs_le, zss, MO_16) 7048 DO_ST1_ZPZ_S(hs_be, zss, MO_16) 7049 DO_ST1_ZPZ_S(ss_le, zss, MO_32) 7050 DO_ST1_ZPZ_S(ss_be, zss, MO_32) 7051 7052 DO_ST1_ZPZ_D(bd, zsu, MO_8) 7053 DO_ST1_ZPZ_D(hd_le, zsu, MO_16) 7054 DO_ST1_ZPZ_D(hd_be, zsu, MO_16) 7055 DO_ST1_ZPZ_D(sd_le, zsu, MO_32) 7056 DO_ST1_ZPZ_D(sd_be, zsu, MO_32) 7057 DO_ST1_ZPZ_D(dd_le, zsu, MO_64) 7058 DO_ST1_ZPZ_D(dd_be, zsu, MO_64) 7059 7060 DO_ST1_ZPZ_D(bd, zss, MO_8) 7061 DO_ST1_ZPZ_D(hd_le, zss, MO_16) 7062 DO_ST1_ZPZ_D(hd_be, zss, MO_16) 7063 DO_ST1_ZPZ_D(sd_le, zss, MO_32) 7064 DO_ST1_ZPZ_D(sd_be, zss, MO_32) 7065 DO_ST1_ZPZ_D(dd_le, zss, MO_64) 7066 DO_ST1_ZPZ_D(dd_be, zss, MO_64) 7067 7068 DO_ST1_ZPZ_D(bd, zd, MO_8) 7069 DO_ST1_ZPZ_D(hd_le, zd, MO_16) 7070 DO_ST1_ZPZ_D(hd_be, zd, MO_16) 7071 DO_ST1_ZPZ_D(sd_le, zd, MO_32) 7072 DO_ST1_ZPZ_D(sd_be, zd, MO_32) 7073 DO_ST1_ZPZ_D(dd_le, zd, MO_64) 7074 DO_ST1_ZPZ_D(dd_be, zd, MO_64) 7075 7076 #undef DO_ST1_ZPZ_S 7077 #undef DO_ST1_ZPZ_D 7078 7079 void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) 7080 { 7081 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 7082 uint64_t *d = vd, *n = vn, *m = vm, *k = vk; 7083 7084 for (i = 0; i < opr_sz; ++i) { 7085 d[i] = n[i] ^ m[i] ^ k[i]; 7086 } 7087 } 7088 7089 void HELPER(sve2_bcax)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) 7090 { 7091 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 7092 uint64_t *d = vd, *n = vn, *m = vm, *k = vk; 7093 7094 for (i = 0; i < opr_sz; ++i) { 7095 d[i] = n[i] ^ (m[i] & ~k[i]); 7096 } 7097 } 7098 7099 void HELPER(sve2_bsl1n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) 7100 { 7101 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 7102 uint64_t *d = vd, *n = vn, *m = vm, *k = vk; 7103 7104 for (i = 0; i < opr_sz; ++i) { 7105 d[i] = (~n[i] & k[i]) | (m[i] & ~k[i]); 7106 } 7107 } 7108 7109 void HELPER(sve2_bsl2n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) 7110 { 7111 intptr_t i, opr_sz = simd_oprsz(desc) / 
8; 7112 uint64_t *d = vd, *n = vn, *m = vm, *k = vk; 7113 7114 for (i = 0; i < opr_sz; ++i) { 7115 d[i] = (n[i] & k[i]) | (~m[i] & ~k[i]); 7116 } 7117 } 7118 7119 void HELPER(sve2_nbsl)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) 7120 { 7121 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 7122 uint64_t *d = vd, *n = vn, *m = vm, *k = vk; 7123 7124 for (i = 0; i < opr_sz; ++i) { 7125 d[i] = ~((n[i] & k[i]) | (m[i] & ~k[i])); 7126 } 7127 } 7128 7129 /* 7130 * Returns true if m0 or m1 contains the low uint8_t/uint16_t in n. 7131 * See hasless(v,1) from 7132 * https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord 7133 */ 7134 static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz) 7135 { 7136 int bits = 8 << esz; 7137 uint64_t ones = dup_const(esz, 1); 7138 uint64_t signs = ones << (bits - 1); 7139 uint64_t cmp0, cmp1; 7140 7141 cmp1 = dup_const(esz, n); 7142 cmp0 = cmp1 ^ m0; 7143 cmp1 = cmp1 ^ m1; 7144 cmp0 = (cmp0 - ones) & ~cmp0; 7145 cmp1 = (cmp1 - ones) & ~cmp1; 7146 return (cmp0 | cmp1) & signs; 7147 } 7148 7149 static inline uint32_t do_match(void *vd, void *vn, void *vm, void *vg, 7150 uint32_t desc, int esz, bool nmatch) 7151 { 7152 uint16_t esz_mask = pred_esz_masks[esz]; 7153 intptr_t opr_sz = simd_oprsz(desc); 7154 uint32_t flags = PREDTEST_INIT; 7155 intptr_t i, j, k; 7156 7157 for (i = 0; i < opr_sz; i += 16) { 7158 uint64_t m0 = *(uint64_t *)(vm + i); 7159 uint64_t m1 = *(uint64_t *)(vm + i + 8); 7160 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)) & esz_mask; 7161 uint16_t out = 0; 7162 7163 for (j = 0; j < 16; j += 8) { 7164 uint64_t n = *(uint64_t *)(vn + i + j); 7165 7166 for (k = 0; k < 8; k += 1 << esz) { 7167 if (pg & (1 << (j + k))) { 7168 bool o = do_match2(n >> (k * 8), m0, m1, esz); 7169 out |= (o ^ nmatch) << (j + k); 7170 } 7171 } 7172 } 7173 *(uint16_t *)(vd + H1_2(i >> 3)) = out; 7174 flags = iter_predtest_fwd(out, pg, flags); 7175 } 7176 return flags; 7177 } 7178 7179 #define DO_PPZZ_MATCH(NAME, ESZ, INV) \ 7180 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 7181 { \ 7182 return do_match(vd, vn, vm, vg, desc, ESZ, INV); \ 7183 } 7184 7185 DO_PPZZ_MATCH(sve2_match_ppzz_b, MO_8, false) 7186 DO_PPZZ_MATCH(sve2_match_ppzz_h, MO_16, false) 7187 7188 DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true) 7189 DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true) 7190 7191 #undef DO_PPZZ_MATCH 7192 7193 void HELPER(sve2_histcnt_s)(void *vd, void *vn, void *vm, void *vg, 7194 uint32_t desc) 7195 { 7196 ARMVectorReg scratch; 7197 intptr_t i, j; 7198 intptr_t opr_sz = simd_oprsz(desc); 7199 uint32_t *d = vd, *n = vn, *m = vm; 7200 uint8_t *pg = vg; 7201 7202 if (d == n) { 7203 n = memcpy(&scratch, n, opr_sz); 7204 if (d == m) { 7205 m = n; 7206 } 7207 } else if (d == m) { 7208 m = memcpy(&scratch, m, opr_sz); 7209 } 7210 7211 for (i = 0; i < opr_sz; i += 4) { 7212 uint64_t count = 0; 7213 uint8_t pred; 7214 7215 pred = pg[H1(i >> 3)] >> (i & 7); 7216 if (pred & 1) { 7217 uint32_t nn = n[H4(i >> 2)]; 7218 7219 for (j = 0; j <= i; j += 4) { 7220 pred = pg[H1(j >> 3)] >> (j & 7); 7221 if ((pred & 1) && nn == m[H4(j >> 2)]) { 7222 ++count; 7223 } 7224 } 7225 } 7226 d[H4(i >> 2)] = count; 7227 } 7228 } 7229 7230 void HELPER(sve2_histcnt_d)(void *vd, void *vn, void *vm, void *vg, 7231 uint32_t desc) 7232 { 7233 ARMVectorReg scratch; 7234 intptr_t i, j; 7235 intptr_t opr_sz = simd_oprsz(desc); 7236 uint64_t *d = vd, *n = vn, *m = vm; 7237 uint8_t *pg = vg; 7238 7239 if (d == n) { 7240 n = memcpy(&scratch, n, opr_sz); 7241 if (d 
== m) { 7242 m = n; 7243 } 7244 } else if (d == m) { 7245 m = memcpy(&scratch, m, opr_sz); 7246 } 7247 7248 for (i = 0; i < opr_sz / 8; ++i) { 7249 uint64_t count = 0; 7250 if (pg[H1(i)] & 1) { 7251 uint64_t nn = n[i]; 7252 for (j = 0; j <= i; ++j) { 7253 if ((pg[H1(j)] & 1) && nn == m[j]) { 7254 ++count; 7255 } 7256 } 7257 } 7258 d[i] = count; 7259 } 7260 } 7261 7262 /* 7263 * Returns the number of bytes in m0 and m1 that match n. 7264 * Unlike do_match2 we don't just need true/false, we need an exact count. 7265 * This requires two extra logical operations. 7266 */ 7267 static inline uint64_t do_histseg_cnt(uint8_t n, uint64_t m0, uint64_t m1) 7268 { 7269 const uint64_t mask = dup_const(MO_8, 0x7f); 7270 uint64_t cmp0, cmp1; 7271 7272 cmp1 = dup_const(MO_8, n); 7273 cmp0 = cmp1 ^ m0; 7274 cmp1 = cmp1 ^ m1; 7275 7276 /* 7277 * 1: clear msb of each byte to avoid carry to next byte (& mask) 7278 * 2: carry in to msb if byte != 0 (+ mask) 7279 * 3: set msb if cmp has msb set (| cmp) 7280 * 4: set ~msb to ignore them (| mask) 7281 * We now have 0xff for byte != 0 or 0x7f for byte == 0. 7282 * 5: invert, resulting in 0x80 if and only if byte == 0. 7283 */ 7284 cmp0 = ~(((cmp0 & mask) + mask) | cmp0 | mask); 7285 cmp1 = ~(((cmp1 & mask) + mask) | cmp1 | mask); 7286 7287 /* 7288 * Combine the two compares in a way that the bits do 7289 * not overlap, and so preserves the count of set bits. 7290 * If the host has an efficient instruction for ctpop, 7291 * then ctpop(x) + ctpop(y) has the same number of 7292 * operations as ctpop(x | (y >> 1)). If the host does 7293 * not have an efficient ctpop, then we only want to 7294 * use it once. 7295 */ 7296 return ctpop64(cmp0 | (cmp1 >> 1)); 7297 } 7298 7299 void HELPER(sve2_histseg)(void *vd, void *vn, void *vm, uint32_t desc) 7300 { 7301 intptr_t i, j; 7302 intptr_t opr_sz = simd_oprsz(desc); 7303 7304 for (i = 0; i < opr_sz; i += 16) { 7305 uint64_t n0 = *(uint64_t *)(vn + i); 7306 uint64_t m0 = *(uint64_t *)(vm + i); 7307 uint64_t n1 = *(uint64_t *)(vn + i + 8); 7308 uint64_t m1 = *(uint64_t *)(vm + i + 8); 7309 uint64_t out0 = 0; 7310 uint64_t out1 = 0; 7311 7312 for (j = 0; j < 64; j += 8) { 7313 uint64_t cnt0 = do_histseg_cnt(n0 >> j, m0, m1); 7314 uint64_t cnt1 = do_histseg_cnt(n1 >> j, m0, m1); 7315 out0 |= cnt0 << j; 7316 out1 |= cnt1 << j; 7317 } 7318 7319 *(uint64_t *)(vd + i) = out0; 7320 *(uint64_t *)(vd + i + 8) = out1; 7321 } 7322 } 7323 7324 void HELPER(sve2_xar_b)(void *vd, void *vn, void *vm, uint32_t desc) 7325 { 7326 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 7327 int shr = simd_data(desc); 7328 int shl = 8 - shr; 7329 uint64_t mask = dup_const(MO_8, 0xff >> shr); 7330 uint64_t *d = vd, *n = vn, *m = vm; 7331 7332 for (i = 0; i < opr_sz; ++i) { 7333 uint64_t t = n[i] ^ m[i]; 7334 d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask); 7335 } 7336 } 7337 7338 void HELPER(sve2_xar_h)(void *vd, void *vn, void *vm, uint32_t desc) 7339 { 7340 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 7341 int shr = simd_data(desc); 7342 int shl = 16 - shr; 7343 uint64_t mask = dup_const(MO_16, 0xffff >> shr); 7344 uint64_t *d = vd, *n = vn, *m = vm; 7345 7346 for (i = 0; i < opr_sz; ++i) { 7347 uint64_t t = n[i] ^ m[i]; 7348 d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask); 7349 } 7350 } 7351 7352 void HELPER(sve2_xar_s)(void *vd, void *vn, void *vm, uint32_t desc) 7353 { 7354 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 7355 int shr = simd_data(desc); 7356 uint32_t *d = vd, *n = vn, *m = vm; 7357 7358 for (i = 0; i < opr_sz; ++i) { 7359 d[i] = ror32(n[i] ^ 
m[i], shr); 7360 } 7361 } 7362 7363 void HELPER(fmmla_s)(void *vd, void *vn, void *vm, void *va, 7364 void *status, uint32_t desc) 7365 { 7366 intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float32) * 4); 7367 7368 for (s = 0; s < opr_sz; ++s) { 7369 float32 *n = vn + s * sizeof(float32) * 4; 7370 float32 *m = vm + s * sizeof(float32) * 4; 7371 float32 *a = va + s * sizeof(float32) * 4; 7372 float32 *d = vd + s * sizeof(float32) * 4; 7373 float32 n00 = n[H4(0)], n01 = n[H4(1)]; 7374 float32 n10 = n[H4(2)], n11 = n[H4(3)]; 7375 float32 m00 = m[H4(0)], m01 = m[H4(1)]; 7376 float32 m10 = m[H4(2)], m11 = m[H4(3)]; 7377 float32 p0, p1; 7378 7379 /* i = 0, j = 0 */ 7380 p0 = float32_mul(n00, m00, status); 7381 p1 = float32_mul(n01, m01, status); 7382 d[H4(0)] = float32_add(a[H4(0)], float32_add(p0, p1, status), status); 7383 7384 /* i = 0, j = 1 */ 7385 p0 = float32_mul(n00, m10, status); 7386 p1 = float32_mul(n01, m11, status); 7387 d[H4(1)] = float32_add(a[H4(1)], float32_add(p0, p1, status), status); 7388 7389 /* i = 1, j = 0 */ 7390 p0 = float32_mul(n10, m00, status); 7391 p1 = float32_mul(n11, m01, status); 7392 d[H4(2)] = float32_add(a[H4(2)], float32_add(p0, p1, status), status); 7393 7394 /* i = 1, j = 1 */ 7395 p0 = float32_mul(n10, m10, status); 7396 p1 = float32_mul(n11, m11, status); 7397 d[H4(3)] = float32_add(a[H4(3)], float32_add(p0, p1, status), status); 7398 } 7399 } 7400 7401 void HELPER(fmmla_d)(void *vd, void *vn, void *vm, void *va, 7402 void *status, uint32_t desc) 7403 { 7404 intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float64) * 4); 7405 7406 for (s = 0; s < opr_sz; ++s) { 7407 float64 *n = vn + s * sizeof(float64) * 4; 7408 float64 *m = vm + s * sizeof(float64) * 4; 7409 float64 *a = va + s * sizeof(float64) * 4; 7410 float64 *d = vd + s * sizeof(float64) * 4; 7411 float64 n00 = n[0], n01 = n[1], n10 = n[2], n11 = n[3]; 7412 float64 m00 = m[0], m01 = m[1], m10 = m[2], m11 = m[3]; 7413 float64 p0, p1; 7414 7415 /* i = 0, j = 0 */ 7416 p0 = float64_mul(n00, m00, status); 7417 p1 = float64_mul(n01, m01, status); 7418 d[0] = float64_add(a[0], float64_add(p0, p1, status), status); 7419 7420 /* i = 0, j = 1 */ 7421 p0 = float64_mul(n00, m10, status); 7422 p1 = float64_mul(n01, m11, status); 7423 d[1] = float64_add(a[1], float64_add(p0, p1, status), status); 7424 7425 /* i = 1, j = 0 */ 7426 p0 = float64_mul(n10, m00, status); 7427 p1 = float64_mul(n11, m01, status); 7428 d[2] = float64_add(a[2], float64_add(p0, p1, status), status); 7429 7430 /* i = 1, j = 1 */ 7431 p0 = float64_mul(n10, m10, status); 7432 p1 = float64_mul(n11, m11, status); 7433 d[3] = float64_add(a[3], float64_add(p0, p1, status), status); 7434 } 7435 } 7436 7437 #define DO_FCVTNT(NAME, TYPEW, TYPEN, HW, HN, OP) \ 7438 void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \ 7439 { \ 7440 intptr_t i = simd_oprsz(desc); \ 7441 uint64_t *g = vg; \ 7442 do { \ 7443 uint64_t pg = g[(i - 1) >> 6]; \ 7444 do { \ 7445 i -= sizeof(TYPEW); \ 7446 if (likely((pg >> (i & 63)) & 1)) { \ 7447 TYPEW nn = *(TYPEW *)(vn + HW(i)); \ 7448 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, status); \ 7449 } \ 7450 } while (i & 63); \ 7451 } while (i != 0); \ 7452 } 7453 7454 DO_FCVTNT(sve_bfcvtnt, uint32_t, uint16_t, H1_4, H1_2, float32_to_bfloat16) 7455 DO_FCVTNT(sve2_fcvtnt_sh, uint32_t, uint16_t, H1_4, H1_2, sve_f32_to_f16) 7456 DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t, H1_8, H1_4, float64_to_float32) 7457 7458 #define DO_FCVTLT(NAME, TYPEW, TYPEN, HW, HN, OP) \ 7459 void HELPER(NAME)(void *vd, 
void *vn, void *vg, void *status, uint32_t desc) \ 7460 { \ 7461 intptr_t i = simd_oprsz(desc); \ 7462 uint64_t *g = vg; \ 7463 do { \ 7464 uint64_t pg = g[(i - 1) >> 6]; \ 7465 do { \ 7466 i -= sizeof(TYPEW); \ 7467 if (likely((pg >> (i & 63)) & 1)) { \ 7468 TYPEN nn = *(TYPEN *)(vn + HN(i + sizeof(TYPEN))); \ 7469 *(TYPEW *)(vd + HW(i)) = OP(nn, status); \ 7470 } \ 7471 } while (i & 63); \ 7472 } while (i != 0); \ 7473 } 7474 7475 DO_FCVTLT(sve2_fcvtlt_hs, uint32_t, uint16_t, H1_4, H1_2, sve_f16_to_f32) 7476 DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, H1_8, H1_4, float32_to_float64) 7477 7478 #undef DO_FCVTLT 7479 #undef DO_FCVTNT
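/*
 * Editor's note: the following is an illustrative, stand-alone sketch and
 * is NOT part of the upstream helper.  It demonstrates the "zero byte in
 * word" trick that do_match2 and do_histseg_cnt above rely on: after
 * XOR-ing in a broadcast copy of the needle byte, a byte equals the needle
 * exactly when it is zero, and ~(((x & 0x7f..) + 0x7f..) | x | 0x7f..)
 * leaves the msb of each byte set iff that byte is zero.  do_histseg_cnt
 * additionally ORs the second compare shifted right by one so a single
 * ctpop counts matches in both words.  The MATCH_TRICK_DEMO guard and the
 * use of __builtin_popcountll are assumptions made only for this sketch.
 */
#ifdef MATCH_TRICK_DEMO
#include <stdint.h>
#include <stdio.h>

static uint64_t zero_byte_msbs(uint64_t x)
{
    const uint64_t mask = 0x7f7f7f7f7f7f7f7full;

    /* msb of each byte of the result is set iff that byte of x is zero */
    return ~(((x & mask) + mask) | x | mask);
}

static int count_equal_bytes_ref(uint8_t n, uint64_t m)
{
    int count = 0;

    for (int k = 0; k < 64; k += 8) {
        count += ((m >> k) & 0xff) == n;
    }
    return count;
}

int main(void)
{
    const uint64_t m = 0x00112200331100ffull;
    const uint8_t n = 0x11;
    uint64_t cmp = zero_byte_msbs(m ^ (0x0101010101010101ull * n));
    int fast = __builtin_popcountll(cmp);  /* one msb set per matching byte */
    int ref = count_equal_bytes_ref(n, m);

    printf("matches of 0x%02x: fast=%d ref=%d\n", n, fast, ref);
    return fast != ref;
}
#endif /* MATCH_TRICK_DEMO */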
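/*
 * Editor's note: stand-alone sketch, not part of the upstream helper.  It
 * checks the lane-wise rotate used by sve2_xar_b above: rotating every
 * byte of a 64-bit word right by the same amount needs only one right
 * shift, one left shift and a mask (0xff >> shr replicated to all lanes)
 * that keeps each partial result inside its own byte lane.  The
 * XAR_LANE_DEMO guard is an assumption made only for this sketch.
 */
#ifdef XAR_LANE_DEMO
#include <stdint.h>
#include <assert.h>

static uint64_t ror8_lanes(uint64_t t, int shr)
{
    int shl = 8 - shr;
    uint64_t mask = 0x0101010101010101ull * (0xffu >> shr);

    /* same expression as the loop body of sve2_xar_b */
    return ((t >> shr) & mask) | ((t << shl) & ~mask);
}

static uint64_t ror8_lanes_ref(uint64_t t, int shr)
{
    uint64_t r = 0;

    for (int k = 0; k < 64; k += 8) {
        uint8_t b = t >> k;
        uint8_t rb = (b >> shr) | (b << (8 - shr));
        r |= (uint64_t)rb << k;
    }
    return r;
}

int main(void)
{
    const uint64_t t = 0x8001f37c55aa0123ull;

    /* any rotate amount from 1 to 8 works for byte lanes */
    for (int shr = 1; shr <= 8; shr++) {
        assert(ror8_lanes(t, shr) == ror8_lanes_ref(t, shr));
    }
    return 0;
}
#endif /* XAR_LANE_DEMO */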
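/*
 * Editor's note: stand-alone sketch, not part of the upstream helper.  It
 * restates one segment of fmmla_s in plain float to make the data layout
 * explicit: every 128-bit segment of N, M, A and D holds a row-major 2x2
 * tile, and the helper computes D = A + N * transpose(M), i.e.
 * d[i*2+j] += n[i*2+0]*m[j*2+0] + n[i*2+1]*m[j*2+1].  Softfloat
 * rounding/status handling is ignored here; the FMMLA_TILE_DEMO guard is
 * an assumption made only for this sketch.
 */
#ifdef FMMLA_TILE_DEMO
#include <stdio.h>

static void fmmla_tile_ref(const float n[4], const float m[4],
                           const float a[4], float d[4])
{
    for (int i = 0; i < 2; i++) {
        for (int j = 0; j < 2; j++) {
            /* row i of N dotted with row j of M, then added to A */
            float p = n[i * 2 + 0] * m[j * 2 + 0]
                    + n[i * 2 + 1] * m[j * 2 + 1];
            d[i * 2 + j] = a[i * 2 + j] + p;
        }
    }
}

int main(void)
{
    const float n[4] = { 1, 2, 3, 4 };
    const float m[4] = { 5, 6, 7, 8 };
    const float a[4] = { 0, 0, 0, 0 };
    float d[4];

    fmmla_tile_ref(n, m, a, d);
    /* expect { 17, 23, 39, 53 } */
    printf("%g %g %g %g\n", d[0], d[1], d[2], d[3]);
    return 0;
}
#endif /* FMMLA_TILE_DEMO */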