qemu

FORK: QEMU emulator
git clone https://git.neptards.moe/neptards/qemu.git

sve_helper.c (278800B)


      1 /*
      2  * ARM SVE Operations
      3  *
      4  * Copyright (c) 2018 Linaro, Ltd.
      5  *
      6  * This library is free software; you can redistribute it and/or
      7  * modify it under the terms of the GNU Lesser General Public
      8  * License as published by the Free Software Foundation; either
      9  * version 2.1 of the License, or (at your option) any later version.
     10  *
     11  * This library is distributed in the hope that it will be useful,
     12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
     13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     14  * Lesser General Public License for more details.
     15  *
     16  * You should have received a copy of the GNU Lesser General Public
     17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
     18  */
     19 
     20 #include "qemu/osdep.h"
     21 #include "cpu.h"
     22 #include "internals.h"
     23 #include "exec/exec-all.h"
     24 #include "exec/helper-proto.h"
     25 #include "tcg/tcg-gvec-desc.h"
     26 #include "fpu/softfloat.h"
     27 #include "tcg/tcg.h"
     28 #include "vec_internal.h"
     29 #include "sve_ldst_internal.h"
     30 
     31 
     32 /* Return a value for NZCV as per the ARM PredTest pseudofunction.
     33  *
     34  * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
     35  * and bit 0 set if C is set.  Compare the definitions of these variables
     36  * within CPUARMState.
     37  */
     38 
     39 /* For no G bits set, NZCV = C.  */
     40 #define PREDTEST_INIT  1
     41 
     42 /* This is an iterative function, called for each Pd and Pg word
     43  * moving forward.
     44  */
     45 static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
     46 {
     47     if (likely(g)) {
     48         /* Compute N from first D & G.
     49            Use bit 2 to signal first G bit seen.  */
     50         if (!(flags & 4)) {
     51             flags |= ((d & (g & -g)) != 0) << 31;
     52             flags |= 4;
     53         }
     54 
     55         /* Accumulate Z from each D & G.  */
     56         flags |= ((d & g) != 0) << 1;
     57 
     58         /* Compute C from last !(D & G).  Replace previous.  */
     59         flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
     60     }
     61     return flags;
     62 }
     63 
     64 /* This is an iterative function, called for each Pd and Pg word
     65  * moving backward.
     66  */
     67 static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
     68 {
     69     if (likely(g)) {
      70         /* Compute C from first (i.e. last) !(D & G).
     71            Use bit 2 to signal first G bit seen.  */
     72         if (!(flags & 4)) {
     73             flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
     74             flags |= (d & pow2floor(g)) == 0;
     75         }
     76 
     77         /* Accumulate Z from each D & G.  */
     78         flags |= ((d & g) != 0) << 1;
     79 
      80         /* Compute N from last (i.e. first) D & G.  Replace previous.  */
     81         flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
     82     }
     83     return flags;
     84 }
     85 
     86 /* The same for a single word predicate.  */
     87 uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
     88 {
     89     return iter_predtest_fwd(d, g, PREDTEST_INIT);
     90 }
     91 
     92 /* The same for a multi-word predicate.  */
     93 uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
     94 {
     95     uint32_t flags = PREDTEST_INIT;
     96     uint64_t *d = vd, *g = vg;
     97     uintptr_t i = 0;
     98 
     99     do {
    100         flags = iter_predtest_fwd(d[i], g[i], flags);
    101     } while (++i < words);
    102 
    103     return flags;
    104 }
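
/*
 * Illustrative sketch (not from the original source): a minimal spot check
 * of the forward iteration on one predicate word.  The function name is
 * invented; G_GNUC_UNUSED and g_assert come from glib via osdep.h.
 */
static G_GNUC_UNUSED void example_predtest_word(void)
{
    /*
     * g = 0x0101: two governed byte elements; d = 0x0001: the first is
     * true, the last is false.  So N is set (first active element true),
     * Z is clear (some active element true, signalled by bit 1), and C is
     * set (last active element false).  Bit 2 is internal bookkeeping.
     */
    g_assert(iter_predtest_fwd(0x0001, 0x0101, PREDTEST_INIT) == 0x80000007u);
}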
    105 
     106 /* Expand the predicate bits governing 4-byte (word) elements into a 64-bit mask.  */
    107 static inline uint64_t expand_pred_s(uint8_t byte)
    108 {
    109     static const uint64_t word[] = {
    110         [0x01] = 0x00000000ffffffffull,
    111         [0x10] = 0xffffffff00000000ull,
    112         [0x11] = 0xffffffffffffffffull,
    113     };
    114     return word[byte & 0x11];
    115 }
    116 
    117 #define LOGICAL_PPPP(NAME, FUNC) \
    118 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
    119 {                                                                         \
    120     uintptr_t opr_sz = simd_oprsz(desc);                                  \
    121     uint64_t *d = vd, *n = vn, *m = vm, *g = vg;                          \
    122     uintptr_t i;                                                          \
    123     for (i = 0; i < opr_sz / 8; ++i) {                                    \
    124         d[i] = FUNC(n[i], m[i], g[i]);                                    \
    125     }                                                                     \
    126 }
    127 
    128 #define DO_AND(N, M, G)  (((N) & (M)) & (G))
    129 #define DO_BIC(N, M, G)  (((N) & ~(M)) & (G))
    130 #define DO_EOR(N, M, G)  (((N) ^ (M)) & (G))
    131 #define DO_ORR(N, M, G)  (((N) | (M)) & (G))
    132 #define DO_ORN(N, M, G)  (((N) | ~(M)) & (G))
    133 #define DO_NOR(N, M, G)  (~((N) | (M)) & (G))
    134 #define DO_NAND(N, M, G) (~((N) & (M)) & (G))
    135 #define DO_SEL(N, M, G)  (((N) & (G)) | ((M) & ~(G)))
    136 
    137 LOGICAL_PPPP(sve_and_pppp, DO_AND)
    138 LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
    139 LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
    140 LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
    141 LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
    142 LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
    143 LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
    144 LOGICAL_PPPP(sve_nand_pppp, DO_NAND)
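
/*
 * Illustrative sketch (not from the original source): these expanders
 * combine whole 64-bit predicate words at a time.  A spot check of DO_SEL,
 * which takes N-bits where the governing predicate G is set and M-bits
 * elsewhere.  The function name is invented.
 */
static G_GNUC_UNUSED void example_pred_sel_word(void)
{
    uint64_t n = 0x00000000ffffffffull;
    uint64_t m = 0xffffffff00000000ull;
    uint64_t g = 0x0f0f0f0f0f0f0f0full;

    g_assert(DO_SEL(n, m, g) == 0xf0f0f0f00f0f0f0full);
}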
    145 
    146 #undef DO_AND
    147 #undef DO_BIC
    148 #undef DO_EOR
    149 #undef DO_ORR
    150 #undef DO_ORN
    151 #undef DO_NOR
    152 #undef DO_NAND
    153 #undef DO_SEL
    154 #undef LOGICAL_PPPP
    155 
    156 /* Fully general three-operand expander, controlled by a predicate.
    157  * This is complicated by the host-endian storage of the register file.
    158  */
    159 /* ??? I don't expect the compiler could ever vectorize this itself.
    160  * With some tables we can convert bit masks to byte masks, and with
    161  * extra care wrt byte/word ordering we could use gcc generic vectors
    162  * and do 16 bytes at a time.
    163  */
    164 #define DO_ZPZZ(NAME, TYPE, H, OP)                                       \
    165 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
    166 {                                                                       \
    167     intptr_t i, opr_sz = simd_oprsz(desc);                              \
    168     for (i = 0; i < opr_sz; ) {                                         \
    169         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
    170         do {                                                            \
    171             if (pg & 1) {                                               \
    172                 TYPE nn = *(TYPE *)(vn + H(i));                         \
    173                 TYPE mm = *(TYPE *)(vm + H(i));                         \
    174                 *(TYPE *)(vd + H(i)) = OP(nn, mm);                      \
    175             }                                                           \
    176             i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
    177         } while (i & 15);                                               \
    178     }                                                                   \
    179 }
    180 
    181 /* Similarly, specialized for 64-bit operands.  */
    182 #define DO_ZPZZ_D(NAME, TYPE, OP)                                \
    183 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
    184 {                                                               \
    185     intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
    186     TYPE *d = vd, *n = vn, *m = vm;                             \
    187     uint8_t *pg = vg;                                           \
    188     for (i = 0; i < opr_sz; i += 1) {                           \
    189         if (pg[H1(i)] & 1) {                                    \
    190             TYPE nn = n[i], mm = m[i];                          \
    191             d[i] = OP(nn, mm);                                  \
    192         }                                                       \
    193     }                                                           \
    194 }
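
/*
 * Illustrative sketch (not from the original source): a direct call of one
 * expansion below, showing the merging behaviour -- elements whose predicate
 * bit is clear are left untouched.  Assumes simd_desc() from
 * tcg-gvec-desc.h and the helper_sve_add_zpzz_b prototype from
 * helper-proto.h; the function name is invented.
 */
static G_GNUC_UNUSED void example_zpzz_add_merging(void)
{
    uint8_t d[16], n[16], m[16];
    uint16_t g = 0x0001;                 /* only element 0 is active */

    for (int i = 0; i < 16; i++) {
        d[i] = 0xaa;
        n[i] = i;
        m[i] = 1;
    }
    helper_sve_add_zpzz_b(d, n, m, &g, simd_desc(16, 16, 0));
    g_assert(d[0] == 1);                 /* 0 + 1: predicate bit set */
    g_assert(d[1] == 0xaa);              /* untouched: predicate bit clear */
}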
    195 
    196 #define DO_AND(N, M)  (N & M)
    197 #define DO_EOR(N, M)  (N ^ M)
    198 #define DO_ORR(N, M)  (N | M)
    199 #define DO_BIC(N, M)  (N & ~M)
    200 #define DO_ADD(N, M)  (N + M)
    201 #define DO_SUB(N, M)  (N - M)
    202 #define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
    203 #define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
    204 #define DO_ABD(N, M)  ((N) >= (M) ? (N) - (M) : (M) - (N))
    205 #define DO_MUL(N, M)  (N * M)
    206 
    207 
    208 /*
    209  * We must avoid the C undefined behaviour cases: division by
    210  * zero and signed division of INT_MIN by -1. Both of these
    211  * have architecturally defined required results for Arm.
    212  * We special case all signed divisions by -1 to avoid having
    213  * to deduce the minimum integer for the type involved.
    214  */
    215 #define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
    216 #define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
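
/*
 * Illustrative sketch (not from the original source): spot checks of the
 * architecturally defined cases described above.  (For the remaining
 * special case, INT_MIN / -1, the Arm-defined result is INT_MIN, which the
 * -N branch produces by wrapping.)  The function name is invented.
 */
static G_GNUC_UNUSED void example_div_special_cases(void)
{
    int32_t sn = 7, sz = 0, sm1 = -1;
    uint32_t un = 7, uz = 0;

    g_assert(DO_SDIV(sn, sz) == 0);      /* signed divide by zero -> 0 */
    g_assert(DO_SDIV(sn, sm1) == -7);    /* divide by -1 never evaluates N / M */
    g_assert(DO_UDIV(un, uz) == 0);      /* unsigned divide by zero -> 0 */
}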
    217 
    218 DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
    219 DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
    220 DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
    221 DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)
    222 
    223 DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
    224 DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
    225 DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
    226 DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)
    227 
    228 DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
    229 DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
    230 DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
    231 DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)
    232 
    233 DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
    234 DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
    235 DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
    236 DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)
    237 
    238 DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
    239 DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
    240 DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
    241 DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)
    242 
    243 DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
    244 DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
    245 DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
    246 DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)
    247 
    248 DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
    249 DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
    250 DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
    251 DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)
    252 
    253 DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
    254 DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
    255 DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
    256 DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)
    257 
    258 DO_ZPZZ(sve_smin_zpzz_b, int8_t,  H1, DO_MIN)
    259 DO_ZPZZ(sve_smin_zpzz_h, int16_t,  H1_2, DO_MIN)
    260 DO_ZPZZ(sve_smin_zpzz_s, int32_t,  H1_4, DO_MIN)
    261 DO_ZPZZ_D(sve_smin_zpzz_d, int64_t,  DO_MIN)
    262 
    263 DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
    264 DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
    265 DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
    266 DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)
    267 
    268 DO_ZPZZ(sve_sabd_zpzz_b, int8_t,  H1, DO_ABD)
    269 DO_ZPZZ(sve_sabd_zpzz_h, int16_t,  H1_2, DO_ABD)
    270 DO_ZPZZ(sve_sabd_zpzz_s, int32_t,  H1_4, DO_ABD)
    271 DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t,  DO_ABD)
    272 
    273 DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
    274 DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
    275 DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
    276 DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
    277 
    278 /* Because the computation type is at least twice as large as required,
    279    these work for both signed and unsigned source types.  */
    280 static inline uint8_t do_mulh_b(int32_t n, int32_t m)
    281 {
    282     return (n * m) >> 8;
    283 }
    284 
    285 static inline uint16_t do_mulh_h(int32_t n, int32_t m)
    286 {
    287     return (n * m) >> 16;
    288 }
    289 
    290 static inline uint32_t do_mulh_s(int64_t n, int64_t m)
    291 {
    292     return (n * m) >> 32;
    293 }
    294 
    295 static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
    296 {
    297     uint64_t lo, hi;
    298     muls64(&lo, &hi, n, m);
    299     return hi;
    300 }
    301 
    302 static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
    303 {
    304     uint64_t lo, hi;
    305     mulu64(&lo, &hi, n, m);
    306     return hi;
    307 }
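
/*
 * Illustrative sketch (not from the original source): the same routine
 * serves both signednesses because the caller's element type decides how
 * the inputs were extended to the wider computation type.  The function
 * name is invented.
 */
static G_GNUC_UNUSED void example_mulh_signedness(void)
{
    g_assert(do_mulh_b(-1, -1) == 0);       /* signed:   (-1 * -1) >> 8   */
    g_assert(do_mulh_b(255, 255) == 254);   /* unsigned: (255 * 255) >> 8 */
}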
    308 
    309 DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
    310 DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
    311 DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
    312 DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)
    313 
    314 DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
    315 DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
    316 DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
    317 DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)
    318 
    319 DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
    320 DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
    321 DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
    322 DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)
    323 
    324 DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
    325 DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)
    326 
    327 DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
    328 DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)
    329 
    330 /* Note that all bits of the shift are significant
    331    and not modulo the element size.  */
    332 #define DO_ASR(N, M)  (N >> MIN(M, sizeof(N) * 8 - 1))
    333 #define DO_LSR(N, M)  (M < sizeof(N) * 8 ? N >> M : 0)
    334 #define DO_LSL(N, M)  (M < sizeof(N) * 8 ? N << M : 0)
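
/*
 * Illustrative sketch (not from the original source): spot checks of the
 * out-of-range behaviour noted above -- shifts are not taken modulo the
 * element width.  The function name is invented.
 */
static G_GNUC_UNUSED void example_shift_by_element_width(void)
{
    uint8_t u = 0x80;
    int8_t s = -64;

    g_assert(DO_LSR(u, 9) == 0);      /* not 0x40, as a modulo-8 shift would give */
    g_assert(DO_LSL(u, 8) == 0);      /* shifted out entirely */
    g_assert(DO_ASR(s, 100) == -1);   /* clamped to 7: the sign bit fills the lane */
}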
    335 
    336 DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
    337 DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1_2, DO_LSR)
    338 DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1_4, DO_LSL)
    339 
    340 DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1, DO_ASR)
    341 DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
    342 DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_4, DO_LSL)
    343 
    344 DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1, DO_ASR)
    345 DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_2, DO_LSR)
    346 DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
    347 
    348 DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
    349 DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
    350 DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
    351 
    352 static inline uint16_t do_sadalp_h(int16_t n, int16_t m)
    353 {
    354     int8_t n1 = n, n2 = n >> 8;
    355     return m + n1 + n2;
    356 }
    357 
    358 static inline uint32_t do_sadalp_s(int32_t n, int32_t m)
    359 {
    360     int16_t n1 = n, n2 = n >> 16;
    361     return m + n1 + n2;
    362 }
    363 
    364 static inline uint64_t do_sadalp_d(int64_t n, int64_t m)
    365 {
    366     int32_t n1 = n, n2 = n >> 32;
    367     return m + n1 + n2;
    368 }
    369 
    370 DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h)
    371 DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s)
    372 DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d)
    373 
    374 static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m)
    375 {
    376     uint8_t n1 = n, n2 = n >> 8;
    377     return m + n1 + n2;
    378 }
    379 
    380 static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m)
    381 {
    382     uint16_t n1 = n, n2 = n >> 16;
    383     return m + n1 + n2;
    384 }
    385 
    386 static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m)
    387 {
    388     uint32_t n1 = n, n2 = n >> 32;
    389     return m + n1 + n2;
    390 }
    391 
    392 DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h)
    393 DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s)
    394 DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d)
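
/*
 * Illustrative sketch (not from the original source): each wide lane of N
 * holds a pair of narrow elements, which are sign- or zero-extended,
 * summed, and accumulated into the matching lane of M.  The function name
 * is invented.
 */
static G_GNUC_UNUSED void example_adalp_pair(void)
{
    /* n = bytes {0x01, 0xff}: signed they sum to 0, unsigned to 256. */
    g_assert(do_sadalp_h(0xff01, 10) == 10);
    g_assert(do_uadalp_h(0xff01, 10) == 266);
}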
    395 
    396 #define do_srshl_b(n, m)  do_sqrshl_bhs(n, m, 8, true, NULL)
    397 #define do_srshl_h(n, m)  do_sqrshl_bhs(n, m, 16, true, NULL)
    398 #define do_srshl_s(n, m)  do_sqrshl_bhs(n, m, 32, true, NULL)
    399 #define do_srshl_d(n, m)  do_sqrshl_d(n, m, true, NULL)
    400 
    401 DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b)
    402 DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, H1_2, do_srshl_h)
    403 DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s)
    404 DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d)
    405 
    406 #define do_urshl_b(n, m)  do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL)
    407 #define do_urshl_h(n, m)  do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL)
    408 #define do_urshl_s(n, m)  do_uqrshl_bhs(n, m, 32, true, NULL)
    409 #define do_urshl_d(n, m)  do_uqrshl_d(n, m, true, NULL)
    410 
    411 DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b)
    412 DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h)
    413 DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s)
    414 DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d)
    415 
    416 /*
    417  * Unlike the NEON and AdvSIMD versions, there is no QC bit to set.
    418  * We pass in a pointer to a dummy saturation field to trigger
    419  * the saturating arithmetic but discard the information about
    420  * whether it has occurred.
    421  */
    422 #define do_sqshl_b(n, m) \
    423    ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); })
    424 #define do_sqshl_h(n, m) \
    425    ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); })
    426 #define do_sqshl_s(n, m) \
    427    ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); })
    428 #define do_sqshl_d(n, m) \
    429    ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); })
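
/*
 * Illustrative sketch (not from the original source), assuming
 * do_sqrshl_bhs from vec_internal.h saturates to the element range as its
 * name suggests: the value saturates, but the "did it saturate" flag lands
 * in the discarded dummy, so no QC-style side effect is visible.  The
 * function name is invented.
 */
static G_GNUC_UNUSED void example_sqshl_discards_sat(void)
{
    g_assert(do_sqshl_b(100, 2) == INT8_MAX);   /* 400 saturates to 127 */
    g_assert(do_sqshl_b(3, 2) == 12);           /* in range: plain shift */
}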
    430 
    431 DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1_2, do_sqshl_b)
    432 DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h)
    433 DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s)
    434 DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d)
    435 
    436 #define do_uqshl_b(n, m) \
    437    ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
    438 #define do_uqshl_h(n, m) \
    439    ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
    440 #define do_uqshl_s(n, m) \
    441    ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); })
    442 #define do_uqshl_d(n, m) \
    443    ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); })
    444 
    445 DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1_2, do_uqshl_b)
    446 DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h)
    447 DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s)
    448 DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d)
    449 
    450 #define do_sqrshl_b(n, m) \
    451    ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); })
    452 #define do_sqrshl_h(n, m) \
    453    ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); })
    454 #define do_sqrshl_s(n, m) \
    455    ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); })
    456 #define do_sqrshl_d(n, m) \
    457    ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); })
    458 
    459 DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1_2, do_sqrshl_b)
    460 DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h)
    461 DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s)
    462 DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d)
    463 
    464 #undef do_sqrshl_d
    465 
    466 #define do_uqrshl_b(n, m) \
    467    ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); })
    468 #define do_uqrshl_h(n, m) \
    469    ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); })
    470 #define do_uqrshl_s(n, m) \
    471    ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); })
    472 #define do_uqrshl_d(n, m) \
    473    ({ uint32_t discard; do_uqrshl_d(n, m, true, &discard); })
    474 
    475 DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1_2, do_uqrshl_b)
    476 DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h)
    477 DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s)
    478 DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d)
    479 
    480 #undef do_uqrshl_d
    481 
    482 #define DO_HADD_BHS(n, m)  (((int64_t)n + m) >> 1)
    483 #define DO_HADD_D(n, m)    ((n >> 1) + (m >> 1) + (n & m & 1))
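
/*
 * Illustrative sketch (not from the original source): the 64-bit form
 * cannot widen, so it uses (n >> 1) + (m >> 1) + (n & m & 1), which equals
 * (n + m) >> 1 without the intermediate overflow.  The function name is
 * invented.
 */
static G_GNUC_UNUSED void example_hadd_no_overflow(void)
{
    /* Unsigned: the true sum 2^65 - 2 would not fit in 64 bits. */
    g_assert(DO_HADD_D(UINT64_MAX, UINT64_MAX) == UINT64_MAX);
    /* Signed: floor((-3 + -4) / 2) = -4; arithmetic shifts keep the floor. */
    g_assert(DO_HADD_D((int64_t)-3, (int64_t)-4) == -4);
}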
    484 
    485 DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS)
    486 DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS)
    487 DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS)
    488 DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D)
    489 
    490 DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS)
    491 DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS)
    492 DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS)
    493 DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D)
    494 
    495 #define DO_RHADD_BHS(n, m)  (((int64_t)n + m + 1) >> 1)
    496 #define DO_RHADD_D(n, m)    ((n >> 1) + (m >> 1) + ((n | m) & 1))
    497 
    498 DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS)
    499 DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS)
    500 DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS)
    501 DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D)
    502 
    503 DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS)
    504 DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS)
    505 DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS)
    506 DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D)
    507 
    508 #define DO_HSUB_BHS(n, m)  (((int64_t)n - m) >> 1)
    509 #define DO_HSUB_D(n, m)    ((n >> 1) - (m >> 1) - (~n & m & 1))
    510 
    511 DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS)
    512 DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS)
    513 DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS)
    514 DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D)
    515 
    516 DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS)
    517 DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS)
    518 DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS)
    519 DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D)
    520 
    521 static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max)
    522 {
    523     return val >= max ? max : val <= min ? min : val;
    524 }
    525 
    526 #define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX)
    527 #define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX)
    528 #define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX)
    529 
    530 static inline int64_t do_sqadd_d(int64_t n, int64_t m)
    531 {
    532     int64_t r = n + m;
    533     if (((r ^ n) & ~(n ^ m)) < 0) {
    534         /* Signed overflow.  */
    535         return r < 0 ? INT64_MAX : INT64_MIN;
    536     }
    537     return r;
    538 }
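
/*
 * Illustrative sketch (not from the original source): the test
 * ((r ^ n) & ~(n ^ m)) < 0 reads as "the operands have the same sign and
 * the result's sign differs", i.e. signed overflow.  The wrapping addition
 * itself is well defined here because QEMU is built with -fwrapv.  The
 * function name is invented.
 */
static G_GNUC_UNUSED void example_sqadd_saturation(void)
{
    g_assert(do_sqadd_d(INT64_MAX, 1) == INT64_MAX);
    g_assert(do_sqadd_d(INT64_MIN, -1) == INT64_MIN);
    g_assert(do_sqadd_d(5, -3) == 2);
}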
    539 
    540 DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B)
    541 DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H)
    542 DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S)
    543 DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d)
    544 
    545 #define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX)
    546 #define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX)
    547 #define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX)
    548 
    549 static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m)
    550 {
    551     uint64_t r = n + m;
    552     return r < n ? UINT64_MAX : r;
    553 }
    554 
    555 DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B)
    556 DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H)
    557 DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S)
    558 DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d)
    559 
    560 #define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX)
    561 #define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX)
    562 #define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX)
    563 
    564 static inline int64_t do_sqsub_d(int64_t n, int64_t m)
    565 {
    566     int64_t r = n - m;
    567     if (((r ^ n) & (n ^ m)) < 0) {
    568         /* Signed overflow.  */
    569         return r < 0 ? INT64_MAX : INT64_MIN;
    570     }
    571     return r;
    572 }
    573 
    574 DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B)
    575 DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H)
    576 DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S)
    577 DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d)
    578 
    579 #define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)n - m, 0, UINT8_MAX)
    580 #define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX)
    581 #define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX)
    582 
    583 static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m)
    584 {
    585     return n > m ? n - m : 0;
    586 }
    587 
    588 DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B)
    589 DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H)
    590 DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S)
    591 DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d)
    592 
    593 #define DO_SUQADD_B(n, m) \
    594     do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX)
    595 #define DO_SUQADD_H(n, m) \
    596     do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX)
    597 #define DO_SUQADD_S(n, m) \
    598     do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX)
    599 
    600 static inline int64_t do_suqadd_d(int64_t n, uint64_t m)
    601 {
    602     uint64_t r = n + m;
    603 
    604     if (n < 0) {
    605         /* Note that m - abs(n) cannot underflow. */
    606         if (r > INT64_MAX) {
    607             /* Result is either very large positive or negative. */
    608             if (m > -n) {
    609                 /* m > abs(n), so r is a very large positive. */
    610                 return INT64_MAX;
    611             }
    612             /* Result is negative. */
    613         }
    614     } else {
    615         /* Both inputs are positive: check for overflow.  */
    616         if (r < m || r > INT64_MAX) {
    617             return INT64_MAX;
    618         }
    619     }
    620     return r;
    621 }
    622 
    623 DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B)
    624 DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H)
    625 DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S)
    626 DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d)
    627 
    628 #define DO_USQADD_B(n, m) \
    629     do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX)
    630 #define DO_USQADD_H(n, m) \
    631     do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX)
    632 #define DO_USQADD_S(n, m) \
    633     do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX)
    634 
    635 static inline uint64_t do_usqadd_d(uint64_t n, int64_t m)
    636 {
    637     uint64_t r = n + m;
    638 
    639     if (m < 0) {
    640         return n < -m ? 0 : r;
    641     }
    642     return r < n ? UINT64_MAX : r;
    643 }
    644 
    645 DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B)
    646 DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H)
    647 DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S)
    648 DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d)
    649 
    650 #undef DO_ZPZZ
    651 #undef DO_ZPZZ_D
    652 
    653 /*
    654  * Three operand expander, operating on element pairs.
     655  * If the slot I is even, the elements are from VN {I, I+1}.
     656  * If the slot I is odd, the elements are from VM {I-1, I}.
    657  * Load all of the input elements in each pair before overwriting output.
    658  */
    659 #define DO_ZPZZ_PAIR(NAME, TYPE, H, OP) \
    660 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
    661 {                                                               \
    662     intptr_t i, opr_sz = simd_oprsz(desc);                      \
    663     for (i = 0; i < opr_sz; ) {                                 \
    664         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
    665         do {                                                    \
    666             TYPE n0 = *(TYPE *)(vn + H(i));                     \
    667             TYPE m0 = *(TYPE *)(vm + H(i));                     \
    668             TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE)));      \
    669             TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE)));      \
    670             if (pg & 1) {                                       \
    671                 *(TYPE *)(vd + H(i)) = OP(n0, n1);              \
    672             }                                                   \
    673             i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
    674             if (pg & 1) {                                       \
    675                 *(TYPE *)(vd + H(i)) = OP(m0, m1);              \
    676             }                                                   \
    677             i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
    678         } while (i & 15);                                       \
    679     }                                                           \
    680 }
    681 
    682 /* Similarly, specialized for 64-bit operands.  */
    683 #define DO_ZPZZ_PAIR_D(NAME, TYPE, OP) \
    684 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
    685 {                                                               \
    686     intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
    687     TYPE *d = vd, *n = vn, *m = vm;                             \
    688     uint8_t *pg = vg;                                           \
    689     for (i = 0; i < opr_sz; i += 2) {                           \
    690         TYPE n0 = n[i], n1 = n[i + 1];                          \
    691         TYPE m0 = m[i], m1 = m[i + 1];                          \
    692         if (pg[H1(i)] & 1) {                                    \
    693             d[i] = OP(n0, n1);                                  \
    694         }                                                       \
    695         if (pg[H1(i + 1)] & 1) {                                \
    696             d[i + 1] = OP(m0, m1);                              \
    697         }                                                       \
    698     }                                                           \
    699 }
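
/*
 * Illustrative sketch (not from the original source): reference semantics
 * of the pairing rule above for byte elements, ignoring the predicate and
 * host-endian swizzling.  The function name is invented.
 */
static G_GNUC_UNUSED void example_addp_reference(uint8_t *d, const uint8_t *n,
                                                 const uint8_t *m, intptr_t oprsz)
{
    for (intptr_t i = 0; i < oprsz; i += 2) {
        uint8_t n0 = n[i], n1 = n[i + 1];   /* read both pairs first, since */
        uint8_t m0 = m[i], m1 = m[i + 1];   /* d may alias n or m           */
        d[i] = n0 + n1;                     /* even slot pairs N elements   */
        d[i + 1] = m0 + m1;                 /* odd slot pairs M elements    */
    }
}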
    700 
    701 DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD)
    702 DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD)
    703 DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD)
    704 DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD)
    705 
    706 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX)
    707 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX)
    708 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX)
    709 DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX)
    710 
    711 DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN)
    712 DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN)
    713 DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN)
    714 DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN)
    715 
    716 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX)
    717 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX)
    718 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX)
    719 DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX)
    720 
    721 DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN)
    722 DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN)
    723 DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN)
    724 DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN)
    725 
    726 #undef DO_ZPZZ_PAIR
    727 #undef DO_ZPZZ_PAIR_D
    728 
    729 #define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP)                              \
    730 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,               \
    731                   void *status, uint32_t desc)                          \
    732 {                                                                       \
    733     intptr_t i, opr_sz = simd_oprsz(desc);                              \
    734     for (i = 0; i < opr_sz; ) {                                         \
    735         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
    736         do {                                                            \
    737             TYPE n0 = *(TYPE *)(vn + H(i));                             \
    738             TYPE m0 = *(TYPE *)(vm + H(i));                             \
    739             TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE)));              \
    740             TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE)));              \
    741             if (pg & 1) {                                               \
    742                 *(TYPE *)(vd + H(i)) = OP(n0, n1, status);              \
    743             }                                                           \
    744             i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
    745             if (pg & 1) {                                               \
    746                 *(TYPE *)(vd + H(i)) = OP(m0, m1, status);              \
    747             }                                                           \
    748             i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
    749         } while (i & 15);                                               \
    750     }                                                                   \
    751 }
    752 
    753 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add)
    754 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add)
    755 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64, H1_8, float64_add)
    756 
    757 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, H1_2, float16_maxnum)
    758 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum)
    759 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, H1_8, float64_maxnum)
    760 
    761 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum)
    762 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum)
    763 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, H1_8, float64_minnum)
    764 
    765 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max)
    766 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max)
    767 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, H1_8, float64_max)
    768 
    769 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min)
    770 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min)
    771 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, H1_8, float64_min)
    772 
    773 #undef DO_ZPZZ_PAIR_FP
    774 
    775 /* Three-operand expander, controlled by a predicate, in which the
    776  * third operand is "wide".  That is, for D = N op M, the same 64-bit
    777  * value of M is used with all of the narrower values of N.
    778  */
    779 #define DO_ZPZW(NAME, TYPE, TYPEW, H, OP)                               \
    780 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
    781 {                                                                       \
    782     intptr_t i, opr_sz = simd_oprsz(desc);                              \
    783     for (i = 0; i < opr_sz; ) {                                         \
    784         uint8_t pg = *(uint8_t *)(vg + H1(i >> 3));                     \
    785         TYPEW mm = *(TYPEW *)(vm + i);                                  \
    786         do {                                                            \
    787             if (pg & 1) {                                               \
    788                 TYPE nn = *(TYPE *)(vn + H(i));                         \
    789                 *(TYPE *)(vd + H(i)) = OP(nn, mm);                      \
    790             }                                                           \
    791             i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
    792         } while (i & 7);                                                \
    793     }                                                                   \
    794 }
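
/*
 * Illustrative sketch (not from the original source): reference semantics
 * of the "wide" form for 8-bit lanes, ignoring the predicate and host-endian
 * swizzling -- every byte within a 64-bit column is shifted by that column's
 * single 64-bit count.  The function name is invented.
 */
static G_GNUC_UNUSED void example_lsl_zpzw_reference(uint8_t *d, const uint8_t *n,
                                                     const uint64_t *m, intptr_t oprsz)
{
    for (intptr_t i = 0; i < oprsz; i++) {
        uint64_t shift = m[i / 8];
        d[i] = shift < 8 ? n[i] << shift : 0;
    }
}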
    795 
    796 DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
    797 DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
    798 DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)
    799 
    800 DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
    801 DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
    802 DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
    803 
    804 DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
    805 DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
    806 DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
    807 
    808 #undef DO_ZPZW
    809 
    810 /* Fully general two-operand expander, controlled by a predicate.
    811  */
    812 #define DO_ZPZ(NAME, TYPE, H, OP)                               \
    813 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
    814 {                                                               \
    815     intptr_t i, opr_sz = simd_oprsz(desc);                      \
    816     for (i = 0; i < opr_sz; ) {                                 \
    817         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
    818         do {                                                    \
    819             if (pg & 1) {                                       \
    820                 TYPE nn = *(TYPE *)(vn + H(i));                 \
    821                 *(TYPE *)(vd + H(i)) = OP(nn);                  \
    822             }                                                   \
    823             i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
    824         } while (i & 15);                                       \
    825     }                                                           \
    826 }
    827 
    828 /* Similarly, specialized for 64-bit operands.  */
    829 #define DO_ZPZ_D(NAME, TYPE, OP)                                \
    830 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
    831 {                                                               \
    832     intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
    833     TYPE *d = vd, *n = vn;                                      \
    834     uint8_t *pg = vg;                                           \
    835     for (i = 0; i < opr_sz; i += 1) {                           \
    836         if (pg[H1(i)] & 1) {                                    \
    837             TYPE nn = n[i];                                     \
    838             d[i] = OP(nn);                                      \
    839         }                                                       \
    840     }                                                           \
    841 }
    842 
    843 #define DO_CLS_B(N)   (clrsb32(N) - 24)
    844 #define DO_CLS_H(N)   (clrsb32(N) - 16)
    845 
    846 DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
    847 DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
    848 DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
    849 DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)
    850 
    851 #define DO_CLZ_B(N)   (clz32(N) - 24)
    852 #define DO_CLZ_H(N)   (clz32(N) - 16)
    853 
    854 DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
    855 DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
    856 DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
    857 DO_ZPZ_D(sve_clz_d, uint64_t, clz64)
    858 
    859 DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
    860 DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
    861 DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
    862 DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
    863 
    864 #define DO_CNOT(N)    (N == 0)
    865 
    866 DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
    867 DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
    868 DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
    869 DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)
    870 
    871 #define DO_FABS(N)    (N & ((__typeof(N))-1 >> 1))
    872 
    873 DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
    874 DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
    875 DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)
    876 
    877 #define DO_FNEG(N)    (N ^ ~((__typeof(N))-1 >> 1))
    878 
    879 DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
    880 DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
    881 DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
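
/*
 * Illustrative sketch (not from the original source): FABS and FNEG here
 * operate on the raw integer encodings and only touch the sign bit, so no
 * FPU status is involved.  A spot check on single-precision bit patterns;
 * the function name is invented.
 */
static G_GNUC_UNUSED void example_fabs_fneg_bits(void)
{
    uint32_t minus_one = 0xbf800000u;   /* -1.0f */
    uint32_t plus_one  = 0x3f800000u;   /*  1.0f */

    g_assert(DO_FABS(minus_one) == plus_one);
    g_assert(DO_FNEG(plus_one) == minus_one);
}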
    882 
    883 #define DO_NOT(N)    (~N)
    884 
    885 DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
    886 DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
    887 DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
    888 DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)
    889 
    890 #define DO_SXTB(N)    ((int8_t)N)
    891 #define DO_SXTH(N)    ((int16_t)N)
    892 #define DO_SXTS(N)    ((int32_t)N)
    893 #define DO_UXTB(N)    ((uint8_t)N)
    894 #define DO_UXTH(N)    ((uint16_t)N)
    895 #define DO_UXTS(N)    ((uint32_t)N)
    896 
    897 DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
    898 DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
    899 DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
    900 DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
    901 DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
    902 DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)
    903 
    904 DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
    905 DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
    906 DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
    907 DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
    908 DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
    909 DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
    910 
    911 #define DO_ABS(N)    (N < 0 ? -N : N)
    912 
    913 DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
    914 DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
    915 DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
    916 DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)
    917 
    918 #define DO_NEG(N)    (-N)
    919 
    920 DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
    921 DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
    922 DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
    923 DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
    924 
    925 DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
    926 DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
    927 DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)
    928 
    929 DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
    930 DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)
    931 
    932 DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
    933 
    934 void HELPER(sme_revd_q)(void *vd, void *vn, void *vg, uint32_t desc)
    935 {
    936     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    937     uint64_t *d = vd, *n = vn;
    938     uint8_t *pg = vg;
    939 
    940     for (i = 0; i < opr_sz; i += 2) {
    941         if (pg[H1(i)] & 1) {
    942             uint64_t n0 = n[i + 0];
    943             uint64_t n1 = n[i + 1];
    944             d[i + 0] = n1;
    945             d[i + 1] = n0;
    946         }
    947     }
    948 }
    949 
    950 DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
    951 DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
    952 DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
    953 DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
    954 
    955 #define DO_SQABS(X) \
    956     ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
    957        x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; })
    958 
    959 DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS)
    960 DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS)
    961 DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS)
    962 DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS)
    963 
    964 #define DO_SQNEG(X) \
    965     ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
    966        x_ == min_ ? -min_ - 1 : -x_; })
    967 
    968 DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG)
    969 DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG)
    970 DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG)
    971 DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG)
    972 
    973 DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32)
    974 DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32)
    975 
    976 /* Three-operand expander, unpredicated, in which the third operand is "wide".
    977  */
    978 #define DO_ZZW(NAME, TYPE, TYPEW, H, OP)                       \
    979 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
    980 {                                                              \
    981     intptr_t i, opr_sz = simd_oprsz(desc);                     \
    982     for (i = 0; i < opr_sz; ) {                                \
    983         TYPEW mm = *(TYPEW *)(vm + i);                         \
    984         do {                                                   \
    985             TYPE nn = *(TYPE *)(vn + H(i));                    \
    986             *(TYPE *)(vd + H(i)) = OP(nn, mm);                 \
    987             i += sizeof(TYPE);                                 \
    988         } while (i & 7);                                       \
    989     }                                                          \
    990 }
    991 
    992 DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
    993 DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
    994 DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)
    995 
    996 DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
    997 DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
    998 DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
    999 
   1000 DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
   1001 DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
   1002 DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
   1003 
   1004 #undef DO_ZZW
   1005 
   1006 #undef DO_CLS_B
   1007 #undef DO_CLS_H
   1008 #undef DO_CLZ_B
   1009 #undef DO_CLZ_H
   1010 #undef DO_CNOT
   1011 #undef DO_FABS
   1012 #undef DO_FNEG
   1013 #undef DO_ABS
   1014 #undef DO_NEG
   1015 #undef DO_ZPZ
   1016 #undef DO_ZPZ_D
   1017 
   1018 /*
   1019  * Three-operand expander, unpredicated, in which the two inputs are
   1020  * selected from the top or bottom half of the wide column.
   1021  */
   1022 #define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP) \
   1023 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)          \
   1024 {                                                                       \
   1025     intptr_t i, opr_sz = simd_oprsz(desc);                              \
   1026     int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);     \
   1027     int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
   1028     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                       \
   1029         TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));                       \
   1030         TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));                       \
   1031         *(TYPEW *)(vd + HW(i)) = OP(nn, mm);                            \
   1032     }                                                                   \
   1033 }
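
/*
 * Illustrative sketch (not from the original source): reference semantics
 * of the bottom/bottom case (sel1 = sel2 = 0, e.g. SADDLB) for bytes
 * widened to halfwords, on a little-endian host where the H macros are the
 * identity.  The function name is invented.
 */
static G_GNUC_UNUSED void example_saddlb_reference(int16_t *d, const int8_t *n,
                                                   const int8_t *m, intptr_t oprsz)
{
    for (intptr_t i = 0; i < oprsz / 2; i++) {
        /* Each wide result uses the even (bottom) narrow element of each input. */
        d[i] = (int16_t)(n[2 * i] + m[2 * i]);
    }
}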
   1034 
   1035 DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD)
   1036 DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
   1037 DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)
   1038 
   1039 DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB)
   1040 DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
   1041 DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)
   1042 
   1043 DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD)
   1044 DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
   1045 DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)
   1046 
   1047 DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
   1048 DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
   1049 DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)
   1050 
   1051 DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
   1052 DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
   1053 DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)
   1054 
   1055 DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
   1056 DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
   1057 DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)
   1058 
   1059 DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL)
   1060 DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
   1061 DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
   1062 
   1063 DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
   1064 DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
   1065 DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
   1066 
   1067 /* Note that the multiply cannot overflow, but the doubling can. */
   1068 static inline int16_t do_sqdmull_h(int16_t n, int16_t m)
   1069 {
   1070     int16_t val = n * m;
   1071     return DO_SQADD_H(val, val);
   1072 }
   1073 
   1074 static inline int32_t do_sqdmull_s(int32_t n, int32_t m)
   1075 {
   1076     int32_t val = n * m;
   1077     return DO_SQADD_S(val, val);
   1078 }
   1079 
   1080 static inline int64_t do_sqdmull_d(int64_t n, int64_t m)
   1081 {
   1082     int64_t val = n * m;
   1083     return do_sqadd_d(val, val);
   1084 }
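
/*
 * Illustrative sketch (not from the original source): with both inputs at
 * the most negative narrow value the product still fits, but doubling it
 * does not, which is exactly the case the saturating add catches.  The
 * function name is invented.
 */
static G_GNUC_UNUSED void example_sqdmull_doubling(void)
{
    g_assert(do_sqdmull_h(-128, -128) == INT16_MAX);   /* 2 * 16384 saturates */
    g_assert(do_sqdmull_h(2, 3) == 12);                /* 2 * 6, no saturation */
}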
   1085 
   1086 DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h)
   1087 DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
   1088 DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)
   1089 
   1090 #undef DO_ZZZ_TB
   1091 
   1092 #define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP) \
   1093 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
   1094 {                                                              \
   1095     intptr_t i, opr_sz = simd_oprsz(desc);                     \
   1096     int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
   1097     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {              \
   1098         TYPEW nn = *(TYPEW *)(vn + HW(i));                     \
   1099         TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));              \
   1100         *(TYPEW *)(vd + HW(i)) = OP(nn, mm);                   \
   1101     }                                                          \
   1102 }
   1103 
   1104 DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD)
   1105 DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
   1106 DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)
   1107 
   1108 DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB)
   1109 DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
   1110 DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)
   1111 
   1112 DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
   1113 DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
   1114 DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)
   1115 
   1116 DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
   1117 DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
   1118 DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)
   1119 
   1120 #undef DO_ZZZ_WTB
   1121 
   1122 #define DO_ZZZ_NTB(NAME, TYPE, H, OP)                                   \
   1123 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)          \
   1124 {                                                                       \
   1125     intptr_t i, opr_sz = simd_oprsz(desc);                              \
   1126     intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \
   1127     intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \
   1128     for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {                    \
   1129         TYPE nn = *(TYPE *)(vn + H(i + sel1));                          \
   1130         TYPE mm = *(TYPE *)(vm + H(i + sel2));                          \
   1131         *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm);                       \
   1132     }                                                                   \
   1133 }
   1134 
   1135 DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR)
   1136 DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR)
   1137 DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR)
   1138 DO_ZZZ_NTB(sve2_eoril_d, uint64_t, H1_8, DO_EOR)
   1139 
   1140 #undef DO_ZZZ_NTB
   1141 
   1142 #define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP) \
   1143 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
   1144 {                                                               \
   1145     intptr_t i, opr_sz = simd_oprsz(desc);                      \
   1146     intptr_t sel1 = simd_data(desc) * sizeof(TYPEN);            \
   1147     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {               \
   1148         TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));               \
   1149         TYPEW mm = *(TYPEN *)(vm + HN(i + sel1));               \
   1150         TYPEW aa = *(TYPEW *)(va + HW(i));                      \
   1151         *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa;               \
   1152     }                                                           \
   1153 }
   1154 
   1155 DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD)
   1156 DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
   1157 DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)
   1158 
   1159 DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
   1160 DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
   1161 DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)
   1162 
   1163 DO_ZZZW_ACC(sve2_smlal_zzzw_h, int16_t, int8_t, H1_2, H1, DO_MUL)
   1164 DO_ZZZW_ACC(sve2_smlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
   1165 DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
   1166 
   1167 DO_ZZZW_ACC(sve2_umlal_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
   1168 DO_ZZZW_ACC(sve2_umlal_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
   1169 DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
   1170 
   1171 #define DO_NMUL(N, M)  -(N * M)
   1172 
   1173 DO_ZZZW_ACC(sve2_smlsl_zzzw_h, int16_t, int8_t, H1_2, H1, DO_NMUL)
   1174 DO_ZZZW_ACC(sve2_smlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_NMUL)
   1175 DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_NMUL)
   1176 
   1177 DO_ZZZW_ACC(sve2_umlsl_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_NMUL)
   1178 DO_ZZZW_ACC(sve2_umlsl_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_NMUL)
   1179 DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_NMUL)
   1180 
   1181 #undef DO_ZZZW_ACC
   1182 
   1183 #define DO_XTNB(NAME, TYPE, OP) \
   1184 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)         \
   1185 {                                                            \
   1186     intptr_t i, opr_sz = simd_oprsz(desc);                   \
   1187     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {             \
   1188         TYPE nn = *(TYPE *)(vn + i);                         \
   1189         nn = OP(nn) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4);  \
   1190         *(TYPE *)(vd + i) = nn;                              \
   1191     }                                                        \
   1192 }
   1193 
   1194 #define DO_XTNT(NAME, TYPE, TYPEN, H, OP)                               \
   1195 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)                    \
   1196 {                                                                       \
   1197     intptr_t i, opr_sz = simd_oprsz(desc), odd = H(sizeof(TYPEN));      \
   1198     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                        \
   1199         TYPE nn = *(TYPE *)(vn + i);                                    \
   1200         *(TYPEN *)(vd + i + odd) = OP(nn);                              \
   1201     }                                                                   \
   1202 }
   1203 
   1204 #define DO_SQXTN_H(n)  do_sat_bhs(n, INT8_MIN, INT8_MAX)
   1205 #define DO_SQXTN_S(n)  do_sat_bhs(n, INT16_MIN, INT16_MAX)
   1206 #define DO_SQXTN_D(n)  do_sat_bhs(n, INT32_MIN, INT32_MAX)
   1207 
   1208 DO_XTNB(sve2_sqxtnb_h, int16_t, DO_SQXTN_H)
   1209 DO_XTNB(sve2_sqxtnb_s, int32_t, DO_SQXTN_S)
   1210 DO_XTNB(sve2_sqxtnb_d, int64_t, DO_SQXTN_D)
   1211 
   1212 DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, DO_SQXTN_H)
   1213 DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, DO_SQXTN_S)
   1214 DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, DO_SQXTN_D)
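         /*
          * For illustration, sve2_sqxtnb_h saturates each int16_t element to
          * the int8_t range and stores it in the even (bottom) byte, zeroing
          * the odd byte: 0x0123 (291) becomes 0x007f.  sve2_sqxtnt_h instead
          * writes only the odd (top) byte of Zd, leaving the even byte
          * untouched.
          */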
   1215 
   1216 #define DO_UQXTN_H(n)  do_sat_bhs(n, 0, UINT8_MAX)
   1217 #define DO_UQXTN_S(n)  do_sat_bhs(n, 0, UINT16_MAX)
   1218 #define DO_UQXTN_D(n)  do_sat_bhs(n, 0, UINT32_MAX)
   1219 
   1220 DO_XTNB(sve2_uqxtnb_h, uint16_t, DO_UQXTN_H)
   1221 DO_XTNB(sve2_uqxtnb_s, uint32_t, DO_UQXTN_S)
   1222 DO_XTNB(sve2_uqxtnb_d, uint64_t, DO_UQXTN_D)
   1223 
   1224 DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, DO_UQXTN_H)
   1225 DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, DO_UQXTN_S)
   1226 DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, DO_UQXTN_D)
   1227 
   1228 DO_XTNB(sve2_sqxtunb_h, int16_t, DO_UQXTN_H)
   1229 DO_XTNB(sve2_sqxtunb_s, int32_t, DO_UQXTN_S)
   1230 DO_XTNB(sve2_sqxtunb_d, int64_t, DO_UQXTN_D)
   1231 
   1232 DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, DO_UQXTN_H)
   1233 DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, DO_UQXTN_S)
   1234 DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, DO_UQXTN_D)
   1235 
   1236 #undef DO_XTNB
   1237 #undef DO_XTNT
   1238 
   1239 void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
   1240 {
   1241     intptr_t i, opr_sz = simd_oprsz(desc);
   1242     int sel = H4(extract32(desc, SIMD_DATA_SHIFT, 1));
   1243     uint32_t inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1);
   1244     uint32_t *a = va, *n = vn;
   1245     uint64_t *d = vd, *m = vm;
   1246 
   1247     for (i = 0; i < opr_sz / 8; ++i) {
   1248         uint32_t e1 = a[2 * i + H4(0)];
   1249         uint32_t e2 = n[2 * i + sel] ^ inv;
   1250         uint64_t c = extract64(m[i], 32, 1);
   1251         /* Compute and store the entire 33-bit result at once. */
   1252         d[i] = c + e1 + e2;
   1253     }
   1254 }
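         /*
          * Layout sketch: each 64-bit unit of the result holds the 32-bit sum
          * in bits [31:0] and the carry-out in bit 32, matching the carry-in
          * read from bit 32 of Zm above; a following ADCLB/ADCLT (or the SBCL
          * forms, selected via inv) that names this register as Zm can thus
          * pick up the carry.
          */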
   1255 
   1256 void HELPER(sve2_adcl_d)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
   1257 {
   1258     intptr_t i, opr_sz = simd_oprsz(desc);
   1259     int sel = extract32(desc, SIMD_DATA_SHIFT, 1);
   1260     uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1);
   1261     uint64_t *d = vd, *a = va, *n = vn, *m = vm;
   1262 
   1263     for (i = 0; i < opr_sz / 8; i += 2) {
   1264         Int128 e1 = int128_make64(a[i]);
   1265         Int128 e2 = int128_make64(n[i + sel] ^ inv);
   1266         Int128 c = int128_make64(m[i + 1] & 1);
   1267         Int128 r = int128_add(int128_add(e1, e2), c);
   1268         d[i + 0] = int128_getlo(r);
   1269         d[i + 1] = int128_gethi(r);
   1270     }
   1271 }
   1272 
   1273 #define DO_SQDMLAL(NAME, TYPEW, TYPEN, HW, HN, DMUL_OP, SUM_OP) \
   1274 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
   1275 {                                                                       \
   1276     intptr_t i, opr_sz = simd_oprsz(desc);                              \
   1277     int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);     \
   1278     int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
   1279     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                       \
   1280         TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));                       \
   1281         TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));                       \
   1282         TYPEW aa = *(TYPEW *)(va + HW(i));                              \
   1283         *(TYPEW *)(vd + HW(i)) = SUM_OP(aa, DMUL_OP(nn, mm));           \
   1284     }                                                                   \
   1285 }
   1286 
   1287 DO_SQDMLAL(sve2_sqdmlal_zzzw_h, int16_t, int8_t, H1_2, H1,
   1288            do_sqdmull_h, DO_SQADD_H)
   1289 DO_SQDMLAL(sve2_sqdmlal_zzzw_s, int32_t, int16_t, H1_4, H1_2,
   1290            do_sqdmull_s, DO_SQADD_S)
   1291 DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, H1_8, H1_4,
   1292            do_sqdmull_d, do_sqadd_d)
   1293 
   1294 DO_SQDMLAL(sve2_sqdmlsl_zzzw_h, int16_t, int8_t, H1_2, H1,
   1295            do_sqdmull_h, DO_SQSUB_H)
   1296 DO_SQDMLAL(sve2_sqdmlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2,
   1297            do_sqdmull_s, DO_SQSUB_S)
   1298 DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4,
   1299            do_sqdmull_d, do_sqsub_d)
   1300 
   1301 #undef DO_SQDMLAL
   1302 
   1303 #define DO_CMLA_FUNC(NAME, TYPE, H, OP) \
   1304 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
   1305 {                                                               \
   1306     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE);       \
   1307     int rot = simd_data(desc);                                  \
   1308     int sel_a = rot & 1, sel_b = sel_a ^ 1;                     \
   1309     bool sub_r = rot == 1 || rot == 2;                          \
   1310     bool sub_i = rot >= 2;                                      \
   1311     TYPE *d = vd, *n = vn, *m = vm, *a = va;                    \
   1312     for (i = 0; i < opr_sz; i += 2) {                           \
   1313         TYPE elt1_a = n[H(i + sel_a)];                          \
   1314         TYPE elt2_a = m[H(i + sel_a)];                          \
   1315         TYPE elt2_b = m[H(i + sel_b)];                          \
   1316         d[H(i)] = OP(elt1_a, elt2_a, a[H(i)], sub_r);           \
   1317         d[H(i + 1)] = OP(elt1_a, elt2_b, a[H(i + 1)], sub_i);   \
   1318     }                                                           \
   1319 }
   1320 
   1321 #define DO_CMLA(N, M, A, S) (A + (N * M) * (S ? -1 : 1))
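         /*
          * For illustration, with OP = DO_CMLA and a complex pair
          * (n_r, n_i), (m_r, m_i), (a_r, a_i):
          *   rot=0: d_r = a_r + n_r*m_r;  d_i = a_i + n_r*m_i
          *   rot=1: d_r = a_r - n_i*m_i;  d_i = a_i + n_i*m_r
          *   rot=2: d_r = a_r - n_r*m_r;  d_i = a_i - n_r*m_i
          *   rot=3: d_r = a_r + n_i*m_i;  d_i = a_i - n_i*m_r
          * so rot=0 followed by rot=1 accumulates the full complex product
          * (n_r*m_r - n_i*m_i) + i*(n_r*m_i + n_i*m_r).
          */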
   1322 
   1323 DO_CMLA_FUNC(sve2_cmla_zzzz_b, uint8_t, H1, DO_CMLA)
   1324 DO_CMLA_FUNC(sve2_cmla_zzzz_h, uint16_t, H2, DO_CMLA)
   1325 DO_CMLA_FUNC(sve2_cmla_zzzz_s, uint32_t, H4, DO_CMLA)
   1326 DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, H8, DO_CMLA)
   1327 
   1328 #define DO_SQRDMLAH_B(N, M, A, S) \
   1329     do_sqrdmlah_b(N, M, A, S, true)
   1330 #define DO_SQRDMLAH_H(N, M, A, S) \
   1331     ({ uint32_t discard; do_sqrdmlah_h(N, M, A, S, true, &discard); })
   1332 #define DO_SQRDMLAH_S(N, M, A, S) \
   1333     ({ uint32_t discard; do_sqrdmlah_s(N, M, A, S, true, &discard); })
   1334 #define DO_SQRDMLAH_D(N, M, A, S) \
   1335     do_sqrdmlah_d(N, M, A, S, true)
   1336 
   1337 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b, int8_t, H1, DO_SQRDMLAH_B)
   1338 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h, int16_t, H2, DO_SQRDMLAH_H)
   1339 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s, int32_t, H4, DO_SQRDMLAH_S)
   1340 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t, H8, DO_SQRDMLAH_D)
   1341 
   1342 #define DO_CMLA_IDX_FUNC(NAME, TYPE, H, OP) \
   1343 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)    \
   1344 {                                                                           \
   1345     intptr_t i, j, oprsz = simd_oprsz(desc);                                \
   1346     int rot = extract32(desc, SIMD_DATA_SHIFT, 2);                          \
   1347     int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2) * 2;                  \
   1348     int sel_a = rot & 1, sel_b = sel_a ^ 1;                                 \
   1349     bool sub_r = rot == 1 || rot == 2;                                      \
   1350     bool sub_i = rot >= 2;                                                  \
   1351     TYPE *d = vd, *n = vn, *m = vm, *a = va;                                \
   1352     for (i = 0; i < oprsz / sizeof(TYPE); i += 16 / sizeof(TYPE)) {         \
   1353         TYPE elt2_a = m[H(i + idx + sel_a)];                                \
   1354         TYPE elt2_b = m[H(i + idx + sel_b)];                                \
   1355         for (j = 0; j < 16 / sizeof(TYPE); j += 2) {                        \
   1356             TYPE elt1_a = n[H(i + j + sel_a)];                              \
   1357             d[H2(i + j)] = OP(elt1_a, elt2_a, a[H(i + j)], sub_r);          \
   1358             d[H2(i + j + 1)] = OP(elt1_a, elt2_b, a[H(i + j + 1)], sub_i);  \
   1359         }                                                                   \
   1360     }                                                                       \
   1361 }
   1362 
   1363 DO_CMLA_IDX_FUNC(sve2_cmla_idx_h, int16_t, H2, DO_CMLA)
   1364 DO_CMLA_IDX_FUNC(sve2_cmla_idx_s, int32_t, H4, DO_CMLA)
   1365 
   1366 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
   1367 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
   1368 
   1369 #undef DO_CMLA
   1370 #undef DO_CMLA_FUNC
   1371 #undef DO_CMLA_IDX_FUNC
   1372 #undef DO_SQRDMLAH_B
   1373 #undef DO_SQRDMLAH_H
   1374 #undef DO_SQRDMLAH_S
   1375 #undef DO_SQRDMLAH_D
   1376 
    1377 /* Note N and M are 4 elements bundled into one unit: two complex pairs. */
   1378 static int32_t do_cdot_s(uint32_t n, uint32_t m, int32_t a,
   1379                          int sel_a, int sel_b, int sub_i)
   1380 {
   1381     for (int i = 0; i <= 1; i++) {
   1382         int32_t elt1_r = (int8_t)(n >> (16 * i));
   1383         int32_t elt1_i = (int8_t)(n >> (16 * i + 8));
   1384         int32_t elt2_a = (int8_t)(m >> (16 * i + 8 * sel_a));
   1385         int32_t elt2_b = (int8_t)(m >> (16 * i + 8 * sel_b));
   1386 
   1387         a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
   1388     }
   1389     return a;
   1390 }
   1391 
   1392 static int64_t do_cdot_d(uint64_t n, uint64_t m, int64_t a,
   1393                          int sel_a, int sel_b, int sub_i)
   1394 {
   1395     for (int i = 0; i <= 1; i++) {
   1396         int64_t elt1_r = (int16_t)(n >> (32 * i + 0));
   1397         int64_t elt1_i = (int16_t)(n >> (32 * i + 16));
   1398         int64_t elt2_a = (int16_t)(m >> (32 * i + 16 * sel_a));
   1399         int64_t elt2_b = (int16_t)(m >> (32 * i + 16 * sel_b));
   1400 
   1401         a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
   1402     }
   1403     return a;
   1404 }
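         /*
          * The callers below set sel_a = rot & 1, sel_b = sel_a ^ 1 and
          * sub_i = -1 for rot 0 and 3 (+1 otherwise), so with n = (r, i) and
          * m = (r', i') per pair the accumulation is:
          *   rot=0: a += r*r' - i*i'      rot=1: a += r*i' + i*r'
          *   rot=2: a += r*r' + i*i'      rot=3: a += r*i' - i*r'
          */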
   1405 
   1406 void HELPER(sve2_cdot_zzzz_s)(void *vd, void *vn, void *vm,
   1407                               void *va, uint32_t desc)
   1408 {
   1409     int opr_sz = simd_oprsz(desc);
   1410     int rot = simd_data(desc);
   1411     int sel_a = rot & 1;
   1412     int sel_b = sel_a ^ 1;
   1413     int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
   1414     uint32_t *d = vd, *n = vn, *m = vm, *a = va;
   1415 
   1416     for (int e = 0; e < opr_sz / 4; e++) {
   1417         d[e] = do_cdot_s(n[e], m[e], a[e], sel_a, sel_b, sub_i);
   1418     }
   1419 }
   1420 
   1421 void HELPER(sve2_cdot_zzzz_d)(void *vd, void *vn, void *vm,
   1422                               void *va, uint32_t desc)
   1423 {
   1424     int opr_sz = simd_oprsz(desc);
   1425     int rot = simd_data(desc);
   1426     int sel_a = rot & 1;
   1427     int sel_b = sel_a ^ 1;
   1428     int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
   1429     uint64_t *d = vd, *n = vn, *m = vm, *a = va;
   1430 
   1431     for (int e = 0; e < opr_sz / 8; e++) {
   1432         d[e] = do_cdot_d(n[e], m[e], a[e], sel_a, sel_b, sub_i);
   1433     }
   1434 }
   1435 
   1436 void HELPER(sve2_cdot_idx_s)(void *vd, void *vn, void *vm,
   1437                              void *va, uint32_t desc)
   1438 {
   1439     int opr_sz = simd_oprsz(desc);
   1440     int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
   1441     int idx = H4(extract32(desc, SIMD_DATA_SHIFT + 2, 2));
   1442     int sel_a = rot & 1;
   1443     int sel_b = sel_a ^ 1;
   1444     int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
   1445     uint32_t *d = vd, *n = vn, *m = vm, *a = va;
   1446 
   1447     for (int seg = 0; seg < opr_sz / 4; seg += 4) {
   1448         uint32_t seg_m = m[seg + idx];
   1449         for (int e = 0; e < 4; e++) {
   1450             d[seg + e] = do_cdot_s(n[seg + e], seg_m, a[seg + e],
   1451                                    sel_a, sel_b, sub_i);
   1452         }
   1453     }
   1454 }
   1455 
   1456 void HELPER(sve2_cdot_idx_d)(void *vd, void *vn, void *vm,
   1457                              void *va, uint32_t desc)
   1458 {
   1459     int seg, opr_sz = simd_oprsz(desc);
   1460     int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
   1461     int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
   1462     int sel_a = rot & 1;
   1463     int sel_b = sel_a ^ 1;
   1464     int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
   1465     uint64_t *d = vd, *n = vn, *m = vm, *a = va;
   1466 
   1467     for (seg = 0; seg < opr_sz / 8; seg += 2) {
   1468         uint64_t seg_m = m[seg + idx];
   1469         for (int e = 0; e < 2; e++) {
   1470             d[seg + e] = do_cdot_d(n[seg + e], seg_m, a[seg + e],
   1471                                    sel_a, sel_b, sub_i);
   1472         }
   1473     }
   1474 }
   1475 
   1476 #define DO_ZZXZ(NAME, TYPE, H, OP) \
   1477 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
   1478 {                                                                       \
   1479     intptr_t oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE);     \
   1480     intptr_t i, j, idx = simd_data(desc);                               \
   1481     TYPE *d = vd, *a = va, *n = vn, *m = (TYPE *)vm + H(idx);           \
   1482     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {               \
   1483         TYPE mm = m[i];                                                 \
   1484         for (j = 0; j < segment; j++) {                                 \
   1485             d[i + j] = OP(n[i + j], mm, a[i + j]);                      \
   1486         }                                                               \
   1487     }                                                                   \
   1488 }
   1489 
   1490 #define DO_SQRDMLAH_H(N, M, A) \
   1491     ({ uint32_t discard; do_sqrdmlah_h(N, M, A, false, true, &discard); })
   1492 #define DO_SQRDMLAH_S(N, M, A) \
   1493     ({ uint32_t discard; do_sqrdmlah_s(N, M, A, false, true, &discard); })
   1494 #define DO_SQRDMLAH_D(N, M, A) do_sqrdmlah_d(N, M, A, false, true)
   1495 
   1496 DO_ZZXZ(sve2_sqrdmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
   1497 DO_ZZXZ(sve2_sqrdmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
   1498 DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t, H8, DO_SQRDMLAH_D)
   1499 
   1500 #define DO_SQRDMLSH_H(N, M, A) \
   1501     ({ uint32_t discard; do_sqrdmlah_h(N, M, A, true, true, &discard); })
   1502 #define DO_SQRDMLSH_S(N, M, A) \
   1503     ({ uint32_t discard; do_sqrdmlah_s(N, M, A, true, true, &discard); })
   1504 #define DO_SQRDMLSH_D(N, M, A) do_sqrdmlah_d(N, M, A, true, true)
   1505 
   1506 DO_ZZXZ(sve2_sqrdmlsh_idx_h, int16_t, H2, DO_SQRDMLSH_H)
   1507 DO_ZZXZ(sve2_sqrdmlsh_idx_s, int32_t, H4, DO_SQRDMLSH_S)
   1508 DO_ZZXZ(sve2_sqrdmlsh_idx_d, int64_t, H8, DO_SQRDMLSH_D)
   1509 
   1510 #undef DO_ZZXZ
   1511 
   1512 #define DO_ZZXW(NAME, TYPEW, TYPEN, HW, HN, OP) \
   1513 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
   1514 {                                                                         \
   1515     intptr_t i, j, oprsz = simd_oprsz(desc);                              \
   1516     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);   \
   1517     intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
   1518     for (i = 0; i < oprsz; i += 16) {                                     \
   1519         TYPEW mm = *(TYPEN *)(vm + HN(i + idx));                          \
   1520         for (j = 0; j < 16; j += sizeof(TYPEW)) {                         \
   1521             TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel));                  \
   1522             TYPEW aa = *(TYPEW *)(va + HW(i + j));                        \
   1523             *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm, aa);                  \
   1524         }                                                                 \
   1525     }                                                                     \
   1526 }
   1527 
   1528 #define DO_MLA(N, M, A)  (A + N * M)
   1529 
   1530 DO_ZZXW(sve2_smlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLA)
   1531 DO_ZZXW(sve2_smlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLA)
   1532 DO_ZZXW(sve2_umlal_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLA)
   1533 DO_ZZXW(sve2_umlal_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLA)
   1534 
   1535 #define DO_MLS(N, M, A)  (A - N * M)
   1536 
   1537 DO_ZZXW(sve2_smlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLS)
   1538 DO_ZZXW(sve2_smlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLS)
   1539 DO_ZZXW(sve2_umlsl_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLS)
   1540 DO_ZZXW(sve2_umlsl_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLS)
   1541 
   1542 #define DO_SQDMLAL_S(N, M, A)  DO_SQADD_S(A, do_sqdmull_s(N, M))
   1543 #define DO_SQDMLAL_D(N, M, A)  do_sqadd_d(A, do_sqdmull_d(N, M))
   1544 
   1545 DO_ZZXW(sve2_sqdmlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLAL_S)
   1546 DO_ZZXW(sve2_sqdmlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLAL_D)
   1547 
   1548 #define DO_SQDMLSL_S(N, M, A)  DO_SQSUB_S(A, do_sqdmull_s(N, M))
   1549 #define DO_SQDMLSL_D(N, M, A)  do_sqsub_d(A, do_sqdmull_d(N, M))
   1550 
   1551 DO_ZZXW(sve2_sqdmlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLSL_S)
   1552 DO_ZZXW(sve2_sqdmlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLSL_D)
   1553 
   1554 #undef DO_MLA
   1555 #undef DO_MLS
   1556 #undef DO_ZZXW
   1557 
   1558 #define DO_ZZX(NAME, TYPEW, TYPEN, HW, HN, OP) \
   1559 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)            \
   1560 {                                                                         \
   1561     intptr_t i, j, oprsz = simd_oprsz(desc);                              \
   1562     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);   \
   1563     intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
   1564     for (i = 0; i < oprsz; i += 16) {                                     \
   1565         TYPEW mm = *(TYPEN *)(vm + HN(i + idx));                          \
   1566         for (j = 0; j < 16; j += sizeof(TYPEW)) {                         \
   1567             TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel));                  \
   1568             *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm);                      \
   1569         }                                                                 \
   1570     }                                                                     \
   1571 }
   1572 
   1573 DO_ZZX(sve2_sqdmull_idx_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
   1574 DO_ZZX(sve2_sqdmull_idx_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)
   1575 
   1576 DO_ZZX(sve2_smull_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
   1577 DO_ZZX(sve2_smull_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
   1578 
   1579 DO_ZZX(sve2_umull_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
   1580 DO_ZZX(sve2_umull_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
   1581 
   1582 #undef DO_ZZX
   1583 
   1584 #define DO_BITPERM(NAME, TYPE, OP) \
   1585 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
   1586 {                                                              \
   1587     intptr_t i, opr_sz = simd_oprsz(desc);                     \
   1588     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {               \
   1589         TYPE nn = *(TYPE *)(vn + i);                           \
   1590         TYPE mm = *(TYPE *)(vm + i);                           \
   1591         *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8);      \
   1592     }                                                          \
   1593 }
   1594 
   1595 static uint64_t bitextract(uint64_t data, uint64_t mask, int n)
   1596 {
   1597     uint64_t res = 0;
   1598     int db, rb = 0;
   1599 
   1600     for (db = 0; db < n; ++db) {
   1601         if ((mask >> db) & 1) {
   1602             res |= ((data >> db) & 1) << rb;
   1603             ++rb;
   1604         }
   1605     }
   1606     return res;
   1607 }
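         /*
          * Worked example: bitextract(0xb2, 0x66, 8).  The data bits at the
          * set mask positions {1, 2, 5, 6} are 1, 0, 1, 0; packed at the low
          * end this yields 0b0101 = 0x05.
          */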
   1608 
   1609 DO_BITPERM(sve2_bext_b, uint8_t, bitextract)
   1610 DO_BITPERM(sve2_bext_h, uint16_t, bitextract)
   1611 DO_BITPERM(sve2_bext_s, uint32_t, bitextract)
   1612 DO_BITPERM(sve2_bext_d, uint64_t, bitextract)
   1613 
   1614 static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n)
   1615 {
   1616     uint64_t res = 0;
   1617     int rb, db = 0;
   1618 
   1619     for (rb = 0; rb < n; ++rb) {
   1620         if ((mask >> rb) & 1) {
   1621             res |= ((data >> db) & 1) << rb;
   1622             ++db;
   1623         }
   1624     }
   1625     return res;
   1626 }
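         /*
          * Worked example: bitdeposit(0x05, 0x66, 8).  The low data bits
          * 1, 0, 1, 0 are scattered to the set mask positions {1, 2, 5, 6},
          * yielding 0b00100010 = 0x22.
          */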
   1627 
   1628 DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit)
   1629 DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit)
   1630 DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit)
   1631 DO_BITPERM(sve2_bdep_d, uint64_t, bitdeposit)
   1632 
   1633 static uint64_t bitgroup(uint64_t data, uint64_t mask, int n)
   1634 {
   1635     uint64_t resm = 0, resu = 0;
   1636     int db, rbm = 0, rbu = 0;
   1637 
   1638     for (db = 0; db < n; ++db) {
   1639         uint64_t val = (data >> db) & 1;
   1640         if ((mask >> db) & 1) {
   1641             resm |= val << rbm++;
   1642         } else {
   1643             resu |= val << rbu++;
   1644         }
   1645     }
   1646 
   1647     return resm | (resu << rbm);
   1648 }
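         /*
          * Worked example: bitgroup(0xb2, 0x66, 8).  The masked bits
          * {1, 2, 5, 6} pack to resm = 0b0101 (rbm = 4) and the remaining
          * bits {0, 3, 4, 7} pack to resu = 0b1100, giving
          * resm | (resu << 4) = 0xc5.
          */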
   1649 
   1650 DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup)
   1651 DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup)
   1652 DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup)
   1653 DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup)
   1654 
   1655 #undef DO_BITPERM
   1656 
   1657 #define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP)                  \
   1658 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
   1659 {                                                               \
   1660     intptr_t i, opr_sz = simd_oprsz(desc);                      \
   1661     int sub_r = simd_data(desc);                                \
   1662     if (sub_r) {                                                \
   1663         for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {        \
   1664             TYPE acc_r = *(TYPE *)(vn + H(i));                  \
   1665             TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE)));   \
   1666             TYPE el2_r = *(TYPE *)(vm + H(i));                  \
   1667             TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE)));   \
   1668             acc_r = ADD_OP(acc_r, el2_i);                       \
   1669             acc_i = SUB_OP(acc_i, el2_r);                       \
   1670             *(TYPE *)(vd + H(i)) = acc_r;                       \
   1671             *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i;        \
   1672         }                                                       \
   1673     } else {                                                    \
   1674         for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {        \
   1675             TYPE acc_r = *(TYPE *)(vn + H(i));                  \
   1676             TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE)));   \
   1677             TYPE el2_r = *(TYPE *)(vm + H(i));                  \
   1678             TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE)));   \
   1679             acc_r = SUB_OP(acc_r, el2_i);                       \
   1680             acc_i = ADD_OP(acc_i, el2_r);                       \
   1681             *(TYPE *)(vd + H(i)) = acc_r;                       \
   1682             *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i;        \
   1683         }                                                       \
   1684     }                                                           \
   1685 }
   1686 
   1687 DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB)
   1688 DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB)
   1689 DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB)
   1690 DO_CADD(sve2_cadd_d, int64_t, H1_8, DO_ADD, DO_SUB)
   1691 
   1692 DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B)
   1693 DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H)
   1694 DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S)
   1695 DO_CADD(sve2_sqcadd_d, int64_t, H1_8, do_sqadd_d, do_sqsub_d)
   1696 
   1697 #undef DO_CADD
   1698 
   1699 #define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN) \
   1700 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)           \
   1701 {                                                              \
   1702     intptr_t i, opr_sz = simd_oprsz(desc);                     \
   1703     intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN);      \
   1704     int shift = simd_data(desc) >> 1;                          \
   1705     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {              \
   1706         TYPEW nn = *(TYPEN *)(vn + HN(i + sel));               \
   1707         *(TYPEW *)(vd + HW(i)) = nn << shift;                  \
   1708     }                                                          \
   1709 }
   1710 
   1711 DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1)
   1712 DO_ZZI_SHLL(sve2_sshll_s, int32_t, int16_t, H1_4, H1_2)
   1713 DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, H1_8, H1_4)
   1714 
   1715 DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1)
   1716 DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2)
   1717 DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, H1_8, H1_4)
   1718 
   1719 #undef DO_ZZI_SHLL
   1720 
   1721 /* Two-operand reduction expander, controlled by a predicate.
   1722  * The difference between TYPERED and TYPERET has to do with
   1723  * sign-extension.  E.g. for SMAX, TYPERED must be signed,
   1724  * but TYPERET must be unsigned so that e.g. a 32-bit value
   1725  * is not sign-extended to the ABI uint64_t return type.
   1726  */
   1727 /* ??? If we were to vectorize this by hand the reduction ordering
   1728  * would change.  For integer operands, this is perfectly fine.
   1729  */
   1730 #define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
   1731 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)   \
   1732 {                                                          \
   1733     intptr_t i, opr_sz = simd_oprsz(desc);                 \
   1734     TYPERED ret = INIT;                                    \
   1735     for (i = 0; i < opr_sz; ) {                            \
   1736         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));    \
   1737         do {                                               \
   1738             if (pg & 1) {                                  \
   1739                 TYPEELT nn = *(TYPEELT *)(vn + H(i));      \
   1740                 ret = OP(ret, nn);                         \
   1741             }                                              \
   1742             i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT);  \
   1743         } while (i & 15);                                  \
   1744     }                                                      \
   1745     return (TYPERET)ret;                                   \
   1746 }
   1747 
   1748 #define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP)             \
   1749 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)   \
   1750 {                                                          \
   1751     intptr_t i, opr_sz = simd_oprsz(desc) / 8;             \
   1752     TYPEE *n = vn;                                         \
   1753     uint8_t *pg = vg;                                      \
   1754     TYPER ret = INIT;                                      \
   1755     for (i = 0; i < opr_sz; i += 1) {                      \
   1756         if (pg[H1(i)] & 1) {                               \
   1757             TYPEE nn = n[i];                               \
   1758             ret = OP(ret, nn);                             \
   1759         }                                                  \
   1760     }                                                      \
   1761     return ret;                                            \
   1762 }
   1763 
   1764 DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
   1765 DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
   1766 DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
   1767 DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
   1768 
   1769 DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
   1770 DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
   1771 DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
   1772 DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
   1773 
   1774 DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
   1775 DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
   1776 DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
   1777 DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
   1778 
   1779 DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
   1780 DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
   1781 DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
   1782 
   1783 DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
   1784 DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
   1785 DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
   1786 DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
   1787 
   1788 DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
   1789 DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
   1790 DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
   1791 DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
   1792 
   1793 DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
   1794 DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
   1795 DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
   1796 DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
   1797 
   1798 DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
   1799 DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
   1800 DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
   1801 DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
   1802 
   1803 DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
   1804 DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
   1805 DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
   1806 DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
   1807 
   1808 #undef DO_VPZ
   1809 #undef DO_VPZ_D
   1810 
   1811 /* Two vector operand, one scalar operand, unpredicated.  */
   1812 #define DO_ZZI(NAME, TYPE, OP)                                       \
   1813 void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc)   \
   1814 {                                                                    \
   1815     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE);            \
   1816     TYPE s = s64, *d = vd, *n = vn;                                  \
   1817     for (i = 0; i < opr_sz; ++i) {                                   \
   1818         d[i] = OP(n[i], s);                                          \
   1819     }                                                                \
   1820 }
   1821 
   1822 #define DO_SUBR(X, Y)   (Y - X)
   1823 
   1824 DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
   1825 DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
   1826 DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
   1827 DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)
   1828 
   1829 DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
   1830 DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
   1831 DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
   1832 DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)
   1833 
   1834 DO_ZZI(sve_smini_b, int8_t, DO_MIN)
   1835 DO_ZZI(sve_smini_h, int16_t, DO_MIN)
   1836 DO_ZZI(sve_smini_s, int32_t, DO_MIN)
   1837 DO_ZZI(sve_smini_d, int64_t, DO_MIN)
   1838 
   1839 DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
   1840 DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
   1841 DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
   1842 DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)
   1843 
   1844 DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
   1845 DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
   1846 DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
   1847 DO_ZZI(sve_umini_d, uint64_t, DO_MIN)
   1848 
   1849 #undef DO_ZZI
   1850 
   1851 #undef DO_AND
   1852 #undef DO_ORR
   1853 #undef DO_EOR
   1854 #undef DO_BIC
   1855 #undef DO_ADD
   1856 #undef DO_SUB
   1857 #undef DO_MAX
   1858 #undef DO_MIN
   1859 #undef DO_ABD
   1860 #undef DO_MUL
   1861 #undef DO_DIV
   1862 #undef DO_ASR
   1863 #undef DO_LSR
   1864 #undef DO_LSL
   1865 #undef DO_SUBR
   1866 
    1867 /* Similar to the ARM LastActiveElement pseudocode function, except the
    1868    result is multiplied by the element size.  This includes the not-found
    1869    indication; e.g. not found for esz=3 is -8.  */
   1870 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
   1871 {
   1872     uint64_t mask = pred_esz_masks[esz];
   1873     intptr_t i = words;
   1874 
   1875     do {
   1876         uint64_t this_g = g[--i] & mask;
   1877         if (this_g) {
   1878             return i * 64 + (63 - clz64(this_g));
   1879         }
   1880     } while (i > 0);
   1881     return (intptr_t)-1 << esz;
   1882 }
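         /*
          * For illustration, with esz=2 only every fourth predicate bit is
          * significant; if the highest such bit set is bit 8 of g[0], the
          * result is 8, i.e. element index 2 scaled by the 4-byte element
          * size.  With no significant bit set the result is -4.
          */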
   1883 
   1884 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc)
   1885 {
   1886     intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
   1887     uint32_t flags = PREDTEST_INIT;
   1888     uint64_t *d = vd, *g = vg;
   1889     intptr_t i = 0;
   1890 
   1891     do {
   1892         uint64_t this_d = d[i];
   1893         uint64_t this_g = g[i];
   1894 
   1895         if (this_g) {
   1896             if (!(flags & 4)) {
   1897                 /* Set in D the first bit of G.  */
   1898                 this_d |= this_g & -this_g;
   1899                 d[i] = this_d;
   1900             }
   1901             flags = iter_predtest_fwd(this_d, this_g, flags);
   1902         }
   1903     } while (++i < words);
   1904 
   1905     return flags;
   1906 }
   1907 
   1908 uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
   1909 {
   1910     intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
   1911     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
   1912     uint32_t flags = PREDTEST_INIT;
   1913     uint64_t *d = vd, *g = vg, esz_mask;
   1914     intptr_t i, next;
   1915 
   1916     next = last_active_element(vd, words, esz) + (1 << esz);
   1917     esz_mask = pred_esz_masks[esz];
   1918 
   1919     /* Similar to the pseudocode for pnext, but scaled by ESZ
   1920        so that we find the correct bit.  */
   1921     if (next < words * 64) {
   1922         uint64_t mask = -1;
   1923 
   1924         if (next & 63) {
   1925             mask = ~((1ull << (next & 63)) - 1);
   1926             next &= -64;
   1927         }
   1928         do {
   1929             uint64_t this_g = g[next / 64] & esz_mask & mask;
   1930             if (this_g != 0) {
   1931                 next = (next & -64) + ctz64(this_g);
   1932                 break;
   1933             }
   1934             next += 64;
   1935             mask = -1;
   1936         } while (next < words * 64);
   1937     }
   1938 
   1939     i = 0;
   1940     do {
   1941         uint64_t this_d = 0;
   1942         if (i == next / 64) {
   1943             this_d = 1ull << (next & 63);
   1944         }
   1945         d[i] = this_d;
   1946         flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
   1947     } while (++i < words);
   1948 
   1949     return flags;
   1950 }
   1951 
   1952 /*
   1953  * Copy Zn into Zd, and store zero into inactive elements.
   1954  * If inv, store zeros into the active elements.
   1955  */
   1956 void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
   1957 {
   1958     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
   1959     uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
   1960     uint64_t *d = vd, *n = vn;
   1961     uint8_t *pg = vg;
   1962 
   1963     for (i = 0; i < opr_sz; i += 1) {
   1964         d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv);
   1965     }
   1966 }
   1967 
   1968 void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
   1969 {
   1970     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
   1971     uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
   1972     uint64_t *d = vd, *n = vn;
   1973     uint8_t *pg = vg;
   1974 
   1975     for (i = 0; i < opr_sz; i += 1) {
   1976         d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv);
   1977     }
   1978 }
   1979 
   1980 void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
   1981 {
   1982     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
   1983     uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
   1984     uint64_t *d = vd, *n = vn;
   1985     uint8_t *pg = vg;
   1986 
   1987     for (i = 0; i < opr_sz; i += 1) {
   1988         d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv);
   1989     }
   1990 }
   1991 
   1992 void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
   1993 {
   1994     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
   1995     uint64_t *d = vd, *n = vn;
   1996     uint8_t *pg = vg;
   1997     uint8_t inv = simd_data(desc);
   1998 
   1999     for (i = 0; i < opr_sz; i += 1) {
   2000         d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1);
   2001     }
   2002 }
   2003 
   2004 /* Three-operand expander, immediate operand, controlled by a predicate.
   2005  */
   2006 #define DO_ZPZI(NAME, TYPE, H, OP)                              \
   2007 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
   2008 {                                                               \
   2009     intptr_t i, opr_sz = simd_oprsz(desc);                      \
   2010     TYPE imm = simd_data(desc);                                 \
   2011     for (i = 0; i < opr_sz; ) {                                 \
   2012         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
   2013         do {                                                    \
   2014             if (pg & 1) {                                       \
   2015                 TYPE nn = *(TYPE *)(vn + H(i));                 \
   2016                 *(TYPE *)(vd + H(i)) = OP(nn, imm);             \
   2017             }                                                   \
   2018             i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
   2019         } while (i & 15);                                       \
   2020     }                                                           \
   2021 }
   2022 
   2023 /* Similarly, specialized for 64-bit operands.  */
   2024 #define DO_ZPZI_D(NAME, TYPE, OP)                               \
   2025 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
   2026 {                                                               \
   2027     intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
   2028     TYPE *d = vd, *n = vn;                                      \
   2029     TYPE imm = simd_data(desc);                                 \
   2030     uint8_t *pg = vg;                                           \
   2031     for (i = 0; i < opr_sz; i += 1) {                           \
   2032         if (pg[H1(i)] & 1) {                                    \
   2033             TYPE nn = n[i];                                     \
   2034             d[i] = OP(nn, imm);                                 \
   2035         }                                                       \
   2036     }                                                           \
   2037 }
   2038 
   2039 #define DO_SHR(N, M)  (N >> M)
   2040 #define DO_SHL(N, M)  (N << M)
   2041 
   2042 /* Arithmetic shift right for division.  This rounds negative numbers
   2043    toward zero as per signed division.  Therefore before shifting,
   2044    when N is negative, add 2**M-1.  */
   2045 #define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
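         /*
          * Worked example: for N = -7, M = 2 the bias is (1 << 2) - 1 = 3, so
          * (-7 + 3) >> 2 = -1, matching -7 / 4 truncated toward zero, whereas
          * a plain arithmetic shift would give -7 >> 2 = -2.
          */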
   2046 
   2047 static inline uint64_t do_urshr(uint64_t x, unsigned sh)
   2048 {
   2049     if (likely(sh < 64)) {
   2050         return (x >> sh) + ((x >> (sh - 1)) & 1);
   2051     } else if (sh == 64) {
   2052         return x >> 63;
   2053     } else {
   2054         return 0;
   2055     }
   2056 }
   2057 
   2058 static inline int64_t do_srshr(int64_t x, unsigned sh)
   2059 {
   2060     if (likely(sh < 64)) {
   2061         return (x >> sh) + ((x >> (sh - 1)) & 1);
   2062     } else {
   2063         /* Rounding the sign bit always produces 0. */
   2064         return 0;
   2065     }
   2066 }
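         /*
          * For illustration, do_urshr(6, 2) = (6 >> 2) + ((6 >> 1) & 1) = 2,
          * i.e. 6/4 = 1.5 rounded to 2, while do_urshr(5, 2) = 1 + 0 = 1.
          */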
   2067 
   2068 DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
   2069 DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
   2070 DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
   2071 DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
   2072 
   2073 DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
   2074 DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
   2075 DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
   2076 DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
   2077 
   2078 DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
   2079 DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
   2080 DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
   2081 DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
   2082 
   2083 DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
   2084 DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
   2085 DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
   2086 DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
   2087 
   2088 /* SVE2 bitwise shift by immediate */
   2089 DO_ZPZI(sve2_sqshl_zpzi_b, int8_t, H1, do_sqshl_b)
   2090 DO_ZPZI(sve2_sqshl_zpzi_h, int16_t, H1_2, do_sqshl_h)
   2091 DO_ZPZI(sve2_sqshl_zpzi_s, int32_t, H1_4, do_sqshl_s)
   2092 DO_ZPZI_D(sve2_sqshl_zpzi_d, int64_t, do_sqshl_d)
   2093 
   2094 DO_ZPZI(sve2_uqshl_zpzi_b, uint8_t, H1, do_uqshl_b)
   2095 DO_ZPZI(sve2_uqshl_zpzi_h, uint16_t, H1_2, do_uqshl_h)
   2096 DO_ZPZI(sve2_uqshl_zpzi_s, uint32_t, H1_4, do_uqshl_s)
   2097 DO_ZPZI_D(sve2_uqshl_zpzi_d, uint64_t, do_uqshl_d)
   2098 
   2099 DO_ZPZI(sve2_srshr_b, int8_t, H1, do_srshr)
   2100 DO_ZPZI(sve2_srshr_h, int16_t, H1_2, do_srshr)
   2101 DO_ZPZI(sve2_srshr_s, int32_t, H1_4, do_srshr)
   2102 DO_ZPZI_D(sve2_srshr_d, int64_t, do_srshr)
   2103 
   2104 DO_ZPZI(sve2_urshr_b, uint8_t, H1, do_urshr)
   2105 DO_ZPZI(sve2_urshr_h, uint16_t, H1_2, do_urshr)
   2106 DO_ZPZI(sve2_urshr_s, uint32_t, H1_4, do_urshr)
   2107 DO_ZPZI_D(sve2_urshr_d, uint64_t, do_urshr)
   2108 
   2109 #define do_suqrshl_b(n, m) \
   2110    ({ uint32_t discard; do_suqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
   2111 #define do_suqrshl_h(n, m) \
   2112    ({ uint32_t discard; do_suqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
   2113 #define do_suqrshl_s(n, m) \
   2114    ({ uint32_t discard; do_suqrshl_bhs(n, m, 32, false, &discard); })
   2115 #define do_suqrshl_d(n, m) \
   2116    ({ uint32_t discard; do_suqrshl_d(n, m, false, &discard); })
   2117 
   2118 DO_ZPZI(sve2_sqshlu_b, int8_t, H1, do_suqrshl_b)
   2119 DO_ZPZI(sve2_sqshlu_h, int16_t, H1_2, do_suqrshl_h)
   2120 DO_ZPZI(sve2_sqshlu_s, int32_t, H1_4, do_suqrshl_s)
   2121 DO_ZPZI_D(sve2_sqshlu_d, int64_t, do_suqrshl_d)
   2122 
   2123 #undef DO_ASRD
   2124 #undef DO_ZPZI
   2125 #undef DO_ZPZI_D
   2126 
   2127 #define DO_SHRNB(NAME, TYPEW, TYPEN, OP) \
   2128 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)         \
   2129 {                                                            \
   2130     intptr_t i, opr_sz = simd_oprsz(desc);                   \
   2131     int shift = simd_data(desc);                             \
   2132     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {            \
   2133         TYPEW nn = *(TYPEW *)(vn + i);                       \
   2134         *(TYPEW *)(vd + i) = (TYPEN)OP(nn, shift);           \
   2135     }                                                        \
   2136 }
   2137 
   2138 #define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP)                  \
   2139 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)              \
   2140 {                                                                 \
   2141     intptr_t i, opr_sz = simd_oprsz(desc);                        \
   2142     int shift = simd_data(desc);                                  \
   2143     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                 \
   2144         TYPEW nn = *(TYPEW *)(vn + HW(i));                        \
   2145         *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, shift);   \
   2146     }                                                             \
   2147 }
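         /*
          * For illustration, sve2_shrnb_h with shift 8 turns the uint16_t
          * element 0xabcd into 0x00ab (narrowed value in the even byte, odd
          * byte cleared), while sve2_shrnt_h stores 0xab only into the odd
          * byte of Zd, leaving the even byte unchanged.
          */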
   2148 
   2149 DO_SHRNB(sve2_shrnb_h, uint16_t, uint8_t, DO_SHR)
   2150 DO_SHRNB(sve2_shrnb_s, uint32_t, uint16_t, DO_SHR)
   2151 DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR)
   2152 
   2153 DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR)
   2154 DO_SHRNT(sve2_shrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_SHR)
   2155 DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_SHR)
   2156 
   2157 DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, do_urshr)
   2158 DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr)
   2159 DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr)
   2160 
   2161 DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr)
   2162 DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr)
   2163 DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, H1_8, H1_4, do_urshr)
   2164 
   2165 #define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX)
   2166 #define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX)
   2167 #define DO_SQSHRUN_D(x, sh) \
   2168     do_sat_bhs((int64_t)(x) >> (sh < 64 ? sh : 63), 0, UINT32_MAX)
   2169 
   2170 DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H)
   2171 DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S)
   2172 DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D)
   2173 
   2174 DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H)
   2175 DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S)
   2176 DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRUN_D)
   2177 
   2178 #define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX)
   2179 #define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX)
   2180 #define DO_SQRSHRUN_D(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT32_MAX)
   2181 
   2182 DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H)
   2183 DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S)
   2184 DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D)
   2185 
   2186 DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H)
   2187 DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S)
   2188 DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRUN_D)
   2189 
   2190 #define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX)
   2191 #define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX)
   2192 #define DO_SQSHRN_D(x, sh) do_sat_bhs(x >> sh, INT32_MIN, INT32_MAX)
   2193 
   2194 DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H)
   2195 DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S)
   2196 DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D)
   2197 
   2198 DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H)
   2199 DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S)
   2200 DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRN_D)
   2201 
   2202 #define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX)
   2203 #define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX)
   2204 #define DO_SQRSHRN_D(x, sh) do_sat_bhs(do_srshr(x, sh), INT32_MIN, INT32_MAX)
   2205 
   2206 DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H)
   2207 DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S)
   2208 DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D)
   2209 
   2210 DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H)
   2211 DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S)
   2212 DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRN_D)
   2213 
   2214 #define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX)
   2215 #define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX)
   2216 #define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX)
   2217 
   2218 DO_SHRNB(sve2_uqshrnb_h, uint16_t, uint8_t, DO_UQSHRN_H)
   2219 DO_SHRNB(sve2_uqshrnb_s, uint32_t, uint16_t, DO_UQSHRN_S)
   2220 DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D)
   2221 
   2222 DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H)
   2223 DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S)
   2224 DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQSHRN_D)
   2225 
   2226 #define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX)
   2227 #define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX)
   2228 #define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX)
   2229 
   2230 DO_SHRNB(sve2_uqrshrnb_h, uint16_t, uint8_t, DO_UQRSHRN_H)
   2231 DO_SHRNB(sve2_uqrshrnb_s, uint32_t, uint16_t, DO_UQRSHRN_S)
   2232 DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D)
   2233 
   2234 DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H)
   2235 DO_SHRNT(sve2_uqrshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S)
   2236 DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQRSHRN_D)
   2237 
   2238 #undef DO_SHRNB
   2239 #undef DO_SHRNT
   2240 
   2241 #define DO_BINOPNB(NAME, TYPEW, TYPEN, SHIFT, OP)                           \
   2242 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)              \
   2243 {                                                                           \
   2244     intptr_t i, opr_sz = simd_oprsz(desc);                                  \
   2245     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                           \
   2246         TYPEW nn = *(TYPEW *)(vn + i);                                      \
   2247         TYPEW mm = *(TYPEW *)(vm + i);                                      \
   2248         *(TYPEW *)(vd + i) = (TYPEN)OP(nn, mm, SHIFT);                      \
   2249     }                                                                       \
   2250 }
   2251 
   2252 #define DO_BINOPNT(NAME, TYPEW, TYPEN, SHIFT, HW, HN, OP)                   \
   2253 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)              \
   2254 {                                                                           \
   2255     intptr_t i, opr_sz = simd_oprsz(desc);                                  \
   2256     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                           \
   2257         TYPEW nn = *(TYPEW *)(vn + HW(i));                                  \
   2258         TYPEW mm = *(TYPEW *)(vm + HW(i));                                  \
   2259         *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, mm, SHIFT);         \
   2260     }                                                                       \
   2261 }
   2262 
   2263 #define DO_ADDHN(N, M, SH)  ((N + M) >> SH)
   2264 #define DO_RADDHN(N, M, SH) ((N + M + ((__typeof(N))1 << (SH - 1))) >> SH)
   2265 #define DO_SUBHN(N, M, SH)  ((N - M) >> SH)
   2266 #define DO_RSUBHN(N, M, SH) ((N - M + ((__typeof(N))1 << (SH - 1))) >> SH)
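         /*
          * Worked example on uint16_t elements (SH = 8): for N = 0x12f0 and
          * M = 0x0008 the sum is 0x12f8, so DO_ADDHN yields the high half
          * 0x12 while DO_RADDHN first adds the rounding bias 0x80 and yields
          * 0x13.
          */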
   2267 
   2268 DO_BINOPNB(sve2_addhnb_h, uint16_t, uint8_t, 8, DO_ADDHN)
   2269 DO_BINOPNB(sve2_addhnb_s, uint32_t, uint16_t, 16, DO_ADDHN)
   2270 DO_BINOPNB(sve2_addhnb_d, uint64_t, uint32_t, 32, DO_ADDHN)
   2271 
   2272 DO_BINOPNT(sve2_addhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_ADDHN)
   2273 DO_BINOPNT(sve2_addhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_ADDHN)
   2274 DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_ADDHN)
   2275 
   2276 DO_BINOPNB(sve2_raddhnb_h, uint16_t, uint8_t, 8, DO_RADDHN)
   2277 DO_BINOPNB(sve2_raddhnb_s, uint32_t, uint16_t, 16, DO_RADDHN)
   2278 DO_BINOPNB(sve2_raddhnb_d, uint64_t, uint32_t, 32, DO_RADDHN)
   2279 
   2280 DO_BINOPNT(sve2_raddhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RADDHN)
   2281 DO_BINOPNT(sve2_raddhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RADDHN)
   2282 DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RADDHN)
   2283 
   2284 DO_BINOPNB(sve2_subhnb_h, uint16_t, uint8_t, 8, DO_SUBHN)
   2285 DO_BINOPNB(sve2_subhnb_s, uint32_t, uint16_t, 16, DO_SUBHN)
   2286 DO_BINOPNB(sve2_subhnb_d, uint64_t, uint32_t, 32, DO_SUBHN)
   2287 
   2288 DO_BINOPNT(sve2_subhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_SUBHN)
   2289 DO_BINOPNT(sve2_subhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_SUBHN)
   2290 DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_SUBHN)
   2291 
   2292 DO_BINOPNB(sve2_rsubhnb_h, uint16_t, uint8_t, 8, DO_RSUBHN)
   2293 DO_BINOPNB(sve2_rsubhnb_s, uint32_t, uint16_t, 16, DO_RSUBHN)
   2294 DO_BINOPNB(sve2_rsubhnb_d, uint64_t, uint32_t, 32, DO_RSUBHN)
   2295 
   2296 DO_BINOPNT(sve2_rsubhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RSUBHN)
   2297 DO_BINOPNT(sve2_rsubhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RSUBHN)
   2298 DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RSUBHN)
   2299 
   2300 #undef DO_RSUBHN
   2301 #undef DO_SUBHN
   2302 #undef DO_RADDHN
   2303 #undef DO_ADDHN
   2304 
   2305 #undef DO_BINOPNB
   2306 
   2307 /* Fully general four-operand expander, controlled by a predicate.
   2308  */
   2309 #define DO_ZPZZZ(NAME, TYPE, H, OP)                           \
   2310 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,     \
   2311                   void *vg, uint32_t desc)                    \
   2312 {                                                             \
   2313     intptr_t i, opr_sz = simd_oprsz(desc);                    \
   2314     for (i = 0; i < opr_sz; ) {                               \
   2315         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));       \
   2316         do {                                                  \
   2317             if (pg & 1) {                                     \
   2318                 TYPE nn = *(TYPE *)(vn + H(i));               \
   2319                 TYPE mm = *(TYPE *)(vm + H(i));               \
   2320                 TYPE aa = *(TYPE *)(va + H(i));               \
   2321                 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm);        \
   2322             }                                                 \
   2323             i += sizeof(TYPE), pg >>= sizeof(TYPE);           \
   2324         } while (i & 15);                                     \
   2325     }                                                         \
   2326 }
   2327 
   2328 /* Similarly, specialized for 64-bit operands.  */
   2329 #define DO_ZPZZZ_D(NAME, TYPE, OP)                            \
   2330 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,     \
   2331                   void *vg, uint32_t desc)                    \
   2332 {                                                             \
   2333     intptr_t i, opr_sz = simd_oprsz(desc) / 8;                \
   2334     TYPE *d = vd, *a = va, *n = vn, *m = vm;                  \
   2335     uint8_t *pg = vg;                                         \
   2336     for (i = 0; i < opr_sz; i += 1) {                         \
   2337         if (pg[H1(i)] & 1) {                                  \
   2338             TYPE aa = a[i], nn = n[i], mm = m[i];             \
   2339             d[i] = OP(aa, nn, mm);                            \
   2340         }                                                     \
   2341     }                                                         \
   2342 }
   2343 
   2344 #define DO_MLA(A, N, M)  (A + N * M)
   2345 #define DO_MLS(A, N, M)  (A - N * M)
   2346 
   2347 DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
   2348 DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
   2349 
   2350 DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
   2351 DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
   2352 
   2353 DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
   2354 DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
   2355 
   2356 DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
   2357 DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
   2358 
   2359 #undef DO_MLA
   2360 #undef DO_MLS
   2361 #undef DO_ZPZZZ
   2362 #undef DO_ZPZZZ_D
   2363 
   2364 void HELPER(sve_index_b)(void *vd, uint32_t start,
   2365                          uint32_t incr, uint32_t desc)
   2366 {
   2367     intptr_t i, opr_sz = simd_oprsz(desc);
   2368     uint8_t *d = vd;
   2369     for (i = 0; i < opr_sz; i += 1) {
   2370         d[H1(i)] = start + i * incr;
   2371     }
   2372 }
   2373 
   2374 void HELPER(sve_index_h)(void *vd, uint32_t start,
   2375                          uint32_t incr, uint32_t desc)
   2376 {
   2377     intptr_t i, opr_sz = simd_oprsz(desc) / 2;
   2378     uint16_t *d = vd;
   2379     for (i = 0; i < opr_sz; i += 1) {
   2380         d[H2(i)] = start + i * incr;
   2381     }
   2382 }
   2383 
   2384 void HELPER(sve_index_s)(void *vd, uint32_t start,
   2385                          uint32_t incr, uint32_t desc)
   2386 {
   2387     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
   2388     uint32_t *d = vd;
   2389     for (i = 0; i < opr_sz; i += 1) {
   2390         d[H4(i)] = start + i * incr;
   2391     }
   2392 }
   2393 
   2394 void HELPER(sve_index_d)(void *vd, uint64_t start,
   2395                          uint64_t incr, uint32_t desc)
   2396 {
   2397     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
   2398     uint64_t *d = vd;
   2399     for (i = 0; i < opr_sz; i += 1) {
   2400         d[i] = start + i * incr;
   2401     }
   2402 }
   2403 
   2404 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
   2405 {
   2406     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
   2407     uint32_t sh = simd_data(desc);
   2408     uint32_t *d = vd, *n = vn, *m = vm;
   2409     for (i = 0; i < opr_sz; i += 1) {
   2410         d[i] = n[i] + (m[i] << sh);
   2411     }
   2412 }
   2413 
   2414 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
   2415 {
   2416     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
   2417     uint64_t sh = simd_data(desc);
   2418     uint64_t *d = vd, *n = vn, *m = vm;
   2419     for (i = 0; i < opr_sz; i += 1) {
   2420         d[i] = n[i] + (m[i] << sh);
   2421     }
   2422 }
   2423 
   2424 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
   2425 {
   2426     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
   2427     uint64_t sh = simd_data(desc);
   2428     uint64_t *d = vd, *n = vn, *m = vm;
   2429     for (i = 0; i < opr_sz; i += 1) {
   2430         d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
   2431     }
   2432 }
   2433 
   2434 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
   2435 {
   2436     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
   2437     uint64_t sh = simd_data(desc);
   2438     uint64_t *d = vd, *n = vn, *m = vm;
   2439     for (i = 0; i < opr_sz; i += 1) {
   2440         d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
   2441     }
   2442 }
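         /*
          * For example, with sh == 2 and m[i] == 0xffffffff, sve_adr_s32 treats
          * the offset as -1 and adds -4 to the base, while sve_adr_u32 treats
          * it as 4294967295 and adds 0x3fffffffc.  The p32/p64 variants above
          * simply use the full element width without re-extension.
          */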
   2443 
   2444 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
   2445 {
   2446     /* These constants are cut-and-paste directly from the ARM pseudocode.  */
   2447     static const uint16_t coeff[] = {
   2448         0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
   2449         0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
   2450         0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
   2451         0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
   2452     };
   2453     intptr_t i, opr_sz = simd_oprsz(desc) / 2;
   2454     uint16_t *d = vd, *n = vn;
   2455 
   2456     for (i = 0; i < opr_sz; i++) {
   2457         uint16_t nn = n[i];
   2458         intptr_t idx = extract32(nn, 0, 5);
   2459         uint16_t exp = extract32(nn, 5, 5);
   2460         d[i] = coeff[idx] | (exp << 10);
   2461     }
   2462 }
   2463 
   2464 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
   2465 {
   2466     /* These constants are cut-and-paste directly from the ARM pseudocode.  */
   2467     static const uint32_t coeff[] = {
   2468         0x000000, 0x0164d2, 0x02cd87, 0x043a29,
   2469         0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
   2470         0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
   2471         0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
   2472         0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
   2473         0x1ef532, 0x20b051, 0x227043, 0x243516,
   2474         0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
   2475         0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
   2476         0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
   2477         0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
   2478         0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
   2479         0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
   2480         0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
   2481         0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
   2482         0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
   2483         0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
   2484     };
   2485     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
   2486     uint32_t *d = vd, *n = vn;
   2487 
   2488     for (i = 0; i < opr_sz; i++) {
   2489         uint32_t nn = n[i];
   2490         intptr_t idx = extract32(nn, 0, 6);
   2491         uint32_t exp = extract32(nn, 6, 8);
   2492         d[i] = coeff[idx] | (exp << 23);
   2493     }
   2494 }
   2495 
   2496 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
   2497 {
   2498     /* These constants are cut-and-paste directly from the ARM pseudocode.  */
   2499     static const uint64_t coeff[] = {
   2500         0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
   2501         0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
   2502         0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
   2503         0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
   2504         0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
   2505         0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
   2506         0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
   2507         0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
   2508         0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
   2509         0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
   2510         0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
   2511         0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
   2512         0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
   2513         0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
   2514         0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
   2515         0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
   2516         0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
   2517         0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
   2518         0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
   2519         0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
   2520         0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
   2521         0xFA7C1819E90D8ull,
   2522     };
   2523     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
   2524     uint64_t *d = vd, *n = vn;
   2525 
   2526     for (i = 0; i < opr_sz; i++) {
   2527         uint64_t nn = n[i];
   2528         intptr_t idx = extract32(nn, 0, 6);
   2529         uint64_t exp = extract32(nn, 6, 11);
   2530         d[i] = coeff[idx] | (exp << 52);
   2531     }
   2532 }
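         /*
          * The coefficient tables hold the fraction bits of 2^(idx/N), where N
          * is the number of table entries, and the remaining input bits are
          * placed directly into the exponent field.  For instance, in the
          * double-precision table coeff[32] == 0x6A09E667F3BCDull, the 52-bit
          * fraction of sqrt(2) == 2^(32/64); combined with a biased exponent of
          * 0x3ff this yields the encoding 0x3FF6A09E667F3BCD.
          */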
   2533 
   2534 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
   2535 {
   2536     intptr_t i, opr_sz = simd_oprsz(desc) / 2;
   2537     uint16_t *d = vd, *n = vn, *m = vm;
   2538     for (i = 0; i < opr_sz; i += 1) {
   2539         uint16_t nn = n[i];
   2540         uint16_t mm = m[i];
   2541         if (mm & 1) {
   2542             nn = float16_one;
   2543         }
   2544         d[i] = nn ^ (mm & 2) << 14;
   2545     }
   2546 }
   2547 
   2548 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
   2549 {
   2550     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
   2551     uint32_t *d = vd, *n = vn, *m = vm;
   2552     for (i = 0; i < opr_sz; i += 1) {
   2553         uint32_t nn = n[i];
   2554         uint32_t mm = m[i];
   2555         if (mm & 1) {
   2556             nn = float32_one;
   2557         }
   2558         d[i] = nn ^ (mm & 2) << 30;
   2559     }
   2560 }
   2561 
   2562 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
   2563 {
   2564     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
   2565     uint64_t *d = vd, *n = vn, *m = vm;
   2566     for (i = 0; i < opr_sz; i += 1) {
   2567         uint64_t nn = n[i];
   2568         uint64_t mm = m[i];
   2569         if (mm & 1) {
   2570             nn = float64_one;
   2571         }
   2572         d[i] = nn ^ (mm & 2) << 62;
   2573     }
   2574 }
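         /*
          * In the FTSSEL helpers above, bit 0 of the second operand selects the
          * constant 1.0 in place of the first operand, and bit 1 supplies the
          * sign: (mm & 2), shifted up to the top bit, XORs the sign of the
          * result.  E.g. in the double-precision case, nn == float64_one and
          * mm == 3 produce 0x3FF0000000000000 ^ (1ull << 63), i.e. -1.0.
          */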
   2575 
   2576 /*
   2577  * Signed saturating addition with scalar operand.
   2578  */
   2579 
   2580 void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
   2581 {
   2582     intptr_t i, oprsz = simd_oprsz(desc);
   2583 
   2584     for (i = 0; i < oprsz; i += sizeof(int8_t)) {
   2585         *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i));
   2586     }
   2587 }
   2588 
   2589 void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
   2590 {
   2591     intptr_t i, oprsz = simd_oprsz(desc);
   2592 
   2593     for (i = 0; i < oprsz; i += sizeof(int16_t)) {
   2594         *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i));
   2595     }
   2596 }
   2597 
   2598 void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
   2599 {
   2600     intptr_t i, oprsz = simd_oprsz(desc);
   2601 
   2602     for (i = 0; i < oprsz; i += sizeof(int32_t)) {
   2603         *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i));
   2604     }
   2605 }
   2606 
   2607 void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
   2608 {
   2609     intptr_t i, oprsz = simd_oprsz(desc);
   2610 
   2611     for (i = 0; i < oprsz; i += sizeof(int64_t)) {
   2612         *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i));
   2613     }
   2614 }
   2615 
   2616 /*
   2617  * Unsigned saturating addition with scalar operand.
   2618  */
   2619 
   2620 void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
   2621 {
   2622     intptr_t i, oprsz = simd_oprsz(desc);
   2623 
   2624     for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
   2625         *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i));
   2626     }
   2627 }
   2628 
   2629 void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
   2630 {
   2631     intptr_t i, oprsz = simd_oprsz(desc);
   2632 
   2633     for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
   2634         *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i));
   2635     }
   2636 }
   2637 
   2638 void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
   2639 {
   2640     intptr_t i, oprsz = simd_oprsz(desc);
   2641 
   2642     for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
   2643         *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i));
   2644     }
   2645 }
   2646 
   2647 void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
   2648 {
   2649     intptr_t i, oprsz = simd_oprsz(desc);
   2650 
   2651     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
   2652         *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i));
   2653     }
   2654 }
   2655 
   2656 void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
   2657 {
   2658     intptr_t i, oprsz = simd_oprsz(desc);
   2659 
   2660     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
   2661         *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b);
   2662     }
   2663 }
   2664 
   2665 /* Two operand predicated copy immediate with merge.  All valid immediates
   2666  * can fit within 17 signed bits in the simd_data field.
   2667  */
   2668 void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
   2669                          uint64_t mm, uint32_t desc)
   2670 {
   2671     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
   2672     uint64_t *d = vd, *n = vn;
   2673     uint8_t *pg = vg;
   2674 
   2675     mm = dup_const(MO_8, mm);
   2676     for (i = 0; i < opr_sz; i += 1) {
   2677         uint64_t nn = n[i];
   2678         uint64_t pp = expand_pred_b(pg[H1(i)]);
   2679         d[i] = (mm & pp) | (nn & ~pp);
   2680     }
   2681 }
   2682 
   2683 void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
   2684                          uint64_t mm, uint32_t desc)
   2685 {
   2686     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
   2687     uint64_t *d = vd, *n = vn;
   2688     uint8_t *pg = vg;
   2689 
   2690     mm = dup_const(MO_16, mm);
   2691     for (i = 0; i < opr_sz; i += 1) {
   2692         uint64_t nn = n[i];
   2693         uint64_t pp = expand_pred_h(pg[H1(i)]);
   2694         d[i] = (mm & pp) | (nn & ~pp);
   2695     }
   2696 }
   2697 
   2698 void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
   2699                          uint64_t mm, uint32_t desc)
   2700 {
   2701     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
   2702     uint64_t *d = vd, *n = vn;
   2703     uint8_t *pg = vg;
   2704 
   2705     mm = dup_const(MO_32, mm);
   2706     for (i = 0; i < opr_sz; i += 1) {
   2707         uint64_t nn = n[i];
   2708         uint64_t pp = expand_pred_s(pg[H1(i)]);
   2709         d[i] = (mm & pp) | (nn & ~pp);
   2710     }
   2711 }
   2712 
   2713 void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
   2714                          uint64_t mm, uint32_t desc)
   2715 {
   2716     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
   2717     uint64_t *d = vd, *n = vn;
   2718     uint8_t *pg = vg;
   2719 
   2720     for (i = 0; i < opr_sz; i += 1) {
   2721         uint64_t nn = n[i];
   2722         d[i] = (pg[H1(i)] & 1 ? mm : nn);
   2723     }
   2724 }
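         /*
          * The merge above works by expanding each predicate bit into a full
          * byte mask: expand_pred_b(0x05) == 0x0000000000ff00ffull, so bytes 0
          * and 2 of the 64-bit lane take the immediate while the others keep
          * the original value via (mm & pp) | (nn & ~pp).  The _h and _s
          * variants use expand_pred_h/_s, for which only the low bit of each
          * 2- or 4-bit predicate field is significant.
          */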
   2725 
   2726 void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
   2727 {
   2728     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
   2729     uint64_t *d = vd;
   2730     uint8_t *pg = vg;
   2731 
   2732     val = dup_const(MO_8, val);
   2733     for (i = 0; i < opr_sz; i += 1) {
   2734         d[i] = val & expand_pred_b(pg[H1(i)]);
   2735     }
   2736 }
   2737 
   2738 void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
   2739 {
   2740     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
   2741     uint64_t *d = vd;
   2742     uint8_t *pg = vg;
   2743 
   2744     val = dup_const(MO_16, val);
   2745     for (i = 0; i < opr_sz; i += 1) {
   2746         d[i] = val & expand_pred_h(pg[H1(i)]);
   2747     }
   2748 }
   2749 
   2750 void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
   2751 {
   2752     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
   2753     uint64_t *d = vd;
   2754     uint8_t *pg = vg;
   2755 
   2756     val = dup_const(MO_32, val);
   2757     for (i = 0; i < opr_sz; i += 1) {
   2758         d[i] = val & expand_pred_s(pg[H1(i)]);
   2759     }
   2760 }
   2761 
   2762 void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
   2763 {
   2764     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
   2765     uint64_t *d = vd;
   2766     uint8_t *pg = vg;
   2767 
   2768     for (i = 0; i < opr_sz; i += 1) {
   2769         d[i] = (pg[H1(i)] & 1 ? val : 0);
   2770     }
   2771 }
   2772 
   2773 /* Big-endian hosts need to frob the byte indices.  If the copy
    2774  * happens to be 8-byte aligned, then no frobbing is necessary.
   2775  */
   2776 static void swap_memmove(void *vd, void *vs, size_t n)
   2777 {
   2778     uintptr_t d = (uintptr_t)vd;
   2779     uintptr_t s = (uintptr_t)vs;
   2780     uintptr_t o = (d | s | n) & 7;
   2781     size_t i;
   2782 
   2783 #if !HOST_BIG_ENDIAN
   2784     o = 0;
   2785 #endif
   2786     switch (o) {
   2787     case 0:
   2788         memmove(vd, vs, n);
   2789         break;
   2790 
   2791     case 4:
   2792         if (d < s || d >= s + n) {
   2793             for (i = 0; i < n; i += 4) {
   2794                 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
   2795             }
   2796         } else {
   2797             for (i = n; i > 0; ) {
   2798                 i -= 4;
   2799                 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
   2800             }
   2801         }
   2802         break;
   2803 
   2804     case 2:
   2805     case 6:
   2806         if (d < s || d >= s + n) {
   2807             for (i = 0; i < n; i += 2) {
   2808                 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
   2809             }
   2810         } else {
   2811             for (i = n; i > 0; ) {
   2812                 i -= 2;
   2813                 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
   2814             }
   2815         }
   2816         break;
   2817 
   2818     default:
   2819         if (d < s || d >= s + n) {
   2820             for (i = 0; i < n; i++) {
   2821                 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
   2822             }
   2823         } else {
   2824             for (i = n; i > 0; ) {
   2825                 i -= 1;
   2826                 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
   2827             }
   2828         }
   2829         break;
   2830     }
   2831 }
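         /*
          * The frobbing consists of XOR-ing the byte offset so that elements
          * keep their little-endian positions within each aligned 8-byte unit;
          * e.g. on a big-endian host H1_4() XORs the offset with 4, swapping
          * the two 32-bit halves of every 64-bit quantity.  On little-endian
          * hosts o is forced to 0 and the plain memmove path is always taken.
          */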
   2832 
   2833 /* Similarly for memset of 0.  */
   2834 static void swap_memzero(void *vd, size_t n)
   2835 {
   2836     uintptr_t d = (uintptr_t)vd;
   2837     uintptr_t o = (d | n) & 7;
   2838     size_t i;
   2839 
   2840     /* Usually, the first bit of a predicate is set, so N is 0.  */
   2841     if (likely(n == 0)) {
   2842         return;
   2843     }
   2844 
   2845 #if !HOST_BIG_ENDIAN
   2846     o = 0;
   2847 #endif
   2848     switch (o) {
   2849     case 0:
   2850         memset(vd, 0, n);
   2851         break;
   2852 
   2853     case 4:
   2854         for (i = 0; i < n; i += 4) {
   2855             *(uint32_t *)H1_4(d + i) = 0;
   2856         }
   2857         break;
   2858 
   2859     case 2:
   2860     case 6:
   2861         for (i = 0; i < n; i += 2) {
   2862             *(uint16_t *)H1_2(d + i) = 0;
   2863         }
   2864         break;
   2865 
   2866     default:
   2867         for (i = 0; i < n; i++) {
   2868             *(uint8_t *)H1(d + i) = 0;
   2869         }
   2870         break;
   2871     }
   2872 }
   2873 
   2874 void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
   2875 {
   2876     intptr_t opr_sz = simd_oprsz(desc);
   2877     size_t n_ofs = simd_data(desc);
   2878     size_t n_siz = opr_sz - n_ofs;
   2879 
   2880     if (vd != vm) {
   2881         swap_memmove(vd, vn + n_ofs, n_siz);
   2882         swap_memmove(vd + n_siz, vm, n_ofs);
   2883     } else if (vd != vn) {
   2884         swap_memmove(vd + n_siz, vd, n_ofs);
   2885         swap_memmove(vd, vn + n_ofs, n_siz);
   2886     } else {
   2887         /* vd == vn == vm.  Need temp space.  */
   2888         ARMVectorReg tmp;
   2889         swap_memmove(&tmp, vm, n_ofs);
   2890         swap_memmove(vd, vd + n_ofs, n_siz);
   2891         memcpy(vd + n_siz, &tmp, n_ofs);
   2892     }
   2893 }
   2894 
   2895 #define DO_INSR(NAME, TYPE, H) \
   2896 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
   2897 {                                                                  \
   2898     intptr_t opr_sz = simd_oprsz(desc);                            \
   2899     swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE));    \
   2900     *(TYPE *)(vd + H(0)) = val;                                    \
   2901 }
   2902 
   2903 DO_INSR(sve_insr_b, uint8_t, H1)
   2904 DO_INSR(sve_insr_h, uint16_t, H1_2)
   2905 DO_INSR(sve_insr_s, uint32_t, H1_4)
   2906 DO_INSR(sve_insr_d, uint64_t, H1_8)
   2907 
   2908 #undef DO_INSR
   2909 
   2910 void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
   2911 {
   2912     intptr_t i, j, opr_sz = simd_oprsz(desc);
   2913     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
   2914         uint64_t f = *(uint64_t *)(vn + i);
   2915         uint64_t b = *(uint64_t *)(vn + j);
   2916         *(uint64_t *)(vd + i) = bswap64(b);
   2917         *(uint64_t *)(vd + j) = bswap64(f);
   2918     }
   2919 }
   2920 
   2921 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
   2922 {
   2923     intptr_t i, j, opr_sz = simd_oprsz(desc);
   2924     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
   2925         uint64_t f = *(uint64_t *)(vn + i);
   2926         uint64_t b = *(uint64_t *)(vn + j);
   2927         *(uint64_t *)(vd + i) = hswap64(b);
   2928         *(uint64_t *)(vd + j) = hswap64(f);
   2929     }
   2930 }
   2931 
   2932 void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
   2933 {
   2934     intptr_t i, j, opr_sz = simd_oprsz(desc);
   2935     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
   2936         uint64_t f = *(uint64_t *)(vn + i);
   2937         uint64_t b = *(uint64_t *)(vn + j);
   2938         *(uint64_t *)(vd + i) = rol64(b, 32);
   2939         *(uint64_t *)(vd + j) = rol64(f, 32);
   2940     }
   2941 }
   2942 
   2943 void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
   2944 {
   2945     intptr_t i, j, opr_sz = simd_oprsz(desc);
   2946     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
   2947         uint64_t f = *(uint64_t *)(vn + i);
   2948         uint64_t b = *(uint64_t *)(vn + j);
   2949         *(uint64_t *)(vd + i) = b;
   2950         *(uint64_t *)(vd + j) = f;
   2951     }
   2952 }
   2953 
   2954 typedef void tb_impl_fn(void *, void *, void *, void *, uintptr_t, bool);
   2955 
   2956 static inline void do_tbl1(void *vd, void *vn, void *vm, uint32_t desc,
   2957                            bool is_tbx, tb_impl_fn *fn)
   2958 {
   2959     ARMVectorReg scratch;
   2960     uintptr_t oprsz = simd_oprsz(desc);
   2961 
   2962     if (unlikely(vd == vn)) {
   2963         vn = memcpy(&scratch, vn, oprsz);
   2964     }
   2965 
   2966     fn(vd, vn, NULL, vm, oprsz, is_tbx);
   2967 }
   2968 
   2969 static inline void do_tbl2(void *vd, void *vn0, void *vn1, void *vm,
   2970                            uint32_t desc, bool is_tbx, tb_impl_fn *fn)
   2971 {
   2972     ARMVectorReg scratch;
   2973     uintptr_t oprsz = simd_oprsz(desc);
   2974 
   2975     if (unlikely(vd == vn0)) {
   2976         vn0 = memcpy(&scratch, vn0, oprsz);
   2977         if (vd == vn1) {
   2978             vn1 = vn0;
   2979         }
   2980     } else if (unlikely(vd == vn1)) {
   2981         vn1 = memcpy(&scratch, vn1, oprsz);
   2982     }
   2983 
   2984     fn(vd, vn0, vn1, vm, oprsz, is_tbx);
   2985 }
   2986 
   2987 #define DO_TB(SUFF, TYPE, H)                                            \
   2988 static inline void do_tb_##SUFF(void *vd, void *vt0, void *vt1,         \
   2989                                 void *vm, uintptr_t oprsz, bool is_tbx) \
   2990 {                                                                       \
   2991     TYPE *d = vd, *tbl0 = vt0, *tbl1 = vt1, *indexes = vm;              \
   2992     uintptr_t i, nelem = oprsz / sizeof(TYPE);                          \
   2993     for (i = 0; i < nelem; ++i) {                                       \
   2994         TYPE index = indexes[H1(i)], val = 0;                           \
   2995         if (index < nelem) {                                            \
   2996             val = tbl0[H(index)];                                       \
   2997         } else {                                                        \
   2998             index -= nelem;                                             \
   2999             if (tbl1 && index < nelem) {                                \
   3000                 val = tbl1[H(index)];                                   \
   3001             } else if (is_tbx) {                                        \
   3002                 continue;                                               \
   3003             }                                                           \
   3004         }                                                               \
   3005         d[H(i)] = val;                                                  \
   3006     }                                                                   \
   3007 }                                                                       \
   3008 void HELPER(sve_tbl_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
   3009 {                                                                       \
   3010     do_tbl1(vd, vn, vm, desc, false, do_tb_##SUFF);                     \
   3011 }                                                                       \
   3012 void HELPER(sve2_tbl_##SUFF)(void *vd, void *vn0, void *vn1,            \
   3013                              void *vm, uint32_t desc)                   \
   3014 {                                                                       \
   3015     do_tbl2(vd, vn0, vn1, vm, desc, false, do_tb_##SUFF);               \
   3016 }                                                                       \
   3017 void HELPER(sve2_tbx_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
   3018 {                                                                       \
   3019     do_tbl1(vd, vn, vm, desc, true, do_tb_##SUFF);                      \
   3020 }
   3021 
   3022 DO_TB(b, uint8_t, H1)
   3023 DO_TB(h, uint16_t, H2)
   3024 DO_TB(s, uint32_t, H4)
   3025 DO_TB(d, uint64_t, H8)
   3026 
   3027 #undef DO_TB
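         /*
          * The generated helpers cover both TBL and TBX: an index beyond the
          * available table entries yields zero for TBL, while TBX (is_tbx)
          * leaves the corresponding destination element unchanged by skipping
          * the store.
          */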
   3028 
   3029 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
   3030 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)           \
   3031 {                                                              \
   3032     intptr_t i, opr_sz = simd_oprsz(desc);                     \
   3033     TYPED *d = vd;                                             \
   3034     TYPES *n = vn;                                             \
   3035     ARMVectorReg tmp;                                          \
   3036     if (unlikely(vn - vd < opr_sz)) {                          \
   3037         n = memcpy(&tmp, n, opr_sz / 2);                       \
   3038     }                                                          \
   3039     for (i = 0; i < opr_sz / sizeof(TYPED); i++) {             \
   3040         d[HD(i)] = n[HS(i)];                                   \
   3041     }                                                          \
   3042 }
   3043 
   3044 DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
   3045 DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
   3046 DO_UNPK(sve_sunpk_d, int64_t, int32_t, H8, H4)
   3047 
   3048 DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
   3049 DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
   3050 DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, H8, H4)
   3051 
   3052 #undef DO_UNPK
   3053 
    3054 /* Mask of bits included in the even-numbered predicates of width esz.
   3055  * We also use this for expand_bits/compress_bits, and so extend the
   3056  * same pattern out to 16-bit units.
   3057  */
   3058 static const uint64_t even_bit_esz_masks[5] = {
   3059     0x5555555555555555ull,
   3060     0x3333333333333333ull,
   3061     0x0f0f0f0f0f0f0f0full,
   3062     0x00ff00ff00ff00ffull,
   3063     0x0000ffff0000ffffull,
   3064 };
   3065 
   3066 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
   3067  * For N==0, this corresponds to the operation that in qemu/bitops.h
   3068  * we call half_shuffle64; this algorithm is from Hacker's Delight,
   3069  * section 7-2 Shuffling Bits.
   3070  */
   3071 static uint64_t expand_bits(uint64_t x, int n)
   3072 {
   3073     int i;
   3074 
   3075     x &= 0xffffffffu;
   3076     for (i = 4; i >= n; i--) {
   3077         int sh = 1 << i;
   3078         x = ((x << sh) | x) & even_bit_esz_masks[i];
   3079     }
   3080     return x;
   3081 }
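         /*
          * For example, expand_bits(0b1011, 0) interleaves a zero after every
          * source bit, giving 0b01000101 (0x45): bit k of the input lands at
          * bit 2*k of the result.  Larger n values stop the shuffle early, so
          * 2**n-bit groups are spread out instead of single bits.
          */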
   3082 
   3083 /* Compress units of 2**(N+1) bits to units of 2**N bits.
   3084  * For N==0, this corresponds to the operation that in qemu/bitops.h
   3085  * we call half_unshuffle64; this algorithm is from Hacker's Delight,
   3086  * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
   3087  */
   3088 static uint64_t compress_bits(uint64_t x, int n)
   3089 {
   3090     int i;
   3091 
   3092     for (i = n; i <= 4; i++) {
   3093         int sh = 1 << i;
   3094         x &= even_bit_esz_masks[i];
   3095         x = (x >> sh) | x;
   3096     }
   3097     return x & 0xffffffffu;
   3098 }
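         /*
          * compress_bits() is the inverse for the same n: compress_bits(0x45, 0)
          * recovers 0b1011, discarding the interleaved odd bits.  Only the low
          * 32 bits of the result are meaningful, matching the 32-bit input
          * limit of expand_bits().
          */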
   3099 
   3100 void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
   3101 {
   3102     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
   3103     int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
   3104     intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
   3105     int esize = 1 << esz;
   3106     uint64_t *d = vd;
   3107     intptr_t i;
   3108 
   3109     if (oprsz <= 8) {
   3110         uint64_t nn = *(uint64_t *)vn;
   3111         uint64_t mm = *(uint64_t *)vm;
   3112         int half = 4 * oprsz;
   3113 
   3114         nn = extract64(nn, high * half, half);
   3115         mm = extract64(mm, high * half, half);
   3116         nn = expand_bits(nn, esz);
   3117         mm = expand_bits(mm, esz);
   3118         d[0] = nn | (mm << esize);
   3119     } else {
   3120         ARMPredicateReg tmp;
   3121 
   3122         /* We produce output faster than we consume input.
   3123            Therefore we must be mindful of possible overlap.  */
   3124         if (vd == vn) {
   3125             vn = memcpy(&tmp, vn, oprsz);
   3126             if (vd == vm) {
   3127                 vm = vn;
   3128             }
   3129         } else if (vd == vm) {
   3130             vm = memcpy(&tmp, vm, oprsz);
   3131         }
   3132         if (high) {
   3133             high = oprsz >> 1;
   3134         }
   3135 
   3136         if ((oprsz & 7) == 0) {
   3137             uint32_t *n = vn, *m = vm;
   3138             high >>= 2;
   3139 
   3140             for (i = 0; i < oprsz / 8; i++) {
   3141                 uint64_t nn = n[H4(high + i)];
   3142                 uint64_t mm = m[H4(high + i)];
   3143 
   3144                 nn = expand_bits(nn, esz);
   3145                 mm = expand_bits(mm, esz);
   3146                 d[i] = nn | (mm << esize);
   3147             }
   3148         } else {
   3149             uint8_t *n = vn, *m = vm;
   3150             uint16_t *d16 = vd;
   3151 
   3152             for (i = 0; i < oprsz / 2; i++) {
   3153                 uint16_t nn = n[H1(high + i)];
   3154                 uint16_t mm = m[H1(high + i)];
   3155 
   3156                 nn = expand_bits(nn, esz);
   3157                 mm = expand_bits(mm, esz);
   3158                 d16[H2(i)] = nn | (mm << esize);
   3159             }
   3160         }
   3161     }
   3162 }
   3163 
   3164 void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
   3165 {
   3166     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
   3167     int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
   3168     int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz;
   3169     uint64_t *d = vd, *n = vn, *m = vm;
   3170     uint64_t l, h;
   3171     intptr_t i;
   3172 
   3173     if (oprsz <= 8) {
   3174         l = compress_bits(n[0] >> odd, esz);
   3175         h = compress_bits(m[0] >> odd, esz);
   3176         d[0] = l | (h << (4 * oprsz));
   3177     } else {
   3178         ARMPredicateReg tmp_m;
   3179         intptr_t oprsz_16 = oprsz / 16;
   3180 
   3181         if ((vm - vd) < (uintptr_t)oprsz) {
   3182             m = memcpy(&tmp_m, vm, oprsz);
   3183         }
   3184 
   3185         for (i = 0; i < oprsz_16; i++) {
   3186             l = n[2 * i + 0];
   3187             h = n[2 * i + 1];
   3188             l = compress_bits(l >> odd, esz);
   3189             h = compress_bits(h >> odd, esz);
   3190             d[i] = l | (h << 32);
   3191         }
   3192 
   3193         /*
    3194          * For a VL that is not a multiple of 512, the results from M do not
   3195          * align nicely with the uint64_t for D.  Put the aligned results
   3196          * from M into TMP_M and then copy it into place afterward.
   3197          */
   3198         if (oprsz & 15) {
   3199             int final_shift = (oprsz & 15) * 2;
   3200 
   3201             l = n[2 * i + 0];
   3202             h = n[2 * i + 1];
   3203             l = compress_bits(l >> odd, esz);
   3204             h = compress_bits(h >> odd, esz);
   3205             d[i] = l | (h << final_shift);
   3206 
   3207             for (i = 0; i < oprsz_16; i++) {
   3208                 l = m[2 * i + 0];
   3209                 h = m[2 * i + 1];
   3210                 l = compress_bits(l >> odd, esz);
   3211                 h = compress_bits(h >> odd, esz);
   3212                 tmp_m.p[i] = l | (h << 32);
   3213             }
   3214             l = m[2 * i + 0];
   3215             h = m[2 * i + 1];
   3216             l = compress_bits(l >> odd, esz);
   3217             h = compress_bits(h >> odd, esz);
   3218             tmp_m.p[i] = l | (h << final_shift);
   3219 
   3220             swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
   3221         } else {
   3222             for (i = 0; i < oprsz_16; i++) {
   3223                 l = m[2 * i + 0];
   3224                 h = m[2 * i + 1];
   3225                 l = compress_bits(l >> odd, esz);
   3226                 h = compress_bits(h >> odd, esz);
   3227                 d[oprsz_16 + i] = l | (h << 32);
   3228             }
   3229         }
   3230     }
   3231 }
   3232 
   3233 void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
   3234 {
   3235     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
   3236     int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
   3237     int odd = FIELD_EX32(pred_desc, PREDDESC, DATA);
   3238     uint64_t *d = vd, *n = vn, *m = vm;
   3239     uint64_t mask;
   3240     int shr, shl;
   3241     intptr_t i;
   3242 
   3243     shl = 1 << esz;
   3244     shr = 0;
   3245     mask = even_bit_esz_masks[esz];
   3246     if (odd) {
   3247         mask <<= shl;
   3248         shr = shl;
   3249         shl = 0;
   3250     }
   3251 
   3252     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
   3253         uint64_t nn = (n[i] & mask) >> shr;
   3254         uint64_t mm = (m[i] & mask) << shl;
   3255         d[i] = nn + mm;
   3256     }
   3257 }
   3258 
   3259 /* Reverse units of 2**N bits.  */
   3260 static uint64_t reverse_bits_64(uint64_t x, int n)
   3261 {
   3262     int i, sh;
   3263 
   3264     x = bswap64(x);
   3265     for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
   3266         uint64_t mask = even_bit_esz_masks[i];
   3267         x = ((x & mask) << sh) | ((x >> sh) & mask);
   3268     }
   3269     return x;
   3270 }
   3271 
   3272 static uint8_t reverse_bits_8(uint8_t x, int n)
   3273 {
   3274     static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
   3275     int i, sh;
   3276 
   3277     for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
   3278         x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
   3279     }
   3280     return x;
   3281 }
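         /*
          * Two concrete cases: reverse_bits_64(x, 3) reduces to bswap64(x),
          * since reversing 8-bit units is exactly a byte swap and the
          * refinement loop never runs; reverse_bits_8(0x06, 0) == 0x60, a full
          * bit reversal within the byte.
          */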
   3282 
   3283 void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
   3284 {
   3285     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
   3286     int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
   3287     intptr_t i, oprsz_2 = oprsz / 2;
   3288 
   3289     if (oprsz <= 8) {
   3290         uint64_t l = *(uint64_t *)vn;
   3291         l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
   3292         *(uint64_t *)vd = l;
   3293     } else if ((oprsz & 15) == 0) {
   3294         for (i = 0; i < oprsz_2; i += 8) {
   3295             intptr_t ih = oprsz - 8 - i;
   3296             uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
   3297             uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
   3298             *(uint64_t *)(vd + i) = h;
   3299             *(uint64_t *)(vd + ih) = l;
   3300         }
   3301     } else {
   3302         for (i = 0; i < oprsz_2; i += 1) {
   3303             intptr_t il = H1(i);
   3304             intptr_t ih = H1(oprsz - 1 - i);
   3305             uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
   3306             uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
   3307             *(uint8_t *)(vd + il) = h;
   3308             *(uint8_t *)(vd + ih) = l;
   3309         }
   3310     }
   3311 }
   3312 
   3313 void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
   3314 {
   3315     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
   3316     intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
   3317     uint64_t *d = vd;
   3318     intptr_t i;
   3319 
   3320     if (oprsz <= 8) {
   3321         uint64_t nn = *(uint64_t *)vn;
   3322         int half = 4 * oprsz;
   3323 
   3324         nn = extract64(nn, high * half, half);
   3325         nn = expand_bits(nn, 0);
   3326         d[0] = nn;
   3327     } else {
   3328         ARMPredicateReg tmp_n;
   3329 
   3330         /* We produce output faster than we consume input.
   3331            Therefore we must be mindful of possible overlap.  */
   3332         if ((vn - vd) < (uintptr_t)oprsz) {
   3333             vn = memcpy(&tmp_n, vn, oprsz);
   3334         }
   3335         if (high) {
   3336             high = oprsz >> 1;
   3337         }
   3338 
   3339         if ((oprsz & 7) == 0) {
   3340             uint32_t *n = vn;
   3341             high >>= 2;
   3342 
   3343             for (i = 0; i < oprsz / 8; i++) {
   3344                 uint64_t nn = n[H4(high + i)];
   3345                 d[i] = expand_bits(nn, 0);
   3346             }
   3347         } else {
   3348             uint16_t *d16 = vd;
   3349             uint8_t *n = vn;
   3350 
   3351             for (i = 0; i < oprsz / 2; i++) {
   3352                 uint16_t nn = n[H1(high + i)];
   3353                 d16[H2(i)] = expand_bits(nn, 0);
   3354             }
   3355         }
   3356     }
   3357 }
   3358 
   3359 #define DO_ZIP(NAME, TYPE, H) \
   3360 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)       \
   3361 {                                                                    \
   3362     intptr_t oprsz = simd_oprsz(desc);                               \
   3363     intptr_t odd_ofs = simd_data(desc);                              \
   3364     intptr_t i, oprsz_2 = oprsz / 2;                                 \
   3365     ARMVectorReg tmp_n, tmp_m;                                       \
   3366     /* We produce output faster than we consume input.               \
   3367        Therefore we must be mindful of possible overlap.  */         \
   3368     if (unlikely((vn - vd) < (uintptr_t)oprsz)) {                    \
   3369         vn = memcpy(&tmp_n, vn, oprsz);                              \
   3370     }                                                                \
   3371     if (unlikely((vm - vd) < (uintptr_t)oprsz)) {                    \
   3372         vm = memcpy(&tmp_m, vm, oprsz);                              \
   3373     }                                                                \
   3374     for (i = 0; i < oprsz_2; i += sizeof(TYPE)) {                    \
   3375         *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + odd_ofs + H(i)); \
   3376         *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) =                    \
   3377             *(TYPE *)(vm + odd_ofs + H(i));                          \
   3378     }                                                                \
   3379     if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) {                \
   3380         memset(vd + oprsz - 16, 0, 16);                              \
   3381     }                                                                \
   3382 }
   3383 
   3384 DO_ZIP(sve_zip_b, uint8_t, H1)
   3385 DO_ZIP(sve_zip_h, uint16_t, H1_2)
   3386 DO_ZIP(sve_zip_s, uint32_t, H1_4)
   3387 DO_ZIP(sve_zip_d, uint64_t, H1_8)
   3388 DO_ZIP(sve2_zip_q, Int128, )
   3389 
   3390 #define DO_UZP(NAME, TYPE, H) \
   3391 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)         \
   3392 {                                                                      \
   3393     intptr_t oprsz = simd_oprsz(desc);                                 \
   3394     intptr_t odd_ofs = simd_data(desc);                                \
   3395     intptr_t i, p;                                                     \
   3396     ARMVectorReg tmp_m;                                                \
   3397     if (unlikely((vm - vd) < (uintptr_t)oprsz)) {                      \
   3398         vm = memcpy(&tmp_m, vm, oprsz);                                \
   3399     }                                                                  \
   3400     i = 0, p = odd_ofs;                                                \
   3401     do {                                                               \
   3402         *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(p));                   \
   3403         i += sizeof(TYPE), p += 2 * sizeof(TYPE);                      \
   3404     } while (p < oprsz);                                               \
   3405     p -= oprsz;                                                        \
   3406     do {                                                               \
   3407         *(TYPE *)(vd + H(i)) = *(TYPE *)(vm + H(p));                   \
   3408         i += sizeof(TYPE), p += 2 * sizeof(TYPE);                      \
   3409     } while (p < oprsz);                                               \
   3410     tcg_debug_assert(i == oprsz);                                      \
   3411 }
   3412 
   3413 DO_UZP(sve_uzp_b, uint8_t, H1)
   3414 DO_UZP(sve_uzp_h, uint16_t, H1_2)
   3415 DO_UZP(sve_uzp_s, uint32_t, H1_4)
   3416 DO_UZP(sve_uzp_d, uint64_t, H1_8)
   3417 DO_UZP(sve2_uzp_q, Int128, )
   3418 
   3419 #define DO_TRN(NAME, TYPE, H) \
   3420 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)         \
   3421 {                                                                      \
   3422     intptr_t oprsz = simd_oprsz(desc);                                 \
   3423     intptr_t odd_ofs = simd_data(desc);                                \
   3424     intptr_t i;                                                        \
   3425     for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) {                    \
   3426         TYPE ae = *(TYPE *)(vn + H(i + odd_ofs));                      \
   3427         TYPE be = *(TYPE *)(vm + H(i + odd_ofs));                      \
   3428         *(TYPE *)(vd + H(i + 0)) = ae;                                 \
   3429         *(TYPE *)(vd + H(i + sizeof(TYPE))) = be;                      \
   3430     }                                                                  \
   3431     if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) {                  \
   3432         memset(vd + oprsz - 16, 0, 16);                                \
   3433     }                                                                  \
   3434 }
   3435 
   3436 DO_TRN(sve_trn_b, uint8_t, H1)
   3437 DO_TRN(sve_trn_h, uint16_t, H1_2)
   3438 DO_TRN(sve_trn_s, uint32_t, H1_4)
   3439 DO_TRN(sve_trn_d, uint64_t, H1_8)
   3440 DO_TRN(sve2_trn_q, Int128, )
   3441 
   3442 #undef DO_ZIP
   3443 #undef DO_UZP
   3444 #undef DO_TRN
   3445 
   3446 void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
   3447 {
   3448     intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
   3449     uint32_t *d = vd, *n = vn;
   3450     uint8_t *pg = vg;
   3451 
   3452     for (i = j = 0; i < opr_sz; i++) {
   3453         if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
   3454             d[H4(j)] = n[H4(i)];
   3455             j++;
   3456         }
   3457     }
   3458     for (; j < opr_sz; j++) {
   3459         d[H4(j)] = 0;
   3460     }
   3461 }
   3462 
   3463 void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
   3464 {
   3465     intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
   3466     uint64_t *d = vd, *n = vn;
   3467     uint8_t *pg = vg;
   3468 
   3469     for (i = j = 0; i < opr_sz; i++) {
   3470         if (pg[H1(i)] & 1) {
   3471             d[j] = n[i];
   3472             j++;
   3473         }
   3474     }
   3475     for (; j < opr_sz; j++) {
   3476         d[j] = 0;
   3477     }
   3478 }
   3479 
   3480 /* Similar to the ARM LastActiveElement pseudocode function, except the
   3481  * result is multiplied by the element size.  This includes the not found
   3482  * indication; e.g. not found for esz=3 is -8.
   3483  */
   3484 int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
   3485 {
   3486     intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
   3487     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
   3488 
   3489     return last_active_element(vg, words, esz);
   3490 }
   3491 
   3492 void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
   3493 {
   3494     intptr_t opr_sz = simd_oprsz(desc) / 8;
   3495     int esz = simd_data(desc);
   3496     uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
   3497     intptr_t i, first_i, last_i;
   3498     ARMVectorReg tmp;
   3499 
   3500     first_i = last_i = 0;
   3501     first_g = last_g = 0;
   3502 
   3503     /* Find the extent of the active elements within VG.  */
   3504     for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
   3505         pg = *(uint64_t *)(vg + i) & mask;
   3506         if (pg) {
   3507             if (last_g == 0) {
   3508                 last_g = pg;
   3509                 last_i = i;
   3510             }
   3511             first_g = pg;
   3512             first_i = i;
   3513         }
   3514     }
   3515 
   3516     len = 0;
   3517     if (first_g != 0) {
   3518         first_i = first_i * 8 + ctz64(first_g);
   3519         last_i = last_i * 8 + 63 - clz64(last_g);
   3520         len = last_i - first_i + (1 << esz);
   3521         if (vd == vm) {
   3522             vm = memcpy(&tmp, vm, opr_sz * 8);
   3523         }
   3524         swap_memmove(vd, vn + first_i, len);
   3525     }
   3526     swap_memmove(vd + len, vm, opr_sz * 8 - len);
   3527 }
   3528 
   3529 void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
   3530                             void *vg, uint32_t desc)
   3531 {
   3532     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
   3533     uint64_t *d = vd, *n = vn, *m = vm;
   3534     uint8_t *pg = vg;
   3535 
   3536     for (i = 0; i < opr_sz; i += 1) {
   3537         uint64_t nn = n[i], mm = m[i];
   3538         uint64_t pp = expand_pred_b(pg[H1(i)]);
   3539         d[i] = (nn & pp) | (mm & ~pp);
   3540     }
   3541 }
   3542 
   3543 void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
   3544                             void *vg, uint32_t desc)
   3545 {
   3546     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
   3547     uint64_t *d = vd, *n = vn, *m = vm;
   3548     uint8_t *pg = vg;
   3549 
   3550     for (i = 0; i < opr_sz; i += 1) {
   3551         uint64_t nn = n[i], mm = m[i];
   3552         uint64_t pp = expand_pred_h(pg[H1(i)]);
   3553         d[i] = (nn & pp) | (mm & ~pp);
   3554     }
   3555 }
   3556 
   3557 void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
   3558                             void *vg, uint32_t desc)
   3559 {
   3560     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
   3561     uint64_t *d = vd, *n = vn, *m = vm;
   3562     uint8_t *pg = vg;
   3563 
   3564     for (i = 0; i < opr_sz; i += 1) {
   3565         uint64_t nn = n[i], mm = m[i];
   3566         uint64_t pp = expand_pred_s(pg[H1(i)]);
   3567         d[i] = (nn & pp) | (mm & ~pp);
   3568     }
   3569 }
   3570 
   3571 void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
   3572                             void *vg, uint32_t desc)
   3573 {
   3574     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
   3575     uint64_t *d = vd, *n = vn, *m = vm;
   3576     uint8_t *pg = vg;
   3577 
   3578     for (i = 0; i < opr_sz; i += 1) {
   3579         uint64_t nn = n[i], mm = m[i];
   3580         d[i] = (pg[H1(i)] & 1 ? nn : mm);
   3581     }
   3582 }
   3583 
   3584 void HELPER(sve_sel_zpzz_q)(void *vd, void *vn, void *vm,
   3585                             void *vg, uint32_t desc)
   3586 {
   3587     intptr_t i, opr_sz = simd_oprsz(desc) / 16;
   3588     Int128 *d = vd, *n = vn, *m = vm;
   3589     uint16_t *pg = vg;
   3590 
   3591     for (i = 0; i < opr_sz; i += 1) {
   3592         d[i] = (pg[H2(i)] & 1 ? n : m)[i];
   3593     }
   3594 }
   3595 
   3596 /* Two operand comparison controlled by a predicate.
    3597  * ??? It is very tempting to expand this inline
   3598  * with x86 instructions, e.g.
   3599  *
   3600  *    vcmpeqw    zm, zn, %ymm0
   3601  *    vpmovmskb  %ymm0, %eax
   3602  *    and        $0x5555, %eax
   3603  *    and        pg, %eax
   3604  *
   3605  * or even aarch64, e.g.
   3606  *
   3607  *    // mask = 4000 1000 0400 0100 0040 0010 0004 0001
   3608  *    cmeq       v0.8h, zn, zm
   3609  *    and        v0.8h, v0.8h, mask
   3610  *    addv       h0, v0.8h
   3611  *    and        v0.8b, pg
   3612  *
   3613  * However, coming up with an abstraction that allows vector inputs and
   3614  * a scalar output, and also handles the byte-ordering of sub-uint64_t
   3615  * scalar outputs, is tricky.
   3616  */
   3617 #define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK)                                 \
   3618 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
   3619 {                                                                            \
   3620     intptr_t opr_sz = simd_oprsz(desc);                                      \
   3621     uint32_t flags = PREDTEST_INIT;                                          \
   3622     intptr_t i = opr_sz;                                                     \
   3623     do {                                                                     \
   3624         uint64_t out = 0, pg;                                                \
   3625         do {                                                                 \
   3626             i -= sizeof(TYPE), out <<= sizeof(TYPE);                         \
   3627             TYPE nn = *(TYPE *)(vn + H(i));                                  \
   3628             TYPE mm = *(TYPE *)(vm + H(i));                                  \
   3629             out |= nn OP mm;                                                 \
   3630         } while (i & 63);                                                    \
   3631         pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                            \
   3632         out &= pg;                                                           \
   3633         *(uint64_t *)(vd + (i >> 3)) = out;                                  \
   3634         flags = iter_predtest_bwd(out, pg, flags);                           \
   3635     } while (i > 0);                                                         \
   3636     return flags;                                                            \
   3637 }
   3638 
   3639 #define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
   3640     DO_CMP_PPZZ(NAME, TYPE, OP, H1,   0xffffffffffffffffull)
   3641 #define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
   3642     DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
   3643 #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
   3644     DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
   3645 #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
   3646     DO_CMP_PPZZ(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)
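         /*
          * The MASK constants mirror the predicate layout of one bit per vector
          * byte: an element of 2**esz bytes owns 2**esz predicate bits, only
          * the lowest of which is significant, hence all bits for bytes,
          * 0x5555... for halfwords, 0x1111... for words and 0x0101... for
          * doublewords.  The expander's "out <<= sizeof(TYPE)" accumulates each
          * comparison result into exactly that low bit position for its element.
          */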
   3647 
   3648 DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t,  ==)
   3649 DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
   3650 DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
   3651 DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)
   3652 
   3653 DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t,  !=)
   3654 DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
   3655 DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
   3656 DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)
   3657 
   3658 DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t,  >)
   3659 DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
   3660 DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
   3661 DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)
   3662 
   3663 DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t,  >=)
   3664 DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
   3665 DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
   3666 DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)
   3667 
   3668 DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t,  >)
   3669 DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
   3670 DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
   3671 DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)
   3672 
   3673 DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t,  >=)
   3674 DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
   3675 DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
   3676 DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)
   3677 
   3678 #undef DO_CMP_PPZZ_B
   3679 #undef DO_CMP_PPZZ_H
   3680 #undef DO_CMP_PPZZ_S
   3681 #undef DO_CMP_PPZZ_D
   3682 #undef DO_CMP_PPZZ
   3683 
   3684 /* Similar, but the second source is "wide".  */
   3685 #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK)                     \
   3686 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
   3687 {                                                                            \
   3688     intptr_t opr_sz = simd_oprsz(desc);                                      \
   3689     uint32_t flags = PREDTEST_INIT;                                          \
   3690     intptr_t i = opr_sz;                                                     \
   3691     do {                                                                     \
   3692         uint64_t out = 0, pg;                                                \
   3693         do {                                                                 \
   3694             TYPEW mm = *(TYPEW *)(vm + i - 8);                               \
   3695             do {                                                             \
   3696                 i -= sizeof(TYPE), out <<= sizeof(TYPE);                     \
   3697                 TYPE nn = *(TYPE *)(vn + H(i));                              \
   3698                 out |= nn OP mm;                                             \
   3699             } while (i & 7);                                                 \
   3700         } while (i & 63);                                                    \
   3701         pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                            \
   3702         out &= pg;                                                           \
   3703         *(uint64_t *)(vd + (i >> 3)) = out;                                  \
   3704         flags = iter_predtest_bwd(out, pg, flags);                           \
   3705     } while (i > 0);                                                         \
   3706     return flags;                                                            \
   3707 }
   3708 
   3709 #define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
   3710     DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1,   0xffffffffffffffffull)
   3711 #define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
   3712     DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
   3713 #define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
   3714     DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
   3715 
   3716 DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t,  uint64_t, ==)
   3717 DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
   3718 DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)
   3719 
   3720 DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t,  uint64_t, !=)
   3721 DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
   3722 DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)
   3723 
   3724 DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t,   int64_t, >)
   3725 DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t,  int64_t, >)
   3726 DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t,  int64_t, >)
   3727 
   3728 DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t,   int64_t, >=)
   3729 DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t,  int64_t, >=)
   3730 DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t,  int64_t, >=)
   3731 
   3732 DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t,  uint64_t, >)
   3733 DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
   3734 DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)
   3735 
   3736 DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t,  uint64_t, >=)
   3737 DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
   3738 DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)
   3739 
   3740 DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t,   int64_t, <)
   3741 DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t,  int64_t, <)
   3742 DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t,  int64_t, <)
   3743 
   3744 DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t,   int64_t, <=)
   3745 DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t,  int64_t, <=)
   3746 DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t,  int64_t, <=)
   3747 
   3748 DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t,  uint64_t, <)
   3749 DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
   3750 DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)
   3751 
   3752 DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t,  uint64_t, <=)
   3753 DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
   3754 DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)
   3755 
   3756 #undef DO_CMP_PPZW_B
   3757 #undef DO_CMP_PPZW_H
   3758 #undef DO_CMP_PPZW_S
   3759 #undef DO_CMP_PPZW
   3760 
   3761 /* Similar, but the second source is immediate.  */
   3762 #define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK)                         \
   3763 uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)   \
   3764 {                                                                    \
   3765     intptr_t opr_sz = simd_oprsz(desc);                              \
   3766     uint32_t flags = PREDTEST_INIT;                                  \
   3767     TYPE mm = simd_data(desc);                                       \
   3768     intptr_t i = opr_sz;                                             \
   3769     do {                                                             \
   3770         uint64_t out = 0, pg;                                        \
   3771         do {                                                         \
   3772             i -= sizeof(TYPE), out <<= sizeof(TYPE);                 \
   3773             TYPE nn = *(TYPE *)(vn + H(i));                          \
   3774             out |= nn OP mm;                                         \
   3775         } while (i & 63);                                            \
   3776         pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                    \
   3777         out &= pg;                                                   \
   3778         *(uint64_t *)(vd + (i >> 3)) = out;                          \
   3779         flags = iter_predtest_bwd(out, pg, flags);                   \
   3780     } while (i > 0);                                                 \
   3781     return flags;                                                    \
   3782 }
   3783 
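/* The immediate operand travels in the descriptor's data field; simd_data()
 * recovers it sign-extended, and the assignment to TYPE narrows it to the
 * element's width and signedness, so one expander serves both the signed
 * and the unsigned immediate compares.
 */
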
   3784 #define DO_CMP_PPZI_B(NAME, TYPE, OP) \
   3785     DO_CMP_PPZI(NAME, TYPE, OP, H1,   0xffffffffffffffffull)
   3786 #define DO_CMP_PPZI_H(NAME, TYPE, OP) \
   3787     DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
   3788 #define DO_CMP_PPZI_S(NAME, TYPE, OP) \
   3789     DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
   3790 #define DO_CMP_PPZI_D(NAME, TYPE, OP) \
   3791     DO_CMP_PPZI(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)
   3792 
   3793 DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t,  ==)
   3794 DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
   3795 DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
   3796 DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)
   3797 
   3798 DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t,  !=)
   3799 DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
   3800 DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
   3801 DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)
   3802 
   3803 DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t,  >)
   3804 DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
   3805 DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
   3806 DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)
   3807 
   3808 DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t,  >=)
   3809 DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
   3810 DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
   3811 DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)
   3812 
   3813 DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t,  >)
   3814 DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
   3815 DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
   3816 DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)
   3817 
   3818 DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t,  >=)
   3819 DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
   3820 DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
   3821 DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)
   3822 
   3823 DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t,  <)
   3824 DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
   3825 DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
   3826 DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)
   3827 
   3828 DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t,  <=)
   3829 DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
   3830 DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
   3831 DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)
   3832 
   3833 DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t,  <)
   3834 DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
   3835 DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
   3836 DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)
   3837 
   3838 DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t,  <=)
   3839 DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
   3840 DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
   3841 DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)
   3842 
   3843 #undef DO_CMP_PPZI_B
   3844 #undef DO_CMP_PPZI_H
   3845 #undef DO_CMP_PPZI_S
   3846 #undef DO_CMP_PPZI_D
   3847 #undef DO_CMP_PPZI
   3848 
   3849 /* Similar to the ARM LastActive pseudocode function.  */
   3850 static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
   3851 {
   3852     intptr_t i;
   3853 
   3854     for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
   3855         uint64_t pg = *(uint64_t *)(vg + i);
   3856         if (pg) {
   3857             return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
   3858         }
   3859     }
   3860     return 0;
   3861 }
   3862 
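/* The scan runs from the highest predicate word down; pow2floor() isolates
 * the most significant guard bit, and the result is the value of the
 * corresponding bit of VD.  E.g. with pg == 0x14 the last guard bit is
 * 0x10, so the function returns whether bit 4 of that word of VD is set.
 */
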
   3863 /* Compute a mask into RETB that is true for all G, up to and including
   3864  * (if after) or excluding (if !after) the first G & N.
   3865  * Return true if BRK found.
   3866  */
   3867 static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
   3868                         bool brk, bool after)
   3869 {
   3870     uint64_t b;
   3871 
   3872     if (brk) {
   3873         b = 0;
   3874     } else if ((g & n) == 0) {
   3875         /* For all G, no N are set; break not found.  */
   3876         b = g;
   3877     } else {
   3878         /* Break somewhere in N.  Locate it.  */
   3879         b = g & n;            /* guard true, pred true */
   3880         b = b & -b;           /* first such */
   3881         if (after) {
   3882             b = b | (b - 1);  /* break after same */
   3883         } else {
   3884             b = b - 1;        /* break before same */
   3885         }
   3886         brk = true;
   3887     }
   3888 
   3889     *retb = b;
   3890     return brk;
   3891 }
   3892 
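/* Worked example: with g == 0xff and n == 0x10, b = g & n = 0x10 and
 * b & -b = 0x10.  For break-after the mask becomes 0x1f (elements up to and
 * including the first true one stay active); for break-before it becomes
 * 0x0f.  BRK is then true, so every later word collapses to zero.
 */
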
   3893 /* Compute a zeroing BRK.  */
   3894 static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
   3895                           intptr_t oprsz, bool after)
   3896 {
   3897     bool brk = false;
   3898     intptr_t i;
   3899 
   3900     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
   3901         uint64_t this_b, this_g = g[i];
   3902 
   3903         brk = compute_brk(&this_b, n[i], this_g, brk, after);
   3904         d[i] = this_b & this_g;
   3905     }
   3906 }
   3907 
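/* The zeroing forms store only THIS_B & THIS_G, so every element outside
 * the guard becomes inactive; the merging forms below additionally keep
 * the previous destination bits wherever the guard is false.
 */
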
   3908 /* Likewise, but also compute flags.  */
   3909 static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
   3910                                intptr_t oprsz, bool after)
   3911 {
   3912     uint32_t flags = PREDTEST_INIT;
   3913     bool brk = false;
   3914     intptr_t i;
   3915 
   3916     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
   3917         uint64_t this_b, this_d, this_g = g[i];
   3918 
   3919         brk = compute_brk(&this_b, n[i], this_g, brk, after);
   3920         d[i] = this_d = this_b & this_g;
   3921         flags = iter_predtest_fwd(this_d, this_g, flags);
   3922     }
   3923     return flags;
   3924 }
   3925 
   3926 /* Compute a merging BRK.  */
   3927 static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
   3928                           intptr_t oprsz, bool after)
   3929 {
   3930     bool brk = false;
   3931     intptr_t i;
   3932 
   3933     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
   3934         uint64_t this_b, this_g = g[i];
   3935 
   3936         brk = compute_brk(&this_b, n[i], this_g, brk, after);
   3937         d[i] = (this_b & this_g) | (d[i] & ~this_g);
   3938     }
   3939 }
   3940 
   3941 /* Likewise, but also compute flags.  */
   3942 static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
   3943                                intptr_t oprsz, bool after)
   3944 {
   3945     uint32_t flags = PREDTEST_INIT;
   3946     bool brk = false;
   3947     intptr_t i;
   3948 
    3949     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
   3950         uint64_t this_b, this_d = d[i], this_g = g[i];
   3951 
   3952         brk = compute_brk(&this_b, n[i], this_g, brk, after);
   3953         d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
   3954         flags = iter_predtest_fwd(this_d, this_g, flags);
   3955     }
   3956     return flags;
   3957 }
   3958 
   3959 static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
   3960 {
   3961     /* It is quicker to zero the whole predicate than loop on OPRSZ.
   3962      * The compiler should turn this into 4 64-bit integer stores.
   3963      */
   3964     memset(d, 0, sizeof(ARMPredicateReg));
   3965     return PREDTEST_INIT;
   3966 }
   3967 
   3968 void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
   3969                        uint32_t pred_desc)
   3970 {
   3971     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
   3972     if (last_active_pred(vn, vg, oprsz)) {
   3973         compute_brk_z(vd, vm, vg, oprsz, true);
   3974     } else {
   3975         do_zero(vd, oprsz);
   3976     }
   3977 }
   3978 
   3979 uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
   3980                             uint32_t pred_desc)
   3981 {
   3982     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
   3983     if (last_active_pred(vn, vg, oprsz)) {
   3984         return compute_brks_z(vd, vm, vg, oprsz, true);
   3985     } else {
   3986         return do_zero(vd, oprsz);
   3987     }
   3988 }
   3989 
   3990 void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
   3991                        uint32_t pred_desc)
   3992 {
   3993     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
   3994     if (last_active_pred(vn, vg, oprsz)) {
   3995         compute_brk_z(vd, vm, vg, oprsz, false);
   3996     } else {
   3997         do_zero(vd, oprsz);
   3998     }
   3999 }
   4000 
   4001 uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
   4002                             uint32_t pred_desc)
   4003 {
   4004     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
   4005     if (last_active_pred(vn, vg, oprsz)) {
   4006         return compute_brks_z(vd, vm, vg, oprsz, false);
   4007     } else {
   4008         return do_zero(vd, oprsz);
   4009     }
   4010 }
   4011 
   4012 void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
   4013 {
   4014     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
   4015     compute_brk_z(vd, vn, vg, oprsz, true);
   4016 }
   4017 
   4018 uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
   4019 {
   4020     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
   4021     return compute_brks_z(vd, vn, vg, oprsz, true);
   4022 }
   4023 
   4024 void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
   4025 {
   4026     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
   4027     compute_brk_z(vd, vn, vg, oprsz, false);
   4028 }
   4029 
   4030 uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
   4031 {
   4032     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
   4033     return compute_brks_z(vd, vn, vg, oprsz, false);
   4034 }
   4035 
   4036 void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
   4037 {
   4038     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
   4039     compute_brk_m(vd, vn, vg, oprsz, true);
   4040 }
   4041 
   4042 uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
   4043 {
   4044     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
   4045     return compute_brks_m(vd, vn, vg, oprsz, true);
   4046 }
   4047 
   4048 void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
   4049 {
   4050     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
   4051     compute_brk_m(vd, vn, vg, oprsz, false);
   4052 }
   4053 
   4054 uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
   4055 {
   4056     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
   4057     return compute_brks_m(vd, vn, vg, oprsz, false);
   4058 }
   4059 
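/* BRKN's destination and second source are the same register (Pdm), so
 * when the last active element of Pn is true the destination already holds
 * the required value and only the all-inactive case needs to be written.
 */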
   4060 void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
   4061 {
   4062     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
   4063     if (!last_active_pred(vn, vg, oprsz)) {
   4064         do_zero(vd, oprsz);
   4065     }
   4066 }
   4067 
   4068 /* As if PredTest(Ones(PL), D, esz).  */
   4069 static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
   4070                               uint64_t esz_mask)
   4071 {
   4072     uint32_t flags = PREDTEST_INIT;
   4073     intptr_t i;
   4074 
   4075     for (i = 0; i < oprsz / 8; i++) {
   4076         flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
   4077     }
   4078     if (oprsz & 7) {
   4079         uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
   4080         flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
   4081     }
   4082     return flags;
   4083 }
   4084 
   4085 uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
   4086 {
   4087     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
   4088     if (last_active_pred(vn, vg, oprsz)) {
   4089         return predtest_ones(vd, oprsz, -1);
   4090     } else {
   4091         return do_zero(vd, oprsz);
   4092     }
   4093 }
   4094 
   4095 uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
   4096 {
   4097     intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
   4098     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
   4099     uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
   4100     intptr_t i;
   4101 
   4102     for (i = 0; i < words; ++i) {
   4103         uint64_t t = n[i] & g[i] & mask;
   4104         sum += ctpop64(t);
   4105     }
   4106     return sum;
   4107 }
   4108 
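/* WHILE (lower) fills the predicate from element 0 upward.  COUNT is in
 * predicate-bit units, i.e. the number of active elements scaled by the
 * element size in bytes.  For example, with 4-byte elements (esz_mask
 * 0x1111111111111111) and 5 active elements, COUNT is 20:
 * MAKE_64BIT_MASK(0, 20) & esz_mask == 0x11111, i.e. predicate bits
 * 0, 4, 8, 12 and 16 set, one per active element.
 */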
   4109 uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc)
   4110 {
   4111     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
   4112     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
   4113     uint64_t esz_mask = pred_esz_masks[esz];
   4114     ARMPredicateReg *d = vd;
   4115     uint32_t flags;
   4116     intptr_t i;
   4117 
   4118     /* Begin with a zero predicate register.  */
   4119     flags = do_zero(d, oprsz);
   4120     if (count == 0) {
   4121         return flags;
   4122     }
   4123 
   4124     /* Set all of the requested bits.  */
   4125     for (i = 0; i < count / 64; ++i) {
   4126         d->p[i] = esz_mask;
   4127     }
   4128     if (count & 63) {
   4129         d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
   4130     }
   4131 
   4132     return predtest_ones(d, oprsz, esz_mask);
   4133 }
   4134 
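/* WHILE (greater) fills the predicate from the top element downward:
 * INVCOUNT is the number of inactive predicate bits below the active
 * region.  E.g. for an 8-byte predicate (oprbits == 64), byte elements
 * and COUNT == 3, invcount == 61 and d->p[0] gets only bits 61..63 set,
 * i.e. the last three elements active.
 */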
   4135 uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc)
   4136 {
   4137     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
   4138     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
   4139     uint64_t esz_mask = pred_esz_masks[esz];
   4140     ARMPredicateReg *d = vd;
   4141     intptr_t i, invcount, oprbits;
   4142     uint64_t bits;
   4143 
   4144     if (count == 0) {
   4145         return do_zero(d, oprsz);
   4146     }
   4147 
   4148     oprbits = oprsz * 8;
   4149     tcg_debug_assert(count <= oprbits);
   4150 
   4151     bits = esz_mask;
   4152     if (oprbits & 63) {
   4153         bits &= MAKE_64BIT_MASK(0, oprbits & 63);
   4154     }
   4155 
   4156     invcount = oprbits - count;
   4157     for (i = (oprsz - 1) / 8; i > invcount / 64; --i) {
   4158         d->p[i] = bits;
   4159         bits = esz_mask;
   4160     }
   4161 
   4162     d->p[i] = bits & MAKE_64BIT_MASK(invcount & 63, 64);
   4163 
   4164     while (--i >= 0) {
   4165         d->p[i] = 0;
   4166     }
   4167 
   4168     return predtest_ones(d, oprsz, esz_mask);
   4169 }
   4170 
   4171 /* Recursive reduction on a function;
   4172  * C.f. the ARM ARM function ReducePredicated.
   4173  *
   4174  * While it would be possible to write this without the DATA temporary,
   4175  * it is much simpler to process the predicate register this way.
   4176  * The recursion is bounded to depth 7 (128 fp16 elements), so there's
   4177  * little to gain with a more complex non-recursive form.
   4178  */
   4179 #define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT)                         \
   4180 static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
   4181 {                                                                     \
   4182     if (n == 1) {                                                     \
   4183         return *data;                                                 \
   4184     } else {                                                          \
   4185         uintptr_t half = n / 2;                                       \
   4186         TYPE lo = NAME##_reduce(data, status, half);                  \
   4187         TYPE hi = NAME##_reduce(data + half, status, half);           \
   4188         return TYPE##_##FUNC(lo, hi, status);                         \
   4189     }                                                                 \
   4190 }                                                                     \
   4191 uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc)    \
   4192 {                                                                     \
   4193     uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc);   \
   4194     TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)];                   \
   4195     for (i = 0; i < oprsz; ) {                                        \
   4196         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));               \
   4197         do {                                                          \
   4198             TYPE nn = *(TYPE *)(vn + H(i));                           \
   4199             *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT);      \
   4200             i += sizeof(TYPE), pg >>= sizeof(TYPE);                   \
   4201         } while (i & 15);                                             \
   4202     }                                                                 \
   4203     for (; i < maxsz; i += sizeof(TYPE)) {                            \
   4204         *(TYPE *)((void *)data + i) = IDENT;                          \
   4205     }                                                                 \
   4206     return NAME##_reduce(data, vs, maxsz / sizeof(TYPE));             \
   4207 }
   4208 
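/* The expansion works in two phases: active elements are copied into the
 * full-width DATA buffer while inactive and trailing lanes are filled with
 * IDENT, then the buffer is reduced as a balanced binary tree.  With eight
 * elements the result is computed as
 *   ((d0 op d1) op (d2 op d3)) op ((d4 op d5) op (d6 op d7)),
 * matching ReducePredicated's pairwise evaluation order.
 */
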
   4209 DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
   4210 DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
   4211 DO_REDUCE(sve_faddv_d, float64, H1_8, add, float64_zero)
   4212 
   4213 /* Identity is floatN_default_nan, without the function call.  */
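/* minnum/maxnum ignore a quiet NaN operand whenever the other operand is a
 * number, so the default-NaN identity in inactive and trailing lanes never
 * perturbs the result of the reduction.
 */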
   4214 DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00)
   4215 DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000)
   4216 DO_REDUCE(sve_fminnmv_d, float64, H1_8, minnum, 0x7FF8000000000000ULL)
   4217 
   4218 DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00)
   4219 DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000)
   4220 DO_REDUCE(sve_fmaxnmv_d, float64, H1_8, maxnum, 0x7FF8000000000000ULL)
   4221 
   4222 DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity)
   4223 DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity)
   4224 DO_REDUCE(sve_fminv_d, float64, H1_8, min, float64_infinity)
   4225 
   4226 DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity))
   4227 DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity))
   4228 DO_REDUCE(sve_fmaxv_d, float64, H1_8, max, float64_chs(float64_infinity))
   4229 
   4230 #undef DO_REDUCE
   4231 
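/* FADDA is an ordered reduction: the scalar accumulator NN is folded with
 * each active element strictly from the lowest-numbered element upward,
 * unlike the pairwise tree used by DO_REDUCE above, preserving the
 * strictly sequential rounding behaviour the instruction requires.
 */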
   4232 uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
   4233                              void *status, uint32_t desc)
   4234 {
   4235     intptr_t i = 0, opr_sz = simd_oprsz(desc);
   4236     float16 result = nn;
   4237 
   4238     do {
   4239         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
   4240         do {
   4241             if (pg & 1) {
   4242                 float16 mm = *(float16 *)(vm + H1_2(i));
   4243                 result = float16_add(result, mm, status);
   4244             }
   4245             i += sizeof(float16), pg >>= sizeof(float16);
   4246         } while (i & 15);
   4247     } while (i < opr_sz);
   4248 
   4249     return result;
   4250 }
   4251 
   4252 uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
   4253                              void *status, uint32_t desc)
   4254 {
   4255     intptr_t i = 0, opr_sz = simd_oprsz(desc);
   4256     float32 result = nn;
   4257 
   4258     do {
   4259         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
   4260         do {
   4261             if (pg & 1) {
   4262                 float32 mm = *(float32 *)(vm + H1_2(i));
   4263                 result = float32_add(result, mm, status);
   4264             }
   4265             i += sizeof(float32), pg >>= sizeof(float32);
   4266         } while (i & 15);
   4267     } while (i < opr_sz);
   4268 
   4269     return result;
   4270 }
   4271 
   4272 uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
   4273                              void *status, uint32_t desc)
   4274 {
   4275     intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
   4276     uint64_t *m = vm;
   4277     uint8_t *pg = vg;
   4278 
   4279     for (i = 0; i < opr_sz; i++) {
   4280         if (pg[H1(i)] & 1) {
   4281             nn = float64_add(nn, m[i], status);
   4282         }
   4283     }
   4284 
   4285     return nn;
   4286 }
   4287 
   4288 /* Fully general three-operand expander, controlled by a predicate,
    4289  * with the extra float_status parameter.
   4290  */
   4291 #define DO_ZPZZ_FP(NAME, TYPE, H, OP)                           \
   4292 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,       \
   4293                   void *status, uint32_t desc)                  \
   4294 {                                                               \
   4295     intptr_t i = simd_oprsz(desc);                              \
   4296     uint64_t *g = vg;                                           \
   4297     do {                                                        \
   4298         uint64_t pg = g[(i - 1) >> 6];                          \
   4299         do {                                                    \
   4300             i -= sizeof(TYPE);                                  \
   4301             if (likely((pg >> (i & 63)) & 1)) {                 \
   4302                 TYPE nn = *(TYPE *)(vn + H(i));                 \
   4303                 TYPE mm = *(TYPE *)(vm + H(i));                 \
   4304                 *(TYPE *)(vd + H(i)) = OP(nn, mm, status);      \
   4305             }                                                   \
   4306         } while (i & 63);                                       \
   4307     } while (i != 0);                                           \
   4308 }
   4309 
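/* The expander walks the vector backwards in 64-byte blocks so that one
 * 64-bit word of the governing predicate covers each block; the predicate
 * bit for an element is the bit at the element's starting byte offset,
 * (i & 63).  Elements whose bit is clear are skipped entirely, giving
 * merging semantics: their destination bytes are left unchanged.
 */
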
   4310 DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
   4311 DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
   4312 DO_ZPZZ_FP(sve_fadd_d, uint64_t, H1_8, float64_add)
   4313 
   4314 DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
   4315 DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
   4316 DO_ZPZZ_FP(sve_fsub_d, uint64_t, H1_8, float64_sub)
   4317 
   4318 DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
   4319 DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
   4320 DO_ZPZZ_FP(sve_fmul_d, uint64_t, H1_8, float64_mul)
   4321 
   4322 DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
   4323 DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
   4324 DO_ZPZZ_FP(sve_fdiv_d, uint64_t, H1_8, float64_div)
   4325 
   4326 DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
   4327 DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
   4328 DO_ZPZZ_FP(sve_fmin_d, uint64_t, H1_8, float64_min)
   4329 
   4330 DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
   4331 DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
   4332 DO_ZPZZ_FP(sve_fmax_d, uint64_t, H1_8, float64_max)
   4333 
   4334 DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
   4335 DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
   4336 DO_ZPZZ_FP(sve_fminnum_d, uint64_t, H1_8, float64_minnum)
   4337 
   4338 DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
   4339 DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
   4340 DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, H1_8, float64_maxnum)
   4341 
   4342 static inline float16 abd_h(float16 a, float16 b, float_status *s)
   4343 {
   4344     return float16_abs(float16_sub(a, b, s));
   4345 }
   4346 
   4347 static inline float32 abd_s(float32 a, float32 b, float_status *s)
   4348 {
   4349     return float32_abs(float32_sub(a, b, s));
   4350 }
   4351 
   4352 static inline float64 abd_d(float64 a, float64 b, float_status *s)
   4353 {
   4354     return float64_abs(float64_sub(a, b, s));
   4355 }
   4356 
   4357 DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
   4358 DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
   4359 DO_ZPZZ_FP(sve_fabd_d, uint64_t, H1_8, abd_d)
   4360 
   4361 static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
   4362 {
   4363     int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
   4364     return float64_scalbn(a, b_int, s);
   4365 }
   4366 
   4367 DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
   4368 DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
   4369 DO_ZPZZ_FP(sve_fscalbn_d, int64_t, H1_8, scalbn_d)
   4370 
   4371 DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
   4372 DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
   4373 DO_ZPZZ_FP(sve_fmulx_d, uint64_t, H1_8, helper_vfp_mulxd)
   4374 
   4375 #undef DO_ZPZZ_FP
   4376 
   4377 /* Three-operand expander, with one scalar operand, controlled by
   4378  * a predicate, with the extra float_status parameter.
   4379  */
   4380 #define DO_ZPZS_FP(NAME, TYPE, H, OP) \
   4381 void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar,  \
   4382                   void *status, uint32_t desc)                    \
   4383 {                                                                 \
   4384     intptr_t i = simd_oprsz(desc);                                \
   4385     uint64_t *g = vg;                                             \
   4386     TYPE mm = scalar;                                             \
   4387     do {                                                          \
   4388         uint64_t pg = g[(i - 1) >> 6];                            \
   4389         do {                                                      \
   4390             i -= sizeof(TYPE);                                    \
   4391             if (likely((pg >> (i & 63)) & 1)) {                   \
   4392                 TYPE nn = *(TYPE *)(vn + H(i));                   \
   4393                 *(TYPE *)(vd + H(i)) = OP(nn, mm, status);        \
   4394             }                                                     \
   4395         } while (i & 63);                                         \
   4396     } while (i != 0);                                             \
   4397 }
   4398 
   4399 DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
   4400 DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
   4401 DO_ZPZS_FP(sve_fadds_d, float64, H1_8, float64_add)
   4402 
   4403 DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
   4404 DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
   4405 DO_ZPZS_FP(sve_fsubs_d, float64, H1_8, float64_sub)
   4406 
   4407 DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
   4408 DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
   4409 DO_ZPZS_FP(sve_fmuls_d, float64, H1_8, float64_mul)
   4410 
   4411 static inline float16 subr_h(float16 a, float16 b, float_status *s)
   4412 {
   4413     return float16_sub(b, a, s);
   4414 }
   4415 
   4416 static inline float32 subr_s(float32 a, float32 b, float_status *s)
   4417 {
   4418     return float32_sub(b, a, s);
   4419 }
   4420 
   4421 static inline float64 subr_d(float64 a, float64 b, float_status *s)
   4422 {
   4423     return float64_sub(b, a, s);
   4424 }
   4425 
   4426 DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
   4427 DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
   4428 DO_ZPZS_FP(sve_fsubrs_d, float64, H1_8, subr_d)
   4429 
   4430 DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
   4431 DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
   4432 DO_ZPZS_FP(sve_fmaxnms_d, float64, H1_8, float64_maxnum)
   4433 
   4434 DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
   4435 DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
   4436 DO_ZPZS_FP(sve_fminnms_d, float64, H1_8, float64_minnum)
   4437 
   4438 DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
   4439 DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
   4440 DO_ZPZS_FP(sve_fmaxs_d, float64, H1_8, float64_max)
   4441 
   4442 DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
   4443 DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
   4444 DO_ZPZS_FP(sve_fmins_d, float64, H1_8, float64_min)
   4445 
   4446 /* Fully general two-operand expander, controlled by a predicate,
    4447  * with the extra float_status parameter.
   4448  */
   4449 #define DO_ZPZ_FP(NAME, TYPE, H, OP)                                  \
   4450 void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
   4451 {                                                                     \
   4452     intptr_t i = simd_oprsz(desc);                                    \
   4453     uint64_t *g = vg;                                                 \
   4454     do {                                                              \
   4455         uint64_t pg = g[(i - 1) >> 6];                                \
   4456         do {                                                          \
   4457             i -= sizeof(TYPE);                                        \
   4458             if (likely((pg >> (i & 63)) & 1)) {                       \
   4459                 TYPE nn = *(TYPE *)(vn + H(i));                       \
   4460                 *(TYPE *)(vd + H(i)) = OP(nn, status);                \
   4461             }                                                         \
   4462         } while (i & 63);                                             \
   4463     } while (i != 0);                                                 \
   4464 }
   4465 
   4466 /* SVE fp16 conversions always use IEEE mode.  Like AdvSIMD, they ignore
   4467  * FZ16.  When converting from fp16, this affects flushing input denormals;
   4468  * when converting to fp16, this affects flushing output denormals.
   4469  */
   4470 static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
   4471 {
   4472     bool save = get_flush_inputs_to_zero(fpst);
   4473     float32 ret;
   4474 
   4475     set_flush_inputs_to_zero(false, fpst);
   4476     ret = float16_to_float32(f, true, fpst);
   4477     set_flush_inputs_to_zero(save, fpst);
   4478     return ret;
   4479 }
   4480 
   4481 static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
   4482 {
   4483     bool save = get_flush_inputs_to_zero(fpst);
   4484     float64 ret;
   4485 
   4486     set_flush_inputs_to_zero(false, fpst);
   4487     ret = float16_to_float64(f, true, fpst);
   4488     set_flush_inputs_to_zero(save, fpst);
   4489     return ret;
   4490 }
   4491 
   4492 static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
   4493 {
   4494     bool save = get_flush_to_zero(fpst);
   4495     float16 ret;
   4496 
   4497     set_flush_to_zero(false, fpst);
   4498     ret = float32_to_float16(f, true, fpst);
   4499     set_flush_to_zero(save, fpst);
   4500     return ret;
   4501 }
   4502 
   4503 static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
   4504 {
   4505     bool save = get_flush_to_zero(fpst);
   4506     float16 ret;
   4507 
   4508     set_flush_to_zero(false, fpst);
   4509     ret = float64_to_float16(f, true, fpst);
   4510     set_flush_to_zero(save, fpst);
   4511     return ret;
   4512 }
   4513 
   4514 static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
   4515 {
   4516     if (float16_is_any_nan(f)) {
   4517         float_raise(float_flag_invalid, s);
   4518         return 0;
   4519     }
   4520     return float16_to_int16_round_to_zero(f, s);
   4521 }
   4522 
   4523 static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
   4524 {
   4525     if (float16_is_any_nan(f)) {
   4526         float_raise(float_flag_invalid, s);
   4527         return 0;
   4528     }
   4529     return float16_to_int64_round_to_zero(f, s);
   4530 }
   4531 
   4532 static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
   4533 {
   4534     if (float32_is_any_nan(f)) {
   4535         float_raise(float_flag_invalid, s);
   4536         return 0;
   4537     }
   4538     return float32_to_int64_round_to_zero(f, s);
   4539 }
   4540 
   4541 static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
   4542 {
   4543     if (float64_is_any_nan(f)) {
   4544         float_raise(float_flag_invalid, s);
   4545         return 0;
   4546     }
   4547     return float64_to_int64_round_to_zero(f, s);
   4548 }
   4549 
   4550 static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
   4551 {
   4552     if (float16_is_any_nan(f)) {
   4553         float_raise(float_flag_invalid, s);
   4554         return 0;
   4555     }
   4556     return float16_to_uint16_round_to_zero(f, s);
   4557 }
   4558 
   4559 static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
   4560 {
   4561     if (float16_is_any_nan(f)) {
   4562         float_raise(float_flag_invalid, s);
   4563         return 0;
   4564     }
   4565     return float16_to_uint64_round_to_zero(f, s);
   4566 }
   4567 
   4568 static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
   4569 {
   4570     if (float32_is_any_nan(f)) {
   4571         float_raise(float_flag_invalid, s);
   4572         return 0;
   4573     }
   4574     return float32_to_uint64_round_to_zero(f, s);
   4575 }
   4576 
   4577 static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
   4578 {
   4579     if (float64_is_any_nan(f)) {
   4580         float_raise(float_flag_invalid, s);
   4581         return 0;
   4582     }
   4583     return float64_to_uint64_round_to_zero(f, s);
   4584 }
   4585 
   4586 DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
   4587 DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
   4588 DO_ZPZ_FP(sve_bfcvt,   uint32_t, H1_4, float32_to_bfloat16)
   4589 DO_ZPZ_FP(sve_fcvt_dh, uint64_t, H1_8, sve_f64_to_f16)
   4590 DO_ZPZ_FP(sve_fcvt_hd, uint64_t, H1_8, sve_f16_to_f64)
   4591 DO_ZPZ_FP(sve_fcvt_ds, uint64_t, H1_8, float64_to_float32)
   4592 DO_ZPZ_FP(sve_fcvt_sd, uint64_t, H1_8, float32_to_float64)
   4593 
   4594 DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
   4595 DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
   4596 DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
   4597 DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, H1_8, vfp_float16_to_int64_rtz)
   4598 DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, H1_8, vfp_float32_to_int64_rtz)
   4599 DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, H1_8, helper_vfp_tosizd)
   4600 DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, H1_8, vfp_float64_to_int64_rtz)
   4601 
   4602 DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
   4603 DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
   4604 DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
   4605 DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, H1_8, vfp_float16_to_uint64_rtz)
   4606 DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, H1_8, vfp_float32_to_uint64_rtz)
   4607 DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, H1_8, helper_vfp_touizd)
   4608 DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, H1_8, vfp_float64_to_uint64_rtz)
   4609 
   4610 DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
   4611 DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
   4612 DO_ZPZ_FP(sve_frint_d, uint64_t, H1_8, helper_rintd)
   4613 
   4614 DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
   4615 DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
   4616 DO_ZPZ_FP(sve_frintx_d, uint64_t, H1_8, float64_round_to_int)
   4617 
   4618 DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
   4619 DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
   4620 DO_ZPZ_FP(sve_frecpx_d, uint64_t, H1_8, helper_frecpx_f64)
   4621 
   4622 DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
   4623 DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
   4624 DO_ZPZ_FP(sve_fsqrt_d, uint64_t, H1_8, float64_sqrt)
   4625 
   4626 DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
   4627 DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
   4628 DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
   4629 DO_ZPZ_FP(sve_scvt_sd, uint64_t, H1_8, int32_to_float64)
   4630 DO_ZPZ_FP(sve_scvt_dh, uint64_t, H1_8, int64_to_float16)
   4631 DO_ZPZ_FP(sve_scvt_ds, uint64_t, H1_8, int64_to_float32)
   4632 DO_ZPZ_FP(sve_scvt_dd, uint64_t, H1_8, int64_to_float64)
   4633 
   4634 DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
   4635 DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
   4636 DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
   4637 DO_ZPZ_FP(sve_ucvt_sd, uint64_t, H1_8, uint32_to_float64)
   4638 DO_ZPZ_FP(sve_ucvt_dh, uint64_t, H1_8, uint64_to_float16)
   4639 DO_ZPZ_FP(sve_ucvt_ds, uint64_t, H1_8, uint64_to_float32)
   4640 DO_ZPZ_FP(sve_ucvt_dd, uint64_t, H1_8, uint64_to_float64)
   4641 
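/* FLOGB returns floor(log2(|x|)) as a signed integer of the element size.
 * Worked float16 examples: 1.0 (0x3c00) has exp == 15 and yields 0;
 * 0.25 (0x3400) yields -2; the smallest subnormal (0x0001) shifts its
 * fraction to 1 << 22, so clz32 returns 9 and the result is -15 - 9 == -24.
 * Infinity returns INT16_MAX; NaN and zero raise Invalid and return
 * INT16_MIN.
 */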
   4642 static int16_t do_float16_logb_as_int(float16 a, float_status *s)
   4643 {
   4644     /* Extract frac to the top of the uint32_t. */
   4645     uint32_t frac = (uint32_t)a << (16 + 6);
   4646     int16_t exp = extract32(a, 10, 5);
   4647 
   4648     if (unlikely(exp == 0)) {
   4649         if (frac != 0) {
   4650             if (!get_flush_inputs_to_zero(s)) {
   4651                 /* denormal: bias - fractional_zeros */
   4652                 return -15 - clz32(frac);
   4653             }
   4654             /* flush to zero */
   4655             float_raise(float_flag_input_denormal, s);
   4656         }
   4657     } else if (unlikely(exp == 0x1f)) {
   4658         if (frac == 0) {
   4659             return INT16_MAX; /* infinity */
   4660         }
   4661     } else {
   4662         /* normal: exp - bias */
   4663         return exp - 15;
   4664     }
   4665     /* nan or zero */
   4666     float_raise(float_flag_invalid, s);
   4667     return INT16_MIN;
   4668 }
   4669 
   4670 static int32_t do_float32_logb_as_int(float32 a, float_status *s)
   4671 {
   4672     /* Extract frac to the top of the uint32_t. */
   4673     uint32_t frac = a << 9;
   4674     int32_t exp = extract32(a, 23, 8);
   4675 
   4676     if (unlikely(exp == 0)) {
   4677         if (frac != 0) {
   4678             if (!get_flush_inputs_to_zero(s)) {
   4679                 /* denormal: bias - fractional_zeros */
   4680                 return -127 - clz32(frac);
   4681             }
   4682             /* flush to zero */
   4683             float_raise(float_flag_input_denormal, s);
   4684         }
   4685     } else if (unlikely(exp == 0xff)) {
   4686         if (frac == 0) {
   4687             return INT32_MAX; /* infinity */
   4688         }
   4689     } else {
   4690         /* normal: exp - bias */
   4691         return exp - 127;
   4692     }
   4693     /* nan or zero */
   4694     float_raise(float_flag_invalid, s);
   4695     return INT32_MIN;
   4696 }
   4697 
   4698 static int64_t do_float64_logb_as_int(float64 a, float_status *s)
   4699 {
   4700     /* Extract frac to the top of the uint64_t. */
   4701     uint64_t frac = a << 12;
   4702     int64_t exp = extract64(a, 52, 11);
   4703 
   4704     if (unlikely(exp == 0)) {
   4705         if (frac != 0) {
   4706             if (!get_flush_inputs_to_zero(s)) {
   4707                 /* denormal: bias - fractional_zeros */
   4708                 return -1023 - clz64(frac);
   4709             }
   4710             /* flush to zero */
   4711             float_raise(float_flag_input_denormal, s);
   4712         }
   4713     } else if (unlikely(exp == 0x7ff)) {
   4714         if (frac == 0) {
   4715             return INT64_MAX; /* infinity */
   4716         }
   4717     } else {
   4718         /* normal: exp - bias */
   4719         return exp - 1023;
   4720     }
   4721     /* nan or zero */
   4722     float_raise(float_flag_invalid, s);
   4723     return INT64_MIN;
   4724 }
   4725 
   4726 DO_ZPZ_FP(flogb_h, float16, H1_2, do_float16_logb_as_int)
   4727 DO_ZPZ_FP(flogb_s, float32, H1_4, do_float32_logb_as_int)
   4728 DO_ZPZ_FP(flogb_d, float64, H1_8, do_float64_logb_as_int)
   4729 
   4730 #undef DO_ZPZ_FP
   4731 
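/* The predicated FP multiply-add family shares one expander per element
 * size: NEG1 and NEG3 are XORed into the sign bits of the multiplicand and
 * the addend respectively, so FMLA negates nothing, FMLS negates Zn
 * (a - n*m == (-n)*m + a), FNMLS negates the addend (n*m - a), and FNMLA
 * negates both (-(n*m + a)).
 */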
   4732 static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg,
   4733                             float_status *status, uint32_t desc,
   4734                             uint16_t neg1, uint16_t neg3)
   4735 {
   4736     intptr_t i = simd_oprsz(desc);
   4737     uint64_t *g = vg;
   4738 
   4739     do {
   4740         uint64_t pg = g[(i - 1) >> 6];
   4741         do {
   4742             i -= 2;
   4743             if (likely((pg >> (i & 63)) & 1)) {
   4744                 float16 e1, e2, e3, r;
   4745 
   4746                 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
   4747                 e2 = *(uint16_t *)(vm + H1_2(i));
   4748                 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
   4749                 r = float16_muladd(e1, e2, e3, 0, status);
   4750                 *(uint16_t *)(vd + H1_2(i)) = r;
   4751             }
   4752         } while (i & 63);
   4753     } while (i != 0);
   4754 }
   4755 
   4756 void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
   4757                               void *vg, void *status, uint32_t desc)
   4758 {
   4759     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0);
   4760 }
   4761 
   4762 void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
   4763                               void *vg, void *status, uint32_t desc)
   4764 {
   4765     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0);
   4766 }
   4767 
   4768 void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
   4769                                void *vg, void *status, uint32_t desc)
   4770 {
   4771     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000);
   4772 }
   4773 
   4774 void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
   4775                                void *vg, void *status, uint32_t desc)
   4776 {
   4777     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000);
   4778 }
   4779 
   4780 static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg,
   4781                             float_status *status, uint32_t desc,
   4782                             uint32_t neg1, uint32_t neg3)
   4783 {
   4784     intptr_t i = simd_oprsz(desc);
   4785     uint64_t *g = vg;
   4786 
   4787     do {
   4788         uint64_t pg = g[(i - 1) >> 6];
   4789         do {
   4790             i -= 4;
   4791             if (likely((pg >> (i & 63)) & 1)) {
   4792                 float32 e1, e2, e3, r;
   4793 
   4794                 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
   4795                 e2 = *(uint32_t *)(vm + H1_4(i));
   4796                 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
   4797                 r = float32_muladd(e1, e2, e3, 0, status);
   4798                 *(uint32_t *)(vd + H1_4(i)) = r;
   4799             }
   4800         } while (i & 63);
   4801     } while (i != 0);
   4802 }
   4803 
   4804 void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
   4805                               void *vg, void *status, uint32_t desc)
   4806 {
   4807     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0);
   4808 }
   4809 
   4810 void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
   4811                               void *vg, void *status, uint32_t desc)
   4812 {
   4813     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0);
   4814 }
   4815 
   4816 void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
   4817                                void *vg, void *status, uint32_t desc)
   4818 {
   4819     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000);
   4820 }
   4821 
   4822 void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
   4823                                void *vg, void *status, uint32_t desc)
   4824 {
   4825     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000);
   4826 }
   4827 
   4828 static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg,
   4829                             float_status *status, uint32_t desc,
   4830                             uint64_t neg1, uint64_t neg3)
   4831 {
   4832     intptr_t i = simd_oprsz(desc);
   4833     uint64_t *g = vg;
   4834 
   4835     do {
   4836         uint64_t pg = g[(i - 1) >> 6];
   4837         do {
   4838             i -= 8;
   4839             if (likely((pg >> (i & 63)) & 1)) {
   4840                 float64 e1, e2, e3, r;
   4841 
   4842                 e1 = *(uint64_t *)(vn + i) ^ neg1;
   4843                 e2 = *(uint64_t *)(vm + i);
   4844                 e3 = *(uint64_t *)(va + i) ^ neg3;
   4845                 r = float64_muladd(e1, e2, e3, 0, status);
   4846                 *(uint64_t *)(vd + i) = r;
   4847             }
   4848         } while (i & 63);
   4849     } while (i != 0);
   4850 }
   4851 
   4852 void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
   4853                               void *vg, void *status, uint32_t desc)
   4854 {
   4855     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0);
   4856 }
   4857 
   4858 void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
   4859                               void *vg, void *status, uint32_t desc)
   4860 {
   4861     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0);
   4862 }
   4863 
   4864 void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
   4865                                void *vg, void *status, uint32_t desc)
   4866 {
   4867     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN);
   4868 }
   4869 
   4870 void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
   4871                                void *vg, void *status, uint32_t desc)
   4872 {
   4873     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN);
   4874 }
   4875 
   4876 /* Two operand floating-point comparison controlled by a predicate.
   4877  * Unlike the integer version, we are not allowed to optimistically
   4878  * compare operands, since the comparison may have side effects wrt
   4879  * the FPSR.
   4880  */
   4881 #define DO_FPCMP_PPZZ(NAME, TYPE, H, OP)                                \
   4882 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,               \
   4883                   void *status, uint32_t desc)                          \
   4884 {                                                                       \
   4885     intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6;                    \
   4886     uint64_t *d = vd, *g = vg;                                          \
   4887     do {                                                                \
   4888         uint64_t out = 0, pg = g[j];                                    \
   4889         do {                                                            \
   4890             i -= sizeof(TYPE), out <<= sizeof(TYPE);                    \
   4891             if (likely((pg >> (i & 63)) & 1)) {                         \
   4892                 TYPE nn = *(TYPE *)(vn + H(i));                         \
   4893                 TYPE mm = *(TYPE *)(vm + H(i));                         \
   4894                 out |= OP(TYPE, nn, mm, status);                        \
   4895             }                                                           \
   4896         } while (i & 63);                                               \
   4897         d[j--] = out;                                                   \
   4898     } while (i > 0);                                                    \
   4899 }
   4900 
   4901 #define DO_FPCMP_PPZZ_H(NAME, OP) \
   4902     DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
   4903 #define DO_FPCMP_PPZZ_S(NAME, OP) \
   4904     DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
   4905 #define DO_FPCMP_PPZZ_D(NAME, OP) \
   4906     DO_FPCMP_PPZZ(NAME##_d, float64, H1_8, OP)
   4907 
   4908 #define DO_FPCMP_PPZZ_ALL(NAME, OP) \
   4909     DO_FPCMP_PPZZ_H(NAME, OP)   \
   4910     DO_FPCMP_PPZZ_S(NAME, OP)   \
   4911     DO_FPCMP_PPZZ_D(NAME, OP)
   4912 
   4913 #define DO_FCMGE(TYPE, X, Y, ST)  TYPE##_compare(Y, X, ST) <= 0
   4914 #define DO_FCMGT(TYPE, X, Y, ST)  TYPE##_compare(Y, X, ST) < 0
   4915 #define DO_FCMLE(TYPE, X, Y, ST)  TYPE##_compare(X, Y, ST) <= 0
   4916 #define DO_FCMLT(TYPE, X, Y, ST)  TYPE##_compare(X, Y, ST) < 0
   4917 #define DO_FCMEQ(TYPE, X, Y, ST)  TYPE##_compare_quiet(X, Y, ST) == 0
   4918 #define DO_FCMNE(TYPE, X, Y, ST)  TYPE##_compare_quiet(X, Y, ST) != 0
   4919 #define DO_FCMUO(TYPE, X, Y, ST)  \
   4920     TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
   4921 #define DO_FACGE(TYPE, X, Y, ST)  \
   4922     TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
   4923 #define DO_FACGT(TYPE, X, Y, ST)  \
   4924     TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0
   4925 
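/* GE, GT, ACGE and ACGT use the signalling compare, which raises Invalid
 * Operation for any NaN operand; EQ, NE and UO use the quiet compare, so a
 * quiet NaN does not raise Invalid but still yields an unordered result.
 */
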
   4926 DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
   4927 DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
   4928 DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
   4929 DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
   4930 DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
   4931 DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
   4932 DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)
   4933 
   4934 #undef DO_FPCMP_PPZZ_ALL
   4935 #undef DO_FPCMP_PPZZ_D
   4936 #undef DO_FPCMP_PPZZ_S
   4937 #undef DO_FPCMP_PPZZ_H
   4938 #undef DO_FPCMP_PPZZ
   4939 
   4940 /* One operand floating-point comparison against zero, controlled
   4941  * by a predicate.
   4942  */
   4943 #define DO_FPCMP_PPZ0(NAME, TYPE, H, OP)                   \
   4944 void HELPER(NAME)(void *vd, void *vn, void *vg,            \
   4945                   void *status, uint32_t desc)             \
   4946 {                                                          \
   4947     intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6;       \
   4948     uint64_t *d = vd, *g = vg;                             \
   4949     do {                                                   \
   4950         uint64_t out = 0, pg = g[j];                       \
   4951         do {                                               \
   4952             i -= sizeof(TYPE), out <<= sizeof(TYPE);       \
   4953             if ((pg >> (i & 63)) & 1) {                    \
   4954                 TYPE nn = *(TYPE *)(vn + H(i));            \
   4955                 out |= OP(TYPE, nn, 0, status);            \
   4956             }                                              \
   4957         } while (i & 63);                                  \
   4958         d[j--] = out;                                      \
   4959     } while (i > 0);                                       \
   4960 }
   4961 
   4962 #define DO_FPCMP_PPZ0_H(NAME, OP) \
   4963     DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
   4964 #define DO_FPCMP_PPZ0_S(NAME, OP) \
   4965     DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
   4966 #define DO_FPCMP_PPZ0_D(NAME, OP) \
   4967     DO_FPCMP_PPZ0(NAME##_d, float64, H1_8, OP)
   4968 
   4969 #define DO_FPCMP_PPZ0_ALL(NAME, OP) \
   4970     DO_FPCMP_PPZ0_H(NAME, OP)   \
   4971     DO_FPCMP_PPZ0_S(NAME, OP)   \
   4972     DO_FPCMP_PPZ0_D(NAME, OP)
   4973 
   4974 DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
   4975 DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
   4976 DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
   4977 DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
   4978 DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
   4979 DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
   4980 
   4981 /* FP Trig Multiply-Add. */
   4982 
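/* The coefficient tables hold the polynomial terms used by the FTSSEL /
 * FTMAD sine-cosine sequence: entries 0-7 approximate the sine series
 * (1, -1/3!, 1/5!, ...) and entries 8-15 the cosine series
 * (1, -1/2!, 1/4!, ...).  A negative multiplicand selects the second half
 * of the table (xx += 8) and is used with its absolute value.
 */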
   4983 void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
   4984 {
   4985     static const float16 coeff[16] = {
   4986         0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
   4987         0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
   4988     };
   4989     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
   4990     intptr_t x = simd_data(desc);
   4991     float16 *d = vd, *n = vn, *m = vm;
   4992     for (i = 0; i < opr_sz; i++) {
   4993         float16 mm = m[i];
   4994         intptr_t xx = x;
   4995         if (float16_is_neg(mm)) {
   4996             mm = float16_abs(mm);
   4997             xx += 8;
   4998         }
   4999         d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs);
   5000     }
   5001 }
   5002 
   5003 void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
   5004 {
   5005     static const float32 coeff[16] = {
   5006         0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
   5007         0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
   5008         0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
   5009         0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
   5010     };
   5011     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
   5012     intptr_t x = simd_data(desc);
   5013     float32 *d = vd, *n = vn, *m = vm;
   5014     for (i = 0; i < opr_sz; i++) {
   5015         float32 mm = m[i];
   5016         intptr_t xx = x;
   5017         if (float32_is_neg(mm)) {
   5018             mm = float32_abs(mm);
   5019             xx += 8;
   5020         }
   5021         d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs);
   5022     }
   5023 }
   5024 
   5025 void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
   5026 {
   5027     static const float64 coeff[16] = {
   5028         0x3ff0000000000000ull, 0xbfc5555555555543ull,
   5029         0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
   5030         0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
   5031         0x3de5d8408868552full, 0x0000000000000000ull,
   5032         0x3ff0000000000000ull, 0xbfe0000000000000ull,
   5033         0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
   5034         0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
   5035         0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
   5036     };
   5037     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
   5038     intptr_t x = simd_data(desc);
   5039     float64 *d = vd, *n = vn, *m = vm;
   5040     for (i = 0; i < opr_sz; i++) {
   5041         float64 mm = m[i];
   5042         intptr_t xx = x;
   5043         if (float64_is_neg(mm)) {
   5044             mm = float64_abs(mm);
   5045             xx += 8;
   5046         }
   5047         d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs);
   5048     }
   5049 }
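
         /*
          * For orientation: the coefficient tables above hold the Taylor
          * series coefficients used by FTMAD, entries 0-7 for the sine
          * series (1, -1/6, 1/120, -1/5040, ...) and entries 8-15 for the
          * cosine series (1, -1/2, 1/24, -1/720, ...).  In the float64
          * table, for example, 0xbfc5555555555543 ~= -1/6 and
          * 0xbfe0000000000000 == -0.5.  A negative multiplicand selects the
          * cosine half via xx += 8, while the multiplication itself uses |m|.
          */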
   5050 
   5051 /*
   5052  * FP Complex Add
   5053  */
   5054 
   5055 void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
   5056                          void *vs, uint32_t desc)
   5057 {
   5058     intptr_t j, i = simd_oprsz(desc);
   5059     uint64_t *g = vg;
   5060     float16 neg_imag = float16_set_sign(0, simd_data(desc));
   5061     float16 neg_real = float16_chs(neg_imag);
   5062 
   5063     do {
   5064         uint64_t pg = g[(i - 1) >> 6];
   5065         do {
   5066             float16 e0, e1, e2, e3;
   5067 
   5068             /* I holds the real index; J holds the imag index.  */
   5069             j = i - sizeof(float16);
   5070             i -= 2 * sizeof(float16);
   5071 
   5072             e0 = *(float16 *)(vn + H1_2(i));
   5073             e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real;
   5074             e2 = *(float16 *)(vn + H1_2(j));
   5075             e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag;
   5076 
   5077             if (likely((pg >> (i & 63)) & 1)) {
   5078                 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs);
   5079             }
   5080             if (likely((pg >> (j & 63)) & 1)) {
   5081                 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs);
   5082             }
   5083         } while (i & 63);
   5084     } while (i != 0);
   5085 }
   5086 
   5087 void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
   5088                          void *vs, uint32_t desc)
   5089 {
   5090     intptr_t j, i = simd_oprsz(desc);
   5091     uint64_t *g = vg;
   5092     float32 neg_imag = float32_set_sign(0, simd_data(desc));
   5093     float32 neg_real = float32_chs(neg_imag);
   5094 
   5095     do {
   5096         uint64_t pg = g[(i - 1) >> 6];
   5097         do {
   5098             float32 e0, e1, e2, e3;
   5099 
   5100             /* I holds the real index; J holds the imag index.  */
   5101             j = i - sizeof(float32);
   5102             i -= 2 * sizeof(float32);
   5103 
   5104             e0 = *(float32 *)(vn + H1_2(i));
   5105             e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real;
   5106             e2 = *(float32 *)(vn + H1_2(j));
   5107             e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag;
   5108 
   5109             if (likely((pg >> (i & 63)) & 1)) {
   5110                 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, vs);
   5111             }
   5112             if (likely((pg >> (j & 63)) & 1)) {
   5113                 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, vs);
   5114             }
   5115         } while (i & 63);
   5116     } while (i != 0);
   5117 }
   5118 
   5119 void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
   5120                          void *vs, uint32_t desc)
   5121 {
   5122     intptr_t j, i = simd_oprsz(desc);
   5123     uint64_t *g = vg;
   5124     float64 neg_imag = float64_set_sign(0, simd_data(desc));
   5125     float64 neg_real = float64_chs(neg_imag);
   5126 
   5127     do {
   5128         uint64_t pg = g[(i - 1) >> 6];
   5129         do {
   5130             float64 e0, e1, e2, e3;
   5131 
   5132             /* I holds the real index; J holds the imag index.  */
   5133             j = i - sizeof(float64);
   5134             i -= 2 * sizeof(float64);
   5135 
   5136             e0 = *(float64 *)(vn + H1_2(i));
   5137             e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real;
   5138             e2 = *(float64 *)(vn + H1_2(j));
   5139             e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag;
   5140 
   5141             if (likely((pg >> (i & 63)) & 1)) {
   5142                 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, vs);
   5143             }
   5144             if (likely((pg >> (j & 63)) & 1)) {
   5145                 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, vs);
   5146             }
   5147         } while (i & 63);
   5148     } while (i != 0);
   5149 }
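
         /*
          * Element-wise, and ignoring the predication, the three helpers
          * above compute (as can be read off the sign manipulation):
          *
          *   rot == 90  (simd_data == 0):  d_real = n_real - m_imag
          *                                 d_imag = n_imag + m_real
          *   rot == 270 (simd_data == 1):  d_real = n_real + m_imag
          *                                 d_imag = n_imag - m_real
          *
          * i.e. m is rotated by +90 or +270 degrees in the complex plane
          * and then added to n.
          */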
   5150 
   5151 /*
   5152  * FP Complex Multiply
   5153  */
   5154 
   5155 void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
   5156                                void *vg, void *status, uint32_t desc)
   5157 {
   5158     intptr_t j, i = simd_oprsz(desc);
   5159     unsigned rot = simd_data(desc);
   5160     bool flip = rot & 1;
   5161     float16 neg_imag, neg_real;
   5162     uint64_t *g = vg;
   5163 
   5164     neg_imag = float16_set_sign(0, (rot & 2) != 0);
   5165     neg_real = float16_set_sign(0, rot == 1 || rot == 2);
   5166 
   5167     do {
   5168         uint64_t pg = g[(i - 1) >> 6];
   5169         do {
   5170             float16 e1, e2, e3, e4, nr, ni, mr, mi, d;
   5171 
   5172             /* I holds the real index; J holds the imag index.  */
   5173             j = i - sizeof(float16);
   5174             i -= 2 * sizeof(float16);
   5175 
   5176             nr = *(float16 *)(vn + H1_2(i));
   5177             ni = *(float16 *)(vn + H1_2(j));
   5178             mr = *(float16 *)(vm + H1_2(i));
   5179             mi = *(float16 *)(vm + H1_2(j));
   5180 
   5181             e2 = (flip ? ni : nr);
   5182             e1 = (flip ? mi : mr) ^ neg_real;
   5183             e4 = e2;
   5184             e3 = (flip ? mr : mi) ^ neg_imag;
   5185 
   5186             if (likely((pg >> (i & 63)) & 1)) {
   5187                 d = *(float16 *)(va + H1_2(i));
   5188                 d = float16_muladd(e2, e1, d, 0, status);
   5189                 *(float16 *)(vd + H1_2(i)) = d;
   5190             }
   5191             if (likely((pg >> (j & 63)) & 1)) {
   5192                 d = *(float16 *)(va + H1_2(j));
   5193                 d = float16_muladd(e4, e3, d, 0, status);
   5194                 *(float16 *)(vd + H1_2(j)) = d;
   5195             }
   5196         } while (i & 63);
   5197     } while (i != 0);
   5198 }
   5199 
   5200 void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
   5201                                void *vg, void *status, uint32_t desc)
   5202 {
   5203     intptr_t j, i = simd_oprsz(desc);
   5204     unsigned rot = simd_data(desc);
   5205     bool flip = rot & 1;
   5206     float32 neg_imag, neg_real;
   5207     uint64_t *g = vg;
   5208 
   5209     neg_imag = float32_set_sign(0, (rot & 2) != 0);
   5210     neg_real = float32_set_sign(0, rot == 1 || rot == 2);
   5211 
   5212     do {
   5213         uint64_t pg = g[(i - 1) >> 6];
   5214         do {
   5215             float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
   5216 
   5217             /* I holds the real index; J holds the imag index.  */
   5218             j = i - sizeof(float32);
   5219             i -= 2 * sizeof(float32);
   5220 
   5221             nr = *(float32 *)(vn + H1_2(i));
   5222             ni = *(float32 *)(vn + H1_2(j));
   5223             mr = *(float32 *)(vm + H1_2(i));
   5224             mi = *(float32 *)(vm + H1_2(j));
   5225 
   5226             e2 = (flip ? ni : nr);
   5227             e1 = (flip ? mi : mr) ^ neg_real;
   5228             e4 = e2;
   5229             e3 = (flip ? mr : mi) ^ neg_imag;
   5230 
   5231             if (likely((pg >> (i & 63)) & 1)) {
   5232                 d = *(float32 *)(va + H1_2(i));
   5233                 d = float32_muladd(e2, e1, d, 0, status);
   5234                 *(float32 *)(vd + H1_2(i)) = d;
   5235             }
   5236             if (likely((pg >> (j & 63)) & 1)) {
   5237                 d = *(float32 *)(va + H1_2(j));
   5238                 d = float32_muladd(e4, e3, d, 0, status);
   5239                 *(float32 *)(vd + H1_2(j)) = d;
   5240             }
   5241         } while (i & 63);
   5242     } while (i != 0);
   5243 }
   5244 
   5245 void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
   5246                                void *vg, void *status, uint32_t desc)
   5247 {
   5248     intptr_t j, i = simd_oprsz(desc);
   5249     unsigned rot = simd_data(desc);
   5250     bool flip = rot & 1;
   5251     float64 neg_imag, neg_real;
   5252     uint64_t *g = vg;
   5253 
   5254     neg_imag = float64_set_sign(0, (rot & 2) != 0);
   5255     neg_real = float64_set_sign(0, rot == 1 || rot == 2);
   5256 
   5257     do {
   5258         uint64_t pg = g[(i - 1) >> 6];
   5259         do {
   5260             float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
   5261 
   5262             /* I holds the real index; J holds the imag index.  */
   5263             j = i - sizeof(float64);
   5264             i -= 2 * sizeof(float64);
   5265 
   5266             nr = *(float64 *)(vn + H1_2(i));
   5267             ni = *(float64 *)(vn + H1_2(j));
   5268             mr = *(float64 *)(vm + H1_2(i));
   5269             mi = *(float64 *)(vm + H1_2(j));
   5270 
   5271             e2 = (flip ? ni : nr);
   5272             e1 = (flip ? mi : mr) ^ neg_real;
   5273             e4 = e2;
   5274             e3 = (flip ? mr : mi) ^ neg_imag;
   5275 
   5276             if (likely((pg >> (i & 63)) & 1)) {
   5277                 d = *(float64 *)(va + H1_2(i));
   5278                 d = float64_muladd(e2, e1, d, 0, status);
   5279                 *(float64 *)(vd + H1_2(i)) = d;
   5280             }
   5281             if (likely((pg >> (j & 63)) & 1)) {
   5282                 d = *(float64 *)(va + H1_2(j));
   5283                 d = float64_muladd(e4, e3, d, 0, status);
   5284                 *(float64 *)(vd + H1_2(j)) = d;
   5285             }
   5286         } while (i & 63);
   5287     } while (i != 0);
   5288 }
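
         /*
          * Likewise, the per-element products accumulated by the FCMLA
          * helpers above are, for the four rotation encodings:
          *
          *   rot 0:  d_real += n_real * m_real    d_imag += n_real * m_imag
          *   rot 1:  d_real -= n_imag * m_imag    d_imag += n_imag * m_real
          *   rot 2:  d_real -= n_real * m_real    d_imag -= n_real * m_imag
          *   rot 3:  d_real += n_imag * m_imag    d_imag -= n_imag * m_real
          *
          * so that, for example, a rot 0 operation followed by a rot 1
          * operation accumulates the full complex product n * m onto the
          * addend.
          */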
   5289 
   5290 /*
   5291  * Load contiguous data, protected by a governing predicate.
   5292  */
   5293 
   5294 /*
   5295  * Skip through a sequence of inactive elements in the guarding predicate @vg,
    5296  * beginning at @reg_off bounded by @reg_max.  Return the offset of the first
    5297  * active element >= @reg_off, or @reg_max if there were no active elements at all.
   5298  */
   5299 static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
   5300                                  intptr_t reg_max, int esz)
   5301 {
   5302     uint64_t pg_mask = pred_esz_masks[esz];
   5303     uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);
   5304 
   5305     /* In normal usage, the first element is active.  */
   5306     if (likely(pg & 1)) {
   5307         return reg_off;
   5308     }
   5309 
   5310     if (pg == 0) {
   5311         reg_off &= -64;
   5312         do {
   5313             reg_off += 64;
   5314             if (unlikely(reg_off >= reg_max)) {
   5315                 /* The entire predicate was false.  */
   5316                 return reg_max;
   5317             }
   5318             pg = vg[reg_off >> 6] & pg_mask;
   5319         } while (pg == 0);
   5320     }
   5321     reg_off += ctz64(pg);
   5322 
   5323     /* We should never see an out of range predicate bit set.  */
   5324     tcg_debug_assert(reg_off < reg_max);
   5325     return reg_off;
   5326 }
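
         /*
          * Recall the SVE predicate layout: one predicate bit per byte of
          * the vector, so an element of size (1 << esz) bytes spans that
          * many predicate bits, of which only the least significant is
          * meaningful.  pred_esz_masks[esz] (defined elsewhere in
          * target/arm) keeps just those meaningful bits, e.g.
          * 0x1111111111111111ull for esz == MO_32.
          *
          * Worked example: for a 256-bit vector (reg_max == 32) with
          * esz == MO_32 and only element 5 (counting from 0) active,
          * vg[0] has bit 20 set; find_next_active(vg, 0, 32, MO_32) sees
          * pg != 0 with (pg & 1) == 0 and returns 0 + ctz64(pg) == 20.
          */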
   5327 
   5328 /*
   5329  * Resolve the guest virtual address to info->host and info->flags.
   5330  * If @nofault, return false if the page is invalid, otherwise
   5331  * exit via page fault exception.
   5332  */
   5333 
   5334 bool sve_probe_page(SVEHostPage *info, bool nofault, CPUARMState *env,
   5335                     target_ulong addr, int mem_off, MMUAccessType access_type,
   5336                     int mmu_idx, uintptr_t retaddr)
   5337 {
   5338     int flags;
   5339 
   5340     addr += mem_off;
   5341 
   5342     /*
   5343      * User-only currently always issues with TBI.  See the comment
   5344      * above useronly_clean_ptr.  Usually we clean this top byte away
   5345      * during translation, but we can't do that for e.g. vector + imm
   5346      * addressing modes.
   5347      *
   5348      * We currently always enable TBI for user-only, and do not provide
   5349      * a way to turn it off.  So clean the pointer unconditionally here,
   5350      * rather than look it up here, or pass it down from above.
   5351      */
   5352     addr = useronly_clean_ptr(addr);
   5353 
   5354 #ifdef CONFIG_USER_ONLY
   5355     flags = probe_access_flags(env, addr, access_type, mmu_idx, nofault,
   5356                                &info->host, retaddr);
   5357     memset(&info->attrs, 0, sizeof(info->attrs));
   5358     /* Require both ANON and MTE; see allocation_tag_mem(). */
   5359     info->tagged = (flags & PAGE_ANON) && (flags & PAGE_MTE);
   5360 #else
   5361     CPUTLBEntryFull *full;
   5362     flags = probe_access_full(env, addr, access_type, mmu_idx, nofault,
   5363                               &info->host, &full, retaddr);
   5364     info->attrs = full->attrs;
   5365     info->tagged = full->pte_attrs == 0xf0;
   5366 #endif
   5367     info->flags = flags;
   5368 
   5369     if (flags & TLB_INVALID_MASK) {
   5370         g_assert(nofault);
   5371         return false;
   5372     }
   5373 
   5374     /* Ensure that info->host[] is relative to addr, not addr + mem_off. */
   5375     info->host -= mem_off;
   5376     return true;
   5377 }
   5378 
   5379 /*
   5380  * Find first active element on each page, and a loose bound for the
   5381  * final element on each page.  Identify any single element that spans
   5382  * the page boundary.  Return true if there are any active elements.
   5383  */
   5384 bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr, uint64_t *vg,
   5385                             intptr_t reg_max, int esz, int msize)
   5386 {
   5387     const int esize = 1 << esz;
   5388     const uint64_t pg_mask = pred_esz_masks[esz];
   5389     intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
   5390     intptr_t mem_off_last, mem_off_split;
   5391     intptr_t page_split, elt_split;
   5392     intptr_t i;
   5393 
   5394     /* Set all of the element indices to -1, and the TLB data to 0. */
   5395     memset(info, -1, offsetof(SVEContLdSt, page));
   5396     memset(info->page, 0, sizeof(info->page));
   5397 
   5398     /* Gross scan over the entire predicate to find bounds. */
   5399     i = 0;
   5400     do {
   5401         uint64_t pg = vg[i] & pg_mask;
   5402         if (pg) {
   5403             reg_off_last = i * 64 + 63 - clz64(pg);
   5404             if (reg_off_first < 0) {
   5405                 reg_off_first = i * 64 + ctz64(pg);
   5406             }
   5407         }
   5408     } while (++i * 64 < reg_max);
   5409 
   5410     if (unlikely(reg_off_first < 0)) {
   5411         /* No active elements, no pages touched. */
   5412         return false;
   5413     }
   5414     tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max);
   5415 
   5416     info->reg_off_first[0] = reg_off_first;
   5417     info->mem_off_first[0] = (reg_off_first >> esz) * msize;
   5418     mem_off_last = (reg_off_last >> esz) * msize;
   5419 
   5420     page_split = -(addr | TARGET_PAGE_MASK);
   5421     if (likely(mem_off_last + msize <= page_split)) {
   5422         /* The entire operation fits within a single page. */
   5423         info->reg_off_last[0] = reg_off_last;
   5424         return true;
   5425     }
   5426 
   5427     info->page_split = page_split;
   5428     elt_split = page_split / msize;
   5429     reg_off_split = elt_split << esz;
   5430     mem_off_split = elt_split * msize;
   5431 
   5432     /*
   5433      * This is the last full element on the first page, but it is not
   5434      * necessarily active.  If there is no full element, i.e. the first
   5435      * active element is the one that's split, this value remains -1.
   5436      * It is useful as iteration bounds.
   5437      */
   5438     if (elt_split != 0) {
   5439         info->reg_off_last[0] = reg_off_split - esize;
   5440     }
   5441 
   5442     /* Determine if an unaligned element spans the pages.  */
   5443     if (page_split % msize != 0) {
   5444         /* It is helpful to know if the split element is active. */
   5445         if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) {
   5446             info->reg_off_split = reg_off_split;
   5447             info->mem_off_split = mem_off_split;
   5448 
   5449             if (reg_off_split == reg_off_last) {
   5450                 /* The page crossing element is last. */
   5451                 return true;
   5452             }
   5453         }
   5454         reg_off_split += esize;
   5455         mem_off_split += msize;
   5456     }
   5457 
   5458     /*
   5459      * We do want the first active element on the second page, because
   5460      * this may affect the address reported in an exception.
   5461      */
   5462     reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz);
   5463     tcg_debug_assert(reg_off_split <= reg_off_last);
   5464     info->reg_off_first[1] = reg_off_split;
   5465     info->mem_off_first[1] = (reg_off_split >> esz) * msize;
   5466     info->reg_off_last[1] = reg_off_last;
   5467     return true;
   5468 }
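
         /*
          * A worked example of the bookkeeping above, assuming 4-byte
          * elements (esz == MO_32, msize == 4), a 256-bit vector
          * (reg_max == 32), an all-true predicate, and addr sitting 12
          * bytes before a page boundary (page_split == 12):
          *
          *   reg_off_first[0] == 0,  mem_off_first[0] == 0
          *   reg_off_last[0]  == 8      (elements 0-2 are on the first page)
          *   reg_off_split    == -1     (12 % 4 == 0, so no element is split)
          *   reg_off_first[1] == 12, mem_off_first[1] == 12
          *   reg_off_last[1]  == 28     (elements 3-7 are on the second page)
          */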
   5469 
   5470 /*
   5471  * Resolve the guest virtual addresses to info->page[].
   5472  * Control the generation of page faults with @fault.  Return false if
   5473  * there is no work to do, which can only happen with @fault == FAULT_NO.
   5474  */
   5475 bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault,
   5476                          CPUARMState *env, target_ulong addr,
   5477                          MMUAccessType access_type, uintptr_t retaddr)
   5478 {
   5479     int mmu_idx = cpu_mmu_index(env, false);
   5480     int mem_off = info->mem_off_first[0];
   5481     bool nofault = fault == FAULT_NO;
   5482     bool have_work = true;
   5483 
   5484     if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off,
   5485                         access_type, mmu_idx, retaddr)) {
   5486         /* No work to be done. */
   5487         return false;
   5488     }
   5489 
   5490     if (likely(info->page_split < 0)) {
   5491         /* The entire operation was on the one page. */
   5492         return true;
   5493     }
   5494 
   5495     /*
   5496      * If the second page is invalid, then we want the fault address to be
   5497      * the first byte on that page which is accessed.
   5498      */
   5499     if (info->mem_off_split >= 0) {
   5500         /*
   5501          * There is an element split across the pages.  The fault address
   5502          * should be the first byte of the second page.
   5503          */
   5504         mem_off = info->page_split;
   5505         /*
   5506          * If the split element is also the first active element
   5507          * of the vector, then:  For first-fault we should continue
   5508          * to generate faults for the second page.  For no-fault,
   5509          * we have work only if the second page is valid.
   5510          */
   5511         if (info->mem_off_first[0] < info->mem_off_split) {
   5512             nofault = FAULT_FIRST;
   5513             have_work = false;
   5514         }
   5515     } else {
   5516         /*
   5517          * There is no element split across the pages.  The fault address
   5518          * should be the first active element on the second page.
   5519          */
   5520         mem_off = info->mem_off_first[1];
   5521         /*
   5522          * There must have been one active element on the first page,
   5523          * so we're out of first-fault territory.
   5524          */
   5525         nofault = fault != FAULT_ALL;
   5526     }
   5527 
   5528     have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off,
   5529                                 access_type, mmu_idx, retaddr);
   5530     return have_work;
   5531 }
   5532 
   5533 #ifndef CONFIG_USER_ONLY
   5534 void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
   5535                                uint64_t *vg, target_ulong addr,
   5536                                int esize, int msize, int wp_access,
   5537                                uintptr_t retaddr)
   5538 {
   5539     intptr_t mem_off, reg_off, reg_last;
   5540     int flags0 = info->page[0].flags;
   5541     int flags1 = info->page[1].flags;
   5542 
   5543     if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
   5544         return;
   5545     }
   5546 
   5547     /* Indicate that watchpoints are handled. */
   5548     info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
   5549     info->page[1].flags = flags1 & ~TLB_WATCHPOINT;
   5550 
   5551     if (flags0 & TLB_WATCHPOINT) {
   5552         mem_off = info->mem_off_first[0];
   5553         reg_off = info->reg_off_first[0];
   5554         reg_last = info->reg_off_last[0];
   5555 
   5556         while (reg_off <= reg_last) {
   5557             uint64_t pg = vg[reg_off >> 6];
   5558             do {
   5559                 if ((pg >> (reg_off & 63)) & 1) {
   5560                     cpu_check_watchpoint(env_cpu(env), addr + mem_off,
   5561                                          msize, info->page[0].attrs,
   5562                                          wp_access, retaddr);
   5563                 }
   5564                 reg_off += esize;
   5565                 mem_off += msize;
   5566             } while (reg_off <= reg_last && (reg_off & 63));
   5567         }
   5568     }
   5569 
   5570     mem_off = info->mem_off_split;
   5571     if (mem_off >= 0) {
   5572         cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize,
   5573                              info->page[0].attrs, wp_access, retaddr);
   5574     }
   5575 
   5576     mem_off = info->mem_off_first[1];
   5577     if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) {
   5578         reg_off = info->reg_off_first[1];
   5579         reg_last = info->reg_off_last[1];
   5580 
   5581         do {
   5582             uint64_t pg = vg[reg_off >> 6];
   5583             do {
   5584                 if ((pg >> (reg_off & 63)) & 1) {
   5585                     cpu_check_watchpoint(env_cpu(env), addr + mem_off,
   5586                                          msize, info->page[1].attrs,
   5587                                          wp_access, retaddr);
   5588                 }
   5589                 reg_off += esize;
   5590                 mem_off += msize;
   5591             } while (reg_off & 63);
   5592         } while (reg_off <= reg_last);
   5593     }
   5594 }
   5595 #endif
   5596 
   5597 void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env,
   5598                              uint64_t *vg, target_ulong addr, int esize,
   5599                              int msize, uint32_t mtedesc, uintptr_t ra)
   5600 {
   5601     intptr_t mem_off, reg_off, reg_last;
   5602 
   5603     /* Process the page only if MemAttr == Tagged. */
   5604     if (info->page[0].tagged) {
   5605         mem_off = info->mem_off_first[0];
   5606         reg_off = info->reg_off_first[0];
   5607         reg_last = info->reg_off_split;
   5608         if (reg_last < 0) {
   5609             reg_last = info->reg_off_last[0];
   5610         }
   5611 
   5612         do {
   5613             uint64_t pg = vg[reg_off >> 6];
   5614             do {
   5615                 if ((pg >> (reg_off & 63)) & 1) {
   5616                     mte_check(env, mtedesc, addr, ra);
   5617                 }
   5618                 reg_off += esize;
   5619                 mem_off += msize;
   5620             } while (reg_off <= reg_last && (reg_off & 63));
   5621         } while (reg_off <= reg_last);
   5622     }
   5623 
   5624     mem_off = info->mem_off_first[1];
   5625     if (mem_off >= 0 && info->page[1].tagged) {
   5626         reg_off = info->reg_off_first[1];
   5627         reg_last = info->reg_off_last[1];
   5628 
   5629         do {
   5630             uint64_t pg = vg[reg_off >> 6];
   5631             do {
   5632                 if ((pg >> (reg_off & 63)) & 1) {
   5633                     mte_check(env, mtedesc, addr, ra);
   5634                 }
   5635                 reg_off += esize;
   5636                 mem_off += msize;
   5637             } while (reg_off & 63);
   5638         } while (reg_off <= reg_last);
   5639     }
   5640 }
   5641 
   5642 /*
    5643  * Common helper for all contiguous 1,2,3,4-register predicated loads.
   5644  */
   5645 static inline QEMU_ALWAYS_INLINE
   5646 void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr,
   5647                uint32_t desc, const uintptr_t retaddr,
   5648                const int esz, const int msz, const int N, uint32_t mtedesc,
   5649                sve_ldst1_host_fn *host_fn,
   5650                sve_ldst1_tlb_fn *tlb_fn)
   5651 {
   5652     const unsigned rd = simd_data(desc);
   5653     const intptr_t reg_max = simd_oprsz(desc);
   5654     intptr_t reg_off, reg_last, mem_off;
   5655     SVEContLdSt info;
   5656     void *host;
   5657     int flags, i;
   5658 
   5659     /* Find the active elements.  */
   5660     if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
   5661         /* The entire predicate was false; no load occurs.  */
   5662         for (i = 0; i < N; ++i) {
   5663             memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
   5664         }
   5665         return;
   5666     }
   5667 
   5668     /* Probe the page(s).  Exit with exception for any invalid page. */
   5669     sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr);
   5670 
   5671     /* Handle watchpoints for all active elements. */
   5672     sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
   5673                               BP_MEM_READ, retaddr);
   5674 
   5675     /*
   5676      * Handle mte checks for all active elements.
   5677      * Since TBI must be set for MTE, !mtedesc => !mte_active.
   5678      */
   5679     if (mtedesc) {
   5680         sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
   5681                                 mtedesc, retaddr);
   5682     }
   5683 
   5684     flags = info.page[0].flags | info.page[1].flags;
   5685     if (unlikely(flags != 0)) {
   5686 #ifdef CONFIG_USER_ONLY
   5687         g_assert_not_reached();
   5688 #else
   5689         /*
   5690          * At least one page includes MMIO.
   5691          * Any bus operation can fail with cpu_transaction_failed,
   5692          * which for ARM will raise SyncExternal.  Perform the load
   5693          * into scratch memory to preserve register state until the end.
   5694          */
   5695         ARMVectorReg scratch[4] = { };
   5696 
   5697         mem_off = info.mem_off_first[0];
   5698         reg_off = info.reg_off_first[0];
   5699         reg_last = info.reg_off_last[1];
   5700         if (reg_last < 0) {
   5701             reg_last = info.reg_off_split;
   5702             if (reg_last < 0) {
   5703                 reg_last = info.reg_off_last[0];
   5704             }
   5705         }
   5706 
   5707         do {
   5708             uint64_t pg = vg[reg_off >> 6];
   5709             do {
   5710                 if ((pg >> (reg_off & 63)) & 1) {
   5711                     for (i = 0; i < N; ++i) {
   5712                         tlb_fn(env, &scratch[i], reg_off,
   5713                                addr + mem_off + (i << msz), retaddr);
   5714                     }
   5715                 }
   5716                 reg_off += 1 << esz;
   5717                 mem_off += N << msz;
   5718             } while (reg_off & 63);
   5719         } while (reg_off <= reg_last);
   5720 
   5721         for (i = 0; i < N; ++i) {
   5722             memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max);
   5723         }
   5724         return;
   5725 #endif
   5726     }
   5727 
   5728     /* The entire operation is in RAM, on valid pages. */
   5729 
   5730     for (i = 0; i < N; ++i) {
   5731         memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
   5732     }
   5733 
   5734     mem_off = info.mem_off_first[0];
   5735     reg_off = info.reg_off_first[0];
   5736     reg_last = info.reg_off_last[0];
   5737     host = info.page[0].host;
   5738 
   5739     while (reg_off <= reg_last) {
   5740         uint64_t pg = vg[reg_off >> 6];
   5741         do {
   5742             if ((pg >> (reg_off & 63)) & 1) {
   5743                 for (i = 0; i < N; ++i) {
   5744                     host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
   5745                             host + mem_off + (i << msz));
   5746                 }
   5747             }
   5748             reg_off += 1 << esz;
   5749             mem_off += N << msz;
   5750         } while (reg_off <= reg_last && (reg_off & 63));
   5751     }
   5752 
   5753     /*
   5754      * Use the slow path to manage the cross-page misalignment.
   5755      * But we know this is RAM and cannot trap.
   5756      */
   5757     mem_off = info.mem_off_split;
   5758     if (unlikely(mem_off >= 0)) {
   5759         reg_off = info.reg_off_split;
   5760         for (i = 0; i < N; ++i) {
   5761             tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
   5762                    addr + mem_off + (i << msz), retaddr);
   5763         }
   5764     }
   5765 
   5766     mem_off = info.mem_off_first[1];
   5767     if (unlikely(mem_off >= 0)) {
   5768         reg_off = info.reg_off_first[1];
   5769         reg_last = info.reg_off_last[1];
   5770         host = info.page[1].host;
   5771 
   5772         do {
   5773             uint64_t pg = vg[reg_off >> 6];
   5774             do {
   5775                 if ((pg >> (reg_off & 63)) & 1) {
   5776                     for (i = 0; i < N; ++i) {
   5777                         host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
   5778                                 host + mem_off + (i << msz));
   5779                     }
   5780                 }
   5781                 reg_off += 1 << esz;
   5782                 mem_off += N << msz;
   5783             } while (reg_off & 63);
   5784         } while (reg_off <= reg_last);
   5785     }
   5786 }
   5787 
   5788 static inline QEMU_ALWAYS_INLINE
   5789 void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
   5790                    uint32_t desc, const uintptr_t ra,
   5791                    const int esz, const int msz, const int N,
   5792                    sve_ldst1_host_fn *host_fn,
   5793                    sve_ldst1_tlb_fn *tlb_fn)
   5794 {
   5795     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
   5796     int bit55 = extract64(addr, 55, 1);
   5797 
   5798     /* Remove mtedesc from the normal sve descriptor. */
   5799     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
   5800 
   5801     /* Perform gross MTE suppression early. */
   5802     if (!tbi_check(desc, bit55) ||
   5803         tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
   5804         mtedesc = 0;
   5805     }
   5806 
   5807     sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
   5808 }
   5809 
   5810 #define DO_LD1_1(NAME, ESZ)                                             \
   5811 void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg,                 \
   5812                             target_ulong addr, uint32_t desc)           \
   5813 {                                                                       \
   5814     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0,            \
   5815               sve_##NAME##_host, sve_##NAME##_tlb);                     \
   5816 }                                                                       \
   5817 void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg,             \
   5818                                 target_ulong addr, uint32_t desc)       \
   5819 {                                                                       \
   5820     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1,           \
   5821                   sve_##NAME##_host, sve_##NAME##_tlb);                 \
   5822 }
   5823 
   5824 #define DO_LD1_2(NAME, ESZ, MSZ)                                        \
   5825 void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg,              \
   5826                                target_ulong addr, uint32_t desc)        \
   5827 {                                                                       \
   5828     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0,             \
   5829               sve_##NAME##_le_host, sve_##NAME##_le_tlb);               \
   5830 }                                                                       \
   5831 void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg,              \
   5832                                target_ulong addr, uint32_t desc)        \
   5833 {                                                                       \
   5834     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0,             \
   5835               sve_##NAME##_be_host, sve_##NAME##_be_tlb);               \
   5836 }                                                                       \
   5837 void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg,          \
   5838                                    target_ulong addr, uint32_t desc)    \
   5839 {                                                                       \
   5840     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1,            \
   5841                   sve_##NAME##_le_host, sve_##NAME##_le_tlb);           \
   5842 }                                                                       \
   5843 void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg,          \
   5844                                    target_ulong addr, uint32_t desc)    \
   5845 {                                                                       \
   5846     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1,            \
   5847                   sve_##NAME##_be_host, sve_##NAME##_be_tlb);           \
   5848 }
   5849 
   5850 DO_LD1_1(ld1bb,  MO_8)
   5851 DO_LD1_1(ld1bhu, MO_16)
   5852 DO_LD1_1(ld1bhs, MO_16)
   5853 DO_LD1_1(ld1bsu, MO_32)
   5854 DO_LD1_1(ld1bss, MO_32)
   5855 DO_LD1_1(ld1bdu, MO_64)
   5856 DO_LD1_1(ld1bds, MO_64)
   5857 
   5858 DO_LD1_2(ld1hh,  MO_16, MO_16)
   5859 DO_LD1_2(ld1hsu, MO_32, MO_16)
   5860 DO_LD1_2(ld1hss, MO_32, MO_16)
   5861 DO_LD1_2(ld1hdu, MO_64, MO_16)
   5862 DO_LD1_2(ld1hds, MO_64, MO_16)
   5863 
   5864 DO_LD1_2(ld1ss,  MO_32, MO_32)
   5865 DO_LD1_2(ld1sdu, MO_64, MO_32)
   5866 DO_LD1_2(ld1sds, MO_64, MO_32)
   5867 
   5868 DO_LD1_2(ld1dd,  MO_64, MO_64)
   5869 
   5870 #undef DO_LD1_1
   5871 #undef DO_LD1_2
   5872 
   5873 #define DO_LDN_1(N)                                                     \
   5874 void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg,                \
   5875                              target_ulong addr, uint32_t desc)          \
   5876 {                                                                       \
   5877     sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0,           \
   5878               sve_ld1bb_host, sve_ld1bb_tlb);                           \
   5879 }                                                                       \
   5880 void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg,            \
   5881                                  target_ulong addr, uint32_t desc)      \
   5882 {                                                                       \
   5883     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N,          \
   5884                   sve_ld1bb_host, sve_ld1bb_tlb);                       \
   5885 }
   5886 
   5887 #define DO_LDN_2(N, SUFF, ESZ)                                          \
   5888 void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg,         \
   5889                                     target_ulong addr, uint32_t desc)   \
   5890 {                                                                       \
   5891     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0,             \
   5892               sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb);         \
   5893 }                                                                       \
   5894 void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg,         \
   5895                                     target_ulong addr, uint32_t desc)   \
   5896 {                                                                       \
   5897     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0,             \
   5898               sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb);         \
   5899 }                                                                       \
   5900 void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg,     \
   5901                                         target_ulong addr, uint32_t desc) \
   5902 {                                                                       \
   5903     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N,            \
   5904                   sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb);     \
   5905 }                                                                       \
   5906 void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg,     \
   5907                                         target_ulong addr, uint32_t desc) \
   5908 {                                                                       \
   5909     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N,            \
   5910                   sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb);     \
   5911 }
   5912 
   5913 DO_LDN_1(2)
   5914 DO_LDN_1(3)
   5915 DO_LDN_1(4)
   5916 
   5917 DO_LDN_2(2, hh, MO_16)
   5918 DO_LDN_2(3, hh, MO_16)
   5919 DO_LDN_2(4, hh, MO_16)
   5920 
   5921 DO_LDN_2(2, ss, MO_32)
   5922 DO_LDN_2(3, ss, MO_32)
   5923 DO_LDN_2(4, ss, MO_32)
   5924 
   5925 DO_LDN_2(2, dd, MO_64)
   5926 DO_LDN_2(3, dd, MO_64)
   5927 DO_LDN_2(4, dd, MO_64)
   5928 
   5929 #undef DO_LDN_1
   5930 #undef DO_LDN_2
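
         /*
          * For reference, DO_LDN_2(2, hh, MO_16) above provides, among
          * others, the little-endian non-MTE helper, roughly:
          *
          *   void helper_sve_ld2hh_le_r(CPUARMState *env, void *vg,
          *                              target_ulong addr, uint32_t desc)
          *   {
          *       sve_ldN_r(env, vg, addr, desc, GETPC(), MO_16, MO_16, 2, 0,
          *                 sve_ld1hh_le_host, sve_ld1hh_le_tlb);
          *   }
          *
          * i.e. an LD2H of two interleaved halfword registers, with
          * esz == msz == MO_16 and N == 2.
          */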
   5931 
   5932 /*
   5933  * Load contiguous data, first-fault and no-fault.
   5934  *
   5935  * For user-only, one could argue that we should hold the mmap_lock during
   5936  * the operation so that there is no race between page_check_range and the
   5937  * load operation.  However, unmapping pages out from under a running thread
   5938  * is extraordinarily unlikely.  This theoretical race condition also affects
   5939  * linux-user/ in its get_user/put_user macros.
   5940  *
   5941  * TODO: Construct some helpers, written in assembly, that interact with
   5942  * host_signal_handler to produce memory ops which can properly report errors
   5943  * without racing.
   5944  */
   5945 
   5946 /* Fault on byte I.  All bits in FFR from I are cleared.  The vector
   5947  * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
   5948  * option, which leaves subsequent data unchanged.
   5949  */
   5950 static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
   5951 {
   5952     uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
   5953 
   5954     if (i & 63) {
   5955         ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
   5956         i = ROUND_UP(i, 64);
   5957     }
   5958     for (; i < oprsz; i += 64) {
   5959         ffr[i / 64] = 0;
   5960     }
   5961 }
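
         /*
          * For example, with a 256-bit vector (oprsz == 32) and a fault
          * recorded at byte offset i == 20, FFR bits 0-19 are preserved,
          * bits 20-63 of the first word are cleared, and i rounds up to
          * 64 >= oprsz, so no further words need to be touched.
          */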
   5962 
   5963 /*
   5964  * Common helper for all contiguous no-fault and first-fault loads.
   5965  */
   5966 static inline QEMU_ALWAYS_INLINE
   5967 void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr,
   5968                    uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc,
   5969                    const int esz, const int msz, const SVEContFault fault,
   5970                    sve_ldst1_host_fn *host_fn,
   5971                    sve_ldst1_tlb_fn *tlb_fn)
   5972 {
   5973     const unsigned rd = simd_data(desc);
   5974     void *vd = &env->vfp.zregs[rd];
   5975     const intptr_t reg_max = simd_oprsz(desc);
   5976     intptr_t reg_off, mem_off, reg_last;
   5977     SVEContLdSt info;
   5978     int flags;
   5979     void *host;
   5980 
   5981     /* Find the active elements.  */
   5982     if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) {
   5983         /* The entire predicate was false; no load occurs.  */
   5984         memset(vd, 0, reg_max);
   5985         return;
   5986     }
   5987     reg_off = info.reg_off_first[0];
   5988 
   5989     /* Probe the page(s). */
   5990     if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) {
   5991         /* Fault on first element. */
   5992         tcg_debug_assert(fault == FAULT_NO);
   5993         memset(vd, 0, reg_max);
   5994         goto do_fault;
   5995     }
   5996 
   5997     mem_off = info.mem_off_first[0];
   5998     flags = info.page[0].flags;
   5999 
   6000     /*
   6001      * Disable MTE checking if the Tagged bit is not set.  Since TBI must
   6002      * be set within MTEDESC for MTE, !mtedesc => !mte_active.
   6003      */
   6004     if (!info.page[0].tagged) {
   6005         mtedesc = 0;
   6006     }
   6007 
   6008     if (fault == FAULT_FIRST) {
   6009         /* Trapping mte check for the first-fault element.  */
   6010         if (mtedesc) {
   6011             mte_check(env, mtedesc, addr + mem_off, retaddr);
   6012         }
   6013 
   6014         /*
   6015          * Special handling of the first active element,
   6016          * if it crosses a page boundary or is MMIO.
   6017          */
   6018         bool is_split = mem_off == info.mem_off_split;
   6019         if (unlikely(flags != 0) || unlikely(is_split)) {
   6020             /*
   6021              * Use the slow path for cross-page handling.
   6022              * Might trap for MMIO or watchpoints.
   6023              */
   6024             tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
   6025 
   6026             /* After any fault, zero the other elements. */
   6027             swap_memzero(vd, reg_off);
   6028             reg_off += 1 << esz;
   6029             mem_off += 1 << msz;
   6030             swap_memzero(vd + reg_off, reg_max - reg_off);
   6031 
   6032             if (is_split) {
   6033                 goto second_page;
   6034             }
   6035         } else {
   6036             memset(vd, 0, reg_max);
   6037         }
   6038     } else {
   6039         memset(vd, 0, reg_max);
   6040         if (unlikely(mem_off == info.mem_off_split)) {
   6041             /* The first active element crosses a page boundary. */
   6042             flags |= info.page[1].flags;
   6043             if (unlikely(flags & TLB_MMIO)) {
   6044                 /* Some page is MMIO, see below. */
   6045                 goto do_fault;
   6046             }
   6047             if (unlikely(flags & TLB_WATCHPOINT) &&
   6048                 (cpu_watchpoint_address_matches
   6049                  (env_cpu(env), addr + mem_off, 1 << msz)
   6050                  & BP_MEM_READ)) {
   6051                 /* Watchpoint hit, see below. */
   6052                 goto do_fault;
   6053             }
   6054             if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
   6055                 goto do_fault;
   6056             }
   6057             /*
   6058              * Use the slow path for cross-page handling.
   6059              * This is RAM, without a watchpoint, and will not trap.
   6060              */
   6061             tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
   6062             goto second_page;
   6063         }
   6064     }
   6065 
   6066     /*
   6067      * From this point on, all memory operations are MemSingleNF.
   6068      *
   6069      * Per the MemSingleNF pseudocode, a no-fault load from Device memory
   6070      * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead.
   6071      *
    6072      * Unfortunately we do not have access to the memory attributes from the
   6073      * PTE to tell Device memory from Normal memory.  So we make a mostly
   6074      * correct check, and indicate (UNKNOWN, FAULT) for any MMIO.
   6075      * This gives the right answer for the common cases of "Normal memory,
   6076      * backed by host RAM" and "Device memory, backed by MMIO".
   6077      * The architecture allows us to suppress an NF load and return
   6078      * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner
   6079      * case of "Normal memory, backed by MMIO" is permitted.  The case we
   6080      * get wrong is "Device memory, backed by host RAM", for which we
   6081      * should return (UNKNOWN, FAULT) for but do not.
   6082      *
   6083      * Similarly, CPU_BP breakpoints would raise exceptions, and so
   6084      * return (UNKNOWN, FAULT).  For simplicity, we consider gdb and
   6085      * architectural breakpoints the same.
   6086      */
   6087     if (unlikely(flags & TLB_MMIO)) {
   6088         goto do_fault;
   6089     }
   6090 
   6091     reg_last = info.reg_off_last[0];
   6092     host = info.page[0].host;
   6093 
   6094     do {
   6095         uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3));
   6096         do {
   6097             if ((pg >> (reg_off & 63)) & 1) {
   6098                 if (unlikely(flags & TLB_WATCHPOINT) &&
   6099                     (cpu_watchpoint_address_matches
   6100                      (env_cpu(env), addr + mem_off, 1 << msz)
   6101                      & BP_MEM_READ)) {
   6102                     goto do_fault;
   6103                 }
   6104                 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
   6105                     goto do_fault;
   6106                 }
   6107                 host_fn(vd, reg_off, host + mem_off);
   6108             }
   6109             reg_off += 1 << esz;
   6110             mem_off += 1 << msz;
   6111         } while (reg_off <= reg_last && (reg_off & 63));
   6112     } while (reg_off <= reg_last);
   6113 
   6114     /*
   6115      * MemSingleNF is allowed to fail for any reason.  We have special
   6116      * code above to handle the first element crossing a page boundary.
   6117      * As an implementation choice, decline to handle a cross-page element
   6118      * in any other position.
   6119      */
   6120     reg_off = info.reg_off_split;
   6121     if (reg_off >= 0) {
   6122         goto do_fault;
   6123     }
   6124 
   6125  second_page:
   6126     reg_off = info.reg_off_first[1];
   6127     if (likely(reg_off < 0)) {
   6128         /* No active elements on the second page.  All done. */
   6129         return;
   6130     }
   6131 
   6132     /*
   6133      * MemSingleNF is allowed to fail for any reason.  As an implementation
   6134      * choice, decline to handle elements on the second page.  This should
   6135      * be low frequency as the guest walks through memory -- the next
   6136      * iteration of the guest's loop should be aligned on the page boundary,
   6137      * and then all following iterations will stay aligned.
   6138      */
   6139 
   6140  do_fault:
   6141     record_fault(env, reg_off, reg_max);
   6142 }
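
         /*
          * To summarize the fault handling above: with FAULT_FIRST the
          * first active element is allowed to trap normally (MMIO,
          * watchpoint, MTE), and only later problem elements are folded
          * into the FFR via record_fault(); with FAULT_NO nothing may
          * trap, and any problem, including on the first active element,
          * is reported solely through the FFR.
          */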
   6143 
   6144 static inline QEMU_ALWAYS_INLINE
   6145 void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr,
   6146                        uint32_t desc, const uintptr_t retaddr,
   6147                        const int esz, const int msz, const SVEContFault fault,
   6148                        sve_ldst1_host_fn *host_fn,
   6149                        sve_ldst1_tlb_fn *tlb_fn)
   6150 {
   6151     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
   6152     int bit55 = extract64(addr, 55, 1);
   6153 
   6154     /* Remove mtedesc from the normal sve descriptor. */
   6155     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
   6156 
   6157     /* Perform gross MTE suppression early. */
   6158     if (!tbi_check(desc, bit55) ||
   6159         tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
   6160         mtedesc = 0;
   6161     }
   6162 
   6163     sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc,
   6164                   esz, msz, fault, host_fn, tlb_fn);
   6165 }
   6166 
   6167 #define DO_LDFF1_LDNF1_1(PART, ESZ)                                     \
   6168 void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg,            \
   6169                                  target_ulong addr, uint32_t desc)      \
   6170 {                                                                       \
   6171     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \
   6172                   sve_ld1##PART##_host, sve_ld1##PART##_tlb);           \
   6173 }                                                                       \
   6174 void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg,            \
   6175                                  target_ulong addr, uint32_t desc)      \
   6176 {                                                                       \
   6177     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \
   6178                   sve_ld1##PART##_host, sve_ld1##PART##_tlb);           \
   6179 }                                                                       \
   6180 void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg,        \
   6181                                      target_ulong addr, uint32_t desc)  \
   6182 {                                                                       \
   6183     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \
   6184                       sve_ld1##PART##_host, sve_ld1##PART##_tlb);       \
   6185 }                                                                       \
   6186 void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg,        \
   6187                                      target_ulong addr, uint32_t desc)  \
   6188 {                                                                       \
   6189     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \
    6190                       sve_ld1##PART##_host, sve_ld1##PART##_tlb);       \
   6191 }
   6192 
   6193 #define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ)                                \
   6194 void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg,         \
   6195                                     target_ulong addr, uint32_t desc)   \
   6196 {                                                                       \
   6197     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
   6198                   sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb);     \
   6199 }                                                                       \
   6200 void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg,         \
   6201                                     target_ulong addr, uint32_t desc)   \
   6202 {                                                                       \
   6203     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO,  \
   6204                   sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb);     \
   6205 }                                                                       \
   6206 void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg,         \
   6207                                     target_ulong addr, uint32_t desc)   \
   6208 {                                                                       \
   6209     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
   6210                   sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb);     \
   6211 }                                                                       \
   6212 void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg,         \
   6213                                     target_ulong addr, uint32_t desc)   \
   6214 {                                                                       \
   6215     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO,  \
   6216                   sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb);     \
   6217 }                                                                       \
   6218 void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg,     \
   6219                                         target_ulong addr, uint32_t desc) \
   6220 {                                                                       \
   6221     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
   6222                       sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
   6223 }                                                                       \
   6224 void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg,     \
   6225                                         target_ulong addr, uint32_t desc) \
   6226 {                                                                       \
   6227     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
   6228                       sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
   6229 }                                                                       \
   6230 void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg,     \
   6231                                         target_ulong addr, uint32_t desc) \
   6232 {                                                                       \
   6233     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
   6234                       sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
   6235 }                                                                       \
   6236 void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg,     \
   6237                                         target_ulong addr, uint32_t desc) \
   6238 {                                                                       \
   6239     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
   6240                       sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
   6241 }
   6242 
   6243 DO_LDFF1_LDNF1_1(bb,  MO_8)
   6244 DO_LDFF1_LDNF1_1(bhu, MO_16)
   6245 DO_LDFF1_LDNF1_1(bhs, MO_16)
   6246 DO_LDFF1_LDNF1_1(bsu, MO_32)
   6247 DO_LDFF1_LDNF1_1(bss, MO_32)
   6248 DO_LDFF1_LDNF1_1(bdu, MO_64)
   6249 DO_LDFF1_LDNF1_1(bds, MO_64)
   6250 
   6251 DO_LDFF1_LDNF1_2(hh,  MO_16, MO_16)
   6252 DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16)
   6253 DO_LDFF1_LDNF1_2(hss, MO_32, MO_16)
   6254 DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16)
   6255 DO_LDFF1_LDNF1_2(hds, MO_64, MO_16)
   6256 
   6257 DO_LDFF1_LDNF1_2(ss,  MO_32, MO_32)
   6258 DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32)
   6259 DO_LDFF1_LDNF1_2(sds, MO_64, MO_32)
   6260 
   6261 DO_LDFF1_LDNF1_2(dd,  MO_64, MO_64)
   6262 
   6263 #undef DO_LDFF1_LDNF1_1
   6264 #undef DO_LDFF1_LDNF1_2
   6265 
   6266 /*
   6267  * Common helper for all contiguous 1,2,3,4-register predicated stores.
   6268  */
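         /*
          * In outline: find the active elements and the page(s) they touch,
          * probe those pages so that any access fault is raised before memory
          * is modified, then perform the watchpoint and MTE checks.  If either
          * page is flagged (MMIO), every active element is stored via the tlb
          * path, which may still fault with SyncExternal.  Otherwise the
          * stores go through the host pointers, with an element that straddles
          * the page boundary handled separately via the tlb path.
          */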
   6269 
   6270 static inline QEMU_ALWAYS_INLINE
   6271 void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr,
   6272                uint32_t desc, const uintptr_t retaddr,
   6273                const int esz, const int msz, const int N, uint32_t mtedesc,
   6274                sve_ldst1_host_fn *host_fn,
   6275                sve_ldst1_tlb_fn *tlb_fn)
   6276 {
   6277     const unsigned rd = simd_data(desc);
   6278     const intptr_t reg_max = simd_oprsz(desc);
   6279     intptr_t reg_off, reg_last, mem_off;
   6280     SVEContLdSt info;
   6281     void *host;
   6282     int i, flags;
   6283 
   6284     /* Find the active elements.  */
   6285     if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
   6286         /* The entire predicate was false; no store occurs.  */
   6287         return;
   6288     }
   6289 
   6290     /* Probe the page(s).  Exit with exception for any invalid page. */
   6291     sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr);
   6292 
   6293     /* Handle watchpoints for all active elements. */
   6294     sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
   6295                               BP_MEM_WRITE, retaddr);
   6296 
   6297     /*
   6298      * Handle mte checks for all active elements.
   6299      * Since TBI must be set for MTE, !mtedesc => !mte_active.
   6300      */
   6301     if (mtedesc) {
   6302         sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
   6303                                 mtedesc, retaddr);
   6304     }
   6305 
   6306     flags = info.page[0].flags | info.page[1].flags;
   6307     if (unlikely(flags != 0)) {
   6308 #ifdef CONFIG_USER_ONLY
   6309         g_assert_not_reached();
   6310 #else
   6311         /*
   6312          * At least one page includes MMIO.
   6313          * Any bus operation can fail with cpu_transaction_failed,
   6314          * which for ARM will raise SyncExternal.  We cannot avoid
   6315          * this fault and will leave with the store incomplete.
   6316          */
   6317         mem_off = info.mem_off_first[0];
   6318         reg_off = info.reg_off_first[0];
   6319         reg_last = info.reg_off_last[1];
   6320         if (reg_last < 0) {
   6321             reg_last = info.reg_off_split;
   6322             if (reg_last < 0) {
   6323                 reg_last = info.reg_off_last[0];
   6324             }
   6325         }
   6326 
   6327         do {
   6328             uint64_t pg = vg[reg_off >> 6];
   6329             do {
   6330                 if ((pg >> (reg_off & 63)) & 1) {
   6331                     for (i = 0; i < N; ++i) {
   6332                         tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
   6333                                addr + mem_off + (i << msz), retaddr);
   6334                     }
   6335                 }
   6336                 reg_off += 1 << esz;
   6337                 mem_off += N << msz;
   6338             } while (reg_off & 63);
   6339         } while (reg_off <= reg_last);
   6340         return;
   6341 #endif
   6342     }
   6343 
   6344     mem_off = info.mem_off_first[0];
   6345     reg_off = info.reg_off_first[0];
   6346     reg_last = info.reg_off_last[0];
   6347     host = info.page[0].host;
   6348 
   6349     while (reg_off <= reg_last) {
   6350         uint64_t pg = vg[reg_off >> 6];
   6351         do {
   6352             if ((pg >> (reg_off & 63)) & 1) {
   6353                 for (i = 0; i < N; ++i) {
   6354                     host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
   6355                             host + mem_off + (i << msz));
   6356                 }
   6357             }
   6358             reg_off += 1 << esz;
   6359             mem_off += N << msz;
   6360         } while (reg_off <= reg_last && (reg_off & 63));
   6361     }
   6362 
   6363     /*
   6364      * Use the slow path to manage the cross-page misalignment.
   6365      * But we know this is RAM and cannot trap.
   6366      */
   6367     mem_off = info.mem_off_split;
   6368     if (unlikely(mem_off >= 0)) {
   6369         reg_off = info.reg_off_split;
   6370         for (i = 0; i < N; ++i) {
   6371             tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
   6372                    addr + mem_off + (i << msz), retaddr);
   6373         }
   6374     }
   6375 
   6376     mem_off = info.mem_off_first[1];
   6377     if (unlikely(mem_off >= 0)) {
   6378         reg_off = info.reg_off_first[1];
   6379         reg_last = info.reg_off_last[1];
   6380         host = info.page[1].host;
   6381 
   6382         do {
   6383             uint64_t pg = vg[reg_off >> 6];
   6384             do {
   6385                 if ((pg >> (reg_off & 63)) & 1) {
   6386                     for (i = 0; i < N; ++i) {
   6387                         host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
   6388                                 host + mem_off + (i << msz));
   6389                     }
   6390                 }
   6391                 reg_off += 1 << esz;
   6392                 mem_off += N << msz;
   6393             } while (reg_off & 63);
   6394         } while (reg_off <= reg_last);
   6395     }
   6396 }
   6397 
   6398 static inline QEMU_ALWAYS_INLINE
   6399 void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
   6400                    uint32_t desc, const uintptr_t ra,
   6401                    const int esz, const int msz, const int N,
   6402                    sve_ldst1_host_fn *host_fn,
   6403                    sve_ldst1_tlb_fn *tlb_fn)
   6404 {
   6405     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
   6406     int bit55 = extract64(addr, 55, 1);
   6407 
   6408     /* Remove mtedesc from the normal sve descriptor. */
   6409     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
   6410 
   6411     /* Perform gross MTE suppression early. */
   6412     if (!tbi_check(desc, bit55) ||
   6413         tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
   6414         mtedesc = 0;
   6415     }
   6416 
   6417     sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
   6418 }
   6419 
   6420 #define DO_STN_1(N, NAME, ESZ)                                          \
   6421 void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg,            \
   6422                                  target_ulong addr, uint32_t desc)      \
   6423 {                                                                       \
   6424     sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0,            \
   6425               sve_st1##NAME##_host, sve_st1##NAME##_tlb);               \
   6426 }                                                                       \
   6427 void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg,        \
   6428                                      target_ulong addr, uint32_t desc)  \
   6429 {                                                                       \
   6430     sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N,           \
   6431                   sve_st1##NAME##_host, sve_st1##NAME##_tlb);           \
   6432 }
   6433 
   6434 #define DO_STN_2(N, NAME, ESZ, MSZ)                                     \
   6435 void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg,         \
   6436                                     target_ulong addr, uint32_t desc)   \
   6437 {                                                                       \
   6438     sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0,             \
   6439               sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb);         \
   6440 }                                                                       \
   6441 void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg,         \
   6442                                     target_ulong addr, uint32_t desc)   \
   6443 {                                                                       \
   6444     sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0,             \
   6445               sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb);         \
   6446 }                                                                       \
   6447 void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg,     \
   6448                                         target_ulong addr, uint32_t desc) \
   6449 {                                                                       \
   6450     sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N,            \
   6451                   sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb);     \
   6452 }                                                                       \
   6453 void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg,     \
   6454                                         target_ulong addr, uint32_t desc) \
   6455 {                                                                       \
   6456     sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N,            \
   6457                   sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb);     \
   6458 }
   6459 
   6460 DO_STN_1(1, bb, MO_8)
   6461 DO_STN_1(1, bh, MO_16)
   6462 DO_STN_1(1, bs, MO_32)
   6463 DO_STN_1(1, bd, MO_64)
   6464 DO_STN_1(2, bb, MO_8)
   6465 DO_STN_1(3, bb, MO_8)
   6466 DO_STN_1(4, bb, MO_8)
   6467 
   6468 DO_STN_2(1, hh, MO_16, MO_16)
   6469 DO_STN_2(1, hs, MO_32, MO_16)
   6470 DO_STN_2(1, hd, MO_64, MO_16)
   6471 DO_STN_2(2, hh, MO_16, MO_16)
   6472 DO_STN_2(3, hh, MO_16, MO_16)
   6473 DO_STN_2(4, hh, MO_16, MO_16)
   6474 
   6475 DO_STN_2(1, ss, MO_32, MO_32)
   6476 DO_STN_2(1, sd, MO_64, MO_32)
   6477 DO_STN_2(2, ss, MO_32, MO_32)
   6478 DO_STN_2(3, ss, MO_32, MO_32)
   6479 DO_STN_2(4, ss, MO_32, MO_32)
   6480 
   6481 DO_STN_2(1, dd, MO_64, MO_64)
   6482 DO_STN_2(2, dd, MO_64, MO_64)
   6483 DO_STN_2(3, dd, MO_64, MO_64)
   6484 DO_STN_2(4, dd, MO_64, MO_64)
   6485 
   6486 #undef DO_STN_1
   6487 #undef DO_STN_2
   6488 
   6489 /*
   6490  * Loads with a vector index.
   6491  */
   6492 
   6493 /*
   6494  * Load the element at @reg + @reg_ofs, sign or zero-extend as needed.
   6495  */
   6496 typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs);
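         /*
          * The name encodes the form of the offset vector: "zsu" and "zss" are
          * 32-bit offsets, zero- and sign-extended respectively, while "zd" is
          * a full 64-bit offset.  The _s/_d suffix gives the size of the
          * vector element holding the offset, matching the 32-bit or 64-bit
          * form of the gather or scatter.
          */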
   6497 
   6498 static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs)
   6499 {
   6500     return *(uint32_t *)(reg + H1_4(reg_ofs));
   6501 }
   6502 
   6503 static target_ulong off_zss_s(void *reg, intptr_t reg_ofs)
   6504 {
   6505     return *(int32_t *)(reg + H1_4(reg_ofs));
   6506 }
   6507 
   6508 static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs)
   6509 {
   6510     return (uint32_t)*(uint64_t *)(reg + reg_ofs);
   6511 }
   6512 
   6513 static target_ulong off_zss_d(void *reg, intptr_t reg_ofs)
   6514 {
   6515     return (int32_t)*(uint64_t *)(reg + reg_ofs);
   6516 }
   6517 
   6518 static target_ulong off_zd_d(void *reg, intptr_t reg_ofs)
   6519 {
   6520     return *(uint64_t *)(reg + reg_ofs);
   6521 }
   6522 
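         /*
          * Common helper for all gather loads.  Each active element is loaded
          * into a zero-initialized scratch register, through the host pointer
          * when the element is in RAM and does not cross a page, and through
          * the tlb path otherwise.  The scratch is copied to the destination
          * only after every element has been loaded, so a trapping element
          * leaves the destination unmodified.
          */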
   6523 static inline QEMU_ALWAYS_INLINE
   6524 void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
   6525                target_ulong base, uint32_t desc, uintptr_t retaddr,
   6526                uint32_t mtedesc, int esize, int msize,
   6527                zreg_off_fn *off_fn,
   6528                sve_ldst1_host_fn *host_fn,
   6529                sve_ldst1_tlb_fn *tlb_fn)
   6530 {
   6531     const int mmu_idx = cpu_mmu_index(env, false);
   6532     const intptr_t reg_max = simd_oprsz(desc);
   6533     const int scale = simd_data(desc);
   6534     ARMVectorReg scratch;
   6535     intptr_t reg_off;
   6536     SVEHostPage info, info2;
   6537 
   6538     memset(&scratch, 0, reg_max);
   6539     reg_off = 0;
   6540     do {
   6541         uint64_t pg = vg[reg_off >> 6];
   6542         do {
   6543             if (likely(pg & 1)) {
   6544                 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
   6545                 target_ulong in_page = -(addr | TARGET_PAGE_MASK);
   6546 
   6547                 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD,
   6548                                mmu_idx, retaddr);
   6549 
   6550                 if (likely(in_page >= msize)) {
   6551                     if (unlikely(info.flags & TLB_WATCHPOINT)) {
   6552                         cpu_check_watchpoint(env_cpu(env), addr, msize,
   6553                                              info.attrs, BP_MEM_READ, retaddr);
   6554                     }
   6555                     if (mtedesc && info.tagged) {
   6556                         mte_check(env, mtedesc, addr, retaddr);
   6557                     }
   6558                     if (unlikely(info.flags & TLB_MMIO)) {
   6559                         tlb_fn(env, &scratch, reg_off, addr, retaddr);
   6560                     } else {
   6561                         host_fn(&scratch, reg_off, info.host);
   6562                     }
   6563                 } else {
   6564                     /* Element crosses the page boundary. */
   6565                     sve_probe_page(&info2, false, env, addr + in_page, 0,
   6566                                    MMU_DATA_LOAD, mmu_idx, retaddr);
   6567                     if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) {
   6568                         cpu_check_watchpoint(env_cpu(env), addr,
   6569                                              msize, info.attrs,
   6570                                              BP_MEM_READ, retaddr);
   6571                     }
   6572                     if (mtedesc && info.tagged) {
   6573                         mte_check(env, mtedesc, addr, retaddr);
   6574                     }
   6575                     tlb_fn(env, &scratch, reg_off, addr, retaddr);
   6576                 }
   6577             }
   6578             reg_off += esize;
   6579             pg >>= esize;
   6580         } while (reg_off & 63);
   6581     } while (reg_off < reg_max);
   6582 
   6583     /* Wait until all exceptions have been raised to write back.  */
   6584     memcpy(vd, &scratch, reg_max);
   6585 }
   6586 
   6587 static inline QEMU_ALWAYS_INLINE
   6588 void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
   6589                    target_ulong base, uint32_t desc, uintptr_t retaddr,
   6590                    int esize, int msize, zreg_off_fn *off_fn,
   6591                    sve_ldst1_host_fn *host_fn,
   6592                    sve_ldst1_tlb_fn *tlb_fn)
   6593 {
   6594     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
   6595     /* Remove mtedesc from the normal sve descriptor. */
   6596     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
   6597 
   6598     /*
   6599      * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
   6600      * offset base entirely over the address space hole to change the
   6601      * pointer tag, or change the bit55 selector.  So we could here
   6602      * examine TBI + TCMA like we do for sve_ldN_r_mte().
   6603      */
   6604     sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
   6605               esize, msize, off_fn, host_fn, tlb_fn);
   6606 }
   6607 
   6608 #define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \
   6609 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,       \
   6610                                  void *vm, target_ulong base, uint32_t desc) \
   6611 {                                                                            \
   6612     sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ,          \
   6613               off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);       \
   6614 }                                                                            \
   6615 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
   6616      void *vm, target_ulong base, uint32_t desc)                             \
   6617 {                                                                            \
   6618     sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ,         \
   6619                   off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);   \
   6620 }
   6621 
   6622 #define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \
   6623 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,       \
   6624                                  void *vm, target_ulong base, uint32_t desc) \
   6625 {                                                                            \
   6626     sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ,          \
   6627               off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);       \
   6628 }                                                                            \
   6629 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
   6630     void *vm, target_ulong base, uint32_t desc)                              \
   6631 {                                                                            \
   6632     sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ,         \
   6633                   off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);   \
   6634 }
   6635 
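         /*
          * MEM names the memory element and its extension, e.g. "bdu" is a
          * byte zero-extended to a doubleword element and "hss" a halfword
          * sign-extended to a word, with _le/_be selecting the endian-specific
          * helpers for multi-byte accesses.  OFS selects one of the offset
          * forms above.
          */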
   6636 DO_LD1_ZPZ_S(bsu, zsu, MO_8)
   6637 DO_LD1_ZPZ_S(bsu, zss, MO_8)
   6638 DO_LD1_ZPZ_D(bdu, zsu, MO_8)
   6639 DO_LD1_ZPZ_D(bdu, zss, MO_8)
   6640 DO_LD1_ZPZ_D(bdu, zd, MO_8)
   6641 
   6642 DO_LD1_ZPZ_S(bss, zsu, MO_8)
   6643 DO_LD1_ZPZ_S(bss, zss, MO_8)
   6644 DO_LD1_ZPZ_D(bds, zsu, MO_8)
   6645 DO_LD1_ZPZ_D(bds, zss, MO_8)
   6646 DO_LD1_ZPZ_D(bds, zd, MO_8)
   6647 
   6648 DO_LD1_ZPZ_S(hsu_le, zsu, MO_16)
   6649 DO_LD1_ZPZ_S(hsu_le, zss, MO_16)
   6650 DO_LD1_ZPZ_D(hdu_le, zsu, MO_16)
   6651 DO_LD1_ZPZ_D(hdu_le, zss, MO_16)
   6652 DO_LD1_ZPZ_D(hdu_le, zd, MO_16)
   6653 
   6654 DO_LD1_ZPZ_S(hsu_be, zsu, MO_16)
   6655 DO_LD1_ZPZ_S(hsu_be, zss, MO_16)
   6656 DO_LD1_ZPZ_D(hdu_be, zsu, MO_16)
   6657 DO_LD1_ZPZ_D(hdu_be, zss, MO_16)
   6658 DO_LD1_ZPZ_D(hdu_be, zd, MO_16)
   6659 
   6660 DO_LD1_ZPZ_S(hss_le, zsu, MO_16)
   6661 DO_LD1_ZPZ_S(hss_le, zss, MO_16)
   6662 DO_LD1_ZPZ_D(hds_le, zsu, MO_16)
   6663 DO_LD1_ZPZ_D(hds_le, zss, MO_16)
   6664 DO_LD1_ZPZ_D(hds_le, zd, MO_16)
   6665 
   6666 DO_LD1_ZPZ_S(hss_be, zsu, MO_16)
   6667 DO_LD1_ZPZ_S(hss_be, zss, MO_16)
   6668 DO_LD1_ZPZ_D(hds_be, zsu, MO_16)
   6669 DO_LD1_ZPZ_D(hds_be, zss, MO_16)
   6670 DO_LD1_ZPZ_D(hds_be, zd, MO_16)
   6671 
   6672 DO_LD1_ZPZ_S(ss_le, zsu, MO_32)
   6673 DO_LD1_ZPZ_S(ss_le, zss, MO_32)
   6674 DO_LD1_ZPZ_D(sdu_le, zsu, MO_32)
   6675 DO_LD1_ZPZ_D(sdu_le, zss, MO_32)
   6676 DO_LD1_ZPZ_D(sdu_le, zd, MO_32)
   6677 
   6678 DO_LD1_ZPZ_S(ss_be, zsu, MO_32)
   6679 DO_LD1_ZPZ_S(ss_be, zss, MO_32)
   6680 DO_LD1_ZPZ_D(sdu_be, zsu, MO_32)
   6681 DO_LD1_ZPZ_D(sdu_be, zss, MO_32)
   6682 DO_LD1_ZPZ_D(sdu_be, zd, MO_32)
   6683 
   6684 DO_LD1_ZPZ_D(sds_le, zsu, MO_32)
   6685 DO_LD1_ZPZ_D(sds_le, zss, MO_32)
   6686 DO_LD1_ZPZ_D(sds_le, zd, MO_32)
   6687 
   6688 DO_LD1_ZPZ_D(sds_be, zsu, MO_32)
   6689 DO_LD1_ZPZ_D(sds_be, zss, MO_32)
   6690 DO_LD1_ZPZ_D(sds_be, zd, MO_32)
   6691 
   6692 DO_LD1_ZPZ_D(dd_le, zsu, MO_64)
   6693 DO_LD1_ZPZ_D(dd_le, zss, MO_64)
   6694 DO_LD1_ZPZ_D(dd_le, zd, MO_64)
   6695 
   6696 DO_LD1_ZPZ_D(dd_be, zsu, MO_64)
   6697 DO_LD1_ZPZ_D(dd_be, zss, MO_64)
   6698 DO_LD1_ZPZ_D(dd_be, zd, MO_64)
   6699 
   6700 #undef DO_LD1_ZPZ_S
   6701 #undef DO_LD1_ZPZ_D
   6702 
   6703 /* First fault loads with a vector index.  */
   6704 
   6705 /*
   6706  * Common helpers for all gather first-faulting loads.
   6707  */
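         /*
          * The first active element is loaded with a normal, faulting access;
          * the remaining active elements are loaded only when a non-faulting
          * probe succeeds.  Any problem with a later element (an unmapped or
          * MMIO page, a page-crossing element, a watchpoint hit, or an MTE
          * mismatch) stops the loop, and record_fault() clears the FFR from
          * that element onward.
          */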
   6708 
   6709 static inline QEMU_ALWAYS_INLINE
   6710 void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
   6711                  target_ulong base, uint32_t desc, uintptr_t retaddr,
   6712                  uint32_t mtedesc, const int esz, const int msz,
   6713                  zreg_off_fn *off_fn,
   6714                  sve_ldst1_host_fn *host_fn,
   6715                  sve_ldst1_tlb_fn *tlb_fn)
   6716 {
   6717     const int mmu_idx = cpu_mmu_index(env, false);
   6718     const intptr_t reg_max = simd_oprsz(desc);
   6719     const int scale = simd_data(desc);
   6720     const int esize = 1 << esz;
   6721     const int msize = 1 << msz;
   6722     intptr_t reg_off;
   6723     SVEHostPage info;
   6724     target_ulong addr, in_page;
   6725 
   6726     /* Skip to the first true predicate.  */
   6727     reg_off = find_next_active(vg, 0, reg_max, esz);
   6728     if (unlikely(reg_off >= reg_max)) {
   6729         /* The entire predicate was false; no load occurs.  */
   6730         memset(vd, 0, reg_max);
   6731         return;
   6732     }
   6733 
   6734     /*
   6735      * Probe the first element, allowing faults.
   6736      */
   6737     addr = base + (off_fn(vm, reg_off) << scale);
   6738     if (mtedesc) {
   6739         mte_check(env, mtedesc, addr, retaddr);
   6740     }
   6741     tlb_fn(env, vd, reg_off, addr, retaddr);
   6742 
    6743     /* First element loaded; zero the rest so any unloaded elements read as zero. */
   6744     swap_memzero(vd, reg_off);
   6745     reg_off += esize;
   6746     swap_memzero(vd + reg_off, reg_max - reg_off);
   6747 
   6748     /*
   6749      * Probe the remaining elements, not allowing faults.
   6750      */
   6751     while (reg_off < reg_max) {
   6752         uint64_t pg = vg[reg_off >> 6];
   6753         do {
   6754             if (likely((pg >> (reg_off & 63)) & 1)) {
   6755                 addr = base + (off_fn(vm, reg_off) << scale);
   6756                 in_page = -(addr | TARGET_PAGE_MASK);
   6757 
   6758                 if (unlikely(in_page < msize)) {
   6759                     /* Stop if the element crosses a page boundary. */
   6760                     goto fault;
   6761                 }
   6762 
   6763                 sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD,
   6764                                mmu_idx, retaddr);
   6765                 if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) {
   6766                     goto fault;
   6767                 }
   6768                 if (unlikely(info.flags & TLB_WATCHPOINT) &&
   6769                     (cpu_watchpoint_address_matches
   6770                      (env_cpu(env), addr, msize) & BP_MEM_READ)) {
   6771                     goto fault;
   6772                 }
   6773                 if (mtedesc && info.tagged && !mte_probe(env, mtedesc, addr)) {
   6774                     goto fault;
   6775                 }
   6776 
   6777                 host_fn(vd, reg_off, info.host);
   6778             }
   6779             reg_off += esize;
   6780         } while (reg_off & 63);
   6781     }
   6782     return;
   6783 
   6784  fault:
   6785     record_fault(env, reg_off, reg_max);
   6786 }
   6787 
   6788 static inline QEMU_ALWAYS_INLINE
   6789 void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
   6790                      target_ulong base, uint32_t desc, uintptr_t retaddr,
   6791                      const int esz, const int msz,
   6792                      zreg_off_fn *off_fn,
   6793                      sve_ldst1_host_fn *host_fn,
   6794                      sve_ldst1_tlb_fn *tlb_fn)
   6795 {
   6796     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
   6797     /* Remove mtedesc from the normal sve descriptor. */
   6798     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
   6799 
   6800     /*
   6801      * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
   6802      * offset base entirely over the address space hole to change the
   6803      * pointer tag, or change the bit55 selector.  So we could here
   6804      * examine TBI + TCMA like we do for sve_ldN_r_mte().
   6805      */
   6806     sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
   6807                 esz, msz, off_fn, host_fn, tlb_fn);
   6808 }
   6809 
   6810 #define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ)                                   \
   6811 void HELPER(sve_ldff##MEM##_##OFS)                                      \
   6812     (CPUARMState *env, void *vd, void *vg,                              \
   6813      void *vm, target_ulong base, uint32_t desc)                        \
   6814 {                                                                       \
   6815     sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ,    \
   6816                 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
   6817 }                                                                       \
   6818 void HELPER(sve_ldff##MEM##_##OFS##_mte)                                \
   6819     (CPUARMState *env, void *vd, void *vg,                              \
   6820      void *vm, target_ulong base, uint32_t desc)                        \
   6821 {                                                                       \
   6822     sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ,   \
   6823                     off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
   6824 }
   6825 
   6826 #define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ)                                   \
   6827 void HELPER(sve_ldff##MEM##_##OFS)                                      \
   6828     (CPUARMState *env, void *vd, void *vg,                              \
   6829      void *vm, target_ulong base, uint32_t desc)                        \
   6830 {                                                                       \
   6831     sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ,    \
   6832                 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
   6833 }                                                                       \
   6834 void HELPER(sve_ldff##MEM##_##OFS##_mte)                                \
   6835     (CPUARMState *env, void *vd, void *vg,                              \
   6836      void *vm, target_ulong base, uint32_t desc)                        \
   6837 {                                                                       \
   6838     sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ,   \
   6839                     off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
   6840 }
   6841 
   6842 DO_LDFF1_ZPZ_S(bsu, zsu, MO_8)
   6843 DO_LDFF1_ZPZ_S(bsu, zss, MO_8)
   6844 DO_LDFF1_ZPZ_D(bdu, zsu, MO_8)
   6845 DO_LDFF1_ZPZ_D(bdu, zss, MO_8)
   6846 DO_LDFF1_ZPZ_D(bdu, zd, MO_8)
   6847 
   6848 DO_LDFF1_ZPZ_S(bss, zsu, MO_8)
   6849 DO_LDFF1_ZPZ_S(bss, zss, MO_8)
   6850 DO_LDFF1_ZPZ_D(bds, zsu, MO_8)
   6851 DO_LDFF1_ZPZ_D(bds, zss, MO_8)
   6852 DO_LDFF1_ZPZ_D(bds, zd, MO_8)
   6853 
   6854 DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16)
   6855 DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16)
   6856 DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16)
   6857 DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16)
   6858 DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16)
   6859 
   6860 DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16)
   6861 DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16)
   6862 DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16)
   6863 DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16)
   6864 DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16)
   6865 
   6866 DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16)
   6867 DO_LDFF1_ZPZ_S(hss_le, zss, MO_16)
   6868 DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16)
   6869 DO_LDFF1_ZPZ_D(hds_le, zss, MO_16)
   6870 DO_LDFF1_ZPZ_D(hds_le, zd, MO_16)
   6871 
   6872 DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16)
   6873 DO_LDFF1_ZPZ_S(hss_be, zss, MO_16)
   6874 DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16)
   6875 DO_LDFF1_ZPZ_D(hds_be, zss, MO_16)
   6876 DO_LDFF1_ZPZ_D(hds_be, zd, MO_16)
   6877 
   6878 DO_LDFF1_ZPZ_S(ss_le,  zsu, MO_32)
   6879 DO_LDFF1_ZPZ_S(ss_le,  zss, MO_32)
   6880 DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32)
   6881 DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32)
   6882 DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32)
   6883 
   6884 DO_LDFF1_ZPZ_S(ss_be,  zsu, MO_32)
   6885 DO_LDFF1_ZPZ_S(ss_be,  zss, MO_32)
   6886 DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32)
   6887 DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32)
   6888 DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32)
   6889 
   6890 DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32)
   6891 DO_LDFF1_ZPZ_D(sds_le, zss, MO_32)
   6892 DO_LDFF1_ZPZ_D(sds_le, zd, MO_32)
   6893 
   6894 DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32)
   6895 DO_LDFF1_ZPZ_D(sds_be, zss, MO_32)
   6896 DO_LDFF1_ZPZ_D(sds_be, zd, MO_32)
   6897 
   6898 DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64)
   6899 DO_LDFF1_ZPZ_D(dd_le, zss, MO_64)
   6900 DO_LDFF1_ZPZ_D(dd_le, zd, MO_64)
   6901 
   6902 DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64)
   6903 DO_LDFF1_ZPZ_D(dd_be, zss, MO_64)
   6904 DO_LDFF1_ZPZ_D(dd_be, zd, MO_64)
   6905 
   6906 /* Stores with a vector index.  */
   6907 
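         /*
          * Scatter stores are done in two passes: the first pass probes every
          * active element, taking any recoverable fault and performing the
          * watchpoint and MTE checks before memory is modified; the second
          * pass performs the stores, through the recorded host pointer where
          * one was found and through the tlb path otherwise.
          */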
   6908 static inline QEMU_ALWAYS_INLINE
   6909 void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
   6910                target_ulong base, uint32_t desc, uintptr_t retaddr,
   6911                uint32_t mtedesc, int esize, int msize,
   6912                zreg_off_fn *off_fn,
   6913                sve_ldst1_host_fn *host_fn,
   6914                sve_ldst1_tlb_fn *tlb_fn)
   6915 {
   6916     const int mmu_idx = cpu_mmu_index(env, false);
   6917     const intptr_t reg_max = simd_oprsz(desc);
   6918     const int scale = simd_data(desc);
   6919     void *host[ARM_MAX_VQ * 4];
   6920     intptr_t reg_off, i;
   6921     SVEHostPage info, info2;
   6922 
   6923     /*
   6924      * Probe all of the elements for host addresses and flags.
   6925      */
   6926     i = reg_off = 0;
   6927     do {
   6928         uint64_t pg = vg[reg_off >> 6];
   6929         do {
   6930             target_ulong addr = base + (off_fn(vm, reg_off) << scale);
   6931             target_ulong in_page = -(addr | TARGET_PAGE_MASK);
   6932 
   6933             host[i] = NULL;
   6934             if (likely((pg >> (reg_off & 63)) & 1)) {
   6935                 if (likely(in_page >= msize)) {
   6936                     sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE,
   6937                                    mmu_idx, retaddr);
   6938                     if (!(info.flags & TLB_MMIO)) {
   6939                         host[i] = info.host;
   6940                     }
   6941                 } else {
   6942                     /*
   6943                      * Element crosses the page boundary.
   6944                      * Probe both pages, but do not record the host address,
   6945                      * so that we use the slow path.
   6946                      */
   6947                     sve_probe_page(&info, false, env, addr, 0,
   6948                                    MMU_DATA_STORE, mmu_idx, retaddr);
   6949                     sve_probe_page(&info2, false, env, addr + in_page, 0,
   6950                                    MMU_DATA_STORE, mmu_idx, retaddr);
   6951                     info.flags |= info2.flags;
   6952                 }
   6953 
   6954                 if (unlikely(info.flags & TLB_WATCHPOINT)) {
   6955                     cpu_check_watchpoint(env_cpu(env), addr, msize,
   6956                                          info.attrs, BP_MEM_WRITE, retaddr);
   6957                 }
   6958 
   6959                 if (mtedesc && info.tagged) {
   6960                     mte_check(env, mtedesc, addr, retaddr);
   6961                 }
   6962             }
   6963             i += 1;
   6964             reg_off += esize;
   6965         } while (reg_off & 63);
   6966     } while (reg_off < reg_max);
   6967 
   6968     /*
   6969      * Now that we have recognized all exceptions except SyncExternal
   6970      * (from TLB_MMIO), which we cannot avoid, perform all of the stores.
   6971      *
   6972      * Note for the common case of an element in RAM, not crossing a page
   6973      * boundary, we have stored the host address in host[].  This doubles
   6974      * as a first-level check against the predicate, since only enabled
   6975      * elements have non-null host addresses.
   6976      */
   6977     i = reg_off = 0;
   6978     do {
   6979         void *h = host[i];
   6980         if (likely(h != NULL)) {
   6981             host_fn(vd, reg_off, h);
   6982         } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) {
   6983             target_ulong addr = base + (off_fn(vm, reg_off) << scale);
   6984             tlb_fn(env, vd, reg_off, addr, retaddr);
   6985         }
   6986         i += 1;
   6987         reg_off += esize;
   6988     } while (reg_off < reg_max);
   6989 }
   6990 
   6991 static inline QEMU_ALWAYS_INLINE
   6992 void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
   6993                    target_ulong base, uint32_t desc, uintptr_t retaddr,
   6994                    int esize, int msize, zreg_off_fn *off_fn,
   6995                    sve_ldst1_host_fn *host_fn,
   6996                    sve_ldst1_tlb_fn *tlb_fn)
   6997 {
   6998     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
   6999     /* Remove mtedesc from the normal sve descriptor. */
   7000     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
   7001 
   7002     /*
   7003      * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
   7004      * offset base entirely over the address space hole to change the
   7005      * pointer tag, or change the bit55 selector.  So we could here
   7006      * examine TBI + TCMA like we do for sve_ldN_r_mte().
   7007      */
   7008     sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
   7009               esize, msize, off_fn, host_fn, tlb_fn);
   7010 }
   7011 
   7012 #define DO_ST1_ZPZ_S(MEM, OFS, MSZ)                                     \
   7013 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,  \
   7014                                  void *vm, target_ulong base, uint32_t desc) \
   7015 {                                                                       \
   7016     sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ,     \
   7017               off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb);  \
   7018 }                                                                       \
   7019 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
   7020     void *vm, target_ulong base, uint32_t desc)                         \
   7021 {                                                                       \
   7022     sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ,    \
   7023                   off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
   7024 }
   7025 
   7026 #define DO_ST1_ZPZ_D(MEM, OFS, MSZ)                                     \
   7027 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,  \
   7028                                  void *vm, target_ulong base, uint32_t desc) \
   7029 {                                                                       \
   7030     sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ,     \
   7031               off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb);  \
   7032 }                                                                       \
   7033 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
   7034     void *vm, target_ulong base, uint32_t desc)                         \
   7035 {                                                                       \
   7036     sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ,    \
   7037                   off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
   7038 }
   7039 
   7040 DO_ST1_ZPZ_S(bs, zsu, MO_8)
   7041 DO_ST1_ZPZ_S(hs_le, zsu, MO_16)
   7042 DO_ST1_ZPZ_S(hs_be, zsu, MO_16)
   7043 DO_ST1_ZPZ_S(ss_le, zsu, MO_32)
   7044 DO_ST1_ZPZ_S(ss_be, zsu, MO_32)
   7045 
   7046 DO_ST1_ZPZ_S(bs, zss, MO_8)
   7047 DO_ST1_ZPZ_S(hs_le, zss, MO_16)
   7048 DO_ST1_ZPZ_S(hs_be, zss, MO_16)
   7049 DO_ST1_ZPZ_S(ss_le, zss, MO_32)
   7050 DO_ST1_ZPZ_S(ss_be, zss, MO_32)
   7051 
   7052 DO_ST1_ZPZ_D(bd, zsu, MO_8)
   7053 DO_ST1_ZPZ_D(hd_le, zsu, MO_16)
   7054 DO_ST1_ZPZ_D(hd_be, zsu, MO_16)
   7055 DO_ST1_ZPZ_D(sd_le, zsu, MO_32)
   7056 DO_ST1_ZPZ_D(sd_be, zsu, MO_32)
   7057 DO_ST1_ZPZ_D(dd_le, zsu, MO_64)
   7058 DO_ST1_ZPZ_D(dd_be, zsu, MO_64)
   7059 
   7060 DO_ST1_ZPZ_D(bd, zss, MO_8)
   7061 DO_ST1_ZPZ_D(hd_le, zss, MO_16)
   7062 DO_ST1_ZPZ_D(hd_be, zss, MO_16)
   7063 DO_ST1_ZPZ_D(sd_le, zss, MO_32)
   7064 DO_ST1_ZPZ_D(sd_be, zss, MO_32)
   7065 DO_ST1_ZPZ_D(dd_le, zss, MO_64)
   7066 DO_ST1_ZPZ_D(dd_be, zss, MO_64)
   7067 
   7068 DO_ST1_ZPZ_D(bd, zd, MO_8)
   7069 DO_ST1_ZPZ_D(hd_le, zd, MO_16)
   7070 DO_ST1_ZPZ_D(hd_be, zd, MO_16)
   7071 DO_ST1_ZPZ_D(sd_le, zd, MO_32)
   7072 DO_ST1_ZPZ_D(sd_be, zd, MO_32)
   7073 DO_ST1_ZPZ_D(dd_le, zd, MO_64)
   7074 DO_ST1_ZPZ_D(dd_be, zd, MO_64)
   7075 
   7076 #undef DO_ST1_ZPZ_S
   7077 #undef DO_ST1_ZPZ_D
   7078 
   7079 void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
   7080 {
   7081     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
   7082     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
   7083 
   7084     for (i = 0; i < opr_sz; ++i) {
   7085         d[i] = n[i] ^ m[i] ^ k[i];
   7086     }
   7087 }
   7088 
   7089 void HELPER(sve2_bcax)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
   7090 {
   7091     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
   7092     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
   7093 
   7094     for (i = 0; i < opr_sz; ++i) {
   7095         d[i] = n[i] ^ (m[i] & ~k[i]);
   7096     }
   7097 }
   7098 
   7099 void HELPER(sve2_bsl1n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
   7100 {
   7101     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
   7102     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
   7103 
   7104     for (i = 0; i < opr_sz; ++i) {
   7105         d[i] = (~n[i] & k[i]) | (m[i] & ~k[i]);
   7106     }
   7107 }
   7108 
   7109 void HELPER(sve2_bsl2n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
   7110 {
   7111     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
   7112     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
   7113 
   7114     for (i = 0; i < opr_sz; ++i) {
   7115         d[i] = (n[i] & k[i]) | (~m[i] & ~k[i]);
   7116     }
   7117 }
   7118 
   7119 void HELPER(sve2_nbsl)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
   7120 {
   7121     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
   7122     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
   7123 
   7124     for (i = 0; i < opr_sz; ++i) {
   7125         d[i] = ~((n[i] & k[i]) | (m[i] & ~k[i]));
   7126     }
   7127 }
   7128 
   7129 /*
    7130  * Returns true if any element of m0 or m1 equals the low 8/16 bits of n.
   7131  * See hasless(v,1) from
   7132  *   https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
   7133  */
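         /*
          * In more detail: cmp = dup(n) ^ m is zero in exactly the lanes of m
          * that equal the low bits of n.  For a zero lane, subtracting the
          * per-lane 1 borrows into that lane's sign bit while ~cmp leaves the
          * bit set.  When no lane is zero, no borrow is ever generated, so a
          * lane can only have its sign bit set after the subtract if it had it
          * set already, and ~cmp then masks it off.  Hence (cmp0 | cmp1) &
          * signs is nonzero exactly when some lane of m0 or m1 matches n.
          */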
   7134 static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz)
   7135 {
   7136     int bits = 8 << esz;
   7137     uint64_t ones = dup_const(esz, 1);
   7138     uint64_t signs = ones << (bits - 1);
   7139     uint64_t cmp0, cmp1;
   7140 
   7141     cmp1 = dup_const(esz, n);
   7142     cmp0 = cmp1 ^ m0;
   7143     cmp1 = cmp1 ^ m1;
   7144     cmp0 = (cmp0 - ones) & ~cmp0;
   7145     cmp1 = (cmp1 - ones) & ~cmp1;
   7146     return (cmp0 | cmp1) & signs;
   7147 }
   7148 
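         /*
          * Common helper for MATCH and NMATCH.  Within each 16-byte segment,
          * the destination predicate bit for an active element is set when
          * that element of vn occurs anywhere in the corresponding segment of
          * vm (inverted for NMATCH), and the PredTest flags are accumulated.
          */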
   7149 static inline uint32_t do_match(void *vd, void *vn, void *vm, void *vg,
   7150                                 uint32_t desc, int esz, bool nmatch)
   7151 {
   7152     uint16_t esz_mask = pred_esz_masks[esz];
   7153     intptr_t opr_sz = simd_oprsz(desc);
   7154     uint32_t flags = PREDTEST_INIT;
   7155     intptr_t i, j, k;
   7156 
   7157     for (i = 0; i < opr_sz; i += 16) {
   7158         uint64_t m0 = *(uint64_t *)(vm + i);
   7159         uint64_t m1 = *(uint64_t *)(vm + i + 8);
   7160         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)) & esz_mask;
   7161         uint16_t out = 0;
   7162 
   7163         for (j = 0; j < 16; j += 8) {
   7164             uint64_t n = *(uint64_t *)(vn + i + j);
   7165 
   7166             for (k = 0; k < 8; k += 1 << esz) {
   7167                 if (pg & (1 << (j + k))) {
   7168                     bool o = do_match2(n >> (k * 8), m0, m1, esz);
   7169                     out |= (o ^ nmatch) << (j + k);
   7170                 }
   7171             }
   7172         }
   7173         *(uint16_t *)(vd + H1_2(i >> 3)) = out;
   7174         flags = iter_predtest_fwd(out, pg, flags);
   7175     }
   7176     return flags;
   7177 }
   7178 
   7179 #define DO_PPZZ_MATCH(NAME, ESZ, INV)                                         \
   7180 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
   7181 {                                                                             \
   7182     return do_match(vd, vn, vm, vg, desc, ESZ, INV);                          \
   7183 }
   7184 
   7185 DO_PPZZ_MATCH(sve2_match_ppzz_b, MO_8, false)
   7186 DO_PPZZ_MATCH(sve2_match_ppzz_h, MO_16, false)
   7187 
   7188 DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true)
   7189 DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true)
   7190 
   7191 #undef DO_PPZZ_MATCH
   7192 
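         /*
          * HISTCNT: for each active element i, count the active elements at or
          * below i whose value in vm equals element i of vn.  Inactive
          * elements of the destination are written as zero.
          */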
   7193 void HELPER(sve2_histcnt_s)(void *vd, void *vn, void *vm, void *vg,
   7194                             uint32_t desc)
   7195 {
   7196     ARMVectorReg scratch;
   7197     intptr_t i, j;
   7198     intptr_t opr_sz = simd_oprsz(desc);
   7199     uint32_t *d = vd, *n = vn, *m = vm;
   7200     uint8_t *pg = vg;
   7201 
   7202     if (d == n) {
   7203         n = memcpy(&scratch, n, opr_sz);
   7204         if (d == m) {
   7205             m = n;
   7206         }
   7207     } else if (d == m) {
   7208         m = memcpy(&scratch, m, opr_sz);
   7209     }
   7210 
   7211     for (i = 0; i < opr_sz; i += 4) {
   7212         uint64_t count = 0;
   7213         uint8_t pred;
   7214 
   7215         pred = pg[H1(i >> 3)] >> (i & 7);
   7216         if (pred & 1) {
   7217             uint32_t nn = n[H4(i >> 2)];
   7218 
   7219             for (j = 0; j <= i; j += 4) {
   7220                 pred = pg[H1(j >> 3)] >> (j & 7);
   7221                 if ((pred & 1) && nn == m[H4(j >> 2)]) {
   7222                     ++count;
   7223                 }
   7224             }
   7225         }
   7226         d[H4(i >> 2)] = count;
   7227     }
   7228 }
   7229 
   7230 void HELPER(sve2_histcnt_d)(void *vd, void *vn, void *vm, void *vg,
   7231                             uint32_t desc)
   7232 {
   7233     ARMVectorReg scratch;
   7234     intptr_t i, j;
   7235     intptr_t opr_sz = simd_oprsz(desc);
   7236     uint64_t *d = vd, *n = vn, *m = vm;
   7237     uint8_t *pg = vg;
   7238 
   7239     if (d == n) {
   7240         n = memcpy(&scratch, n, opr_sz);
   7241         if (d == m) {
   7242             m = n;
   7243         }
   7244     } else if (d == m) {
   7245         m = memcpy(&scratch, m, opr_sz);
   7246     }
   7247 
   7248     for (i = 0; i < opr_sz / 8; ++i) {
   7249         uint64_t count = 0;
   7250         if (pg[H1(i)] & 1) {
   7251             uint64_t nn = n[i];
   7252             for (j = 0; j <= i; ++j) {
   7253                 if ((pg[H1(j)] & 1) && nn == m[j]) {
   7254                     ++count;
   7255                 }
   7256             }
   7257         }
   7258         d[i] = count;
   7259     }
   7260 }
   7261 
   7262 /*
   7263  * Returns the number of bytes in m0 and m1 that match n.
    7264  * Unlike do_match2, we need an exact count, not just true/false.
   7265  * This requires two extra logical operations.
   7266  */
   7267 static inline uint64_t do_histseg_cnt(uint8_t n, uint64_t m0, uint64_t m1)
   7268 {
   7269     const uint64_t mask = dup_const(MO_8, 0x7f);
   7270     uint64_t cmp0, cmp1;
   7271 
   7272     cmp1 = dup_const(MO_8, n);
   7273     cmp0 = cmp1 ^ m0;
   7274     cmp1 = cmp1 ^ m1;
   7275 
   7276     /*
   7277      * 1: clear msb of each byte to avoid carry to next byte (& mask)
   7278      * 2: carry in to msb if byte != 0 (+ mask)
   7279      * 3: set msb if cmp has msb set (| cmp)
   7280      * 4: set ~msb to ignore them (| mask)
   7281      * We now have 0xff for byte != 0 or 0x7f for byte == 0.
   7282      * 5: invert, resulting in 0x80 if and only if byte == 0.
   7283      */
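             /*
              * Because the msb of each byte is cleared before the add, the sum
              * cannot carry across byte boundaries, so every byte is computed
              * independently: exactly one 0x80 per zero (i.e. matching) byte,
              * which is what makes the popcount below an exact count.
              */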
   7284     cmp0 = ~(((cmp0 & mask) + mask) | cmp0 | mask);
   7285     cmp1 = ~(((cmp1 & mask) + mask) | cmp1 | mask);
   7286 
   7287     /*
   7288      * Combine the two compares in a way that the bits do
   7289      * not overlap, and so preserves the count of set bits.
   7290      * If the host has an efficient instruction for ctpop,
   7291      * then ctpop(x) + ctpop(y) has the same number of
   7292      * operations as ctpop(x | (y >> 1)).  If the host does
   7293      * not have an efficient ctpop, then we only want to
   7294      * use it once.
   7295      */
   7296     return ctpop64(cmp0 | (cmp1 >> 1));
   7297 }
   7298 
   7299 void HELPER(sve2_histseg)(void *vd, void *vn, void *vm, uint32_t desc)
   7300 {
   7301     intptr_t i, j;
   7302     intptr_t opr_sz = simd_oprsz(desc);
   7303 
   7304     for (i = 0; i < opr_sz; i += 16) {
   7305         uint64_t n0 = *(uint64_t *)(vn + i);
   7306         uint64_t m0 = *(uint64_t *)(vm + i);
   7307         uint64_t n1 = *(uint64_t *)(vn + i + 8);
   7308         uint64_t m1 = *(uint64_t *)(vm + i + 8);
   7309         uint64_t out0 = 0;
   7310         uint64_t out1 = 0;
   7311 
   7312         for (j = 0; j < 64; j += 8) {
   7313             uint64_t cnt0 = do_histseg_cnt(n0 >> j, m0, m1);
   7314             uint64_t cnt1 = do_histseg_cnt(n1 >> j, m0, m1);
   7315             out0 |= cnt0 << j;
   7316             out1 |= cnt1 << j;
   7317         }
   7318 
   7319         *(uint64_t *)(vd + i) = out0;
   7320         *(uint64_t *)(vd + i + 8) = out1;
   7321     }
   7322 }
   7323 
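         /*
          * XAR: exclusive-or the two inputs and rotate each element right by
          * the immediate.  For 8- and 16-bit elements the rotate is performed
          * on whole 64-bit lanes with a mask, since there is no sub-word
          * rotate primitive; the 32-bit form can use ror32 directly.
          */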
   7324 void HELPER(sve2_xar_b)(void *vd, void *vn, void *vm, uint32_t desc)
   7325 {
   7326     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
   7327     int shr = simd_data(desc);
   7328     int shl = 8 - shr;
   7329     uint64_t mask = dup_const(MO_8, 0xff >> shr);
   7330     uint64_t *d = vd, *n = vn, *m = vm;
   7331 
   7332     for (i = 0; i < opr_sz; ++i) {
   7333         uint64_t t = n[i] ^ m[i];
   7334         d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
   7335     }
   7336 }
   7337 
   7338 void HELPER(sve2_xar_h)(void *vd, void *vn, void *vm, uint32_t desc)
   7339 {
   7340     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
   7341     int shr = simd_data(desc);
   7342     int shl = 16 - shr;
   7343     uint64_t mask = dup_const(MO_16, 0xffff >> shr);
   7344     uint64_t *d = vd, *n = vn, *m = vm;
   7345 
   7346     for (i = 0; i < opr_sz; ++i) {
   7347         uint64_t t = n[i] ^ m[i];
   7348         d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
   7349     }
   7350 }
   7351 
   7352 void HELPER(sve2_xar_s)(void *vd, void *vn, void *vm, uint32_t desc)
   7353 {
   7354     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
   7355     int shr = simd_data(desc);
   7356     uint32_t *d = vd, *n = vn, *m = vm;
   7357 
   7358     for (i = 0; i < opr_sz; ++i) {
   7359         d[i] = ror32(n[i] ^ m[i], shr);
   7360     }
   7361 }
   7362 
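         /*
          * FMMLA: each 128-bit (single) or 256-bit (double) segment holds a
          * row-major 2x2 matrix; the result is A + N * M^T, written out below
          * as the four explicit sums of products.
          */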
   7363 void HELPER(fmmla_s)(void *vd, void *vn, void *vm, void *va,
   7364                      void *status, uint32_t desc)
   7365 {
   7366     intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float32) * 4);
   7367 
   7368     for (s = 0; s < opr_sz; ++s) {
   7369         float32 *n = vn + s * sizeof(float32) * 4;
   7370         float32 *m = vm + s * sizeof(float32) * 4;
   7371         float32 *a = va + s * sizeof(float32) * 4;
   7372         float32 *d = vd + s * sizeof(float32) * 4;
   7373         float32 n00 = n[H4(0)], n01 = n[H4(1)];
   7374         float32 n10 = n[H4(2)], n11 = n[H4(3)];
   7375         float32 m00 = m[H4(0)], m01 = m[H4(1)];
   7376         float32 m10 = m[H4(2)], m11 = m[H4(3)];
   7377         float32 p0, p1;
   7378 
   7379         /* i = 0, j = 0 */
   7380         p0 = float32_mul(n00, m00, status);
   7381         p1 = float32_mul(n01, m01, status);
   7382         d[H4(0)] = float32_add(a[H4(0)], float32_add(p0, p1, status), status);
   7383 
   7384         /* i = 0, j = 1 */
   7385         p0 = float32_mul(n00, m10, status);
   7386         p1 = float32_mul(n01, m11, status);
   7387         d[H4(1)] = float32_add(a[H4(1)], float32_add(p0, p1, status), status);
   7388 
   7389         /* i = 1, j = 0 */
   7390         p0 = float32_mul(n10, m00, status);
   7391         p1 = float32_mul(n11, m01, status);
   7392         d[H4(2)] = float32_add(a[H4(2)], float32_add(p0, p1, status), status);
   7393 
   7394         /* i = 1, j = 1 */
   7395         p0 = float32_mul(n10, m10, status);
   7396         p1 = float32_mul(n11, m11, status);
   7397         d[H4(3)] = float32_add(a[H4(3)], float32_add(p0, p1, status), status);
   7398     }
   7399 }
   7400 
   7401 void HELPER(fmmla_d)(void *vd, void *vn, void *vm, void *va,
   7402                      void *status, uint32_t desc)
   7403 {
   7404     intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float64) * 4);
   7405 
   7406     for (s = 0; s < opr_sz; ++s) {
   7407         float64 *n = vn + s * sizeof(float64) * 4;
   7408         float64 *m = vm + s * sizeof(float64) * 4;
   7409         float64 *a = va + s * sizeof(float64) * 4;
   7410         float64 *d = vd + s * sizeof(float64) * 4;
   7411         float64 n00 = n[0], n01 = n[1], n10 = n[2], n11 = n[3];
   7412         float64 m00 = m[0], m01 = m[1], m10 = m[2], m11 = m[3];
   7413         float64 p0, p1;
   7414 
   7415         /* i = 0, j = 0 */
   7416         p0 = float64_mul(n00, m00, status);
   7417         p1 = float64_mul(n01, m01, status);
   7418         d[0] = float64_add(a[0], float64_add(p0, p1, status), status);
   7419 
   7420         /* i = 0, j = 1 */
   7421         p0 = float64_mul(n00, m10, status);
   7422         p1 = float64_mul(n01, m11, status);
   7423         d[1] = float64_add(a[1], float64_add(p0, p1, status), status);
   7424 
   7425         /* i = 1, j = 0 */
   7426         p0 = float64_mul(n10, m00, status);
   7427         p1 = float64_mul(n11, m01, status);
   7428         d[2] = float64_add(a[2], float64_add(p0, p1, status), status);
   7429 
   7430         /* i = 1, j = 1 */
   7431         p0 = float64_mul(n10, m10, status);
   7432         p1 = float64_mul(n11, m11, status);
   7433         d[3] = float64_add(a[3], float64_add(p0, p1, status), status);
   7434     }
   7435 }
   7436 
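         /*
          * FCVTNT / BFCVTNT: convert each active wide element and store the
          * narrowed result in the top half of the corresponding destination
          * slot; the bottom halves of vd are left unchanged.
          */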
   7437 #define DO_FCVTNT(NAME, TYPEW, TYPEN, HW, HN, OP)                             \
   7438 void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc)  \
   7439 {                                                                             \
   7440     intptr_t i = simd_oprsz(desc);                                            \
   7441     uint64_t *g = vg;                                                         \
   7442     do {                                                                      \
   7443         uint64_t pg = g[(i - 1) >> 6];                                        \
   7444         do {                                                                  \
   7445             i -= sizeof(TYPEW);                                               \
   7446             if (likely((pg >> (i & 63)) & 1)) {                               \
   7447                 TYPEW nn = *(TYPEW *)(vn + HW(i));                            \
   7448                 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, status);      \
   7449             }                                                                 \
   7450         } while (i & 63);                                                     \
   7451     } while (i != 0);                                                         \
   7452 }
   7453 
   7454 DO_FCVTNT(sve_bfcvtnt,    uint32_t, uint16_t, H1_4, H1_2, float32_to_bfloat16)
   7455 DO_FCVTNT(sve2_fcvtnt_sh, uint32_t, uint16_t, H1_4, H1_2, sve_f32_to_f16)
   7456 DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t, H1_8, H1_4, float64_to_float32)
   7457 
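         /*
          * FCVTLT: the widening converse, reading the narrow value from the
          * top half of each wide slot and expanding it to fill the whole
          * destination element.
          */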
   7458 #define DO_FCVTLT(NAME, TYPEW, TYPEN, HW, HN, OP)                             \
   7459 void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc)  \
   7460 {                                                                             \
   7461     intptr_t i = simd_oprsz(desc);                                            \
   7462     uint64_t *g = vg;                                                         \
   7463     do {                                                                      \
   7464         uint64_t pg = g[(i - 1) >> 6];                                        \
   7465         do {                                                                  \
   7466             i -= sizeof(TYPEW);                                               \
   7467             if (likely((pg >> (i & 63)) & 1)) {                               \
   7468                 TYPEN nn = *(TYPEN *)(vn + HN(i + sizeof(TYPEN)));            \
   7469                 *(TYPEW *)(vd + HW(i)) = OP(nn, status);                      \
   7470             }                                                                 \
   7471         } while (i & 63);                                                     \
   7472     } while (i != 0);                                                         \
   7473 }
   7474 
   7475 DO_FCVTLT(sve2_fcvtlt_hs, uint32_t, uint16_t, H1_4, H1_2, sve_f16_to_f32)
   7476 DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, H1_8, H1_4, float32_to_float64)
   7477 
   7478 #undef DO_FCVTLT
   7479 #undef DO_FCVTNT