qemu

FORK: QEMU emulator
git clone https://git.neptards.moe/neptards/qemu.git
Log | Files | Refs | Submodules | LICENSE

sme_helper.c (43633B)


      1 /*
      2  * ARM SME Operations
      3  *
      4  * Copyright (c) 2022 Linaro, Ltd.
      5  *
      6  * This library is free software; you can redistribute it and/or
      7  * modify it under the terms of the GNU Lesser General Public
      8  * License as published by the Free Software Foundation; either
      9  * version 2.1 of the License, or (at your option) any later version.
     10  *
     11  * This library is distributed in the hope that it will be useful,
     12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
     13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     14  * Lesser General Public License for more details.
     15  *
     16  * You should have received a copy of the GNU Lesser General Public
     17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
     18  */
     19 
     20 #include "qemu/osdep.h"
     21 #include "cpu.h"
     22 #include "internals.h"
     23 #include "tcg/tcg-gvec-desc.h"
     24 #include "exec/helper-proto.h"
     25 #include "exec/cpu_ldst.h"
     26 #include "exec/exec-all.h"
     27 #include "qemu/int128.h"
     28 #include "fpu/softfloat.h"
     29 #include "vec_internal.h"
     30 #include "sve_ldst_internal.h"
     31 
     32 /* ResetSVEState */
     33 void arm_reset_sve_state(CPUARMState *env)
     34 {
     35     memset(env->vfp.zregs, 0, sizeof(env->vfp.zregs));
     36     /* Recall that FFR is stored as pregs[16]. */
     37     memset(env->vfp.pregs, 0, sizeof(env->vfp.pregs));
     38     vfp_set_fpcr(env, 0x0800009f);
     39 }
     40 
     41 void helper_set_pstate_sm(CPUARMState *env, uint32_t i)
     42 {
     43     if (i == FIELD_EX64(env->svcr, SVCR, SM)) {
     44         return;
     45     }
     46     env->svcr ^= R_SVCR_SM_MASK;
     47     arm_reset_sve_state(env);
     48 }
     49 
     50 void helper_set_pstate_za(CPUARMState *env, uint32_t i)
     51 {
     52     if (i == FIELD_EX64(env->svcr, SVCR, ZA)) {
     53         return;
     54     }
     55     env->svcr ^= R_SVCR_ZA_MASK;
     56 
     57     /*
     58      * ResetSMEState.
     59      *
     60      * SetPSTATE_ZA zeros on enable and disable.  We can zero this only
     61      * on enable: while disabled, the storage is inaccessible and the
     62      * value does not matter.  We're not saving the storage in vmstate
     63      * when disabled either.
     64      */
     65     if (i) {
     66         memset(env->zarray, 0, sizeof(env->zarray));
     67     }
     68 }
     69 
     70 void helper_sme_zero(CPUARMState *env, uint32_t imm, uint32_t svl)
     71 {
     72     uint32_t i;
     73 
     74     /*
     75      * Special case clearing the entire ZA space.
     76      * This falls into the CONSTRAINED UNPREDICTABLE zeroing of any
     77      * parts of the ZA storage outside of SVL.
     78      */
     79     if (imm == 0xff) {
     80         memset(env->zarray, 0, sizeof(env->zarray));
     81         return;
     82     }
     83 
     84     /*
     85      * Recall that ZAnH.D[m] is spread across ZA[n+8*m],
     86      * so each row is discontiguous within ZA[].
     87      */
     88     for (i = 0; i < svl; i++) {
     89         if (imm & (1 << (i % 8))) {
     90             memset(&env->zarray[i], 0, svl);
     91         }
     92     }
     93 }
     94 
     95 
/*
 * When considering the ZA storage as an array of elements of
 * type T, the index within that array of the Nth element of
 * a vertical slice of a tile can be calculated like this,
 * regardless of the size of type T. This is because the tiles
 * are interleaved, so if type T is size N bytes then row 1 of
 * the tile is N rows away from row 0. The division by N to
 * convert a byte offset into an array index and the multiplication
 * by N to convert from vslice-index-within-the-tile to
 * the index within the ZA storage cancel out.
 *
 * Use this form when indexing a typed pointer (ptr[tile_vslice_index(i)]).
 */
#define tile_vslice_index(i) ((i) * sizeof(ARMVectorReg))

/*
 * When doing byte arithmetic on the ZA storage, the element
 * byteoff bytes away in a tile vertical slice is always this
 * many bytes away in the ZA storage, regardless of the
 * size of the tile element, assuming that byteoff is a multiple
 * of the element size. Again this is because of the interleaving
 * of the tiles. For instance if we have 1 byte per element then
 * each row of the ZA storage has one byte of the vslice data,
 * and (counting from 0) byte 8 goes in row 8 of the storage
 * at offset (8 * row-size-in-bytes).
 * If we have 8 bytes per element then each row of the ZA storage
 * has 8 bytes of the data, but there are 8 interleaved tiles and
 * so byte 8 of the data goes into row 1 of the tile,
 * which is again row 8 of the storage, so the offset is still
 * (8 * row-size-in-bytes). Similarly for other element sizes.
 *
 * Use this form when adding to a byte (void *) address.  The expansion
 * is the same expression as tile_vslice_index(); the two names only
 * document whether the argument is an element index or a byte offset.
 */
#define tile_vslice_offset(byteoff) ((byteoff) * sizeof(ARMVectorReg))
    126 
    127 
    128 /*
    129  * Move Zreg vector to ZArray column.
    130  */
    131 #define DO_MOVA_C(NAME, TYPE, H)                                        \
    132 void HELPER(NAME)(void *za, void *vn, void *vg, uint32_t desc)          \
    133 {                                                                       \
    134     int i, oprsz = simd_oprsz(desc);                                    \
    135     for (i = 0; i < oprsz; ) {                                          \
    136         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
    137         do {                                                            \
    138             if (pg & 1) {                                               \
    139                 *(TYPE *)(za + tile_vslice_offset(i)) = *(TYPE *)(vn + H(i)); \
    140             }                                                           \
    141             i += sizeof(TYPE);                                          \
    142             pg >>= sizeof(TYPE);                                        \
    143         } while (i & 15);                                               \
    144     }                                                                   \
    145 }
    146 
    147 DO_MOVA_C(sme_mova_cz_b, uint8_t, H1)
    148 DO_MOVA_C(sme_mova_cz_h, uint16_t, H1_2)
    149 DO_MOVA_C(sme_mova_cz_s, uint32_t, H1_4)
    150 
    151 void HELPER(sme_mova_cz_d)(void *za, void *vn, void *vg, uint32_t desc)
    152 {
    153     int i, oprsz = simd_oprsz(desc) / 8;
    154     uint8_t *pg = vg;
    155     uint64_t *n = vn;
    156     uint64_t *a = za;
    157 
    158     for (i = 0; i < oprsz; i++) {
    159         if (pg[H1(i)] & 1) {
    160             a[tile_vslice_index(i)] = n[i];
    161         }
    162     }
    163 }
    164 
    165 void HELPER(sme_mova_cz_q)(void *za, void *vn, void *vg, uint32_t desc)
    166 {
    167     int i, oprsz = simd_oprsz(desc) / 16;
    168     uint16_t *pg = vg;
    169     Int128 *n = vn;
    170     Int128 *a = za;
    171 
    172     /*
    173      * Int128 is used here simply to copy 16 bytes, and to simplify
    174      * the address arithmetic.
    175      */
    176     for (i = 0; i < oprsz; i++) {
    177         if (pg[H2(i)] & 1) {
    178             a[tile_vslice_index(i)] = n[i];
    179         }
    180     }
    181 }
    182 
    183 #undef DO_MOVA_C
    184 
    185 /*
    186  * Move ZArray column to Zreg vector.
    187  */
    188 #define DO_MOVA_Z(NAME, TYPE, H)                                        \
    189 void HELPER(NAME)(void *vd, void *za, void *vg, uint32_t desc)          \
    190 {                                                                       \
    191     int i, oprsz = simd_oprsz(desc);                                    \
    192     for (i = 0; i < oprsz; ) {                                          \
    193         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
    194         do {                                                            \
    195             if (pg & 1) {                                               \
    196                 *(TYPE *)(vd + H(i)) = *(TYPE *)(za + tile_vslice_offset(i)); \
    197             }                                                           \
    198             i += sizeof(TYPE);                                          \
    199             pg >>= sizeof(TYPE);                                        \
    200         } while (i & 15);                                               \
    201     }                                                                   \
    202 }
    203 
    204 DO_MOVA_Z(sme_mova_zc_b, uint8_t, H1)
    205 DO_MOVA_Z(sme_mova_zc_h, uint16_t, H1_2)
    206 DO_MOVA_Z(sme_mova_zc_s, uint32_t, H1_4)
    207 
    208 void HELPER(sme_mova_zc_d)(void *vd, void *za, void *vg, uint32_t desc)
    209 {
    210     int i, oprsz = simd_oprsz(desc) / 8;
    211     uint8_t *pg = vg;
    212     uint64_t *d = vd;
    213     uint64_t *a = za;
    214 
    215     for (i = 0; i < oprsz; i++) {
    216         if (pg[H1(i)] & 1) {
    217             d[i] = a[tile_vslice_index(i)];
    218         }
    219     }
    220 }
    221 
    222 void HELPER(sme_mova_zc_q)(void *vd, void *za, void *vg, uint32_t desc)
    223 {
    224     int i, oprsz = simd_oprsz(desc) / 16;
    225     uint16_t *pg = vg;
    226     Int128 *d = vd;
    227     Int128 *a = za;
    228 
    229     /*
    230      * Int128 is used here simply to copy 16 bytes, and to simplify
    231      * the address arithmetic.
    232      */
    233     for (i = 0; i < oprsz; i++, za += sizeof(ARMVectorReg)) {
    234         if (pg[H2(i)] & 1) {
    235             d[i] = a[tile_vslice_index(i)];
    236         }
    237     }
    238 }
    239 
    240 #undef DO_MOVA_Z
    241 
    242 /*
    243  * Clear elements in a tile slice comprising len bytes.
    244  */
    245 
    246 typedef void ClearFn(void *ptr, size_t off, size_t len);
    247 
    248 static void clear_horizontal(void *ptr, size_t off, size_t len)
    249 {
    250     memset(ptr + off, 0, len);
    251 }
    252 
    253 static void clear_vertical_b(void *vptr, size_t off, size_t len)
    254 {
    255     for (size_t i = 0; i < len; ++i) {
    256         *(uint8_t *)(vptr + tile_vslice_offset(i + off)) = 0;
    257     }
    258 }
    259 
    260 static void clear_vertical_h(void *vptr, size_t off, size_t len)
    261 {
    262     for (size_t i = 0; i < len; i += 2) {
    263         *(uint16_t *)(vptr + tile_vslice_offset(i + off)) = 0;
    264     }
    265 }
    266 
    267 static void clear_vertical_s(void *vptr, size_t off, size_t len)
    268 {
    269     for (size_t i = 0; i < len; i += 4) {
    270         *(uint32_t *)(vptr + tile_vslice_offset(i + off)) = 0;
    271     }
    272 }
    273 
    274 static void clear_vertical_d(void *vptr, size_t off, size_t len)
    275 {
    276     for (size_t i = 0; i < len; i += 8) {
    277         *(uint64_t *)(vptr + tile_vslice_offset(i + off)) = 0;
    278     }
    279 }
    280 
    281 static void clear_vertical_q(void *vptr, size_t off, size_t len)
    282 {
    283     for (size_t i = 0; i < len; i += 16) {
    284         memset(vptr + tile_vslice_offset(i + off), 0, 16);
    285     }
    286 }
    287 
    288 /*
    289  * Copy elements from an array into a tile slice comprising len bytes.
    290  */
    291 
    292 typedef void CopyFn(void *dst, const void *src, size_t len);
    293 
    294 static void copy_horizontal(void *dst, const void *src, size_t len)
    295 {
    296     memcpy(dst, src, len);
    297 }
    298 
    299 static void copy_vertical_b(void *vdst, const void *vsrc, size_t len)
    300 {
    301     const uint8_t *src = vsrc;
    302     uint8_t *dst = vdst;
    303     size_t i;
    304 
    305     for (i = 0; i < len; ++i) {
    306         dst[tile_vslice_index(i)] = src[i];
    307     }
    308 }
    309 
    310 static void copy_vertical_h(void *vdst, const void *vsrc, size_t len)
    311 {
    312     const uint16_t *src = vsrc;
    313     uint16_t *dst = vdst;
    314     size_t i;
    315 
    316     for (i = 0; i < len / 2; ++i) {
    317         dst[tile_vslice_index(i)] = src[i];
    318     }
    319 }
    320 
    321 static void copy_vertical_s(void *vdst, const void *vsrc, size_t len)
    322 {
    323     const uint32_t *src = vsrc;
    324     uint32_t *dst = vdst;
    325     size_t i;
    326 
    327     for (i = 0; i < len / 4; ++i) {
    328         dst[tile_vslice_index(i)] = src[i];
    329     }
    330 }
    331 
    332 static void copy_vertical_d(void *vdst, const void *vsrc, size_t len)
    333 {
    334     const uint64_t *src = vsrc;
    335     uint64_t *dst = vdst;
    336     size_t i;
    337 
    338     for (i = 0; i < len / 8; ++i) {
    339         dst[tile_vslice_index(i)] = src[i];
    340     }
    341 }
    342 
    343 static void copy_vertical_q(void *vdst, const void *vsrc, size_t len)
    344 {
    345     for (size_t i = 0; i < len; i += 16) {
    346         memcpy(vdst + tile_vslice_offset(i), vsrc + i, 16);
    347     }
    348 }
    349 
    350 /*
    351  * Host and TLB primitives for vertical tile slice addressing.
    352  */
    353 
    354 #define DO_LD(NAME, TYPE, HOST, TLB)                                        \
    355 static inline void sme_##NAME##_v_host(void *za, intptr_t off, void *host)  \
    356 {                                                                           \
    357     TYPE val = HOST(host);                                                  \
    358     *(TYPE *)(za + tile_vslice_offset(off)) = val;                          \
    359 }                                                                           \
    360 static inline void sme_##NAME##_v_tlb(CPUARMState *env, void *za,           \
    361                         intptr_t off, target_ulong addr, uintptr_t ra)      \
    362 {                                                                           \
    363     TYPE val = TLB(env, useronly_clean_ptr(addr), ra);                      \
    364     *(TYPE *)(za + tile_vslice_offset(off)) = val;                          \
    365 }
    366 
    367 #define DO_ST(NAME, TYPE, HOST, TLB)                                        \
    368 static inline void sme_##NAME##_v_host(void *za, intptr_t off, void *host)  \
    369 {                                                                           \
    370     TYPE val = *(TYPE *)(za + tile_vslice_offset(off));                     \
    371     HOST(host, val);                                                        \
    372 }                                                                           \
    373 static inline void sme_##NAME##_v_tlb(CPUARMState *env, void *za,           \
    374                         intptr_t off, target_ulong addr, uintptr_t ra)      \
    375 {                                                                           \
    376     TYPE val = *(TYPE *)(za + tile_vslice_offset(off));                     \
    377     TLB(env, useronly_clean_ptr(addr), val, ra);                            \
    378 }
    379 
    380 /*
    381  * The ARMVectorReg elements are stored in host-endian 64-bit units.
    382  * For 128-bit quantities, the sequence defined by the Elem[] pseudocode
    383  * corresponds to storing the two 64-bit pieces in little-endian order.
    384  */
    385 #define DO_LDQ(HNAME, VNAME, BE, HOST, TLB)                                 \
    386 static inline void HNAME##_host(void *za, intptr_t off, void *host)         \
    387 {                                                                           \
    388     uint64_t val0 = HOST(host), val1 = HOST(host + 8);                      \
    389     uint64_t *ptr = za + off;                                               \
    390     ptr[0] = BE ? val1 : val0, ptr[1] = BE ? val0 : val1;                   \
    391 }                                                                           \
    392 static inline void VNAME##_v_host(void *za, intptr_t off, void *host)       \
    393 {                                                                           \
    394     HNAME##_host(za, tile_vslice_offset(off), host);                        \
    395 }                                                                           \
    396 static inline void HNAME##_tlb(CPUARMState *env, void *za, intptr_t off,    \
    397                                target_ulong addr, uintptr_t ra)             \
    398 {                                                                           \
    399     uint64_t val0 = TLB(env, useronly_clean_ptr(addr), ra);                 \
    400     uint64_t val1 = TLB(env, useronly_clean_ptr(addr + 8), ra);             \
    401     uint64_t *ptr = za + off;                                               \
    402     ptr[0] = BE ? val1 : val0, ptr[1] = BE ? val0 : val1;                   \
    403 }                                                                           \
    404 static inline void VNAME##_v_tlb(CPUARMState *env, void *za, intptr_t off,  \
    405                                target_ulong addr, uintptr_t ra)             \
    406 {                                                                           \
    407     HNAME##_tlb(env, za, tile_vslice_offset(off), addr, ra);                \
    408 }
    409 
    410 #define DO_STQ(HNAME, VNAME, BE, HOST, TLB)                                 \
    411 static inline void HNAME##_host(void *za, intptr_t off, void *host)         \
    412 {                                                                           \
    413     uint64_t *ptr = za + off;                                               \
    414     HOST(host, ptr[BE]);                                                    \
    415     HOST(host + 1, ptr[!BE]);                                               \
    416 }                                                                           \
    417 static inline void VNAME##_v_host(void *za, intptr_t off, void *host)       \
    418 {                                                                           \
    419     HNAME##_host(za, tile_vslice_offset(off), host);                        \
    420 }                                                                           \
    421 static inline void HNAME##_tlb(CPUARMState *env, void *za, intptr_t off,    \
    422                                target_ulong addr, uintptr_t ra)             \
    423 {                                                                           \
    424     uint64_t *ptr = za + off;                                               \
    425     TLB(env, useronly_clean_ptr(addr), ptr[BE], ra);                        \
    426     TLB(env, useronly_clean_ptr(addr + 8), ptr[!BE], ra);                   \
    427 }                                                                           \
    428 static inline void VNAME##_v_tlb(CPUARMState *env, void *za, intptr_t off,  \
    429                                target_ulong addr, uintptr_t ra)             \
    430 {                                                                           \
    431     HNAME##_tlb(env, za, tile_vslice_offset(off), addr, ra);                \
    432 }
    433 
/* Vertical-slice load primitives: sme_ld1<size>[_be|_le]_v_{host,tlb}. */
DO_LD(ld1b, uint8_t, ldub_p, cpu_ldub_data_ra)
DO_LD(ld1h_be, uint16_t, lduw_be_p, cpu_lduw_be_data_ra)
DO_LD(ld1h_le, uint16_t, lduw_le_p, cpu_lduw_le_data_ra)
DO_LD(ld1s_be, uint32_t, ldl_be_p, cpu_ldl_be_data_ra)
DO_LD(ld1s_le, uint32_t, ldl_le_p, cpu_ldl_le_data_ra)
DO_LD(ld1d_be, uint64_t, ldq_be_p, cpu_ldq_be_data_ra)
DO_LD(ld1d_le, uint64_t, ldq_le_p, cpu_ldq_le_data_ra)

/*
 * 128-bit loads: these define both the horizontal sve_ld1qq_* primitives
 * and the vertical sme_ld1q_*_v_* wrappers around them.
 */
DO_LDQ(sve_ld1qq_be, sme_ld1q_be, 1, ldq_be_p, cpu_ldq_be_data_ra)
DO_LDQ(sve_ld1qq_le, sme_ld1q_le, 0, ldq_le_p, cpu_ldq_le_data_ra)

/* Vertical-slice store primitives: sme_st1<size>[_be|_le]_v_{host,tlb}. */
DO_ST(st1b, uint8_t, stb_p, cpu_stb_data_ra)
DO_ST(st1h_be, uint16_t, stw_be_p, cpu_stw_be_data_ra)
DO_ST(st1h_le, uint16_t, stw_le_p, cpu_stw_le_data_ra)
DO_ST(st1s_be, uint32_t, stl_be_p, cpu_stl_be_data_ra)
DO_ST(st1s_le, uint32_t, stl_le_p, cpu_stl_le_data_ra)
DO_ST(st1d_be, uint64_t, stq_be_p, cpu_stq_be_data_ra)
DO_ST(st1d_le, uint64_t, stq_le_p, cpu_stq_le_data_ra)

/*
 * 128-bit stores: these define both the horizontal sve_st1qq_* primitives
 * and the vertical sme_st1q_*_v_* wrappers around them.
 */
DO_STQ(sve_st1qq_be, sme_st1q_be, 1, stq_be_p, cpu_stq_be_data_ra)
DO_STQ(sve_st1qq_le, sme_st1q_le, 0, stq_le_p, cpu_stq_le_data_ra)
    455 
    456 #undef DO_LD
    457 #undef DO_ST
    458 #undef DO_LDQ
    459 #undef DO_STQ
    460 
/*
 * Common helper for all contiguous predicated loads.
 *
 * @env:      CPU state, passed through to the page/watchpoint/MTE
 *            checks and to @tlb_fn.
 * @za:       base of the destination tile slice.
 * @vg:       governing predicate, read as 64-bit words with one bit
 *            per byte offset (bit reg_off of the predicate decides the
 *            element at byte offset reg_off).
 * @addr:     guest base address of the load.
 * @desc:     simd descriptor; simd_oprsz(desc) is the slice length
 *            in bytes.
 * @ra:       host return address forwarded to the fault-capable calls.
 * @esz:      log2 of the element size in bytes.
 * @mtedesc:  when non-zero, MTE checks are performed for the active
 *            elements.
 * @vertical: true for a vertical slice; selects per-element clearing
 *            of inactive elements instead of one up-front memset.
 * @host_fn:  loads one element from a host pointer into the slice.
 * @tlb_fn:   loads one element from a guest address into the slice.
 * @clr_fn:   zeroes a byte range of the slice.
 * @cpy_fn:   copies a contiguous buffer into the slice.
 *
 * All page probes, watchpoints and MTE checks are done before any byte
 * of the destination is modified.
 */

static inline QEMU_ALWAYS_INLINE
void sme_ld1(CPUARMState *env, void *za, uint64_t *vg,
             const target_ulong addr, uint32_t desc, const uintptr_t ra,
             const int esz, uint32_t mtedesc, bool vertical,
             sve_ldst1_host_fn *host_fn,
             sve_ldst1_tlb_fn *tlb_fn,
             ClearFn *clr_fn,
             CopyFn *cpy_fn)
{
    const intptr_t reg_max = simd_oprsz(desc);
    const intptr_t esize = 1 << esz;
    intptr_t reg_off, reg_last;
    SVEContLdSt info;
    void *host;
    int flags;

    /* Find the active elements.  */
    if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, esize)) {
        /* The entire predicate was false; no load occurs.  */
        clr_fn(za, 0, reg_max);
        return;
    }

    /* Probe the page(s).  Exit with exception for any invalid page. */
    sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, ra);

    /* Handle watchpoints for all active elements. */
    sve_cont_ldst_watchpoints(&info, env, vg, addr, esize, esize,
                              BP_MEM_READ, ra);

    /*
     * Handle mte checks for all active elements.
     * Since TBI must be set for MTE, !mtedesc => !mte_active.
     */
    if (mtedesc) {
        sve_cont_ldst_mte_check(&info, env, vg, addr, esize, esize,
                                mtedesc, ra);
    }

    flags = info.page[0].flags | info.page[1].flags;
    if (unlikely(flags != 0)) {
#ifdef CONFIG_USER_ONLY
        g_assert_not_reached();
#else
        /*
         * At least one page includes MMIO.
         * Any bus operation can fail with cpu_transaction_failed,
         * which for ARM will raise SyncExternal.  Perform the load
         * into scratch memory to preserve register state until the end.
         *
         * NOTE(review): scratch is a single ARMVectorReg, yet the
         * vertical callers in this file pass an sme_ld1*_v_tlb function
         * as tlb_fn, and those write at
         * za + off * sizeof(ARMVectorReg).  For reg_off > 0 that lands
         * outside scratch, and cpy_fn below reads scratch as a
         * contiguous buffer.  This looks wrong for vertical slices on
         * the MMIO path -- confirm and fix at the call sites.
         */
        ARMVectorReg scratch = { };

        /* Walk from the first active element to the last one overall. */
        reg_off = info.reg_off_first[0];
        reg_last = info.reg_off_last[1];
        if (reg_last < 0) {
            reg_last = info.reg_off_split;
            if (reg_last < 0) {
                reg_last = info.reg_off_last[0];
            }
        }

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    tlb_fn(env, &scratch, reg_off, addr + reg_off, ra);
                }
                reg_off += esize;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);

        /* Every load succeeded: commit the staged data to the slice. */
        cpy_fn(za, &scratch, reg_max);
        return;
#endif
    }

    /* The entire operation is in RAM, on valid pages. */

    reg_off = info.reg_off_first[0];
    reg_last = info.reg_off_last[0];
    host = info.page[0].host;

    /*
     * Horizontal: zero the whole slice up front, then overwrite the
     * active elements.  Vertical: zero only the leading inactive part
     * here; inactive elements inside the loops are cleared one by one.
     */
    if (!vertical) {
        memset(za, 0, reg_max);
    } else if (reg_off) {
        clr_fn(za, 0, reg_off);
    }

    /* Elements that lie wholly within the first page. */
    while (reg_off <= reg_last) {
        uint64_t pg = vg[reg_off >> 6];
        do {
            if ((pg >> (reg_off & 63)) & 1) {
                host_fn(za, reg_off, host + reg_off);
            } else if (vertical) {
                clr_fn(za, reg_off, esize);
            }
            reg_off += esize;
        } while (reg_off <= reg_last && (reg_off & 63));
    }

    /*
     * Use the slow path to manage the cross-page misalignment.
     * But we know this is RAM and cannot trap.
     */
    reg_off = info.reg_off_split;
    if (unlikely(reg_off >= 0)) {
        tlb_fn(env, za, reg_off, addr + reg_off, ra);
    }

    /* Elements that lie wholly within the second page, if any. */
    reg_off = info.reg_off_first[1];
    if (unlikely(reg_off >= 0)) {
        reg_last = info.reg_off_last[1];
        host = info.page[1].host;

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    host_fn(za, reg_off, host + reg_off);
                } else if (vertical) {
                    clr_fn(za, reg_off, esize);
                }
                reg_off += esize;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);
    }
}
    592 
    593 static inline QEMU_ALWAYS_INLINE
    594 void sme_ld1_mte(CPUARMState *env, void *za, uint64_t *vg,
    595                  target_ulong addr, uint32_t desc, uintptr_t ra,
    596                  const int esz, bool vertical,
    597                  sve_ldst1_host_fn *host_fn,
    598                  sve_ldst1_tlb_fn *tlb_fn,
    599                  ClearFn *clr_fn,
    600                  CopyFn *cpy_fn)
    601 {
    602     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
    603     int bit55 = extract64(addr, 55, 1);
    604 
    605     /* Remove mtedesc from the normal sve descriptor. */
    606     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
    607 
    608     /* Perform gross MTE suppression early. */
    609     if (!tbi_check(desc, bit55) ||
    610         tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
    611         mtedesc = 0;
    612     }
    613 
    614     sme_ld1(env, za, vg, addr, desc, ra, esz, mtedesc, vertical,
    615             host_fn, tlb_fn, clr_fn, cpy_fn);
    616 }
    617 
/*
 * Emit the four TCG helper entry points for one element size (L) and
 * endianness (END): horizontal and vertical, each with and without MTE.
 *
 * Horizontal variants use the sve_ld1<L><L><END> host/tlb primitives
 * with clear_horizontal/copy_horizontal; vertical variants use the
 * sme_ld1<L><END>_v primitives with the matching clear_vertical_<L> and
 * copy_vertical_<L>.  The non-MTE variants pass mtedesc == 0.
 * GETPC() is evaluated in the helper itself and forwarded as the
 * return address.
 */
#define DO_LD(L, END, ESZ)                                                 \
void HELPER(sme_ld1##L##END##_h)(CPUARMState *env, void *za, void *vg,     \
                                 target_ulong addr, uint32_t desc)         \
{                                                                          \
    sme_ld1(env, za, vg, addr, desc, GETPC(), ESZ, 0, false,               \
            sve_ld1##L##L##END##_host, sve_ld1##L##L##END##_tlb,           \
            clear_horizontal, copy_horizontal);                            \
}                                                                          \
void HELPER(sme_ld1##L##END##_v)(CPUARMState *env, void *za, void *vg,     \
                                 target_ulong addr, uint32_t desc)         \
{                                                                          \
    sme_ld1(env, za, vg, addr, desc, GETPC(), ESZ, 0, true,                \
            sme_ld1##L##END##_v_host, sme_ld1##L##END##_v_tlb,             \
            clear_vertical_##L, copy_vertical_##L);                        \
}                                                                          \
void HELPER(sme_ld1##L##END##_h_mte)(CPUARMState *env, void *za, void *vg, \
                                     target_ulong addr, uint32_t desc)     \
{                                                                          \
    sme_ld1_mte(env, za, vg, addr, desc, GETPC(), ESZ, false,              \
                sve_ld1##L##L##END##_host, sve_ld1##L##L##END##_tlb,       \
                clear_horizontal, copy_horizontal);                        \
}                                                                          \
void HELPER(sme_ld1##L##END##_v_mte)(CPUARMState *env, void *za, void *vg, \
                                     target_ulong addr, uint32_t desc)     \
{                                                                          \
    sme_ld1_mte(env, za, vg, addr, desc, GETPC(), ESZ, true,               \
                sme_ld1##L##END##_v_host, sme_ld1##L##END##_v_tlb,         \
                clear_vertical_##L, copy_vertical_##L);                    \
}

/* One instantiation per element size and, above bytes, per endianness. */
DO_LD(b, , MO_8)
DO_LD(h, _be, MO_16)
DO_LD(h, _le, MO_16)
DO_LD(s, _be, MO_32)
DO_LD(s, _le, MO_32)
DO_LD(d, _be, MO_64)
DO_LD(d, _le, MO_64)
DO_LD(q, _be, MO_128)
DO_LD(q, _le, MO_128)
    657 
    658 #undef DO_LD
    659 
    660 /*
    661  * Common helper for all contiguous predicated stores.
    662  */
    663 
    664 static inline QEMU_ALWAYS_INLINE
    665 void sme_st1(CPUARMState *env, void *za, uint64_t *vg,
    666              const target_ulong addr, uint32_t desc, const uintptr_t ra,
    667              const int esz, uint32_t mtedesc, bool vertical,
    668              sve_ldst1_host_fn *host_fn,
    669              sve_ldst1_tlb_fn *tlb_fn)
    670 {
    671     const intptr_t reg_max = simd_oprsz(desc);
    672     const intptr_t esize = 1 << esz;
    673     intptr_t reg_off, reg_last;
    674     SVEContLdSt info;
    675     void *host;
    676     int flags;
    677 
    678     /* Find the active elements.  */
    679     if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, esize)) {
    680         /* The entire predicate was false; no store occurs.  */
    681         return;
    682     }
    683 
    684     /* Probe the page(s).  Exit with exception for any invalid page. */
    685     sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, ra);
    686 
    687     /* Handle watchpoints for all active elements. */
    688     sve_cont_ldst_watchpoints(&info, env, vg, addr, esize, esize,
    689                               BP_MEM_WRITE, ra);
    690 
    691     /*
    692      * Handle mte checks for all active elements.
    693      * Since TBI must be set for MTE, !mtedesc => !mte_active.
    694      */
    695     if (mtedesc) {
    696         sve_cont_ldst_mte_check(&info, env, vg, addr, esize, esize,
    697                                 mtedesc, ra);
    698     }
    699 
    700     flags = info.page[0].flags | info.page[1].flags;
    701     if (unlikely(flags != 0)) {
    702 #ifdef CONFIG_USER_ONLY
    703         g_assert_not_reached();
    704 #else
    705         /*
    706          * At least one page includes MMIO.
    707          * Any bus operation can fail with cpu_transaction_failed,
    708          * which for ARM will raise SyncExternal.  We cannot avoid
    709          * this fault and will leave with the store incomplete.
    710          */
    711         reg_off = info.reg_off_first[0];
    712         reg_last = info.reg_off_last[1];
    713         if (reg_last < 0) {
    714             reg_last = info.reg_off_split;
    715             if (reg_last < 0) {
    716                 reg_last = info.reg_off_last[0];
    717             }
    718         }
    719 
    720         do {
    721             uint64_t pg = vg[reg_off >> 6];
    722             do {
    723                 if ((pg >> (reg_off & 63)) & 1) {
    724                     tlb_fn(env, za, reg_off, addr + reg_off, ra);
    725                 }
    726                 reg_off += esize;
    727             } while (reg_off & 63);
    728         } while (reg_off <= reg_last);
    729         return;
    730 #endif
    731     }
    732 
    733     reg_off = info.reg_off_first[0];
    734     reg_last = info.reg_off_last[0];
    735     host = info.page[0].host;
    736 
    737     while (reg_off <= reg_last) {
    738         uint64_t pg = vg[reg_off >> 6];
    739         do {
    740             if ((pg >> (reg_off & 63)) & 1) {
    741                 host_fn(za, reg_off, host + reg_off);
    742             }
    743             reg_off += 1 << esz;
    744         } while (reg_off <= reg_last && (reg_off & 63));
    745     }
    746 
    747     /*
    748      * Use the slow path to manage the cross-page misalignment.
    749      * But we know this is RAM and cannot trap.
    750      */
    751     reg_off = info.reg_off_split;
    752     if (unlikely(reg_off >= 0)) {
    753         tlb_fn(env, za, reg_off, addr + reg_off, ra);
    754     }
    755 
    756     reg_off = info.reg_off_first[1];
    757     if (unlikely(reg_off >= 0)) {
    758         reg_last = info.reg_off_last[1];
    759         host = info.page[1].host;
    760 
    761         do {
    762             uint64_t pg = vg[reg_off >> 6];
    763             do {
    764                 if ((pg >> (reg_off & 63)) & 1) {
    765                     host_fn(za, reg_off, host + reg_off);
    766                 }
    767                 reg_off += 1 << esz;
    768             } while (reg_off & 63);
    769         } while (reg_off <= reg_last);
    770     }
    771 }
    772 
    773 static inline QEMU_ALWAYS_INLINE
    774 void sme_st1_mte(CPUARMState *env, void *za, uint64_t *vg, target_ulong addr,
    775                  uint32_t desc, uintptr_t ra, int esz, bool vertical,
    776                  sve_ldst1_host_fn *host_fn,
    777                  sve_ldst1_tlb_fn *tlb_fn)
    778 {
    779     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
    780     int bit55 = extract64(addr, 55, 1);
    781 
    782     /* Remove mtedesc from the normal sve descriptor. */
    783     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
    784 
    785     /* Perform gross MTE suppression early. */
    786     if (!tbi_check(desc, bit55) ||
    787         tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
    788         mtedesc = 0;
    789     }
    790 
    791     sme_st1(env, za, vg, addr, desc, ra, esz, mtedesc,
    792             vertical, host_fn, tlb_fn);
    793 }
    794 
    795 #define DO_ST(L, END, ESZ)                                                 \
    796 void HELPER(sme_st1##L##END##_h)(CPUARMState *env, void *za, void *vg,     \
    797                                  target_ulong addr, uint32_t desc)         \
    798 {                                                                          \
    799     sme_st1(env, za, vg, addr, desc, GETPC(), ESZ, 0, false,               \
    800             sve_st1##L##L##END##_host, sve_st1##L##L##END##_tlb);          \
    801 }                                                                          \
    802 void HELPER(sme_st1##L##END##_v)(CPUARMState *env, void *za, void *vg,     \
    803                                  target_ulong addr, uint32_t desc)         \
    804 {                                                                          \
    805     sme_st1(env, za, vg, addr, desc, GETPC(), ESZ, 0, true,                \
    806             sme_st1##L##END##_v_host, sme_st1##L##END##_v_tlb);            \
    807 }                                                                          \
    808 void HELPER(sme_st1##L##END##_h_mte)(CPUARMState *env, void *za, void *vg, \
    809                                      target_ulong addr, uint32_t desc)     \
    810 {                                                                          \
    811     sme_st1_mte(env, za, vg, addr, desc, GETPC(), ESZ, false,              \
    812                 sve_st1##L##L##END##_host, sve_st1##L##L##END##_tlb);      \
    813 }                                                                          \
    814 void HELPER(sme_st1##L##END##_v_mte)(CPUARMState *env, void *za, void *vg, \
    815                                      target_ulong addr, uint32_t desc)     \
    816 {                                                                          \
    817     sme_st1_mte(env, za, vg, addr, desc, GETPC(), ESZ, true,               \
    818                 sme_st1##L##END##_v_host, sme_st1##L##END##_v_tlb);        \
    819 }
    820 
    821 DO_ST(b, , MO_8)
    822 DO_ST(h, _be, MO_16)
    823 DO_ST(h, _le, MO_16)
    824 DO_ST(s, _be, MO_32)
    825 DO_ST(s, _le, MO_32)
    826 DO_ST(d, _be, MO_64)
    827 DO_ST(d, _le, MO_64)
    828 DO_ST(q, _be, MO_128)
    829 DO_ST(q, _le, MO_128)
    830 
    831 #undef DO_ST
    832 
    833 void HELPER(sme_addha_s)(void *vzda, void *vzn, void *vpn,
    834                          void *vpm, uint32_t desc)
    835 {
    836     intptr_t row, col, oprsz = simd_oprsz(desc) / 4;
    837     uint64_t *pn = vpn, *pm = vpm;
    838     uint32_t *zda = vzda, *zn = vzn;
    839 
    840     for (row = 0; row < oprsz; ) {
    841         uint64_t pa = pn[row >> 4];
    842         do {
    843             if (pa & 1) {
    844                 for (col = 0; col < oprsz; ) {
    845                     uint64_t pb = pm[col >> 4];
    846                     do {
    847                         if (pb & 1) {
    848                             zda[tile_vslice_index(row) + H4(col)] += zn[H4(col)];
    849                         }
    850                         pb >>= 4;
    851                     } while (++col & 15);
    852                 }
    853             }
    854             pa >>= 4;
    855         } while (++row & 15);
    856     }
    857 }
    858 
    859 void HELPER(sme_addha_d)(void *vzda, void *vzn, void *vpn,
    860                          void *vpm, uint32_t desc)
    861 {
    862     intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
    863     uint8_t *pn = vpn, *pm = vpm;
    864     uint64_t *zda = vzda, *zn = vzn;
    865 
    866     for (row = 0; row < oprsz; ++row) {
    867         if (pn[H1(row)] & 1) {
    868             for (col = 0; col < oprsz; ++col) {
    869                 if (pm[H1(col)] & 1) {
    870                     zda[tile_vslice_index(row) + col] += zn[col];
    871                 }
    872             }
    873         }
    874     }
    875 }
    876 
    877 void HELPER(sme_addva_s)(void *vzda, void *vzn, void *vpn,
    878                          void *vpm, uint32_t desc)
    879 {
    880     intptr_t row, col, oprsz = simd_oprsz(desc) / 4;
    881     uint64_t *pn = vpn, *pm = vpm;
    882     uint32_t *zda = vzda, *zn = vzn;
    883 
    884     for (row = 0; row < oprsz; ) {
    885         uint64_t pa = pn[row >> 4];
    886         do {
    887             if (pa & 1) {
    888                 uint32_t zn_row = zn[H4(row)];
    889                 for (col = 0; col < oprsz; ) {
    890                     uint64_t pb = pm[col >> 4];
    891                     do {
    892                         if (pb & 1) {
    893                             zda[tile_vslice_index(row) + H4(col)] += zn_row;
    894                         }
    895                         pb >>= 4;
    896                     } while (++col & 15);
    897                 }
    898             }
    899             pa >>= 4;
    900         } while (++row & 15);
    901     }
    902 }
    903 
    904 void HELPER(sme_addva_d)(void *vzda, void *vzn, void *vpn,
    905                          void *vpm, uint32_t desc)
    906 {
    907     intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
    908     uint8_t *pn = vpn, *pm = vpm;
    909     uint64_t *zda = vzda, *zn = vzn;
    910 
    911     for (row = 0; row < oprsz; ++row) {
    912         if (pn[H1(row)] & 1) {
    913             uint64_t zn_row = zn[row];
    914             for (col = 0; col < oprsz; ++col) {
    915                 if (pm[H1(col)] & 1) {
    916                     zda[tile_vslice_index(row) + col] += zn_row;
    917                 }
    918             }
    919         }
    920     }
    921 }
    922 
    923 void HELPER(sme_fmopa_s)(void *vza, void *vzn, void *vzm, void *vpn,
    924                          void *vpm, void *vst, uint32_t desc)
    925 {
    926     intptr_t row, col, oprsz = simd_maxsz(desc);
    927     uint32_t neg = simd_data(desc) << 31;
    928     uint16_t *pn = vpn, *pm = vpm;
    929     float_status fpst;
    930 
    931     /*
    932      * Make a copy of float_status because this operation does not
    933      * update the cumulative fp exception status.  It also produces
    934      * default nans.
    935      */
    936     fpst = *(float_status *)vst;
    937     set_default_nan_mode(true, &fpst);
    938 
    939     for (row = 0; row < oprsz; ) {
    940         uint16_t pa = pn[H2(row >> 4)];
    941         do {
    942             if (pa & 1) {
    943                 void *vza_row = vza + tile_vslice_offset(row);
    944                 uint32_t n = *(uint32_t *)(vzn + H1_4(row)) ^ neg;
    945 
    946                 for (col = 0; col < oprsz; ) {
    947                     uint16_t pb = pm[H2(col >> 4)];
    948                     do {
    949                         if (pb & 1) {
    950                             uint32_t *a = vza_row + H1_4(col);
    951                             uint32_t *m = vzm + H1_4(col);
    952                             *a = float32_muladd(n, *m, *a, 0, vst);
    953                         }
    954                         col += 4;
    955                         pb >>= 4;
    956                     } while (col & 15);
    957                 }
    958             }
    959             row += 4;
    960             pa >>= 4;
    961         } while (row & 15);
    962     }
    963 }
    964 
    965 void HELPER(sme_fmopa_d)(void *vza, void *vzn, void *vzm, void *vpn,
    966                          void *vpm, void *vst, uint32_t desc)
    967 {
    968     intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
    969     uint64_t neg = (uint64_t)simd_data(desc) << 63;
    970     uint64_t *za = vza, *zn = vzn, *zm = vzm;
    971     uint8_t *pn = vpn, *pm = vpm;
    972     float_status fpst = *(float_status *)vst;
    973 
    974     set_default_nan_mode(true, &fpst);
    975 
    976     for (row = 0; row < oprsz; ++row) {
    977         if (pn[H1(row)] & 1) {
    978             uint64_t *za_row = &za[tile_vslice_index(row)];
    979             uint64_t n = zn[row] ^ neg;
    980 
    981             for (col = 0; col < oprsz; ++col) {
    982                 if (pm[H1(col)] & 1) {
    983                     uint64_t *a = &za_row[col];
    984                     *a = float64_muladd(n, zm[col], *a, 0, &fpst);
    985                 }
    986             }
    987         }
    988     }
    989 }
    990 
    991 /*
    992  * Alter PAIR as needed for controlling predicates being false,
    993  * and for NEG on an enabled row element.
    994  */
    995 static inline uint32_t f16mop_adj_pair(uint32_t pair, uint32_t pg, uint32_t neg)
    996 {
    997     /*
    998      * The pseudocode uses a conditional negate after the conditional zero.
    999      * It is simpler here to unconditionally negate before conditional zero.
   1000      */
   1001     pair ^= neg;
   1002     if (!(pg & 1)) {
   1003         pair &= 0xffff0000u;
   1004     }
   1005     if (!(pg & 4)) {
   1006         pair &= 0x0000ffffu;
   1007     }
   1008     return pair;
   1009 }
   1010 
   1011 static float32 f16_dotadd(float32 sum, uint32_t e1, uint32_t e2,
   1012                           float_status *s_std, float_status *s_odd)
   1013 {
   1014     float64 e1r = float16_to_float64(e1 & 0xffff, true, s_std);
   1015     float64 e1c = float16_to_float64(e1 >> 16, true, s_std);
   1016     float64 e2r = float16_to_float64(e2 & 0xffff, true, s_std);
   1017     float64 e2c = float16_to_float64(e2 >> 16, true, s_std);
   1018     float64 t64;
   1019     float32 t32;
   1020 
   1021     /*
   1022      * The ARM pseudocode function FPDot performs both multiplies
   1023      * and the add with a single rounding operation.  Emulate this
   1024      * by performing the first multiply in round-to-odd, then doing
   1025      * the second multiply as fused multiply-add, and rounding to
   1026      * float32 all in one step.
   1027      */
   1028     t64 = float64_mul(e1r, e2r, s_odd);
   1029     t64 = float64r32_muladd(e1c, e2c, t64, 0, s_std);
   1030 
   1031     /* This conversion is exact, because we've already rounded. */
   1032     t32 = float64_to_float32(t64, s_std);
   1033 
   1034     /* The final accumulation step is not fused. */
   1035     return float32_add(sum, t32, s_std);
   1036 }
   1037 
   1038 void HELPER(sme_fmopa_h)(void *vza, void *vzn, void *vzm, void *vpn,
   1039                          void *vpm, void *vst, uint32_t desc)
   1040 {
   1041     intptr_t row, col, oprsz = simd_maxsz(desc);
   1042     uint32_t neg = simd_data(desc) * 0x80008000u;
   1043     uint16_t *pn = vpn, *pm = vpm;
   1044     float_status fpst_odd, fpst_std;
   1045 
   1046     /*
   1047      * Make a copy of float_status because this operation does not
   1048      * update the cumulative fp exception status.  It also produces
   1049      * default nans.  Make a second copy with round-to-odd -- see above.
   1050      */
   1051     fpst_std = *(float_status *)vst;
   1052     set_default_nan_mode(true, &fpst_std);
   1053     fpst_odd = fpst_std;
   1054     set_float_rounding_mode(float_round_to_odd, &fpst_odd);
   1055 
   1056     for (row = 0; row < oprsz; ) {
   1057         uint16_t prow = pn[H2(row >> 4)];
   1058         do {
   1059             void *vza_row = vza + tile_vslice_offset(row);
   1060             uint32_t n = *(uint32_t *)(vzn + H1_4(row));
   1061 
   1062             n = f16mop_adj_pair(n, prow, neg);
   1063 
   1064             for (col = 0; col < oprsz; ) {
   1065                 uint16_t pcol = pm[H2(col >> 4)];
   1066                 do {
   1067                     if (prow & pcol & 0b0101) {
   1068                         uint32_t *a = vza_row + H1_4(col);
   1069                         uint32_t m = *(uint32_t *)(vzm + H1_4(col));
   1070 
   1071                         m = f16mop_adj_pair(m, pcol, 0);
   1072                         *a = f16_dotadd(*a, n, m, &fpst_std, &fpst_odd);
   1073 
   1074                         col += 4;
   1075                         pcol >>= 4;
   1076                     }
   1077                 } while (col & 15);
   1078             }
   1079             row += 4;
   1080             prow >>= 4;
   1081         } while (row & 15);
   1082     }
   1083 }
   1084 
   1085 void HELPER(sme_bfmopa)(void *vza, void *vzn, void *vzm, void *vpn,
   1086                         void *vpm, uint32_t desc)
   1087 {
   1088     intptr_t row, col, oprsz = simd_maxsz(desc);
   1089     uint32_t neg = simd_data(desc) * 0x80008000u;
   1090     uint16_t *pn = vpn, *pm = vpm;
   1091 
   1092     for (row = 0; row < oprsz; ) {
   1093         uint16_t prow = pn[H2(row >> 4)];
   1094         do {
   1095             void *vza_row = vza + tile_vslice_offset(row);
   1096             uint32_t n = *(uint32_t *)(vzn + H1_4(row));
   1097 
   1098             n = f16mop_adj_pair(n, prow, neg);
   1099 
   1100             for (col = 0; col < oprsz; ) {
   1101                 uint16_t pcol = pm[H2(col >> 4)];
   1102                 do {
   1103                     if (prow & pcol & 0b0101) {
   1104                         uint32_t *a = vza_row + H1_4(col);
   1105                         uint32_t m = *(uint32_t *)(vzm + H1_4(col));
   1106 
   1107                         m = f16mop_adj_pair(m, pcol, 0);
   1108                         *a = bfdotadd(*a, n, m);
   1109 
   1110                         col += 4;
   1111                         pcol >>= 4;
   1112                     }
   1113                 } while (col & 15);
   1114             }
   1115             row += 4;
   1116             prow >>= 4;
   1117         } while (row & 15);
   1118     }
   1119 }
   1120 
   1121 typedef uint64_t IMOPFn(uint64_t, uint64_t, uint64_t, uint8_t, bool);
   1122 
   1123 static inline void do_imopa(uint64_t *za, uint64_t *zn, uint64_t *zm,
   1124                             uint8_t *pn, uint8_t *pm,
   1125                             uint32_t desc, IMOPFn *fn)
   1126 {
   1127     intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
   1128     bool neg = simd_data(desc);
   1129 
   1130     for (row = 0; row < oprsz; ++row) {
   1131         uint8_t pa = pn[H1(row)];
   1132         uint64_t *za_row = &za[tile_vslice_index(row)];
   1133         uint64_t n = zn[row];
   1134 
   1135         for (col = 0; col < oprsz; ++col) {
   1136             uint8_t pb = pm[H1(col)];
   1137             uint64_t *a = &za_row[col];
   1138 
   1139             *a = fn(n, zm[col], *a, pa & pb, neg);
   1140         }
   1141     }
   1142 }
   1143 
   1144 #define DEF_IMOP_32(NAME, NTYPE, MTYPE) \
   1145 static uint64_t NAME(uint64_t n, uint64_t m, uint64_t a, uint8_t p, bool neg) \
   1146 {                                                                           \
   1147     uint32_t sum0 = 0, sum1 = 0;                                            \
   1148     /* Apply P to N as a mask, making the inactive elements 0. */           \
   1149     n &= expand_pred_b(p);                                                  \
   1150     sum0 += (NTYPE)(n >> 0) * (MTYPE)(m >> 0);                              \
   1151     sum0 += (NTYPE)(n >> 8) * (MTYPE)(m >> 8);                              \
   1152     sum0 += (NTYPE)(n >> 16) * (MTYPE)(m >> 16);                            \
   1153     sum0 += (NTYPE)(n >> 24) * (MTYPE)(m >> 24);                            \
   1154     sum1 += (NTYPE)(n >> 32) * (MTYPE)(m >> 32);                            \
   1155     sum1 += (NTYPE)(n >> 40) * (MTYPE)(m >> 40);                            \
   1156     sum1 += (NTYPE)(n >> 48) * (MTYPE)(m >> 48);                            \
   1157     sum1 += (NTYPE)(n >> 56) * (MTYPE)(m >> 56);                            \
   1158     if (neg) {                                                              \
   1159         sum0 = (uint32_t)a - sum0, sum1 = (uint32_t)(a >> 32) - sum1;       \
   1160     } else {                                                                \
   1161         sum0 = (uint32_t)a + sum0, sum1 = (uint32_t)(a >> 32) + sum1;       \
   1162     }                                                                       \
   1163     return ((uint64_t)sum1 << 32) | sum0;                                   \
   1164 }
   1165 
   1166 #define DEF_IMOP_64(NAME, NTYPE, MTYPE) \
   1167 static uint64_t NAME(uint64_t n, uint64_t m, uint64_t a, uint8_t p, bool neg) \
   1168 {                                                                           \
   1169     uint64_t sum = 0;                                                       \
   1170     /* Apply P to N as a mask, making the inactive elements 0. */           \
   1171     n &= expand_pred_h(p);                                                  \
   1172     sum += (NTYPE)(n >> 0) * (MTYPE)(m >> 0);                               \
   1173     sum += (NTYPE)(n >> 16) * (MTYPE)(m >> 16);                             \
   1174     sum += (NTYPE)(n >> 32) * (MTYPE)(m >> 32);                             \
   1175     sum += (NTYPE)(n >> 48) * (MTYPE)(m >> 48);                             \
   1176     return neg ? a - sum : a + sum;                                         \
   1177 }
   1178 
   1179 DEF_IMOP_32(smopa_s, int8_t, int8_t)
   1180 DEF_IMOP_32(umopa_s, uint8_t, uint8_t)
   1181 DEF_IMOP_32(sumopa_s, int8_t, uint8_t)
   1182 DEF_IMOP_32(usmopa_s, uint8_t, int8_t)
   1183 
   1184 DEF_IMOP_64(smopa_d, int16_t, int16_t)
   1185 DEF_IMOP_64(umopa_d, uint16_t, uint16_t)
   1186 DEF_IMOP_64(sumopa_d, int16_t, uint16_t)
   1187 DEF_IMOP_64(usmopa_d, uint16_t, int16_t)
   1188 
   1189 #define DEF_IMOPH(NAME) \
   1190     void HELPER(sme_##NAME)(void *vza, void *vzn, void *vzm, void *vpn,      \
   1191                             void *vpm, uint32_t desc)                        \
   1192     { do_imopa(vza, vzn, vzm, vpn, vpm, desc, NAME); }
   1193 
   1194 DEF_IMOPH(smopa_s)
   1195 DEF_IMOPH(umopa_s)
   1196 DEF_IMOPH(sumopa_s)
   1197 DEF_IMOPH(usmopa_s)
   1198 DEF_IMOPH(smopa_d)
   1199 DEF_IMOPH(umopa_d)
   1200 DEF_IMOPH(sumopa_d)
   1201 DEF_IMOPH(usmopa_d)