qemu

FORK: QEMU emulator
git clone https://git.neptards.moe/neptards/qemu.git
Log | Files | Refs | Submodules | LICENSE

neon_helper.c (45000B)


      1 /*
      2  * ARM NEON vector operations.
      3  *
      4  * Copyright (c) 2007, 2008 CodeSourcery.
      5  * Written by Paul Brook
      6  *
      7  * This code is licensed under the GNU GPL v2.
      8  */
      9 #include "qemu/osdep.h"
     10 
     11 #include "cpu.h"
     12 #include "exec/helper-proto.h"
     13 #include "fpu/softfloat.h"
     14 #include "vec_internal.h"
     15 
     16 #define SIGNBIT (uint32_t)0x80000000
     17 #define SIGNBIT64 ((uint64_t)1 << 63)
     18 
     19 #define SET_QC() env->vfp.qc[0] = 1
     20 
     21 #define NEON_TYPE1(name, type) \
     22 typedef struct \
     23 { \
     24     type v1; \
     25 } neon_##name;
     26 #if HOST_BIG_ENDIAN
     27 #define NEON_TYPE2(name, type) \
     28 typedef struct \
     29 { \
     30     type v2; \
     31     type v1; \
     32 } neon_##name;
     33 #define NEON_TYPE4(name, type) \
     34 typedef struct \
     35 { \
     36     type v4; \
     37     type v3; \
     38     type v2; \
     39     type v1; \
     40 } neon_##name;
     41 #else
     42 #define NEON_TYPE2(name, type) \
     43 typedef struct \
     44 { \
     45     type v1; \
     46     type v2; \
     47 } neon_##name;
     48 #define NEON_TYPE4(name, type) \
     49 typedef struct \
     50 { \
     51     type v1; \
     52     type v2; \
     53     type v3; \
     54     type v4; \
     55 } neon_##name;
     56 #endif
     57 
     58 NEON_TYPE4(s8, int8_t)
     59 NEON_TYPE4(u8, uint8_t)
     60 NEON_TYPE2(s16, int16_t)
     61 NEON_TYPE2(u16, uint16_t)
     62 NEON_TYPE1(s32, int32_t)
     63 NEON_TYPE1(u32, uint32_t)
     64 #undef NEON_TYPE4
     65 #undef NEON_TYPE2
     66 #undef NEON_TYPE1
     67 
     68 /* Copy from a uint32_t to a vector structure type.  */
     69 #define NEON_UNPACK(vtype, dest, val) do { \
     70     union { \
     71         vtype v; \
     72         uint32_t i; \
     73     } conv_u; \
     74     conv_u.i = (val); \
     75     dest = conv_u.v; \
     76     } while(0)
     77 
     78 /* Copy from a vector structure type to a uint32_t.  */
     79 #define NEON_PACK(vtype, dest, val) do { \
     80     union { \
     81         vtype v; \
     82         uint32_t i; \
     83     } conv_u; \
     84     conv_u.v = (val); \
     85     dest = conv_u.i; \
     86     } while(0)
     87 
     88 #define NEON_DO1 \
     89     NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
     90 #define NEON_DO2 \
     91     NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
     92     NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
     93 #define NEON_DO4 \
     94     NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
     95     NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \
     96     NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
     97     NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);
     98 
     99 #define NEON_VOP_BODY(vtype, n) \
    100 { \
    101     uint32_t res; \
    102     vtype vsrc1; \
    103     vtype vsrc2; \
    104     vtype vdest; \
    105     NEON_UNPACK(vtype, vsrc1, arg1); \
    106     NEON_UNPACK(vtype, vsrc2, arg2); \
    107     NEON_DO##n; \
    108     NEON_PACK(vtype, res, vdest); \
    109     return res; \
    110 }
    111 
    112 #define NEON_VOP(name, vtype, n) \
    113 uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
    114 NEON_VOP_BODY(vtype, n)
    115 
    116 #define NEON_VOP_ENV(name, vtype, n) \
    117 uint32_t HELPER(glue(neon_,name))(CPUARMState *env, uint32_t arg1, uint32_t arg2) \
    118 NEON_VOP_BODY(vtype, n)
    119 
    120 /* Pairwise operations.  */
    121 /* For 32-bit elements each segment only contains a single element, so
    122    the elementwise and pairwise operations are the same.  */
    123 #define NEON_PDO2 \
    124     NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    125     NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);
    126 #define NEON_PDO4 \
    127     NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    128     NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \
    129     NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \
    130     NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4); \
    131 
    132 #define NEON_POP(name, vtype, n) \
    133 uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
    134 { \
    135     uint32_t res; \
    136     vtype vsrc1; \
    137     vtype vsrc2; \
    138     vtype vdest; \
    139     NEON_UNPACK(vtype, vsrc1, arg1); \
    140     NEON_UNPACK(vtype, vsrc2, arg2); \
    141     NEON_PDO##n; \
    142     NEON_PACK(vtype, res, vdest); \
    143     return res; \
    144 }
    145 
    146 /* Unary operators.  */
    147 #define NEON_VOP1(name, vtype, n) \
    148 uint32_t HELPER(glue(neon_,name))(uint32_t arg) \
    149 { \
    150     vtype vsrc1; \
    151     vtype vdest; \
    152     NEON_UNPACK(vtype, vsrc1, arg); \
    153     NEON_DO##n; \
    154     NEON_PACK(vtype, arg, vdest); \
    155     return arg; \
    156 }
    157 
    158 
    159 #define NEON_USAT(dest, src1, src2, type) do { \
    160     uint32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
    161     if (tmp != (type)tmp) { \
    162         SET_QC(); \
    163         dest = ~0; \
    164     } else { \
    165         dest = tmp; \
    166     }} while(0)
    167 #define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
    168 NEON_VOP_ENV(qadd_u8, neon_u8, 4)
    169 #undef NEON_FN
    170 #define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
    171 NEON_VOP_ENV(qadd_u16, neon_u16, 2)
    172 #undef NEON_FN
    173 #undef NEON_USAT
    174 
    175 uint32_t HELPER(neon_qadd_u32)(CPUARMState *env, uint32_t a, uint32_t b)
    176 {
    177     uint32_t res = a + b;
    178     if (res < a) {
    179         SET_QC();
    180         res = ~0;
    181     }
    182     return res;
    183 }
    184 
    185 uint64_t HELPER(neon_qadd_u64)(CPUARMState *env, uint64_t src1, uint64_t src2)
    186 {
    187     uint64_t res;
    188 
    189     res = src1 + src2;
    190     if (res < src1) {
    191         SET_QC();
    192         res = ~(uint64_t)0;
    193     }
    194     return res;
    195 }
    196 
    197 #define NEON_SSAT(dest, src1, src2, type) do { \
    198     int32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
    199     if (tmp != (type)tmp) { \
    200         SET_QC(); \
    201         if (src2 > 0) { \
    202             tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
    203         } else { \
    204             tmp = 1 << (sizeof(type) * 8 - 1); \
    205         } \
    206     } \
    207     dest = tmp; \
    208     } while(0)
    209 #define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
    210 NEON_VOP_ENV(qadd_s8, neon_s8, 4)
    211 #undef NEON_FN
    212 #define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
    213 NEON_VOP_ENV(qadd_s16, neon_s16, 2)
    214 #undef NEON_FN
    215 #undef NEON_SSAT
    216 
    217 uint32_t HELPER(neon_qadd_s32)(CPUARMState *env, uint32_t a, uint32_t b)
    218 {
    219     uint32_t res = a + b;
    220     if (((res ^ a) & SIGNBIT) && !((a ^ b) & SIGNBIT)) {
    221         SET_QC();
    222         res = ~(((int32_t)a >> 31) ^ SIGNBIT);
    223     }
    224     return res;
    225 }
    226 
    227 uint64_t HELPER(neon_qadd_s64)(CPUARMState *env, uint64_t src1, uint64_t src2)
    228 {
    229     uint64_t res;
    230 
    231     res = src1 + src2;
    232     if (((res ^ src1) & SIGNBIT64) && !((src1 ^ src2) & SIGNBIT64)) {
    233         SET_QC();
    234         res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64;
    235     }
    236     return res;
    237 }
    238 
    239 /* Unsigned saturating accumulate of signed value
    240  *
    241  * Op1/Rn is treated as signed
    242  * Op2/Rd is treated as unsigned
    243  *
    244  * Explicit casting is used to ensure the correct sign extension of
 * inputs. The result is treated as an unsigned value and saturated as such.
    246  *
    247  * We use a macro for the 8/16 bit cases which expects signed integers of va,
    248  * vb, and vr for interim calculation and an unsigned 32 bit result value r.
    249  */
    250 
    251 #define USATACC(bits, shift) \
    252     do { \
    253         va = sextract32(a, shift, bits);                                \
    254         vb = extract32(b, shift, bits);                                 \
    255         vr = va + vb;                                                   \
    256         if (vr > UINT##bits##_MAX) {                                    \
    257             SET_QC();                                                   \
    258             vr = UINT##bits##_MAX;                                      \
    259         } else if (vr < 0) {                                            \
    260             SET_QC();                                                   \
    261             vr = 0;                                                     \
    262         }                                                               \
    263         r = deposit32(r, shift, bits, vr);                              \
    264    } while (0)
    265 
    266 uint32_t HELPER(neon_uqadd_s8)(CPUARMState *env, uint32_t a, uint32_t b)
    267 {
    268     int16_t va, vb, vr;
    269     uint32_t r = 0;
    270 
    271     USATACC(8, 0);
    272     USATACC(8, 8);
    273     USATACC(8, 16);
    274     USATACC(8, 24);
    275     return r;
    276 }
    277 
    278 uint32_t HELPER(neon_uqadd_s16)(CPUARMState *env, uint32_t a, uint32_t b)
    279 {
    280     int32_t va, vb, vr;
    281     uint64_t r = 0;
    282 
    283     USATACC(16, 0);
    284     USATACC(16, 16);
    285     return r;
    286 }
    287 
    288 #undef USATACC
    289 
    290 uint32_t HELPER(neon_uqadd_s32)(CPUARMState *env, uint32_t a, uint32_t b)
    291 {
    292     int64_t va = (int32_t)a;
    293     int64_t vb = (uint32_t)b;
    294     int64_t vr = va + vb;
    295     if (vr > UINT32_MAX) {
    296         SET_QC();
    297         vr = UINT32_MAX;
    298     } else if (vr < 0) {
    299         SET_QC();
    300         vr = 0;
    301     }
    302     return vr;
    303 }
    304 
    305 uint64_t HELPER(neon_uqadd_s64)(CPUARMState *env, uint64_t a, uint64_t b)
    306 {
    307     uint64_t res;
    308     res = a + b;
    309     /* We only need to look at the pattern of SIGN bits to detect
    310      * +ve/-ve saturation
    311      */
    312     if (~a & b & ~res & SIGNBIT64) {
    313         SET_QC();
    314         res = UINT64_MAX;
    315     } else if (a & ~b & res & SIGNBIT64) {
    316         SET_QC();
    317         res = 0;
    318     }
    319     return res;
    320 }
    321 
    322 /* Signed saturating accumulate of unsigned value
    323  *
    324  * Op1/Rn is treated as unsigned
    325  * Op2/Rd is treated as signed
    326  *
    327  * The result is treated as a signed value and saturated as such
    328  *
    329  * We use a macro for the 8/16 bit cases which expects signed integers of va,
    330  * vb, and vr for interim calculation and an unsigned 32 bit result value r.
    331  */
    332 
    333 #define SSATACC(bits, shift) \
    334     do { \
    335         va = extract32(a, shift, bits);                                 \
    336         vb = sextract32(b, shift, bits);                                \
    337         vr = va + vb;                                                   \
    338         if (vr > INT##bits##_MAX) {                                     \
    339             SET_QC();                                                   \
    340             vr = INT##bits##_MAX;                                       \
    341         } else if (vr < INT##bits##_MIN) {                              \
    342             SET_QC();                                                   \
    343             vr = INT##bits##_MIN;                                       \
    344         }                                                               \
    345         r = deposit32(r, shift, bits, vr);                              \
    346     } while (0)
    347 
    348 uint32_t HELPER(neon_sqadd_u8)(CPUARMState *env, uint32_t a, uint32_t b)
    349 {
    350     int16_t va, vb, vr;
    351     uint32_t r = 0;
    352 
    353     SSATACC(8, 0);
    354     SSATACC(8, 8);
    355     SSATACC(8, 16);
    356     SSATACC(8, 24);
    357     return r;
    358 }
    359 
    360 uint32_t HELPER(neon_sqadd_u16)(CPUARMState *env, uint32_t a, uint32_t b)
    361 {
    362     int32_t va, vb, vr;
    363     uint32_t r = 0;
    364 
    365     SSATACC(16, 0);
    366     SSATACC(16, 16);
    367 
    368     return r;
    369 }
    370 
    371 #undef SSATACC
    372 
    373 uint32_t HELPER(neon_sqadd_u32)(CPUARMState *env, uint32_t a, uint32_t b)
    374 {
    375     int64_t res;
    376     int64_t op1 = (uint32_t)a;
    377     int64_t op2 = (int32_t)b;
    378     res = op1 + op2;
    379     if (res > INT32_MAX) {
    380         SET_QC();
    381         res = INT32_MAX;
    382     } else if (res < INT32_MIN) {
    383         SET_QC();
    384         res = INT32_MIN;
    385     }
    386     return res;
    387 }
    388 
    389 uint64_t HELPER(neon_sqadd_u64)(CPUARMState *env, uint64_t a, uint64_t b)
    390 {
    391     uint64_t res;
    392     res = a + b;
    393     /* We only need to look at the pattern of SIGN bits to detect an overflow */
    394     if (((a & res)
    395          | (~b & res)
    396          | (a & ~b)) & SIGNBIT64) {
    397         SET_QC();
    398         res = INT64_MAX;
    399     }
    400     return res;
    401 }
    402 
    403 
    404 #define NEON_USAT(dest, src1, src2, type) do { \
    405     uint32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
    406     if (tmp != (type)tmp) { \
    407         SET_QC(); \
    408         dest = 0; \
    409     } else { \
    410         dest = tmp; \
    411     }} while(0)
    412 #define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
    413 NEON_VOP_ENV(qsub_u8, neon_u8, 4)
    414 #undef NEON_FN
    415 #define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
    416 NEON_VOP_ENV(qsub_u16, neon_u16, 2)
    417 #undef NEON_FN
    418 #undef NEON_USAT
    419 
    420 uint32_t HELPER(neon_qsub_u32)(CPUARMState *env, uint32_t a, uint32_t b)
    421 {
    422     uint32_t res = a - b;
    423     if (res > a) {
    424         SET_QC();
    425         res = 0;
    426     }
    427     return res;
    428 }
    429 
    430 uint64_t HELPER(neon_qsub_u64)(CPUARMState *env, uint64_t src1, uint64_t src2)
    431 {
    432     uint64_t res;
    433 
    434     if (src1 < src2) {
    435         SET_QC();
    436         res = 0;
    437     } else {
    438         res = src1 - src2;
    439     }
    440     return res;
    441 }
    442 
    443 #define NEON_SSAT(dest, src1, src2, type) do { \
    444     int32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
    445     if (tmp != (type)tmp) { \
    446         SET_QC(); \
    447         if (src2 < 0) { \
    448             tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
    449         } else { \
    450             tmp = 1 << (sizeof(type) * 8 - 1); \
    451         } \
    452     } \
    453     dest = tmp; \
    454     } while(0)
    455 #define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
    456 NEON_VOP_ENV(qsub_s8, neon_s8, 4)
    457 #undef NEON_FN
    458 #define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
    459 NEON_VOP_ENV(qsub_s16, neon_s16, 2)
    460 #undef NEON_FN
    461 #undef NEON_SSAT
    462 
    463 uint32_t HELPER(neon_qsub_s32)(CPUARMState *env, uint32_t a, uint32_t b)
    464 {
    465     uint32_t res = a - b;
    466     if (((res ^ a) & SIGNBIT) && ((a ^ b) & SIGNBIT)) {
    467         SET_QC();
    468         res = ~(((int32_t)a >> 31) ^ SIGNBIT);
    469     }
    470     return res;
    471 }
    472 
    473 uint64_t HELPER(neon_qsub_s64)(CPUARMState *env, uint64_t src1, uint64_t src2)
    474 {
    475     uint64_t res;
    476 
    477     res = src1 - src2;
    478     if (((res ^ src1) & SIGNBIT64) && ((src1 ^ src2) & SIGNBIT64)) {
    479         SET_QC();
    480         res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64;
    481     }
    482     return res;
    483 }
    484 
    485 #define NEON_FN(dest, src1, src2) dest = (src1 + src2) >> 1
    486 NEON_VOP(hadd_s8, neon_s8, 4)
    487 NEON_VOP(hadd_u8, neon_u8, 4)
    488 NEON_VOP(hadd_s16, neon_s16, 2)
    489 NEON_VOP(hadd_u16, neon_u16, 2)
    490 #undef NEON_FN
    491 
    492 int32_t HELPER(neon_hadd_s32)(int32_t src1, int32_t src2)
    493 {
    494     int32_t dest;
    495 
    496     dest = (src1 >> 1) + (src2 >> 1);
    497     if (src1 & src2 & 1)
    498         dest++;
    499     return dest;
    500 }
    501 
    502 uint32_t HELPER(neon_hadd_u32)(uint32_t src1, uint32_t src2)
    503 {
    504     uint32_t dest;
    505 
    506     dest = (src1 >> 1) + (src2 >> 1);
    507     if (src1 & src2 & 1)
    508         dest++;
    509     return dest;
    510 }
    511 
    512 #define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1
    513 NEON_VOP(rhadd_s8, neon_s8, 4)
    514 NEON_VOP(rhadd_u8, neon_u8, 4)
    515 NEON_VOP(rhadd_s16, neon_s16, 2)
    516 NEON_VOP(rhadd_u16, neon_u16, 2)
    517 #undef NEON_FN
    518 
    519 int32_t HELPER(neon_rhadd_s32)(int32_t src1, int32_t src2)
    520 {
    521     int32_t dest;
    522 
    523     dest = (src1 >> 1) + (src2 >> 1);
    524     if ((src1 | src2) & 1)
    525         dest++;
    526     return dest;
    527 }
    528 
    529 uint32_t HELPER(neon_rhadd_u32)(uint32_t src1, uint32_t src2)
    530 {
    531     uint32_t dest;
    532 
    533     dest = (src1 >> 1) + (src2 >> 1);
    534     if ((src1 | src2) & 1)
    535         dest++;
    536     return dest;
    537 }
    538 
    539 #define NEON_FN(dest, src1, src2) dest = (src1 - src2) >> 1
    540 NEON_VOP(hsub_s8, neon_s8, 4)
    541 NEON_VOP(hsub_u8, neon_u8, 4)
    542 NEON_VOP(hsub_s16, neon_s16, 2)
    543 NEON_VOP(hsub_u16, neon_u16, 2)
    544 #undef NEON_FN
    545 
    546 int32_t HELPER(neon_hsub_s32)(int32_t src1, int32_t src2)
    547 {
    548     int32_t dest;
    549 
    550     dest = (src1 >> 1) - (src2 >> 1);
    551     if ((~src1) & src2 & 1)
    552         dest--;
    553     return dest;
    554 }
    555 
    556 uint32_t HELPER(neon_hsub_u32)(uint32_t src1, uint32_t src2)
    557 {
    558     uint32_t dest;
    559 
    560     dest = (src1 >> 1) - (src2 >> 1);
    561     if ((~src1) & src2 & 1)
    562         dest--;
    563     return dest;
    564 }
    565 
    566 #define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2
    567 NEON_POP(pmin_s8, neon_s8, 4)
    568 NEON_POP(pmin_u8, neon_u8, 4)
    569 NEON_POP(pmin_s16, neon_s16, 2)
    570 NEON_POP(pmin_u16, neon_u16, 2)
    571 #undef NEON_FN
    572 
    573 #define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2
    574 NEON_POP(pmax_s8, neon_s8, 4)
    575 NEON_POP(pmax_u8, neon_u8, 4)
    576 NEON_POP(pmax_s16, neon_s16, 2)
    577 NEON_POP(pmax_u16, neon_u16, 2)
    578 #undef NEON_FN
    579 
    580 #define NEON_FN(dest, src1, src2) \
    581     (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
    582 NEON_VOP(shl_u16, neon_u16, 2)
    583 #undef NEON_FN
    584 
    585 #define NEON_FN(dest, src1, src2) \
    586     (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
    587 NEON_VOP(shl_s16, neon_s16, 2)
    588 #undef NEON_FN
    589 
    590 #define NEON_FN(dest, src1, src2) \
    591     (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
    592 NEON_VOP(rshl_s8, neon_s8, 4)
    593 #undef NEON_FN
    594 
    595 #define NEON_FN(dest, src1, src2) \
    596     (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
    597 NEON_VOP(rshl_s16, neon_s16, 2)
    598 #undef NEON_FN
    599 
    600 uint32_t HELPER(neon_rshl_s32)(uint32_t val, uint32_t shift)
    601 {
    602     return do_sqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
    603 }
    604 
    605 uint64_t HELPER(neon_rshl_s64)(uint64_t val, uint64_t shift)
    606 {
    607     return do_sqrshl_d(val, (int8_t)shift, true, NULL);
    608 }
    609 
    610 #define NEON_FN(dest, src1, src2) \
    611     (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
    612 NEON_VOP(rshl_u8, neon_u8, 4)
    613 #undef NEON_FN
    614 
    615 #define NEON_FN(dest, src1, src2) \
    616     (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
    617 NEON_VOP(rshl_u16, neon_u16, 2)
    618 #undef NEON_FN
    619 
    620 uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shift)
    621 {
    622     return do_uqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
    623 }
    624 
    625 uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shift)
    626 {
    627     return do_uqrshl_d(val, (int8_t)shift, true, NULL);
    628 }
    629 
    630 #define NEON_FN(dest, src1, src2) \
    631     (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
    632 NEON_VOP_ENV(qshl_u8, neon_u8, 4)
    633 #undef NEON_FN
    634 
    635 #define NEON_FN(dest, src1, src2) \
    636     (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
    637 NEON_VOP_ENV(qshl_u16, neon_u16, 2)
    638 #undef NEON_FN
    639 
    640 uint32_t HELPER(neon_qshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
    641 {
    642     return do_uqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
    643 }
    644 
    645 uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
    646 {
    647     return do_uqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
    648 }
    649 
    650 #define NEON_FN(dest, src1, src2) \
    651     (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
    652 NEON_VOP_ENV(qshl_s8, neon_s8, 4)
    653 #undef NEON_FN
    654 
    655 #define NEON_FN(dest, src1, src2) \
    656     (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
    657 NEON_VOP_ENV(qshl_s16, neon_s16, 2)
    658 #undef NEON_FN
    659 
    660 uint32_t HELPER(neon_qshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
    661 {
    662     return do_sqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
    663 }
    664 
    665 uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
    666 {
    667     return do_sqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
    668 }
    669 
    670 #define NEON_FN(dest, src1, src2) \
    671     (dest = do_suqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
    672 NEON_VOP_ENV(qshlu_s8, neon_s8, 4)
    673 #undef NEON_FN
    674 
    675 #define NEON_FN(dest, src1, src2) \
    676     (dest = do_suqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
    677 NEON_VOP_ENV(qshlu_s16, neon_s16, 2)
    678 #undef NEON_FN
    679 
    680 uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
    681 {
    682     return do_suqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
    683 }
    684 
    685 uint64_t HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
    686 {
    687     return do_suqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
    688 }
    689 
    690 #define NEON_FN(dest, src1, src2) \
    691     (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
    692 NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
    693 #undef NEON_FN
    694 
    695 #define NEON_FN(dest, src1, src2) \
    696     (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
    697 NEON_VOP_ENV(qrshl_u16, neon_u16, 2)
    698 #undef NEON_FN
    699 
    700 uint32_t HELPER(neon_qrshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
    701 {
    702     return do_uqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
    703 }
    704 
    705 uint64_t HELPER(neon_qrshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
    706 {
    707     return do_uqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
    708 }
    709 
    710 #define NEON_FN(dest, src1, src2) \
    711     (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
    712 NEON_VOP_ENV(qrshl_s8, neon_s8, 4)
    713 #undef NEON_FN
    714 
    715 #define NEON_FN(dest, src1, src2) \
    716     (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
    717 NEON_VOP_ENV(qrshl_s16, neon_s16, 2)
    718 #undef NEON_FN
    719 
    720 uint32_t HELPER(neon_qrshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
    721 {
    722     return do_sqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
    723 }
    724 
    725 uint64_t HELPER(neon_qrshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
    726 {
    727     return do_sqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
    728 }
    729 
    730 uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b)
    731 {
    732     uint32_t mask;
    733     mask = (a ^ b) & 0x80808080u;
    734     a &= ~0x80808080u;
    735     b &= ~0x80808080u;
    736     return (a + b) ^ mask;
    737 }
    738 
    739 uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b)
    740 {
    741     uint32_t mask;
    742     mask = (a ^ b) & 0x80008000u;
    743     a &= ~0x80008000u;
    744     b &= ~0x80008000u;
    745     return (a + b) ^ mask;
    746 }
    747 
    748 #define NEON_FN(dest, src1, src2) dest = src1 + src2
    749 NEON_POP(padd_u8, neon_u8, 4)
    750 NEON_POP(padd_u16, neon_u16, 2)
    751 #undef NEON_FN
    752 
    753 #define NEON_FN(dest, src1, src2) dest = src1 - src2
    754 NEON_VOP(sub_u8, neon_u8, 4)
    755 NEON_VOP(sub_u16, neon_u16, 2)
    756 #undef NEON_FN
    757 
    758 #define NEON_FN(dest, src1, src2) dest = src1 * src2
    759 NEON_VOP(mul_u8, neon_u8, 4)
    760 NEON_VOP(mul_u16, neon_u16, 2)
    761 #undef NEON_FN
    762 
    763 #define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
    764 NEON_VOP(tst_u8, neon_u8, 4)
    765 NEON_VOP(tst_u16, neon_u16, 2)
    766 NEON_VOP(tst_u32, neon_u32, 1)
    767 #undef NEON_FN
    768 
    769 /* Count Leading Sign/Zero Bits.  */
    770 static inline int do_clz8(uint8_t x)
    771 {
    772     int n;
    773     for (n = 8; x; n--)
    774         x >>= 1;
    775     return n;
    776 }
    777 
    778 static inline int do_clz16(uint16_t x)
    779 {
    780     int n;
    781     for (n = 16; x; n--)
    782         x >>= 1;
    783     return n;
    784 }
    785 
    786 #define NEON_FN(dest, src, dummy) dest = do_clz8(src)
    787 NEON_VOP1(clz_u8, neon_u8, 4)
    788 #undef NEON_FN
    789 
    790 #define NEON_FN(dest, src, dummy) dest = do_clz16(src)
    791 NEON_VOP1(clz_u16, neon_u16, 2)
    792 #undef NEON_FN
    793 
    794 #define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1
    795 NEON_VOP1(cls_s8, neon_s8, 4)
    796 #undef NEON_FN
    797 
    798 #define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1
    799 NEON_VOP1(cls_s16, neon_s16, 2)
    800 #undef NEON_FN
    801 
    802 uint32_t HELPER(neon_cls_s32)(uint32_t x)
    803 {
    804     int count;
    805     if ((int32_t)x < 0)
    806         x = ~x;
    807     for (count = 32; x; count--)
    808         x = x >> 1;
    809     return count - 1;
    810 }
    811 
    812 /* Bit count.  */
    813 uint32_t HELPER(neon_cnt_u8)(uint32_t x)
    814 {
    815     x = (x & 0x55555555) + ((x >>  1) & 0x55555555);
    816     x = (x & 0x33333333) + ((x >>  2) & 0x33333333);
    817     x = (x & 0x0f0f0f0f) + ((x >>  4) & 0x0f0f0f0f);
    818     return x;
    819 }
    820 
    821 /* Reverse bits in each 8 bit word */
    822 uint32_t HELPER(neon_rbit_u8)(uint32_t x)
    823 {
    824     x =  ((x & 0xf0f0f0f0) >> 4)
    825        | ((x & 0x0f0f0f0f) << 4);
    826     x =  ((x & 0x88888888) >> 3)
    827        | ((x & 0x44444444) >> 1)
    828        | ((x & 0x22222222) << 1)
    829        | ((x & 0x11111111) << 3);
    830     return x;
    831 }
    832 
    833 #define NEON_QDMULH16(dest, src1, src2, round) do { \
    834     uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \
    835     if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
    836         SET_QC(); \
    837         tmp = (tmp >> 31) ^ ~SIGNBIT; \
    838     } else { \
    839         tmp <<= 1; \
    840     } \
    841     if (round) { \
    842         int32_t old = tmp; \
    843         tmp += 1 << 15; \
    844         if ((int32_t)tmp < old) { \
    845             SET_QC(); \
    846             tmp = SIGNBIT - 1; \
    847         } \
    848     } \
    849     dest = tmp >> 16; \
    850     } while(0)
    851 #define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)
    852 NEON_VOP_ENV(qdmulh_s16, neon_s16, 2)
    853 #undef NEON_FN
    854 #define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
    855 NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2)
    856 #undef NEON_FN
    857 #undef NEON_QDMULH16
    858 
    859 #define NEON_QDMULH32(dest, src1, src2, round) do { \
    860     uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \
    861     if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \
    862         SET_QC(); \
    863         tmp = (tmp >> 63) ^ ~SIGNBIT64; \
    864     } else { \
    865         tmp <<= 1; \
    866     } \
    867     if (round) { \
    868         int64_t old = tmp; \
    869         tmp += (int64_t)1 << 31; \
    870         if ((int64_t)tmp < old) { \
    871             SET_QC(); \
    872             tmp = SIGNBIT64 - 1; \
    873         } \
    874     } \
    875     dest = tmp >> 32; \
    876     } while(0)
    877 #define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
    878 NEON_VOP_ENV(qdmulh_s32, neon_s32, 1)
    879 #undef NEON_FN
    880 #define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
    881 NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1)
    882 #undef NEON_FN
    883 #undef NEON_QDMULH32
    884 
    885 uint32_t HELPER(neon_narrow_u8)(uint64_t x)
    886 {
    887     return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u)
    888            | ((x >> 24) & 0xff000000u);
    889 }
    890 
    891 uint32_t HELPER(neon_narrow_u16)(uint64_t x)
    892 {
    893     return (x & 0xffffu) | ((x >> 16) & 0xffff0000u);
    894 }
    895 
    896 uint32_t HELPER(neon_narrow_high_u8)(uint64_t x)
    897 {
    898     return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
    899             | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
    900 }
    901 
    902 uint32_t HELPER(neon_narrow_high_u16)(uint64_t x)
    903 {
    904     return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
    905 }
    906 
    907 uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x)
    908 {
    909     x &= 0xff80ff80ff80ff80ull;
    910     x += 0x0080008000800080ull;
    911     return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
    912             | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
    913 }
    914 
    915 uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x)
    916 {
    917     x &= 0xffff8000ffff8000ull;
    918     x += 0x0000800000008000ull;
    919     return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
    920 }
    921 
    922 uint32_t HELPER(neon_unarrow_sat8)(CPUARMState *env, uint64_t x)
    923 {
    924     uint16_t s;
    925     uint8_t d;
    926     uint32_t res = 0;
    927 #define SAT8(n) \
    928     s = x >> n; \
    929     if (s & 0x8000) { \
    930         SET_QC(); \
    931     } else { \
    932         if (s > 0xff) { \
    933             d = 0xff; \
    934             SET_QC(); \
    935         } else  { \
    936             d = s; \
    937         } \
    938         res |= (uint32_t)d << (n / 2); \
    939     }
    940 
    941     SAT8(0);
    942     SAT8(16);
    943     SAT8(32);
    944     SAT8(48);
    945 #undef SAT8
    946     return res;
    947 }
    948 
    949 uint32_t HELPER(neon_narrow_sat_u8)(CPUARMState *env, uint64_t x)
    950 {
    951     uint16_t s;
    952     uint8_t d;
    953     uint32_t res = 0;
    954 #define SAT8(n) \
    955     s = x >> n; \
    956     if (s > 0xff) { \
    957         d = 0xff; \
    958         SET_QC(); \
    959     } else  { \
    960         d = s; \
    961     } \
    962     res |= (uint32_t)d << (n / 2);
    963 
    964     SAT8(0);
    965     SAT8(16);
    966     SAT8(32);
    967     SAT8(48);
    968 #undef SAT8
    969     return res;
    970 }
    971 
    972 uint32_t HELPER(neon_narrow_sat_s8)(CPUARMState *env, uint64_t x)
    973 {
    974     int16_t s;
    975     uint8_t d;
    976     uint32_t res = 0;
    977 #define SAT8(n) \
    978     s = x >> n; \
    979     if (s != (int8_t)s) { \
    980         d = (s >> 15) ^ 0x7f; \
    981         SET_QC(); \
    982     } else  { \
    983         d = s; \
    984     } \
    985     res |= (uint32_t)d << (n / 2);
    986 
    987     SAT8(0);
    988     SAT8(16);
    989     SAT8(32);
    990     SAT8(48);
    991 #undef SAT8
    992     return res;
    993 }
    994 
    995 uint32_t HELPER(neon_unarrow_sat16)(CPUARMState *env, uint64_t x)
    996 {
    997     uint32_t high;
    998     uint32_t low;
    999     low = x;
   1000     if (low & 0x80000000) {
   1001         low = 0;
   1002         SET_QC();
   1003     } else if (low > 0xffff) {
   1004         low = 0xffff;
   1005         SET_QC();
   1006     }
   1007     high = x >> 32;
   1008     if (high & 0x80000000) {
   1009         high = 0;
   1010         SET_QC();
   1011     } else if (high > 0xffff) {
   1012         high = 0xffff;
   1013         SET_QC();
   1014     }
   1015     return low | (high << 16);
   1016 }
   1017 
   1018 uint32_t HELPER(neon_narrow_sat_u16)(CPUARMState *env, uint64_t x)
   1019 {
   1020     uint32_t high;
   1021     uint32_t low;
   1022     low = x;
   1023     if (low > 0xffff) {
   1024         low = 0xffff;
   1025         SET_QC();
   1026     }
   1027     high = x >> 32;
   1028     if (high > 0xffff) {
   1029         high = 0xffff;
   1030         SET_QC();
   1031     }
   1032     return low | (high << 16);
   1033 }
   1034 
   1035 uint32_t HELPER(neon_narrow_sat_s16)(CPUARMState *env, uint64_t x)
   1036 {
   1037     int32_t low;
   1038     int32_t high;
   1039     low = x;
   1040     if (low != (int16_t)low) {
   1041         low = (low >> 31) ^ 0x7fff;
   1042         SET_QC();
   1043     }
   1044     high = x >> 32;
   1045     if (high != (int16_t)high) {
   1046         high = (high >> 31) ^ 0x7fff;
   1047         SET_QC();
   1048     }
   1049     return (uint16_t)low | (high << 16);
   1050 }
   1051 
   1052 uint32_t HELPER(neon_unarrow_sat32)(CPUARMState *env, uint64_t x)
   1053 {
   1054     if (x & 0x8000000000000000ull) {
   1055         SET_QC();
   1056         return 0;
   1057     }
   1058     if (x > 0xffffffffu) {
   1059         SET_QC();
   1060         return 0xffffffffu;
   1061     }
   1062     return x;
   1063 }
   1064 
   1065 uint32_t HELPER(neon_narrow_sat_u32)(CPUARMState *env, uint64_t x)
   1066 {
   1067     if (x > 0xffffffffu) {
   1068         SET_QC();
   1069         return 0xffffffffu;
   1070     }
   1071     return x;
   1072 }
   1073 
   1074 uint32_t HELPER(neon_narrow_sat_s32)(CPUARMState *env, uint64_t x)
   1075 {
   1076     if ((int64_t)x != (int32_t)x) {
   1077         SET_QC();
   1078         return ((int64_t)x >> 63) ^ 0x7fffffff;
   1079     }
   1080     return x;
   1081 }
   1082 
   1083 uint64_t HELPER(neon_widen_u8)(uint32_t x)
   1084 {
   1085     uint64_t tmp;
   1086     uint64_t ret;
   1087     ret = (uint8_t)x;
   1088     tmp = (uint8_t)(x >> 8);
   1089     ret |= tmp << 16;
   1090     tmp = (uint8_t)(x >> 16);
   1091     ret |= tmp << 32;
   1092     tmp = (uint8_t)(x >> 24);
   1093     ret |= tmp << 48;
   1094     return ret;
   1095 }
   1096 
   1097 uint64_t HELPER(neon_widen_s8)(uint32_t x)
   1098 {
   1099     uint64_t tmp;
   1100     uint64_t ret;
   1101     ret = (uint16_t)(int8_t)x;
   1102     tmp = (uint16_t)(int8_t)(x >> 8);
   1103     ret |= tmp << 16;
   1104     tmp = (uint16_t)(int8_t)(x >> 16);
   1105     ret |= tmp << 32;
   1106     tmp = (uint16_t)(int8_t)(x >> 24);
   1107     ret |= tmp << 48;
   1108     return ret;
   1109 }
   1110 
   1111 uint64_t HELPER(neon_widen_u16)(uint32_t x)
   1112 {
   1113     uint64_t high = (uint16_t)(x >> 16);
   1114     return ((uint16_t)x) | (high << 32);
   1115 }
   1116 
   1117 uint64_t HELPER(neon_widen_s16)(uint32_t x)
   1118 {
   1119     uint64_t high = (int16_t)(x >> 16);
   1120     return ((uint32_t)(int16_t)x) | (high << 32);
   1121 }
   1122 
   1123 uint64_t HELPER(neon_addl_u16)(uint64_t a, uint64_t b)
   1124 {
   1125     uint64_t mask;
   1126     mask = (a ^ b) & 0x8000800080008000ull;
   1127     a &= ~0x8000800080008000ull;
   1128     b &= ~0x8000800080008000ull;
   1129     return (a + b) ^ mask;
   1130 }
   1131 
   1132 uint64_t HELPER(neon_addl_u32)(uint64_t a, uint64_t b)
   1133 {
   1134     uint64_t mask;
   1135     mask = (a ^ b) & 0x8000000080000000ull;
   1136     a &= ~0x8000000080000000ull;
   1137     b &= ~0x8000000080000000ull;
   1138     return (a + b) ^ mask;
   1139 }
   1140 
   1141 uint64_t HELPER(neon_paddl_u16)(uint64_t a, uint64_t b)
   1142 {
   1143     uint64_t tmp;
   1144     uint64_t tmp2;
   1145 
   1146     tmp = a & 0x0000ffff0000ffffull;
   1147     tmp += (a >> 16) & 0x0000ffff0000ffffull;
   1148     tmp2 = b & 0xffff0000ffff0000ull;
   1149     tmp2 += (b << 16) & 0xffff0000ffff0000ull;
   1150     return    ( tmp         & 0xffff)
   1151             | ((tmp  >> 16) & 0xffff0000ull)
   1152             | ((tmp2 << 16) & 0xffff00000000ull)
   1153             | ( tmp2        & 0xffff000000000000ull);
   1154 }
   1155 
   1156 uint64_t HELPER(neon_paddl_u32)(uint64_t a, uint64_t b)
   1157 {
   1158     uint32_t low = a + (a >> 32);
   1159     uint32_t high = b + (b >> 32);
   1160     return low + ((uint64_t)high << 32);
   1161 }
   1162 
   1163 uint64_t HELPER(neon_subl_u16)(uint64_t a, uint64_t b)
   1164 {
   1165     uint64_t mask;
   1166     mask = (a ^ ~b) & 0x8000800080008000ull;
   1167     a |= 0x8000800080008000ull;
   1168     b &= ~0x8000800080008000ull;
   1169     return (a - b) ^ mask;
   1170 }
   1171 
   1172 uint64_t HELPER(neon_subl_u32)(uint64_t a, uint64_t b)
   1173 {
   1174     uint64_t mask;
   1175     mask = (a ^ ~b) & 0x8000000080000000ull;
   1176     a |= 0x8000000080000000ull;
   1177     b &= ~0x8000000080000000ull;
   1178     return (a - b) ^ mask;
   1179 }
   1180 
   1181 uint64_t HELPER(neon_addl_saturate_s32)(CPUARMState *env, uint64_t a, uint64_t b)
   1182 {
   1183     uint32_t x, y;
   1184     uint32_t low, high;
   1185 
   1186     x = a;
   1187     y = b;
   1188     low = x + y;
   1189     if (((low ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
   1190         SET_QC();
   1191         low = ((int32_t)x >> 31) ^ ~SIGNBIT;
   1192     }
   1193     x = a >> 32;
   1194     y = b >> 32;
   1195     high = x + y;
   1196     if (((high ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
   1197         SET_QC();
   1198         high = ((int32_t)x >> 31) ^ ~SIGNBIT;
   1199     }
   1200     return low | ((uint64_t)high << 32);
   1201 }
   1202 
   1203 uint64_t HELPER(neon_addl_saturate_s64)(CPUARMState *env, uint64_t a, uint64_t b)
   1204 {
   1205     uint64_t result;
   1206 
   1207     result = a + b;
   1208     if (((result ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64)) {
   1209         SET_QC();
   1210         result = ((int64_t)a >> 63) ^ ~SIGNBIT64;
   1211     }
   1212     return result;
   1213 }
   1214 
   1215 /* We have to do the arithmetic in a larger type than
   1216  * the input type, because for example with a signed 32 bit
   1217  * op the absolute difference can overflow a signed 32 bit value.
   1218  */
   1219 #define DO_ABD(dest, x, y, intype, arithtype) do {            \
   1220     arithtype tmp_x = (intype)(x);                            \
   1221     arithtype tmp_y = (intype)(y);                            \
   1222     dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \
   1223     } while(0)
   1224 
   1225 uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b)
   1226 {
   1227     uint64_t tmp;
   1228     uint64_t result;
   1229     DO_ABD(result, a, b, uint8_t, uint32_t);
   1230     DO_ABD(tmp, a >> 8, b >> 8, uint8_t, uint32_t);
   1231     result |= tmp << 16;
   1232     DO_ABD(tmp, a >> 16, b >> 16, uint8_t, uint32_t);
   1233     result |= tmp << 32;
   1234     DO_ABD(tmp, a >> 24, b >> 24, uint8_t, uint32_t);
   1235     result |= tmp << 48;
   1236     return result;
   1237 }
   1238 
   1239 uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b)
   1240 {
   1241     uint64_t tmp;
   1242     uint64_t result;
   1243     DO_ABD(result, a, b, int8_t, int32_t);
   1244     DO_ABD(tmp, a >> 8, b >> 8, int8_t, int32_t);
   1245     result |= tmp << 16;
   1246     DO_ABD(tmp, a >> 16, b >> 16, int8_t, int32_t);
   1247     result |= tmp << 32;
   1248     DO_ABD(tmp, a >> 24, b >> 24, int8_t, int32_t);
   1249     result |= tmp << 48;
   1250     return result;
   1251 }
   1252 
   1253 uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b)
   1254 {
   1255     uint64_t tmp;
   1256     uint64_t result;
   1257     DO_ABD(result, a, b, uint16_t, uint32_t);
   1258     DO_ABD(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
   1259     return result | (tmp << 32);
   1260 }
   1261 
   1262 uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b)
   1263 {
   1264     uint64_t tmp;
   1265     uint64_t result;
   1266     DO_ABD(result, a, b, int16_t, int32_t);
   1267     DO_ABD(tmp, a >> 16, b >> 16, int16_t, int32_t);
   1268     return result | (tmp << 32);
   1269 }
   1270 
   1271 uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b)
   1272 {
   1273     uint64_t result;
   1274     DO_ABD(result, a, b, uint32_t, uint64_t);
   1275     return result;
   1276 }
   1277 
   1278 uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b)
   1279 {
   1280     uint64_t result;
   1281     DO_ABD(result, a, b, int32_t, int64_t);
   1282     return result;
   1283 }
   1284 #undef DO_ABD
   1285 
   1286 /* Widening multiply. Named type is the source type.  */
   1287 #define DO_MULL(dest, x, y, type1, type2) do { \
   1288     type1 tmp_x = x; \
   1289     type1 tmp_y = y; \
   1290     dest = (type2)((type2)tmp_x * (type2)tmp_y); \
   1291     } while(0)
   1292 
   1293 uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b)
   1294 {
   1295     uint64_t tmp;
   1296     uint64_t result;
   1297 
   1298     DO_MULL(result, a, b, uint8_t, uint16_t);
   1299     DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t);
   1300     result |= tmp << 16;
   1301     DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t);
   1302     result |= tmp << 32;
   1303     DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t);
   1304     result |= tmp << 48;
   1305     return result;
   1306 }
   1307 
   1308 uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b)
   1309 {
   1310     uint64_t tmp;
   1311     uint64_t result;
   1312 
   1313     DO_MULL(result, a, b, int8_t, uint16_t);
   1314     DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t);
   1315     result |= tmp << 16;
   1316     DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t);
   1317     result |= tmp << 32;
   1318     DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t);
   1319     result |= tmp << 48;
   1320     return result;
   1321 }
   1322 
   1323 uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b)
   1324 {
   1325     uint64_t tmp;
   1326     uint64_t result;
   1327 
   1328     DO_MULL(result, a, b, uint16_t, uint32_t);
   1329     DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
   1330     return result | (tmp << 32);
   1331 }
   1332 
   1333 uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b)
   1334 {
   1335     uint64_t tmp;
   1336     uint64_t result;
   1337 
   1338     DO_MULL(result, a, b, int16_t, uint32_t);
   1339     DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t);
   1340     return result | (tmp << 32);
   1341 }
   1342 
   1343 uint64_t HELPER(neon_negl_u16)(uint64_t x)
   1344 {
   1345     uint16_t tmp;
   1346     uint64_t result;
   1347     result = (uint16_t)-x;
   1348     tmp = -(x >> 16);
   1349     result |= (uint64_t)tmp << 16;
   1350     tmp = -(x >> 32);
   1351     result |= (uint64_t)tmp << 32;
   1352     tmp = -(x >> 48);
   1353     result |= (uint64_t)tmp << 48;
   1354     return result;
   1355 }
   1356 
   1357 uint64_t HELPER(neon_negl_u32)(uint64_t x)
   1358 {
   1359     uint32_t low = -x;
   1360     uint32_t high = -(x >> 32);
   1361     return low | ((uint64_t)high << 32);
   1362 }
   1363 
   1364 /* Saturating sign manipulation.  */
   1365 /* ??? Make these use NEON_VOP1 */
   1366 #define DO_QABS8(x) do { \
   1367     if (x == (int8_t)0x80) { \
   1368         x = 0x7f; \
   1369         SET_QC(); \
   1370     } else if (x < 0) { \
   1371         x = -x; \
   1372     }} while (0)
   1373 uint32_t HELPER(neon_qabs_s8)(CPUARMState *env, uint32_t x)
   1374 {
   1375     neon_s8 vec;
   1376     NEON_UNPACK(neon_s8, vec, x);
   1377     DO_QABS8(vec.v1);
   1378     DO_QABS8(vec.v2);
   1379     DO_QABS8(vec.v3);
   1380     DO_QABS8(vec.v4);
   1381     NEON_PACK(neon_s8, x, vec);
   1382     return x;
   1383 }
   1384 #undef DO_QABS8
   1385 
   1386 #define DO_QNEG8(x) do { \
   1387     if (x == (int8_t)0x80) { \
   1388         x = 0x7f; \
   1389         SET_QC(); \
   1390     } else { \
   1391         x = -x; \
   1392     }} while (0)
   1393 uint32_t HELPER(neon_qneg_s8)(CPUARMState *env, uint32_t x)
   1394 {
   1395     neon_s8 vec;
   1396     NEON_UNPACK(neon_s8, vec, x);
   1397     DO_QNEG8(vec.v1);
   1398     DO_QNEG8(vec.v2);
   1399     DO_QNEG8(vec.v3);
   1400     DO_QNEG8(vec.v4);
   1401     NEON_PACK(neon_s8, x, vec);
   1402     return x;
   1403 }
   1404 #undef DO_QNEG8
   1405 
   1406 #define DO_QABS16(x) do { \
   1407     if (x == (int16_t)0x8000) { \
   1408         x = 0x7fff; \
   1409         SET_QC(); \
   1410     } else if (x < 0) { \
   1411         x = -x; \
   1412     }} while (0)
   1413 uint32_t HELPER(neon_qabs_s16)(CPUARMState *env, uint32_t x)
   1414 {
   1415     neon_s16 vec;
   1416     NEON_UNPACK(neon_s16, vec, x);
   1417     DO_QABS16(vec.v1);
   1418     DO_QABS16(vec.v2);
   1419     NEON_PACK(neon_s16, x, vec);
   1420     return x;
   1421 }
   1422 #undef DO_QABS16
   1423 
   1424 #define DO_QNEG16(x) do { \
   1425     if (x == (int16_t)0x8000) { \
   1426         x = 0x7fff; \
   1427         SET_QC(); \
   1428     } else { \
   1429         x = -x; \
   1430     }} while (0)
   1431 uint32_t HELPER(neon_qneg_s16)(CPUARMState *env, uint32_t x)
   1432 {
   1433     neon_s16 vec;
   1434     NEON_UNPACK(neon_s16, vec, x);
   1435     DO_QNEG16(vec.v1);
   1436     DO_QNEG16(vec.v2);
   1437     NEON_PACK(neon_s16, x, vec);
   1438     return x;
   1439 }
   1440 #undef DO_QNEG16
   1441 
   1442 uint32_t HELPER(neon_qabs_s32)(CPUARMState *env, uint32_t x)
   1443 {
   1444     if (x == SIGNBIT) {
   1445         SET_QC();
   1446         x = ~SIGNBIT;
   1447     } else if ((int32_t)x < 0) {
   1448         x = -x;
   1449     }
   1450     return x;
   1451 }
   1452 
   1453 uint32_t HELPER(neon_qneg_s32)(CPUARMState *env, uint32_t x)
   1454 {
   1455     if (x == SIGNBIT) {
   1456         SET_QC();
   1457         x = ~SIGNBIT;
   1458     } else {
   1459         x = -x;
   1460     }
   1461     return x;
   1462 }
   1463 
   1464 uint64_t HELPER(neon_qabs_s64)(CPUARMState *env, uint64_t x)
   1465 {
   1466     if (x == SIGNBIT64) {
   1467         SET_QC();
   1468         x = ~SIGNBIT64;
   1469     } else if ((int64_t)x < 0) {
   1470         x = -x;
   1471     }
   1472     return x;
   1473 }
   1474 
   1475 uint64_t HELPER(neon_qneg_s64)(CPUARMState *env, uint64_t x)
   1476 {
   1477     if (x == SIGNBIT64) {
   1478         SET_QC();
   1479         x = ~SIGNBIT64;
   1480     } else {
   1481         x = -x;
   1482     }
   1483     return x;
   1484 }
   1485 
   1486 /* NEON Float helpers.  */
   1487 
   1488 /* Floating point comparisons produce an integer result.
   1489  * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
   1490  * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
   1491  */
   1492 uint32_t HELPER(neon_ceq_f32)(uint32_t a, uint32_t b, void *fpstp)
   1493 {
   1494     float_status *fpst = fpstp;
   1495     return -float32_eq_quiet(make_float32(a), make_float32(b), fpst);
   1496 }
   1497 
   1498 uint32_t HELPER(neon_cge_f32)(uint32_t a, uint32_t b, void *fpstp)
   1499 {
   1500     float_status *fpst = fpstp;
   1501     return -float32_le(make_float32(b), make_float32(a), fpst);
   1502 }
   1503 
   1504 uint32_t HELPER(neon_cgt_f32)(uint32_t a, uint32_t b, void *fpstp)
   1505 {
   1506     float_status *fpst = fpstp;
   1507     return -float32_lt(make_float32(b), make_float32(a), fpst);
   1508 }
   1509 
   1510 uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b, void *fpstp)
   1511 {
   1512     float_status *fpst = fpstp;
   1513     float32 f0 = float32_abs(make_float32(a));
   1514     float32 f1 = float32_abs(make_float32(b));
   1515     return -float32_le(f1, f0, fpst);
   1516 }
   1517 
   1518 uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b, void *fpstp)
   1519 {
   1520     float_status *fpst = fpstp;
   1521     float32 f0 = float32_abs(make_float32(a));
   1522     float32 f1 = float32_abs(make_float32(b));
   1523     return -float32_lt(f1, f0, fpst);
   1524 }
   1525 
   1526 uint64_t HELPER(neon_acge_f64)(uint64_t a, uint64_t b, void *fpstp)
   1527 {
   1528     float_status *fpst = fpstp;
   1529     float64 f0 = float64_abs(make_float64(a));
   1530     float64 f1 = float64_abs(make_float64(b));
   1531     return -float64_le(f1, f0, fpst);
   1532 }
   1533 
   1534 uint64_t HELPER(neon_acgt_f64)(uint64_t a, uint64_t b, void *fpstp)
   1535 {
   1536     float_status *fpst = fpstp;
   1537     float64 f0 = float64_abs(make_float64(a));
   1538     float64 f1 = float64_abs(make_float64(b));
   1539     return -float64_lt(f1, f0, fpst);
   1540 }
   1541 
   1542 #define ELEM(V, N, SIZE) (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1))
   1543 
   1544 void HELPER(neon_qunzip8)(void *vd, void *vm)
   1545 {
   1546     uint64_t *rd = vd, *rm = vm;
   1547     uint64_t zd0 = rd[0], zd1 = rd[1];
   1548     uint64_t zm0 = rm[0], zm1 = rm[1];
   1549 
   1550     uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zd0, 2, 8) << 8)
   1551         | (ELEM(zd0, 4, 8) << 16) | (ELEM(zd0, 6, 8) << 24)
   1552         | (ELEM(zd1, 0, 8) << 32) | (ELEM(zd1, 2, 8) << 40)
   1553         | (ELEM(zd1, 4, 8) << 48) | (ELEM(zd1, 6, 8) << 56);
   1554     uint64_t d1 = ELEM(zm0, 0, 8) | (ELEM(zm0, 2, 8) << 8)
   1555         | (ELEM(zm0, 4, 8) << 16) | (ELEM(zm0, 6, 8) << 24)
   1556         | (ELEM(zm1, 0, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
   1557         | (ELEM(zm1, 4, 8) << 48) | (ELEM(zm1, 6, 8) << 56);
   1558     uint64_t m0 = ELEM(zd0, 1, 8) | (ELEM(zd0, 3, 8) << 8)
   1559         | (ELEM(zd0, 5, 8) << 16) | (ELEM(zd0, 7, 8) << 24)
   1560         | (ELEM(zd1, 1, 8) << 32) | (ELEM(zd1, 3, 8) << 40)
   1561         | (ELEM(zd1, 5, 8) << 48) | (ELEM(zd1, 7, 8) << 56);
   1562     uint64_t m1 = ELEM(zm0, 1, 8) | (ELEM(zm0, 3, 8) << 8)
   1563         | (ELEM(zm0, 5, 8) << 16) | (ELEM(zm0, 7, 8) << 24)
   1564         | (ELEM(zm1, 1, 8) << 32) | (ELEM(zm1, 3, 8) << 40)
   1565         | (ELEM(zm1, 5, 8) << 48) | (ELEM(zm1, 7, 8) << 56);
   1566 
   1567     rm[0] = m0;
   1568     rm[1] = m1;
   1569     rd[0] = d0;
   1570     rd[1] = d1;
   1571 }
   1572 
   1573 void HELPER(neon_qunzip16)(void *vd, void *vm)
   1574 {
   1575     uint64_t *rd = vd, *rm = vm;
   1576     uint64_t zd0 = rd[0], zd1 = rd[1];
   1577     uint64_t zm0 = rm[0], zm1 = rm[1];
   1578 
   1579     uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zd0, 2, 16) << 16)
   1580         | (ELEM(zd1, 0, 16) << 32) | (ELEM(zd1, 2, 16) << 48);
   1581     uint64_t d1 = ELEM(zm0, 0, 16) | (ELEM(zm0, 2, 16) << 16)
   1582         | (ELEM(zm1, 0, 16) << 32) | (ELEM(zm1, 2, 16) << 48);
   1583     uint64_t m0 = ELEM(zd0, 1, 16) | (ELEM(zd0, 3, 16) << 16)
   1584         | (ELEM(zd1, 1, 16) << 32) | (ELEM(zd1, 3, 16) << 48);
   1585     uint64_t m1 = ELEM(zm0, 1, 16) | (ELEM(zm0, 3, 16) << 16)
   1586         | (ELEM(zm1, 1, 16) << 32) | (ELEM(zm1, 3, 16) << 48);
   1587 
   1588     rm[0] = m0;
   1589     rm[1] = m1;
   1590     rd[0] = d0;
   1591     rd[1] = d1;
   1592 }
   1593 
   1594 void HELPER(neon_qunzip32)(void *vd, void *vm)
   1595 {
   1596     uint64_t *rd = vd, *rm = vm;
   1597     uint64_t zd0 = rd[0], zd1 = rd[1];
   1598     uint64_t zm0 = rm[0], zm1 = rm[1];
   1599 
   1600     uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zd1, 0, 32) << 32);
   1601     uint64_t d1 = ELEM(zm0, 0, 32) | (ELEM(zm1, 0, 32) << 32);
   1602     uint64_t m0 = ELEM(zd0, 1, 32) | (ELEM(zd1, 1, 32) << 32);
   1603     uint64_t m1 = ELEM(zm0, 1, 32) | (ELEM(zm1, 1, 32) << 32);
   1604 
   1605     rm[0] = m0;
   1606     rm[1] = m1;
   1607     rd[0] = d0;
   1608     rd[1] = d1;
   1609 }
   1610 
   1611 void HELPER(neon_unzip8)(void *vd, void *vm)
   1612 {
   1613     uint64_t *rd = vd, *rm = vm;
   1614     uint64_t zd = rd[0], zm = rm[0];
   1615 
   1616     uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zd, 2, 8) << 8)
   1617         | (ELEM(zd, 4, 8) << 16) | (ELEM(zd, 6, 8) << 24)
   1618         | (ELEM(zm, 0, 8) << 32) | (ELEM(zm, 2, 8) << 40)
   1619         | (ELEM(zm, 4, 8) << 48) | (ELEM(zm, 6, 8) << 56);
   1620     uint64_t m0 = ELEM(zd, 1, 8) | (ELEM(zd, 3, 8) << 8)
   1621         | (ELEM(zd, 5, 8) << 16) | (ELEM(zd, 7, 8) << 24)
   1622         | (ELEM(zm, 1, 8) << 32) | (ELEM(zm, 3, 8) << 40)
   1623         | (ELEM(zm, 5, 8) << 48) | (ELEM(zm, 7, 8) << 56);
   1624 
   1625     rm[0] = m0;
   1626     rd[0] = d0;
   1627 }
   1628 
   1629 void HELPER(neon_unzip16)(void *vd, void *vm)
   1630 {
   1631     uint64_t *rd = vd, *rm = vm;
   1632     uint64_t zd = rd[0], zm = rm[0];
   1633 
   1634     uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zd, 2, 16) << 16)
   1635         | (ELEM(zm, 0, 16) << 32) | (ELEM(zm, 2, 16) << 48);
   1636     uint64_t m0 = ELEM(zd, 1, 16) | (ELEM(zd, 3, 16) << 16)
   1637         | (ELEM(zm, 1, 16) << 32) | (ELEM(zm, 3, 16) << 48);
   1638 
   1639     rm[0] = m0;
   1640     rd[0] = d0;
   1641 }
   1642 
   1643 void HELPER(neon_qzip8)(void *vd, void *vm)
   1644 {
   1645     uint64_t *rd = vd, *rm = vm;
   1646     uint64_t zd0 = rd[0], zd1 = rd[1];
   1647     uint64_t zm0 = rm[0], zm1 = rm[1];
   1648 
   1649     uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zm0, 0, 8) << 8)
   1650         | (ELEM(zd0, 1, 8) << 16) | (ELEM(zm0, 1, 8) << 24)
   1651         | (ELEM(zd0, 2, 8) << 32) | (ELEM(zm0, 2, 8) << 40)
   1652         | (ELEM(zd0, 3, 8) << 48) | (ELEM(zm0, 3, 8) << 56);
   1653     uint64_t d1 = ELEM(zd0, 4, 8) | (ELEM(zm0, 4, 8) << 8)
   1654         | (ELEM(zd0, 5, 8) << 16) | (ELEM(zm0, 5, 8) << 24)
   1655         | (ELEM(zd0, 6, 8) << 32) | (ELEM(zm0, 6, 8) << 40)
   1656         | (ELEM(zd0, 7, 8) << 48) | (ELEM(zm0, 7, 8) << 56);
   1657     uint64_t m0 = ELEM(zd1, 0, 8) | (ELEM(zm1, 0, 8) << 8)
   1658         | (ELEM(zd1, 1, 8) << 16) | (ELEM(zm1, 1, 8) << 24)
   1659         | (ELEM(zd1, 2, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
   1660         | (ELEM(zd1, 3, 8) << 48) | (ELEM(zm1, 3, 8) << 56);
   1661     uint64_t m1 = ELEM(zd1, 4, 8) | (ELEM(zm1, 4, 8) << 8)
   1662         | (ELEM(zd1, 5, 8) << 16) | (ELEM(zm1, 5, 8) << 24)
   1663         | (ELEM(zd1, 6, 8) << 32) | (ELEM(zm1, 6, 8) << 40)
   1664         | (ELEM(zd1, 7, 8) << 48) | (ELEM(zm1, 7, 8) << 56);
   1665 
   1666     rm[0] = m0;
   1667     rm[1] = m1;
   1668     rd[0] = d0;
   1669     rd[1] = d1;
   1670 }
   1671 
   1672 void HELPER(neon_qzip16)(void *vd, void *vm)
   1673 {
   1674     uint64_t *rd = vd, *rm = vm;
   1675     uint64_t zd0 = rd[0], zd1 = rd[1];
   1676     uint64_t zm0 = rm[0], zm1 = rm[1];
   1677 
   1678     uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zm0, 0, 16) << 16)
   1679         | (ELEM(zd0, 1, 16) << 32) | (ELEM(zm0, 1, 16) << 48);
   1680     uint64_t d1 = ELEM(zd0, 2, 16) | (ELEM(zm0, 2, 16) << 16)
   1681         | (ELEM(zd0, 3, 16) << 32) | (ELEM(zm0, 3, 16) << 48);
   1682     uint64_t m0 = ELEM(zd1, 0, 16) | (ELEM(zm1, 0, 16) << 16)
   1683         | (ELEM(zd1, 1, 16) << 32) | (ELEM(zm1, 1, 16) << 48);
   1684     uint64_t m1 = ELEM(zd1, 2, 16) | (ELEM(zm1, 2, 16) << 16)
   1685         | (ELEM(zd1, 3, 16) << 32) | (ELEM(zm1, 3, 16) << 48);
   1686 
   1687     rm[0] = m0;
   1688     rm[1] = m1;
   1689     rd[0] = d0;
   1690     rd[1] = d1;
   1691 }
   1692 
   1693 void HELPER(neon_qzip32)(void *vd, void *vm)
   1694 {
   1695     uint64_t *rd = vd, *rm = vm;
   1696     uint64_t zd0 = rd[0], zd1 = rd[1];
   1697     uint64_t zm0 = rm[0], zm1 = rm[1];
   1698 
   1699     uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zm0, 0, 32) << 32);
   1700     uint64_t d1 = ELEM(zd0, 1, 32) | (ELEM(zm0, 1, 32) << 32);
   1701     uint64_t m0 = ELEM(zd1, 0, 32) | (ELEM(zm1, 0, 32) << 32);
   1702     uint64_t m1 = ELEM(zd1, 1, 32) | (ELEM(zm1, 1, 32) << 32);
   1703 
   1704     rm[0] = m0;
   1705     rm[1] = m1;
   1706     rd[0] = d0;
   1707     rd[1] = d1;
   1708 }
   1709 
   1710 void HELPER(neon_zip8)(void *vd, void *vm)
   1711 {
   1712     uint64_t *rd = vd, *rm = vm;
   1713     uint64_t zd = rd[0], zm = rm[0];
   1714 
   1715     uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zm, 0, 8) << 8)
   1716         | (ELEM(zd, 1, 8) << 16) | (ELEM(zm, 1, 8) << 24)
   1717         | (ELEM(zd, 2, 8) << 32) | (ELEM(zm, 2, 8) << 40)
   1718         | (ELEM(zd, 3, 8) << 48) | (ELEM(zm, 3, 8) << 56);
   1719     uint64_t m0 = ELEM(zd, 4, 8) | (ELEM(zm, 4, 8) << 8)
   1720         | (ELEM(zd, 5, 8) << 16) | (ELEM(zm, 5, 8) << 24)
   1721         | (ELEM(zd, 6, 8) << 32) | (ELEM(zm, 6, 8) << 40)
   1722         | (ELEM(zd, 7, 8) << 48) | (ELEM(zm, 7, 8) << 56);
   1723 
   1724     rm[0] = m0;
   1725     rd[0] = d0;
   1726 }
   1727 
   1728 void HELPER(neon_zip16)(void *vd, void *vm)
   1729 {
   1730     uint64_t *rd = vd, *rm = vm;
   1731     uint64_t zd = rd[0], zm = rm[0];
   1732 
   1733     uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zm, 0, 16) << 16)
   1734         | (ELEM(zd, 1, 16) << 32) | (ELEM(zm, 1, 16) << 48);
   1735     uint64_t m0 = ELEM(zd, 2, 16) | (ELEM(zm, 2, 16) << 16)
   1736         | (ELEM(zd, 3, 16) << 32) | (ELEM(zm, 3, 16) << 48);
   1737 
   1738     rm[0] = m0;
   1739     rd[0] = d0;
   1740 }