qemu

FORK: QEMU emulator
git clone https://git.neptards.moe/neptards/qemu.git
Log | Files | Refs | Submodules | LICENSE

ops_sse.h (80969B)


      1 /*
      2  *  MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4/PNI support
      3  *
      4  *  Copyright (c) 2005 Fabrice Bellard
      5  *  Copyright (c) 2008 Intel Corporation  <andrew.zaborowski@intel.com>
      6  *
      7  * This library is free software; you can redistribute it and/or
      8  * modify it under the terms of the GNU Lesser General Public
      9  * License as published by the Free Software Foundation; either
     10  * version 2.1 of the License, or (at your option) any later version.
     11  *
     12  * This library is distributed in the hope that it will be useful,
     13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
     14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     15  * Lesser General Public License for more details.
     16  *
     17  * You should have received a copy of the GNU Lesser General Public
     18  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
     19  */
     20 
     21 #include "crypto/aes.h"
     22 
     23 #if SHIFT == 0
     24 #define Reg MMXReg
     25 #define XMM_ONLY(...)
     26 #define B(n) MMX_B(n)
     27 #define W(n) MMX_W(n)
     28 #define L(n) MMX_L(n)
     29 #define Q(n) MMX_Q(n)
     30 #define SUFFIX _mmx
     31 #else
     32 #define Reg ZMMReg
     33 #define XMM_ONLY(...) __VA_ARGS__
     34 #define B(n) ZMM_B(n)
     35 #define W(n) ZMM_W(n)
     36 #define L(n) ZMM_L(n)
     37 #define Q(n) ZMM_Q(n)
     38 #if SHIFT == 1
     39 #define SUFFIX _xmm
     40 #else
     41 #define SUFFIX _ymm
     42 #endif
     43 #endif
     44 
     45 #define LANE_WIDTH (SHIFT ? 16 : 8)
     46 #define PACK_WIDTH (LANE_WIDTH / 2)
     47 
     48 #if SHIFT == 0
     49 #define FPSRL(x, c) ((x) >> shift)
     50 #define FPSRAW(x, c) ((int16_t)(x) >> shift)
     51 #define FPSRAL(x, c) ((int32_t)(x) >> shift)
     52 #define FPSLL(x, c) ((x) << shift)
     53 #endif
     54 
     55 void glue(helper_psrlw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
     56 {
     57     int shift;
     58     if (c->Q(0) > 15) {
     59         for (int i = 0; i < 1 << SHIFT; i++) {
     60             d->Q(i) = 0;
     61         }
     62     } else {
     63         shift = c->B(0);
     64         for (int i = 0; i < 4 << SHIFT; i++) {
     65             d->W(i) = FPSRL(s->W(i), shift);
     66         }
     67     }
     68 }
     69 
     70 void glue(helper_psllw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
     71 {
     72     int shift;
     73     if (c->Q(0) > 15) {
     74         for (int i = 0; i < 1 << SHIFT; i++) {
     75             d->Q(i) = 0;
     76         }
     77     } else {
     78         shift = c->B(0);
     79         for (int i = 0; i < 4 << SHIFT; i++) {
     80             d->W(i) = FPSLL(s->W(i), shift);
     81         }
     82     }
     83 }
     84 
     85 void glue(helper_psraw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
     86 {
     87     int shift;
     88     if (c->Q(0) > 15) {
     89         shift = 15;
     90     } else {
     91         shift = c->B(0);
     92     }
     93     for (int i = 0; i < 4 << SHIFT; i++) {
     94         d->W(i) = FPSRAW(s->W(i), shift);
     95     }
     96 }
     97 
     98 void glue(helper_psrld, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
     99 {
    100     int shift;
    101     if (c->Q(0) > 31) {
    102         for (int i = 0; i < 1 << SHIFT; i++) {
    103             d->Q(i) = 0;
    104         }
    105     } else {
    106         shift = c->B(0);
    107         for (int i = 0; i < 2 << SHIFT; i++) {
    108             d->L(i) = FPSRL(s->L(i), shift);
    109         }
    110     }
    111 }
    112 
    113 void glue(helper_pslld, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
    114 {
    115     int shift;
    116     if (c->Q(0) > 31) {
    117         for (int i = 0; i < 1 << SHIFT; i++) {
    118             d->Q(i) = 0;
    119         }
    120     } else {
    121         shift = c->B(0);
    122         for (int i = 0; i < 2 << SHIFT; i++) {
    123             d->L(i) = FPSLL(s->L(i), shift);
    124         }
    125     }
    126 }
    127 
    128 void glue(helper_psrad, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
    129 {
    130     int shift;
    131     if (c->Q(0) > 31) {
    132         shift = 31;
    133     } else {
    134         shift = c->B(0);
    135     }
    136     for (int i = 0; i < 2 << SHIFT; i++) {
    137         d->L(i) = FPSRAL(s->L(i), shift);
    138     }
    139 }
    140 
    141 void glue(helper_psrlq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
    142 {
    143     int shift;
    144     if (c->Q(0) > 63) {
    145         for (int i = 0; i < 1 << SHIFT; i++) {
    146             d->Q(i) = 0;
    147         }
    148     } else {
    149         shift = c->B(0);
    150         for (int i = 0; i < 1 << SHIFT; i++) {
    151             d->Q(i) = FPSRL(s->Q(i), shift);
    152         }
    153     }
    154 }
    155 
    156 void glue(helper_psllq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
    157 {
    158     int shift;
    159     if (c->Q(0) > 63) {
    160         for (int i = 0; i < 1 << SHIFT; i++) {
    161             d->Q(i) = 0;
    162         }
    163     } else {
    164         shift = c->B(0);
    165         for (int i = 0; i < 1 << SHIFT; i++) {
    166             d->Q(i) = FPSLL(s->Q(i), shift);
    167         }
    168     }
    169 }
    170 
    171 #if SHIFT >= 1
    172 void glue(helper_psrldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
    173 {
    174     int shift, i, j;
    175 
    176     shift = c->L(0);
    177     if (shift > 16) {
    178         shift = 16;
    179     }
    180     for (j = 0; j < 8 << SHIFT; j += LANE_WIDTH) {
    181         for (i = 0; i < 16 - shift; i++) {
    182             d->B(j + i) = s->B(j + i + shift);
    183         }
    184         for (i = 16 - shift; i < 16; i++) {
    185             d->B(j + i) = 0;
    186         }
    187     }
    188 }
    189 
    190 void glue(helper_pslldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
    191 {
    192     int shift, i, j;
    193 
    194     shift = c->L(0);
    195     if (shift > 16) {
    196         shift = 16;
    197     }
    198     for (j = 0; j < 8 << SHIFT; j += LANE_WIDTH) {
    199         for (i = 15; i >= shift; i--) {
    200             d->B(j + i) = s->B(j + i - shift);
    201         }
    202         for (i = 0; i < shift; i++) {
    203             d->B(j + i) = 0;
    204         }
    205     }
    206 }
    207 #endif
    208 
    209 #define SSE_HELPER_1(name, elem, num, F)                        \
    210     void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)   \
    211     {                                                           \
    212         int n = num;                                            \
    213         for (int i = 0; i < n; i++) {                           \
    214             d->elem(i) = F(s->elem(i));                         \
    215         }                                                       \
    216     }
    217 
    218 #define SSE_HELPER_2(name, elem, num, F)                        \
    219     void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)   \
    220     {                                                           \
    221         int n = num;                                            \
    222         for (int i = 0; i < n; i++) {                           \
    223             d->elem(i) = F(v->elem(i), s->elem(i));             \
    224         }                                                       \
    225     }
    226 
    227 #define SSE_HELPER_B(name, F)                                   \
    228     SSE_HELPER_2(name, B, 8 << SHIFT, F)
    229 
    230 #define SSE_HELPER_W(name, F)                                   \
    231     SSE_HELPER_2(name, W, 4 << SHIFT, F)
    232 
    233 #define SSE_HELPER_L(name, F)                                   \
    234     SSE_HELPER_2(name, L, 2 << SHIFT, F)
    235 
    236 #define SSE_HELPER_Q(name, F)                                   \
    237     SSE_HELPER_2(name, Q, 1 << SHIFT, F)
    238 
    239 #if SHIFT == 0
    240 static inline int satub(int x)
    241 {
    242     if (x < 0) {
    243         return 0;
    244     } else if (x > 255) {
    245         return 255;
    246     } else {
    247         return x;
    248     }
    249 }
    250 
    251 static inline int satuw(int x)
    252 {
    253     if (x < 0) {
    254         return 0;
    255     } else if (x > 65535) {
    256         return 65535;
    257     } else {
    258         return x;
    259     }
    260 }
    261 
    262 static inline int satsb(int x)
    263 {
    264     if (x < -128) {
    265         return -128;
    266     } else if (x > 127) {
    267         return 127;
    268     } else {
    269         return x;
    270     }
    271 }
    272 
    273 static inline int satsw(int x)
    274 {
    275     if (x < -32768) {
    276         return -32768;
    277     } else if (x > 32767) {
    278         return 32767;
    279     } else {
    280         return x;
    281     }
    282 }
    283 
    284 #define FADD(a, b) ((a) + (b))
    285 #define FADDUB(a, b) satub((a) + (b))
    286 #define FADDUW(a, b) satuw((a) + (b))
    287 #define FADDSB(a, b) satsb((int8_t)(a) + (int8_t)(b))
    288 #define FADDSW(a, b) satsw((int16_t)(a) + (int16_t)(b))
    289 
    290 #define FSUB(a, b) ((a) - (b))
    291 #define FSUBUB(a, b) satub((a) - (b))
    292 #define FSUBUW(a, b) satuw((a) - (b))
    293 #define FSUBSB(a, b) satsb((int8_t)(a) - (int8_t)(b))
    294 #define FSUBSW(a, b) satsw((int16_t)(a) - (int16_t)(b))
    295 #define FMINUB(a, b) ((a) < (b)) ? (a) : (b)
    296 #define FMINSW(a, b) ((int16_t)(a) < (int16_t)(b)) ? (a) : (b)
    297 #define FMAXUB(a, b) ((a) > (b)) ? (a) : (b)
    298 #define FMAXSW(a, b) ((int16_t)(a) > (int16_t)(b)) ? (a) : (b)
    299 
    300 #define FMULHRW(a, b) (((int16_t)(a) * (int16_t)(b) + 0x8000) >> 16)
    301 #define FMULHUW(a, b) ((a) * (b) >> 16)
    302 #define FMULHW(a, b) ((int16_t)(a) * (int16_t)(b) >> 16)
    303 
    304 #define FAVG(a, b) (((a) + (b) + 1) >> 1)
    305 #endif
    306 
    307 SSE_HELPER_W(helper_pmulhuw, FMULHUW)
    308 SSE_HELPER_W(helper_pmulhw, FMULHW)
    309 
    310 #if SHIFT == 0
    311 void glue(helper_pmulhrw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
    312 {
    313     d->W(0) = FMULHRW(d->W(0), s->W(0));
    314     d->W(1) = FMULHRW(d->W(1), s->W(1));
    315     d->W(2) = FMULHRW(d->W(2), s->W(2));
    316     d->W(3) = FMULHRW(d->W(3), s->W(3));
    317 }
    318 #endif
    319 
    320 SSE_HELPER_B(helper_pavgb, FAVG)
    321 SSE_HELPER_W(helper_pavgw, FAVG)
    322 
    323 void glue(helper_pmuludq, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
    324 {
    325     int i;
    326 
    327     for (i = 0; i < (1 << SHIFT); i++) {
    328         d->Q(i) = (uint64_t)s->L(i * 2) * (uint64_t)v->L(i * 2);
    329     }
    330 }
    331 
    332 void glue(helper_pmaddwd, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
    333 {
    334     int i;
    335 
    336     for (i = 0; i < (2 << SHIFT); i++) {
    337         d->L(i) = (int16_t)s->W(2 * i) * (int16_t)v->W(2 * i) +
    338             (int16_t)s->W(2 * i + 1) * (int16_t)v->W(2 * i + 1);
    339     }
    340 }
    341 
    342 #if SHIFT == 0
    343 static inline int abs1(int a)
    344 {
    345     if (a < 0) {
    346         return -a;
    347     } else {
    348         return a;
    349     }
    350 }
    351 #endif
    352 void glue(helper_psadbw, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
    353 {
    354     int i;
    355 
    356     for (i = 0; i < (1 << SHIFT); i++) {
    357         unsigned int val = 0;
    358         val += abs1(v->B(8 * i + 0) - s->B(8 * i + 0));
    359         val += abs1(v->B(8 * i + 1) - s->B(8 * i + 1));
    360         val += abs1(v->B(8 * i + 2) - s->B(8 * i + 2));
    361         val += abs1(v->B(8 * i + 3) - s->B(8 * i + 3));
    362         val += abs1(v->B(8 * i + 4) - s->B(8 * i + 4));
    363         val += abs1(v->B(8 * i + 5) - s->B(8 * i + 5));
    364         val += abs1(v->B(8 * i + 6) - s->B(8 * i + 6));
    365         val += abs1(v->B(8 * i + 7) - s->B(8 * i + 7));
    366         d->Q(i) = val;
    367     }
    368 }
    369 
    370 #if SHIFT < 2
    371 void glue(helper_maskmov, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
    372                                   target_ulong a0)
    373 {
    374     int i;
    375 
    376     for (i = 0; i < (8 << SHIFT); i++) {
    377         if (s->B(i) & 0x80) {
    378             cpu_stb_data_ra(env, a0 + i, d->B(i), GETPC());
    379         }
    380     }
    381 }
    382 #endif
    383 
    384 #define SHUFFLE4(F, a, b, offset) do {      \
    385     r0 = a->F((order & 3) + offset);        \
    386     r1 = a->F(((order >> 2) & 3) + offset); \
    387     r2 = b->F(((order >> 4) & 3) + offset); \
    388     r3 = b->F(((order >> 6) & 3) + offset); \
    389     d->F(offset) = r0;                      \
    390     d->F(offset + 1) = r1;                  \
    391     d->F(offset + 2) = r2;                  \
    392     d->F(offset + 3) = r3;                  \
    393     } while (0)
    394 
    395 #if SHIFT == 0
    396 void glue(helper_pshufw, SUFFIX)(Reg *d, Reg *s, int order)
    397 {
    398     uint16_t r0, r1, r2, r3;
    399 
    400     SHUFFLE4(W, s, s, 0);
    401 }
    402 #else
    403 void glue(helper_shufps, SUFFIX)(Reg *d, Reg *v, Reg *s, int order)
    404 {
    405     uint32_t r0, r1, r2, r3;
    406     int i;
    407 
    408     for (i = 0; i < 2 << SHIFT; i += 4) {
    409         SHUFFLE4(L, v, s, i);
    410     }
    411 }
    412 
    413 void glue(helper_shufpd, SUFFIX)(Reg *d, Reg *v, Reg *s, int order)
    414 {
    415     uint64_t r0, r1;
    416     int i;
    417 
    418     for (i = 0; i < 1 << SHIFT; i += 2) {
    419         r0 = v->Q(((order & 1) & 1) + i);
    420         r1 = s->Q(((order >> 1) & 1) + i);
    421         d->Q(i) = r0;
    422         d->Q(i + 1) = r1;
    423         order >>= 2;
    424     }
    425 }
    426 
    427 void glue(helper_pshufd, SUFFIX)(Reg *d, Reg *s, int order)
    428 {
    429     uint32_t r0, r1, r2, r3;
    430     int i;
    431 
    432     for (i = 0; i < 2 << SHIFT; i += 4) {
    433         SHUFFLE4(L, s, s, i);
    434     }
    435 }
    436 
    437 void glue(helper_pshuflw, SUFFIX)(Reg *d, Reg *s, int order)
    438 {
    439     uint16_t r0, r1, r2, r3;
    440     int i, j;
    441 
    442     for (i = 0, j = 1; j < 1 << SHIFT; i += 8, j += 2) {
    443         SHUFFLE4(W, s, s, i);
    444         d->Q(j) = s->Q(j);
    445     }
    446 }
    447 
    448 void glue(helper_pshufhw, SUFFIX)(Reg *d, Reg *s, int order)
    449 {
    450     uint16_t r0, r1, r2, r3;
    451     int i, j;
    452 
    453     for (i = 4, j = 0; j < 1 << SHIFT; i += 8, j += 2) {
    454         d->Q(j) = s->Q(j);
    455         SHUFFLE4(W, s, s, i);
    456     }
    457 }
    458 #endif
    459 
    460 #if SHIFT >= 1
    461 /* FPU ops */
    462 /* XXX: not accurate */
    463 
    464 #define SSE_HELPER_P(name, F)                                           \
    465     void glue(helper_ ## name ## ps, SUFFIX)(CPUX86State *env,          \
    466             Reg *d, Reg *v, Reg *s)                                     \
    467     {                                                                   \
    468         int i;                                                          \
    469         for (i = 0; i < 2 << SHIFT; i++) {                              \
    470             d->ZMM_S(i) = F(32, v->ZMM_S(i), s->ZMM_S(i));              \
    471         }                                                               \
    472     }                                                                   \
    473                                                                         \
    474     void glue(helper_ ## name ## pd, SUFFIX)(CPUX86State *env,          \
    475             Reg *d, Reg *v, Reg *s)                                     \
    476     {                                                                   \
    477         int i;                                                          \
    478         for (i = 0; i < 1 << SHIFT; i++) {                              \
    479             d->ZMM_D(i) = F(64, v->ZMM_D(i), s->ZMM_D(i));              \
    480         }                                                               \
    481     }
    482 
    483 #if SHIFT == 1
    484 
    485 #define SSE_HELPER_S(name, F)                                           \
    486     SSE_HELPER_P(name, F)                                               \
    487                                                                         \
    488     void helper_ ## name ## ss(CPUX86State *env, Reg *d, Reg *v, Reg *s)\
    489     {                                                                   \
    490         int i;                                                          \
    491         d->ZMM_S(0) = F(32, v->ZMM_S(0), s->ZMM_S(0));                  \
    492         for (i = 1; i < 2 << SHIFT; i++) {                              \
    493             d->ZMM_L(i) = v->ZMM_L(i);                                  \
    494         }                                                               \
    495     }                                                                   \
    496                                                                         \
    497     void helper_ ## name ## sd(CPUX86State *env, Reg *d, Reg *v, Reg *s)\
    498     {                                                                   \
    499         int i;                                                          \
    500         d->ZMM_D(0) = F(64, v->ZMM_D(0), s->ZMM_D(0));                  \
    501         for (i = 1; i < 1 << SHIFT; i++) {                              \
    502             d->ZMM_Q(i) = v->ZMM_Q(i);                                  \
    503         }                                                               \
    504     }
    505 
    506 #else
    507 
    508 #define SSE_HELPER_S(name, F) SSE_HELPER_P(name, F)
    509 
    510 #endif
    511 
    512 #define FPU_ADD(size, a, b) float ## size ## _add(a, b, &env->sse_status)
    513 #define FPU_SUB(size, a, b) float ## size ## _sub(a, b, &env->sse_status)
    514 #define FPU_MUL(size, a, b) float ## size ## _mul(a, b, &env->sse_status)
    515 #define FPU_DIV(size, a, b) float ## size ## _div(a, b, &env->sse_status)
    516 
    517 /* Note that the choice of comparison op here is important to get the
    518  * special cases right: for min and max Intel specifies that (-0,0),
    519  * (NaN, anything) and (anything, NaN) return the second argument.
    520  */
    521 #define FPU_MIN(size, a, b)                                     \
    522     (float ## size ## _lt(a, b, &env->sse_status) ? (a) : (b))
    523 #define FPU_MAX(size, a, b)                                     \
    524     (float ## size ## _lt(b, a, &env->sse_status) ? (a) : (b))
    525 
    526 SSE_HELPER_S(add, FPU_ADD)
    527 SSE_HELPER_S(sub, FPU_SUB)
    528 SSE_HELPER_S(mul, FPU_MUL)
    529 SSE_HELPER_S(div, FPU_DIV)
    530 SSE_HELPER_S(min, FPU_MIN)
    531 SSE_HELPER_S(max, FPU_MAX)
    532 
    533 void glue(helper_sqrtps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
    534 {
    535     int i;
    536     for (i = 0; i < 2 << SHIFT; i++) {
    537         d->ZMM_S(i) = float32_sqrt(s->ZMM_S(i), &env->sse_status);
    538     }
    539 }
    540 
    541 void glue(helper_sqrtpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
    542 {
    543     int i;
    544     for (i = 0; i < 1 << SHIFT; i++) {
    545         d->ZMM_D(i) = float64_sqrt(s->ZMM_D(i), &env->sse_status);
    546     }
    547 }
    548 
    549 #if SHIFT == 1
    550 void helper_sqrtss(CPUX86State *env, Reg *d, Reg *v, Reg *s)
    551 {
    552     int i;
    553     d->ZMM_S(0) = float32_sqrt(s->ZMM_S(0), &env->sse_status);
    554     for (i = 1; i < 2 << SHIFT; i++) {
    555         d->ZMM_L(i) = v->ZMM_L(i);
    556     }
    557 }
    558 
    559 void helper_sqrtsd(CPUX86State *env, Reg *d, Reg *v, Reg *s)
    560 {
    561     int i;
    562     d->ZMM_D(0) = float64_sqrt(s->ZMM_D(0), &env->sse_status);
    563     for (i = 1; i < 1 << SHIFT; i++) {
    564         d->ZMM_Q(i) = v->ZMM_Q(i);
    565     }
    566 }
    567 #endif
    568 
    569 /* float to float conversions */
    570 void glue(helper_cvtps2pd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
    571 {
    572     int i;
    573     for (i = 1 << SHIFT; --i >= 0; ) {
    574         d->ZMM_D(i) = float32_to_float64(s->ZMM_S(i), &env->sse_status);
    575     }
    576 }
    577 
    578 void glue(helper_cvtpd2ps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
    579 {
    580     int i;
    581     for (i = 0; i < 1 << SHIFT; i++) {
    582          d->ZMM_S(i) = float64_to_float32(s->ZMM_D(i), &env->sse_status);
    583     }
    584     for (i >>= 1; i < 1 << SHIFT; i++) {
    585          d->Q(i) = 0;
    586     }
    587 }
    588 
    589 #if SHIFT >= 1
    590 void glue(helper_cvtph2ps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
    591 {
    592     int i;
    593 
    594     for (i = 2 << SHIFT; --i >= 0; ) {
    595          d->ZMM_S(i) = float16_to_float32(s->ZMM_H(i), true, &env->sse_status);
    596     }
    597 }
    598 
    599 void glue(helper_cvtps2ph, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, int mode)
    600 {
    601     int i;
    602     FloatRoundMode prev_rounding_mode = env->sse_status.float_rounding_mode;
    603     if (!(mode & (1 << 2))) {
    604         set_x86_rounding_mode(mode & 3, &env->sse_status);
    605     }
    606 
    607     for (i = 0; i < 2 << SHIFT; i++) {
    608         d->ZMM_H(i) = float32_to_float16(s->ZMM_S(i), true, &env->sse_status);
    609     }
    610     for (i >>= 2; i < 1 << SHIFT; i++) {
    611         d->Q(i) = 0;
    612     }
    613 
    614     env->sse_status.float_rounding_mode = prev_rounding_mode;
    615 }
    616 #endif
    617 
    618 #if SHIFT == 1
    619 void helper_cvtss2sd(CPUX86State *env, Reg *d, Reg *v, Reg *s)
    620 {
    621     int i;
    622     d->ZMM_D(0) = float32_to_float64(s->ZMM_S(0), &env->sse_status);
    623     for (i = 1; i < 1 << SHIFT; i++) {
    624         d->ZMM_Q(i) = v->ZMM_Q(i);
    625     }
    626 }
    627 
    628 void helper_cvtsd2ss(CPUX86State *env, Reg *d, Reg *v, Reg *s)
    629 {
    630     int i;
    631     d->ZMM_S(0) = float64_to_float32(s->ZMM_D(0), &env->sse_status);
    632     for (i = 1; i < 2 << SHIFT; i++) {
    633         d->ZMM_L(i) = v->ZMM_L(i);
    634     }
    635 }
    636 #endif
    637 
    638 /* integer to float */
    639 void glue(helper_cvtdq2ps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
    640 {
    641     int i;
    642     for (i = 0; i < 2 << SHIFT; i++) {
    643         d->ZMM_S(i) = int32_to_float32(s->ZMM_L(i), &env->sse_status);
    644     }
    645 }
    646 
    647 void glue(helper_cvtdq2pd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
    648 {
    649     int i;
    650     for (i = 1 << SHIFT; --i >= 0; ) {
    651         int32_t l = s->ZMM_L(i);
    652         d->ZMM_D(i) = int32_to_float64(l, &env->sse_status);
    653     }
    654 }
    655 
    656 #if SHIFT == 1
    657 void helper_cvtpi2ps(CPUX86State *env, ZMMReg *d, MMXReg *s)
    658 {
    659     d->ZMM_S(0) = int32_to_float32(s->MMX_L(0), &env->sse_status);
    660     d->ZMM_S(1) = int32_to_float32(s->MMX_L(1), &env->sse_status);
    661 }
    662 
    663 void helper_cvtpi2pd(CPUX86State *env, ZMMReg *d, MMXReg *s)
    664 {
    665     d->ZMM_D(0) = int32_to_float64(s->MMX_L(0), &env->sse_status);
    666     d->ZMM_D(1) = int32_to_float64(s->MMX_L(1), &env->sse_status);
    667 }
    668 
    669 void helper_cvtsi2ss(CPUX86State *env, ZMMReg *d, uint32_t val)
    670 {
    671     d->ZMM_S(0) = int32_to_float32(val, &env->sse_status);
    672 }
    673 
    674 void helper_cvtsi2sd(CPUX86State *env, ZMMReg *d, uint32_t val)
    675 {
    676     d->ZMM_D(0) = int32_to_float64(val, &env->sse_status);
    677 }
    678 
    679 #ifdef TARGET_X86_64
    680 void helper_cvtsq2ss(CPUX86State *env, ZMMReg *d, uint64_t val)
    681 {
    682     d->ZMM_S(0) = int64_to_float32(val, &env->sse_status);
    683 }
    684 
    685 void helper_cvtsq2sd(CPUX86State *env, ZMMReg *d, uint64_t val)
    686 {
    687     d->ZMM_D(0) = int64_to_float64(val, &env->sse_status);
    688 }
    689 #endif
    690 
    691 #endif
    692 
    693 /* float to integer */
    694 
    695 #if SHIFT == 1
    696 /*
    697  * x86 mandates that we return the indefinite integer value for the result
    698  * of any float-to-integer conversion that raises the 'invalid' exception.
    699  * Wrap the softfloat functions to get this behaviour.
    700  */
    701 #define WRAP_FLOATCONV(RETTYPE, FN, FLOATTYPE, INDEFVALUE)              \
    702     static inline RETTYPE x86_##FN(FLOATTYPE a, float_status *s)        \
    703     {                                                                   \
    704         int oldflags, newflags;                                         \
    705         RETTYPE r;                                                      \
    706                                                                         \
    707         oldflags = get_float_exception_flags(s);                        \
    708         set_float_exception_flags(0, s);                                \
    709         r = FN(a, s);                                                   \
    710         newflags = get_float_exception_flags(s);                        \
    711         if (newflags & float_flag_invalid) {                            \
    712             r = INDEFVALUE;                                             \
    713         }                                                               \
    714         set_float_exception_flags(newflags | oldflags, s);              \
    715         return r;                                                       \
    716     }
    717 
    718 WRAP_FLOATCONV(int32_t, float32_to_int32, float32, INT32_MIN)
    719 WRAP_FLOATCONV(int32_t, float32_to_int32_round_to_zero, float32, INT32_MIN)
    720 WRAP_FLOATCONV(int32_t, float64_to_int32, float64, INT32_MIN)
    721 WRAP_FLOATCONV(int32_t, float64_to_int32_round_to_zero, float64, INT32_MIN)
    722 WRAP_FLOATCONV(int64_t, float32_to_int64, float32, INT64_MIN)
    723 WRAP_FLOATCONV(int64_t, float32_to_int64_round_to_zero, float32, INT64_MIN)
    724 WRAP_FLOATCONV(int64_t, float64_to_int64, float64, INT64_MIN)
    725 WRAP_FLOATCONV(int64_t, float64_to_int64_round_to_zero, float64, INT64_MIN)
    726 #endif
    727 
    728 void glue(helper_cvtps2dq, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s)
    729 {
    730     int i;
    731     for (i = 0; i < 2 << SHIFT; i++) {
    732         d->ZMM_L(i) = x86_float32_to_int32(s->ZMM_S(i), &env->sse_status);
    733     }
    734 }
    735 
    736 void glue(helper_cvtpd2dq, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s)
    737 {
    738     int i;
    739     for (i = 0; i < 1 << SHIFT; i++) {
    740         d->ZMM_L(i) = x86_float64_to_int32(s->ZMM_D(i), &env->sse_status);
    741     }
    742     for (i >>= 1; i < 1 << SHIFT; i++) {
    743          d->Q(i) = 0;
    744     }
    745 }
    746 
    747 #if SHIFT == 1
    748 void helper_cvtps2pi(CPUX86State *env, MMXReg *d, ZMMReg *s)
    749 {
    750     d->MMX_L(0) = x86_float32_to_int32(s->ZMM_S(0), &env->sse_status);
    751     d->MMX_L(1) = x86_float32_to_int32(s->ZMM_S(1), &env->sse_status);
    752 }
    753 
    754 void helper_cvtpd2pi(CPUX86State *env, MMXReg *d, ZMMReg *s)
    755 {
    756     d->MMX_L(0) = x86_float64_to_int32(s->ZMM_D(0), &env->sse_status);
    757     d->MMX_L(1) = x86_float64_to_int32(s->ZMM_D(1), &env->sse_status);
    758 }
    759 
    760 int32_t helper_cvtss2si(CPUX86State *env, ZMMReg *s)
    761 {
    762     return x86_float32_to_int32(s->ZMM_S(0), &env->sse_status);
    763 }
    764 
    765 int32_t helper_cvtsd2si(CPUX86State *env, ZMMReg *s)
    766 {
    767     return x86_float64_to_int32(s->ZMM_D(0), &env->sse_status);
    768 }
    769 
    770 #ifdef TARGET_X86_64
    771 int64_t helper_cvtss2sq(CPUX86State *env, ZMMReg *s)
    772 {
    773     return x86_float32_to_int64(s->ZMM_S(0), &env->sse_status);
    774 }
    775 
    776 int64_t helper_cvtsd2sq(CPUX86State *env, ZMMReg *s)
    777 {
    778     return x86_float64_to_int64(s->ZMM_D(0), &env->sse_status);
    779 }
    780 #endif
    781 #endif
    782 
    783 /* float to integer truncated */
    784 void glue(helper_cvttps2dq, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s)
    785 {
    786     int i;
    787     for (i = 0; i < 2 << SHIFT; i++) {
    788         d->ZMM_L(i) = x86_float32_to_int32_round_to_zero(s->ZMM_S(i),
    789                                                          &env->sse_status);
    790     }
    791 }
    792 
    793 void glue(helper_cvttpd2dq, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s)
    794 {
    795     int i;
    796     for (i = 0; i < 1 << SHIFT; i++) {
    797         d->ZMM_L(i) = x86_float64_to_int32_round_to_zero(s->ZMM_D(i),
    798                                                          &env->sse_status);
    799     }
    800     for (i >>= 1; i < 1 << SHIFT; i++) {
    801          d->Q(i) = 0;
    802     }
    803 }
    804 
    805 #if SHIFT == 1
    806 void helper_cvttps2pi(CPUX86State *env, MMXReg *d, ZMMReg *s)
    807 {
    808     d->MMX_L(0) = x86_float32_to_int32_round_to_zero(s->ZMM_S(0), &env->sse_status);
    809     d->MMX_L(1) = x86_float32_to_int32_round_to_zero(s->ZMM_S(1), &env->sse_status);
    810 }
    811 
    812 void helper_cvttpd2pi(CPUX86State *env, MMXReg *d, ZMMReg *s)
    813 {
    814     d->MMX_L(0) = x86_float64_to_int32_round_to_zero(s->ZMM_D(0), &env->sse_status);
    815     d->MMX_L(1) = x86_float64_to_int32_round_to_zero(s->ZMM_D(1), &env->sse_status);
    816 }
    817 
    818 int32_t helper_cvttss2si(CPUX86State *env, ZMMReg *s)
    819 {
    820     return x86_float32_to_int32_round_to_zero(s->ZMM_S(0), &env->sse_status);
    821 }
    822 
    823 int32_t helper_cvttsd2si(CPUX86State *env, ZMMReg *s)
    824 {
    825     return x86_float64_to_int32_round_to_zero(s->ZMM_D(0), &env->sse_status);
    826 }
    827 
    828 #ifdef TARGET_X86_64
    829 int64_t helper_cvttss2sq(CPUX86State *env, ZMMReg *s)
    830 {
    831     return x86_float32_to_int64_round_to_zero(s->ZMM_S(0), &env->sse_status);
    832 }
    833 
    834 int64_t helper_cvttsd2sq(CPUX86State *env, ZMMReg *s)
    835 {
    836     return x86_float64_to_int64_round_to_zero(s->ZMM_D(0), &env->sse_status);
    837 }
    838 #endif
    839 #endif
    840 
    841 void glue(helper_rsqrtps, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s)
    842 {
    843     uint8_t old_flags = get_float_exception_flags(&env->sse_status);
    844     int i;
    845     for (i = 0; i < 2 << SHIFT; i++) {
    846         d->ZMM_S(i) = float32_div(float32_one,
    847                                   float32_sqrt(s->ZMM_S(i), &env->sse_status),
    848                                   &env->sse_status);
    849     }
    850     set_float_exception_flags(old_flags, &env->sse_status);
    851 }
    852 
    853 #if SHIFT == 1
    854 void helper_rsqrtss(CPUX86State *env, ZMMReg *d, ZMMReg *v, ZMMReg *s)
    855 {
    856     uint8_t old_flags = get_float_exception_flags(&env->sse_status);
    857     int i;
    858     d->ZMM_S(0) = float32_div(float32_one,
    859                               float32_sqrt(s->ZMM_S(0), &env->sse_status),
    860                               &env->sse_status);
    861     set_float_exception_flags(old_flags, &env->sse_status);
    862     for (i = 1; i < 2 << SHIFT; i++) {
    863         d->ZMM_L(i) = v->ZMM_L(i);
    864     }
    865 }
    866 #endif
    867 
    868 void glue(helper_rcpps, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s)
    869 {
    870     uint8_t old_flags = get_float_exception_flags(&env->sse_status);
    871     int i;
    872     for (i = 0; i < 2 << SHIFT; i++) {
    873         d->ZMM_S(i) = float32_div(float32_one, s->ZMM_S(i), &env->sse_status);
    874     }
    875     set_float_exception_flags(old_flags, &env->sse_status);
    876 }
    877 
    878 #if SHIFT == 1
    879 void helper_rcpss(CPUX86State *env, ZMMReg *d, ZMMReg *v, ZMMReg *s)
    880 {
    881     uint8_t old_flags = get_float_exception_flags(&env->sse_status);
    882     int i;
    883     d->ZMM_S(0) = float32_div(float32_one, s->ZMM_S(0), &env->sse_status);
    884     for (i = 1; i < 2 << SHIFT; i++) {
    885         d->ZMM_L(i) = v->ZMM_L(i);
    886     }
    887     set_float_exception_flags(old_flags, &env->sse_status);
    888 }
    889 #endif
    890 
    891 #if SHIFT == 1
    892 static inline uint64_t helper_extrq(uint64_t src, int shift, int len)
    893 {
    894     uint64_t mask;
    895 
    896     if (len == 0) {
    897         mask = ~0LL;
    898     } else {
    899         mask = (1ULL << len) - 1;
    900     }
    901     return (src >> shift) & mask;
    902 }
    903 
    904 void helper_extrq_r(CPUX86State *env, ZMMReg *d, ZMMReg *s)
    905 {
    906     d->ZMM_Q(0) = helper_extrq(d->ZMM_Q(0), s->ZMM_B(1) & 63, s->ZMM_B(0) & 63);
    907 }
    908 
    909 void helper_extrq_i(CPUX86State *env, ZMMReg *d, int index, int length)
    910 {
    911     d->ZMM_Q(0) = helper_extrq(d->ZMM_Q(0), index, length);
    912 }
    913 
    914 static inline uint64_t helper_insertq(uint64_t dest, uint64_t src, int shift, int len)
    915 {
    916     uint64_t mask;
    917 
    918     if (len == 0) {
    919         mask = ~0ULL;
    920     } else {
    921         mask = (1ULL << len) - 1;
    922     }
    923     return (dest & ~(mask << shift)) | ((src & mask) << shift);
    924 }
    925 
    926 void helper_insertq_r(CPUX86State *env, ZMMReg *d, ZMMReg *s)
    927 {
    928     d->ZMM_Q(0) = helper_insertq(d->ZMM_Q(0), s->ZMM_Q(0), s->ZMM_B(9) & 63, s->ZMM_B(8) & 63);
    929 }
    930 
    931 void helper_insertq_i(CPUX86State *env, ZMMReg *d, ZMMReg *s, int index, int length)
    932 {
    933     d->ZMM_Q(0) = helper_insertq(d->ZMM_Q(0), s->ZMM_Q(0), index, length);
    934 }
    935 #endif
    936 
    937 #define SSE_HELPER_HPS(name, F)  \
    938 void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) \
    939 {                                                                 \
    940     float32 r[2 << SHIFT];                                        \
    941     int i, j, k;                                                  \
    942     for (k = 0; k < 2 << SHIFT; k += LANE_WIDTH / 4) {            \
    943         for (i = j = 0; j < 4; i++, j += 2) {                     \
    944             r[i + k] = F(v->ZMM_S(j + k), v->ZMM_S(j + k + 1), &env->sse_status); \
    945         }                                                         \
    946         for (j = 0; j < 4; i++, j += 2) {                         \
    947             r[i + k] = F(s->ZMM_S(j + k), s->ZMM_S(j + k + 1), &env->sse_status); \
    948         }                                                         \
    949     }                                                             \
    950     for (i = 0; i < 2 << SHIFT; i++) {                            \
    951         d->ZMM_S(i) = r[i];                                       \
    952     }                                                             \
    953 }
    954 
    955 SSE_HELPER_HPS(haddps, float32_add)
    956 SSE_HELPER_HPS(hsubps, float32_sub)
    957 
    958 #define SSE_HELPER_HPD(name, F)  \
    959 void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) \
    960 {                                                                 \
    961     float64 r[1 << SHIFT];                                        \
    962     int i, j, k;                                                  \
    963     for (k = 0; k < 1 << SHIFT; k += LANE_WIDTH / 8) {            \
    964         for (i = j = 0; j < 2; i++, j += 2) {                     \
    965             r[i + k] = F(v->ZMM_D(j + k), v->ZMM_D(j + k + 1), &env->sse_status); \
    966         }                                                         \
    967         for (j = 0; j < 2; i++, j += 2) {                         \
    968             r[i + k] = F(s->ZMM_D(j + k), s->ZMM_D(j + k + 1), &env->sse_status); \
    969         }                                                         \
    970     }                                                             \
    971     for (i = 0; i < 1 << SHIFT; i++) {                            \
    972         d->ZMM_D(i) = r[i];                                       \
    973     }                                                             \
    974 }
    975 
    976 SSE_HELPER_HPD(haddpd, float64_add)
    977 SSE_HELPER_HPD(hsubpd, float64_sub)
    978 
    979 void glue(helper_addsubps, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
    980 {
    981     int i;
    982     for (i = 0; i < 2 << SHIFT; i += 2) {
    983         d->ZMM_S(i) = float32_sub(v->ZMM_S(i), s->ZMM_S(i), &env->sse_status);
    984         d->ZMM_S(i+1) = float32_add(v->ZMM_S(i+1), s->ZMM_S(i+1), &env->sse_status);
    985     }
    986 }
    987 
    988 void glue(helper_addsubpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
    989 {
    990     int i;
    991     for (i = 0; i < 1 << SHIFT; i += 2) {
    992         d->ZMM_D(i) = float64_sub(v->ZMM_D(i), s->ZMM_D(i), &env->sse_status);
    993         d->ZMM_D(i+1) = float64_add(v->ZMM_D(i+1), s->ZMM_D(i+1), &env->sse_status);
    994     }
    995 }
    996 
    997 #define SSE_HELPER_CMP_P(name, F, C)                                    \
    998     void glue(helper_ ## name ## ps, SUFFIX)(CPUX86State *env,          \
    999                                              Reg *d, Reg *v, Reg *s)    \
   1000     {                                                                   \
   1001         int i;                                                          \
   1002         for (i = 0; i < 2 << SHIFT; i++) {                              \
   1003             d->ZMM_L(i) = C(F(32, v->ZMM_S(i), s->ZMM_S(i))) ? -1 : 0;  \
   1004         }                                                               \
   1005     }                                                                   \
   1006                                                                         \
   1007     void glue(helper_ ## name ## pd, SUFFIX)(CPUX86State *env,          \
   1008                                              Reg *d, Reg *v, Reg *s)    \
   1009     {                                                                   \
   1010         int i;                                                          \
   1011         for (i = 0; i < 1 << SHIFT; i++) {                              \
   1012             d->ZMM_Q(i) = C(F(64, v->ZMM_D(i), s->ZMM_D(i))) ? -1 : 0;  \
   1013         }                                                               \
   1014     }
   1015 
   1016 #if SHIFT == 1
   1017 #define SSE_HELPER_CMP(name, F, C)                                          \
   1018     SSE_HELPER_CMP_P(name, F, C)                                            \
   1019     void helper_ ## name ## ss(CPUX86State *env, Reg *d, Reg *v, Reg *s)    \
   1020     {                                                                       \
   1021         int i;                                                              \
   1022         d->ZMM_L(0) = C(F(32, v->ZMM_S(0), s->ZMM_S(0))) ? -1 : 0;          \
   1023         for (i = 1; i < 2 << SHIFT; i++) {                                  \
   1024             d->ZMM_L(i) = v->ZMM_L(i);                                      \
   1025         }                                                                   \
   1026     }                                                                       \
   1027                                                                             \
   1028     void helper_ ## name ## sd(CPUX86State *env, Reg *d, Reg *v, Reg *s)    \
   1029     {                                                                       \
   1030         int i;                                                              \
   1031         d->ZMM_Q(0) = C(F(64, v->ZMM_D(0), s->ZMM_D(0))) ? -1 : 0;          \
   1032         for (i = 1; i < 1 << SHIFT; i++) {                                  \
   1033             d->ZMM_Q(i) = v->ZMM_Q(i);                                      \
   1034         }                                                                   \
   1035     }
   1036 
   1037 static inline bool FPU_EQU(FloatRelation x)
   1038 {
   1039     return (x == float_relation_equal || x == float_relation_unordered);
   1040 }
   1041 static inline bool FPU_GE(FloatRelation x)
   1042 {
   1043     return (x == float_relation_equal || x == float_relation_greater);
   1044 }
   1045 #define FPU_EQ(x) (x == float_relation_equal)
   1046 #define FPU_LT(x) (x == float_relation_less)
   1047 #define FPU_LE(x) (x <= float_relation_equal)
   1048 #define FPU_GT(x) (x == float_relation_greater)
   1049 #define FPU_UNORD(x) (x == float_relation_unordered)
   1050 /* We must make sure we evaluate the argument in case it is a signalling NAN */
   1051 #define FPU_FALSE(x) (x == float_relation_equal && 0)
   1052 
   1053 #define FPU_CMPQ(size, a, b) \
   1054     float ## size ## _compare_quiet(a, b, &env->sse_status)
   1055 #define FPU_CMPS(size, a, b) \
   1056     float ## size ## _compare(a, b, &env->sse_status)
   1057 
   1058 #else
/* In this branch SSE_HELPER_CMP expands to the packed helpers only. */
#define SSE_HELPER_CMP(name, F, C) SSE_HELPER_CMP_P(name, F, C)
   1060 #endif
   1061 
/*
 * Instantiate the comparison helpers.  The first argument names the
 * helper, the second selects the comparison primitive (FPU_CMPQ calls
 * float*_compare_quiet, FPU_CMPS calls float*_compare) and the third is
 * the predicate applied to the resulting relation; a leading '!' negates
 * the predicate.
 */
SSE_HELPER_CMP(cmpeq, FPU_CMPQ, FPU_EQ)
SSE_HELPER_CMP(cmplt, FPU_CMPS, FPU_LT)
SSE_HELPER_CMP(cmple, FPU_CMPS, FPU_LE)
SSE_HELPER_CMP(cmpunord, FPU_CMPQ,  FPU_UNORD)
SSE_HELPER_CMP(cmpneq, FPU_CMPQ, !FPU_EQ)
SSE_HELPER_CMP(cmpnlt, FPU_CMPS, !FPU_LT)
SSE_HELPER_CMP(cmpnle, FPU_CMPS, !FPU_LE)
SSE_HELPER_CMP(cmpord, FPU_CMPQ, !FPU_UNORD)

SSE_HELPER_CMP(cmpequ, FPU_CMPQ, FPU_EQU)
SSE_HELPER_CMP(cmpnge, FPU_CMPS, !FPU_GE)
SSE_HELPER_CMP(cmpngt, FPU_CMPS, !FPU_GT)
SSE_HELPER_CMP(cmpfalse, FPU_CMPQ,  FPU_FALSE)
SSE_HELPER_CMP(cmpnequ, FPU_CMPQ, !FPU_EQU)
SSE_HELPER_CMP(cmpge, FPU_CMPS, FPU_GE)
SSE_HELPER_CMP(cmpgt, FPU_CMPS, FPU_GT)
SSE_HELPER_CMP(cmptrue, FPU_CMPQ,  !FPU_FALSE)

/* Same predicates as the first two groups, with the other primitive. */
SSE_HELPER_CMP(cmpeqs, FPU_CMPS, FPU_EQ)
SSE_HELPER_CMP(cmpltq, FPU_CMPQ, FPU_LT)
SSE_HELPER_CMP(cmpleq, FPU_CMPQ, FPU_LE)
SSE_HELPER_CMP(cmpunords, FPU_CMPS,  FPU_UNORD)
SSE_HELPER_CMP(cmpneqq, FPU_CMPS, !FPU_EQ)
SSE_HELPER_CMP(cmpnltq, FPU_CMPQ, !FPU_LT)
SSE_HELPER_CMP(cmpnleq, FPU_CMPQ, !FPU_LE)
SSE_HELPER_CMP(cmpords, FPU_CMPS, !FPU_UNORD)

SSE_HELPER_CMP(cmpequs, FPU_CMPS, FPU_EQU)
SSE_HELPER_CMP(cmpngeq, FPU_CMPQ, !FPU_GE)
SSE_HELPER_CMP(cmpngtq, FPU_CMPQ, !FPU_GT)
SSE_HELPER_CMP(cmpfalses, FPU_CMPS,  FPU_FALSE)
SSE_HELPER_CMP(cmpnequs, FPU_CMPS, !FPU_EQU)
SSE_HELPER_CMP(cmpgeq, FPU_CMPQ, FPU_GE)
SSE_HELPER_CMP(cmpgtq, FPU_CMPQ, FPU_GT)
SSE_HELPER_CMP(cmptrues, FPU_CMPS,  !FPU_FALSE)

#undef SSE_HELPER_CMP
   1099 
   1100 #if SHIFT == 1
/*
 * Flag values stored into CC_SRC by the (u)comis* helpers, indexed by the
 * comparison result plus one.
 * NOTE(review): the entries presumably correspond to less, equal, greater
 * and unordered in that order - confirm against the FloatRelation
 * definition.
 */
static const int comis_eflags[4] = {CC_C, CC_Z, 0, CC_Z | CC_P | CC_C};
   1102 
   1103 void helper_ucomiss(CPUX86State *env, Reg *d, Reg *s)
   1104 {
   1105     FloatRelation ret;
   1106     float32 s0, s1;
   1107 
   1108     s0 = d->ZMM_S(0);
   1109     s1 = s->ZMM_S(0);
   1110     ret = float32_compare_quiet(s0, s1, &env->sse_status);
   1111     CC_SRC = comis_eflags[ret + 1];
   1112 }
   1113 
   1114 void helper_comiss(CPUX86State *env, Reg *d, Reg *s)
   1115 {
   1116     FloatRelation ret;
   1117     float32 s0, s1;
   1118 
   1119     s0 = d->ZMM_S(0);
   1120     s1 = s->ZMM_S(0);
   1121     ret = float32_compare(s0, s1, &env->sse_status);
   1122     CC_SRC = comis_eflags[ret + 1];
   1123 }
   1124 
   1125 void helper_ucomisd(CPUX86State *env, Reg *d, Reg *s)
   1126 {
   1127     FloatRelation ret;
   1128     float64 d0, d1;
   1129 
   1130     d0 = d->ZMM_D(0);
   1131     d1 = s->ZMM_D(0);
   1132     ret = float64_compare_quiet(d0, d1, &env->sse_status);
   1133     CC_SRC = comis_eflags[ret + 1];
   1134 }
   1135 
   1136 void helper_comisd(CPUX86State *env, Reg *d, Reg *s)
   1137 {
   1138     FloatRelation ret;
   1139     float64 d0, d1;
   1140 
   1141     d0 = d->ZMM_D(0);
   1142     d1 = s->ZMM_D(0);
   1143     ret = float64_compare(d0, d1, &env->sse_status);
   1144     CC_SRC = comis_eflags[ret + 1];
   1145 }
   1146 #endif
   1147 
   1148 uint32_t glue(helper_movmskps, SUFFIX)(CPUX86State *env, Reg *s)
   1149 {
   1150     uint32_t mask;
   1151     int i;
   1152 
   1153     mask = 0;
   1154     for (i = 0; i < 2 << SHIFT; i++) {
   1155         mask |= (s->ZMM_L(i) >> (31 - i)) & (1 << i);
   1156     }
   1157     return mask;
   1158 }
   1159 
   1160 uint32_t glue(helper_movmskpd, SUFFIX)(CPUX86State *env, Reg *s)
   1161 {
   1162     uint32_t mask;
   1163     int i;
   1164 
   1165     mask = 0;
   1166     for (i = 0; i < 1 << SHIFT; i++) {
   1167         mask |= (s->ZMM_Q(i) >> (63 - i)) & (1 << i);
   1168     }
   1169     return mask;
   1170 }
   1171 
   1172 #endif
   1173 
   1174 #define PACK_HELPER_B(name, F) \
   1175 void glue(helper_pack ## name, SUFFIX)(CPUX86State *env,      \
   1176         Reg *d, Reg *v, Reg *s)                               \
   1177 {                                                             \
   1178     uint8_t r[PACK_WIDTH * 2];                                \
   1179     int j, k;                                                 \
   1180     for (j = 0; j < 4 << SHIFT; j += PACK_WIDTH) {            \
   1181         for (k = 0; k < PACK_WIDTH; k++) {                    \
   1182             r[k] = F((int16_t)v->W(j + k));                   \
   1183         }                                                     \
   1184         for (k = 0; k < PACK_WIDTH; k++) {                    \
   1185             r[PACK_WIDTH + k] = F((int16_t)s->W(j + k));      \
   1186         }                                                     \
   1187         for (k = 0; k < PACK_WIDTH * 2; k++) {                \
   1188             d->B(2 * j + k) = r[k];                           \
   1189         }                                                     \
   1190     }                                                         \
   1191 }
   1192 
   1193 PACK_HELPER_B(sswb, satsb)
   1194 PACK_HELPER_B(uswb, satub)
   1195 
   1196 void glue(helper_packssdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
   1197 {
   1198     uint16_t r[PACK_WIDTH];
   1199     int j, k;
   1200 
   1201     for (j = 0; j < 2 << SHIFT; j += PACK_WIDTH / 2) {
   1202         for (k = 0; k < PACK_WIDTH / 2; k++) {
   1203             r[k] = satsw(v->L(j + k));
   1204         }
   1205         for (k = 0; k < PACK_WIDTH / 2; k++) {
   1206             r[PACK_WIDTH / 2 + k] = satsw(s->L(j + k));
   1207         }
   1208         for (k = 0; k < PACK_WIDTH; k++) {
   1209             d->W(2 * j + k) = r[k];
   1210         }
   1211     }
   1212 }
   1213 
   1214 #define UNPCK_OP(base_name, base)                                       \
   1215                                                                         \
   1216     void glue(helper_punpck ## base_name ## bw, SUFFIX)(CPUX86State *env,\
   1217                                                 Reg *d, Reg *v, Reg *s) \
   1218     {                                                                   \
   1219         uint8_t r[PACK_WIDTH * 2];                                      \
   1220         int j, i;                                                       \
   1221                                                                         \
   1222         for (j = 0; j < 8 << SHIFT; ) {                                 \
   1223             int k = j + base * PACK_WIDTH;                              \
   1224             for (i = 0; i < PACK_WIDTH; i++) {                          \
   1225                 r[2 * i] = v->B(k + i);                                 \
   1226                 r[2 * i + 1] = s->B(k + i);                             \
   1227             }                                                           \
   1228             for (i = 0; i < PACK_WIDTH * 2; i++, j++) {                 \
   1229                 d->B(j) = r[i];                                         \
   1230             }                                                           \
   1231         }                                                               \
   1232     }                                                                   \
   1233                                                                         \
   1234     void glue(helper_punpck ## base_name ## wd, SUFFIX)(CPUX86State *env,\
   1235                                                 Reg *d, Reg *v, Reg *s) \
   1236     {                                                                   \
   1237         uint16_t r[PACK_WIDTH];                                         \
   1238         int j, i;                                                       \
   1239                                                                         \
   1240         for (j = 0; j < 4 << SHIFT; ) {                                 \
   1241             int k = j + base * PACK_WIDTH / 2;                          \
   1242             for (i = 0; i < PACK_WIDTH / 2; i++) {                      \
   1243                 r[2 * i] = v->W(k + i);                                 \
   1244                 r[2 * i + 1] = s->W(k + i);                             \
   1245             }                                                           \
   1246             for (i = 0; i < PACK_WIDTH; i++, j++) {                     \
   1247                 d->W(j) = r[i];                                         \
   1248             }                                                           \
   1249         }                                                               \
   1250     }                                                                   \
   1251                                                                         \
   1252     void glue(helper_punpck ## base_name ## dq, SUFFIX)(CPUX86State *env,\
   1253                                                 Reg *d, Reg *v, Reg *s) \
   1254     {                                                                   \
   1255         uint32_t r[PACK_WIDTH / 2];                                     \
   1256         int j, i;                                                       \
   1257                                                                         \
   1258         for (j = 0; j < 2 << SHIFT; ) {                                 \
   1259             int k = j + base * PACK_WIDTH / 4;                          \
   1260             for (i = 0; i < PACK_WIDTH / 4; i++) {                      \
   1261                 r[2 * i] = v->L(k + i);                                 \
   1262                 r[2 * i + 1] = s->L(k + i);                             \
   1263             }                                                           \
   1264             for (i = 0; i < PACK_WIDTH / 2; i++, j++) {                 \
   1265                 d->L(j) = r[i];                                         \
   1266             }                                                           \
   1267         }                                                               \
   1268     }                                                                   \
   1269                                                                         \
   1270     XMM_ONLY(                                                           \
   1271              void glue(helper_punpck ## base_name ## qdq, SUFFIX)(      \
   1272                         CPUX86State *env, Reg *d, Reg *v, Reg *s)       \
   1273              {                                                          \
   1274                  uint64_t r[2];                                         \
   1275                  int i;                                                 \
   1276                                                                         \
   1277                  for (i = 0; i < 1 << SHIFT; i += 2) {                  \
   1278                      r[0] = v->Q(base + i);                             \
   1279                      r[1] = s->Q(base + i);                             \
   1280                      d->Q(i) = r[0];                                    \
   1281                      d->Q(i + 1) = r[1];                                \
   1282                  }                                                      \
   1283              }                                                          \
   1284                                                                         )
   1285 
   1286 UNPCK_OP(l, 0)
   1287 UNPCK_OP(h, 1)
   1288 
   1289 #undef PACK_WIDTH
   1290 #undef PACK_HELPER_B
   1291 #undef UNPCK_OP
   1292 
   1293 
   1294 /* 3DNow! float ops */
   1295 #if SHIFT == 0
   1296 void helper_pi2fd(CPUX86State *env, MMXReg *d, MMXReg *s)
   1297 {
   1298     d->MMX_S(0) = int32_to_float32(s->MMX_L(0), &env->mmx_status);
   1299     d->MMX_S(1) = int32_to_float32(s->MMX_L(1), &env->mmx_status);
   1300 }
   1301 
   1302 void helper_pi2fw(CPUX86State *env, MMXReg *d, MMXReg *s)
   1303 {
   1304     d->MMX_S(0) = int32_to_float32((int16_t)s->MMX_W(0), &env->mmx_status);
   1305     d->MMX_S(1) = int32_to_float32((int16_t)s->MMX_W(2), &env->mmx_status);
   1306 }
   1307 
   1308 void helper_pf2id(CPUX86State *env, MMXReg *d, MMXReg *s)
   1309 {
   1310     d->MMX_L(0) = float32_to_int32_round_to_zero(s->MMX_S(0), &env->mmx_status);
   1311     d->MMX_L(1) = float32_to_int32_round_to_zero(s->MMX_S(1), &env->mmx_status);
   1312 }
   1313 
   1314 void helper_pf2iw(CPUX86State *env, MMXReg *d, MMXReg *s)
   1315 {
   1316     d->MMX_L(0) = satsw(float32_to_int32_round_to_zero(s->MMX_S(0),
   1317                                                        &env->mmx_status));
   1318     d->MMX_L(1) = satsw(float32_to_int32_round_to_zero(s->MMX_S(1),
   1319                                                        &env->mmx_status));
   1320 }
   1321 
   1322 void helper_pfacc(CPUX86State *env, MMXReg *d, MMXReg *s)
   1323 {
   1324     float32 r;
   1325 
   1326     r = float32_add(d->MMX_S(0), d->MMX_S(1), &env->mmx_status);
   1327     d->MMX_S(1) = float32_add(s->MMX_S(0), s->MMX_S(1), &env->mmx_status);
   1328     d->MMX_S(0) = r;
   1329 }
   1330 
   1331 void helper_pfadd(CPUX86State *env, MMXReg *d, MMXReg *s)
   1332 {
   1333     d->MMX_S(0) = float32_add(d->MMX_S(0), s->MMX_S(0), &env->mmx_status);
   1334     d->MMX_S(1) = float32_add(d->MMX_S(1), s->MMX_S(1), &env->mmx_status);
   1335 }
   1336 
   1337 void helper_pfcmpeq(CPUX86State *env, MMXReg *d, MMXReg *s)
   1338 {
   1339     d->MMX_L(0) = float32_eq_quiet(d->MMX_S(0), s->MMX_S(0),
   1340                                    &env->mmx_status) ? -1 : 0;
   1341     d->MMX_L(1) = float32_eq_quiet(d->MMX_S(1), s->MMX_S(1),
   1342                                    &env->mmx_status) ? -1 : 0;
   1343 }
   1344 
   1345 void helper_pfcmpge(CPUX86State *env, MMXReg *d, MMXReg *s)
   1346 {
   1347     d->MMX_L(0) = float32_le(s->MMX_S(0), d->MMX_S(0),
   1348                              &env->mmx_status) ? -1 : 0;
   1349     d->MMX_L(1) = float32_le(s->MMX_S(1), d->MMX_S(1),
   1350                              &env->mmx_status) ? -1 : 0;
   1351 }
   1352 
   1353 void helper_pfcmpgt(CPUX86State *env, MMXReg *d, MMXReg *s)
   1354 {
   1355     d->MMX_L(0) = float32_lt(s->MMX_S(0), d->MMX_S(0),
   1356                              &env->mmx_status) ? -1 : 0;
   1357     d->MMX_L(1) = float32_lt(s->MMX_S(1), d->MMX_S(1),
   1358                              &env->mmx_status) ? -1 : 0;
   1359 }
   1360 
   1361 void helper_pfmax(CPUX86State *env, MMXReg *d, MMXReg *s)
   1362 {
   1363     if (float32_lt(d->MMX_S(0), s->MMX_S(0), &env->mmx_status)) {
   1364         d->MMX_S(0) = s->MMX_S(0);
   1365     }
   1366     if (float32_lt(d->MMX_S(1), s->MMX_S(1), &env->mmx_status)) {
   1367         d->MMX_S(1) = s->MMX_S(1);
   1368     }
   1369 }
   1370 
   1371 void helper_pfmin(CPUX86State *env, MMXReg *d, MMXReg *s)
   1372 {
   1373     if (float32_lt(s->MMX_S(0), d->MMX_S(0), &env->mmx_status)) {
   1374         d->MMX_S(0) = s->MMX_S(0);
   1375     }
   1376     if (float32_lt(s->MMX_S(1), d->MMX_S(1), &env->mmx_status)) {
   1377         d->MMX_S(1) = s->MMX_S(1);
   1378     }
   1379 }
   1380 
   1381 void helper_pfmul(CPUX86State *env, MMXReg *d, MMXReg *s)
   1382 {
   1383     d->MMX_S(0) = float32_mul(d->MMX_S(0), s->MMX_S(0), &env->mmx_status);
   1384     d->MMX_S(1) = float32_mul(d->MMX_S(1), s->MMX_S(1), &env->mmx_status);
   1385 }
   1386 
   1387 void helper_pfnacc(CPUX86State *env, MMXReg *d, MMXReg *s)
   1388 {
   1389     float32 r;
   1390 
   1391     r = float32_sub(d->MMX_S(0), d->MMX_S(1), &env->mmx_status);
   1392     d->MMX_S(1) = float32_sub(s->MMX_S(0), s->MMX_S(1), &env->mmx_status);
   1393     d->MMX_S(0) = r;
   1394 }
   1395 
   1396 void helper_pfpnacc(CPUX86State *env, MMXReg *d, MMXReg *s)
   1397 {
   1398     float32 r;
   1399 
   1400     r = float32_sub(d->MMX_S(0), d->MMX_S(1), &env->mmx_status);
   1401     d->MMX_S(1) = float32_add(s->MMX_S(0), s->MMX_S(1), &env->mmx_status);
   1402     d->MMX_S(0) = r;
   1403 }
   1404 
   1405 void helper_pfrcp(CPUX86State *env, MMXReg *d, MMXReg *s)
   1406 {
   1407     d->MMX_S(0) = float32_div(float32_one, s->MMX_S(0), &env->mmx_status);
   1408     d->MMX_S(1) = d->MMX_S(0);
   1409 }
   1410 
   1411 void helper_pfrsqrt(CPUX86State *env, MMXReg *d, MMXReg *s)
   1412 {
   1413     d->MMX_L(1) = s->MMX_L(0) & 0x7fffffff;
   1414     d->MMX_S(1) = float32_div(float32_one,
   1415                               float32_sqrt(d->MMX_S(1), &env->mmx_status),
   1416                               &env->mmx_status);
   1417     d->MMX_L(1) |= s->MMX_L(0) & 0x80000000;
   1418     d->MMX_L(0) = d->MMX_L(1);
   1419 }
   1420 
   1421 void helper_pfsub(CPUX86State *env, MMXReg *d, MMXReg *s)
   1422 {
   1423     d->MMX_S(0) = float32_sub(d->MMX_S(0), s->MMX_S(0), &env->mmx_status);
   1424     d->MMX_S(1) = float32_sub(d->MMX_S(1), s->MMX_S(1), &env->mmx_status);
   1425 }
   1426 
   1427 void helper_pfsubr(CPUX86State *env, MMXReg *d, MMXReg *s)
   1428 {
   1429     d->MMX_S(0) = float32_sub(s->MMX_S(0), d->MMX_S(0), &env->mmx_status);
   1430     d->MMX_S(1) = float32_sub(s->MMX_S(1), d->MMX_S(1), &env->mmx_status);
   1431 }
   1432 
   1433 void helper_pswapd(CPUX86State *env, MMXReg *d, MMXReg *s)
   1434 {
   1435     uint32_t r;
   1436 
   1437     r = s->MMX_L(0);
   1438     d->MMX_L(0) = s->MMX_L(1);
   1439     d->MMX_L(1) = r;
   1440 }
   1441 #endif
   1442 
   1443 /* SSSE3 op helpers */
   1444 void glue(helper_pshufb, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
   1445 {
   1446     int i;
   1447 #if SHIFT == 0
   1448     uint8_t r[8];
   1449 
   1450     for (i = 0; i < 8; i++) {
   1451         r[i] = (s->B(i) & 0x80) ? 0 : (v->B(s->B(i) & 7));
   1452     }
   1453     for (i = 0; i < 8; i++) {
   1454         d->B(i) = r[i];
   1455     }
   1456 #else
   1457     uint8_t r[8 << SHIFT];
   1458 
   1459     for (i = 0; i < 8 << SHIFT; i++) {
   1460         int j = i & ~0xf;
   1461         r[i] = (s->B(i) & 0x80) ? 0 : v->B(j | (s->B(i) & 0xf));
   1462     }
   1463     for (i = 0; i < 8 << SHIFT; i++) {
   1464         d->B(i) = r[i];
   1465     }
   1466 #endif
   1467 }
   1468 
   1469 #define SSE_HELPER_HW(name, F)  \
   1470 void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) \
   1471 {                                                          \
   1472     uint16_t r[4 << SHIFT];                                \
   1473     int i, j, k;                                           \
   1474     for (k = 0; k < 4 << SHIFT; k += LANE_WIDTH / 2) {     \
   1475         for (i = j = 0; j < LANE_WIDTH / 2; i++, j += 2) { \
   1476             r[i + k] = F(v->W(j + k), v->W(j + k + 1));    \
   1477         }                                                  \
   1478         for (j = 0; j < LANE_WIDTH / 2; i++, j += 2) {     \
   1479             r[i + k] = F(s->W(j + k), s->W(j + k + 1));    \
   1480         }                                                  \
   1481     }                                                      \
   1482     for (i = 0; i < 4 << SHIFT; i++) {                     \
   1483         d->W(i) = r[i];                                    \
   1484     }                                                      \
   1485 }
   1486 
   1487 #define SSE_HELPER_HL(name, F)  \
   1488 void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) \
   1489 {                                                          \
   1490     uint32_t r[2 << SHIFT];                                \
   1491     int i, j, k;                                           \
   1492     for (k = 0; k < 2 << SHIFT; k += LANE_WIDTH / 4) {     \
   1493         for (i = j = 0; j < LANE_WIDTH / 4; i++, j += 2) { \
   1494             r[i + k] = F(v->L(j + k), v->L(j + k + 1));    \
   1495         }                                                  \
   1496         for (j = 0; j < LANE_WIDTH / 4; i++, j += 2) {     \
   1497             r[i + k] = F(s->L(j + k), s->L(j + k + 1));    \
   1498         }                                                  \
   1499     }                                                      \
   1500     for (i = 0; i < 2 << SHIFT; i++) {                     \
   1501         d->L(i) = r[i];                                    \
   1502     }                                                      \
   1503 }
   1504 
/*
 * Instantiate the horizontal helpers: SSE_HELPER_HW works on 16-bit
 * elements, SSE_HELPER_HL on 32-bit elements.  The second argument is the
 * pairwise operation (FADD, FSUB, FADDSW and FSUBSW are defined outside
 * this section).  Both generator macros are dropped afterwards.
 */
SSE_HELPER_HW(phaddw, FADD)
SSE_HELPER_HW(phsubw, FSUB)
SSE_HELPER_HW(phaddsw, FADDSW)
SSE_HELPER_HW(phsubsw, FSUBSW)
SSE_HELPER_HL(phaddd, FADD)
SSE_HELPER_HL(phsubd, FSUB)

#undef SSE_HELPER_HW
#undef SSE_HELPER_HL
   1514 
   1515 void glue(helper_pmaddubsw, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
   1516 {
   1517     int i;
   1518     for (i = 0; i < 4 << SHIFT; i++) {
   1519         d->W(i) = satsw((int8_t)s->B(i * 2) * (uint8_t)v->B(i * 2) +
   1520                         (int8_t)s->B(i * 2 + 1) * (uint8_t)v->B(i * 2 + 1));
   1521     }
   1522 }
   1523 
/*
 * Multiply both operands as int16_t, add 0x4000 and shift the sum right
 * by 15 bits.
 */
#define FMULHRSW(d, s) (((int16_t) d * (int16_t)s + 0x4000) >> 15)
SSE_HELPER_W(helper_pmulhrsw, FMULHRSW)

/*
 * FSIGN*(d, s): yield d when s is non-zero and not above INTn_MAX, 0 when
 * s is zero, and the negation of d (cast to the signed type) otherwise.
 * NOTE(review): the "s <= INTn_MAX" test presumably relies on s being an
 * unsigned element, so that values with the top bit set compare greater -
 * confirm against SSE_HELPER_B/W/L.
 */
#define FSIGNB(d, s) (s <= INT8_MAX  ? s ? d : 0 : -(int8_t)d)
#define FSIGNW(d, s) (s <= INT16_MAX ? s ? d : 0 : -(int16_t)d)
#define FSIGNL(d, s) (s <= INT32_MAX ? s ? d : 0 : -(int32_t)d)
SSE_HELPER_B(helper_psignb, FSIGNB)
SSE_HELPER_W(helper_psignw, FSIGNW)
SSE_HELPER_L(helper_psignd, FSIGNL)
   1533 
/*
 * PALIGNR: treat v as the high part and s as the low part of one wide
 * value, shift it right by imm bytes and store the low part in d.  In the
 * XMM/YMM builds this is done separately for each pair of quadwords.
 */
void glue(helper_palignr, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s,
                                  uint32_t imm)
{
    int i;

    /* XXX could be checked during translation */
    /* Shifting by at least the size of the v:s concatenation leaves zero. */
    if (imm >= (SHIFT ? 32 : 16)) {
        for (i = 0; i < (1 << SHIFT); i++) {
            d->Q(i) = 0;
        }
    } else {
        int shift = imm * 8; /* byte count converted to a bit count */
        /*
         * SHR(v, i): v shifted right by i bits when i is positive, shifted
         * left by -i bits when i is zero or negative, and 0 once |i| >= 64.
         */
#define SHR(v, i) (i < 64 && i > -64 ? i > 0 ? v >> (i) : (v << -(i)) : 0)
#if SHIFT == 0
        /* 64-bit build: s moves down, the low bits of v move in on top. */
        d->Q(0) = SHR(s->Q(0), shift - 0) |
            SHR(v->Q(0), shift -  64);
#else
        for (i = 0; i < (1 << SHIFT); i += 2) {
            uint64_t r0, r1;

            /*
             * Both result quadwords are computed before either store, so
             * the sources are fully read even if d is the same register.
             */
            r0 = SHR(s->Q(i), shift - 0) |
                 SHR(s->Q(i + 1), shift -  64) |
                 SHR(v->Q(i), shift - 128) |
                 SHR(v->Q(i + 1), shift - 192);
            r1 = SHR(s->Q(i), shift + 64) |
                 SHR(s->Q(i + 1), shift -   0) |
                 SHR(v->Q(i), shift -  64) |
                 SHR(v->Q(i + 1), shift - 128);
            d->Q(i) = r0;
            d->Q(i + 1) = r1;
        }
#endif
#undef SHR
    }
}
   1569 
   1570 #if SHIFT >= 1
   1571 
/*
 * SSE_HELPER_V(name, elem, num, F): define helper "name" taking a fourth
 * register operand m.  For each of the num elements accessed through the
 * elem accessor, d receives F(v element, s element, m element).
 */
#define SSE_HELPER_V(name, elem, num, F)                                \
    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s,   \
                            Reg *m)                                     \
    {                                                                   \
        int i;                                                          \
        for (i = 0; i < num; i++) {                                     \
            d->elem(i) = F(v->elem(i), s->elem(i), m->elem(i));         \
        }                                                               \
    }
   1581 
/*
 * SSE_HELPER_I(name, elem, num, F): define helper "name" taking an
 * immediate.  For each of the num elements, d receives
 * F(v element, s element, bit), where bit is bit (i & 7) of imm; the
 * immediate bits are therefore reused every eight elements.
 */
#define SSE_HELPER_I(name, elem, num, F)                                \
    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s,   \
                            uint32_t imm)                               \
    {                                                                   \
        int i;                                                          \
        for (i = 0; i < num; i++) {                                     \
            int j = i & 7;                                              \
            d->elem(i) = F(v->elem(i), s->elem(i), (imm >> j) & 1);     \
        }                                                               \
    }
   1592 
/* SSE4.1 op helpers */
/*
 * Variable blends: pick the s element when the most significant bit of
 * the mask element m is set, otherwise the v element.  One macro per
 * element width (8, 32 and 64 bits).
 */
#define FBLENDVB(v, s, m) ((m & 0x80) ? s : v)
#define FBLENDVPS(v, s, m) ((m & 0x80000000) ? s : v)
#define FBLENDVPD(v, s, m) ((m & 0x8000000000000000LL) ? s : v)
SSE_HELPER_V(helper_pblendvb, B, 8 << SHIFT, FBLENDVB)
SSE_HELPER_V(helper_blendvps, L, 2 << SHIFT, FBLENDVPS)
SSE_HELPER_V(helper_blendvpd, Q, 1 << SHIFT, FBLENDVPD)
   1600 
   1601 void glue(helper_ptest, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
   1602 {
   1603     uint64_t zf = 0, cf = 0;
   1604     int i;
   1605 
   1606     for (i = 0; i < 1 << SHIFT; i++) {
   1607         zf |= (s->Q(i) &  d->Q(i));
   1608         cf |= (s->Q(i) & ~d->Q(i));
   1609     }
   1610     CC_SRC = (zf ? 0 : CC_Z) | (cf ? 0 : CC_C);
   1611 }
   1612 
/*
 * Source selectors for the duplicate-move helpers: given destination
 * index i, read the even-indexed (& ~1) or odd-indexed (| 1) neighbour
 * from s.
 */
#define FMOVSLDUP(i) s->L((i) & ~1)
#define FMOVSHDUP(i) s->L((i) | 1)
#define FMOVDLDUP(i) s->Q((i) & ~1)

/*
 * SSE_HELPER_F(name, elem, num, F): define a two-operand helper that sets
 * each of the num destination elements to F(i).  F may be any expression
 * prefix that accepts "(i)", e.g. an accessor on s with a cast in front.
 * Elements are written from the highest index down to 0; presumably this
 * keeps widening moves correct when d and s are the same register, since
 * a wide store at index i never overwrites a narrower source element with
 * a lower index.
 */
#define SSE_HELPER_F(name, elem, num, F)                        \
    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)   \
    {                                                           \
        int n = num;                                            \
        for (int i = n; --i >= 0; ) {                           \
            d->elem(i) = F(i);                                  \
        }                                                       \
    }
   1625 
#if SHIFT > 0
/* Sign-extending moves: the cast makes the narrow source element signed. */
SSE_HELPER_F(helper_pmovsxbw, W, 4 << SHIFT, (int8_t) s->B)
SSE_HELPER_F(helper_pmovsxbd, L, 2 << SHIFT, (int8_t) s->B)
SSE_HELPER_F(helper_pmovsxbq, Q, 1 << SHIFT, (int8_t) s->B)
SSE_HELPER_F(helper_pmovsxwd, L, 2 << SHIFT, (int16_t) s->W)
SSE_HELPER_F(helper_pmovsxwq, Q, 1 << SHIFT, (int16_t) s->W)
SSE_HELPER_F(helper_pmovsxdq, Q, 1 << SHIFT, (int32_t) s->L)
/* Zero-extending moves: the source accessor is used without a cast. */
SSE_HELPER_F(helper_pmovzxbw, W, 4 << SHIFT, s->B)
SSE_HELPER_F(helper_pmovzxbd, L, 2 << SHIFT, s->B)
SSE_HELPER_F(helper_pmovzxbq, Q, 1 << SHIFT, s->B)
SSE_HELPER_F(helper_pmovzxwd, L, 2 << SHIFT, s->W)
SSE_HELPER_F(helper_pmovzxwq, Q, 1 << SHIFT, s->W)
SSE_HELPER_F(helper_pmovzxdq, Q, 1 << SHIFT, s->L)
/* Duplicating moves: each element is copied from its even/odd neighbour. */
SSE_HELPER_F(helper_pmovsldup, L, 2 << SHIFT, FMOVSLDUP)
SSE_HELPER_F(helper_pmovshdup, L, 2 << SHIFT, FMOVSHDUP)
SSE_HELPER_F(helper_pmovdldup, Q, 1 << SHIFT, FMOVDLDUP)
#endif
   1643 
   1644 void glue(helper_pmuldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
   1645 {
   1646     int i;
   1647 
   1648     for (i = 0; i < 1 << SHIFT; i++) {
   1649         d->Q(i) = (int64_t)(int32_t) v->L(2 * i) * (int32_t) s->L(2 * i);
   1650     }
   1651 }
   1652 
   1653 void glue(helper_packusdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
   1654 {
   1655     uint16_t r[8];
   1656     int i, j, k;
   1657 
   1658     for (i = 0, j = 0; i <= 2 << SHIFT; i += 8, j += 4) {
   1659         r[0] = satuw(v->L(j));
   1660         r[1] = satuw(v->L(j + 1));
   1661         r[2] = satuw(v->L(j + 2));
   1662         r[3] = satuw(v->L(j + 3));
   1663         r[4] = satuw(s->L(j));
   1664         r[5] = satuw(s->L(j + 1));
   1665         r[6] = satuw(s->L(j + 2));
   1666         r[7] = satuw(s->L(j + 3));
   1667         for (k = 0; k < 8; k++) {
   1668             d->W(i + k) = r[k];
   1669         }
   1670     }
   1671 }
   1672 
   1673 #if SHIFT == 1
   1674 void glue(helper_phminposuw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
   1675 {
   1676     int idx = 0;
   1677 
   1678     if (s->W(1) < s->W(idx)) {
   1679         idx = 1;
   1680     }
   1681     if (s->W(2) < s->W(idx)) {
   1682         idx = 2;
   1683     }
   1684     if (s->W(3) < s->W(idx)) {
   1685         idx = 3;
   1686     }
   1687     if (s->W(4) < s->W(idx)) {
   1688         idx = 4;
   1689     }
   1690     if (s->W(5) < s->W(idx)) {
   1691         idx = 5;
   1692     }
   1693     if (s->W(6) < s->W(idx)) {
   1694         idx = 6;
   1695     }
   1696     if (s->W(7) < s->W(idx)) {
   1697         idx = 7;
   1698     }
   1699 
   1700     d->W(0) = s->W(idx);
   1701     d->W(1) = idx;
   1702     d->L(1) = 0;
   1703     d->Q(1) = 0;
   1704 }
   1705 #endif
   1706 
   1707 void glue(helper_roundps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
   1708                                   uint32_t mode)
   1709 {
   1710     uint8_t old_flags = get_float_exception_flags(&env->sse_status);
   1711     signed char prev_rounding_mode;
   1712     int i;
   1713 
   1714     prev_rounding_mode = env->sse_status.float_rounding_mode;
   1715     if (!(mode & (1 << 2))) {
   1716         set_x86_rounding_mode(mode & 3, &env->sse_status);
   1717     }
   1718 
   1719     for (i = 0; i < 2 << SHIFT; i++) {
   1720         d->ZMM_S(i) = float32_round_to_int(s->ZMM_S(i), &env->sse_status);
   1721     }
   1722 
   1723     if (mode & (1 << 3) && !(old_flags & float_flag_inexact)) {
   1724         set_float_exception_flags(get_float_exception_flags(&env->sse_status) &
   1725                                   ~float_flag_inexact,
   1726                                   &env->sse_status);
   1727     }
   1728     env->sse_status.float_rounding_mode = prev_rounding_mode;
   1729 }
   1730 
   1731 void glue(helper_roundpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
   1732                                   uint32_t mode)
   1733 {
   1734     uint8_t old_flags = get_float_exception_flags(&env->sse_status);
   1735     signed char prev_rounding_mode;
   1736     int i;
   1737 
   1738     prev_rounding_mode = env->sse_status.float_rounding_mode;
   1739     if (!(mode & (1 << 2))) {
   1740         set_x86_rounding_mode(mode & 3, &env->sse_status);
   1741     }
   1742 
   1743     for (i = 0; i < 1 << SHIFT; i++) {
   1744         d->ZMM_D(i) = float64_round_to_int(s->ZMM_D(i), &env->sse_status);
   1745     }
   1746 
   1747     if (mode & (1 << 3) && !(old_flags & float_flag_inexact)) {
   1748         set_float_exception_flags(get_float_exception_flags(&env->sse_status) &
   1749                                   ~float_flag_inexact,
   1750                                   &env->sse_status);
   1751     }
   1752     env->sse_status.float_rounding_mode = prev_rounding_mode;
   1753 }
   1754 
   1755 #if SHIFT == 1
   1756 void glue(helper_roundss, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s,
   1757                                   uint32_t mode)
   1758 {
   1759     uint8_t old_flags = get_float_exception_flags(&env->sse_status);
   1760     signed char prev_rounding_mode;
   1761     int i;
   1762 
   1763     prev_rounding_mode = env->sse_status.float_rounding_mode;
   1764     if (!(mode & (1 << 2))) {
   1765         set_x86_rounding_mode(mode & 3, &env->sse_status);
   1766     }
   1767 
   1768     d->ZMM_S(0) = float32_round_to_int(s->ZMM_S(0), &env->sse_status);
   1769     for (i = 1; i < 2 << SHIFT; i++) {
   1770         d->ZMM_L(i) = v->ZMM_L(i);
   1771     }
   1772 
   1773     if (mode & (1 << 3) && !(old_flags & float_flag_inexact)) {
   1774         set_float_exception_flags(get_float_exception_flags(&env->sse_status) &
   1775                                   ~float_flag_inexact,
   1776                                   &env->sse_status);
   1777     }
   1778     env->sse_status.float_rounding_mode = prev_rounding_mode;
   1779 }
   1780 
   1781 void glue(helper_roundsd, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s,
   1782                                   uint32_t mode)
   1783 {
   1784     uint8_t old_flags = get_float_exception_flags(&env->sse_status);
   1785     signed char prev_rounding_mode;
   1786     int i;
   1787 
   1788     prev_rounding_mode = env->sse_status.float_rounding_mode;
   1789     if (!(mode & (1 << 2))) {
   1790         set_x86_rounding_mode(mode & 3, &env->sse_status);
   1791     }
   1792 
   1793     d->ZMM_D(0) = float64_round_to_int(s->ZMM_D(0), &env->sse_status);
   1794     for (i = 1; i < 1 << SHIFT; i++) {
   1795         d->ZMM_Q(i) = v->ZMM_Q(i);
   1796     }
   1797 
   1798     if (mode & (1 << 3) && !(old_flags & float_flag_inexact)) {
   1799         set_float_exception_flags(get_float_exception_flags(&env->sse_status) &
   1800                                   ~float_flag_inexact,
   1801                                   &env->sse_status);
   1802     }
   1803     env->sse_status.float_rounding_mode = prev_rounding_mode;
   1804 }
   1805 #endif
   1806 
/*
 * Immediate blends: SSE_HELPER_I hands one immediate bit per element as
 * m; a set bit selects the s element, a clear bit the v element.
 */
#define FBLENDP(v, s, m) (m ? s : v)
SSE_HELPER_I(helper_blendps, L, 2 << SHIFT, FBLENDP)
SSE_HELPER_I(helper_blendpd, Q, 1 << SHIFT, FBLENDP)
SSE_HELPER_I(helper_pblendw, W, 4 << SHIFT, FBLENDP)
   1811 
   1812 void glue(helper_dpps, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s,
   1813                                uint32_t mask)
   1814 {
   1815     float32 prod1, prod2, temp2, temp3, temp4;
   1816     int i;
   1817 
   1818     for (i = 0; i < 2 << SHIFT; i += 4) {
   1819         /*
   1820          * We must evaluate (A+B)+(C+D), not ((A+B)+C)+D
   1821          * to correctly round the intermediate results
   1822          */
   1823         if (mask & (1 << 4)) {
   1824             prod1 = float32_mul(v->ZMM_S(i), s->ZMM_S(i), &env->sse_status);
   1825         } else {
   1826             prod1 = float32_zero;
   1827         }
   1828         if (mask & (1 << 5)) {
   1829             prod2 = float32_mul(v->ZMM_S(i+1), s->ZMM_S(i+1), &env->sse_status);
   1830         } else {
   1831             prod2 = float32_zero;
   1832         }
   1833         temp2 = float32_add(prod1, prod2, &env->sse_status);
   1834         if (mask & (1 << 6)) {
   1835             prod1 = float32_mul(v->ZMM_S(i+2), s->ZMM_S(i+2), &env->sse_status);
   1836         } else {
   1837             prod1 = float32_zero;
   1838         }
   1839         if (mask & (1 << 7)) {
   1840             prod2 = float32_mul(v->ZMM_S(i+3), s->ZMM_S(i+3), &env->sse_status);
   1841         } else {
   1842             prod2 = float32_zero;
   1843         }
   1844         temp3 = float32_add(prod1, prod2, &env->sse_status);
   1845         temp4 = float32_add(temp2, temp3, &env->sse_status);
   1846 
   1847         d->ZMM_S(i) = (mask & (1 << 0)) ? temp4 : float32_zero;
   1848         d->ZMM_S(i+1) = (mask & (1 << 1)) ? temp4 : float32_zero;
   1849         d->ZMM_S(i+2) = (mask & (1 << 2)) ? temp4 : float32_zero;
   1850         d->ZMM_S(i+3) = (mask & (1 << 3)) ? temp4 : float32_zero;
   1851     }
   1852 }
   1853 
   1854 #if SHIFT == 1
   1855 /* Oddly, there is no ymm version of dppd */
   1856 void glue(helper_dppd, SUFFIX)(CPUX86State *env,
   1857                                Reg *d, Reg *v, Reg *s, uint32_t mask)
   1858 {
   1859     float64 prod1, prod2, temp2;
   1860 
   1861     if (mask & (1 << 4)) {
   1862         prod1 = float64_mul(v->ZMM_D(0), s->ZMM_D(0), &env->sse_status);
   1863     } else {
   1864         prod1 = float64_zero;
   1865     }
   1866     if (mask & (1 << 5)) {
   1867         prod2 = float64_mul(v->ZMM_D(1), s->ZMM_D(1), &env->sse_status);
   1868     } else {
   1869         prod2 = float64_zero;
   1870     }
   1871     temp2 = float64_add(prod1, prod2, &env->sse_status);
   1872     d->ZMM_D(0) = (mask & (1 << 0)) ? temp2 : float64_zero;
   1873     d->ZMM_D(1) = (mask & (1 << 1)) ? temp2 : float64_zero;
   1874 }
   1875 #endif
   1876 
   1877 void glue(helper_mpsadbw, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s,
   1878                                   uint32_t offset)
   1879 {
   1880     int i, j;
   1881     uint16_t r[8];
   1882 
   1883     for (j = 0; j < 4 << SHIFT; ) {
   1884         int s0 = (j * 2) + ((offset & 3) << 2);
   1885         int d0 = (j * 2) + ((offset & 4) << 0);
   1886         for (i = 0; i < LANE_WIDTH / 2; i++, d0++) {
   1887             r[i] = 0;
   1888             r[i] += abs1(v->B(d0 + 0) - s->B(s0 + 0));
   1889             r[i] += abs1(v->B(d0 + 1) - s->B(s0 + 1));
   1890             r[i] += abs1(v->B(d0 + 2) - s->B(s0 + 2));
   1891             r[i] += abs1(v->B(d0 + 3) - s->B(s0 + 3));
   1892         }
   1893         for (i = 0; i < LANE_WIDTH / 2; i++, j++) {
   1894             d->W(j) = r[i];
   1895         }
   1896         offset >>= 3;
   1897     }
   1898 }
   1899 
   1900 /* SSE4.2 op helpers */
   1901 #if SHIFT == 1
   1902 static inline int pcmp_elen(CPUX86State *env, int reg, uint32_t ctrl)
   1903 {
   1904     target_long val, limit;
   1905 
   1906     /* Presence of REX.W is indicated by a bit higher than 7 set */
   1907     if (ctrl >> 8) {
   1908         val = (target_long)env->regs[reg];
   1909     } else {
   1910         val = (int32_t)env->regs[reg];
   1911     }
   1912     if (ctrl & 1) {
   1913         limit = 8;
   1914     } else {
   1915         limit = 16;
   1916     }
   1917     if ((val > limit) || (val < -limit)) {
   1918         return limit;
   1919     }
   1920     return abs1(val);
   1921 }
   1922 
   1923 static inline int pcmp_ilen(Reg *r, uint8_t ctrl)
   1924 {
   1925     int val = 0;
   1926 
   1927     if (ctrl & 1) {
   1928         while (val < 8 && r->W(val)) {
   1929             val++;
   1930         }
   1931     } else {
   1932         while (val < 16 && r->B(val)) {
   1933             val++;
   1934         }
   1935     }
   1936 
   1937     return val;
   1938 }
   1939 
   1940 static inline int pcmp_val(Reg *r, uint8_t ctrl, int i)
   1941 {
   1942     switch ((ctrl >> 0) & 3) {
   1943     case 0:
   1944         return r->B(i);
   1945     case 1:
   1946         return r->W(i);
   1947     case 2:
   1948         return (int8_t)r->B(i);
   1949     case 3:
   1950     default:
   1951         return (int16_t)r->W(i);
   1952     }
   1953 }
   1954 
/*
 * Common core of the PCMPxSTRx helpers.
 *
 * valids and validd are the element counts of s and d.  ctrl bit 0
 * selects word (8 elements) or byte (16 elements) strings, bits 3:2 the
 * comparison performed, and bits 5:4 the final inversion.
 *
 * Writes CC_SRC and returns the resulting bit mask; the callers turn the
 * mask into an index or a register value.
 */
static inline unsigned pcmpxstrx(CPUX86State *env, Reg *d, Reg *s,
                                 uint8_t ctrl, int valids, int validd)
{
    unsigned int res = 0;
    int v;
    int j, i;
    int upper = (ctrl & 1) ? 7 : 15; /* index of the last possible element */

    /* From here on both hold the index of the last valid element, or -1. */
    valids--;
    validd--;

    /* CC_Z / CC_S: s / d is shorter than a full register. */
    CC_SRC = (valids < upper ? CC_Z : 0) | (validd < upper ? CC_S : 0);

    switch ((ctrl >> 2) & 3) {
    case 0:
        /* Bit j is set when s[j] equals any valid element of d. */
        for (j = valids; j >= 0; j--) {
            res <<= 1;
            v = pcmp_val(s, ctrl, j);
            for (i = validd; i >= 0; i--) {
                res |= (v == pcmp_val(d, ctrl, i));
            }
        }
        break;
    case 1:
        /*
         * d is read as pairs (d[i - 1], d[i]) with i odd; bit j is set
         * when d[i - 1] <= s[j] <= d[i] for any such pair.
         */
        for (j = valids; j >= 0; j--) {
            res <<= 1;
            v = pcmp_val(s, ctrl, j);
            for (i = ((validd - 1) | 1); i >= 0; i -= 2) {
                res |= (pcmp_val(d, ctrl, i - 0) >= v &&
                        pcmp_val(d, ctrl, i - 1) <= v);
            }
        }
        break;
    case 2:
        /*
         * Element-wise equality.  Positions past the end of both strings
         * start out as 1, positions past the end of only one string as 0,
         * and positions valid in both hold the comparison result.
         */
        res = (1 << (upper - MAX(valids, validd))) - 1;
        res <<= MAX(valids, validd) - MIN(valids, validd);
        for (i = MIN(valids, validd); i >= 0; i--) {
            res <<= 1;
            v = pcmp_val(s, ctrl, i);
            res |= (v == pcmp_val(d, ctrl, i));
        }
        break;
    case 3:
        /* An empty d gives an all-ones mask. */
        if (validd == -1) {
            res = (2 << upper) - 1;
            break;
        }
        /*
         * Bit j is set when the elements of d, as far as they overlap the
         * valid part of s starting at position j, all compare equal.
         */
        for (j = valids == upper ? valids : valids - validd; j >= 0; j--) {
            res <<= 1;
            v = 1;
            for (i = MIN(valids - j, validd); i >= 0; i--) {
                v &= (pcmp_val(s, ctrl, i + j) == pcmp_val(d, ctrl, i));
            }
            res |= v;
        }
        break;
    }

    /* ctrl bits 5:4: 1 inverts every bit, 3 only the bits valid in s. */
    switch ((ctrl >> 4) & 3) {
    case 1:
        res ^= (2 << upper) - 1;
        break;
    case 3:
        res ^= (1 << (valids + 1)) - 1;
        break;
    }

    /* CC_C: the mask is non-zero; CC_O: bit 0 of the mask is set. */
    if (res) {
        CC_SRC |= CC_C;
    }
    if (res & 1) {
        CC_SRC |= CC_O;
    }

    return res;
}
   2031 
   2032 void glue(helper_pcmpestri, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
   2033                                     uint32_t ctrl)
   2034 {
   2035     unsigned int res = pcmpxstrx(env, d, s, ctrl,
   2036                                  pcmp_elen(env, R_EDX, ctrl),
   2037                                  pcmp_elen(env, R_EAX, ctrl));
   2038 
   2039     if (res) {
   2040         env->regs[R_ECX] = (ctrl & (1 << 6)) ? 31 - clz32(res) : ctz32(res);
   2041     } else {
   2042         env->regs[R_ECX] = 16 >> (ctrl & (1 << 0));
   2043     }
   2044 }
   2045 
   2046 void glue(helper_pcmpestrm, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
   2047                                     uint32_t ctrl)
   2048 {
   2049     int i;
   2050     unsigned int res = pcmpxstrx(env, d, s, ctrl,
   2051                                  pcmp_elen(env, R_EDX, ctrl),
   2052                                  pcmp_elen(env, R_EAX, ctrl));
   2053 
   2054     if ((ctrl >> 6) & 1) {
   2055         if (ctrl & 1) {
   2056             for (i = 0; i < 8; i++, res >>= 1) {
   2057                 env->xmm_regs[0].W(i) = (res & 1) ? ~0 : 0;
   2058             }
   2059         } else {
   2060             for (i = 0; i < 16; i++, res >>= 1) {
   2061                 env->xmm_regs[0].B(i) = (res & 1) ? ~0 : 0;
   2062             }
   2063         }
   2064     } else {
   2065         env->xmm_regs[0].Q(1) = 0;
   2066         env->xmm_regs[0].Q(0) = res;
   2067     }
   2068 }
   2069 
   2070 void glue(helper_pcmpistri, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
   2071                                     uint32_t ctrl)
   2072 {
   2073     unsigned int res = pcmpxstrx(env, d, s, ctrl,
   2074                                  pcmp_ilen(s, ctrl),
   2075                                  pcmp_ilen(d, ctrl));
   2076 
   2077     if (res) {
   2078         env->regs[R_ECX] = (ctrl & (1 << 6)) ? 31 - clz32(res) : ctz32(res);
   2079     } else {
   2080         env->regs[R_ECX] = 16 >> (ctrl & (1 << 0));
   2081     }
   2082 }
   2083 
   2084 void glue(helper_pcmpistrm, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
   2085                                     uint32_t ctrl)
   2086 {
   2087     int i;
   2088     unsigned int res = pcmpxstrx(env, d, s, ctrl,
   2089                                  pcmp_ilen(s, ctrl),
   2090                                  pcmp_ilen(d, ctrl));
   2091 
   2092     if ((ctrl >> 6) & 1) {
   2093         if (ctrl & 1) {
   2094             for (i = 0; i < 8; i++, res >>= 1) {
   2095                 env->xmm_regs[0].W(i) = (res & 1) ? ~0 : 0;
   2096             }
   2097         } else {
   2098             for (i = 0; i < 16; i++, res >>= 1) {
   2099                 env->xmm_regs[0].B(i) = (res & 1) ? ~0 : 0;
   2100             }
   2101         }
   2102     } else {
   2103         env->xmm_regs[0].Q(1) = 0;
   2104         env->xmm_regs[0].Q(0) = res;
   2105     }
   2106 }
   2107 
   2108 #define CRCPOLY        0x1edc6f41
   2109 #define CRCPOLY_BITREV 0x82f63b78
   2110 target_ulong helper_crc32(uint32_t crc1, target_ulong msg, uint32_t len)
   2111 {
   2112     target_ulong crc = (msg & ((target_ulong) -1 >>
   2113                                (TARGET_LONG_BITS - len))) ^ crc1;
   2114 
   2115     while (len--) {
   2116         crc = (crc >> 1) ^ ((crc & 1) ? CRCPOLY_BITREV : 0);
   2117     }
   2118 
   2119     return crc;
   2120 }
   2121 
   2122 #endif
   2123 
   2124 #if SHIFT == 1
   2125 static void clmulq(uint64_t *dest_l, uint64_t *dest_h,
   2126                           uint64_t a, uint64_t b)
   2127 {
   2128     uint64_t al, ah, resh, resl;
   2129 
   2130     ah = 0;
   2131     al = a;
   2132     resh = resl = 0;
   2133 
   2134     while (b) {
   2135         if (b & 1) {
   2136             resl ^= al;
   2137             resh ^= ah;
   2138         }
   2139         ah = (ah << 1) | (al >> 63);
   2140         al <<= 1;
   2141         b >>= 1;
   2142     }
   2143 
   2144     *dest_l = resl;
   2145     *dest_h = resh;
   2146 }
   2147 #endif
   2148 
   2149 void glue(helper_pclmulqdq, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s,
   2150                                     uint32_t ctrl)
   2151 {
   2152     uint64_t a, b;
   2153     int i;
   2154 
   2155     for (i = 0; i < 1 << SHIFT; i += 2) {
   2156         a = v->Q(((ctrl & 1) != 0) + i);
   2157         b = s->Q(((ctrl & 16) != 0) + i);
   2158         clmulq(&d->Q(i), &d->Q(i + 1), a, b);
   2159     }
   2160 }
   2161 
   2162 void glue(helper_aesdec, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
   2163 {
   2164     int i;
   2165     Reg st = *v;
   2166     Reg rk = *s;
   2167 
   2168     for (i = 0 ; i < 2 << SHIFT ; i++) {
   2169         int j = i & 3;
   2170         d->L(i) = rk.L(i) ^ bswap32(AES_Td0[st.B(AES_ishifts[4 * j + 0])] ^
   2171                                     AES_Td1[st.B(AES_ishifts[4 * j + 1])] ^
   2172                                     AES_Td2[st.B(AES_ishifts[4 * j + 2])] ^
   2173                                     AES_Td3[st.B(AES_ishifts[4 * j + 3])]);
   2174     }
   2175 }
   2176 
   2177 void glue(helper_aesdeclast, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
   2178 {
   2179     int i;
   2180     Reg st = *v;
   2181     Reg rk = *s;
   2182 
   2183     for (i = 0; i < 8 << SHIFT; i++) {
   2184         d->B(i) = rk.B(i) ^ (AES_isbox[st.B(AES_ishifts[i & 15] + (i & ~15))]);
   2185     }
   2186 }
   2187 
   2188 void glue(helper_aesenc, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
   2189 {
   2190     int i;
   2191     Reg st = *v;
   2192     Reg rk = *s;
   2193 
   2194     for (i = 0 ; i < 2 << SHIFT ; i++) {
   2195         int j = i & 3;
   2196         d->L(i) = rk.L(i) ^ bswap32(AES_Te0[st.B(AES_shifts[4 * j + 0])] ^
   2197                                     AES_Te1[st.B(AES_shifts[4 * j + 1])] ^
   2198                                     AES_Te2[st.B(AES_shifts[4 * j + 2])] ^
   2199                                     AES_Te3[st.B(AES_shifts[4 * j + 3])]);
   2200     }
   2201 }
   2202 
   2203 void glue(helper_aesenclast, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
   2204 {
   2205     int i;
   2206     Reg st = *v;
   2207     Reg rk = *s;
   2208 
   2209     for (i = 0; i < 8 << SHIFT; i++) {
   2210         d->B(i) = rk.B(i) ^ (AES_sbox[st.B(AES_shifts[i & 15] + (i & ~15))]);
   2211     }
   2212 }
   2213 
   2214 #if SHIFT == 1
   2215 void glue(helper_aesimc, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
   2216 {
   2217     int i;
   2218     Reg tmp = *s;
   2219 
   2220     for (i = 0 ; i < 4 ; i++) {
   2221         d->L(i) = bswap32(AES_imc[tmp.B(4 * i + 0)][0] ^
   2222                           AES_imc[tmp.B(4 * i + 1)][1] ^
   2223                           AES_imc[tmp.B(4 * i + 2)][2] ^
   2224                           AES_imc[tmp.B(4 * i + 3)][3]);
   2225     }
   2226 }
   2227 
   2228 void glue(helper_aeskeygenassist, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
   2229                                           uint32_t ctrl)
   2230 {
   2231     int i;
   2232     Reg tmp = *s;
   2233 
   2234     for (i = 0 ; i < 4 ; i++) {
   2235         d->B(i) = AES_sbox[tmp.B(i + 4)];
   2236         d->B(i + 8) = AES_sbox[tmp.B(i + 12)];
   2237     }
   2238     d->L(1) = (d->L(0) << 24 | d->L(0) >> 8) ^ ctrl;
   2239     d->L(3) = (d->L(2) << 24 | d->L(2) >> 8) ^ ctrl;
   2240 }
   2241 #endif
   2242 #endif
   2243 
   2244 #if SHIFT >= 1
   2245 void glue(helper_vpermilpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
   2246 {
   2247     uint64_t r0, r1;
   2248     int i;
   2249 
   2250     for (i = 0; i < 1 << SHIFT; i += 2) {
   2251         r0 = v->Q(i + ((s->Q(i) >> 1) & 1));
   2252         r1 = v->Q(i + ((s->Q(i+1) >> 1) & 1));
   2253         d->Q(i) = r0;
   2254         d->Q(i+1) = r1;
   2255     }
   2256 }
   2257 
   2258 void glue(helper_vpermilps, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
   2259 {
   2260     uint32_t r0, r1, r2, r3;
   2261     int i;
   2262 
   2263     for (i = 0; i < 2 << SHIFT; i += 4) {
   2264         r0 = v->L(i + (s->L(i) & 3));
   2265         r1 = v->L(i + (s->L(i+1) & 3));
   2266         r2 = v->L(i + (s->L(i+2) & 3));
   2267         r3 = v->L(i + (s->L(i+3) & 3));
   2268         d->L(i) = r0;
   2269         d->L(i+1) = r1;
   2270         d->L(i+2) = r2;
   2271         d->L(i+3) = r3;
   2272     }
   2273 }
   2274 
   2275 void glue(helper_vpermilpd_imm, SUFFIX)(Reg *d, Reg *s, uint32_t order)
   2276 {
   2277     uint64_t r0, r1;
   2278     int i;
   2279 
   2280     for (i = 0; i < 1 << SHIFT; i += 2) {
   2281         r0 = s->Q(i + ((order >> 0) & 1));
   2282         r1 = s->Q(i + ((order >> 1) & 1));
   2283         d->Q(i) = r0;
   2284         d->Q(i+1) = r1;
   2285 
   2286         order >>= 2;
   2287     }
   2288 }
   2289 
   2290 void glue(helper_vpermilps_imm, SUFFIX)(Reg *d, Reg *s, uint32_t order)
   2291 {
   2292     uint32_t r0, r1, r2, r3;
   2293     int i;
   2294 
   2295     for (i = 0; i < 2 << SHIFT; i += 4) {
   2296         r0 = s->L(i + ((order >> 0) & 3));
   2297         r1 = s->L(i + ((order >> 2) & 3));
   2298         r2 = s->L(i + ((order >> 4) & 3));
   2299         r3 = s->L(i + ((order >> 6) & 3));
   2300         d->L(i) = r0;
   2301         d->L(i+1) = r1;
   2302         d->L(i+2) = r2;
   2303         d->L(i+3) = r3;
   2304     }
   2305 }
   2306 
/*
 * Per-element variable shifts; c is the shift count of the element.
 * Logical shifts give 0 once the count reaches the element width, and
 * arithmetic right shifts clamp the count to width - 1.
 *
 * The macros are only defined when SHIFT == 1 but are used below for any
 * SHIFT >= 1.  NOTE(review): this presumably relies on the header being
 * included for SHIFT == 1 before SHIFT == 2 without the macros being
 * #undef'd in between -- confirm against the including file.
 */
#if SHIFT == 1
#define FPSRLVD(x, c) (c < 32 ? ((x) >> c) : 0)
#define FPSRLVQ(x, c) (c < 64 ? ((x) >> c) : 0)
#define FPSRAVD(x, c) ((int32_t)(x) >> (c < 32 ? c : 31))
#define FPSRAVQ(x, c) ((int64_t)(x) >> (c < 64 ? c : 63))
#define FPSLLVD(x, c) (c < 32 ? ((x) << c) : 0)
#define FPSLLVQ(x, c) (c < 64 ? ((x) << c) : 0)
#endif

/* 32-bit element variants. */
SSE_HELPER_L(helper_vpsrlvd, FPSRLVD)
SSE_HELPER_L(helper_vpsravd, FPSRAVD)
SSE_HELPER_L(helper_vpsllvd, FPSLLVD)

/* 64-bit element variants. */
SSE_HELPER_Q(helper_vpsrlvq, FPSRLVQ)
SSE_HELPER_Q(helper_vpsravq, FPSRAVQ)
SSE_HELPER_Q(helper_vpsllvq, FPSLLVQ)
   2323 
   2324 void glue(helper_vtestps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
   2325 {
   2326     uint32_t zf = 0, cf = 0;
   2327     int i;
   2328 
   2329     for (i = 0; i < 2 << SHIFT; i++) {
   2330         zf |= (s->L(i) &  d->L(i));
   2331         cf |= (s->L(i) & ~d->L(i));
   2332     }
   2333     CC_SRC = ((zf >> 31) ? 0 : CC_Z) | ((cf >> 31) ? 0 : CC_C);
   2334 }
   2335 
   2336 void glue(helper_vtestpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
   2337 {
   2338     uint64_t zf = 0, cf = 0;
   2339     int i;
   2340 
   2341     for (i = 0; i < 1 << SHIFT; i++) {
   2342         zf |= (s->Q(i) &  d->Q(i));
   2343         cf |= (s->Q(i) & ~d->Q(i));
   2344     }
   2345     CC_SRC = ((zf >> 63) ? 0 : CC_Z) | ((cf >> 63) ? 0 : CC_C);
   2346 }
   2347 
   2348 void glue(helper_vpmaskmovd_st, SUFFIX)(CPUX86State *env,
   2349                                         Reg *v, Reg *s, target_ulong a0)
   2350 {
   2351     int i;
   2352 
   2353     for (i = 0; i < (2 << SHIFT); i++) {
   2354         if (v->L(i) >> 31) {
   2355             cpu_stl_data_ra(env, a0 + i * 4, s->L(i), GETPC());
   2356         }
   2357     }
   2358 }
   2359 
   2360 void glue(helper_vpmaskmovq_st, SUFFIX)(CPUX86State *env,
   2361                                         Reg *v, Reg *s, target_ulong a0)
   2362 {
   2363     int i;
   2364 
   2365     for (i = 0; i < (1 << SHIFT); i++) {
   2366         if (v->Q(i) >> 63) {
   2367             cpu_stq_data_ra(env, a0 + i * 8, s->Q(i), GETPC());
   2368         }
   2369     }
   2370 }
   2371 
   2372 void glue(helper_vpmaskmovd, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
   2373 {
   2374     int i;
   2375 
   2376     for (i = 0; i < (2 << SHIFT); i++) {
   2377         d->L(i) = (v->L(i) >> 31) ? s->L(i) : 0;
   2378     }
   2379 }
   2380 
   2381 void glue(helper_vpmaskmovq, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
   2382 {
   2383     int i;
   2384 
   2385     for (i = 0; i < (1 << SHIFT); i++) {
   2386         d->Q(i) = (v->Q(i) >> 63) ? s->Q(i) : 0;
   2387     }
   2388 }
   2389 
   2390 void glue(helper_vpgatherdd, SUFFIX)(CPUX86State *env,
   2391         Reg *d, Reg *v, Reg *s, target_ulong a0, unsigned scale)
   2392 {
   2393     int i;
   2394     for (i = 0; i < (2 << SHIFT); i++) {
   2395         if (v->L(i) >> 31) {
   2396             target_ulong addr = a0
   2397                 + ((target_ulong)(int32_t)s->L(i) << scale);
   2398             d->L(i) = cpu_ldl_data_ra(env, addr, GETPC());
   2399         }
   2400         v->L(i) = 0;
   2401     }
   2402 }
   2403 
   2404 void glue(helper_vpgatherdq, SUFFIX)(CPUX86State *env,
   2405         Reg *d, Reg *v, Reg *s, target_ulong a0, unsigned scale)
   2406 {
   2407     int i;
   2408     for (i = 0; i < (1 << SHIFT); i++) {
   2409         if (v->Q(i) >> 63) {
   2410             target_ulong addr = a0
   2411                 + ((target_ulong)(int32_t)s->L(i) << scale);
   2412             d->Q(i) = cpu_ldq_data_ra(env, addr, GETPC());
   2413         }
   2414         v->Q(i) = 0;
   2415     }
   2416 }
   2417 
   2418 void glue(helper_vpgatherqd, SUFFIX)(CPUX86State *env,
   2419         Reg *d, Reg *v, Reg *s, target_ulong a0, unsigned scale)
   2420 {
   2421     int i;
   2422     for (i = 0; i < (1 << SHIFT); i++) {
   2423         if (v->L(i) >> 31) {
   2424             target_ulong addr = a0
   2425                 + ((target_ulong)(int64_t)s->Q(i) << scale);
   2426             d->L(i) = cpu_ldl_data_ra(env, addr, GETPC());
   2427         }
   2428         v->L(i) = 0;
   2429     }
   2430     for (i /= 2; i < 1 << SHIFT; i++) {
   2431         d->Q(i) = 0;
   2432         v->Q(i) = 0;
   2433     }
   2434 }
   2435 
   2436 void glue(helper_vpgatherqq, SUFFIX)(CPUX86State *env,
   2437         Reg *d, Reg *v, Reg *s, target_ulong a0, unsigned scale)
   2438 {
   2439     int i;
   2440     for (i = 0; i < (1 << SHIFT); i++) {
   2441         if (v->Q(i) >> 63) {
   2442             target_ulong addr = a0
   2443                 + ((target_ulong)(int64_t)s->Q(i) << scale);
   2444             d->Q(i) = cpu_ldq_data_ra(env, addr, GETPC());
   2445         }
   2446         v->Q(i) = 0;
   2447     }
   2448 }
   2449 #endif
   2450 
   2451 #if SHIFT >= 2
   2452 void helper_vpermdq_ymm(Reg *d, Reg *v, Reg *s, uint32_t order)
   2453 {
   2454     uint64_t r0, r1, r2, r3;
   2455 
   2456     switch (order & 3) {
   2457     case 0:
   2458         r0 = v->Q(0);
   2459         r1 = v->Q(1);
   2460         break;
   2461     case 1:
   2462         r0 = v->Q(2);
   2463         r1 = v->Q(3);
   2464         break;
   2465     case 2:
   2466         r0 = s->Q(0);
   2467         r1 = s->Q(1);
   2468         break;
   2469     case 3:
   2470         r0 = s->Q(2);
   2471         r1 = s->Q(3);
   2472         break;
   2473     }
   2474     switch ((order >> 4) & 3) {
   2475     case 0:
   2476         r2 = v->Q(0);
   2477         r3 = v->Q(1);
   2478         break;
   2479     case 1:
   2480         r2 = v->Q(2);
   2481         r3 = v->Q(3);
   2482         break;
   2483     case 2:
   2484         r2 = s->Q(0);
   2485         r3 = s->Q(1);
   2486         break;
   2487     case 3:
   2488         r2 = s->Q(2);
   2489         r3 = s->Q(3);
   2490         break;
   2491     }
   2492     d->Q(0) = r0;
   2493     d->Q(1) = r1;
   2494     d->Q(2) = r2;
   2495     d->Q(3) = r3;
   2496 }
   2497 
   2498 void helper_vpermq_ymm(Reg *d, Reg *s, uint32_t order)
   2499 {
   2500     uint64_t r0, r1, r2, r3;
   2501     r0 = s->Q(order & 3);
   2502     r1 = s->Q((order >> 2) & 3);
   2503     r2 = s->Q((order >> 4) & 3);
   2504     r3 = s->Q((order >> 6) & 3);
   2505     d->Q(0) = r0;
   2506     d->Q(1) = r1;
   2507     d->Q(2) = r2;
   2508     d->Q(3) = r3;
   2509 }
   2510 
   2511 void helper_vpermd_ymm(Reg *d, Reg *v, Reg *s)
   2512 {
   2513     uint32_t r[8];
   2514     int i;
   2515 
   2516     for (i = 0; i < 8; i++) {
   2517         r[i] = s->L(v->L(i) & 7);
   2518     }
   2519     for (i = 0; i < 8; i++) {
   2520         d->L(i) = r[i];
   2521     }
   2522 }
   2523 #endif
   2524 
   2525 /* FMA3 op helpers */
   2526 #if SHIFT == 1
   2527 #define SSE_HELPER_FMAS(name, elem, F)                                         \
   2528     void name(CPUX86State *env, Reg *d, Reg *a, Reg *b, Reg *c, int flags)     \
   2529     {                                                                          \
   2530         d->elem(0) = F(a->elem(0), b->elem(0), c->elem(0), flags, &env->sse_status); \
   2531     }
   2532 #define SSE_HELPER_FMAP(name, elem, num, F)                                    \
   2533     void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *a, Reg *b, Reg *c,  \
   2534                             int flags, int flip)                               \
   2535     {                                                                          \
   2536         int i;                                                                 \
   2537         for (i = 0; i < num; i++) {                                            \
   2538             d->elem(i) = F(a->elem(i), b->elem(i), c->elem(i), flags, &env->sse_status); \
   2539             flags ^= flip;                                                     \
   2540         }                                                                      \
   2541     }
   2542 
   2543 SSE_HELPER_FMAS(helper_fma4ss,  ZMM_S, float32_muladd)
   2544 SSE_HELPER_FMAS(helper_fma4sd,  ZMM_D, float64_muladd)
   2545 #endif
   2546 
   2547 #if SHIFT >= 1
   2548 SSE_HELPER_FMAP(helper_fma4ps,  ZMM_S, 2 << SHIFT, float32_muladd)
   2549 SSE_HELPER_FMAP(helper_fma4pd,  ZMM_D, 1 << SHIFT, float64_muladd)
   2550 #endif
   2551 
   2552 #undef SSE_HELPER_S
   2553 
   2554 #undef LANE_WIDTH
   2555 #undef SHIFT
   2556 #undef XMM_ONLY
   2557 #undef Reg
   2558 #undef B
   2559 #undef W
   2560 #undef L
   2561 #undef Q
   2562 #undef SUFFIX