qemu

FORK: QEMU emulator
git clone https://git.neptards.moe/neptards/qemu.git

tcg-op-gvec.c (121876B)


      1 /*
      2  * Generic vector operation expansion
      3  *
      4  * Copyright (c) 2018 Linaro
      5  *
      6  * This library is free software; you can redistribute it and/or
      7  * modify it under the terms of the GNU Lesser General Public
      8  * License as published by the Free Software Foundation; either
      9  * version 2.1 of the License, or (at your option) any later version.
     10  *
     11  * This library is distributed in the hope that it will be useful,
     12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
     13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     14  * Lesser General Public License for more details.
     15  *
     16  * You should have received a copy of the GNU Lesser General Public
     17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
     18  */
     19 
     20 #include "qemu/osdep.h"
     21 #include "tcg/tcg.h"
     22 #include "tcg/tcg-op.h"
     23 #include "tcg/tcg-op-gvec.h"
     24 #include "qemu/main-loop.h"
     25 #include "tcg/tcg-gvec-desc.h"
     26 
     27 #define MAX_UNROLL  4
     28 
     29 #ifdef CONFIG_DEBUG_TCG
     30 static const TCGOpcode vecop_list_empty[1] = { 0 };
     31 #else
     32 #define vecop_list_empty NULL
     33 #endif
     34 
     35 
     36 /* Verify vector size and alignment rules.  OFS should be the OR of all
     37    of the operand offsets so that we can check them all at once.  */
     38 static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs)
     39 {
     40     uint32_t max_align;
     41 
     42     switch (oprsz) {
     43     case 8:
     44     case 16:
     45     case 32:
     46         tcg_debug_assert(oprsz <= maxsz);
     47         break;
     48     default:
     49         tcg_debug_assert(oprsz == maxsz);
     50         break;
     51     }
     52     tcg_debug_assert(maxsz <= (8 << SIMD_MAXSZ_BITS));
     53 
     54     max_align = maxsz >= 16 ? 15 : 7;
     55     tcg_debug_assert((maxsz & max_align) == 0);
     56     tcg_debug_assert((ofs & max_align) == 0);
     57 }
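        /*
         * Editor's note (not in the original source): e.g. oprsz == 16 with
         * maxsz == 64 passes, since 16 is one of {8,16,32} and 16 <= 64,
         * while an SVE-style oprsz == 80 is accepted only when maxsz == 80.
         * With maxsz >= 16, every operand offset must be 16-byte aligned,
         * which is why callers OR all offsets into OFS.
         */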
     58 
     59 /* Verify vector overlap rules for two operands.  */
     60 static void check_overlap_2(uint32_t d, uint32_t a, uint32_t s)
     61 {
     62     tcg_debug_assert(d == a || d + s <= a || a + s <= d);
     63 }
     64 
     65 /* Verify vector overlap rules for three operands.  */
     66 static void check_overlap_3(uint32_t d, uint32_t a, uint32_t b, uint32_t s)
     67 {
     68     check_overlap_2(d, a, s);
     69     check_overlap_2(d, b, s);
     70     check_overlap_2(a, b, s);
     71 }
     72 
     73 /* Verify vector overlap rules for four operands.  */
     74 static void check_overlap_4(uint32_t d, uint32_t a, uint32_t b,
     75                             uint32_t c, uint32_t s)
     76 {
     77     check_overlap_2(d, a, s);
     78     check_overlap_2(d, b, s);
     79     check_overlap_2(d, c, s);
     80     check_overlap_2(a, b, s);
     81     check_overlap_2(a, c, s);
     82     check_overlap_2(b, c, s);
     83 }
     84 
     85 /* Create a descriptor from components.  */
     86 uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data)
     87 {
     88     uint32_t desc = 0;
     89 
     90     check_size_align(oprsz, maxsz, 0);
     91     tcg_debug_assert(data == sextract32(data, 0, SIMD_DATA_BITS));
     92 
     93     oprsz = (oprsz / 8) - 1;
     94     maxsz = (maxsz / 8) - 1;
     95 
     96     /*
     97      * We have just asserted in check_size_align that either
     98      * oprsz is {8,16,32} or matches maxsz.  Encode the final
     99      * case with '2', as that would otherwise map to 24.
    100      */
    101     if (oprsz == maxsz) {
    102         oprsz = 2;
    103     }
    104 
    105     desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz);
    106     desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz);
    107     desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data);
    108 
    109     return desc;
    110 }
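        /*
         * Editor's sketch (not in the original source): how a helper body
         * recovers the fields, using the decode helpers from
         * tcg-gvec-desc.h:
         *
         *     uint32_t desc  = simd_desc(16, 32, 5);
         *     intptr_t oprsz = simd_oprsz(desc);   // 16
         *     intptr_t maxsz = simd_maxsz(desc);   // 32
         *     int32_t  data  = simd_data(desc);    // 5
         */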
    111 
    112 /* Generate a call to a gvec-style helper with two vector operands.  */
    113 void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
    114                         uint32_t oprsz, uint32_t maxsz, int32_t data,
    115                         gen_helper_gvec_2 *fn)
    116 {
    117     TCGv_ptr a0, a1;
    118     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
    119 
    120     a0 = tcg_temp_new_ptr();
    121     a1 = tcg_temp_new_ptr();
    122 
    123     tcg_gen_addi_ptr(a0, cpu_env, dofs);
    124     tcg_gen_addi_ptr(a1, cpu_env, aofs);
    125 
    126     fn(a0, a1, desc);
    127 
    128     tcg_temp_free_ptr(a0);
    129     tcg_temp_free_ptr(a1);
    130 }
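        /*
         * Editor's sketch of typical front-end usage; the helper name is
         * hypothetical.  Such helpers are declared along the lines of
         * DEF_HELPER_FLAGS_3(my_op, TCG_CALL_NO_RWG, void, ptr, ptr, i32):
         *
         *     tcg_gen_gvec_2_ool(dofs, aofs, 16, 16, 0, gen_helper_my_op);
         */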
    131 
    132 /* Generate a call to a gvec-style helper with two vector operands
    133    and one scalar operand.  */
    134 void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c,
    135                          uint32_t oprsz, uint32_t maxsz, int32_t data,
    136                          gen_helper_gvec_2i *fn)
    137 {
    138     TCGv_ptr a0, a1;
    139     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
    140 
    141     a0 = tcg_temp_new_ptr();
    142     a1 = tcg_temp_new_ptr();
    143 
    144     tcg_gen_addi_ptr(a0, cpu_env, dofs);
    145     tcg_gen_addi_ptr(a1, cpu_env, aofs);
    146 
    147     fn(a0, a1, c, desc);
    148 
    149     tcg_temp_free_ptr(a0);
    150     tcg_temp_free_ptr(a1);
    151 }
    152 
    153 /* Generate a call to a gvec-style helper with three vector operands.  */
    154 void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
    155                         uint32_t oprsz, uint32_t maxsz, int32_t data,
    156                         gen_helper_gvec_3 *fn)
    157 {
    158     TCGv_ptr a0, a1, a2;
    159     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
    160 
    161     a0 = tcg_temp_new_ptr();
    162     a1 = tcg_temp_new_ptr();
    163     a2 = tcg_temp_new_ptr();
    164 
    165     tcg_gen_addi_ptr(a0, cpu_env, dofs);
    166     tcg_gen_addi_ptr(a1, cpu_env, aofs);
    167     tcg_gen_addi_ptr(a2, cpu_env, bofs);
    168 
    169     fn(a0, a1, a2, desc);
    170 
    171     tcg_temp_free_ptr(a0);
    172     tcg_temp_free_ptr(a1);
    173     tcg_temp_free_ptr(a2);
    174 }
    175 
    176 /* Generate a call to a gvec-style helper with four vector operands.  */
    177 void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
    178                         uint32_t cofs, uint32_t oprsz, uint32_t maxsz,
    179                         int32_t data, gen_helper_gvec_4 *fn)
    180 {
    181     TCGv_ptr a0, a1, a2, a3;
    182     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
    183 
    184     a0 = tcg_temp_new_ptr();
    185     a1 = tcg_temp_new_ptr();
    186     a2 = tcg_temp_new_ptr();
    187     a3 = tcg_temp_new_ptr();
    188 
    189     tcg_gen_addi_ptr(a0, cpu_env, dofs);
    190     tcg_gen_addi_ptr(a1, cpu_env, aofs);
    191     tcg_gen_addi_ptr(a2, cpu_env, bofs);
    192     tcg_gen_addi_ptr(a3, cpu_env, cofs);
    193 
    194     fn(a0, a1, a2, a3, desc);
    195 
    196     tcg_temp_free_ptr(a0);
    197     tcg_temp_free_ptr(a1);
    198     tcg_temp_free_ptr(a2);
    199     tcg_temp_free_ptr(a3);
    200 }
    201 
    202 /* Generate a call to a gvec-style helper with five vector operands.  */
    203 void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
    204                         uint32_t cofs, uint32_t xofs, uint32_t oprsz,
    205                         uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn)
    206 {
    207     TCGv_ptr a0, a1, a2, a3, a4;
    208     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
    209 
    210     a0 = tcg_temp_new_ptr();
    211     a1 = tcg_temp_new_ptr();
    212     a2 = tcg_temp_new_ptr();
    213     a3 = tcg_temp_new_ptr();
    214     a4 = tcg_temp_new_ptr();
    215 
    216     tcg_gen_addi_ptr(a0, cpu_env, dofs);
    217     tcg_gen_addi_ptr(a1, cpu_env, aofs);
    218     tcg_gen_addi_ptr(a2, cpu_env, bofs);
    219     tcg_gen_addi_ptr(a3, cpu_env, cofs);
    220     tcg_gen_addi_ptr(a4, cpu_env, xofs);
    221 
    222     fn(a0, a1, a2, a3, a4, desc);
    223 
    224     tcg_temp_free_ptr(a0);
    225     tcg_temp_free_ptr(a1);
    226     tcg_temp_free_ptr(a2);
    227     tcg_temp_free_ptr(a3);
    228     tcg_temp_free_ptr(a4);
    229 }
    230 
    231 /* Generate a call to a gvec-style helper with two vector operands
    232    and an extra pointer operand.  */
    233 void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
    234                         TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
    235                         int32_t data, gen_helper_gvec_2_ptr *fn)
    236 {
    237     TCGv_ptr a0, a1;
    238     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
    239 
    240     a0 = tcg_temp_new_ptr();
    241     a1 = tcg_temp_new_ptr();
    242 
    243     tcg_gen_addi_ptr(a0, cpu_env, dofs);
    244     tcg_gen_addi_ptr(a1, cpu_env, aofs);
    245 
    246     fn(a0, a1, ptr, desc);
    247 
    248     tcg_temp_free_ptr(a0);
    249     tcg_temp_free_ptr(a1);
    250 }
    251 
    252 /* Generate a call to a gvec-style helper with three vector operands
    253    and an extra pointer operand.  */
    254 void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
    255                         TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
    256                         int32_t data, gen_helper_gvec_3_ptr *fn)
    257 {
    258     TCGv_ptr a0, a1, a2;
    259     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
    260 
    261     a0 = tcg_temp_new_ptr();
    262     a1 = tcg_temp_new_ptr();
    263     a2 = tcg_temp_new_ptr();
    264 
    265     tcg_gen_addi_ptr(a0, cpu_env, dofs);
    266     tcg_gen_addi_ptr(a1, cpu_env, aofs);
    267     tcg_gen_addi_ptr(a2, cpu_env, bofs);
    268 
    269     fn(a0, a1, a2, ptr, desc);
    270 
    271     tcg_temp_free_ptr(a0);
    272     tcg_temp_free_ptr(a1);
    273     tcg_temp_free_ptr(a2);
    274 }
    275 
    276 /* Generate a call to a gvec-style helper with four vector operands
    277    and an extra pointer operand.  */
    278 void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
    279                         uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz,
    280                         uint32_t maxsz, int32_t data,
    281                         gen_helper_gvec_4_ptr *fn)
    282 {
    283     TCGv_ptr a0, a1, a2, a3;
    284     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
    285 
    286     a0 = tcg_temp_new_ptr();
    287     a1 = tcg_temp_new_ptr();
    288     a2 = tcg_temp_new_ptr();
    289     a3 = tcg_temp_new_ptr();
    290 
    291     tcg_gen_addi_ptr(a0, cpu_env, dofs);
    292     tcg_gen_addi_ptr(a1, cpu_env, aofs);
    293     tcg_gen_addi_ptr(a2, cpu_env, bofs);
    294     tcg_gen_addi_ptr(a3, cpu_env, cofs);
    295 
    296     fn(a0, a1, a2, a3, ptr, desc);
    297 
    298     tcg_temp_free_ptr(a0);
    299     tcg_temp_free_ptr(a1);
    300     tcg_temp_free_ptr(a2);
    301     tcg_temp_free_ptr(a3);
    302 }
    303 
    304 /* Generate a call to a gvec-style helper with five vector operands
    305    and an extra pointer operand.  */
    306 void tcg_gen_gvec_5_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
    307                         uint32_t cofs, uint32_t eofs, TCGv_ptr ptr,
    308                         uint32_t oprsz, uint32_t maxsz, int32_t data,
    309                         gen_helper_gvec_5_ptr *fn)
    310 {
    311     TCGv_ptr a0, a1, a2, a3, a4;
    312     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
    313 
    314     a0 = tcg_temp_new_ptr();
    315     a1 = tcg_temp_new_ptr();
    316     a2 = tcg_temp_new_ptr();
    317     a3 = tcg_temp_new_ptr();
    318     a4 = tcg_temp_new_ptr();
    319 
    320     tcg_gen_addi_ptr(a0, cpu_env, dofs);
    321     tcg_gen_addi_ptr(a1, cpu_env, aofs);
    322     tcg_gen_addi_ptr(a2, cpu_env, bofs);
    323     tcg_gen_addi_ptr(a3, cpu_env, cofs);
    324     tcg_gen_addi_ptr(a4, cpu_env, eofs);
    325 
    326     fn(a0, a1, a2, a3, a4, ptr, desc);
    327 
    328     tcg_temp_free_ptr(a0);
    329     tcg_temp_free_ptr(a1);
    330     tcg_temp_free_ptr(a2);
    331     tcg_temp_free_ptr(a3);
    332     tcg_temp_free_ptr(a4);
    333 }
    334 
    335 /* Return true if we want to implement something of OPRSZ bytes
    336    in units of LNSZ.  This limits the expansion of inline code.  */
    337 static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
    338 {
    339     uint32_t q, r;
    340 
    341     if (oprsz < lnsz) {
    342         return false;
    343     }
    344 
    345     q = oprsz / lnsz;
    346     r = oprsz % lnsz;
    347     tcg_debug_assert((r & 7) == 0);
    348 
    349     if (lnsz < 16) {
    350         /* For sizes below 16, accept no remainder. */
    351         if (r != 0) {
    352             return false;
    353         }
    354     } else {
    355         /*
    356          * Recall that ARM SVE allows vector sizes that are not a
    357          * power of 2, but always a multiple of 16.  The intent is
    358          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
    359          * In addition, expand_clr needs to handle a multiple of 8.
    360          * Thus we can handle the tail with one more operation per
    361          * diminishing power of 2.
    362          */
    363         q += ctpop32(r);
    364     }
    365 
    366     return q <= MAX_UNROLL;
    367 }
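        /*
         * Editor's worked example of the accounting above: for an SVE-style
         * oprsz of 80 with lnsz == 32, q == 2 and r == 16; ctpop32(16) == 1
         * adds one operation for the 16-byte tail, so q == 3 <= MAX_UNROLL
         * and inline expansion is accepted.
         */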
    368 
    369 static void expand_clr(uint32_t dofs, uint32_t maxsz);
    370 
    371 /* Duplicate C as per VECE.  */
    372 uint64_t (dup_const)(unsigned vece, uint64_t c)
    373 {
    374     switch (vece) {
    375     case MO_8:
    376         return 0x0101010101010101ull * (uint8_t)c;
    377     case MO_16:
    378         return 0x0001000100010001ull * (uint16_t)c;
    379     case MO_32:
    380         return 0x0000000100000001ull * (uint32_t)c;
    381     case MO_64:
    382         return c;
    383     default:
    384         g_assert_not_reached();
    385     }
    386 }
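        /*
         * Editor's note, worked values for the multiplications above:
         *
         *     dup_const(MO_8,  0xab)       == 0xababababababababull
         *     dup_const(MO_16, 0x1234)     == 0x1234123412341234ull
         *     dup_const(MO_32, 0xdeadbeef) == 0xdeadbeefdeadbeefull
         */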
    387 
    388 /* Duplicate IN into OUT as per VECE.  */
    389 void tcg_gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in)
    390 {
    391     switch (vece) {
    392     case MO_8:
    393         tcg_gen_ext8u_i32(out, in);
    394         tcg_gen_muli_i32(out, out, 0x01010101);
    395         break;
    396     case MO_16:
    397         tcg_gen_deposit_i32(out, in, in, 16, 16);
    398         break;
    399     case MO_32:
    400         tcg_gen_mov_i32(out, in);
    401         break;
    402     default:
    403         g_assert_not_reached();
    404     }
    405 }
    406 
    407 void tcg_gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in)
    408 {
    409     switch (vece) {
    410     case MO_8:
    411         tcg_gen_ext8u_i64(out, in);
    412         tcg_gen_muli_i64(out, out, 0x0101010101010101ull);
    413         break;
    414     case MO_16:
    415         tcg_gen_ext16u_i64(out, in);
    416         tcg_gen_muli_i64(out, out, 0x0001000100010001ull);
    417         break;
    418     case MO_32:
    419         tcg_gen_deposit_i64(out, in, in, 32, 32);
    420         break;
    421     case MO_64:
    422         tcg_gen_mov_i64(out, in);
    423         break;
    424     default:
    425         g_assert_not_reached();
    426     }
    427 }
    428 
    429 /* Select a supported vector type for implementing an operation on SIZE
    430  * bytes.  If OP is 0, assume that the real operation to be performed is
    431  * required by all backends.  Otherwise, make sure that OP can be performed
    432  * on elements of size VECE in the selected type.  Do not select V64 if
    433  * PREFER_I64 is true.  Return 0 if no vector type is selected.
    434  */
    435 static TCGType choose_vector_type(const TCGOpcode *list, unsigned vece,
    436                                   uint32_t size, bool prefer_i64)
    437 {
    438     /*
    439      * Recall that ARM SVE allows vector sizes that are not a
    440      * power of 2, but always a multiple of 16.  The intent is
    441      * that e.g. size == 80 would be expanded with 2x32 + 1x16.
    442      * It is hard to imagine a case in which v256 is supported
    443      * but v128 is not, but check anyway.
    444      * In addition, expand_clr needs to handle a multiple of 8.
    445      */
    446     if (TCG_TARGET_HAS_v256 &&
    447         check_size_impl(size, 32) &&
    448         tcg_can_emit_vecop_list(list, TCG_TYPE_V256, vece) &&
    449         (!(size & 16) ||
    450          (TCG_TARGET_HAS_v128 &&
    451           tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece))) &&
    452         (!(size & 8) ||
    453          (TCG_TARGET_HAS_v64 &&
    454           tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) {
    455         return TCG_TYPE_V256;
    456     }
    457     if (TCG_TARGET_HAS_v128 &&
    458         check_size_impl(size, 16) &&
    459         tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece) &&
    460         (!(size & 8) ||
    461          (TCG_TARGET_HAS_v64 &&
    462           tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) {
    463         return TCG_TYPE_V128;
    464     }
    465     if (TCG_TARGET_HAS_v64 && !prefer_i64 && check_size_impl(size, 8)
    466         && tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)) {
    467         return TCG_TYPE_V64;
    468     }
    469     return 0;
    470 }
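        /*
         * Editor's worked example, assuming a host with v256 and v128: for
         * size == 80, check_size_impl(80, 32) passes (2x32 + 1x16), and
         * because (80 & 16) != 0 the 16-byte tail must be expressible, so
         * TCG_TYPE_V256 is returned only if the op also works on v128.
         * With v128 alone, check_size_impl(80, 16) gives q == 5 > MAX_UNROLL
         * and the function returns 0.
         */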
    471 
    472 static void do_dup_store(TCGType type, uint32_t dofs, uint32_t oprsz,
    473                          uint32_t maxsz, TCGv_vec t_vec)
    474 {
    475     uint32_t i = 0;
    476 
    477     tcg_debug_assert(oprsz >= 8);
    478 
    479     /*
    480      * This may be expand_clr for the tail of an operation, e.g.
    481      * oprsz == 8 && maxsz == 64.  The first 8 bytes of this store
    482      * are misaligned wrt the maximum vector size, so do that first.
    483      */
    484     if (dofs & 8) {
    485         tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
    486         i += 8;
    487     }
    488 
    489     switch (type) {
    490     case TCG_TYPE_V256:
    491         /*
    492          * Recall that ARM SVE allows vector sizes that are not a
    493          * power of 2, but always a multiple of 16.  The intent is
    494          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
    495          */
    496         for (; i + 32 <= oprsz; i += 32) {
    497             tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V256);
    498         }
    499         /* fallthru */
    500     case TCG_TYPE_V128:
    501         for (; i + 16 <= oprsz; i += 16) {
    502             tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V128);
    503         }
    504         break;
    505     case TCG_TYPE_V64:
    506         for (; i < oprsz; i += 8) {
    507             tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
    508         }
    509         break;
    510     default:
    511         g_assert_not_reached();
    512     }
    513 
    514     if (oprsz < maxsz) {
    515         expand_clr(dofs + oprsz, maxsz - oprsz);
    516     }
    517 }
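        /*
         * Editor's trace with assumed values: for type == TCG_TYPE_V128,
         * (dofs & 8) != 0 and oprsz == 24, the first 8 bytes go out as a
         * V64 store, then a single 16-byte V128 store covers the rest;
         * anything between oprsz and maxsz is cleared afterwards.
         */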
    518 
    519 /* Set OPRSZ bytes at DOFS to replications of IN_32, IN_64 or IN_C.
    520  * Only one of IN_32 or IN_64 may be set;
    521  * IN_C is used if IN_32 and IN_64 are unset.
    522  */
    523 static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz,
    524                    uint32_t maxsz, TCGv_i32 in_32, TCGv_i64 in_64,
    525                    uint64_t in_c)
    526 {
    527     TCGType type;
    528     TCGv_i64 t_64;
    529     TCGv_i32 t_32, t_desc;
    530     TCGv_ptr t_ptr;
    531     uint32_t i;
    532 
    533     assert(vece <= (in_32 ? MO_32 : MO_64));
    534     assert(in_32 == NULL || in_64 == NULL);
    535 
    536     /* If we're storing 0, expand oprsz to maxsz.  */
    537     if (in_32 == NULL && in_64 == NULL) {
    538         in_c = dup_const(vece, in_c);
    539         if (in_c == 0) {
    540             oprsz = maxsz;
    541             vece = MO_8;
    542         } else if (in_c == dup_const(MO_8, in_c)) {
    543             vece = MO_8;
    544         }
    545     }
    546 
    547     /* Implement inline with a vector type, if possible.
    548      * Prefer integer when 64-bit host and no variable dup.
    549      */
    550     type = choose_vector_type(NULL, vece, oprsz,
    551                               (TCG_TARGET_REG_BITS == 64 && in_32 == NULL
    552                                && (in_64 == NULL || vece == MO_64)));
    553     if (type != 0) {
    554         TCGv_vec t_vec = tcg_temp_new_vec(type);
    555 
    556         if (in_32) {
    557             tcg_gen_dup_i32_vec(vece, t_vec, in_32);
    558         } else if (in_64) {
    559             tcg_gen_dup_i64_vec(vece, t_vec, in_64);
    560         } else {
    561             tcg_gen_dupi_vec(vece, t_vec, in_c);
    562         }
    563         do_dup_store(type, dofs, oprsz, maxsz, t_vec);
    564         tcg_temp_free_vec(t_vec);
    565         return;
    566     }
    567 
    568     /* Otherwise, inline with an integer type, unless "large".  */
    569     if (check_size_impl(oprsz, TCG_TARGET_REG_BITS / 8)) {
    570         t_64 = NULL;
    571         t_32 = NULL;
    572 
    573         if (in_32) {
    574             /* We are given a 32-bit variable input.  For a 64-bit host,
    575                use a 64-bit operation unless the 32-bit operation would
    576                be simple enough.  */
    577             if (TCG_TARGET_REG_BITS == 64
    578                 && (vece != MO_32 || !check_size_impl(oprsz, 4))) {
    579                 t_64 = tcg_temp_new_i64();
    580                 tcg_gen_extu_i32_i64(t_64, in_32);
    581                 tcg_gen_dup_i64(vece, t_64, t_64);
    582             } else {
    583                 t_32 = tcg_temp_new_i32();
    584                 tcg_gen_dup_i32(vece, t_32, in_32);
    585             }
    586         } else if (in_64) {
    587             /* We are given a 64-bit variable input.  */
    588             t_64 = tcg_temp_new_i64();
    589             tcg_gen_dup_i64(vece, t_64, in_64);
    590         } else {
    591             /* We are given a constant input.  */
    592             /* For 64-bit hosts, use 64-bit constants for "simple" constants
    593                or when we'd need too many 32-bit stores, or when a 64-bit
    594                constant is really required.  */
    595             if (vece == MO_64
    596                 || (TCG_TARGET_REG_BITS == 64
    597                     && (in_c == 0 || in_c == -1
    598                         || !check_size_impl(oprsz, 4)))) {
    599                 t_64 = tcg_constant_i64(in_c);
    600             } else {
    601                 t_32 = tcg_constant_i32(in_c);
    602             }
    603         }
    604 
    605         /* Implement inline if we picked an implementation size above.  */
    606         if (t_32) {
    607             for (i = 0; i < oprsz; i += 4) {
    608                 tcg_gen_st_i32(t_32, cpu_env, dofs + i);
    609             }
    610             tcg_temp_free_i32(t_32);
    611             goto done;
    612         }
    613         if (t_64) {
    614             for (i = 0; i < oprsz; i += 8) {
    615                 tcg_gen_st_i64(t_64, cpu_env, dofs + i);
    616             }
    617             tcg_temp_free_i64(t_64);
    618             goto done;
    619         }
    620     }
    621 
    622     /* Otherwise implement out of line.  */
    623     t_ptr = tcg_temp_new_ptr();
    624     tcg_gen_addi_ptr(t_ptr, cpu_env, dofs);
    625 
    626     /*
    627      * This may be expand_clr for the tail of an operation, e.g.
    628      * oprsz == 8 && maxsz == 64.  The size of the clear is misaligned
    629      * wrt simd_desc and will assert.  Simply pass all replicated byte
    630      * stores through to memset.
    631      */
    632     if (oprsz == maxsz && vece == MO_8) {
    633         TCGv_ptr t_size = tcg_const_ptr(oprsz);
    634         TCGv_i32 t_val;
    635 
    636         if (in_32) {
    637             t_val = in_32;
    638         } else if (in_64) {
    639             t_val = tcg_temp_new_i32();
    640             tcg_gen_extrl_i64_i32(t_val, in_64);
    641         } else {
    642             t_val = tcg_constant_i32(in_c);
    643         }
    644         gen_helper_memset(t_ptr, t_ptr, t_val, t_size);
    645 
    646         if (in_64) {
    647             tcg_temp_free_i32(t_val);
    648         }
    649         tcg_temp_free_ptr(t_size);
    650         tcg_temp_free_ptr(t_ptr);
    651         return;
    652     }
    653 
    654     t_desc = tcg_constant_i32(simd_desc(oprsz, maxsz, 0));
    655 
    656     if (vece == MO_64) {
    657         if (in_64) {
    658             gen_helper_gvec_dup64(t_ptr, t_desc, in_64);
    659         } else {
    660             t_64 = tcg_constant_i64(in_c);
    661             gen_helper_gvec_dup64(t_ptr, t_desc, t_64);
    662         }
    663     } else {
    664         typedef void dup_fn(TCGv_ptr, TCGv_i32, TCGv_i32);
    665         static dup_fn * const fns[3] = {
    666             gen_helper_gvec_dup8,
    667             gen_helper_gvec_dup16,
    668             gen_helper_gvec_dup32
    669         };
    670 
    671         if (in_32) {
    672             fns[vece](t_ptr, t_desc, in_32);
    673         } else if (in_64) {
    674             t_32 = tcg_temp_new_i32();
    675             tcg_gen_extrl_i64_i32(t_32, in_64);
    676             fns[vece](t_ptr, t_desc, t_32);
    677             tcg_temp_free_i32(t_32);
    678         } else {
    679             if (vece == MO_8) {
    680                 in_c &= 0xff;
    681             } else if (vece == MO_16) {
    682                 in_c &= 0xffff;
    683             }
    684             t_32 = tcg_constant_i32(in_c);
    685             fns[vece](t_ptr, t_desc, t_32);
    686         }
    687     }
    688 
    689     tcg_temp_free_ptr(t_ptr);
    690     return;
    691 
    692  done:
    693     if (oprsz < maxsz) {
    694         expand_clr(dofs + oprsz, maxsz - oprsz);
    695     }
    696 }
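        /*
         * Editor's sketch: the public dup/clear entry points later in this
         * file reduce to do_dup; e.g. zeroing a 64-byte register is in
         * effect (values illustrative):
         *
         *     do_dup(MO_8, dofs, 64, 64, NULL, NULL, 0);
         *
         * which takes the in_c == 0 fast path above and widens the store
         * to the full maxsz.
         */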
    697 
    698 /* Likewise, but with zero.  */
    699 static void expand_clr(uint32_t dofs, uint32_t maxsz)
    700 {
    701     do_dup(MO_8, dofs, maxsz, maxsz, NULL, NULL, 0);
    702 }
    703 
    704 /* Expand OPSZ bytes worth of two-operand operations using i32 elements.  */
    705 static void expand_2_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
    706                          bool load_dest, void (*fni)(TCGv_i32, TCGv_i32))
    707 {
    708     TCGv_i32 t0 = tcg_temp_new_i32();
    709     TCGv_i32 t1 = tcg_temp_new_i32();
    710     uint32_t i;
    711 
    712     for (i = 0; i < oprsz; i += 4) {
    713         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
    714         if (load_dest) {
    715             tcg_gen_ld_i32(t1, cpu_env, dofs + i);
    716         }
    717         fni(t1, t0);
    718         tcg_gen_st_i32(t1, cpu_env, dofs + i);
    719     }
    720     tcg_temp_free_i32(t0);
    721     tcg_temp_free_i32(t1);
    722 }
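        /*
         * Editor's note: for oprsz == 16 the loop above emits four
         * load/op/store triples; fni is the per-element generator, e.g.
         * tcg_gen_neg_i32 for a negation expansion (illustrative choice).
         */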
    723 
    724 static void expand_2i_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
    725                           int32_t c, bool load_dest,
    726                           void (*fni)(TCGv_i32, TCGv_i32, int32_t))
    727 {
    728     TCGv_i32 t0 = tcg_temp_new_i32();
    729     TCGv_i32 t1 = tcg_temp_new_i32();
    730     uint32_t i;
    731 
    732     for (i = 0; i < oprsz; i += 4) {
    733         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
    734         if (load_dest) {
    735             tcg_gen_ld_i32(t1, cpu_env, dofs + i);
    736         }
    737         fni(t1, t0, c);
    738         tcg_gen_st_i32(t1, cpu_env, dofs + i);
    739     }
    740     tcg_temp_free_i32(t0);
    741     tcg_temp_free_i32(t1);
    742 }
    743 
    744 static void expand_2s_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
    745                           TCGv_i32 c, bool scalar_first,
    746                           void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
    747 {
    748     TCGv_i32 t0 = tcg_temp_new_i32();
    749     TCGv_i32 t1 = tcg_temp_new_i32();
    750     uint32_t i;
    751 
    752     for (i = 0; i < oprsz; i += 4) {
    753         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
    754         if (scalar_first) {
    755             fni(t1, c, t0);
    756         } else {
    757             fni(t1, t0, c);
    758         }
    759         tcg_gen_st_i32(t1, cpu_env, dofs + i);
    760     }
    761     tcg_temp_free_i32(t0);
    762     tcg_temp_free_i32(t1);
    763 }
    764 
    765 /* Expand OPSZ bytes worth of three-operand operations using i32 elements.  */
    766 static void expand_3_i32(uint32_t dofs, uint32_t aofs,
    767                          uint32_t bofs, uint32_t oprsz, bool load_dest,
    768                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
    769 {
    770     TCGv_i32 t0 = tcg_temp_new_i32();
    771     TCGv_i32 t1 = tcg_temp_new_i32();
    772     TCGv_i32 t2 = tcg_temp_new_i32();
    773     uint32_t i;
    774 
    775     for (i = 0; i < oprsz; i += 4) {
    776         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
    777         tcg_gen_ld_i32(t1, cpu_env, bofs + i);
    778         if (load_dest) {
    779             tcg_gen_ld_i32(t2, cpu_env, dofs + i);
    780         }
    781         fni(t2, t0, t1);
    782         tcg_gen_st_i32(t2, cpu_env, dofs + i);
    783     }
    784     tcg_temp_free_i32(t2);
    785     tcg_temp_free_i32(t1);
    786     tcg_temp_free_i32(t0);
    787 }
    788 
    789 static void expand_3i_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
    790                           uint32_t oprsz, int32_t c, bool load_dest,
    791                           void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, int32_t))
    792 {
    793     TCGv_i32 t0 = tcg_temp_new_i32();
    794     TCGv_i32 t1 = tcg_temp_new_i32();
    795     TCGv_i32 t2 = tcg_temp_new_i32();
    796     uint32_t i;
    797 
    798     for (i = 0; i < oprsz; i += 4) {
    799         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
    800         tcg_gen_ld_i32(t1, cpu_env, bofs + i);
    801         if (load_dest) {
    802             tcg_gen_ld_i32(t2, cpu_env, dofs + i);
    803         }
    804         fni(t2, t0, t1, c);
    805         tcg_gen_st_i32(t2, cpu_env, dofs + i);
    806     }
    807     tcg_temp_free_i32(t0);
    808     tcg_temp_free_i32(t1);
    809     tcg_temp_free_i32(t2);
    810 }
    811 
    812 /* Expand OPSZ bytes worth of four-operand operations using i32 elements.  */
    813 static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
    814                          uint32_t cofs, uint32_t oprsz, bool write_aofs,
    815                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32))
    816 {
    817     TCGv_i32 t0 = tcg_temp_new_i32();
    818     TCGv_i32 t1 = tcg_temp_new_i32();
    819     TCGv_i32 t2 = tcg_temp_new_i32();
    820     TCGv_i32 t3 = tcg_temp_new_i32();
    821     uint32_t i;
    822 
    823     for (i = 0; i < oprsz; i += 4) {
    824         tcg_gen_ld_i32(t1, cpu_env, aofs + i);
    825         tcg_gen_ld_i32(t2, cpu_env, bofs + i);
    826         tcg_gen_ld_i32(t3, cpu_env, cofs + i);
    827         fni(t0, t1, t2, t3);
    828         tcg_gen_st_i32(t0, cpu_env, dofs + i);
    829         if (write_aofs) {
    830             tcg_gen_st_i32(t1, cpu_env, aofs + i);
    831         }
    832     }
    833     tcg_temp_free_i32(t3);
    834     tcg_temp_free_i32(t2);
    835     tcg_temp_free_i32(t1);
    836     tcg_temp_free_i32(t0);
    837 }
    838 
    839 static void expand_4i_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
    840                           uint32_t cofs, uint32_t oprsz, int32_t c,
    841                           void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32,
    842                                       int32_t))
    843 {
    844     TCGv_i32 t0 = tcg_temp_new_i32();
    845     TCGv_i32 t1 = tcg_temp_new_i32();
    846     TCGv_i32 t2 = tcg_temp_new_i32();
    847     TCGv_i32 t3 = tcg_temp_new_i32();
    848     uint32_t i;
    849 
    850     for (i = 0; i < oprsz; i += 4) {
    851         tcg_gen_ld_i32(t1, cpu_env, aofs + i);
    852         tcg_gen_ld_i32(t2, cpu_env, bofs + i);
    853         tcg_gen_ld_i32(t3, cpu_env, cofs + i);
    854         fni(t0, t1, t2, t3, c);
    855         tcg_gen_st_i32(t0, cpu_env, dofs + i);
    856     }
    857     tcg_temp_free_i32(t3);
    858     tcg_temp_free_i32(t2);
    859     tcg_temp_free_i32(t1);
    860     tcg_temp_free_i32(t0);
    861 }
    862 
    863 /* Expand OPSZ bytes worth of two-operand operations using i64 elements.  */
    864 static void expand_2_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
    865                          bool load_dest, void (*fni)(TCGv_i64, TCGv_i64))
    866 {
    867     TCGv_i64 t0 = tcg_temp_new_i64();
    868     TCGv_i64 t1 = tcg_temp_new_i64();
    869     uint32_t i;
    870 
    871     for (i = 0; i < oprsz; i += 8) {
    872         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
    873         if (load_dest) {
    874             tcg_gen_ld_i64(t1, cpu_env, dofs + i);
    875         }
    876         fni(t1, t0);
    877         tcg_gen_st_i64(t1, cpu_env, dofs + i);
    878     }
    879     tcg_temp_free_i64(t0);
    880     tcg_temp_free_i64(t1);
    881 }
    882 
    883 static void expand_2i_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
    884                           int64_t c, bool load_dest,
    885                           void (*fni)(TCGv_i64, TCGv_i64, int64_t))
    886 {
    887     TCGv_i64 t0 = tcg_temp_new_i64();
    888     TCGv_i64 t1 = tcg_temp_new_i64();
    889     uint32_t i;
    890 
    891     for (i = 0; i < oprsz; i += 8) {
    892         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
    893         if (load_dest) {
    894             tcg_gen_ld_i64(t1, cpu_env, dofs + i);
    895         }
    896         fni(t1, t0, c);
    897         tcg_gen_st_i64(t1, cpu_env, dofs + i);
    898     }
    899     tcg_temp_free_i64(t0);
    900     tcg_temp_free_i64(t1);
    901 }
    902 
    903 static void expand_2s_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
    904                           TCGv_i64 c, bool scalar_first,
    905                           void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
    906 {
    907     TCGv_i64 t0 = tcg_temp_new_i64();
    908     TCGv_i64 t1 = tcg_temp_new_i64();
    909     uint32_t i;
    910 
    911     for (i = 0; i < oprsz; i += 8) {
    912         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
    913         if (scalar_first) {
    914             fni(t1, c, t0);
    915         } else {
    916             fni(t1, t0, c);
    917         }
    918         tcg_gen_st_i64(t1, cpu_env, dofs + i);
    919     }
    920     tcg_temp_free_i64(t0);
    921     tcg_temp_free_i64(t1);
    922 }
    923 
    924 /* Expand OPSZ bytes worth of three-operand operations using i64 elements.  */
    925 static void expand_3_i64(uint32_t dofs, uint32_t aofs,
    926                          uint32_t bofs, uint32_t oprsz, bool load_dest,
    927                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
    928 {
    929     TCGv_i64 t0 = tcg_temp_new_i64();
    930     TCGv_i64 t1 = tcg_temp_new_i64();
    931     TCGv_i64 t2 = tcg_temp_new_i64();
    932     uint32_t i;
    933 
    934     for (i = 0; i < oprsz; i += 8) {
    935         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
    936         tcg_gen_ld_i64(t1, cpu_env, bofs + i);
    937         if (load_dest) {
    938             tcg_gen_ld_i64(t2, cpu_env, dofs + i);
    939         }
    940         fni(t2, t0, t1);
    941         tcg_gen_st_i64(t2, cpu_env, dofs + i);
    942     }
    943     tcg_temp_free_i64(t2);
    944     tcg_temp_free_i64(t1);
    945     tcg_temp_free_i64(t0);
    946 }
    947 
    948 static void expand_3i_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
    949                           uint32_t oprsz, int64_t c, bool load_dest,
    950                           void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, int64_t))
    951 {
    952     TCGv_i64 t0 = tcg_temp_new_i64();
    953     TCGv_i64 t1 = tcg_temp_new_i64();
    954     TCGv_i64 t2 = tcg_temp_new_i64();
    955     uint32_t i;
    956 
    957     for (i = 0; i < oprsz; i += 8) {
    958         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
    959         tcg_gen_ld_i64(t1, cpu_env, bofs + i);
    960         if (load_dest) {
    961             tcg_gen_ld_i64(t2, cpu_env, dofs + i);
    962         }
    963         fni(t2, t0, t1, c);
    964         tcg_gen_st_i64(t2, cpu_env, dofs + i);
    965     }
    966     tcg_temp_free_i64(t0);
    967     tcg_temp_free_i64(t1);
    968     tcg_temp_free_i64(t2);
    969 }
    970 
    971 /* Expand OPSZ bytes worth of four-operand operations using i64 elements.  */
    972 static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
    973                          uint32_t cofs, uint32_t oprsz, bool write_aofs,
    974                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64))
    975 {
    976     TCGv_i64 t0 = tcg_temp_new_i64();
    977     TCGv_i64 t1 = tcg_temp_new_i64();
    978     TCGv_i64 t2 = tcg_temp_new_i64();
    979     TCGv_i64 t3 = tcg_temp_new_i64();
    980     uint32_t i;
    981 
    982     for (i = 0; i < oprsz; i += 8) {
    983         tcg_gen_ld_i64(t1, cpu_env, aofs + i);
    984         tcg_gen_ld_i64(t2, cpu_env, bofs + i);
    985         tcg_gen_ld_i64(t3, cpu_env, cofs + i);
    986         fni(t0, t1, t2, t3);
    987         tcg_gen_st_i64(t0, cpu_env, dofs + i);
    988         if (write_aofs) {
    989             tcg_gen_st_i64(t1, cpu_env, aofs + i);
    990         }
    991     }
    992     tcg_temp_free_i64(t3);
    993     tcg_temp_free_i64(t2);
    994     tcg_temp_free_i64(t1);
    995     tcg_temp_free_i64(t0);
    996 }
    997 
    998 static void expand_4i_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
    999                           uint32_t cofs, uint32_t oprsz, int64_t c,
   1000                           void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64,
   1001                                       int64_t))
   1002 {
   1003     TCGv_i64 t0 = tcg_temp_new_i64();
   1004     TCGv_i64 t1 = tcg_temp_new_i64();
   1005     TCGv_i64 t2 = tcg_temp_new_i64();
   1006     TCGv_i64 t3 = tcg_temp_new_i64();
   1007     uint32_t i;
   1008 
   1009     for (i = 0; i < oprsz; i += 8) {
   1010         tcg_gen_ld_i64(t1, cpu_env, aofs + i);
   1011         tcg_gen_ld_i64(t2, cpu_env, bofs + i);
   1012         tcg_gen_ld_i64(t3, cpu_env, cofs + i);
   1013         fni(t0, t1, t2, t3, c);
   1014         tcg_gen_st_i64(t0, cpu_env, dofs + i);
   1015     }
   1016     tcg_temp_free_i64(t3);
   1017     tcg_temp_free_i64(t2);
   1018     tcg_temp_free_i64(t1);
   1019     tcg_temp_free_i64(t0);
   1020 }
   1021 
   1022 /* Expand OPSZ bytes worth of two-operand operations using host vectors.  */
   1023 static void expand_2_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
   1024                          uint32_t oprsz, uint32_t tysz, TCGType type,
   1025                          bool load_dest,
   1026                          void (*fni)(unsigned, TCGv_vec, TCGv_vec))
   1027 {
   1028     TCGv_vec t0 = tcg_temp_new_vec(type);
   1029     TCGv_vec t1 = tcg_temp_new_vec(type);
   1030     uint32_t i;
   1031 
   1032     for (i = 0; i < oprsz; i += tysz) {
   1033         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
   1034         if (load_dest) {
   1035             tcg_gen_ld_vec(t1, cpu_env, dofs + i);
   1036         }
   1037         fni(vece, t1, t0);
   1038         tcg_gen_st_vec(t1, cpu_env, dofs + i);
   1039     }
   1040     tcg_temp_free_vec(t0);
   1041     tcg_temp_free_vec(t1);
   1042 }
   1043 
   1044 /* Expand OPSZ bytes worth of two-vector operands and an immediate operand
   1045    using host vectors.  */
   1046 static void expand_2i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
   1047                           uint32_t oprsz, uint32_t tysz, TCGType type,
   1048                           int64_t c, bool load_dest,
   1049                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, int64_t))
   1050 {
   1051     TCGv_vec t0 = tcg_temp_new_vec(type);
   1052     TCGv_vec t1 = tcg_temp_new_vec(type);
   1053     uint32_t i;
   1054 
   1055     for (i = 0; i < oprsz; i += tysz) {
   1056         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
   1057         if (load_dest) {
   1058             tcg_gen_ld_vec(t1, cpu_env, dofs + i);
   1059         }
   1060         fni(vece, t1, t0, c);
   1061         tcg_gen_st_vec(t1, cpu_env, dofs + i);
   1062     }
   1063     tcg_temp_free_vec(t0);
   1064     tcg_temp_free_vec(t1);
   1065 }
   1066 
   1067 static void expand_2s_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
   1068                           uint32_t oprsz, uint32_t tysz, TCGType type,
   1069                           TCGv_vec c, bool scalar_first,
   1070                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
   1071 {
   1072     TCGv_vec t0 = tcg_temp_new_vec(type);
   1073     TCGv_vec t1 = tcg_temp_new_vec(type);
   1074     uint32_t i;
   1075 
   1076     for (i = 0; i < oprsz; i += tysz) {
   1077         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
   1078         if (scalar_first) {
   1079             fni(vece, t1, c, t0);
   1080         } else {
   1081             fni(vece, t1, t0, c);
   1082         }
   1083         tcg_gen_st_vec(t1, cpu_env, dofs + i);
   1084     }
   1085     tcg_temp_free_vec(t0);
   1086     tcg_temp_free_vec(t1);
   1087 }
   1088 
   1089 /* Expand OPSZ bytes worth of three-operand operations using host vectors.  */
   1090 static void expand_3_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
   1091                          uint32_t bofs, uint32_t oprsz,
   1092                          uint32_t tysz, TCGType type, bool load_dest,
   1093                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
   1094 {
   1095     TCGv_vec t0 = tcg_temp_new_vec(type);
   1096     TCGv_vec t1 = tcg_temp_new_vec(type);
   1097     TCGv_vec t2 = tcg_temp_new_vec(type);
   1098     uint32_t i;
   1099 
   1100     for (i = 0; i < oprsz; i += tysz) {
   1101         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
   1102         tcg_gen_ld_vec(t1, cpu_env, bofs + i);
   1103         if (load_dest) {
   1104             tcg_gen_ld_vec(t2, cpu_env, dofs + i);
   1105         }
   1106         fni(vece, t2, t0, t1);
   1107         tcg_gen_st_vec(t2, cpu_env, dofs + i);
   1108     }
   1109     tcg_temp_free_vec(t2);
   1110     tcg_temp_free_vec(t1);
   1111     tcg_temp_free_vec(t0);
   1112 }
   1113 
   1114 /*
   1115  * Expand OPSZ bytes worth of three-vector operands and an immediate operand
   1116  * using host vectors.
   1117  */
   1118 static void expand_3i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
   1119                           uint32_t bofs, uint32_t oprsz, uint32_t tysz,
   1120                           TCGType type, int64_t c, bool load_dest,
   1121                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec,
   1122                                       int64_t))
   1123 {
   1124     TCGv_vec t0 = tcg_temp_new_vec(type);
   1125     TCGv_vec t1 = tcg_temp_new_vec(type);
   1126     TCGv_vec t2 = tcg_temp_new_vec(type);
   1127     uint32_t i;
   1128 
   1129     for (i = 0; i < oprsz; i += tysz) {
   1130         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
   1131         tcg_gen_ld_vec(t1, cpu_env, bofs + i);
   1132         if (load_dest) {
   1133             tcg_gen_ld_vec(t2, cpu_env, dofs + i);
   1134         }
   1135         fni(vece, t2, t0, t1, c);
   1136         tcg_gen_st_vec(t2, cpu_env, dofs + i);
   1137     }
   1138     tcg_temp_free_vec(t0);
   1139     tcg_temp_free_vec(t1);
   1140     tcg_temp_free_vec(t2);
   1141 }
   1142 
   1143 /* Expand OPSZ bytes worth of four-operand operations using host vectors.  */
   1144 static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
   1145                          uint32_t bofs, uint32_t cofs, uint32_t oprsz,
   1146                          uint32_t tysz, TCGType type, bool write_aofs,
   1147                          void (*fni)(unsigned, TCGv_vec, TCGv_vec,
   1148                                      TCGv_vec, TCGv_vec))
   1149 {
   1150     TCGv_vec t0 = tcg_temp_new_vec(type);
   1151     TCGv_vec t1 = tcg_temp_new_vec(type);
   1152     TCGv_vec t2 = tcg_temp_new_vec(type);
   1153     TCGv_vec t3 = tcg_temp_new_vec(type);
   1154     uint32_t i;
   1155 
   1156     for (i = 0; i < oprsz; i += tysz) {
   1157         tcg_gen_ld_vec(t1, cpu_env, aofs + i);
   1158         tcg_gen_ld_vec(t2, cpu_env, bofs + i);
   1159         tcg_gen_ld_vec(t3, cpu_env, cofs + i);
   1160         fni(vece, t0, t1, t2, t3);
   1161         tcg_gen_st_vec(t0, cpu_env, dofs + i);
   1162         if (write_aofs) {
   1163             tcg_gen_st_vec(t1, cpu_env, aofs + i);
   1164         }
   1165     }
   1166     tcg_temp_free_vec(t3);
   1167     tcg_temp_free_vec(t2);
   1168     tcg_temp_free_vec(t1);
   1169     tcg_temp_free_vec(t0);
   1170 }
   1171 
   1172 /*
   1173  * Expand OPSZ bytes worth of four-vector operands and an immediate operand
   1174  * using host vectors.
   1175  */
   1176 static void expand_4i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
   1177                           uint32_t bofs, uint32_t cofs, uint32_t oprsz,
   1178                           uint32_t tysz, TCGType type, int64_t c,
   1179                           void (*fni)(unsigned, TCGv_vec, TCGv_vec,
   1180                                      TCGv_vec, TCGv_vec, int64_t))
   1181 {
   1182     TCGv_vec t0 = tcg_temp_new_vec(type);
   1183     TCGv_vec t1 = tcg_temp_new_vec(type);
   1184     TCGv_vec t2 = tcg_temp_new_vec(type);
   1185     TCGv_vec t3 = tcg_temp_new_vec(type);
   1186     uint32_t i;
   1187 
   1188     for (i = 0; i < oprsz; i += tysz) {
   1189         tcg_gen_ld_vec(t1, cpu_env, aofs + i);
   1190         tcg_gen_ld_vec(t2, cpu_env, bofs + i);
   1191         tcg_gen_ld_vec(t3, cpu_env, cofs + i);
   1192         fni(vece, t0, t1, t2, t3, c);
   1193         tcg_gen_st_vec(t0, cpu_env, dofs + i);
   1194     }
   1195     tcg_temp_free_vec(t3);
   1196     tcg_temp_free_vec(t2);
   1197     tcg_temp_free_vec(t1);
   1198     tcg_temp_free_vec(t0);
   1199 }
   1200 
   1201 /* Expand a vector two-operand operation.  */
   1202 void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
   1203                     uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g)
   1204 {
   1205     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
   1206     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
   1207     TCGType type;
   1208     uint32_t some;
   1209 
   1210     check_size_align(oprsz, maxsz, dofs | aofs);
   1211     check_overlap_2(dofs, aofs, maxsz);
   1212 
   1213     type = 0;
   1214     if (g->fniv) {
   1215         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
   1216     }
   1217     switch (type) {
   1218     case TCG_TYPE_V256:
   1219         /* Recall that ARM SVE allows vector sizes that are not a
   1220          * power of 2, but always a multiple of 16.  The intent is
   1221          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
   1222          */
   1223         some = QEMU_ALIGN_DOWN(oprsz, 32);
   1224         expand_2_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
   1225                      g->load_dest, g->fniv);
   1226         if (some == oprsz) {
   1227             break;
   1228         }
   1229         dofs += some;
   1230         aofs += some;
   1231         oprsz -= some;
   1232         maxsz -= some;
   1233         /* fallthru */
   1234     case TCG_TYPE_V128:
   1235         expand_2_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
   1236                      g->load_dest, g->fniv);
   1237         break;
   1238     case TCG_TYPE_V64:
   1239         expand_2_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
   1240                      g->load_dest, g->fniv);
   1241         break;
   1242 
   1243     case 0:
   1244         if (g->fni8 && check_size_impl(oprsz, 8)) {
   1245             expand_2_i64(dofs, aofs, oprsz, g->load_dest, g->fni8);
   1246         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
   1247             expand_2_i32(dofs, aofs, oprsz, g->load_dest, g->fni4);
   1248         } else {
   1249             assert(g->fno != NULL);
   1250             tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, g->data, g->fno);
   1251             oprsz = maxsz;
   1252         }
   1253         break;
   1254 
   1255     default:
   1256         g_assert_not_reached();
   1257     }
   1258     tcg_swap_vecop_list(hold_list);
   1259 
   1260     if (oprsz < maxsz) {
   1261         expand_clr(dofs + oprsz, maxsz - oprsz);
   1262     }
   1263 }
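        /*
         * Editor's sketch of driving the expander above, mirroring how
         * tcg_gen_gvec_not is built later in this file; the field values
         * are illustrative:
         *
         *     static const GVecGen2 g = {
         *         .fni8 = tcg_gen_not_i64,
         *         .fniv = tcg_gen_not_vec,
         *         .fno = gen_helper_gvec_not,
         *         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
         *     };
         *     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
         */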
   1264 
   1265 /* Expand a vector operation with two vectors and an immediate.  */
   1266 void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
   1267                      uint32_t maxsz, int64_t c, const GVecGen2i *g)
   1268 {
   1269     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
   1270     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
   1271     TCGType type;
   1272     uint32_t some;
   1273 
   1274     check_size_align(oprsz, maxsz, dofs | aofs);
   1275     check_overlap_2(dofs, aofs, maxsz);
   1276 
   1277     type = 0;
   1278     if (g->fniv) {
   1279         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
   1280     }
   1281     switch (type) {
   1282     case TCG_TYPE_V256:
   1283         /* Recall that ARM SVE allows vector sizes that are not a
   1284          * power of 2, but always a multiple of 16.  The intent is
   1285          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
   1286          */
   1287         some = QEMU_ALIGN_DOWN(oprsz, 32);
   1288         expand_2i_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
   1289                       c, g->load_dest, g->fniv);
   1290         if (some == oprsz) {
   1291             break;
   1292         }
   1293         dofs += some;
   1294         aofs += some;
   1295         oprsz -= some;
   1296         maxsz -= some;
   1297         /* fallthru */
   1298     case TCG_TYPE_V128:
   1299         expand_2i_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
   1300                       c, g->load_dest, g->fniv);
   1301         break;
   1302     case TCG_TYPE_V64:
   1303         expand_2i_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
   1304                       c, g->load_dest, g->fniv);
   1305         break;
   1306 
   1307     case 0:
   1308         if (g->fni8 && check_size_impl(oprsz, 8)) {
   1309             expand_2i_i64(dofs, aofs, oprsz, c, g->load_dest, g->fni8);
   1310         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
   1311             expand_2i_i32(dofs, aofs, oprsz, c, g->load_dest, g->fni4);
   1312         } else {
   1313             if (g->fno) {
   1314                 tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, c, g->fno);
   1315             } else {
   1316                 TCGv_i64 tcg_c = tcg_constant_i64(c);
   1317                 tcg_gen_gvec_2i_ool(dofs, aofs, tcg_c, oprsz,
   1318                                     maxsz, c, g->fnoi);
   1319             }
   1320             oprsz = maxsz;
   1321         }
   1322         break;
   1323 
   1324     default:
   1325         g_assert_not_reached();
   1326     }
   1327     tcg_swap_vecop_list(hold_list);
   1328 
   1329     if (oprsz < maxsz) {
   1330         expand_clr(dofs + oprsz, maxsz - oprsz);
   1331     }
   1332 }
   1333 
   1334 /* Expand a vector operation with two vectors and a scalar.  */
   1335 void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
   1336                      uint32_t maxsz, TCGv_i64 c, const GVecGen2s *g)
   1337 {
   1338     TCGType type;
   1339 
   1340     check_size_align(oprsz, maxsz, dofs | aofs);
   1341     check_overlap_2(dofs, aofs, maxsz);
   1342 
   1343     type = 0;
   1344     if (g->fniv) {
   1345         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
   1346     }
   1347     if (type != 0) {
   1348         const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
   1349         const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
   1350         TCGv_vec t_vec = tcg_temp_new_vec(type);
   1351         uint32_t some;
   1352 
   1353         tcg_gen_dup_i64_vec(g->vece, t_vec, c);
   1354 
   1355         switch (type) {
   1356         case TCG_TYPE_V256:
   1357             /* Recall that ARM SVE allows vector sizes that are not a
   1358              * power of 2, but always a multiple of 16.  The intent is
   1359              * that e.g. size == 80 would be expanded with 2x32 + 1x16.
   1360              */
   1361             some = QEMU_ALIGN_DOWN(oprsz, 32);
   1362             expand_2s_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
   1363                           t_vec, g->scalar_first, g->fniv);
   1364             if (some == oprsz) {
   1365                 break;
   1366             }
   1367             dofs += some;
   1368             aofs += some;
   1369             oprsz -= some;
   1370             maxsz -= some;
   1371             /* fallthru */
   1372 
   1373         case TCG_TYPE_V128:
   1374             expand_2s_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
   1375                           t_vec, g->scalar_first, g->fniv);
   1376             break;
   1377 
   1378         case TCG_TYPE_V64:
   1379             expand_2s_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
   1380                           t_vec, g->scalar_first, g->fniv);
   1381             break;
   1382 
   1383         default:
   1384             g_assert_not_reached();
   1385         }
   1386         tcg_temp_free_vec(t_vec);
   1387         tcg_swap_vecop_list(hold_list);
   1388     } else if (g->fni8 && check_size_impl(oprsz, 8)) {
   1389         TCGv_i64 t64 = tcg_temp_new_i64();
   1390 
   1391         tcg_gen_dup_i64(g->vece, t64, c);
   1392         expand_2s_i64(dofs, aofs, oprsz, t64, g->scalar_first, g->fni8);
   1393         tcg_temp_free_i64(t64);
   1394     } else if (g->fni4 && check_size_impl(oprsz, 4)) {
   1395         TCGv_i32 t32 = tcg_temp_new_i32();
   1396 
   1397         tcg_gen_extrl_i64_i32(t32, c);
   1398         tcg_gen_dup_i32(g->vece, t32, t32);
   1399         expand_2s_i32(dofs, aofs, oprsz, t32, g->scalar_first, g->fni4);
   1400         tcg_temp_free_i32(t32);
   1401     } else {
   1402         tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, 0, g->fno);
   1403         return;
   1404     }
   1405 
   1406     if (oprsz < maxsz) {
   1407         expand_clr(dofs + oprsz, maxsz - oprsz);
   1408     }
   1409 }
   1410 
   1411 /* Expand a vector three-operand operation.  */
   1412 void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
   1413                     uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g)
   1414 {
   1415     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
   1416     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
   1417     TCGType type;
   1418     uint32_t some;
   1419 
   1420     check_size_align(oprsz, maxsz, dofs | aofs | bofs);
   1421     check_overlap_3(dofs, aofs, bofs, maxsz);
   1422 
   1423     type = 0;
   1424     if (g->fniv) {
   1425         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
   1426     }
   1427     switch (type) {
   1428     case TCG_TYPE_V256:
   1429         /* Recall that ARM SVE allows vector sizes that are not a
   1430          * power of 2, but always a multiple of 16.  The intent is
   1431          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
   1432          */
   1433         some = QEMU_ALIGN_DOWN(oprsz, 32);
   1434         expand_3_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
   1435                      g->load_dest, g->fniv);
   1436         if (some == oprsz) {
   1437             break;
   1438         }
   1439         dofs += some;
   1440         aofs += some;
   1441         bofs += some;
   1442         oprsz -= some;
   1443         maxsz -= some;
   1444         /* fallthru */
   1445     case TCG_TYPE_V128:
   1446         expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
   1447                      g->load_dest, g->fniv);
   1448         break;
   1449     case TCG_TYPE_V64:
   1450         expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
   1451                      g->load_dest, g->fniv);
   1452         break;
   1453 
   1454     case 0:
   1455         if (g->fni8 && check_size_impl(oprsz, 8)) {
   1456             expand_3_i64(dofs, aofs, bofs, oprsz, g->load_dest, g->fni8);
   1457         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
   1458             expand_3_i32(dofs, aofs, bofs, oprsz, g->load_dest, g->fni4);
   1459         } else {
   1460             assert(g->fno != NULL);
   1461             tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz,
   1462                                maxsz, g->data, g->fno);
   1463             oprsz = maxsz;
   1464         }
   1465         break;
   1466 
   1467     default:
   1468         g_assert_not_reached();
   1469     }
   1470     tcg_swap_vecop_list(hold_list);
   1471 
   1472     if (oprsz < maxsz) {
   1473         expand_clr(dofs + oprsz, maxsz - oprsz);
   1474     }
   1475 }
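
         /*
          * Usage sketch, illustrative only: a front end describes one
          * operation up to three ways -- 64-bit integer (.fni8), host
          * vector (.fniv) and out-of-line helper (.fno) -- and
          * tcg_gen_gvec_3 picks the best expansion.  This mirrors the
          * real wrappers later in this file (compare tcg_gen_gvec_xor).
          */
         static void G_GNUC_UNUSED gvec_3_usage_sketch(uint32_t dofs,
                                                       uint32_t aofs,
                                                       uint32_t bofs,
                                                       uint32_t oprsz,
                                                       uint32_t maxsz)
         {
             static const GVecGen3 g = {
                 .fni8 = tcg_gen_xor_i64,    /* integral fallback */
                 .fniv = tcg_gen_xor_vec,    /* host vector expansion */
                 .fno = gen_helper_gvec_xor, /* out-of-line helper */
                 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
             };
             tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
         }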
   1476 
   1477 /* Expand a vector operation with three vectors and an immediate.  */
   1478 void tcg_gen_gvec_3i(uint32_t dofs, uint32_t aofs, uint32_t bofs,
   1479                      uint32_t oprsz, uint32_t maxsz, int64_t c,
   1480                      const GVecGen3i *g)
   1481 {
   1482     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
   1483     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
   1484     TCGType type;
   1485     uint32_t some;
   1486 
   1487     check_size_align(oprsz, maxsz, dofs | aofs | bofs);
   1488     check_overlap_3(dofs, aofs, bofs, maxsz);
   1489 
   1490     type = 0;
   1491     if (g->fniv) {
   1492         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
   1493     }
   1494     switch (type) {
   1495     case TCG_TYPE_V256:
   1496         /*
   1497          * Recall that ARM SVE allows vector sizes that are not a
   1498          * power of 2, but always a multiple of 16.  The intent is
   1499          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
   1500          */
   1501         some = QEMU_ALIGN_DOWN(oprsz, 32);
   1502         expand_3i_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
   1503                       c, g->load_dest, g->fniv);
   1504         if (some == oprsz) {
   1505             break;
   1506         }
   1507         dofs += some;
   1508         aofs += some;
   1509         bofs += some;
   1510         oprsz -= some;
   1511         maxsz -= some;
   1512         /* fallthru */
   1513     case TCG_TYPE_V128:
   1514         expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
   1515                       c, g->load_dest, g->fniv);
   1516         break;
   1517     case TCG_TYPE_V64:
   1518         expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
   1519                       c, g->load_dest, g->fniv);
   1520         break;
   1521 
   1522     case 0:
   1523         if (g->fni8 && check_size_impl(oprsz, 8)) {
   1524             expand_3i_i64(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni8);
   1525         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
   1526             expand_3i_i32(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni4);
   1527         } else {
   1528             assert(g->fno != NULL);
   1529             tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, c, g->fno);
   1530             oprsz = maxsz;
   1531         }
   1532         break;
   1533 
   1534     default:
   1535         g_assert_not_reached();
   1536     }
   1537     tcg_swap_vecop_list(hold_list);
   1538 
   1539     if (oprsz < maxsz) {
   1540         expand_clr(dofs + oprsz, maxsz - oprsz);
   1541     }
   1542 }
   1543 
   1544 /* Expand a vector four-operand operation.  */
   1545 void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
   1546                     uint32_t oprsz, uint32_t maxsz, const GVecGen4 *g)
   1547 {
   1548     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
   1549     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
   1550     TCGType type;
   1551     uint32_t some;
   1552 
   1553     check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs);
   1554     check_overlap_4(dofs, aofs, bofs, cofs, maxsz);
   1555 
   1556     type = 0;
   1557     if (g->fniv) {
   1558         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
   1559     }
   1560     switch (type) {
   1561     case TCG_TYPE_V256:
   1562         /* Recall that ARM SVE allows vector sizes that are not a
   1563          * power of 2, but always a multiple of 16.  The intent is
   1564          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
   1565          */
   1566         some = QEMU_ALIGN_DOWN(oprsz, 32);
   1567         expand_4_vec(g->vece, dofs, aofs, bofs, cofs, some,
   1568                      32, TCG_TYPE_V256, g->write_aofs, g->fniv);
   1569         if (some == oprsz) {
   1570             break;
   1571         }
   1572         dofs += some;
   1573         aofs += some;
   1574         bofs += some;
   1575         cofs += some;
   1576         oprsz -= some;
   1577         maxsz -= some;
   1578         /* fallthru */
   1579     case TCG_TYPE_V128:
   1580         expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
   1581                      16, TCG_TYPE_V128, g->write_aofs, g->fniv);
   1582         break;
   1583     case TCG_TYPE_V64:
   1584         expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
   1585                      8, TCG_TYPE_V64, g->write_aofs, g->fniv);
   1586         break;
   1587 
   1588     case 0:
   1589         if (g->fni8 && check_size_impl(oprsz, 8)) {
   1590             expand_4_i64(dofs, aofs, bofs, cofs, oprsz,
   1591                          g->write_aofs, g->fni8);
   1592         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
   1593             expand_4_i32(dofs, aofs, bofs, cofs, oprsz,
   1594                          g->write_aofs, g->fni4);
   1595         } else {
   1596             assert(g->fno != NULL);
   1597             tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs,
   1598                                oprsz, maxsz, g->data, g->fno);
   1599             oprsz = maxsz;
   1600         }
   1601         break;
   1602 
   1603     default:
   1604         g_assert_not_reached();
   1605     }
   1606     tcg_swap_vecop_list(hold_list);
   1607 
   1608     if (oprsz < maxsz) {
   1609         expand_clr(dofs + oprsz, maxsz - oprsz);
   1610     }
   1611 }
   1612 
    1613 /* Expand a vector operation with four vectors and an immediate.  */
   1614 void tcg_gen_gvec_4i(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
   1615                      uint32_t oprsz, uint32_t maxsz, int64_t c,
   1616                      const GVecGen4i *g)
   1617 {
   1618     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
   1619     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
   1620     TCGType type;
   1621     uint32_t some;
   1622 
   1623     check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs);
   1624     check_overlap_4(dofs, aofs, bofs, cofs, maxsz);
   1625 
   1626     type = 0;
   1627     if (g->fniv) {
   1628         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
   1629     }
   1630     switch (type) {
   1631     case TCG_TYPE_V256:
   1632         /*
   1633          * Recall that ARM SVE allows vector sizes that are not a
   1634          * power of 2, but always a multiple of 16.  The intent is
   1635          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
   1636          */
   1637         some = QEMU_ALIGN_DOWN(oprsz, 32);
   1638         expand_4i_vec(g->vece, dofs, aofs, bofs, cofs, some,
   1639                       32, TCG_TYPE_V256, c, g->fniv);
   1640         if (some == oprsz) {
   1641             break;
   1642         }
   1643         dofs += some;
   1644         aofs += some;
   1645         bofs += some;
   1646         cofs += some;
   1647         oprsz -= some;
   1648         maxsz -= some;
   1649         /* fallthru */
   1650     case TCG_TYPE_V128:
   1651         expand_4i_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
    1652                       16, TCG_TYPE_V128, c, g->fniv);
   1653         break;
   1654     case TCG_TYPE_V64:
   1655         expand_4i_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
   1656                       8, TCG_TYPE_V64, c, g->fniv);
   1657         break;
   1658 
   1659     case 0:
   1660         if (g->fni8 && check_size_impl(oprsz, 8)) {
   1661             expand_4i_i64(dofs, aofs, bofs, cofs, oprsz, c, g->fni8);
   1662         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
   1663             expand_4i_i32(dofs, aofs, bofs, cofs, oprsz, c, g->fni4);
   1664         } else {
   1665             assert(g->fno != NULL);
   1666             tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs,
   1667                                oprsz, maxsz, c, g->fno);
   1668             oprsz = maxsz;
   1669         }
   1670         break;
   1671 
   1672     default:
   1673         g_assert_not_reached();
   1674     }
   1675     tcg_swap_vecop_list(hold_list);
   1676 
   1677     if (oprsz < maxsz) {
   1678         expand_clr(dofs + oprsz, maxsz - oprsz);
   1679     }
   1680 }
   1681 
   1682 /*
   1683  * Expand specific vector operations.
   1684  */
   1685 
   1686 static void vec_mov2(unsigned vece, TCGv_vec a, TCGv_vec b)
   1687 {
   1688     tcg_gen_mov_vec(a, b);
   1689 }
   1690 
   1691 void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs,
   1692                       uint32_t oprsz, uint32_t maxsz)
   1693 {
   1694     static const GVecGen2 g = {
   1695         .fni8 = tcg_gen_mov_i64,
   1696         .fniv = vec_mov2,
   1697         .fno = gen_helper_gvec_mov,
   1698         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
   1699     };
   1700     if (dofs != aofs) {
   1701         tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
   1702     } else {
   1703         check_size_align(oprsz, maxsz, dofs);
   1704         if (oprsz < maxsz) {
   1705             expand_clr(dofs + oprsz, maxsz - oprsz);
   1706         }
   1707     }
   1708 }
   1709 
   1710 void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t oprsz,
   1711                           uint32_t maxsz, TCGv_i32 in)
   1712 {
   1713     check_size_align(oprsz, maxsz, dofs);
   1714     tcg_debug_assert(vece <= MO_32);
   1715     do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
   1716 }
   1717 
   1718 void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t oprsz,
   1719                           uint32_t maxsz, TCGv_i64 in)
   1720 {
   1721     check_size_align(oprsz, maxsz, dofs);
   1722     tcg_debug_assert(vece <= MO_64);
   1723     do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
   1724 }
   1725 
   1726 void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
   1727                           uint32_t oprsz, uint32_t maxsz)
   1728 {
   1729     check_size_align(oprsz, maxsz, dofs);
   1730     if (vece <= MO_64) {
   1731         TCGType type = choose_vector_type(NULL, vece, oprsz, 0);
   1732         if (type != 0) {
   1733             TCGv_vec t_vec = tcg_temp_new_vec(type);
   1734             tcg_gen_dup_mem_vec(vece, t_vec, cpu_env, aofs);
   1735             do_dup_store(type, dofs, oprsz, maxsz, t_vec);
   1736             tcg_temp_free_vec(t_vec);
   1737         } else if (vece <= MO_32) {
   1738             TCGv_i32 in = tcg_temp_new_i32();
   1739             switch (vece) {
   1740             case MO_8:
   1741                 tcg_gen_ld8u_i32(in, cpu_env, aofs);
   1742                 break;
   1743             case MO_16:
   1744                 tcg_gen_ld16u_i32(in, cpu_env, aofs);
   1745                 break;
   1746             default:
   1747                 tcg_gen_ld_i32(in, cpu_env, aofs);
   1748                 break;
   1749             }
   1750             do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
   1751             tcg_temp_free_i32(in);
   1752         } else {
   1753             TCGv_i64 in = tcg_temp_new_i64();
   1754             tcg_gen_ld_i64(in, cpu_env, aofs);
   1755             do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
   1756             tcg_temp_free_i64(in);
   1757         }
   1758     } else if (vece == 4) {
   1759         /* 128-bit duplicate.  */
   1760         int i;
   1761 
   1762         tcg_debug_assert(oprsz >= 16);
   1763         if (TCG_TARGET_HAS_v128) {
   1764             TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V128);
   1765 
   1766             tcg_gen_ld_vec(in, cpu_env, aofs);
   1767             for (i = (aofs == dofs) * 16; i < oprsz; i += 16) {
   1768                 tcg_gen_st_vec(in, cpu_env, dofs + i);
   1769             }
   1770             tcg_temp_free_vec(in);
   1771         } else {
   1772             TCGv_i64 in0 = tcg_temp_new_i64();
   1773             TCGv_i64 in1 = tcg_temp_new_i64();
   1774 
   1775             tcg_gen_ld_i64(in0, cpu_env, aofs);
   1776             tcg_gen_ld_i64(in1, cpu_env, aofs + 8);
   1777             for (i = (aofs == dofs) * 16; i < oprsz; i += 16) {
   1778                 tcg_gen_st_i64(in0, cpu_env, dofs + i);
   1779                 tcg_gen_st_i64(in1, cpu_env, dofs + i + 8);
   1780             }
   1781             tcg_temp_free_i64(in0);
   1782             tcg_temp_free_i64(in1);
   1783         }
   1784         if (oprsz < maxsz) {
   1785             expand_clr(dofs + oprsz, maxsz - oprsz);
   1786         }
   1787     } else if (vece == 5) {
   1788         /* 256-bit duplicate.  */
   1789         int i;
   1790 
   1791         tcg_debug_assert(oprsz >= 32);
   1792         tcg_debug_assert(oprsz % 32 == 0);
   1793         if (TCG_TARGET_HAS_v256) {
   1794             TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V256);
   1795 
   1796             tcg_gen_ld_vec(in, cpu_env, aofs);
   1797             for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
   1798                 tcg_gen_st_vec(in, cpu_env, dofs + i);
   1799             }
   1800             tcg_temp_free_vec(in);
   1801         } else if (TCG_TARGET_HAS_v128) {
   1802             TCGv_vec in0 = tcg_temp_new_vec(TCG_TYPE_V128);
   1803             TCGv_vec in1 = tcg_temp_new_vec(TCG_TYPE_V128);
   1804 
   1805             tcg_gen_ld_vec(in0, cpu_env, aofs);
   1806             tcg_gen_ld_vec(in1, cpu_env, aofs + 16);
   1807             for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
   1808                 tcg_gen_st_vec(in0, cpu_env, dofs + i);
   1809                 tcg_gen_st_vec(in1, cpu_env, dofs + i + 16);
   1810             }
   1811             tcg_temp_free_vec(in0);
   1812             tcg_temp_free_vec(in1);
   1813         } else {
   1814             TCGv_i64 in[4];
   1815             int j;
   1816 
   1817             for (j = 0; j < 4; ++j) {
   1818                 in[j] = tcg_temp_new_i64();
   1819                 tcg_gen_ld_i64(in[j], cpu_env, aofs + j * 8);
   1820             }
   1821             for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
   1822                 for (j = 0; j < 4; ++j) {
   1823                     tcg_gen_st_i64(in[j], cpu_env, dofs + i + j * 8);
   1824                 }
   1825             }
   1826             for (j = 0; j < 4; ++j) {
   1827                 tcg_temp_free_i64(in[j]);
   1828             }
   1829         }
   1830         if (oprsz < maxsz) {
   1831             expand_clr(dofs + oprsz, maxsz - oprsz);
   1832         }
   1833     } else {
   1834         g_assert_not_reached();
   1835     }
   1836 }
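
         /*
          * Note the loops above start at (aofs == dofs) * 16 (or * 32):
          * when the duplicate is performed in place, the first element
          * already holds the value and only the copies above it need to
          * be stored.
          */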
   1837 
   1838 void tcg_gen_gvec_dup_imm(unsigned vece, uint32_t dofs, uint32_t oprsz,
   1839                           uint32_t maxsz, uint64_t x)
   1840 {
   1841     check_size_align(oprsz, maxsz, dofs);
   1842     do_dup(vece, dofs, oprsz, maxsz, NULL, NULL, x);
   1843 }
   1844 
   1845 void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs,
   1846                       uint32_t oprsz, uint32_t maxsz)
   1847 {
   1848     static const GVecGen2 g = {
   1849         .fni8 = tcg_gen_not_i64,
   1850         .fniv = tcg_gen_not_vec,
   1851         .fno = gen_helper_gvec_not,
   1852         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
   1853     };
   1854     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
   1855 }
   1856 
   1857 /* Perform a vector addition using normal addition and a mask.  The mask
   1858    should be the sign bit of each lane.  This 6-operation form is more
   1859    efficient than separate additions when there are 4 or more lanes in
   1860    the 64-bit operation.  */
   1861 static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
   1862 {
   1863     TCGv_i64 t1 = tcg_temp_new_i64();
   1864     TCGv_i64 t2 = tcg_temp_new_i64();
   1865     TCGv_i64 t3 = tcg_temp_new_i64();
   1866 
   1867     tcg_gen_andc_i64(t1, a, m);
   1868     tcg_gen_andc_i64(t2, b, m);
   1869     tcg_gen_xor_i64(t3, a, b);
   1870     tcg_gen_add_i64(d, t1, t2);
   1871     tcg_gen_and_i64(t3, t3, m);
   1872     tcg_gen_xor_i64(d, d, t3);
   1873 
   1874     tcg_temp_free_i64(t1);
   1875     tcg_temp_free_i64(t2);
   1876     tcg_temp_free_i64(t3);
   1877 }
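
         /*
          * Why the mask trick works: m holds the sign bit of every lane,
          * e.g. dup_const(MO_8, 0x80) == 0x8080808080808080ull.  Adding
          * (a & ~m) + (b & ~m) sums all lanes at once with the sign bits
          * cleared, so no carry can escape a lane; the sign bit of each
          * partial sum is just the carry into it, and xoring with
          * (a ^ b) & m completes sign = a ^ b ^ carry without generating
          * a carry out of the lane.
          */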
   1878 
   1879 void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
   1880 {
   1881     TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80));
   1882     gen_addv_mask(d, a, b, m);
   1883 }
   1884 
   1885 void tcg_gen_vec_add8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
   1886 {
   1887     TCGv_i32 m = tcg_constant_i32((int32_t)dup_const(MO_8, 0x80));
   1888     TCGv_i32 t1 = tcg_temp_new_i32();
   1889     TCGv_i32 t2 = tcg_temp_new_i32();
   1890     TCGv_i32 t3 = tcg_temp_new_i32();
   1891 
   1892     tcg_gen_andc_i32(t1, a, m);
   1893     tcg_gen_andc_i32(t2, b, m);
   1894     tcg_gen_xor_i32(t3, a, b);
   1895     tcg_gen_add_i32(d, t1, t2);
   1896     tcg_gen_and_i32(t3, t3, m);
   1897     tcg_gen_xor_i32(d, d, t3);
   1898 
   1899     tcg_temp_free_i32(t1);
   1900     tcg_temp_free_i32(t2);
   1901     tcg_temp_free_i32(t3);
   1902 }
   1903 
   1904 void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
   1905 {
   1906     TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000));
   1907     gen_addv_mask(d, a, b, m);
   1908 }
   1909 
   1910 void tcg_gen_vec_add16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
   1911 {
   1912     TCGv_i32 t1 = tcg_temp_new_i32();
   1913     TCGv_i32 t2 = tcg_temp_new_i32();
   1914 
   1915     tcg_gen_andi_i32(t1, a, ~0xffff);
   1916     tcg_gen_add_i32(t2, a, b);
   1917     tcg_gen_add_i32(t1, t1, b);
   1918     tcg_gen_deposit_i32(d, t1, t2, 0, 16);
   1919 
   1920     tcg_temp_free_i32(t1);
   1921     tcg_temp_free_i32(t2);
   1922 }
   1923 
   1924 void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
   1925 {
   1926     TCGv_i64 t1 = tcg_temp_new_i64();
   1927     TCGv_i64 t2 = tcg_temp_new_i64();
   1928 
   1929     tcg_gen_andi_i64(t1, a, ~0xffffffffull);
   1930     tcg_gen_add_i64(t2, a, b);
   1931     tcg_gen_add_i64(t1, t1, b);
   1932     tcg_gen_deposit_i64(d, t1, t2, 0, 32);
   1933 
   1934     tcg_temp_free_i64(t1);
   1935     tcg_temp_free_i64(t2);
   1936 }
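
         /*
          * The deposit-based forms above compute each half separately:
          * t2 is a plain add whose carry out of the low half is simply
          * discarded by the deposit, while t1 adds b to a with a's low
          * half cleared, so the high half of t1 sees no carry-in from
          * below and is taken as the final high half.
          */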
   1937 
   1938 static const TCGOpcode vecop_list_add[] = { INDEX_op_add_vec, 0 };
   1939 
   1940 void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs,
   1941                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
   1942 {
   1943     static const GVecGen3 g[4] = {
   1944         { .fni8 = tcg_gen_vec_add8_i64,
   1945           .fniv = tcg_gen_add_vec,
   1946           .fno = gen_helper_gvec_add8,
   1947           .opt_opc = vecop_list_add,
   1948           .vece = MO_8 },
   1949         { .fni8 = tcg_gen_vec_add16_i64,
   1950           .fniv = tcg_gen_add_vec,
   1951           .fno = gen_helper_gvec_add16,
   1952           .opt_opc = vecop_list_add,
   1953           .vece = MO_16 },
   1954         { .fni4 = tcg_gen_add_i32,
   1955           .fniv = tcg_gen_add_vec,
   1956           .fno = gen_helper_gvec_add32,
   1957           .opt_opc = vecop_list_add,
   1958           .vece = MO_32 },
   1959         { .fni8 = tcg_gen_add_i64,
   1960           .fniv = tcg_gen_add_vec,
   1961           .fno = gen_helper_gvec_add64,
   1962           .opt_opc = vecop_list_add,
   1963           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
   1964           .vece = MO_64 },
   1965     };
   1966 
   1967     tcg_debug_assert(vece <= MO_64);
   1968     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
   1969 }
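
         /*
          * Caller's view (offsets hypothetical): gvec operands are byte
          * offsets from cpu_env, so a front end adding two 128-bit vector
          * registers at 32-bit granularity would emit something like
          *
          *     tcg_gen_gvec_add(MO_32, dofs, aofs, bofs, 16, 16);
          *
          * with dofs/aofs/bofs derived via offsetof() on the target's
          * CPU state structure.
          */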
   1970 
   1971 void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs,
   1972                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
   1973 {
   1974     static const GVecGen2s g[4] = {
   1975         { .fni8 = tcg_gen_vec_add8_i64,
   1976           .fniv = tcg_gen_add_vec,
   1977           .fno = gen_helper_gvec_adds8,
   1978           .opt_opc = vecop_list_add,
   1979           .vece = MO_8 },
   1980         { .fni8 = tcg_gen_vec_add16_i64,
   1981           .fniv = tcg_gen_add_vec,
   1982           .fno = gen_helper_gvec_adds16,
   1983           .opt_opc = vecop_list_add,
   1984           .vece = MO_16 },
   1985         { .fni4 = tcg_gen_add_i32,
   1986           .fniv = tcg_gen_add_vec,
   1987           .fno = gen_helper_gvec_adds32,
   1988           .opt_opc = vecop_list_add,
   1989           .vece = MO_32 },
   1990         { .fni8 = tcg_gen_add_i64,
   1991           .fniv = tcg_gen_add_vec,
   1992           .fno = gen_helper_gvec_adds64,
   1993           .opt_opc = vecop_list_add,
   1994           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
   1995           .vece = MO_64 },
   1996     };
   1997 
   1998     tcg_debug_assert(vece <= MO_64);
   1999     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
   2000 }
   2001 
   2002 void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs,
   2003                        int64_t c, uint32_t oprsz, uint32_t maxsz)
   2004 {
   2005     TCGv_i64 tmp = tcg_constant_i64(c);
   2006     tcg_gen_gvec_adds(vece, dofs, aofs, tmp, oprsz, maxsz);
   2007 }
   2008 
   2009 static const TCGOpcode vecop_list_sub[] = { INDEX_op_sub_vec, 0 };
   2010 
   2011 void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs,
   2012                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
   2013 {
   2014     static const GVecGen2s g[4] = {
   2015         { .fni8 = tcg_gen_vec_sub8_i64,
   2016           .fniv = tcg_gen_sub_vec,
   2017           .fno = gen_helper_gvec_subs8,
   2018           .opt_opc = vecop_list_sub,
   2019           .vece = MO_8 },
   2020         { .fni8 = tcg_gen_vec_sub16_i64,
   2021           .fniv = tcg_gen_sub_vec,
   2022           .fno = gen_helper_gvec_subs16,
   2023           .opt_opc = vecop_list_sub,
   2024           .vece = MO_16 },
   2025         { .fni4 = tcg_gen_sub_i32,
   2026           .fniv = tcg_gen_sub_vec,
   2027           .fno = gen_helper_gvec_subs32,
   2028           .opt_opc = vecop_list_sub,
   2029           .vece = MO_32 },
   2030         { .fni8 = tcg_gen_sub_i64,
   2031           .fniv = tcg_gen_sub_vec,
   2032           .fno = gen_helper_gvec_subs64,
   2033           .opt_opc = vecop_list_sub,
   2034           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
   2035           .vece = MO_64 },
   2036     };
   2037 
   2038     tcg_debug_assert(vece <= MO_64);
   2039     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
   2040 }
   2041 
   2042 /* Perform a vector subtraction using normal subtraction and a mask.
   2043    Compare gen_addv_mask above.  */
   2044 static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
   2045 {
   2046     TCGv_i64 t1 = tcg_temp_new_i64();
   2047     TCGv_i64 t2 = tcg_temp_new_i64();
   2048     TCGv_i64 t3 = tcg_temp_new_i64();
   2049 
   2050     tcg_gen_or_i64(t1, a, m);
   2051     tcg_gen_andc_i64(t2, b, m);
   2052     tcg_gen_eqv_i64(t3, a, b);
   2053     tcg_gen_sub_i64(d, t1, t2);
   2054     tcg_gen_and_i64(t3, t3, m);
   2055     tcg_gen_xor_i64(d, d, t3);
   2056 
   2057     tcg_temp_free_i64(t1);
   2058     tcg_temp_free_i64(t2);
   2059     tcg_temp_free_i64(t3);
   2060 }
   2061 
   2062 void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
   2063 {
   2064     TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80));
   2065     gen_subv_mask(d, a, b, m);
   2066 }
   2067 
   2068 void tcg_gen_vec_sub8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
   2069 {
   2070     TCGv_i32 m = tcg_constant_i32((int32_t)dup_const(MO_8, 0x80));
   2071     TCGv_i32 t1 = tcg_temp_new_i32();
   2072     TCGv_i32 t2 = tcg_temp_new_i32();
   2073     TCGv_i32 t3 = tcg_temp_new_i32();
   2074 
   2075     tcg_gen_or_i32(t1, a, m);
   2076     tcg_gen_andc_i32(t2, b, m);
   2077     tcg_gen_eqv_i32(t3, a, b);
   2078     tcg_gen_sub_i32(d, t1, t2);
   2079     tcg_gen_and_i32(t3, t3, m);
   2080     tcg_gen_xor_i32(d, d, t3);
   2081 
   2082     tcg_temp_free_i32(t1);
   2083     tcg_temp_free_i32(t2);
   2084     tcg_temp_free_i32(t3);
   2085 }
   2086 
   2087 void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
   2088 {
   2089     TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000));
   2090     gen_subv_mask(d, a, b, m);
   2091 }
   2092 
   2093 void tcg_gen_vec_sub16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
   2094 {
   2095     TCGv_i32 t1 = tcg_temp_new_i32();
   2096     TCGv_i32 t2 = tcg_temp_new_i32();
   2097 
   2098     tcg_gen_andi_i32(t1, b, ~0xffff);
   2099     tcg_gen_sub_i32(t2, a, b);
   2100     tcg_gen_sub_i32(t1, a, t1);
   2101     tcg_gen_deposit_i32(d, t1, t2, 0, 16);
   2102 
   2103     tcg_temp_free_i32(t1);
   2104     tcg_temp_free_i32(t2);
   2105 }
   2106 
   2107 void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
   2108 {
   2109     TCGv_i64 t1 = tcg_temp_new_i64();
   2110     TCGv_i64 t2 = tcg_temp_new_i64();
   2111 
   2112     tcg_gen_andi_i64(t1, b, ~0xffffffffull);
   2113     tcg_gen_sub_i64(t2, a, b);
   2114     tcg_gen_sub_i64(t1, a, t1);
   2115     tcg_gen_deposit_i64(d, t1, t2, 0, 32);
   2116 
   2117     tcg_temp_free_i64(t1);
   2118     tcg_temp_free_i64(t2);
   2119 }
   2120 
   2121 void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs,
   2122                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
   2123 {
   2124     static const GVecGen3 g[4] = {
   2125         { .fni8 = tcg_gen_vec_sub8_i64,
   2126           .fniv = tcg_gen_sub_vec,
   2127           .fno = gen_helper_gvec_sub8,
   2128           .opt_opc = vecop_list_sub,
   2129           .vece = MO_8 },
   2130         { .fni8 = tcg_gen_vec_sub16_i64,
   2131           .fniv = tcg_gen_sub_vec,
   2132           .fno = gen_helper_gvec_sub16,
   2133           .opt_opc = vecop_list_sub,
   2134           .vece = MO_16 },
   2135         { .fni4 = tcg_gen_sub_i32,
   2136           .fniv = tcg_gen_sub_vec,
   2137           .fno = gen_helper_gvec_sub32,
   2138           .opt_opc = vecop_list_sub,
   2139           .vece = MO_32 },
   2140         { .fni8 = tcg_gen_sub_i64,
   2141           .fniv = tcg_gen_sub_vec,
   2142           .fno = gen_helper_gvec_sub64,
   2143           .opt_opc = vecop_list_sub,
   2144           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
   2145           .vece = MO_64 },
   2146     };
   2147 
   2148     tcg_debug_assert(vece <= MO_64);
   2149     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
   2150 }
   2151 
   2152 static const TCGOpcode vecop_list_mul[] = { INDEX_op_mul_vec, 0 };
   2153 
   2154 void tcg_gen_gvec_mul(unsigned vece, uint32_t dofs, uint32_t aofs,
   2155                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
   2156 {
   2157     static const GVecGen3 g[4] = {
   2158         { .fniv = tcg_gen_mul_vec,
   2159           .fno = gen_helper_gvec_mul8,
   2160           .opt_opc = vecop_list_mul,
   2161           .vece = MO_8 },
   2162         { .fniv = tcg_gen_mul_vec,
   2163           .fno = gen_helper_gvec_mul16,
   2164           .opt_opc = vecop_list_mul,
   2165           .vece = MO_16 },
   2166         { .fni4 = tcg_gen_mul_i32,
   2167           .fniv = tcg_gen_mul_vec,
   2168           .fno = gen_helper_gvec_mul32,
   2169           .opt_opc = vecop_list_mul,
   2170           .vece = MO_32 },
   2171         { .fni8 = tcg_gen_mul_i64,
   2172           .fniv = tcg_gen_mul_vec,
   2173           .fno = gen_helper_gvec_mul64,
   2174           .opt_opc = vecop_list_mul,
   2175           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
   2176           .vece = MO_64 },
   2177     };
   2178 
   2179     tcg_debug_assert(vece <= MO_64);
   2180     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
   2181 }
   2182 
   2183 void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs,
   2184                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
   2185 {
   2186     static const GVecGen2s g[4] = {
   2187         { .fniv = tcg_gen_mul_vec,
   2188           .fno = gen_helper_gvec_muls8,
   2189           .opt_opc = vecop_list_mul,
   2190           .vece = MO_8 },
   2191         { .fniv = tcg_gen_mul_vec,
   2192           .fno = gen_helper_gvec_muls16,
   2193           .opt_opc = vecop_list_mul,
   2194           .vece = MO_16 },
   2195         { .fni4 = tcg_gen_mul_i32,
   2196           .fniv = tcg_gen_mul_vec,
   2197           .fno = gen_helper_gvec_muls32,
   2198           .opt_opc = vecop_list_mul,
   2199           .vece = MO_32 },
   2200         { .fni8 = tcg_gen_mul_i64,
   2201           .fniv = tcg_gen_mul_vec,
   2202           .fno = gen_helper_gvec_muls64,
   2203           .opt_opc = vecop_list_mul,
   2204           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
   2205           .vece = MO_64 },
   2206     };
   2207 
   2208     tcg_debug_assert(vece <= MO_64);
   2209     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
   2210 }
   2211 
   2212 void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs,
   2213                        int64_t c, uint32_t oprsz, uint32_t maxsz)
   2214 {
   2215     TCGv_i64 tmp = tcg_constant_i64(c);
   2216     tcg_gen_gvec_muls(vece, dofs, aofs, tmp, oprsz, maxsz);
   2217 }
   2218 
   2219 void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs,
   2220                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
   2221 {
   2222     static const TCGOpcode vecop_list[] = { INDEX_op_ssadd_vec, 0 };
   2223     static const GVecGen3 g[4] = {
   2224         { .fniv = tcg_gen_ssadd_vec,
   2225           .fno = gen_helper_gvec_ssadd8,
   2226           .opt_opc = vecop_list,
   2227           .vece = MO_8 },
   2228         { .fniv = tcg_gen_ssadd_vec,
   2229           .fno = gen_helper_gvec_ssadd16,
   2230           .opt_opc = vecop_list,
   2231           .vece = MO_16 },
   2232         { .fniv = tcg_gen_ssadd_vec,
   2233           .fno = gen_helper_gvec_ssadd32,
   2234           .opt_opc = vecop_list,
   2235           .vece = MO_32 },
   2236         { .fniv = tcg_gen_ssadd_vec,
   2237           .fno = gen_helper_gvec_ssadd64,
   2238           .opt_opc = vecop_list,
   2239           .vece = MO_64 },
   2240     };
   2241     tcg_debug_assert(vece <= MO_64);
   2242     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
   2243 }
   2244 
   2245 void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs,
   2246                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
   2247 {
   2248     static const TCGOpcode vecop_list[] = { INDEX_op_sssub_vec, 0 };
   2249     static const GVecGen3 g[4] = {
   2250         { .fniv = tcg_gen_sssub_vec,
   2251           .fno = gen_helper_gvec_sssub8,
   2252           .opt_opc = vecop_list,
   2253           .vece = MO_8 },
   2254         { .fniv = tcg_gen_sssub_vec,
   2255           .fno = gen_helper_gvec_sssub16,
   2256           .opt_opc = vecop_list,
   2257           .vece = MO_16 },
   2258         { .fniv = tcg_gen_sssub_vec,
   2259           .fno = gen_helper_gvec_sssub32,
   2260           .opt_opc = vecop_list,
   2261           .vece = MO_32 },
   2262         { .fniv = tcg_gen_sssub_vec,
   2263           .fno = gen_helper_gvec_sssub64,
   2264           .opt_opc = vecop_list,
   2265           .vece = MO_64 },
   2266     };
   2267     tcg_debug_assert(vece <= MO_64);
   2268     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
   2269 }
   2270 
   2271 static void tcg_gen_usadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
   2272 {
   2273     TCGv_i32 max = tcg_constant_i32(-1);
   2274     tcg_gen_add_i32(d, a, b);
   2275     tcg_gen_movcond_i32(TCG_COND_LTU, d, d, a, max, d);
   2276 }
   2277 
   2278 static void tcg_gen_usadd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
   2279 {
   2280     TCGv_i64 max = tcg_constant_i64(-1);
   2281     tcg_gen_add_i64(d, a, b);
   2282     tcg_gen_movcond_i64(TCG_COND_LTU, d, d, a, max, d);
   2283 }
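
         /*
          * The movcond in the two helpers above gives branch-free
          * unsigned saturation: after d = a + b, the addition overflowed
          * iff the result wrapped, i.e. d < a (unsigned), in which case
          * the all-ones maximum is selected instead.
          */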
   2284 
   2285 void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs,
   2286                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
   2287 {
   2288     static const TCGOpcode vecop_list[] = { INDEX_op_usadd_vec, 0 };
   2289     static const GVecGen3 g[4] = {
   2290         { .fniv = tcg_gen_usadd_vec,
   2291           .fno = gen_helper_gvec_usadd8,
   2292           .opt_opc = vecop_list,
   2293           .vece = MO_8 },
   2294         { .fniv = tcg_gen_usadd_vec,
   2295           .fno = gen_helper_gvec_usadd16,
   2296           .opt_opc = vecop_list,
   2297           .vece = MO_16 },
   2298         { .fni4 = tcg_gen_usadd_i32,
   2299           .fniv = tcg_gen_usadd_vec,
   2300           .fno = gen_helper_gvec_usadd32,
   2301           .opt_opc = vecop_list,
   2302           .vece = MO_32 },
   2303         { .fni8 = tcg_gen_usadd_i64,
   2304           .fniv = tcg_gen_usadd_vec,
   2305           .fno = gen_helper_gvec_usadd64,
   2306           .opt_opc = vecop_list,
   2307           .vece = MO_64 }
   2308     };
   2309     tcg_debug_assert(vece <= MO_64);
   2310     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
   2311 }
   2312 
   2313 static void tcg_gen_ussub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
   2314 {
   2315     TCGv_i32 min = tcg_constant_i32(0);
   2316     tcg_gen_sub_i32(d, a, b);
   2317     tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, min, d);
   2318 }
   2319 
   2320 static void tcg_gen_ussub_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
   2321 {
   2322     TCGv_i64 min = tcg_constant_i64(0);
   2323     tcg_gen_sub_i64(d, a, b);
   2324     tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, min, d);
   2325 }
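
         /*
          * Likewise for subtraction: a - b underflows iff a < b
          * (unsigned), and the movcond then clamps the result to zero.
          */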
   2326 
   2327 void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs,
   2328                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
   2329 {
   2330     static const TCGOpcode vecop_list[] = { INDEX_op_ussub_vec, 0 };
   2331     static const GVecGen3 g[4] = {
   2332         { .fniv = tcg_gen_ussub_vec,
   2333           .fno = gen_helper_gvec_ussub8,
   2334           .opt_opc = vecop_list,
   2335           .vece = MO_8 },
   2336         { .fniv = tcg_gen_ussub_vec,
   2337           .fno = gen_helper_gvec_ussub16,
   2338           .opt_opc = vecop_list,
   2339           .vece = MO_16 },
   2340         { .fni4 = tcg_gen_ussub_i32,
   2341           .fniv = tcg_gen_ussub_vec,
   2342           .fno = gen_helper_gvec_ussub32,
   2343           .opt_opc = vecop_list,
   2344           .vece = MO_32 },
   2345         { .fni8 = tcg_gen_ussub_i64,
   2346           .fniv = tcg_gen_ussub_vec,
   2347           .fno = gen_helper_gvec_ussub64,
   2348           .opt_opc = vecop_list,
   2349           .vece = MO_64 }
   2350     };
   2351     tcg_debug_assert(vece <= MO_64);
   2352     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
   2353 }
   2354 
   2355 void tcg_gen_gvec_smin(unsigned vece, uint32_t dofs, uint32_t aofs,
   2356                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
   2357 {
   2358     static const TCGOpcode vecop_list[] = { INDEX_op_smin_vec, 0 };
   2359     static const GVecGen3 g[4] = {
   2360         { .fniv = tcg_gen_smin_vec,
   2361           .fno = gen_helper_gvec_smin8,
   2362           .opt_opc = vecop_list,
   2363           .vece = MO_8 },
   2364         { .fniv = tcg_gen_smin_vec,
   2365           .fno = gen_helper_gvec_smin16,
   2366           .opt_opc = vecop_list,
   2367           .vece = MO_16 },
   2368         { .fni4 = tcg_gen_smin_i32,
   2369           .fniv = tcg_gen_smin_vec,
   2370           .fno = gen_helper_gvec_smin32,
   2371           .opt_opc = vecop_list,
   2372           .vece = MO_32 },
   2373         { .fni8 = tcg_gen_smin_i64,
   2374           .fniv = tcg_gen_smin_vec,
   2375           .fno = gen_helper_gvec_smin64,
   2376           .opt_opc = vecop_list,
   2377           .vece = MO_64 }
   2378     };
   2379     tcg_debug_assert(vece <= MO_64);
   2380     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
   2381 }
   2382 
   2383 void tcg_gen_gvec_umin(unsigned vece, uint32_t dofs, uint32_t aofs,
   2384                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
   2385 {
   2386     static const TCGOpcode vecop_list[] = { INDEX_op_umin_vec, 0 };
   2387     static const GVecGen3 g[4] = {
   2388         { .fniv = tcg_gen_umin_vec,
   2389           .fno = gen_helper_gvec_umin8,
   2390           .opt_opc = vecop_list,
   2391           .vece = MO_8 },
   2392         { .fniv = tcg_gen_umin_vec,
   2393           .fno = gen_helper_gvec_umin16,
   2394           .opt_opc = vecop_list,
   2395           .vece = MO_16 },
   2396         { .fni4 = tcg_gen_umin_i32,
   2397           .fniv = tcg_gen_umin_vec,
   2398           .fno = gen_helper_gvec_umin32,
   2399           .opt_opc = vecop_list,
   2400           .vece = MO_32 },
   2401         { .fni8 = tcg_gen_umin_i64,
   2402           .fniv = tcg_gen_umin_vec,
   2403           .fno = gen_helper_gvec_umin64,
   2404           .opt_opc = vecop_list,
   2405           .vece = MO_64 }
   2406     };
   2407     tcg_debug_assert(vece <= MO_64);
   2408     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
   2409 }
   2410 
   2411 void tcg_gen_gvec_smax(unsigned vece, uint32_t dofs, uint32_t aofs,
   2412                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
   2413 {
   2414     static const TCGOpcode vecop_list[] = { INDEX_op_smax_vec, 0 };
   2415     static const GVecGen3 g[4] = {
   2416         { .fniv = tcg_gen_smax_vec,
   2417           .fno = gen_helper_gvec_smax8,
   2418           .opt_opc = vecop_list,
   2419           .vece = MO_8 },
   2420         { .fniv = tcg_gen_smax_vec,
   2421           .fno = gen_helper_gvec_smax16,
   2422           .opt_opc = vecop_list,
   2423           .vece = MO_16 },
   2424         { .fni4 = tcg_gen_smax_i32,
   2425           .fniv = tcg_gen_smax_vec,
   2426           .fno = gen_helper_gvec_smax32,
   2427           .opt_opc = vecop_list,
   2428           .vece = MO_32 },
   2429         { .fni8 = tcg_gen_smax_i64,
   2430           .fniv = tcg_gen_smax_vec,
   2431           .fno = gen_helper_gvec_smax64,
   2432           .opt_opc = vecop_list,
   2433           .vece = MO_64 }
   2434     };
   2435     tcg_debug_assert(vece <= MO_64);
   2436     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
   2437 }
   2438 
   2439 void tcg_gen_gvec_umax(unsigned vece, uint32_t dofs, uint32_t aofs,
   2440                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
   2441 {
   2442     static const TCGOpcode vecop_list[] = { INDEX_op_umax_vec, 0 };
   2443     static const GVecGen3 g[4] = {
   2444         { .fniv = tcg_gen_umax_vec,
   2445           .fno = gen_helper_gvec_umax8,
   2446           .opt_opc = vecop_list,
   2447           .vece = MO_8 },
   2448         { .fniv = tcg_gen_umax_vec,
   2449           .fno = gen_helper_gvec_umax16,
   2450           .opt_opc = vecop_list,
   2451           .vece = MO_16 },
   2452         { .fni4 = tcg_gen_umax_i32,
   2453           .fniv = tcg_gen_umax_vec,
   2454           .fno = gen_helper_gvec_umax32,
   2455           .opt_opc = vecop_list,
   2456           .vece = MO_32 },
   2457         { .fni8 = tcg_gen_umax_i64,
   2458           .fniv = tcg_gen_umax_vec,
   2459           .fno = gen_helper_gvec_umax64,
   2460           .opt_opc = vecop_list,
   2461           .vece = MO_64 }
   2462     };
   2463     tcg_debug_assert(vece <= MO_64);
   2464     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
   2465 }
   2466 
   2467 /* Perform a vector negation using normal negation and a mask.
   2468    Compare gen_subv_mask above.  */
   2469 static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m)
   2470 {
   2471     TCGv_i64 t2 = tcg_temp_new_i64();
   2472     TCGv_i64 t3 = tcg_temp_new_i64();
   2473 
   2474     tcg_gen_andc_i64(t3, m, b);
   2475     tcg_gen_andc_i64(t2, b, m);
   2476     tcg_gen_sub_i64(d, m, t2);
   2477     tcg_gen_xor_i64(d, d, t3);
   2478 
   2479     tcg_temp_free_i64(t2);
   2480     tcg_temp_free_i64(t3);
   2481 }
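
         /*
          * Per-lane negation via the sign-bit mask m: write each lane of
          * b as its sign bit plus a remainder r below it.  m - (b & ~m)
          * computes (sign-bit value) - r in every lane with all borrows
          * confined to the lane, and the final xor with m & ~b flips the
          * sign bit back whenever b's lane was non-negative, yielding -b
          * modulo the lane width (including -0 == 0).
          */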
   2482 
   2483 void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b)
   2484 {
   2485     TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80));
   2486     gen_negv_mask(d, b, m);
   2487 }
   2488 
   2489 void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b)
   2490 {
   2491     TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000));
   2492     gen_negv_mask(d, b, m);
   2493 }
   2494 
   2495 void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b)
   2496 {
   2497     TCGv_i64 t1 = tcg_temp_new_i64();
   2498     TCGv_i64 t2 = tcg_temp_new_i64();
   2499 
   2500     tcg_gen_andi_i64(t1, b, ~0xffffffffull);
   2501     tcg_gen_neg_i64(t2, b);
   2502     tcg_gen_neg_i64(t1, t1);
   2503     tcg_gen_deposit_i64(d, t1, t2, 0, 32);
   2504 
   2505     tcg_temp_free_i64(t1);
   2506     tcg_temp_free_i64(t2);
   2507 }
   2508 
   2509 void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs,
   2510                       uint32_t oprsz, uint32_t maxsz)
   2511 {
   2512     static const TCGOpcode vecop_list[] = { INDEX_op_neg_vec, 0 };
   2513     static const GVecGen2 g[4] = {
   2514         { .fni8 = tcg_gen_vec_neg8_i64,
   2515           .fniv = tcg_gen_neg_vec,
   2516           .fno = gen_helper_gvec_neg8,
   2517           .opt_opc = vecop_list,
   2518           .vece = MO_8 },
   2519         { .fni8 = tcg_gen_vec_neg16_i64,
   2520           .fniv = tcg_gen_neg_vec,
   2521           .fno = gen_helper_gvec_neg16,
   2522           .opt_opc = vecop_list,
   2523           .vece = MO_16 },
   2524         { .fni4 = tcg_gen_neg_i32,
   2525           .fniv = tcg_gen_neg_vec,
   2526           .fno = gen_helper_gvec_neg32,
   2527           .opt_opc = vecop_list,
   2528           .vece = MO_32 },
   2529         { .fni8 = tcg_gen_neg_i64,
   2530           .fniv = tcg_gen_neg_vec,
   2531           .fno = gen_helper_gvec_neg64,
   2532           .opt_opc = vecop_list,
   2533           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
   2534           .vece = MO_64 },
   2535     };
   2536 
   2537     tcg_debug_assert(vece <= MO_64);
   2538     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
   2539 }
   2540 
   2541 static void gen_absv_mask(TCGv_i64 d, TCGv_i64 b, unsigned vece)
   2542 {
   2543     TCGv_i64 t = tcg_temp_new_i64();
   2544     int nbit = 8 << vece;
   2545 
   2546     /* Create -1 for each negative element.  */
   2547     tcg_gen_shri_i64(t, b, nbit - 1);
   2548     tcg_gen_andi_i64(t, t, dup_const(vece, 1));
   2549     tcg_gen_muli_i64(t, t, (1 << nbit) - 1);
   2550 
   2551     /*
   2552      * Invert (via xor -1) and add one.
   2553      * Because of the ordering the msb is cleared,
   2554      * so we never have carry into the next element.
   2555      */
   2556     tcg_gen_xor_i64(d, b, t);
   2557     tcg_gen_andi_i64(t, t, dup_const(vece, 1));
   2558     tcg_gen_add_i64(d, d, t);
   2559 
   2560     tcg_temp_free_i64(t);
   2561 }
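
         /*
          * Worked example at MO_8 (this helper is only used for MO_8 and
          * MO_16 below): for a lane holding 0xfe (-2), the shift/and/mul
          * sequence makes t == 0xff, the xor gives 0x01, and adding back
          * the low bit of t yields 0x02 == abs(-2); positive lanes get
          * t == 0 and pass through unchanged.
          */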
   2562 
   2563 static void tcg_gen_vec_abs8_i64(TCGv_i64 d, TCGv_i64 b)
   2564 {
   2565     gen_absv_mask(d, b, MO_8);
   2566 }
   2567 
   2568 static void tcg_gen_vec_abs16_i64(TCGv_i64 d, TCGv_i64 b)
   2569 {
   2570     gen_absv_mask(d, b, MO_16);
   2571 }
   2572 
   2573 void tcg_gen_gvec_abs(unsigned vece, uint32_t dofs, uint32_t aofs,
   2574                       uint32_t oprsz, uint32_t maxsz)
   2575 {
   2576     static const TCGOpcode vecop_list[] = { INDEX_op_abs_vec, 0 };
   2577     static const GVecGen2 g[4] = {
   2578         { .fni8 = tcg_gen_vec_abs8_i64,
   2579           .fniv = tcg_gen_abs_vec,
   2580           .fno = gen_helper_gvec_abs8,
   2581           .opt_opc = vecop_list,
   2582           .vece = MO_8 },
   2583         { .fni8 = tcg_gen_vec_abs16_i64,
   2584           .fniv = tcg_gen_abs_vec,
   2585           .fno = gen_helper_gvec_abs16,
   2586           .opt_opc = vecop_list,
   2587           .vece = MO_16 },
   2588         { .fni4 = tcg_gen_abs_i32,
   2589           .fniv = tcg_gen_abs_vec,
   2590           .fno = gen_helper_gvec_abs32,
   2591           .opt_opc = vecop_list,
   2592           .vece = MO_32 },
   2593         { .fni8 = tcg_gen_abs_i64,
   2594           .fniv = tcg_gen_abs_vec,
   2595           .fno = gen_helper_gvec_abs64,
   2596           .opt_opc = vecop_list,
   2597           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
   2598           .vece = MO_64 },
   2599     };
   2600 
   2601     tcg_debug_assert(vece <= MO_64);
   2602     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
   2603 }
   2604 
   2605 void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs,
   2606                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
   2607 {
   2608     static const GVecGen3 g = {
   2609         .fni8 = tcg_gen_and_i64,
   2610         .fniv = tcg_gen_and_vec,
   2611         .fno = gen_helper_gvec_and,
   2612         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
   2613     };
   2614 
   2615     if (aofs == bofs) {
   2616         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
   2617     } else {
   2618         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
   2619     }
   2620 }
   2621 
   2622 void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs,
   2623                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
   2624 {
   2625     static const GVecGen3 g = {
   2626         .fni8 = tcg_gen_or_i64,
   2627         .fniv = tcg_gen_or_vec,
   2628         .fno = gen_helper_gvec_or,
   2629         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
   2630     };
   2631 
   2632     if (aofs == bofs) {
   2633         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
   2634     } else {
   2635         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
   2636     }
   2637 }
   2638 
   2639 void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs,
   2640                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
   2641 {
   2642     static const GVecGen3 g = {
   2643         .fni8 = tcg_gen_xor_i64,
   2644         .fniv = tcg_gen_xor_vec,
   2645         .fno = gen_helper_gvec_xor,
   2646         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
   2647     };
   2648 
   2649     if (aofs == bofs) {
   2650         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0);
   2651     } else {
   2652         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
   2653     }
   2654 }
   2655 
   2656 void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs,
   2657                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
   2658 {
   2659     static const GVecGen3 g = {
   2660         .fni8 = tcg_gen_andc_i64,
   2661         .fniv = tcg_gen_andc_vec,
   2662         .fno = gen_helper_gvec_andc,
   2663         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
   2664     };
   2665 
   2666     if (aofs == bofs) {
   2667         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0);
   2668     } else {
   2669         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
   2670     }
   2671 }
   2672 
   2673 void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
   2674                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
   2675 {
   2676     static const GVecGen3 g = {
   2677         .fni8 = tcg_gen_orc_i64,
   2678         .fniv = tcg_gen_orc_vec,
   2679         .fno = gen_helper_gvec_orc,
   2680         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
   2681     };
   2682 
   2683     if (aofs == bofs) {
   2684         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1);
   2685     } else {
   2686         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
   2687     }
   2688 }
   2689 
   2690 void tcg_gen_gvec_nand(unsigned vece, uint32_t dofs, uint32_t aofs,
   2691                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
   2692 {
   2693     static const GVecGen3 g = {
   2694         .fni8 = tcg_gen_nand_i64,
   2695         .fniv = tcg_gen_nand_vec,
   2696         .fno = gen_helper_gvec_nand,
   2697         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
   2698     };
   2699 
   2700     if (aofs == bofs) {
   2701         tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
   2702     } else {
   2703         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
   2704     }
   2705 }
   2706 
   2707 void tcg_gen_gvec_nor(unsigned vece, uint32_t dofs, uint32_t aofs,
   2708                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
   2709 {
   2710     static const GVecGen3 g = {
   2711         .fni8 = tcg_gen_nor_i64,
   2712         .fniv = tcg_gen_nor_vec,
   2713         .fno = gen_helper_gvec_nor,
   2714         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
   2715     };
   2716 
   2717     if (aofs == bofs) {
   2718         tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
   2719     } else {
   2720         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
   2721     }
   2722 }
   2723 
   2724 void tcg_gen_gvec_eqv(unsigned vece, uint32_t dofs, uint32_t aofs,
   2725                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
   2726 {
   2727     static const GVecGen3 g = {
   2728         .fni8 = tcg_gen_eqv_i64,
   2729         .fniv = tcg_gen_eqv_vec,
   2730         .fno = gen_helper_gvec_eqv,
   2731         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
   2732     };
   2733 
   2734     if (aofs == bofs) {
   2735         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1);
   2736     } else {
   2737         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
   2738     }
   2739 }
   2740 
   2741 static const GVecGen2s gop_ands = {
   2742     .fni8 = tcg_gen_and_i64,
   2743     .fniv = tcg_gen_and_vec,
   2744     .fno = gen_helper_gvec_ands,
   2745     .prefer_i64 = TCG_TARGET_REG_BITS == 64,
   2746     .vece = MO_64
   2747 };
   2748 
   2749 void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs,
   2750                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
   2751 {
   2752     TCGv_i64 tmp = tcg_temp_new_i64();
   2753     tcg_gen_dup_i64(vece, tmp, c);
   2754     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
   2755     tcg_temp_free_i64(tmp);
   2756 }
   2757 
   2758 void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs,
   2759                        int64_t c, uint32_t oprsz, uint32_t maxsz)
   2760 {
   2761     TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c));
   2762     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
   2763 }
   2764 
   2765 static const GVecGen2s gop_xors = {
   2766     .fni8 = tcg_gen_xor_i64,
   2767     .fniv = tcg_gen_xor_vec,
   2768     .fno = gen_helper_gvec_xors,
   2769     .prefer_i64 = TCG_TARGET_REG_BITS == 64,
   2770     .vece = MO_64
   2771 };
   2772 
   2773 void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs,
   2774                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
   2775 {
   2776     TCGv_i64 tmp = tcg_temp_new_i64();
   2777     tcg_gen_dup_i64(vece, tmp, c);
   2778     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
   2779     tcg_temp_free_i64(tmp);
   2780 }
   2781 
   2782 void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs,
   2783                        int64_t c, uint32_t oprsz, uint32_t maxsz)
   2784 {
   2785     TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c));
   2786     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
   2787 }
   2788 
   2789 static const GVecGen2s gop_ors = {
   2790     .fni8 = tcg_gen_or_i64,
   2791     .fniv = tcg_gen_or_vec,
   2792     .fno = gen_helper_gvec_ors,
   2793     .prefer_i64 = TCG_TARGET_REG_BITS == 64,
   2794     .vece = MO_64
   2795 };
   2796 
   2797 void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs,
   2798                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
   2799 {
   2800     TCGv_i64 tmp = tcg_temp_new_i64();
   2801     tcg_gen_dup_i64(vece, tmp, c);
   2802     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
   2803     tcg_temp_free_i64(tmp);
   2804 }
   2805 
   2806 void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs,
   2807                       int64_t c, uint32_t oprsz, uint32_t maxsz)
   2808 {
   2809     TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c));
   2810     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
   2811 }
   2812 
   2813 void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
   2814 {
   2815     uint64_t mask = dup_const(MO_8, 0xff << c);
   2816     tcg_gen_shli_i64(d, a, c);
   2817     tcg_gen_andi_i64(d, d, mask);
   2818 }
   2819 
   2820 void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
   2821 {
   2822     uint64_t mask = dup_const(MO_16, 0xffff << c);
   2823     tcg_gen_shli_i64(d, a, c);
   2824     tcg_gen_andi_i64(d, d, mask);
   2825 }
   2826 
   2827 void tcg_gen_vec_shl8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
   2828 {
   2829     uint32_t mask = dup_const(MO_8, 0xff << c);
   2830     tcg_gen_shli_i32(d, a, c);
   2831     tcg_gen_andi_i32(d, d, mask);
   2832 }
   2833 
   2834 void tcg_gen_vec_shl16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
   2835 {
   2836     uint32_t mask = dup_const(MO_16, 0xffff << c);
   2837     tcg_gen_shli_i32(d, a, c);
   2838     tcg_gen_andi_i32(d, d, mask);
   2839 }
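
         /*
          * The four helpers above shift the whole register and then mask
          * off the bits that crossed a lane boundary.  E.g. at MO_8 with
          * c == 1, dup_const(MO_8, 0xff << 1) == 0xfefefefefefefefeull,
          * clearing the bit each lane received from its lower neighbour.
          */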
   2840 
   2841 void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs,
   2842                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
   2843 {
   2844     static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 };
   2845     static const GVecGen2i g[4] = {
   2846         { .fni8 = tcg_gen_vec_shl8i_i64,
   2847           .fniv = tcg_gen_shli_vec,
   2848           .fno = gen_helper_gvec_shl8i,
   2849           .opt_opc = vecop_list,
   2850           .vece = MO_8 },
   2851         { .fni8 = tcg_gen_vec_shl16i_i64,
   2852           .fniv = tcg_gen_shli_vec,
   2853           .fno = gen_helper_gvec_shl16i,
   2854           .opt_opc = vecop_list,
   2855           .vece = MO_16 },
   2856         { .fni4 = tcg_gen_shli_i32,
   2857           .fniv = tcg_gen_shli_vec,
   2858           .fno = gen_helper_gvec_shl32i,
   2859           .opt_opc = vecop_list,
   2860           .vece = MO_32 },
   2861         { .fni8 = tcg_gen_shli_i64,
   2862           .fniv = tcg_gen_shli_vec,
   2863           .fno = gen_helper_gvec_shl64i,
   2864           .opt_opc = vecop_list,
   2865           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
   2866           .vece = MO_64 },
   2867     };
   2868 
   2869     tcg_debug_assert(vece <= MO_64);
   2870     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
   2871     if (shift == 0) {
   2872         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
   2873     } else {
   2874         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
   2875     }
   2876 }
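
         /*
          * Illustrative use, with hypothetical offsets dofs/aofs into
          * cpu_env: a front end shifting eight 16-bit lanes of a 128-bit
          * register left by 3 would emit
          *
          *     tcg_gen_gvec_shli(MO_16, dofs, aofs, 3, 16, 16);
          */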
   2877 
   2878 void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
   2879 {
   2880     uint64_t mask = dup_const(MO_8, 0xff >> c);
   2881     tcg_gen_shri_i64(d, a, c);
   2882     tcg_gen_andi_i64(d, d, mask);
   2883 }
   2884 
   2885 void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
   2886 {
   2887     uint64_t mask = dup_const(MO_16, 0xffff >> c);
   2888     tcg_gen_shri_i64(d, a, c);
   2889     tcg_gen_andi_i64(d, d, mask);
   2890 }
   2891 
   2892 void tcg_gen_vec_shr8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
   2893 {
   2894     uint32_t mask = dup_const(MO_8, 0xff >> c);
   2895     tcg_gen_shri_i32(d, a, c);
   2896     tcg_gen_andi_i32(d, d, mask);
   2897 }
   2898 
   2899 void tcg_gen_vec_shr16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
   2900 {
   2901     uint32_t mask = dup_const(MO_16, 0xffff >> c);
   2902     tcg_gen_shri_i32(d, a, c);
   2903     tcg_gen_andi_i32(d, d, mask);
   2904 }
   2905 
   2906 void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs,
   2907                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
   2908 {
   2909     static const TCGOpcode vecop_list[] = { INDEX_op_shri_vec, 0 };
   2910     static const GVecGen2i g[4] = {
   2911         { .fni8 = tcg_gen_vec_shr8i_i64,
   2912           .fniv = tcg_gen_shri_vec,
   2913           .fno = gen_helper_gvec_shr8i,
   2914           .opt_opc = vecop_list,
   2915           .vece = MO_8 },
   2916         { .fni8 = tcg_gen_vec_shr16i_i64,
   2917           .fniv = tcg_gen_shri_vec,
   2918           .fno = gen_helper_gvec_shr16i,
   2919           .opt_opc = vecop_list,
   2920           .vece = MO_16 },
   2921         { .fni4 = tcg_gen_shri_i32,
   2922           .fniv = tcg_gen_shri_vec,
   2923           .fno = gen_helper_gvec_shr32i,
   2924           .opt_opc = vecop_list,
   2925           .vece = MO_32 },
   2926         { .fni8 = tcg_gen_shri_i64,
   2927           .fniv = tcg_gen_shri_vec,
   2928           .fno = gen_helper_gvec_shr64i,
   2929           .opt_opc = vecop_list,
   2930           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
   2931           .vece = MO_64 },
   2932     };
   2933 
   2934     tcg_debug_assert(vece <= MO_64);
   2935     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
   2936     if (shift == 0) {
   2937         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
   2938     } else {
   2939         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
   2940     }
   2941 }
   2942 
   2943 void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
   2944 {
   2945     uint64_t s_mask = dup_const(MO_8, 0x80 >> c);
   2946     uint64_t c_mask = dup_const(MO_8, 0xff >> c);
   2947     TCGv_i64 s = tcg_temp_new_i64();
   2948 
   2949     tcg_gen_shri_i64(d, a, c);
   2950     tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
   2951     tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
   2952     tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign  */
   2953     tcg_gen_or_i64(d, d, s);         /* include sign extension */
   2954     tcg_temp_free_i64(s);
   2955 }
   2956 
   2957 void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
   2958 {
   2959     uint64_t s_mask = dup_const(MO_16, 0x8000 >> c);
   2960     uint64_t c_mask = dup_const(MO_16, 0xffff >> c);
   2961     TCGv_i64 s = tcg_temp_new_i64();
   2962 
   2963     tcg_gen_shri_i64(d, a, c);
   2964     tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
   2965     tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign  */
   2966     tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
   2967     tcg_gen_or_i64(d, d, s);         /* include sign extension */
   2968     tcg_temp_free_i64(s);
   2969 }
   2970 
   2971 void tcg_gen_vec_sar8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
   2972 {
   2973     uint32_t s_mask = dup_const(MO_8, 0x80 >> c);
   2974     uint32_t c_mask = dup_const(MO_8, 0xff >> c);
   2975     TCGv_i32 s = tcg_temp_new_i32();
   2976 
   2977     tcg_gen_shri_i32(d, a, c);
   2978     tcg_gen_andi_i32(s, d, s_mask);  /* isolate (shifted) sign bit */
   2979     tcg_gen_muli_i32(s, s, (2 << c) - 2); /* replicate isolated signs */
   2980     tcg_gen_andi_i32(d, d, c_mask);  /* clear out bits above sign  */
   2981     tcg_gen_or_i32(d, d, s);         /* include sign extension */
   2982     tcg_temp_free_i32(s);
   2983 }
   2984 
   2985 void tcg_gen_vec_sar16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
   2986 {
   2987     uint32_t s_mask = dup_const(MO_16, 0x8000 >> c);
   2988     uint32_t c_mask = dup_const(MO_16, 0xffff >> c);
   2989     TCGv_i32 s = tcg_temp_new_i32();
   2990 
   2991     tcg_gen_shri_i32(d, a, c);
   2992     tcg_gen_andi_i32(s, d, s_mask);  /* isolate (shifted) sign bit */
   2993     tcg_gen_andi_i32(d, d, c_mask);  /* clear out bits above sign  */
   2994     tcg_gen_muli_i32(s, s, (2 << c) - 2); /* replicate isolated signs */
   2995     tcg_gen_or_i32(d, d, s);         /* include sign extension */
   2996     tcg_temp_free_i32(s);
   2997 }
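
         /*
          * Worked example of the multiply trick, for MO_8 and c == 3: after
          * the logical shift each sign bit sits at bit 4, so s_mask has 0x10
          * per byte.  (2 << 3) - 2 == 14, and 0x10 * 14 == 0xe0, setting
          * exactly the three vacated high bits, as an arithmetic shift
          * would.  The product fits in 8 bits, so lanes never interact.
          */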
   2998 
   2999 void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs,
   3000                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
   3001 {
   3002     static const TCGOpcode vecop_list[] = { INDEX_op_sari_vec, 0 };
   3003     static const GVecGen2i g[4] = {
   3004         { .fni8 = tcg_gen_vec_sar8i_i64,
   3005           .fniv = tcg_gen_sari_vec,
   3006           .fno = gen_helper_gvec_sar8i,
   3007           .opt_opc = vecop_list,
   3008           .vece = MO_8 },
   3009         { .fni8 = tcg_gen_vec_sar16i_i64,
   3010           .fniv = tcg_gen_sari_vec,
   3011           .fno = gen_helper_gvec_sar16i,
   3012           .opt_opc = vecop_list,
   3013           .vece = MO_16 },
   3014         { .fni4 = tcg_gen_sari_i32,
   3015           .fniv = tcg_gen_sari_vec,
   3016           .fno = gen_helper_gvec_sar32i,
   3017           .opt_opc = vecop_list,
   3018           .vece = MO_32 },
   3019         { .fni8 = tcg_gen_sari_i64,
   3020           .fniv = tcg_gen_sari_vec,
   3021           .fno = gen_helper_gvec_sar64i,
   3022           .opt_opc = vecop_list,
   3023           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
   3024           .vece = MO_64 },
   3025     };
   3026 
   3027     tcg_debug_assert(vece <= MO_64);
   3028     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
   3029     if (shift == 0) {
   3030         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
   3031     } else {
   3032         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
   3033     }
   3034 }
   3035 
   3036 void tcg_gen_vec_rotl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
   3037 {
   3038     uint64_t mask = dup_const(MO_8, 0xff << c);
   3039 
   3040     tcg_gen_shli_i64(d, a, c);
    3041     tcg_gen_shri_i64(a, a, 8 - c);    /* NB: 'a' is used as scratch; requires d != a */
   3042     tcg_gen_andi_i64(d, d, mask);
   3043     tcg_gen_andi_i64(a, a, ~mask);
   3044     tcg_gen_or_i64(d, d, a);
   3045 }
   3046 
   3047 void tcg_gen_vec_rotl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
   3048 {
   3049     uint64_t mask = dup_const(MO_16, 0xffff << c);
   3050 
   3051     tcg_gen_shli_i64(d, a, c);
    3052     tcg_gen_shri_i64(a, a, 16 - c);   /* NB: 'a' is used as scratch; requires d != a */
   3053     tcg_gen_andi_i64(d, d, mask);
   3054     tcg_gen_andi_i64(a, a, ~mask);
   3055     tcg_gen_or_i64(d, d, a);
   3056 }
   3057 
   3058 void tcg_gen_gvec_rotli(unsigned vece, uint32_t dofs, uint32_t aofs,
   3059                         int64_t shift, uint32_t oprsz, uint32_t maxsz)
   3060 {
   3061     static const TCGOpcode vecop_list[] = { INDEX_op_rotli_vec, 0 };
   3062     static const GVecGen2i g[4] = {
   3063         { .fni8 = tcg_gen_vec_rotl8i_i64,
   3064           .fniv = tcg_gen_rotli_vec,
   3065           .fno = gen_helper_gvec_rotl8i,
   3066           .opt_opc = vecop_list,
   3067           .vece = MO_8 },
   3068         { .fni8 = tcg_gen_vec_rotl16i_i64,
   3069           .fniv = tcg_gen_rotli_vec,
   3070           .fno = gen_helper_gvec_rotl16i,
   3071           .opt_opc = vecop_list,
   3072           .vece = MO_16 },
   3073         { .fni4 = tcg_gen_rotli_i32,
   3074           .fniv = tcg_gen_rotli_vec,
   3075           .fno = gen_helper_gvec_rotl32i,
   3076           .opt_opc = vecop_list,
   3077           .vece = MO_32 },
   3078         { .fni8 = tcg_gen_rotli_i64,
   3079           .fniv = tcg_gen_rotli_vec,
   3080           .fno = gen_helper_gvec_rotl64i,
   3081           .opt_opc = vecop_list,
   3082           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
   3083           .vece = MO_64 },
   3084     };
   3085 
   3086     tcg_debug_assert(vece <= MO_64);
   3087     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
   3088     if (shift == 0) {
   3089         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
   3090     } else {
   3091         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
   3092     }
   3093 }
   3094 
   3095 void tcg_gen_gvec_rotri(unsigned vece, uint32_t dofs, uint32_t aofs,
   3096                         int64_t shift, uint32_t oprsz, uint32_t maxsz)
   3097 {
   3098     tcg_debug_assert(vece <= MO_64);
   3099     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
   3100     tcg_gen_gvec_rotli(vece, dofs, aofs, -shift & ((8 << vece) - 1),
   3101                        oprsz, maxsz);
   3102 }
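
         /*
          * A rotate right by shift is a rotate left by (width - shift) mod
          * width; "-shift & ((8 << vece) - 1)" computes exactly that, e.g.
          * for MO_32 and shift == 8, -8 & 31 == 24.
          */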
   3103 
   3104 /*
    3105  * Specialized generation of vector shifts by a non-constant scalar.
   3106  */
   3107 
   3108 typedef struct {
    3109     void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32);      /* 32-bit integer fallback */
    3110     void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64);      /* 64-bit integer fallback */
    3111     void (*fniv_s)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32); /* vector by i32 scalar */
    3112     void (*fniv_v)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec); /* vector by dup'd vector */
    3113     gen_helper_gvec_2 *fno[4];    /* out-of-line helpers, indexed by vece */
    3114     TCGOpcode s_list[2];          /* opcodes required for the fniv_s expansion */
    3115     TCGOpcode v_list[2];          /* opcodes required for the fniv_v expansion */
   3116 } GVecGen2sh;
   3117 
   3118 static void expand_2sh_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
   3119                            uint32_t oprsz, uint32_t tysz, TCGType type,
   3120                            TCGv_i32 shift,
   3121                            void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32))
   3122 {
   3123     TCGv_vec t0 = tcg_temp_new_vec(type);
   3124     uint32_t i;
   3125 
   3126     for (i = 0; i < oprsz; i += tysz) {
   3127         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
   3128         fni(vece, t0, t0, shift);
   3129         tcg_gen_st_vec(t0, cpu_env, dofs + i);
   3130     }
   3131     tcg_temp_free_vec(t0);
   3132 }
   3133 
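         /*
          * Expansion strategy, in order of preference: (1) vector ops that
          * take the i32 shift scalar directly (s_list); (2) dup the shift
          * into a vector and use element-wise shift ops (v_list); (3) inline
          * integer code for MO_32/MO_64; (4) the out-of-line helper, with
          * the runtime shift passed through the descriptor's data field.
          */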
   3134 static void
   3135 do_gvec_shifts(unsigned vece, uint32_t dofs, uint32_t aofs, TCGv_i32 shift,
   3136                uint32_t oprsz, uint32_t maxsz, const GVecGen2sh *g)
   3137 {
   3138     TCGType type;
   3139     uint32_t some;
   3140 
   3141     check_size_align(oprsz, maxsz, dofs | aofs);
   3142     check_overlap_2(dofs, aofs, maxsz);
   3143 
   3144     /* If the backend has a scalar expansion, great.  */
   3145     type = choose_vector_type(g->s_list, vece, oprsz, vece == MO_64);
   3146     if (type) {
   3147         const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
   3148         switch (type) {
   3149         case TCG_TYPE_V256:
   3150             some = QEMU_ALIGN_DOWN(oprsz, 32);
   3151             expand_2sh_vec(vece, dofs, aofs, some, 32,
   3152                            TCG_TYPE_V256, shift, g->fniv_s);
   3153             if (some == oprsz) {
   3154                 break;
   3155             }
   3156             dofs += some;
   3157             aofs += some;
   3158             oprsz -= some;
   3159             maxsz -= some;
   3160             /* fallthru */
   3161         case TCG_TYPE_V128:
   3162             expand_2sh_vec(vece, dofs, aofs, oprsz, 16,
   3163                            TCG_TYPE_V128, shift, g->fniv_s);
   3164             break;
   3165         case TCG_TYPE_V64:
   3166             expand_2sh_vec(vece, dofs, aofs, oprsz, 8,
   3167                            TCG_TYPE_V64, shift, g->fniv_s);
   3168             break;
   3169         default:
   3170             g_assert_not_reached();
   3171         }
   3172         tcg_swap_vecop_list(hold_list);
   3173         goto clear_tail;
   3174     }
   3175 
   3176     /* If the backend supports variable vector shifts, also cool.  */
   3177     type = choose_vector_type(g->v_list, vece, oprsz, vece == MO_64);
   3178     if (type) {
   3179         const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
   3180         TCGv_vec v_shift = tcg_temp_new_vec(type);
   3181 
   3182         if (vece == MO_64) {
   3183             TCGv_i64 sh64 = tcg_temp_new_i64();
   3184             tcg_gen_extu_i32_i64(sh64, shift);
   3185             tcg_gen_dup_i64_vec(MO_64, v_shift, sh64);
   3186             tcg_temp_free_i64(sh64);
   3187         } else {
   3188             tcg_gen_dup_i32_vec(vece, v_shift, shift);
   3189         }
   3190 
   3191         switch (type) {
   3192         case TCG_TYPE_V256:
   3193             some = QEMU_ALIGN_DOWN(oprsz, 32);
   3194             expand_2s_vec(vece, dofs, aofs, some, 32, TCG_TYPE_V256,
   3195                           v_shift, false, g->fniv_v);
   3196             if (some == oprsz) {
   3197                 break;
   3198             }
   3199             dofs += some;
   3200             aofs += some;
   3201             oprsz -= some;
   3202             maxsz -= some;
   3203             /* fallthru */
   3204         case TCG_TYPE_V128:
   3205             expand_2s_vec(vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
   3206                           v_shift, false, g->fniv_v);
   3207             break;
   3208         case TCG_TYPE_V64:
   3209             expand_2s_vec(vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
   3210                           v_shift, false, g->fniv_v);
   3211             break;
   3212         default:
   3213             g_assert_not_reached();
   3214         }
   3215         tcg_temp_free_vec(v_shift);
   3216         tcg_swap_vecop_list(hold_list);
   3217         goto clear_tail;
   3218     }
   3219 
    3220     /* Otherwise fall back to integer operations... */
   3221     if (vece == MO_32 && check_size_impl(oprsz, 4)) {
   3222         expand_2s_i32(dofs, aofs, oprsz, shift, false, g->fni4);
   3223     } else if (vece == MO_64 && check_size_impl(oprsz, 8)) {
   3224         TCGv_i64 sh64 = tcg_temp_new_i64();
   3225         tcg_gen_extu_i32_i64(sh64, shift);
   3226         expand_2s_i64(dofs, aofs, oprsz, sh64, false, g->fni8);
   3227         tcg_temp_free_i64(sh64);
   3228     } else {
   3229         TCGv_ptr a0 = tcg_temp_new_ptr();
   3230         TCGv_ptr a1 = tcg_temp_new_ptr();
   3231         TCGv_i32 desc = tcg_temp_new_i32();
   3232 
    3233         tcg_gen_shli_i32(desc, shift, SIMD_DATA_SHIFT);  /* runtime shift -> data field */
   3234         tcg_gen_ori_i32(desc, desc, simd_desc(oprsz, maxsz, 0));
   3235         tcg_gen_addi_ptr(a0, cpu_env, dofs);
   3236         tcg_gen_addi_ptr(a1, cpu_env, aofs);
   3237 
   3238         g->fno[vece](a0, a1, desc);
   3239 
   3240         tcg_temp_free_ptr(a0);
   3241         tcg_temp_free_ptr(a1);
   3242         tcg_temp_free_i32(desc);
   3243         return;
   3244     }
   3245 
   3246  clear_tail:
   3247     if (oprsz < maxsz) {
   3248         expand_clr(dofs + oprsz, maxsz - oprsz);
   3249     }
   3250 }
   3251 
   3252 void tcg_gen_gvec_shls(unsigned vece, uint32_t dofs, uint32_t aofs,
   3253                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
   3254 {
   3255     static const GVecGen2sh g = {
   3256         .fni4 = tcg_gen_shl_i32,
   3257         .fni8 = tcg_gen_shl_i64,
   3258         .fniv_s = tcg_gen_shls_vec,
   3259         .fniv_v = tcg_gen_shlv_vec,
   3260         .fno = {
   3261             gen_helper_gvec_shl8i,
   3262             gen_helper_gvec_shl16i,
   3263             gen_helper_gvec_shl32i,
   3264             gen_helper_gvec_shl64i,
   3265         },
   3266         .s_list = { INDEX_op_shls_vec, 0 },
   3267         .v_list = { INDEX_op_shlv_vec, 0 },
   3268     };
   3269 
   3270     tcg_debug_assert(vece <= MO_64);
   3271     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
   3272 }
   3273 
   3274 void tcg_gen_gvec_shrs(unsigned vece, uint32_t dofs, uint32_t aofs,
   3275                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
   3276 {
   3277     static const GVecGen2sh g = {
   3278         .fni4 = tcg_gen_shr_i32,
   3279         .fni8 = tcg_gen_shr_i64,
   3280         .fniv_s = tcg_gen_shrs_vec,
   3281         .fniv_v = tcg_gen_shrv_vec,
   3282         .fno = {
   3283             gen_helper_gvec_shr8i,
   3284             gen_helper_gvec_shr16i,
   3285             gen_helper_gvec_shr32i,
   3286             gen_helper_gvec_shr64i,
   3287         },
   3288         .s_list = { INDEX_op_shrs_vec, 0 },
   3289         .v_list = { INDEX_op_shrv_vec, 0 },
   3290     };
   3291 
   3292     tcg_debug_assert(vece <= MO_64);
   3293     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
   3294 }
   3295 
   3296 void tcg_gen_gvec_sars(unsigned vece, uint32_t dofs, uint32_t aofs,
   3297                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
   3298 {
   3299     static const GVecGen2sh g = {
   3300         .fni4 = tcg_gen_sar_i32,
   3301         .fni8 = tcg_gen_sar_i64,
   3302         .fniv_s = tcg_gen_sars_vec,
   3303         .fniv_v = tcg_gen_sarv_vec,
   3304         .fno = {
   3305             gen_helper_gvec_sar8i,
   3306             gen_helper_gvec_sar16i,
   3307             gen_helper_gvec_sar32i,
   3308             gen_helper_gvec_sar64i,
   3309         },
   3310         .s_list = { INDEX_op_sars_vec, 0 },
   3311         .v_list = { INDEX_op_sarv_vec, 0 },
   3312     };
   3313 
   3314     tcg_debug_assert(vece <= MO_64);
   3315     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
   3316 }
   3317 
   3318 void tcg_gen_gvec_rotls(unsigned vece, uint32_t dofs, uint32_t aofs,
   3319                         TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
   3320 {
   3321     static const GVecGen2sh g = {
   3322         .fni4 = tcg_gen_rotl_i32,
   3323         .fni8 = tcg_gen_rotl_i64,
   3324         .fniv_s = tcg_gen_rotls_vec,
   3325         .fniv_v = tcg_gen_rotlv_vec,
   3326         .fno = {
   3327             gen_helper_gvec_rotl8i,
   3328             gen_helper_gvec_rotl16i,
   3329             gen_helper_gvec_rotl32i,
   3330             gen_helper_gvec_rotl64i,
   3331         },
   3332         .s_list = { INDEX_op_rotls_vec, 0 },
   3333         .v_list = { INDEX_op_rotlv_vec, 0 },
   3334     };
   3335 
   3336     tcg_debug_assert(vece <= MO_64);
   3337     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
   3338 }
   3339 
   3340 /*
   3341  * Expand D = A << (B % element bits)
   3342  *
    3343  * Unlike scalar shifts, where the target front end can easily fold
    3344  * the modulo into its expansion, here the generic code applies it.
    3345  * If the target naturally includes the modulo as part of the
    3346  * operation, great!  If the target has some other behaviour for
    3347  * out-of-range shifts, then it could not use this function anyway,
    3348  * and would need to do its own expansion with custom functions.
   3349  */
   3350 static void tcg_gen_shlv_mod_vec(unsigned vece, TCGv_vec d,
   3351                                  TCGv_vec a, TCGv_vec b)
   3352 {
   3353     TCGv_vec t = tcg_temp_new_vec_matching(d);
   3354     TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
   3355 
   3356     tcg_gen_and_vec(vece, t, b, m);
   3357     tcg_gen_shlv_vec(vece, d, a, t);
   3358     tcg_temp_free_vec(t);
   3359 }
   3360 
   3361 static void tcg_gen_shl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
   3362 {
   3363     TCGv_i32 t = tcg_temp_new_i32();
   3364 
   3365     tcg_gen_andi_i32(t, b, 31);
   3366     tcg_gen_shl_i32(d, a, t);
   3367     tcg_temp_free_i32(t);
   3368 }
   3369 
   3370 static void tcg_gen_shl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
   3371 {
   3372     TCGv_i64 t = tcg_temp_new_i64();
   3373 
   3374     tcg_gen_andi_i64(t, b, 63);
   3375     tcg_gen_shl_i64(d, a, t);
   3376     tcg_temp_free_i64(t);
   3377 }
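
         /*
          * Element widths are powers of two, so "b % width" in the helpers
          * above reduces to "b & (width - 1)": e.g. for MO_16 the matching
          * constant vector holds 15 in every lane.
          */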
   3378 
   3379 void tcg_gen_gvec_shlv(unsigned vece, uint32_t dofs, uint32_t aofs,
   3380                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
   3381 {
   3382     static const TCGOpcode vecop_list[] = { INDEX_op_shlv_vec, 0 };
   3383     static const GVecGen3 g[4] = {
   3384         { .fniv = tcg_gen_shlv_mod_vec,
   3385           .fno = gen_helper_gvec_shl8v,
   3386           .opt_opc = vecop_list,
   3387           .vece = MO_8 },
   3388         { .fniv = tcg_gen_shlv_mod_vec,
   3389           .fno = gen_helper_gvec_shl16v,
   3390           .opt_opc = vecop_list,
   3391           .vece = MO_16 },
   3392         { .fni4 = tcg_gen_shl_mod_i32,
   3393           .fniv = tcg_gen_shlv_mod_vec,
   3394           .fno = gen_helper_gvec_shl32v,
   3395           .opt_opc = vecop_list,
   3396           .vece = MO_32 },
   3397         { .fni8 = tcg_gen_shl_mod_i64,
   3398           .fniv = tcg_gen_shlv_mod_vec,
   3399           .fno = gen_helper_gvec_shl64v,
   3400           .opt_opc = vecop_list,
   3401           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
   3402           .vece = MO_64 },
   3403     };
   3404 
   3405     tcg_debug_assert(vece <= MO_64);
   3406     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
   3407 }
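
         /*
          * Note the MO_8/MO_16 entries above have no integer fallback
          * (.fni4/.fni8): unlike a constant shift, a variable per-element
          * shift cannot be emulated with one masked word shift, so sub-word
          * element sizes must use the vector op or the out-of-line helper.
          */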
   3408 
   3409 /*
   3410  * Similarly for logical right shifts.
   3411  */
   3412 
   3413 static void tcg_gen_shrv_mod_vec(unsigned vece, TCGv_vec d,
   3414                                  TCGv_vec a, TCGv_vec b)
   3415 {
   3416     TCGv_vec t = tcg_temp_new_vec_matching(d);
   3417     TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
   3418 
   3419     tcg_gen_and_vec(vece, t, b, m);
   3420     tcg_gen_shrv_vec(vece, d, a, t);
   3421     tcg_temp_free_vec(t);
   3422 }
   3423 
   3424 static void tcg_gen_shr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
   3425 {
   3426     TCGv_i32 t = tcg_temp_new_i32();
   3427 
   3428     tcg_gen_andi_i32(t, b, 31);
   3429     tcg_gen_shr_i32(d, a, t);
   3430     tcg_temp_free_i32(t);
   3431 }
   3432 
   3433 static void tcg_gen_shr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
   3434 {
   3435     TCGv_i64 t = tcg_temp_new_i64();
   3436 
   3437     tcg_gen_andi_i64(t, b, 63);
   3438     tcg_gen_shr_i64(d, a, t);
   3439     tcg_temp_free_i64(t);
   3440 }
   3441 
   3442 void tcg_gen_gvec_shrv(unsigned vece, uint32_t dofs, uint32_t aofs,
   3443                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
   3444 {
   3445     static const TCGOpcode vecop_list[] = { INDEX_op_shrv_vec, 0 };
   3446     static const GVecGen3 g[4] = {
   3447         { .fniv = tcg_gen_shrv_mod_vec,
   3448           .fno = gen_helper_gvec_shr8v,
   3449           .opt_opc = vecop_list,
   3450           .vece = MO_8 },
   3451         { .fniv = tcg_gen_shrv_mod_vec,
   3452           .fno = gen_helper_gvec_shr16v,
   3453           .opt_opc = vecop_list,
   3454           .vece = MO_16 },
   3455         { .fni4 = tcg_gen_shr_mod_i32,
   3456           .fniv = tcg_gen_shrv_mod_vec,
   3457           .fno = gen_helper_gvec_shr32v,
   3458           .opt_opc = vecop_list,
   3459           .vece = MO_32 },
   3460         { .fni8 = tcg_gen_shr_mod_i64,
   3461           .fniv = tcg_gen_shrv_mod_vec,
   3462           .fno = gen_helper_gvec_shr64v,
   3463           .opt_opc = vecop_list,
   3464           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
   3465           .vece = MO_64 },
   3466     };
   3467 
   3468     tcg_debug_assert(vece <= MO_64);
   3469     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
   3470 }
   3471 
   3472 /*
   3473  * Similarly for arithmetic right shifts.
   3474  */
   3475 
   3476 static void tcg_gen_sarv_mod_vec(unsigned vece, TCGv_vec d,
   3477                                  TCGv_vec a, TCGv_vec b)
   3478 {
   3479     TCGv_vec t = tcg_temp_new_vec_matching(d);
   3480     TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
   3481 
   3482     tcg_gen_and_vec(vece, t, b, m);
   3483     tcg_gen_sarv_vec(vece, d, a, t);
   3484     tcg_temp_free_vec(t);
   3485 }
   3486 
   3487 static void tcg_gen_sar_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
   3488 {
   3489     TCGv_i32 t = tcg_temp_new_i32();
   3490 
   3491     tcg_gen_andi_i32(t, b, 31);
   3492     tcg_gen_sar_i32(d, a, t);
   3493     tcg_temp_free_i32(t);
   3494 }
   3495 
   3496 static void tcg_gen_sar_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
   3497 {
   3498     TCGv_i64 t = tcg_temp_new_i64();
   3499 
   3500     tcg_gen_andi_i64(t, b, 63);
   3501     tcg_gen_sar_i64(d, a, t);
   3502     tcg_temp_free_i64(t);
   3503 }
   3504 
   3505 void tcg_gen_gvec_sarv(unsigned vece, uint32_t dofs, uint32_t aofs,
   3506                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
   3507 {
   3508     static const TCGOpcode vecop_list[] = { INDEX_op_sarv_vec, 0 };
   3509     static const GVecGen3 g[4] = {
   3510         { .fniv = tcg_gen_sarv_mod_vec,
   3511           .fno = gen_helper_gvec_sar8v,
   3512           .opt_opc = vecop_list,
   3513           .vece = MO_8 },
   3514         { .fniv = tcg_gen_sarv_mod_vec,
   3515           .fno = gen_helper_gvec_sar16v,
   3516           .opt_opc = vecop_list,
   3517           .vece = MO_16 },
   3518         { .fni4 = tcg_gen_sar_mod_i32,
   3519           .fniv = tcg_gen_sarv_mod_vec,
   3520           .fno = gen_helper_gvec_sar32v,
   3521           .opt_opc = vecop_list,
   3522           .vece = MO_32 },
   3523         { .fni8 = tcg_gen_sar_mod_i64,
   3524           .fniv = tcg_gen_sarv_mod_vec,
   3525           .fno = gen_helper_gvec_sar64v,
   3526           .opt_opc = vecop_list,
   3527           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
   3528           .vece = MO_64 },
   3529     };
   3530 
   3531     tcg_debug_assert(vece <= MO_64);
   3532     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
   3533 }
   3534 
   3535 /*
   3536  * Similarly for rotates.
   3537  */
   3538 
   3539 static void tcg_gen_rotlv_mod_vec(unsigned vece, TCGv_vec d,
   3540                                   TCGv_vec a, TCGv_vec b)
   3541 {
   3542     TCGv_vec t = tcg_temp_new_vec_matching(d);
   3543     TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
   3544 
   3545     tcg_gen_and_vec(vece, t, b, m);
   3546     tcg_gen_rotlv_vec(vece, d, a, t);
   3547     tcg_temp_free_vec(t);
   3548 }
   3549 
   3550 static void tcg_gen_rotl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
   3551 {
   3552     TCGv_i32 t = tcg_temp_new_i32();
   3553 
   3554     tcg_gen_andi_i32(t, b, 31);
   3555     tcg_gen_rotl_i32(d, a, t);
   3556     tcg_temp_free_i32(t);
   3557 }
   3558 
   3559 static void tcg_gen_rotl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
   3560 {
   3561     TCGv_i64 t = tcg_temp_new_i64();
   3562 
   3563     tcg_gen_andi_i64(t, b, 63);
   3564     tcg_gen_rotl_i64(d, a, t);
   3565     tcg_temp_free_i64(t);
   3566 }
   3567 
   3568 void tcg_gen_gvec_rotlv(unsigned vece, uint32_t dofs, uint32_t aofs,
   3569                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
   3570 {
   3571     static const TCGOpcode vecop_list[] = { INDEX_op_rotlv_vec, 0 };
   3572     static const GVecGen3 g[4] = {
   3573         { .fniv = tcg_gen_rotlv_mod_vec,
   3574           .fno = gen_helper_gvec_rotl8v,
   3575           .opt_opc = vecop_list,
   3576           .vece = MO_8 },
   3577         { .fniv = tcg_gen_rotlv_mod_vec,
   3578           .fno = gen_helper_gvec_rotl16v,
   3579           .opt_opc = vecop_list,
   3580           .vece = MO_16 },
   3581         { .fni4 = tcg_gen_rotl_mod_i32,
   3582           .fniv = tcg_gen_rotlv_mod_vec,
   3583           .fno = gen_helper_gvec_rotl32v,
   3584           .opt_opc = vecop_list,
   3585           .vece = MO_32 },
   3586         { .fni8 = tcg_gen_rotl_mod_i64,
   3587           .fniv = tcg_gen_rotlv_mod_vec,
   3588           .fno = gen_helper_gvec_rotl64v,
   3589           .opt_opc = vecop_list,
   3590           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
   3591           .vece = MO_64 },
   3592     };
   3593 
   3594     tcg_debug_assert(vece <= MO_64);
   3595     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
   3596 }
   3597 
   3598 static void tcg_gen_rotrv_mod_vec(unsigned vece, TCGv_vec d,
   3599                                   TCGv_vec a, TCGv_vec b)
   3600 {
   3601     TCGv_vec t = tcg_temp_new_vec_matching(d);
   3602     TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
   3603 
   3604     tcg_gen_and_vec(vece, t, b, m);
   3605     tcg_gen_rotrv_vec(vece, d, a, t);
   3606     tcg_temp_free_vec(t);
   3607 }
   3608 
   3609 static void tcg_gen_rotr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
   3610 {
   3611     TCGv_i32 t = tcg_temp_new_i32();
   3612 
   3613     tcg_gen_andi_i32(t, b, 31);
   3614     tcg_gen_rotr_i32(d, a, t);
   3615     tcg_temp_free_i32(t);
   3616 }
   3617 
   3618 static void tcg_gen_rotr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
   3619 {
   3620     TCGv_i64 t = tcg_temp_new_i64();
   3621 
   3622     tcg_gen_andi_i64(t, b, 63);
   3623     tcg_gen_rotr_i64(d, a, t);
   3624     tcg_temp_free_i64(t);
   3625 }
   3626 
   3627 void tcg_gen_gvec_rotrv(unsigned vece, uint32_t dofs, uint32_t aofs,
   3628                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
   3629 {
   3630     static const TCGOpcode vecop_list[] = { INDEX_op_rotrv_vec, 0 };
   3631     static const GVecGen3 g[4] = {
   3632         { .fniv = tcg_gen_rotrv_mod_vec,
   3633           .fno = gen_helper_gvec_rotr8v,
   3634           .opt_opc = vecop_list,
   3635           .vece = MO_8 },
   3636         { .fniv = tcg_gen_rotrv_mod_vec,
   3637           .fno = gen_helper_gvec_rotr16v,
   3638           .opt_opc = vecop_list,
   3639           .vece = MO_16 },
   3640         { .fni4 = tcg_gen_rotr_mod_i32,
   3641           .fniv = tcg_gen_rotrv_mod_vec,
   3642           .fno = gen_helper_gvec_rotr32v,
   3643           .opt_opc = vecop_list,
   3644           .vece = MO_32 },
   3645         { .fni8 = tcg_gen_rotr_mod_i64,
   3646           .fniv = tcg_gen_rotrv_mod_vec,
   3647           .fno = gen_helper_gvec_rotr64v,
   3648           .opt_opc = vecop_list,
   3649           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
   3650           .vece = MO_64 },
   3651     };
   3652 
   3653     tcg_debug_assert(vece <= MO_64);
   3654     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
   3655 }
   3656 
    3657 /* Expand OPSZ bytes worth of comparison operations using i32 elements.  */
   3658 static void expand_cmp_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
   3659                            uint32_t oprsz, TCGCond cond)
   3660 {
   3661     TCGv_i32 t0 = tcg_temp_new_i32();
   3662     TCGv_i32 t1 = tcg_temp_new_i32();
   3663     uint32_t i;
   3664 
   3665     for (i = 0; i < oprsz; i += 4) {
   3666         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
   3667         tcg_gen_ld_i32(t1, cpu_env, bofs + i);
   3668         tcg_gen_setcond_i32(cond, t0, t0, t1);
   3669         tcg_gen_neg_i32(t0, t0);
   3670         tcg_gen_st_i32(t0, cpu_env, dofs + i);
   3671     }
   3672     tcg_temp_free_i32(t1);
   3673     tcg_temp_free_i32(t0);
   3674 }
   3675 
   3676 static void expand_cmp_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
   3677                            uint32_t oprsz, TCGCond cond)
   3678 {
   3679     TCGv_i64 t0 = tcg_temp_new_i64();
   3680     TCGv_i64 t1 = tcg_temp_new_i64();
   3681     uint32_t i;
   3682 
   3683     for (i = 0; i < oprsz; i += 8) {
   3684         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
   3685         tcg_gen_ld_i64(t1, cpu_env, bofs + i);
   3686         tcg_gen_setcond_i64(cond, t0, t0, t1);
   3687         tcg_gen_neg_i64(t0, t0);
   3688         tcg_gen_st_i64(t0, cpu_env, dofs + i);
   3689     }
   3690     tcg_temp_free_i64(t1);
   3691     tcg_temp_free_i64(t0);
   3692 }
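
         /*
          * In both expanders above, setcond produces 0 or 1 per element;
          * negating that yields 0 or -1 (all bits set), matching the
          * all-zeros/all-ones result of a true vector comparison.
          */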
   3693 
   3694 static void expand_cmp_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
   3695                            uint32_t bofs, uint32_t oprsz, uint32_t tysz,
   3696                            TCGType type, TCGCond cond)
   3697 {
   3698     TCGv_vec t0 = tcg_temp_new_vec(type);
   3699     TCGv_vec t1 = tcg_temp_new_vec(type);
   3700     uint32_t i;
   3701 
   3702     for (i = 0; i < oprsz; i += tysz) {
   3703         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
   3704         tcg_gen_ld_vec(t1, cpu_env, bofs + i);
   3705         tcg_gen_cmp_vec(cond, vece, t0, t0, t1);
   3706         tcg_gen_st_vec(t0, cpu_env, dofs + i);
   3707     }
   3708     tcg_temp_free_vec(t1);
   3709     tcg_temp_free_vec(t0);
   3710 }
   3711 
   3712 void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
   3713                       uint32_t aofs, uint32_t bofs,
   3714                       uint32_t oprsz, uint32_t maxsz)
   3715 {
   3716     static const TCGOpcode cmp_list[] = { INDEX_op_cmp_vec, 0 };
   3717     static gen_helper_gvec_3 * const eq_fn[4] = {
   3718         gen_helper_gvec_eq8, gen_helper_gvec_eq16,
   3719         gen_helper_gvec_eq32, gen_helper_gvec_eq64
   3720     };
   3721     static gen_helper_gvec_3 * const ne_fn[4] = {
   3722         gen_helper_gvec_ne8, gen_helper_gvec_ne16,
   3723         gen_helper_gvec_ne32, gen_helper_gvec_ne64
   3724     };
   3725     static gen_helper_gvec_3 * const lt_fn[4] = {
   3726         gen_helper_gvec_lt8, gen_helper_gvec_lt16,
   3727         gen_helper_gvec_lt32, gen_helper_gvec_lt64
   3728     };
   3729     static gen_helper_gvec_3 * const le_fn[4] = {
   3730         gen_helper_gvec_le8, gen_helper_gvec_le16,
   3731         gen_helper_gvec_le32, gen_helper_gvec_le64
   3732     };
   3733     static gen_helper_gvec_3 * const ltu_fn[4] = {
   3734         gen_helper_gvec_ltu8, gen_helper_gvec_ltu16,
   3735         gen_helper_gvec_ltu32, gen_helper_gvec_ltu64
   3736     };
   3737     static gen_helper_gvec_3 * const leu_fn[4] = {
   3738         gen_helper_gvec_leu8, gen_helper_gvec_leu16,
   3739         gen_helper_gvec_leu32, gen_helper_gvec_leu64
   3740     };
   3741     static gen_helper_gvec_3 * const * const fns[16] = {
   3742         [TCG_COND_EQ] = eq_fn,
   3743         [TCG_COND_NE] = ne_fn,
   3744         [TCG_COND_LT] = lt_fn,
   3745         [TCG_COND_LE] = le_fn,
   3746         [TCG_COND_LTU] = ltu_fn,
   3747         [TCG_COND_LEU] = leu_fn,
   3748     };
   3749 
   3750     const TCGOpcode *hold_list;
   3751     TCGType type;
   3752     uint32_t some;
   3753 
   3754     check_size_align(oprsz, maxsz, dofs | aofs | bofs);
   3755     check_overlap_3(dofs, aofs, bofs, maxsz);
   3756 
   3757     if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) {
   3758         do_dup(MO_8, dofs, oprsz, maxsz,
   3759                NULL, NULL, -(cond == TCG_COND_ALWAYS));
   3760         return;
   3761     }
   3762 
   3763     /*
   3764      * Implement inline with a vector type, if possible.
   3765      * Prefer integer when 64-bit host and 64-bit comparison.
   3766      */
   3767     hold_list = tcg_swap_vecop_list(cmp_list);
   3768     type = choose_vector_type(cmp_list, vece, oprsz,
   3769                               TCG_TARGET_REG_BITS == 64 && vece == MO_64);
   3770     switch (type) {
   3771     case TCG_TYPE_V256:
   3772         /* Recall that ARM SVE allows vector sizes that are not a
   3773          * power of 2, but always a multiple of 16.  The intent is
   3774          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
   3775          */
   3776         some = QEMU_ALIGN_DOWN(oprsz, 32);
   3777         expand_cmp_vec(vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, cond);
   3778         if (some == oprsz) {
   3779             break;
   3780         }
   3781         dofs += some;
   3782         aofs += some;
   3783         bofs += some;
   3784         oprsz -= some;
   3785         maxsz -= some;
   3786         /* fallthru */
   3787     case TCG_TYPE_V128:
   3788         expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, cond);
   3789         break;
   3790     case TCG_TYPE_V64:
   3791         expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, cond);
   3792         break;
   3793 
   3794     case 0:
   3795         if (vece == MO_64 && check_size_impl(oprsz, 8)) {
   3796             expand_cmp_i64(dofs, aofs, bofs, oprsz, cond);
   3797         } else if (vece == MO_32 && check_size_impl(oprsz, 4)) {
   3798             expand_cmp_i32(dofs, aofs, bofs, oprsz, cond);
   3799         } else {
   3800             gen_helper_gvec_3 * const *fn = fns[cond];
   3801 
   3802             if (fn == NULL) {
   3803                 uint32_t tmp;
    3804                 tmp = aofs, aofs = bofs, bofs = tmp;  /* GT(a,b) == LT(b,a), etc. */
   3805                 cond = tcg_swap_cond(cond);
   3806                 fn = fns[cond];
   3807                 assert(fn != NULL);
   3808             }
   3809             tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, 0, fn[vece]);
    3810             oprsz = maxsz;  /* the out-of-line helper clears the tail itself */
   3811         }
   3812         break;
   3813 
   3814     default:
   3815         g_assert_not_reached();
   3816     }
   3817     tcg_swap_vecop_list(hold_list);
   3818 
   3819     if (oprsz < maxsz) {
   3820         expand_clr(dofs + oprsz, maxsz - oprsz);
   3821     }
   3822 }
   3823 
   3824 static void tcg_gen_bitsel_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 c)
   3825 {
   3826     TCGv_i64 t = tcg_temp_new_i64();
   3827 
   3828     tcg_gen_and_i64(t, b, a);
   3829     tcg_gen_andc_i64(d, c, a);
   3830     tcg_gen_or_i64(d, d, t);
   3831     tcg_temp_free_i64(t);
   3832 }
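
         /*
          * I.e. d = (a & b) | (~a & c): each result bit is taken from b
          * where the mask a is set and from c where it is clear.
          */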
   3833 
   3834 void tcg_gen_gvec_bitsel(unsigned vece, uint32_t dofs, uint32_t aofs,
   3835                          uint32_t bofs, uint32_t cofs,
   3836                          uint32_t oprsz, uint32_t maxsz)
   3837 {
   3838     static const GVecGen4 g = {
   3839         .fni8 = tcg_gen_bitsel_i64,
   3840         .fniv = tcg_gen_bitsel_vec,
   3841         .fno = gen_helper_gvec_bitsel,
   3842     };
   3843 
   3844     tcg_gen_gvec_4(dofs, aofs, bofs, cofs, oprsz, maxsz, &g);
   3845 }