logic-aarch64.cc - duckstation - duckstation, but archived from the revision just before upstream changed it to a proprietary software project, this version is the libre one

logic-aarch64.cc (268439B)
      1 // Copyright 2015, VIXL authors
      2 // All rights reserved.
      3 //
      4 // Redistribution and use in source and binary forms, with or without
      5 // modification, are permitted provided that the following conditions are met:
      6 //
      7 //   * Redistributions of source code must retain the above copyright notice,
      8 //     this list of conditions and the following disclaimer.
      9 //   * Redistributions in binary form must reproduce the above copyright notice,
     10 //     this list of conditions and the following disclaimer in the documentation
     11 //     and/or other materials provided with the distribution.
     12 //   * Neither the name of ARM Limited nor the names of its contributors may be
     13 //     used to endorse or promote products derived from this software without
     14 //     specific prior written permission.
     15 //
     16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS CONTRIBUTORS "AS IS" AND
     17 // ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     18 // WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     19 // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
     20 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     21 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
     22 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
     23 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
     24 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     25 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26 
     27 #ifdef VIXL_INCLUDE_SIMULATOR_AARCH64
     28 
     29 #include <cmath>
     30 
     31 #include "simulator-aarch64.h"
     32 
     33 namespace vixl {
     34 namespace aarch64 {
     35 
     36 using vixl::internal::SimFloat16;
     37 
     38 template <typename T>
     39 bool IsFloat64() {
     40   return false;
     41 }
     42 template <>
     43 bool IsFloat64<double>() {
     44   return true;
     45 }
     46 
     47 template <typename T>
     48 bool IsFloat32() {
     49   return false;
     50 }
     51 template <>
     52 bool IsFloat32<float>() {
     53   return true;
     54 }
     55 
     56 template <typename T>
     57 bool IsFloat16() {
     58   return false;
     59 }
     60 template <>
     61 bool IsFloat16<Float16>() {
     62   return true;
     63 }
     64 template <>
     65 bool IsFloat16<SimFloat16>() {
     66   return true;
     67 }
     68 
     69 template <>
     70 double Simulator::FPDefaultNaN<double>() {
     71   return kFP64DefaultNaN;
     72 }
     73 
     74 
     75 template <>
     76 float Simulator::FPDefaultNaN<float>() {
     77   return kFP32DefaultNaN;
     78 }
     79 
     80 
     81 template <>
     82 SimFloat16 Simulator::FPDefaultNaN<SimFloat16>() {
     83   return SimFloat16(kFP16DefaultNaN);
     84 }
     85 
     86 
     87 double Simulator::FixedToDouble(int64_t src, int fbits, FPRounding round) {
     88   if (src >= 0) {
     89     return UFixedToDouble(src, fbits, round);
     90   } else if (src == INT64_MIN) {
     91     return -UFixedToDouble(src, fbits, round);
     92   } else {
     93     return -UFixedToDouble(-src, fbits, round);
     94   }
     95 }
     96 
     97 
     98 double Simulator::UFixedToDouble(uint64_t src, int fbits, FPRounding round) {
     99   // An input of 0 is a special case because the result is effectively
    100   // subnormal: The exponent is encoded as 0 and there is no implicit 1 bit.
    101   if (src == 0) {
    102     return 0.0;
    103   }
    104 
    105   // Calculate the exponent. The highest significant bit will have the value
    106   // 2^exponent.
    107   const int highest_significant_bit = 63 - CountLeadingZeros(src);
    108   const int64_t exponent = highest_significant_bit - fbits;
    109 
    110   return FPRoundToDouble(0, exponent, src, round);
    111 }
    112 
    113 
    114 float Simulator::FixedToFloat(int64_t src, int fbits, FPRounding round) {
    115   if (src >= 0) {
    116     return UFixedToFloat(src, fbits, round);
    117   } else if (src == INT64_MIN) {
    118     return -UFixedToFloat(src, fbits, round);
    119   } else {
    120     return -UFixedToFloat(-src, fbits, round);
    121   }
    122 }
    123 
    124 
    125 float Simulator::UFixedToFloat(uint64_t src, int fbits, FPRounding round) {
    126   // An input of 0 is a special case because the result is effectively
    127   // subnormal: The exponent is encoded as 0 and there is no implicit 1 bit.
    128   if (src == 0) {
    129     return 0.0f;
    130   }
    131 
    132   // Calculate the exponent. The highest significant bit will have the value
    133   // 2^exponent.
    134   const int highest_significant_bit = 63 - CountLeadingZeros(src);
    135   const int32_t exponent = highest_significant_bit - fbits;
    136 
    137   return FPRoundToFloat(0, exponent, src, round);
    138 }
    139 
    140 
    141 SimFloat16 Simulator::FixedToFloat16(int64_t src, int fbits, FPRounding round) {
    142   if (src >= 0) {
    143     return UFixedToFloat16(src, fbits, round);
    144   } else if (src == INT64_MIN) {
    145     return -UFixedToFloat16(src, fbits, round);
    146   } else {
    147     return -UFixedToFloat16(-src, fbits, round);
    148   }
    149 }
    150 
    151 
    152 SimFloat16 Simulator::UFixedToFloat16(uint64_t src,
    153                                       int fbits,
    154                                       FPRounding round) {
    155   // An input of 0 is a special case because the result is effectively
    156   // subnormal: The exponent is encoded as 0 and there is no implicit 1 bit.
    157   if (src == 0) {
    158     return 0.0f;
    159   }
    160 
    161   // Calculate the exponent. The highest significant bit will have the value
    162   // 2^exponent.
    163   const int highest_significant_bit = 63 - CountLeadingZeros(src);
    164   const int16_t exponent = highest_significant_bit - fbits;
    165 
    166   return FPRoundToFloat16(0, exponent, src, round);
    167 }
    168 
    169 
    170 uint64_t Simulator::GenerateRandomTag(uint16_t exclude) {
    171   uint64_t rtag = nrand48(rand_state_) >> 28;
    172   VIXL_ASSERT(IsUint4(rtag));
    173 
    174   if (exclude == 0) {
    175     exclude = nrand48(rand_state_) >> 27;
    176   }
    177 
    178   // TODO: implement this to better match the specification, which calls for a
    179   // true random mode, and a pseudo-random mode with state (EL1.TAG) modified by
    180   // PRNG.
    181   return ChooseNonExcludedTag(rtag, 0, exclude);
    182 }
    183 
    184 
    185 bool Simulator::ld1(VectorFormat vform, LogicVRegister dst, uint64_t addr) {
    186   dst.ClearForWrite(vform);
    187   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
    188     if (!LoadLane(dst, vform, i, addr)) {
    189       return false;
    190     }
    191     addr += LaneSizeInBytesFromFormat(vform);
    192   }
    193   return true;
    194 }
    195 
    196 
    197 bool Simulator::ld1(VectorFormat vform,
    198                     LogicVRegister dst,
    199                     int index,
    200                     uint64_t addr) {
    201   dst.ClearForWrite(vform);
    202   return LoadLane(dst, vform, index, addr);
    203 }
    204 
    205 
    206 bool Simulator::ld1r(VectorFormat vform,
    207                      VectorFormat unpack_vform,
    208                      LogicVRegister dst,
    209                      uint64_t addr,
    210                      bool is_signed) {
    211   unsigned unpack_size = LaneSizeInBytesFromFormat(unpack_vform);
    212   dst.ClearForWrite(vform);
    213   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
    214     if (is_signed) {
    215       if (!LoadIntToLane(dst, vform, unpack_size, i, addr)) {
    216         return false;
    217       }
    218     } else {
    219       if (!LoadUintToLane(dst, vform, unpack_size, i, addr)) {
    220         return false;
    221       }
    222     }
    223   }
    224   return true;
    225 }
    226 
    227 
    228 bool Simulator::ld1r(VectorFormat vform, LogicVRegister dst, uint64_t addr) {
    229   return ld1r(vform, vform, dst, addr);
    230 }
    231 
    232 
    233 bool Simulator::ld2(VectorFormat vform,
    234                     LogicVRegister dst1,
    235                     LogicVRegister dst2,
    236                     uint64_t addr1) {
    237   dst1.ClearForWrite(vform);
    238   dst2.ClearForWrite(vform);
    239   int esize = LaneSizeInBytesFromFormat(vform);
    240   uint64_t addr2 = addr1 + esize;
    241   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
    242     if (!LoadLane(dst1, vform, i, addr1) || !LoadLane(dst2, vform, i, addr2)) {
    243       return false;
    244     }
    245     addr1 += 2 * esize;
    246     addr2 += 2 * esize;
    247   }
    248   return true;
    249 }
    250 
    251 
    252 bool Simulator::ld2(VectorFormat vform,
    253                     LogicVRegister dst1,
    254                     LogicVRegister dst2,
    255                     int index,
    256                     uint64_t addr1) {
    257   dst1.ClearForWrite(vform);
    258   dst2.ClearForWrite(vform);
    259   uint64_t addr2 = addr1 + LaneSizeInBytesFromFormat(vform);
    260   return (LoadLane(dst1, vform, index, addr1) &&
    261           LoadLane(dst2, vform, index, addr2));
    262 }
    263 
    264 
    265 bool Simulator::ld2r(VectorFormat vform,
    266                      LogicVRegister dst1,
    267                      LogicVRegister dst2,
    268                      uint64_t addr) {
    269   dst1.ClearForWrite(vform);
    270   dst2.ClearForWrite(vform);
    271   uint64_t addr2 = addr + LaneSizeInBytesFromFormat(vform);
    272   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
    273     if (!LoadLane(dst1, vform, i, addr) || !LoadLane(dst2, vform, i, addr2)) {
    274       return false;
    275     }
    276   }
    277   return true;
    278 }
    279 
    280 
    281 bool Simulator::ld3(VectorFormat vform,
    282                     LogicVRegister dst1,
    283                     LogicVRegister dst2,
    284                     LogicVRegister dst3,
    285                     uint64_t addr1) {
    286   dst1.ClearForWrite(vform);
    287   dst2.ClearForWrite(vform);
    288   dst3.ClearForWrite(vform);
    289   int esize = LaneSizeInBytesFromFormat(vform);
    290   uint64_t addr2 = addr1 + esize;
    291   uint64_t addr3 = addr2 + esize;
    292   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
    293     if (!LoadLane(dst1, vform, i, addr1) || !LoadLane(dst2, vform, i, addr2) ||
    294         !LoadLane(dst3, vform, i, addr3)) {
    295       return false;
    296     }
    297     addr1 += 3 * esize;
    298     addr2 += 3 * esize;
    299     addr3 += 3 * esize;
    300   }
    301   return true;
    302 }
    303 
    304 
    305 bool Simulator::ld3(VectorFormat vform,
    306                     LogicVRegister dst1,
    307                     LogicVRegister dst2,
    308                     LogicVRegister dst3,
    309                     int index,
    310                     uint64_t addr1) {
    311   dst1.ClearForWrite(vform);
    312   dst2.ClearForWrite(vform);
    313   dst3.ClearForWrite(vform);
    314   uint64_t addr2 = addr1 + LaneSizeInBytesFromFormat(vform);
    315   uint64_t addr3 = addr2 + LaneSizeInBytesFromFormat(vform);
    316   return (LoadLane(dst1, vform, index, addr1) &&
    317           LoadLane(dst2, vform, index, addr2) &&
    318           LoadLane(dst3, vform, index, addr3));
    319 }
    320 
    321 
    322 bool Simulator::ld3r(VectorFormat vform,
    323                      LogicVRegister dst1,
    324                      LogicVRegister dst2,
    325                      LogicVRegister dst3,
    326                      uint64_t addr) {
    327   dst1.ClearForWrite(vform);
    328   dst2.ClearForWrite(vform);
    329   dst3.ClearForWrite(vform);
    330   uint64_t addr2 = addr + LaneSizeInBytesFromFormat(vform);
    331   uint64_t addr3 = addr2 + LaneSizeInBytesFromFormat(vform);
    332   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
    333     if (!LoadLane(dst1, vform, i, addr) || !LoadLane(dst2, vform, i, addr2) ||
    334         !LoadLane(dst3, vform, i, addr3)) {
    335       return false;
    336     }
    337   }
    338   return true;
    339 }
    340 
    341 
    342 bool Simulator::ld4(VectorFormat vform,
    343                     LogicVRegister dst1,
    344                     LogicVRegister dst2,
    345                     LogicVRegister dst3,
    346                     LogicVRegister dst4,
    347                     uint64_t addr1) {
    348   dst1.ClearForWrite(vform);
    349   dst2.ClearForWrite(vform);
    350   dst3.ClearForWrite(vform);
    351   dst4.ClearForWrite(vform);
    352   int esize = LaneSizeInBytesFromFormat(vform);
    353   uint64_t addr2 = addr1 + esize;
    354   uint64_t addr3 = addr2 + esize;
    355   uint64_t addr4 = addr3 + esize;
    356   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
    357     if (!LoadLane(dst1, vform, i, addr1) || !LoadLane(dst2, vform, i, addr2) ||
    358         !LoadLane(dst3, vform, i, addr3) || !LoadLane(dst4, vform, i, addr4)) {
    359       return false;
    360     }
    361     addr1 += 4 * esize;
    362     addr2 += 4 * esize;
    363     addr3 += 4 * esize;
    364     addr4 += 4 * esize;
    365   }
    366   return true;
    367 }
    368 
    369 
    370 bool Simulator::ld4(VectorFormat vform,
    371                     LogicVRegister dst1,
    372                     LogicVRegister dst2,
    373                     LogicVRegister dst3,
    374                     LogicVRegister dst4,
    375                     int index,
    376                     uint64_t addr1) {
    377   dst1.ClearForWrite(vform);
    378   dst2.ClearForWrite(vform);
    379   dst3.ClearForWrite(vform);
    380   dst4.ClearForWrite(vform);
    381   uint64_t addr2 = addr1 + LaneSizeInBytesFromFormat(vform);
    382   uint64_t addr3 = addr2 + LaneSizeInBytesFromFormat(vform);
    383   uint64_t addr4 = addr3 + LaneSizeInBytesFromFormat(vform);
    384   return (LoadLane(dst1, vform, index, addr1) &&
    385           LoadLane(dst2, vform, index, addr2) &&
    386           LoadLane(dst3, vform, index, addr3) &&
    387           LoadLane(dst4, vform, index, addr4));
    388 }
    389 
    390 
    391 bool Simulator::ld4r(VectorFormat vform,
    392                      LogicVRegister dst1,
    393                      LogicVRegister dst2,
    394                      LogicVRegister dst3,
    395                      LogicVRegister dst4,
    396                      uint64_t addr) {
    397   dst1.ClearForWrite(vform);
    398   dst2.ClearForWrite(vform);
    399   dst3.ClearForWrite(vform);
    400   dst4.ClearForWrite(vform);
    401   uint64_t addr2 = addr + LaneSizeInBytesFromFormat(vform);
    402   uint64_t addr3 = addr2 + LaneSizeInBytesFromFormat(vform);
    403   uint64_t addr4 = addr3 + LaneSizeInBytesFromFormat(vform);
    404   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
    405     if (!LoadLane(dst1, vform, i, addr) || !LoadLane(dst2, vform, i, addr2) ||
    406         !LoadLane(dst3, vform, i, addr3) || !LoadLane(dst4, vform, i, addr4)) {
    407       return false;
    408     }
    409   }
    410   return true;
    411 }
    412 
    413 
    414 bool Simulator::st1(VectorFormat vform, LogicVRegister src, uint64_t addr) {
    415   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
    416     if (!StoreLane(src, vform, i, addr)) return false;
    417     addr += LaneSizeInBytesFromFormat(vform);
    418   }
    419   return true;
    420 }
    421 
    422 
    423 bool Simulator::st1(VectorFormat vform,
    424                     LogicVRegister src,
    425                     int index,
    426                     uint64_t addr) {
    427   return StoreLane(src, vform, index, addr);
    428 }
    429 
    430 
    431 bool Simulator::st2(VectorFormat vform,
    432                     LogicVRegister src,
    433                     LogicVRegister src2,
    434                     uint64_t addr) {
    435   int esize = LaneSizeInBytesFromFormat(vform);
    436   uint64_t addr2 = addr + esize;
    437   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
    438     if (!StoreLane(src, vform, i, addr) || !StoreLane(src2, vform, i, addr2)) {
    439       return false;
    440     }
    441     addr += 2 * esize;
    442     addr2 += 2 * esize;
    443   }
    444   return true;
    445 }
    446 
    447 
    448 bool Simulator::st2(VectorFormat vform,
    449                     LogicVRegister src,
    450                     LogicVRegister src2,
    451                     int index,
    452                     uint64_t addr) {
    453   int esize = LaneSizeInBytesFromFormat(vform);
    454   return (StoreLane(src, vform, index, addr) &&
    455           StoreLane(src2, vform, index, addr + 1 * esize));
    456 }
    457 
    458 
    459 bool Simulator::st3(VectorFormat vform,
    460                     LogicVRegister src,
    461                     LogicVRegister src2,
    462                     LogicVRegister src3,
    463                     uint64_t addr) {
    464   int esize = LaneSizeInBytesFromFormat(vform);
    465   uint64_t addr2 = addr + esize;
    466   uint64_t addr3 = addr2 + esize;
    467   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
    468     if (!StoreLane(src, vform, i, addr) || !StoreLane(src2, vform, i, addr2) ||
    469         !StoreLane(src3, vform, i, addr3)) {
    470       return false;
    471     }
    472     addr += 3 * esize;
    473     addr2 += 3 * esize;
    474     addr3 += 3 * esize;
    475   }
    476   return true;
    477 }
    478 
    479 
    480 bool Simulator::st3(VectorFormat vform,
    481                     LogicVRegister src,
    482                     LogicVRegister src2,
    483                     LogicVRegister src3,
    484                     int index,
    485                     uint64_t addr) {
    486   int esize = LaneSizeInBytesFromFormat(vform);
    487   return (StoreLane(src, vform, index, addr) &&
    488           StoreLane(src2, vform, index, addr + 1 * esize) &&
    489           StoreLane(src3, vform, index, addr + 2 * esize));
    490 }
    491 
    492 
    493 bool Simulator::st4(VectorFormat vform,
    494                     LogicVRegister src,
    495                     LogicVRegister src2,
    496                     LogicVRegister src3,
    497                     LogicVRegister src4,
    498                     uint64_t addr) {
    499   int esize = LaneSizeInBytesFromFormat(vform);
    500   uint64_t addr2 = addr + esize;
    501   uint64_t addr3 = addr2 + esize;
    502   uint64_t addr4 = addr3 + esize;
    503   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
    504     if (!StoreLane(src, vform, i, addr) || !StoreLane(src2, vform, i, addr2) ||
    505         !StoreLane(src3, vform, i, addr3) ||
    506         !StoreLane(src4, vform, i, addr4)) {
    507       return false;
    508     }
    509     addr += 4 * esize;
    510     addr2 += 4 * esize;
    511     addr3 += 4 * esize;
    512     addr4 += 4 * esize;
    513   }
    514   return true;
    515 }
    516 
    517 
    518 bool Simulator::st4(VectorFormat vform,
    519                     LogicVRegister src,
    520                     LogicVRegister src2,
    521                     LogicVRegister src3,
    522                     LogicVRegister src4,
    523                     int index,
    524                     uint64_t addr) {
    525   int esize = LaneSizeInBytesFromFormat(vform);
    526   return (StoreLane(src, vform, index, addr) &&
    527           StoreLane(src2, vform, index, addr + 1 * esize) &&
    528           StoreLane(src3, vform, index, addr + 2 * esize) &&
    529           StoreLane(src4, vform, index, addr + 3 * esize));
    530 }
    531 
    532 
    533 LogicVRegister Simulator::cmp(VectorFormat vform,
    534                               LogicVRegister dst,
    535                               const LogicVRegister& src1,
    536                               const LogicVRegister& src2,
    537                               Condition cond) {
    538   dst.ClearForWrite(vform);
    539   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
    540     int64_t sa = src1.Int(vform, i);
    541     int64_t sb = src2.Int(vform, i);
    542     uint64_t ua = src1.Uint(vform, i);
    543     uint64_t ub = src2.Uint(vform, i);
    544     bool result = false;
    545     switch (cond) {
    546       case eq:
    547         result = (ua == ub);
    548         break;
    549       case ge:
    550         result = (sa >= sb);
    551         break;
    552       case gt:
    553         result = (sa > sb);
    554         break;
    555       case hi:
    556         result = (ua > ub);
    557         break;
    558       case hs:
    559         result = (ua >= ub);
    560         break;
    561       case lt:
    562         result = (sa < sb);
    563         break;
    564       case le:
    565         result = (sa <= sb);
    566         break;
    567       default:
    568         VIXL_UNREACHABLE();
    569         break;
    570     }
    571     dst.SetUint(vform, i, result ? MaxUintFromFormat(vform) : 0);
    572   }
    573   return dst;
    574 }
    575 
    576 
    577 LogicVRegister Simulator::cmp(VectorFormat vform,
    578                               LogicVRegister dst,
    579                               const LogicVRegister& src1,
    580                               int imm,
    581                               Condition cond) {
    582   SimVRegister temp;
    583   LogicVRegister imm_reg = dup_immediate(vform, temp, imm);
    584   return cmp(vform, dst, src1, imm_reg, cond);
    585 }
    586 
    587 
    588 LogicVRegister Simulator::cmptst(VectorFormat vform,
    589                                  LogicVRegister dst,
    590                                  const LogicVRegister& src1,
    591                                  const LogicVRegister& src2) {
    592   dst.ClearForWrite(vform);
    593   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
    594     uint64_t ua = src1.Uint(vform, i);
    595     uint64_t ub = src2.Uint(vform, i);
    596     dst.SetUint(vform, i, ((ua & ub) != 0) ? MaxUintFromFormat(vform) : 0);
    597   }
    598   return dst;
    599 }
    600 
    601 
    602 LogicVRegister Simulator::add(VectorFormat vform,
    603                               LogicVRegister dst,
    604                               const LogicVRegister& src1,
    605                               const LogicVRegister& src2) {
    606   int lane_size = LaneSizeInBitsFromFormat(vform);
    607   dst.ClearForWrite(vform);
    608 
    609   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
    610     // Test for unsigned saturation.
    611     uint64_t ua = src1.UintLeftJustified(vform, i);
    612     uint64_t ub = src2.UintLeftJustified(vform, i);
    613     uint64_t ur = ua + ub;
    614     if (ur < ua) {
    615       dst.SetUnsignedSat(i, true);
    616     }
    617 
    618     // Test for signed saturation.
    619     bool pos_a = (ua >> 63) == 0;
    620     bool pos_b = (ub >> 63) == 0;
    621     bool pos_r = (ur >> 63) == 0;
    622     // If the signs of the operands are the same, but different from the result,
    623     // there was an overflow.
    624     if ((pos_a == pos_b) && (pos_a != pos_r)) {
    625       dst.SetSignedSat(i, pos_a);
    626     }
    627     dst.SetInt(vform, i, ur >> (64 - lane_size));
    628   }
    629   return dst;
    630 }
    631 
    632 LogicVRegister Simulator::add_uint(VectorFormat vform,
    633                                    LogicVRegister dst,
    634                                    const LogicVRegister& src1,
    635                                    uint64_t value) {
    636   int lane_size = LaneSizeInBitsFromFormat(vform);
    637   VIXL_ASSERT(IsUintN(lane_size, value));
    638   dst.ClearForWrite(vform);
    639   // Left-justify `value`.
    640   uint64_t ub = value << (64 - lane_size);
    641   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
    642     // Test for unsigned saturation.
    643     uint64_t ua = src1.UintLeftJustified(vform, i);
    644     uint64_t ur = ua + ub;
    645     if (ur < ua) {
    646       dst.SetUnsignedSat(i, true);
    647     }
    648 
    649     // Test for signed saturation.
    650     // `value` is always positive, so we have an overflow if the (signed) result
    651     // is smaller than the first operand.
    652     if (RawbitsToInt64(ur) < RawbitsToInt64(ua)) {
    653       dst.SetSignedSat(i, true);
    654     }
    655 
    656     dst.SetInt(vform, i, ur >> (64 - lane_size));
    657   }
    658   return dst;
    659 }
    660 
    661 LogicVRegister Simulator::addp(VectorFormat vform,
    662                                LogicVRegister dst,
    663                                const LogicVRegister& src1,
    664                                const LogicVRegister& src2) {
    665   SimVRegister temp1, temp2;
    666   uzp1(vform, temp1, src1, src2);
    667   uzp2(vform, temp2, src1, src2);
    668   add(vform, dst, temp1, temp2);
    669   if (IsSVEFormat(vform)) {
    670     interleave_top_bottom(vform, dst, dst);
    671   }
    672   return dst;
    673 }
    674 
    675 LogicVRegister Simulator::sdiv(VectorFormat vform,
    676                                LogicVRegister dst,
    677                                const LogicVRegister& src1,
    678                                const LogicVRegister& src2) {
    679   VIXL_ASSERT((vform == kFormatVnS) || (vform == kFormatVnD));
    680 
    681   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
    682     int64_t val1 = src1.Int(vform, i);
    683     int64_t val2 = src2.Int(vform, i);
    684     int64_t min_int = (vform == kFormatVnD) ? kXMinInt : kWMinInt;
    685     int64_t quotient = 0;
    686     if ((val1 == min_int) && (val2 == -1)) {
    687       quotient = min_int;
    688     } else if (val2 != 0) {
    689       quotient = val1 / val2;
    690     }
    691     dst.SetInt(vform, i, quotient);
    692   }
    693 
    694   return dst;
    695 }
    696 
    697 LogicVRegister Simulator::udiv(VectorFormat vform,
    698                                LogicVRegister dst,
    699                                const LogicVRegister& src1,
    700                                const LogicVRegister& src2) {
    701   VIXL_ASSERT((vform == kFormatVnS) || (vform == kFormatVnD));
    702 
    703   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
    704     uint64_t val1 = src1.Uint(vform, i);
    705     uint64_t val2 = src2.Uint(vform, i);
    706     uint64_t quotient = 0;
    707     if (val2 != 0) {
    708       quotient = val1 / val2;
    709     }
    710     dst.SetUint(vform, i, quotient);
    711   }
    712 
    713   return dst;
    714 }
    715 
    716 
    717 LogicVRegister Simulator::mla(VectorFormat vform,
    718                               LogicVRegister dst,
    719                               const LogicVRegister& srca,
    720                               const LogicVRegister& src1,
    721                               const LogicVRegister& src2) {
    722   SimVRegister temp;
    723   mul(vform, temp, src1, src2);
    724   add(vform, dst, srca, temp);
    725   return dst;
    726 }
    727 
    728 
    729 LogicVRegister Simulator::mls(VectorFormat vform,
    730                               LogicVRegister dst,
    731                               const LogicVRegister& srca,
    732                               const LogicVRegister& src1,
    733                               const LogicVRegister& src2) {
    734   SimVRegister temp;
    735   mul(vform, temp, src1, src2);
    736   sub(vform, dst, srca, temp);
    737   return dst;
    738 }
    739 
    740 
    741 LogicVRegister Simulator::mul(VectorFormat vform,
    742                               LogicVRegister dst,
    743                               const LogicVRegister& src1,
    744                               const LogicVRegister& src2) {
    745   dst.ClearForWrite(vform);
    746 
    747   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
    748     dst.SetUint(vform, i, src1.Uint(vform, i) * src2.Uint(vform, i));
    749   }
    750   return dst;
    751 }
    752 
    753 
    754 LogicVRegister Simulator::mul(VectorFormat vform,
    755                               LogicVRegister dst,
    756                               const LogicVRegister& src1,
    757                               const LogicVRegister& src2,
    758                               int index) {
    759   SimVRegister temp;
    760   VectorFormat indexform = VectorFormatFillQ(vform);
    761   return mul(vform, dst, src1, dup_element(indexform, temp, src2, index));
    762 }
    763 
    764 
    765 LogicVRegister Simulator::smulh(VectorFormat vform,
    766                                 LogicVRegister dst,
    767                                 const LogicVRegister& src1,
    768                                 const LogicVRegister& src2) {
    769   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
    770     int64_t dst_val = 0xbadbeef;
    771     int64_t val1 = src1.Int(vform, i);
    772     int64_t val2 = src2.Int(vform, i);
    773     switch (LaneSizeInBitsFromFormat(vform)) {
    774       case 8:
    775         dst_val = internal::MultiplyHigh<8>(val1, val2);
    776         break;
    777       case 16:
    778         dst_val = internal::MultiplyHigh<16>(val1, val2);
    779         break;
    780       case 32:
    781         dst_val = internal::MultiplyHigh<32>(val1, val2);
    782         break;
    783       case 64:
    784         dst_val = internal::MultiplyHigh<64>(val1, val2);
    785         break;
    786       default:
    787         VIXL_UNREACHABLE();
    788         break;
    789     }
    790     dst.SetInt(vform, i, dst_val);
    791   }
    792   return dst;
    793 }
    794 
    795 
    796 LogicVRegister Simulator::umulh(VectorFormat vform,
    797                                 LogicVRegister dst,
    798                                 const LogicVRegister& src1,
    799                                 const LogicVRegister& src2) {
    800   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
    801     uint64_t dst_val = 0xbadbeef;
    802     uint64_t val1 = src1.Uint(vform, i);
    803     uint64_t val2 = src2.Uint(vform, i);
    804     switch (LaneSizeInBitsFromFormat(vform)) {
    805       case 8:
    806         dst_val = internal::MultiplyHigh<8>(val1, val2);
    807         break;
    808       case 16:
    809         dst_val = internal::MultiplyHigh<16>(val1, val2);
    810         break;
    811       case 32:
    812         dst_val = internal::MultiplyHigh<32>(val1, val2);
    813         break;
    814       case 64:
    815         dst_val = internal::MultiplyHigh<64>(val1, val2);
    816         break;
    817       default:
    818         VIXL_UNREACHABLE();
    819         break;
    820     }
    821     dst.SetUint(vform, i, dst_val);
    822   }
    823   return dst;
    824 }
    825 
    826 
    827 LogicVRegister Simulator::mla(VectorFormat vform,
    828                               LogicVRegister dst,
    829                               const LogicVRegister& src1,
    830                               const LogicVRegister& src2,
    831                               int index) {
    832   SimVRegister temp;
    833   VectorFormat indexform = VectorFormatFillQ(vform);
    834   return mla(vform, dst, dst, src1, dup_element(indexform, temp, src2, index));
    835 }
    836 
    837 
    838 LogicVRegister Simulator::mls(VectorFormat vform,
    839                               LogicVRegister dst,
    840                               const LogicVRegister& src1,
    841                               const LogicVRegister& src2,
    842                               int index) {
    843   SimVRegister temp;
    844   VectorFormat indexform = VectorFormatFillQ(vform);
    845   return mls(vform, dst, dst, src1, dup_element(indexform, temp, src2, index));
    846 }
    847 
    848 LogicVRegister Simulator::sqdmull(VectorFormat vform,
    849                                   LogicVRegister dst,
    850                                   const LogicVRegister& src1,
    851                                   const LogicVRegister& src2,
    852                                   int index) {
    853   SimVRegister temp;
    854   VectorFormat indexform =
    855       VectorFormatHalfWidthDoubleLanes(VectorFormatFillQ(vform));
    856   return sqdmull(vform, dst, src1, dup_element(indexform, temp, src2, index));
    857 }
    858 
    859 LogicVRegister Simulator::sqdmlal(VectorFormat vform,
    860                                   LogicVRegister dst,
    861                                   const LogicVRegister& src1,
    862                                   const LogicVRegister& src2,
    863                                   int index) {
    864   SimVRegister temp;
    865   VectorFormat indexform =
    866       VectorFormatHalfWidthDoubleLanes(VectorFormatFillQ(vform));
    867   return sqdmlal(vform, dst, src1, dup_element(indexform, temp, src2, index));
    868 }
    869 
    870 LogicVRegister Simulator::sqdmlsl(VectorFormat vform,
    871                                   LogicVRegister dst,
    872                                   const LogicVRegister& src1,
    873                                   const LogicVRegister& src2,
    874                                   int index) {
    875   SimVRegister temp;
    876   VectorFormat indexform =
    877       VectorFormatHalfWidthDoubleLanes(VectorFormatFillQ(vform));
    878   return sqdmlsl(vform, dst, src1, dup_element(indexform, temp, src2, index));
    879 }
    880 
    881 LogicVRegister Simulator::sqdmulh(VectorFormat vform,
    882                                   LogicVRegister dst,
    883                                   const LogicVRegister& src1,
    884                                   const LogicVRegister& src2,
    885                                   int index) {
    886   SimVRegister temp;
    887   VectorFormat indexform = VectorFormatFillQ(vform);
    888   return sqdmulh(vform, dst, src1, dup_element(indexform, temp, src2, index));
    889 }
    890 
    891 
    892 LogicVRegister Simulator::sqrdmulh(VectorFormat vform,
    893                                    LogicVRegister dst,
    894                                    const LogicVRegister& src1,
    895                                    const LogicVRegister& src2,
    896                                    int index) {
    897   SimVRegister temp;
    898   VectorFormat indexform = VectorFormatFillQ(vform);
    899   return sqrdmulh(vform, dst, src1, dup_element(indexform, temp, src2, index));
    900 }
    901 
    902 
    903 LogicVRegister Simulator::sqrdmlah(VectorFormat vform,
    904                                    LogicVRegister dst,
    905                                    const LogicVRegister& src1,
    906                                    const LogicVRegister& src2,
    907                                    int index) {
    908   SimVRegister temp;
    909   VectorFormat indexform = VectorFormatFillQ(vform);
    910   return sqrdmlah(vform, dst, src1, dup_element(indexform, temp, src2, index));
    911 }
    912 
    913 
    914 LogicVRegister Simulator::sqrdmlsh(VectorFormat vform,
    915                                    LogicVRegister dst,
    916                                    const LogicVRegister& src1,
    917                                    const LogicVRegister& src2,
    918                                    int index) {
    919   SimVRegister temp;
    920   VectorFormat indexform = VectorFormatFillQ(vform);
    921   return sqrdmlsh(vform, dst, src1, dup_element(indexform, temp, src2, index));
    922 }
    923 
    924 uint64_t Simulator::PolynomialMult(uint64_t op1,
    925                                    uint64_t op2,
    926                                    int lane_size_in_bits) const {
    927   return PolynomialMult128(op1, op2, lane_size_in_bits).second;
    928 }
    929 
    930 LogicVRegister Simulator::pmul(VectorFormat vform,
    931                                LogicVRegister dst,
    932                                const LogicVRegister& src1,
    933                                const LogicVRegister& src2) {
    934   dst.ClearForWrite(vform);
    935   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
    936     dst.SetUint(vform,
    937                 i,
    938                 PolynomialMult(src1.Uint(vform, i),
    939                                src2.Uint(vform, i),
    940                                LaneSizeInBitsFromFormat(vform)));
    941   }
    942   return dst;
    943 }
    944 
    945 
    946 LogicVRegister Simulator::pmull(VectorFormat vform,
    947                                 LogicVRegister dst,
    948                                 const LogicVRegister& src1,
    949                                 const LogicVRegister& src2) {
    950   dst.ClearForWrite(vform);
    951   VectorFormat vform_src = VectorFormatHalfWidth(vform);
    952 
    953   // Process the elements in reverse to avoid problems when the destination
    954   // register is the same as a source.
    955   for (int i = LaneCountFromFormat(vform) - 1; i >= 0; i--) {
    956     dst.SetUint(vform,
    957                 i,
    958                 PolynomialMult128(src1.Uint(vform_src, i),
    959                                   src2.Uint(vform_src, i),
    960                                   LaneSizeInBitsFromFormat(vform_src)));
    961   }
    962 
    963   return dst;
    964 }
    965 
    966 
    967 LogicVRegister Simulator::pmull2(VectorFormat vform,
    968                                  LogicVRegister dst,
    969                                  const LogicVRegister& src1,
    970                                  const LogicVRegister& src2) {
    971   dst.ClearForWrite(vform);
    972   VectorFormat vform_src = VectorFormatHalfWidthDoubleLanes(vform);
    973 
    974   int lane_count = LaneCountFromFormat(vform);
    975   for (int i = 0; i < lane_count; i++) {
    976     dst.SetUint(vform,
    977                 i,
    978                 PolynomialMult128(src1.Uint(vform_src, lane_count + i),
    979                                   src2.Uint(vform_src, lane_count + i),
    980                                   LaneSizeInBitsFromFormat(vform_src)));
    981   }
    982 
    983   return dst;
    984 }
    985 
    986 
    987 LogicVRegister Simulator::sub(VectorFormat vform,
    988                               LogicVRegister dst,
    989                               const LogicVRegister& src1,
    990                               const LogicVRegister& src2) {
    991   int lane_size = LaneSizeInBitsFromFormat(vform);
    992   dst.ClearForWrite(vform);
    993   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
    994     // Test for unsigned saturation.
    995     uint64_t ua = src1.UintLeftJustified(vform, i);
    996     uint64_t ub = src2.UintLeftJustified(vform, i);
    997     uint64_t ur = ua - ub;
    998     if (ub > ua) {
    999       dst.SetUnsignedSat(i, false);
   1000     }
   1001 
   1002     // Test for signed saturation.
   1003     bool pos_a = (ua >> 63) == 0;
   1004     bool pos_b = (ub >> 63) == 0;
   1005     bool pos_r = (ur >> 63) == 0;
   1006     // If the signs of the operands are different, and the sign of the first
   1007     // operand doesn't match the result, there was an overflow.
   1008     if ((pos_a != pos_b) && (pos_a != pos_r)) {
   1009       dst.SetSignedSat(i, pos_a);
   1010     }
   1011 
   1012     dst.SetInt(vform, i, ur >> (64 - lane_size));
   1013   }
   1014   return dst;
   1015 }
   1016 
   1017 LogicVRegister Simulator::sub_uint(VectorFormat vform,
   1018                                    LogicVRegister dst,
   1019                                    const LogicVRegister& src1,
   1020                                    uint64_t value) {
   1021   int lane_size = LaneSizeInBitsFromFormat(vform);
   1022   VIXL_ASSERT(IsUintN(lane_size, value));
   1023   dst.ClearForWrite(vform);
   1024   // Left-justify `value`.
   1025   uint64_t ub = value << (64 - lane_size);
   1026   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   1027     // Test for unsigned saturation.
   1028     uint64_t ua = src1.UintLeftJustified(vform, i);
   1029     uint64_t ur = ua - ub;
   1030     if (ub > ua) {
   1031       dst.SetUnsignedSat(i, false);
   1032     }
   1033 
   1034     // Test for signed saturation.
   1035     // `value` is always positive, so we have an overflow if the (signed) result
   1036     // is greater than the first operand.
   1037     if (RawbitsToInt64(ur) > RawbitsToInt64(ua)) {
   1038       dst.SetSignedSat(i, false);
   1039     }
   1040 
   1041     dst.SetInt(vform, i, ur >> (64 - lane_size));
   1042   }
   1043   return dst;
   1044 }
   1045 
   1046 LogicVRegister Simulator::and_(VectorFormat vform,
   1047                                LogicVRegister dst,
   1048                                const LogicVRegister& src1,
   1049                                const LogicVRegister& src2) {
   1050   dst.ClearForWrite(vform);
   1051   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   1052     dst.SetUint(vform, i, src1.Uint(vform, i) & src2.Uint(vform, i));
   1053   }
   1054   return dst;
   1055 }
   1056 
   1057 
   1058 LogicVRegister Simulator::orr(VectorFormat vform,
   1059                               LogicVRegister dst,
   1060                               const LogicVRegister& src1,
   1061                               const LogicVRegister& src2) {
   1062   dst.ClearForWrite(vform);
   1063   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   1064     dst.SetUint(vform, i, src1.Uint(vform, i) | src2.Uint(vform, i));
   1065   }
   1066   return dst;
   1067 }
   1068 
   1069 
   1070 LogicVRegister Simulator::orn(VectorFormat vform,
   1071                               LogicVRegister dst,
   1072                               const LogicVRegister& src1,
   1073                               const LogicVRegister& src2) {
   1074   dst.ClearForWrite(vform);
   1075   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   1076     dst.SetUint(vform, i, src1.Uint(vform, i) | ~src2.Uint(vform, i));
   1077   }
   1078   return dst;
   1079 }
   1080 
   1081 
   1082 LogicVRegister Simulator::eor(VectorFormat vform,
   1083                               LogicVRegister dst,
   1084                               const LogicVRegister& src1,
   1085                               const LogicVRegister& src2) {
   1086   dst.ClearForWrite(vform);
   1087   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   1088     dst.SetUint(vform, i, src1.Uint(vform, i) ^ src2.Uint(vform, i));
   1089   }
   1090   return dst;
   1091 }
   1092 
   1093 
   1094 LogicVRegister Simulator::bic(VectorFormat vform,
   1095                               LogicVRegister dst,
   1096                               const LogicVRegister& src1,
   1097                               const LogicVRegister& src2) {
   1098   dst.ClearForWrite(vform);
   1099   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   1100     dst.SetUint(vform, i, src1.Uint(vform, i) & ~src2.Uint(vform, i));
   1101   }
   1102   return dst;
   1103 }
   1104 
   1105 
   1106 LogicVRegister Simulator::bic(VectorFormat vform,
   1107                               LogicVRegister dst,
   1108                               const LogicVRegister& src,
   1109                               uint64_t imm) {
   1110   uint64_t result[16];
   1111   int lane_count = LaneCountFromFormat(vform);
   1112   for (int i = 0; i < lane_count; ++i) {
   1113     result[i] = src.Uint(vform, i) & ~imm;
   1114   }
   1115   dst.ClearForWrite(vform);
   1116   for (int i = 0; i < lane_count; ++i) {
   1117     dst.SetUint(vform, i, result[i]);
   1118   }
   1119   return dst;
   1120 }
   1121 
   1122 
   1123 LogicVRegister Simulator::bif(VectorFormat vform,
   1124                               LogicVRegister dst,
   1125                               const LogicVRegister& src1,
   1126                               const LogicVRegister& src2) {
   1127   dst.ClearForWrite(vform);
   1128   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   1129     uint64_t operand1 = dst.Uint(vform, i);
   1130     uint64_t operand2 = ~src2.Uint(vform, i);
   1131     uint64_t operand3 = src1.Uint(vform, i);
   1132     uint64_t result = operand1 ^ ((operand1 ^ operand3) & operand2);
   1133     dst.SetUint(vform, i, result);
   1134   }
   1135   return dst;
   1136 }
   1137 
   1138 
   1139 LogicVRegister Simulator::bit(VectorFormat vform,
   1140                               LogicVRegister dst,
   1141                               const LogicVRegister& src1,
   1142                               const LogicVRegister& src2) {
   1143   dst.ClearForWrite(vform);
   1144   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   1145     uint64_t operand1 = dst.Uint(vform, i);
   1146     uint64_t operand2 = src2.Uint(vform, i);
   1147     uint64_t operand3 = src1.Uint(vform, i);
   1148     uint64_t result = operand1 ^ ((operand1 ^ operand3) & operand2);
   1149     dst.SetUint(vform, i, result);
   1150   }
   1151   return dst;
   1152 }
   1153 
   1154 
   1155 LogicVRegister Simulator::bsl(VectorFormat vform,
   1156                               LogicVRegister dst,
   1157                               const LogicVRegister& src_mask,
   1158                               const LogicVRegister& src1,
   1159                               const LogicVRegister& src2) {
   1160   dst.ClearForWrite(vform);
   1161   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   1162     uint64_t operand1 = src2.Uint(vform, i);
   1163     uint64_t operand2 = src_mask.Uint(vform, i);
   1164     uint64_t operand3 = src1.Uint(vform, i);
   1165     uint64_t result = operand1 ^ ((operand1 ^ operand3) & operand2);
   1166     dst.SetUint(vform, i, result);
   1167   }
   1168   return dst;
   1169 }
   1170 
   1171 
   1172 LogicVRegister Simulator::sminmax(VectorFormat vform,
   1173                                   LogicVRegister dst,
   1174                                   const LogicVRegister& src1,
   1175                                   const LogicVRegister& src2,
   1176                                   bool max) {
   1177   dst.ClearForWrite(vform);
   1178   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   1179     int64_t src1_val = src1.Int(vform, i);
   1180     int64_t src2_val = src2.Int(vform, i);
   1181     int64_t dst_val;
   1182     if (max) {
   1183       dst_val = (src1_val > src2_val) ? src1_val : src2_val;
   1184     } else {
   1185       dst_val = (src1_val < src2_val) ? src1_val : src2_val;
   1186     }
   1187     dst.SetInt(vform, i, dst_val);
   1188   }
   1189   return dst;
   1190 }
   1191 
   1192 
   1193 LogicVRegister Simulator::smax(VectorFormat vform,
   1194                                LogicVRegister dst,
   1195                                const LogicVRegister& src1,
   1196                                const LogicVRegister& src2) {
   1197   return sminmax(vform, dst, src1, src2, true);
   1198 }
   1199 
   1200 
   1201 LogicVRegister Simulator::smin(VectorFormat vform,
   1202                                LogicVRegister dst,
   1203                                const LogicVRegister& src1,
   1204                                const LogicVRegister& src2) {
   1205   return sminmax(vform, dst, src1, src2, false);
   1206 }
   1207 
   1208 
   1209 LogicVRegister Simulator::sminmaxp(VectorFormat vform,
   1210                                    LogicVRegister dst,
   1211                                    const LogicVRegister& src1,
   1212                                    const LogicVRegister& src2,
   1213                                    bool max) {
   1214   unsigned lanes = LaneCountFromFormat(vform);
   1215   int64_t result[kZRegMaxSizeInBytes];
   1216   const LogicVRegister* src = &src1;
   1217   for (unsigned j = 0; j < 2; j++) {
   1218     for (unsigned i = 0; i < lanes; i += 2) {
   1219       int64_t first_val = src->Int(vform, i);
   1220       int64_t second_val = src->Int(vform, i + 1);
   1221       int64_t dst_val;
   1222       if (max) {
   1223         dst_val = (first_val > second_val) ? first_val : second_val;
   1224       } else {
   1225         dst_val = (first_val < second_val) ? first_val : second_val;
   1226       }
   1227       VIXL_ASSERT(((i >> 1) + (j * lanes / 2)) < ArrayLength(result));
   1228       result[(i >> 1) + (j * lanes / 2)] = dst_val;
   1229     }
   1230     src = &src2;
   1231   }
   1232   dst.SetIntArray(vform, result);
   1233   if (IsSVEFormat(vform)) {
   1234     interleave_top_bottom(vform, dst, dst);
   1235   }
   1236   return dst;
   1237 }
   1238 
   1239 
   1240 LogicVRegister Simulator::smaxp(VectorFormat vform,
   1241                                 LogicVRegister dst,
   1242                                 const LogicVRegister& src1,
   1243                                 const LogicVRegister& src2) {
   1244   return sminmaxp(vform, dst, src1, src2, true);
   1245 }
   1246 
   1247 
   1248 LogicVRegister Simulator::sminp(VectorFormat vform,
   1249                                 LogicVRegister dst,
   1250                                 const LogicVRegister& src1,
   1251                                 const LogicVRegister& src2) {
   1252   return sminmaxp(vform, dst, src1, src2, false);
   1253 }
   1254 
   1255 
   1256 LogicVRegister Simulator::addp(VectorFormat vform,
   1257                                LogicVRegister dst,
   1258                                const LogicVRegister& src) {
   1259   VIXL_ASSERT(vform == kFormatD);
   1260 
   1261   uint64_t dst_val = src.Uint(kFormat2D, 0) + src.Uint(kFormat2D, 1);
   1262   dst.ClearForWrite(vform);
   1263   dst.SetUint(vform, 0, dst_val);
   1264   return dst;
   1265 }
   1266 
   1267 
   1268 LogicVRegister Simulator::addv(VectorFormat vform,
   1269                                LogicVRegister dst,
   1270                                const LogicVRegister& src) {
   1271   VectorFormat vform_dst =
   1272       ScalarFormatFromLaneSize(LaneSizeInBitsFromFormat(vform));
   1273 
   1274 
   1275   int64_t dst_val = 0;
   1276   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   1277     dst_val += src.Int(vform, i);
   1278   }
   1279 
   1280   dst.ClearForWrite(vform_dst);
   1281   dst.SetInt(vform_dst, 0, dst_val);
   1282   return dst;
   1283 }
   1284 
   1285 
   1286 LogicVRegister Simulator::saddlv(VectorFormat vform,
   1287                                  LogicVRegister dst,
   1288                                  const LogicVRegister& src) {
   1289   VectorFormat vform_dst =
   1290       ScalarFormatFromLaneSize(LaneSizeInBitsFromFormat(vform) * 2);
   1291 
   1292   int64_t dst_val = 0;
   1293   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   1294     dst_val += src.Int(vform, i);
   1295   }
   1296 
   1297   dst.ClearForWrite(vform_dst);
   1298   dst.SetInt(vform_dst, 0, dst_val);
   1299   return dst;
   1300 }
   1301 
   1302 
   1303 LogicVRegister Simulator::uaddlv(VectorFormat vform,
   1304                                  LogicVRegister dst,
   1305                                  const LogicVRegister& src) {
   1306   VectorFormat vform_dst =
   1307       ScalarFormatFromLaneSize(LaneSizeInBitsFromFormat(vform) * 2);
   1308 
   1309   uint64_t dst_val = 0;
   1310   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   1311     dst_val += src.Uint(vform, i);
   1312   }
   1313 
   1314   dst.ClearForWrite(vform_dst);
   1315   dst.SetUint(vform_dst, 0, dst_val);
   1316   return dst;
   1317 }
   1318 
   1319 
   1320 LogicVRegister Simulator::sminmaxv(VectorFormat vform,
   1321                                    LogicVRegister dst,
   1322                                    const LogicPRegister& pg,
   1323                                    const LogicVRegister& src,
   1324                                    bool max) {
   1325   int64_t dst_val = max ? INT64_MIN : INT64_MAX;
   1326   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   1327     if (!pg.IsActive(vform, i)) continue;
   1328 
   1329     int64_t src_val = src.Int(vform, i);
   1330     if (max) {
   1331       dst_val = (src_val > dst_val) ? src_val : dst_val;
   1332     } else {
   1333       dst_val = (src_val < dst_val) ? src_val : dst_val;
   1334     }
   1335   }
   1336   dst.ClearForWrite(ScalarFormatFromFormat(vform));
   1337   dst.SetInt(vform, 0, dst_val);
   1338   return dst;
   1339 }
   1340 
   1341 
   1342 LogicVRegister Simulator::smaxv(VectorFormat vform,
   1343                                 LogicVRegister dst,
   1344                                 const LogicVRegister& src) {
   1345   sminmaxv(vform, dst, GetPTrue(), src, true);
   1346   return dst;
   1347 }
   1348 
   1349 
   1350 LogicVRegister Simulator::sminv(VectorFormat vform,
   1351                                 LogicVRegister dst,
   1352                                 const LogicVRegister& src) {
   1353   sminmaxv(vform, dst, GetPTrue(), src, false);
   1354   return dst;
   1355 }
   1356 
   1357 
   1358 LogicVRegister Simulator::smaxv(VectorFormat vform,
   1359                                 LogicVRegister dst,
   1360                                 const LogicPRegister& pg,
   1361                                 const LogicVRegister& src) {
   1362   VIXL_ASSERT(IsSVEFormat(vform));
   1363   sminmaxv(vform, dst, pg, src, true);
   1364   return dst;
   1365 }
   1366 
   1367 
   1368 LogicVRegister Simulator::sminv(VectorFormat vform,
   1369                                 LogicVRegister dst,
   1370                                 const LogicPRegister& pg,
   1371                                 const LogicVRegister& src) {
   1372   VIXL_ASSERT(IsSVEFormat(vform));
   1373   sminmaxv(vform, dst, pg, src, false);
   1374   return dst;
   1375 }
   1376 
   1377 
   1378 LogicVRegister Simulator::uminmax(VectorFormat vform,
   1379                                   LogicVRegister dst,
   1380                                   const LogicVRegister& src1,
   1381                                   const LogicVRegister& src2,
   1382                                   bool max) {
   1383   dst.ClearForWrite(vform);
   1384   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   1385     uint64_t src1_val = src1.Uint(vform, i);
   1386     uint64_t src2_val = src2.Uint(vform, i);
   1387     uint64_t dst_val;
   1388     if (max) {
   1389       dst_val = (src1_val > src2_val) ? src1_val : src2_val;
   1390     } else {
   1391       dst_val = (src1_val < src2_val) ? src1_val : src2_val;
   1392     }
   1393     dst.SetUint(vform, i, dst_val);
   1394   }
   1395   return dst;
   1396 }
   1397 
   1398 
   1399 LogicVRegister Simulator::umax(VectorFormat vform,
   1400                                LogicVRegister dst,
   1401                                const LogicVRegister& src1,
   1402                                const LogicVRegister& src2) {
   1403   return uminmax(vform, dst, src1, src2, true);
   1404 }
   1405 
   1406 
   1407 LogicVRegister Simulator::umin(VectorFormat vform,
   1408                                LogicVRegister dst,
   1409                                const LogicVRegister& src1,
   1410                                const LogicVRegister& src2) {
   1411   return uminmax(vform, dst, src1, src2, false);
   1412 }
   1413 
   1414 
   1415 LogicVRegister Simulator::uminmaxp(VectorFormat vform,
   1416                                    LogicVRegister dst,
   1417                                    const LogicVRegister& src1,
   1418                                    const LogicVRegister& src2,
   1419                                    bool max) {
   1420   unsigned lanes = LaneCountFromFormat(vform);
   1421   uint64_t result[kZRegMaxSizeInBytes];
   1422   const LogicVRegister* src = &src1;
   1423   for (unsigned j = 0; j < 2; j++) {
   1424     for (unsigned i = 0; i < lanes; i += 2) {
   1425       uint64_t first_val = src->Uint(vform, i);
   1426       uint64_t second_val = src->Uint(vform, i + 1);
   1427       uint64_t dst_val;
   1428       if (max) {
   1429         dst_val = (first_val > second_val) ? first_val : second_val;
   1430       } else {
   1431         dst_val = (first_val < second_val) ? first_val : second_val;
   1432       }
   1433       VIXL_ASSERT(((i >> 1) + (j * lanes / 2)) < ArrayLength(result));
   1434       result[(i >> 1) + (j * lanes / 2)] = dst_val;
   1435     }
   1436     src = &src2;
   1437   }
   1438   dst.SetUintArray(vform, result);
   1439   if (IsSVEFormat(vform)) {
   1440     interleave_top_bottom(vform, dst, dst);
   1441   }
   1442   return dst;
   1443 }
   1444 
   1445 
   1446 LogicVRegister Simulator::umaxp(VectorFormat vform,
   1447                                 LogicVRegister dst,
   1448                                 const LogicVRegister& src1,
   1449                                 const LogicVRegister& src2) {
   1450   return uminmaxp(vform, dst, src1, src2, true);
   1451 }
   1452 
   1453 
   1454 LogicVRegister Simulator::uminp(VectorFormat vform,
   1455                                 LogicVRegister dst,
   1456                                 const LogicVRegister& src1,
   1457                                 const LogicVRegister& src2) {
   1458   return uminmaxp(vform, dst, src1, src2, false);
   1459 }
   1460 
   1461 
   1462 LogicVRegister Simulator::uminmaxv(VectorFormat vform,
   1463                                    LogicVRegister dst,
   1464                                    const LogicPRegister& pg,
   1465                                    const LogicVRegister& src,
   1466                                    bool max) {
   1467   uint64_t dst_val = max ? 0 : UINT64_MAX;
   1468   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   1469     if (!pg.IsActive(vform, i)) continue;
   1470 
   1471     uint64_t src_val = src.Uint(vform, i);
   1472     if (max) {
   1473       dst_val = (src_val > dst_val) ? src_val : dst_val;
   1474     } else {
   1475       dst_val = (src_val < dst_val) ? src_val : dst_val;
   1476     }
   1477   }
   1478   dst.ClearForWrite(ScalarFormatFromFormat(vform));
   1479   dst.SetUint(vform, 0, dst_val);
   1480   return dst;
   1481 }
   1482 
   1483 
   1484 LogicVRegister Simulator::umaxv(VectorFormat vform,
   1485                                 LogicVRegister dst,
   1486                                 const LogicVRegister& src) {
   1487   uminmaxv(vform, dst, GetPTrue(), src, true);
   1488   return dst;
   1489 }
   1490 
   1491 
   1492 LogicVRegister Simulator::uminv(VectorFormat vform,
   1493                                 LogicVRegister dst,
   1494                                 const LogicVRegister& src) {
   1495   uminmaxv(vform, dst, GetPTrue(), src, false);
   1496   return dst;
   1497 }
   1498 
   1499 
   1500 LogicVRegister Simulator::umaxv(VectorFormat vform,
   1501                                 LogicVRegister dst,
   1502                                 const LogicPRegister& pg,
   1503                                 const LogicVRegister& src) {
   1504   VIXL_ASSERT(IsSVEFormat(vform));
   1505   uminmaxv(vform, dst, pg, src, true);
   1506   return dst;
   1507 }
   1508 
   1509 
   1510 LogicVRegister Simulator::uminv(VectorFormat vform,
   1511                                 LogicVRegister dst,
   1512                                 const LogicPRegister& pg,
   1513                                 const LogicVRegister& src) {
   1514   VIXL_ASSERT(IsSVEFormat(vform));
   1515   uminmaxv(vform, dst, pg, src, false);
   1516   return dst;
   1517 }
   1518 
   1519 
   1520 LogicVRegister Simulator::shl(VectorFormat vform,
   1521                               LogicVRegister dst,
   1522                               const LogicVRegister& src,
   1523                               int shift) {
   1524   VIXL_ASSERT(shift >= 0);
   1525   SimVRegister temp;
   1526   LogicVRegister shiftreg = dup_immediate(vform, temp, shift);
   1527   return ushl(vform, dst, src, shiftreg);
   1528 }
   1529 
   1530 
   1531 LogicVRegister Simulator::sshll(VectorFormat vform,
   1532                                 LogicVRegister dst,
   1533                                 const LogicVRegister& src,
   1534                                 int shift) {
   1535   VIXL_ASSERT(shift >= 0);
   1536   SimVRegister temp1, temp2;
   1537   LogicVRegister shiftreg = dup_immediate(vform, temp1, shift);
   1538   LogicVRegister extendedreg = sxtl(vform, temp2, src);
   1539   return sshl(vform, dst, extendedreg, shiftreg);
   1540 }
   1541 
   1542 
   1543 LogicVRegister Simulator::sshll2(VectorFormat vform,
   1544                                  LogicVRegister dst,
   1545                                  const LogicVRegister& src,
   1546                                  int shift) {
   1547   VIXL_ASSERT(shift >= 0);
   1548   SimVRegister temp1, temp2;
   1549   LogicVRegister shiftreg = dup_immediate(vform, temp1, shift);
   1550   LogicVRegister extendedreg = sxtl2(vform, temp2, src);
   1551   return sshl(vform, dst, extendedreg, shiftreg);
   1552 }
   1553 
   1554 
   1555 LogicVRegister Simulator::shll(VectorFormat vform,
   1556                                LogicVRegister dst,
   1557                                const LogicVRegister& src) {
   1558   int shift = LaneSizeInBitsFromFormat(vform) / 2;
   1559   return sshll(vform, dst, src, shift);
   1560 }
   1561 
   1562 
   1563 LogicVRegister Simulator::shll2(VectorFormat vform,
   1564                                 LogicVRegister dst,
   1565                                 const LogicVRegister& src) {
   1566   int shift = LaneSizeInBitsFromFormat(vform) / 2;
   1567   return sshll2(vform, dst, src, shift);
   1568 }
   1569 
   1570 
   1571 LogicVRegister Simulator::ushll(VectorFormat vform,
   1572                                 LogicVRegister dst,
   1573                                 const LogicVRegister& src,
   1574                                 int shift) {
   1575   VIXL_ASSERT(shift >= 0);
   1576   SimVRegister temp1, temp2;
   1577   LogicVRegister shiftreg = dup_immediate(vform, temp1, shift);
   1578   LogicVRegister extendedreg = uxtl(vform, temp2, src);
   1579   return ushl(vform, dst, extendedreg, shiftreg);
   1580 }
   1581 
   1582 
   1583 LogicVRegister Simulator::ushll2(VectorFormat vform,
   1584                                  LogicVRegister dst,
   1585                                  const LogicVRegister& src,
   1586                                  int shift) {
   1587   VIXL_ASSERT(shift >= 0);
   1588   SimVRegister temp1, temp2;
   1589   LogicVRegister shiftreg = dup_immediate(vform, temp1, shift);
   1590   LogicVRegister extendedreg = uxtl2(vform, temp2, src);
   1591   return ushl(vform, dst, extendedreg, shiftreg);
   1592 }
   1593 
   1594 std::pair<bool, uint64_t> Simulator::clast(VectorFormat vform,
   1595                                            const LogicPRegister& pg,
   1596                                            const LogicVRegister& src,
   1597                                            int offset_from_last_active) {
   1598   // Untested for any other values.
   1599   VIXL_ASSERT((offset_from_last_active == 0) || (offset_from_last_active == 1));
   1600 
   1601   int last_active = GetLastActive(vform, pg);
   1602   int lane_count = LaneCountFromFormat(vform);
   1603   int index =
   1604       ((last_active + offset_from_last_active) + lane_count) % lane_count;
   1605   return std::make_pair(last_active >= 0, src.Uint(vform, index));
   1606 }
   1607 
   1608 LogicVRegister Simulator::compact(VectorFormat vform,
   1609                                   LogicVRegister dst,
   1610                                   const LogicPRegister& pg,
   1611                                   const LogicVRegister& src) {
   1612   int j = 0;
   1613   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   1614     if (pg.IsActive(vform, i)) {
   1615       dst.SetUint(vform, j++, src.Uint(vform, i));
   1616     }
   1617   }
   1618   for (; j < LaneCountFromFormat(vform); j++) {
   1619     dst.SetUint(vform, j, 0);
   1620   }
   1621   return dst;
   1622 }
   1623 
   1624 LogicVRegister Simulator::splice(VectorFormat vform,
   1625                                  LogicVRegister dst,
   1626                                  const LogicPRegister& pg,
   1627                                  const LogicVRegister& src1,
   1628                                  const LogicVRegister& src2) {
   1629   int lane_count = LaneCountFromFormat(vform);
   1630   int first_active = GetFirstActive(vform, pg);
   1631   int last_active = GetLastActive(vform, pg);
   1632   int dst_idx = 0;
   1633   uint64_t result[kZRegMaxSizeInBytes];
   1634 
   1635   if (first_active >= 0) {
   1636     VIXL_ASSERT(last_active >= first_active);
   1637     VIXL_ASSERT(last_active < lane_count);
   1638     for (int i = first_active; i <= last_active; i++) {
   1639       result[dst_idx++] = src1.Uint(vform, i);
   1640     }
   1641   }
   1642 
   1643   VIXL_ASSERT(dst_idx <= lane_count);
   1644   for (int i = dst_idx; i < lane_count; i++) {
   1645     result[i] = src2.Uint(vform, i - dst_idx);
   1646   }
   1647 
   1648   dst.SetUintArray(vform, result);
   1649 
   1650   return dst;
   1651 }
   1652 
   1653 LogicVRegister Simulator::sel(VectorFormat vform,
   1654                               LogicVRegister dst,
   1655                               const SimPRegister& pg,
   1656                               const LogicVRegister& src1,
   1657                               const LogicVRegister& src2) {
   1658   int p_reg_bits_per_lane =
   1659       LaneSizeInBitsFromFormat(vform) / kZRegBitsPerPRegBit;
   1660   for (int lane = 0; lane < LaneCountFromFormat(vform); lane++) {
   1661     uint64_t lane_value = pg.GetBit(lane * p_reg_bits_per_lane)
   1662                               ? src1.Uint(vform, lane)
   1663                               : src2.Uint(vform, lane);
   1664     dst.SetUint(vform, lane, lane_value);
   1665   }
   1666   return dst;
   1667 }
   1668 
   1669 
   1670 LogicPRegister Simulator::sel(LogicPRegister dst,
   1671                               const LogicPRegister& pg,
   1672                               const LogicPRegister& src1,
   1673                               const LogicPRegister& src2) {
   1674   for (int i = 0; i < dst.GetChunkCount(); i++) {
   1675     LogicPRegister::ChunkType mask = pg.GetChunk(i);
   1676     LogicPRegister::ChunkType result =
   1677         (mask & src1.GetChunk(i)) | (~mask & src2.GetChunk(i));
   1678     dst.SetChunk(i, result);
   1679   }
   1680   return dst;
   1681 }
   1682 
   1683 
   1684 LogicVRegister Simulator::sli(VectorFormat vform,
   1685                               LogicVRegister dst,
   1686                               const LogicVRegister& src,
   1687                               int shift) {
   1688   dst.ClearForWrite(vform);
   1689   int lane_count = LaneCountFromFormat(vform);
   1690   for (int i = 0; i < lane_count; i++) {
   1691     uint64_t src_lane = src.Uint(vform, i);
   1692     uint64_t dst_lane = dst.Uint(vform, i);
   1693     uint64_t shifted = src_lane << shift;
   1694     uint64_t mask = MaxUintFromFormat(vform) << shift;
   1695     dst.SetUint(vform, i, (dst_lane & ~mask) | shifted);
   1696   }
   1697   return dst;
   1698 }
   1699 
   1700 
   1701 LogicVRegister Simulator::sqshl(VectorFormat vform,
   1702                                 LogicVRegister dst,
   1703                                 const LogicVRegister& src,
   1704                                 int shift) {
   1705   VIXL_ASSERT(shift >= 0);
   1706   SimVRegister temp;
   1707   LogicVRegister shiftreg = dup_immediate(vform, temp, shift);
   1708   return sshl(vform, dst, src, shiftreg).SignedSaturate(vform);
   1709 }
   1710 
   1711 
   1712 LogicVRegister Simulator::uqshl(VectorFormat vform,
   1713                                 LogicVRegister dst,
   1714                                 const LogicVRegister& src,
   1715                                 int shift) {
   1716   VIXL_ASSERT(shift >= 0);
   1717   SimVRegister temp;
   1718   LogicVRegister shiftreg = dup_immediate(vform, temp, shift);
   1719   return ushl(vform, dst, src, shiftreg).UnsignedSaturate(vform);
   1720 }
   1721 
   1722 
   1723 LogicVRegister Simulator::sqshlu(VectorFormat vform,
   1724                                  LogicVRegister dst,
   1725                                  const LogicVRegister& src,
   1726                                  int shift) {
   1727   VIXL_ASSERT(shift >= 0);
   1728   SimVRegister temp;
   1729   LogicVRegister shiftreg = dup_immediate(vform, temp, shift);
   1730   return sshl(vform, dst, src, shiftreg).UnsignedSaturate(vform);
   1731 }
   1732 
   1733 
   1734 LogicVRegister Simulator::sri(VectorFormat vform,
   1735                               LogicVRegister dst,
   1736                               const LogicVRegister& src,
   1737                               int shift) {
   1738   dst.ClearForWrite(vform);
   1739   int lane_count = LaneCountFromFormat(vform);
   1740   VIXL_ASSERT((shift > 0) &&
   1741               (shift <= static_cast<int>(LaneSizeInBitsFromFormat(vform))));
   1742   for (int i = 0; i < lane_count; i++) {
   1743     uint64_t src_lane = src.Uint(vform, i);
   1744     uint64_t dst_lane = dst.Uint(vform, i);
   1745     uint64_t shifted;
   1746     uint64_t mask;
   1747     if (shift == 64) {
   1748       shifted = 0;
   1749       mask = 0;
   1750     } else {
   1751       shifted = src_lane >> shift;
   1752       mask = MaxUintFromFormat(vform) >> shift;
   1753     }
   1754     dst.SetUint(vform, i, (dst_lane & ~mask) | shifted);
   1755   }
   1756   return dst;
   1757 }
   1758 
   1759 
   1760 LogicVRegister Simulator::ushr(VectorFormat vform,
   1761                                LogicVRegister dst,
   1762                                const LogicVRegister& src,
   1763                                int shift) {
   1764   VIXL_ASSERT(shift >= 0);
   1765   SimVRegister temp;
   1766   LogicVRegister shiftreg = dup_immediate(vform, temp, -shift);
   1767   return ushl(vform, dst, src, shiftreg);
   1768 }
   1769 
   1770 
   1771 LogicVRegister Simulator::sshr(VectorFormat vform,
   1772                                LogicVRegister dst,
   1773                                const LogicVRegister& src,
   1774                                int shift) {
   1775   VIXL_ASSERT(shift >= 0);
   1776   SimVRegister temp;
   1777   LogicVRegister shiftreg = dup_immediate(vform, temp, -shift);
   1778   return sshl(vform, dst, src, shiftreg);
   1779 }
   1780 
   1781 
   1782 LogicVRegister Simulator::ssra(VectorFormat vform,
   1783                                LogicVRegister dst,
   1784                                const LogicVRegister& src,
   1785                                int shift) {
   1786   SimVRegister temp;
   1787   LogicVRegister shifted_reg = sshr(vform, temp, src, shift);
   1788   return add(vform, dst, dst, shifted_reg);
   1789 }
   1790 
   1791 
   1792 LogicVRegister Simulator::usra(VectorFormat vform,
   1793                                LogicVRegister dst,
   1794                                const LogicVRegister& src,
   1795                                int shift) {
   1796   SimVRegister temp;
   1797   LogicVRegister shifted_reg = ushr(vform, temp, src, shift);
   1798   return add(vform, dst, dst, shifted_reg);
   1799 }
   1800 
   1801 
   1802 LogicVRegister Simulator::srsra(VectorFormat vform,
   1803                                 LogicVRegister dst,
   1804                                 const LogicVRegister& src,
   1805                                 int shift) {
   1806   SimVRegister temp;
   1807   LogicVRegister shifted_reg = sshr(vform, temp, src, shift).Round(vform);
   1808   return add(vform, dst, dst, shifted_reg);
   1809 }
   1810 
   1811 
   1812 LogicVRegister Simulator::ursra(VectorFormat vform,
   1813                                 LogicVRegister dst,
   1814                                 const LogicVRegister& src,
   1815                                 int shift) {
   1816   SimVRegister temp;
   1817   LogicVRegister shifted_reg = ushr(vform, temp, src, shift).Round(vform);
   1818   return add(vform, dst, dst, shifted_reg);
   1819 }
   1820 
   1821 
   1822 LogicVRegister Simulator::cls(VectorFormat vform,
   1823                               LogicVRegister dst,
   1824                               const LogicVRegister& src) {
   1825   int lane_size_in_bits = LaneSizeInBitsFromFormat(vform);
   1826   int lane_count = LaneCountFromFormat(vform);
   1827 
   1828   // Ensure that we can store one result per lane.
   1829   int result[kZRegMaxSizeInBytes];
   1830 
   1831   for (int i = 0; i < lane_count; i++) {
   1832     result[i] = CountLeadingSignBits(src.Int(vform, i), lane_size_in_bits);
   1833   }
   1834 
   1835   dst.ClearForWrite(vform);
   1836   for (int i = 0; i < lane_count; ++i) {
   1837     dst.SetUint(vform, i, result[i]);
   1838   }
   1839   return dst;
   1840 }
   1841 
   1842 
   1843 LogicVRegister Simulator::clz(VectorFormat vform,
   1844                               LogicVRegister dst,
   1845                               const LogicVRegister& src) {
   1846   int lane_size_in_bits = LaneSizeInBitsFromFormat(vform);
   1847   int lane_count = LaneCountFromFormat(vform);
   1848 
   1849   // Ensure that we can store one result per lane.
   1850   int result[kZRegMaxSizeInBytes];
   1851 
   1852   for (int i = 0; i < lane_count; i++) {
   1853     result[i] = CountLeadingZeros(src.Uint(vform, i), lane_size_in_bits);
   1854   }
   1855 
   1856   dst.ClearForWrite(vform);
   1857   for (int i = 0; i < lane_count; ++i) {
   1858     dst.SetUint(vform, i, result[i]);
   1859   }
   1860   return dst;
   1861 }
   1862 
   1863 
   1864 LogicVRegister Simulator::cnot(VectorFormat vform,
   1865                                LogicVRegister dst,
   1866                                const LogicVRegister& src) {
   1867   dst.ClearForWrite(vform);
   1868   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   1869     uint64_t value = (src.Uint(vform, i) == 0) ? 1 : 0;
   1870     dst.SetUint(vform, i, value);
   1871   }
   1872   return dst;
   1873 }
   1874 
   1875 
   1876 LogicVRegister Simulator::cnt(VectorFormat vform,
   1877                               LogicVRegister dst,
   1878                               const LogicVRegister& src) {
   1879   int lane_size_in_bits = LaneSizeInBitsFromFormat(vform);
   1880   int lane_count = LaneCountFromFormat(vform);
   1881 
   1882   // Ensure that we can store one result per lane.
   1883   int result[kZRegMaxSizeInBytes];
   1884 
   1885   for (int i = 0; i < lane_count; i++) {
   1886     result[i] = CountSetBits(src.Uint(vform, i), lane_size_in_bits);
   1887   }
   1888 
   1889   dst.ClearForWrite(vform);
   1890   for (int i = 0; i < lane_count; ++i) {
   1891     dst.SetUint(vform, i, result[i]);
   1892   }
   1893   return dst;
   1894 }
   1895 
   1896 static int64_t CalculateSignedShiftDistance(int64_t shift_val,
   1897                                             int esize,
   1898                                             bool shift_in_ls_byte) {
   1899   if (shift_in_ls_byte) {
   1900     // Neon uses the least-significant byte of the lane as the shift distance.
   1901     shift_val = ExtractSignedBitfield64(7, 0, shift_val);
   1902   } else {
   1903     // SVE uses a saturated shift distance in the range
   1904     //  -(esize + 1) ... (esize + 1).
   1905     if (shift_val > (esize + 1)) shift_val = esize + 1;
   1906     if (shift_val < -(esize + 1)) shift_val = -(esize + 1);
   1907   }
   1908   return shift_val;
   1909 }
   1910 
   1911 LogicVRegister Simulator::sshl(VectorFormat vform,
   1912                                LogicVRegister dst,
   1913                                const LogicVRegister& src1,
   1914                                const LogicVRegister& src2,
   1915                                bool shift_in_ls_byte) {
   1916   dst.ClearForWrite(vform);
   1917   int esize = LaneSizeInBitsFromFormat(vform);
   1918   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   1919     int64_t shift_val = CalculateSignedShiftDistance(src2.Int(vform, i),
   1920                                                      esize,
   1921                                                      shift_in_ls_byte);
   1922 
   1923     int64_t lj_src_val = src1.IntLeftJustified(vform, i);
   1924 
   1925     // Set signed saturation state.
   1926     if ((shift_val > CountLeadingSignBits(lj_src_val)) && (lj_src_val != 0)) {
   1927       dst.SetSignedSat(i, lj_src_val >= 0);
   1928     }
   1929 
   1930     // Set unsigned saturation state.
   1931     if (lj_src_val < 0) {
   1932       dst.SetUnsignedSat(i, false);
   1933     } else if ((shift_val > CountLeadingZeros(lj_src_val)) &&
   1934                (lj_src_val != 0)) {
   1935       dst.SetUnsignedSat(i, true);
   1936     }
   1937 
   1938     int64_t src_val = src1.Int(vform, i);
   1939     bool src_is_negative = src_val < 0;
   1940     if (shift_val > 63) {
   1941       dst.SetInt(vform, i, 0);
   1942     } else if (shift_val < -63) {
   1943       dst.SetRounding(i, src_is_negative);
   1944       dst.SetInt(vform, i, src_is_negative ? -1 : 0);
   1945     } else {
   1946       // Use unsigned types for shifts, as behaviour is undefined for signed
   1947       // lhs.
   1948       uint64_t usrc_val = static_cast<uint64_t>(src_val);
   1949 
   1950       if (shift_val < 0) {
   1951         // Convert to right shift.
   1952         shift_val = -shift_val;
   1953 
   1954         // Set rounding state by testing most-significant bit shifted out.
   1955         // Rounding only needed on right shifts.
   1956         if (((usrc_val >> (shift_val - 1)) & 1) == 1) {
   1957           dst.SetRounding(i, true);
   1958         }
   1959 
   1960         usrc_val >>= shift_val;
   1961 
   1962         if (src_is_negative) {
   1963           // Simulate sign-extension.
   1964           usrc_val |= (~UINT64_C(0) << (64 - shift_val));
   1965         }
   1966       } else {
   1967         usrc_val <<= shift_val;
   1968       }
   1969       dst.SetUint(vform, i, usrc_val);
   1970     }
   1971   }
   1972   return dst;
   1973 }
   1974 
   1975 
   1976 LogicVRegister Simulator::ushl(VectorFormat vform,
   1977                                LogicVRegister dst,
   1978                                const LogicVRegister& src1,
   1979                                const LogicVRegister& src2,
   1980                                bool shift_in_ls_byte) {
   1981   dst.ClearForWrite(vform);
   1982   int esize = LaneSizeInBitsFromFormat(vform);
   1983   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   1984     int64_t shift_val = CalculateSignedShiftDistance(src2.Int(vform, i),
   1985                                                      esize,
   1986                                                      shift_in_ls_byte);
   1987 
   1988     uint64_t lj_src_val = src1.UintLeftJustified(vform, i);
   1989 
   1990     // Set saturation state.
   1991     if ((shift_val > CountLeadingZeros(lj_src_val)) && (lj_src_val != 0)) {
   1992       dst.SetUnsignedSat(i, true);
   1993     }
   1994 
   1995     uint64_t src_val = src1.Uint(vform, i);
   1996     if ((shift_val > 63) || (shift_val < -64)) {
   1997       dst.SetUint(vform, i, 0);
   1998     } else {
   1999       if (shift_val < 0) {
   2000         // Set rounding state. Rounding only needed on right shifts.
   2001         if (((src_val >> (-shift_val - 1)) & 1) == 1) {
   2002           dst.SetRounding(i, true);
   2003         }
   2004 
   2005         if (shift_val == -64) {
   2006           src_val = 0;
   2007         } else {
   2008           src_val >>= -shift_val;
   2009         }
   2010       } else {
   2011         src_val <<= shift_val;
   2012       }
   2013       dst.SetUint(vform, i, src_val);
   2014     }
   2015   }
   2016   return dst;
   2017 }
   2018 
   2019 LogicVRegister Simulator::sshr(VectorFormat vform,
   2020                                LogicVRegister dst,
   2021                                const LogicVRegister& src1,
   2022                                const LogicVRegister& src2) {
   2023   SimVRegister temp;
   2024   // Saturate to sidestep the min-int problem.
   2025   neg(vform, temp, src2).SignedSaturate(vform);
   2026   sshl(vform, dst, src1, temp, false);
   2027   return dst;
   2028 }
   2029 
   2030 LogicVRegister Simulator::ushr(VectorFormat vform,
   2031                                LogicVRegister dst,
   2032                                const LogicVRegister& src1,
   2033                                const LogicVRegister& src2) {
   2034   SimVRegister temp;
   2035   // Saturate to sidestep the min-int problem.
   2036   neg(vform, temp, src2).SignedSaturate(vform);
   2037   ushl(vform, dst, src1, temp, false);
   2038   return dst;
   2039 }
   2040 
   2041 LogicVRegister Simulator::neg(VectorFormat vform,
   2042                               LogicVRegister dst,
   2043                               const LogicVRegister& src) {
   2044   dst.ClearForWrite(vform);
   2045   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   2046     // Test for signed saturation.
   2047     int64_t sa = src.Int(vform, i);
   2048     if (sa == MinIntFromFormat(vform)) {
   2049       dst.SetSignedSat(i, true);
   2050     }
   2051     dst.SetInt(vform, i, (sa == INT64_MIN) ? sa : -sa);
   2052   }
   2053   return dst;
   2054 }
   2055 
   2056 
   2057 LogicVRegister Simulator::suqadd(VectorFormat vform,
   2058                                  LogicVRegister dst,
   2059                                  const LogicVRegister& src1,
   2060                                  const LogicVRegister& src2) {
   2061   dst.ClearForWrite(vform);
   2062   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   2063     int64_t sa = src1.IntLeftJustified(vform, i);
   2064     uint64_t ub = src2.UintLeftJustified(vform, i);
   2065     uint64_t ur = sa + ub;
   2066 
   2067     int64_t sr;
   2068     memcpy(&sr, &ur, sizeof(sr));
   2069     if (sr < sa) {  // Test for signed positive saturation.
   2070       dst.SetInt(vform, i, MaxIntFromFormat(vform));
   2071     } else {
   2072       dst.SetUint(vform, i, src1.Int(vform, i) + src2.Uint(vform, i));
   2073     }
   2074   }
   2075   return dst;
   2076 }
   2077 
   2078 
   2079 LogicVRegister Simulator::usqadd(VectorFormat vform,
   2080                                  LogicVRegister dst,
   2081                                  const LogicVRegister& src1,
   2082                                  const LogicVRegister& src2) {
   2083   dst.ClearForWrite(vform);
   2084   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   2085     uint64_t ua = src1.UintLeftJustified(vform, i);
   2086     int64_t sb = src2.IntLeftJustified(vform, i);
   2087     uint64_t ur = ua + sb;
   2088 
   2089     if ((sb > 0) && (ur <= ua)) {
   2090       dst.SetUint(vform, i, MaxUintFromFormat(vform));  // Positive saturation.
   2091     } else if ((sb < 0) && (ur >= ua)) {
   2092       dst.SetUint(vform, i, 0);  // Negative saturation.
   2093     } else {
   2094       dst.SetUint(vform, i, src1.Uint(vform, i) + src2.Int(vform, i));
   2095     }
   2096   }
   2097   return dst;
   2098 }
   2099 
   2100 
   2101 LogicVRegister Simulator::abs(VectorFormat vform,
   2102                               LogicVRegister dst,
   2103                               const LogicVRegister& src) {
   2104   dst.ClearForWrite(vform);
   2105   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   2106     // Test for signed saturation.
   2107     int64_t sa = src.Int(vform, i);
   2108     if (sa == MinIntFromFormat(vform)) {
   2109       dst.SetSignedSat(i, true);
   2110     }
   2111     if (sa < 0) {
   2112       dst.SetInt(vform, i, (sa == INT64_MIN) ? sa : -sa);
   2113     } else {
   2114       dst.SetInt(vform, i, sa);
   2115     }
   2116   }
   2117   return dst;
   2118 }
   2119 
   2120 
   2121 LogicVRegister Simulator::andv(VectorFormat vform,
   2122                                LogicVRegister dst,
   2123                                const LogicPRegister& pg,
   2124                                const LogicVRegister& src) {
   2125   VIXL_ASSERT(IsSVEFormat(vform));
   2126   uint64_t result = GetUintMask(LaneSizeInBitsFromFormat(vform));
   2127   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   2128     if (!pg.IsActive(vform, i)) continue;
   2129 
   2130     result &= src.Uint(vform, i);
   2131   }
   2132   VectorFormat vform_dst =
   2133       ScalarFormatFromLaneSize(LaneSizeInBitsFromFormat(vform));
   2134   dst.ClearForWrite(vform_dst);
   2135   dst.SetUint(vform_dst, 0, result);
   2136   return dst;
   2137 }
   2138 
   2139 
   2140 LogicVRegister Simulator::eorv(VectorFormat vform,
   2141                                LogicVRegister dst,
   2142                                const LogicPRegister& pg,
   2143                                const LogicVRegister& src) {
   2144   VIXL_ASSERT(IsSVEFormat(vform));
   2145   uint64_t result = 0;
   2146   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   2147     if (!pg.IsActive(vform, i)) continue;
   2148 
   2149     result ^= src.Uint(vform, i);
   2150   }
   2151   VectorFormat vform_dst =
   2152       ScalarFormatFromLaneSize(LaneSizeInBitsFromFormat(vform));
   2153   dst.ClearForWrite(vform_dst);
   2154   dst.SetUint(vform_dst, 0, result);
   2155   return dst;
   2156 }
   2157 
   2158 
   2159 LogicVRegister Simulator::orv(VectorFormat vform,
   2160                               LogicVRegister dst,
   2161                               const LogicPRegister& pg,
   2162                               const LogicVRegister& src) {
   2163   VIXL_ASSERT(IsSVEFormat(vform));
   2164   uint64_t result = 0;
   2165   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   2166     if (!pg.IsActive(vform, i)) continue;
   2167 
   2168     result |= src.Uint(vform, i);
   2169   }
   2170   VectorFormat vform_dst =
   2171       ScalarFormatFromLaneSize(LaneSizeInBitsFromFormat(vform));
   2172   dst.ClearForWrite(vform_dst);
   2173   dst.SetUint(vform_dst, 0, result);
   2174   return dst;
   2175 }
   2176 
   2177 
   2178 LogicVRegister Simulator::saddv(VectorFormat vform,
   2179                                 LogicVRegister dst,
   2180                                 const LogicPRegister& pg,
   2181                                 const LogicVRegister& src) {
   2182   VIXL_ASSERT(IsSVEFormat(vform));
   2183   VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) <= kSRegSize);
   2184   int64_t result = 0;
   2185   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   2186     if (!pg.IsActive(vform, i)) continue;
   2187 
   2188     // The destination register always has D-lane sizes and the source register
   2189     // always has S-lanes or smaller, so signed integer overflow -- undefined
   2190     // behaviour -- can't occur.
   2191     result += src.Int(vform, i);
   2192   }
   2193 
   2194   dst.ClearForWrite(kFormatD);
   2195   dst.SetInt(kFormatD, 0, result);
   2196   return dst;
   2197 }
   2198 
   2199 
   2200 LogicVRegister Simulator::uaddv(VectorFormat vform,
   2201                                 LogicVRegister dst,
   2202                                 const LogicPRegister& pg,
   2203                                 const LogicVRegister& src) {
   2204   VIXL_ASSERT(IsSVEFormat(vform));
   2205   uint64_t result = 0;
   2206   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   2207     if (!pg.IsActive(vform, i)) continue;
   2208 
   2209     result += src.Uint(vform, i);
   2210   }
   2211 
   2212   dst.ClearForWrite(kFormatD);
   2213   dst.SetUint(kFormatD, 0, result);
   2214   return dst;
   2215 }
   2216 
   2217 
   2218 LogicVRegister Simulator::extractnarrow(VectorFormat dstform,
   2219                                         LogicVRegister dst,
   2220                                         bool dst_is_signed,
   2221                                         const LogicVRegister& src,
   2222                                         bool src_is_signed) {
   2223   bool upperhalf = false;
   2224   VectorFormat srcform = dstform;
   2225   if ((dstform == kFormat16B) || (dstform == kFormat8H) ||
   2226       (dstform == kFormat4S)) {
   2227     upperhalf = true;
   2228     srcform = VectorFormatHalfLanes(srcform);
   2229   }
   2230   srcform = VectorFormatDoubleWidth(srcform);
   2231 
   2232   LogicVRegister src_copy = src;
   2233 
   2234   int offset;
   2235   if (upperhalf) {
   2236     offset = LaneCountFromFormat(dstform) / 2;
   2237   } else {
   2238     offset = 0;
   2239   }
   2240 
   2241   for (int i = 0; i < LaneCountFromFormat(srcform); i++) {
   2242     int64_t ssrc = src_copy.Int(srcform, i);
   2243     uint64_t usrc = src_copy.Uint(srcform, i);
   2244 
   2245     // Test for signed saturation
   2246     if (ssrc > MaxIntFromFormat(dstform)) {
   2247       dst.SetSignedSat(offset + i, true);
   2248     } else if (ssrc < MinIntFromFormat(dstform)) {
   2249       dst.SetSignedSat(offset + i, false);
   2250     }
   2251 
   2252     // Test for unsigned saturation
   2253     if (src_is_signed) {
   2254       if (ssrc > static_cast<int64_t>(MaxUintFromFormat(dstform))) {
   2255         dst.SetUnsignedSat(offset + i, true);
   2256       } else if (ssrc < 0) {
   2257         dst.SetUnsignedSat(offset + i, false);
   2258       }
   2259     } else {
   2260       if (usrc > MaxUintFromFormat(dstform)) {
   2261         dst.SetUnsignedSat(offset + i, true);
   2262       }
   2263     }
   2264 
   2265     int64_t result;
   2266     if (src_is_signed) {
   2267       result = ssrc & MaxUintFromFormat(dstform);
   2268     } else {
   2269       result = usrc & MaxUintFromFormat(dstform);
   2270     }
   2271 
   2272     if (dst_is_signed) {
   2273       dst.SetInt(dstform, offset + i, result);
   2274     } else {
   2275       dst.SetUint(dstform, offset + i, result);
   2276     }
   2277   }
   2278 
   2279   if (upperhalf) {
   2280     // Clear any bits beyond a Q register.
   2281     dst.ClearForWrite(kFormat16B);
   2282   } else {
   2283     dst.ClearForWrite(dstform);
   2284   }
   2285   return dst;
   2286 }
   2287 
   2288 
   2289 LogicVRegister Simulator::xtn(VectorFormat vform,
   2290                               LogicVRegister dst,
   2291                               const LogicVRegister& src) {
   2292   return extractnarrow(vform, dst, true, src, true);
   2293 }
   2294 
   2295 
   2296 LogicVRegister Simulator::sqxtn(VectorFormat vform,
   2297                                 LogicVRegister dst,
   2298                                 const LogicVRegister& src) {
   2299   return extractnarrow(vform, dst, true, src, true).SignedSaturate(vform);
   2300 }
   2301 
   2302 
   2303 LogicVRegister Simulator::sqxtun(VectorFormat vform,
   2304                                  LogicVRegister dst,
   2305                                  const LogicVRegister& src) {
   2306   return extractnarrow(vform, dst, false, src, true).UnsignedSaturate(vform);
   2307 }
   2308 
   2309 
   2310 LogicVRegister Simulator::uqxtn(VectorFormat vform,
   2311                                 LogicVRegister dst,
   2312                                 const LogicVRegister& src) {
   2313   return extractnarrow(vform, dst, false, src, false).UnsignedSaturate(vform);
   2314 }
   2315 
   2316 
   2317 LogicVRegister Simulator::absdiff(VectorFormat vform,
   2318                                   LogicVRegister dst,
   2319                                   const LogicVRegister& src1,
   2320                                   const LogicVRegister& src2,
   2321                                   bool is_signed) {
   2322   dst.ClearForWrite(vform);
   2323   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   2324     bool src1_gt_src2 = is_signed ? (src1.Int(vform, i) > src2.Int(vform, i))
   2325                                   : (src1.Uint(vform, i) > src2.Uint(vform, i));
   2326     // Always calculate the answer using unsigned arithmetic, to avoid
   2327     // implementation-defined signed overflow.
   2328     if (src1_gt_src2) {
   2329       dst.SetUint(vform, i, src1.Uint(vform, i) - src2.Uint(vform, i));
   2330     } else {
   2331       dst.SetUint(vform, i, src2.Uint(vform, i) - src1.Uint(vform, i));
   2332     }
   2333   }
   2334   return dst;
   2335 }
   2336 
   2337 
   2338 LogicVRegister Simulator::saba(VectorFormat vform,
   2339                                LogicVRegister dst,
   2340                                const LogicVRegister& src1,
   2341                                const LogicVRegister& src2) {
   2342   SimVRegister temp;
   2343   dst.ClearForWrite(vform);
   2344   absdiff(vform, temp, src1, src2, true);
   2345   add(vform, dst, dst, temp);
   2346   return dst;
   2347 }
   2348 
   2349 
   2350 LogicVRegister Simulator::uaba(VectorFormat vform,
   2351                                LogicVRegister dst,
   2352                                const LogicVRegister& src1,
   2353                                const LogicVRegister& src2) {
   2354   SimVRegister temp;
   2355   dst.ClearForWrite(vform);
   2356   absdiff(vform, temp, src1, src2, false);
   2357   add(vform, dst, dst, temp);
   2358   return dst;
   2359 }
   2360 
   2361 
   2362 LogicVRegister Simulator::not_(VectorFormat vform,
   2363                                LogicVRegister dst,
   2364                                const LogicVRegister& src) {
   2365   dst.ClearForWrite(vform);
   2366   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   2367     dst.SetUint(vform, i, ~src.Uint(vform, i));
   2368   }
   2369   return dst;
   2370 }
   2371 
   2372 
   2373 LogicVRegister Simulator::rbit(VectorFormat vform,
   2374                                LogicVRegister dst,
   2375                                const LogicVRegister& src) {
   2376   uint64_t result[kZRegMaxSizeInBytes];
   2377   int lane_count = LaneCountFromFormat(vform);
   2378   int lane_size_in_bits = LaneSizeInBitsFromFormat(vform);
   2379   uint64_t reversed_value;
   2380   uint64_t value;
   2381   for (int i = 0; i < lane_count; i++) {
   2382     value = src.Uint(vform, i);
   2383     reversed_value = 0;
   2384     for (int j = 0; j < lane_size_in_bits; j++) {
   2385       reversed_value = (reversed_value << 1) | (value & 1);
   2386       value >>= 1;
   2387     }
   2388     result[i] = reversed_value;
   2389   }
   2390 
   2391   dst.ClearForWrite(vform);
   2392   for (int i = 0; i < lane_count; ++i) {
   2393     dst.SetUint(vform, i, result[i]);
   2394   }
   2395   return dst;
   2396 }
   2397 
   2398 
   2399 LogicVRegister Simulator::rev(VectorFormat vform,
   2400                               LogicVRegister dst,
   2401                               const LogicVRegister& src) {
   2402   VIXL_ASSERT(IsSVEFormat(vform));
   2403   int lane_count = LaneCountFromFormat(vform);
   2404   for (int i = 0; i < lane_count / 2; i++) {
   2405     uint64_t t = src.Uint(vform, i);
   2406     dst.SetUint(vform, i, src.Uint(vform, lane_count - i - 1));
   2407     dst.SetUint(vform, lane_count - i - 1, t);
   2408   }
   2409   return dst;
   2410 }
   2411 
   2412 
   2413 LogicVRegister Simulator::rev_byte(VectorFormat vform,
   2414                                    LogicVRegister dst,
   2415                                    const LogicVRegister& src,
   2416                                    int rev_size) {
   2417   uint64_t result[kZRegMaxSizeInBytes] = {};
   2418   int lane_count = LaneCountFromFormat(vform);
   2419   int lane_size = LaneSizeInBytesFromFormat(vform);
   2420   int lanes_per_loop = rev_size / lane_size;
   2421   for (int i = 0; i < lane_count; i += lanes_per_loop) {
   2422     for (int j = 0; j < lanes_per_loop; j++) {
   2423       result[i + lanes_per_loop - 1 - j] = src.Uint(vform, i + j);
   2424     }
   2425   }
   2426   dst.ClearForWrite(vform);
   2427   for (int i = 0; i < lane_count; ++i) {
   2428     dst.SetUint(vform, i, result[i]);
   2429   }
   2430   return dst;
   2431 }
   2432 
   2433 
   2434 LogicVRegister Simulator::rev16(VectorFormat vform,
   2435                                 LogicVRegister dst,
   2436                                 const LogicVRegister& src) {
   2437   return rev_byte(vform, dst, src, 2);
   2438 }
   2439 
   2440 
   2441 LogicVRegister Simulator::rev32(VectorFormat vform,
   2442                                 LogicVRegister dst,
   2443                                 const LogicVRegister& src) {
   2444   return rev_byte(vform, dst, src, 4);
   2445 }
   2446 
   2447 
   2448 LogicVRegister Simulator::rev64(VectorFormat vform,
   2449                                 LogicVRegister dst,
   2450                                 const LogicVRegister& src) {
   2451   return rev_byte(vform, dst, src, 8);
   2452 }
   2453 
   2454 LogicVRegister Simulator::addlp(VectorFormat vform,
   2455                                 LogicVRegister dst,
   2456                                 const LogicVRegister& src,
   2457                                 bool is_signed,
   2458                                 bool do_accumulate) {
   2459   VectorFormat vformsrc = VectorFormatHalfWidthDoubleLanes(vform);
   2460   VIXL_ASSERT(LaneSizeInBitsFromFormat(vformsrc) <= kSRegSize);
   2461 
   2462   uint64_t result[kZRegMaxSizeInBytes];
   2463   int lane_count = LaneCountFromFormat(vform);
   2464   for (int i = 0; i < lane_count; i++) {
   2465     if (is_signed) {
   2466       result[i] = static_cast<uint64_t>(src.Int(vformsrc, 2 * i) +
   2467                                         src.Int(vformsrc, 2 * i + 1));
   2468     } else {
   2469       result[i] = src.Uint(vformsrc, 2 * i) + src.Uint(vformsrc, 2 * i + 1);
   2470     }
   2471   }
   2472 
   2473   dst.ClearForWrite(vform);
   2474   for (int i = 0; i < lane_count; ++i) {
   2475     if (do_accumulate) {
   2476       result[i] += dst.Uint(vform, i);
   2477     }
   2478     dst.SetUint(vform, i, result[i]);
   2479   }
   2480 
   2481   return dst;
   2482 }
   2483 
   2484 
   2485 LogicVRegister Simulator::saddlp(VectorFormat vform,
   2486                                  LogicVRegister dst,
   2487                                  const LogicVRegister& src) {
   2488   return addlp(vform, dst, src, true, false);
   2489 }
   2490 
   2491 
   2492 LogicVRegister Simulator::uaddlp(VectorFormat vform,
   2493                                  LogicVRegister dst,
   2494                                  const LogicVRegister& src) {
   2495   return addlp(vform, dst, src, false, false);
   2496 }
   2497 
   2498 
   2499 LogicVRegister Simulator::sadalp(VectorFormat vform,
   2500                                  LogicVRegister dst,
   2501                                  const LogicVRegister& src) {
   2502   return addlp(vform, dst, src, true, true);
   2503 }
   2504 
   2505 
   2506 LogicVRegister Simulator::uadalp(VectorFormat vform,
   2507                                  LogicVRegister dst,
   2508                                  const LogicVRegister& src) {
   2509   return addlp(vform, dst, src, false, true);
   2510 }
   2511 
   2512 LogicVRegister Simulator::ror(VectorFormat vform,
   2513                               LogicVRegister dst,
   2514                               const LogicVRegister& src,
   2515                               int rotation) {
   2516   int width = LaneSizeInBitsFromFormat(vform);
   2517   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   2518     uint64_t value = src.Uint(vform, i);
   2519     dst.SetUint(vform, i, RotateRight(value, rotation, width));
   2520   }
   2521   return dst;
   2522 }
   2523 
   2524 LogicVRegister Simulator::ext(VectorFormat vform,
   2525                               LogicVRegister dst,
   2526                               const LogicVRegister& src1,
   2527                               const LogicVRegister& src2,
   2528                               int index) {
   2529   uint8_t result[kZRegMaxSizeInBytes] = {};
   2530   int lane_count = LaneCountFromFormat(vform);
   2531   for (int i = 0; i < lane_count - index; ++i) {
   2532     result[i] = src1.Uint(vform, i + index);
   2533   }
   2534   for (int i = 0; i < index; ++i) {
   2535     result[lane_count - index + i] = src2.Uint(vform, i);
   2536   }
   2537   dst.ClearForWrite(vform);
   2538   for (int i = 0; i < lane_count; ++i) {
   2539     dst.SetUint(vform, i, result[i]);
   2540   }
   2541   return dst;
   2542 }
   2543 
   2544 LogicVRegister Simulator::rotate_elements_right(VectorFormat vform,
   2545                                                 LogicVRegister dst,
   2546                                                 const LogicVRegister& src,
   2547                                                 int index) {
   2548   if (index < 0) index += LaneCountFromFormat(vform);
   2549   VIXL_ASSERT((index >= 0) && (index < LaneCountFromFormat(vform)));
   2550   index *= LaneSizeInBytesFromFormat(vform);
   2551   return ext(kFormatVnB, dst, src, src, index);
   2552 }
   2553 
   2554 
   2555 template <typename T>
   2556 LogicVRegister Simulator::fadda(VectorFormat vform,
   2557                                 LogicVRegister acc,
   2558                                 const LogicPRegister& pg,
   2559                                 const LogicVRegister& src) {
   2560   T result = acc.Float<T>(0);
   2561   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   2562     if (!pg.IsActive(vform, i)) continue;
   2563 
   2564     result = FPAdd(result, src.Float<T>(i));
   2565   }
   2566   VectorFormat vform_dst =
   2567       ScalarFormatFromLaneSize(LaneSizeInBitsFromFormat(vform));
   2568   acc.ClearForWrite(vform_dst);
   2569   acc.SetFloat(0, result);
   2570   return acc;
   2571 }
   2572 
   2573 LogicVRegister Simulator::fadda(VectorFormat vform,
   2574                                 LogicVRegister acc,
   2575                                 const LogicPRegister& pg,
   2576                                 const LogicVRegister& src) {
   2577   switch (LaneSizeInBitsFromFormat(vform)) {
   2578     case kHRegSize:
   2579       fadda<SimFloat16>(vform, acc, pg, src);
   2580       break;
   2581     case kSRegSize:
   2582       fadda<float>(vform, acc, pg, src);
   2583       break;
   2584     case kDRegSize:
   2585       fadda<double>(vform, acc, pg, src);
   2586       break;
   2587     default:
   2588       VIXL_UNREACHABLE();
   2589   }
   2590   return acc;
   2591 }
   2592 
   2593 template <typename T>
   2594 LogicVRegister Simulator::fcadd(VectorFormat vform,
   2595                                 LogicVRegister dst,          // d
   2596                                 const LogicVRegister& src1,  // n
   2597                                 const LogicVRegister& src2,  // m
   2598                                 int rot) {
   2599   int elements = LaneCountFromFormat(vform);
   2600 
   2601   T element1, element3;
   2602   rot = (rot == 1) ? 270 : 90;
   2603 
   2604   // Loop example:
   2605   // 2S --> (2/2 = 1 - 1 = 0) --> 1 x Complex Number (2x components: r+i)
   2606   // 4S --> (4/2 = 2) - 1 = 1) --> 2 x Complex Number (2x2 components: r+i)
   2607 
   2608   for (int e = 0; e <= (elements / 2) - 1; e++) {
   2609     switch (rot) {
   2610       case 90:
   2611         element1 = FPNeg(src2.Float<T>(e * 2 + 1));
   2612         element3 = src2.Float<T>(e * 2);
   2613         break;
   2614       case 270:
   2615         element1 = src2.Float<T>(e * 2 + 1);
   2616         element3 = FPNeg(src2.Float<T>(e * 2));
   2617         break;
   2618       default:
   2619         VIXL_UNREACHABLE();
   2620         return dst;  // prevents "element(n) may be unintialized" errors
   2621     }
   2622     dst.ClearForWrite(vform);
   2623     dst.SetFloat<T>(e * 2, FPAdd(src1.Float<T>(e * 2), element1));
   2624     dst.SetFloat<T>(e * 2 + 1, FPAdd(src1.Float<T>(e * 2 + 1), element3));
   2625   }
   2626   return dst;
   2627 }
   2628 
   2629 
   2630 LogicVRegister Simulator::fcadd(VectorFormat vform,
   2631                                 LogicVRegister dst,          // d
   2632                                 const LogicVRegister& src1,  // n
   2633                                 const LogicVRegister& src2,  // m
   2634                                 int rot) {
   2635   if (LaneSizeInBitsFromFormat(vform) == kHRegSize) {
   2636     fcadd<SimFloat16>(vform, dst, src1, src2, rot);
   2637   } else if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
   2638     fcadd<float>(vform, dst, src1, src2, rot);
   2639   } else {
   2640     VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kDRegSize);
   2641     fcadd<double>(vform, dst, src1, src2, rot);
   2642   }
   2643   return dst;
   2644 }
   2645 
   2646 template <typename T>
   2647 LogicVRegister Simulator::fcmla(VectorFormat vform,
   2648                                 LogicVRegister dst,
   2649                                 const LogicVRegister& src1,
   2650                                 const LogicVRegister& src2,
   2651                                 const LogicVRegister& acc,
   2652                                 int index,
   2653                                 int rot) {
   2654   int elements = LaneCountFromFormat(vform);
   2655 
   2656   T element1, element2, element3, element4;
   2657   rot *= 90;
   2658 
   2659   // Loop example:
   2660   // 2S --> (2/2 = 1 - 1 = 0) --> 1 x Complex Number (2x components: r+i)
   2661   // 4S --> (4/2 = 2) - 1 = 1) --> 2 x Complex Number (2x2 components: r+i)
   2662 
   2663   for (int e = 0; e <= (elements / 2) - 1; e++) {
   2664     // Index == -1 indicates a vector/vector rather than vector/indexed-element
   2665     // operation.
   2666     int f = (index < 0) ? e : index;
   2667 
   2668     switch (rot) {
   2669       case 0:
   2670         element1 = src2.Float<T>(f * 2);
   2671         element2 = src1.Float<T>(e * 2);
   2672         element3 = src2.Float<T>(f * 2 + 1);
   2673         element4 = src1.Float<T>(e * 2);
   2674         break;
   2675       case 90:
   2676         element1 = FPNeg(src2.Float<T>(f * 2 + 1));
   2677         element2 = src1.Float<T>(e * 2 + 1);
   2678         element3 = src2.Float<T>(f * 2);
   2679         element4 = src1.Float<T>(e * 2 + 1);
   2680         break;
   2681       case 180:
   2682         element1 = FPNeg(src2.Float<T>(f * 2));
   2683         element2 = src1.Float<T>(e * 2);
   2684         element3 = FPNeg(src2.Float<T>(f * 2 + 1));
   2685         element4 = src1.Float<T>(e * 2);
   2686         break;
   2687       case 270:
   2688         element1 = src2.Float<T>(f * 2 + 1);
   2689         element2 = src1.Float<T>(e * 2 + 1);
   2690         element3 = FPNeg(src2.Float<T>(f * 2));
   2691         element4 = src1.Float<T>(e * 2 + 1);
   2692         break;
   2693       default:
   2694         VIXL_UNREACHABLE();
   2695         return dst;  // prevents "element(n) may be unintialized" errors
   2696     }
   2697     dst.ClearForWrite(vform);
   2698     dst.SetFloat<T>(vform,
   2699                     e * 2,
   2700                     FPMulAdd(acc.Float<T>(e * 2), element2, element1));
   2701     dst.SetFloat<T>(vform,
   2702                     e * 2 + 1,
   2703                     FPMulAdd(acc.Float<T>(e * 2 + 1), element4, element3));
   2704   }
   2705   return dst;
   2706 }
   2707 
   2708 LogicVRegister Simulator::fcmla(VectorFormat vform,
   2709                                 LogicVRegister dst,
   2710                                 const LogicVRegister& src1,
   2711                                 const LogicVRegister& src2,
   2712                                 const LogicVRegister& acc,
   2713                                 int rot) {
   2714   if (LaneSizeInBitsFromFormat(vform) == kHRegSize) {
   2715     fcmla<SimFloat16>(vform, dst, src1, src2, acc, -1, rot);
   2716   } else if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
   2717     fcmla<float>(vform, dst, src1, src2, acc, -1, rot);
   2718   } else {
   2719     fcmla<double>(vform, dst, src1, src2, acc, -1, rot);
   2720   }
   2721   return dst;
   2722 }
   2723 
   2724 
   2725 LogicVRegister Simulator::fcmla(VectorFormat vform,
   2726                                 LogicVRegister dst,          // d
   2727                                 const LogicVRegister& src1,  // n
   2728                                 const LogicVRegister& src2,  // m
   2729                                 int index,
   2730                                 int rot) {
   2731   if (LaneSizeInBitsFromFormat(vform) == kHRegSize) {
   2732     fcmla<SimFloat16>(vform, dst, src1, src2, dst, index, rot);
   2733   } else if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
   2734     fcmla<float>(vform, dst, src1, src2, dst, index, rot);
   2735   } else {
   2736     fcmla<double>(vform, dst, src1, src2, dst, index, rot);
   2737   }
   2738   return dst;
   2739 }
   2740 
   2741 LogicVRegister Simulator::cadd(VectorFormat vform,
   2742                                LogicVRegister dst,
   2743                                const LogicVRegister& src1,
   2744                                const LogicVRegister& src2,
   2745                                int rot,
   2746                                bool saturate) {
   2747   SimVRegister src1_r, src1_i;
   2748   SimVRegister src2_r, src2_i;
   2749   SimVRegister zero;
   2750   zero.Clear();
   2751   uzp1(vform, src1_r, src1, zero);
   2752   uzp2(vform, src1_i, src1, zero);
   2753   uzp1(vform, src2_r, src2, zero);
   2754   uzp2(vform, src2_i, src2, zero);
   2755 
   2756   if (rot == 90) {
   2757     if (saturate) {
   2758       sub(vform, src1_r, src1_r, src2_i).SignedSaturate(vform);
   2759       add(vform, src1_i, src1_i, src2_r).SignedSaturate(vform);
   2760     } else {
   2761       sub(vform, src1_r, src1_r, src2_i);
   2762       add(vform, src1_i, src1_i, src2_r);
   2763     }
   2764   } else {
   2765     VIXL_ASSERT(rot == 270);
   2766     if (saturate) {
   2767       add(vform, src1_r, src1_r, src2_i).SignedSaturate(vform);
   2768       sub(vform, src1_i, src1_i, src2_r).SignedSaturate(vform);
   2769     } else {
   2770       add(vform, src1_r, src1_r, src2_i);
   2771       sub(vform, src1_i, src1_i, src2_r);
   2772     }
   2773   }
   2774 
   2775   zip1(vform, dst, src1_r, src1_i);
   2776   return dst;
   2777 }
   2778 
   2779 LogicVRegister Simulator::cmla(VectorFormat vform,
   2780                                LogicVRegister dst,
   2781                                const LogicVRegister& srca,
   2782                                const LogicVRegister& src1,
   2783                                const LogicVRegister& src2,
   2784                                int rot) {
   2785   SimVRegister src1_a;
   2786   SimVRegister src2_a, src2_b;
   2787   SimVRegister srca_i, srca_r;
   2788   SimVRegister zero, temp;
   2789   zero.Clear();
   2790 
   2791   if ((rot == 0) || (rot == 180)) {
   2792     uzp1(vform, src1_a, src1, zero);
   2793     uzp1(vform, src2_a, src2, zero);
   2794     uzp2(vform, src2_b, src2, zero);
   2795   } else {
   2796     uzp2(vform, src1_a, src1, zero);
   2797     uzp2(vform, src2_a, src2, zero);
   2798     uzp1(vform, src2_b, src2, zero);
   2799   }
   2800 
   2801   uzp1(vform, srca_r, srca, zero);
   2802   uzp2(vform, srca_i, srca, zero);
   2803 
   2804   bool sub_r = (rot == 90) || (rot == 180);
   2805   bool sub_i = (rot == 180) || (rot == 270);
   2806 
   2807   mul(vform, temp, src1_a, src2_a);
   2808   if (sub_r) {
   2809     sub(vform, srca_r, srca_r, temp);
   2810   } else {
   2811     add(vform, srca_r, srca_r, temp);
   2812   }
   2813 
   2814   mul(vform, temp, src1_a, src2_b);
   2815   if (sub_i) {
   2816     sub(vform, srca_i, srca_i, temp);
   2817   } else {
   2818     add(vform, srca_i, srca_i, temp);
   2819   }
   2820 
   2821   zip1(vform, dst, srca_r, srca_i);
   2822   return dst;
   2823 }
   2824 
   2825 LogicVRegister Simulator::cmla(VectorFormat vform,
   2826                                LogicVRegister dst,
   2827                                const LogicVRegister& srca,
   2828                                const LogicVRegister& src1,
   2829                                const LogicVRegister& src2,
   2830                                int index,
   2831                                int rot) {
   2832   SimVRegister temp;
   2833   dup_elements_to_segments(VectorFormatDoubleWidth(vform), temp, src2, index);
   2834   return cmla(vform, dst, srca, src1, temp, rot);
   2835 }
   2836 
   2837 LogicVRegister Simulator::bgrp(VectorFormat vform,
   2838                                LogicVRegister dst,
   2839                                const LogicVRegister& src1,
   2840                                const LogicVRegister& src2,
   2841                                bool do_bext) {
   2842   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   2843     uint64_t value = src1.Uint(vform, i);
   2844     uint64_t mask = src2.Uint(vform, i);
   2845     int high_pos = 0;
   2846     int low_pos = 0;
   2847     uint64_t result_high = 0;
   2848     uint64_t result_low = 0;
   2849     for (unsigned j = 0; j < LaneSizeInBitsFromFormat(vform); j++) {
   2850       if ((mask & 1) == 0) {
   2851         result_high |= (value & 1) << high_pos;
   2852         high_pos++;
   2853       } else {
   2854         result_low |= (value & 1) << low_pos;
   2855         low_pos++;
   2856       }
   2857       mask >>= 1;
   2858       value >>= 1;
   2859     }
   2860 
   2861     if (!do_bext) {
   2862       result_low |= result_high << low_pos;
   2863     }
   2864 
   2865     dst.SetUint(vform, i, result_low);
   2866   }
   2867   return dst;
   2868 }
   2869 
   2870 LogicVRegister Simulator::bdep(VectorFormat vform,
   2871                                LogicVRegister dst,
   2872                                const LogicVRegister& src1,
   2873                                const LogicVRegister& src2) {
   2874   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   2875     uint64_t value = src1.Uint(vform, i);
   2876     uint64_t mask = src2.Uint(vform, i);
   2877     uint64_t result = 0;
   2878     for (unsigned j = 0; j < LaneSizeInBitsFromFormat(vform); j++) {
   2879       if ((mask & 1) == 1) {
   2880         result |= (value & 1) << j;
   2881         value >>= 1;
   2882       }
   2883       mask >>= 1;
   2884     }
   2885     dst.SetUint(vform, i, result);
   2886   }
   2887   return dst;
   2888 }
   2889 
   2890 LogicVRegister Simulator::histogram(VectorFormat vform,
   2891                                     LogicVRegister dst,
   2892                                     const LogicPRegister& pg,
   2893                                     const LogicVRegister& src1,
   2894                                     const LogicVRegister& src2,
   2895                                     bool do_segmented) {
   2896   int elements_per_segment = kQRegSize / LaneSizeInBitsFromFormat(vform);
   2897   uint64_t result[kZRegMaxSizeInBytes];
   2898 
   2899   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   2900     uint64_t count = 0;
   2901     uint64_t value = src1.Uint(vform, i);
   2902 
   2903     int segment = do_segmented ? (i / elements_per_segment) : 0;
   2904     int segment_offset = segment * elements_per_segment;
   2905     int hist_limit = do_segmented ? elements_per_segment : (i + 1);
   2906     for (int j = 0; j < hist_limit; j++) {
   2907       if (pg.IsActive(vform, j) &&
   2908           (value == src2.Uint(vform, j + segment_offset))) {
   2909         count++;
   2910       }
   2911     }
   2912     result[i] = count;
   2913   }
   2914   dst.SetUintArray(vform, result);
   2915   return dst;
   2916 }
   2917 
   2918 LogicVRegister Simulator::dup_element(VectorFormat vform,
   2919                                       LogicVRegister dst,
   2920                                       const LogicVRegister& src,
   2921                                       int src_index) {
   2922   if ((vform == kFormatVnQ) || (vform == kFormatVnO)) {
   2923     // When duplicating an element larger than 64 bits, split the element into
   2924     // 64-bit parts, and duplicate the parts across the destination.
   2925     uint64_t d[4];
   2926     int count = (vform == kFormatVnQ) ? 2 : 4;
   2927     for (int i = 0; i < count; i++) {
   2928       d[i] = src.Uint(kFormatVnD, (src_index * count) + i);
   2929     }
   2930     dst.Clear();
   2931     for (int i = 0; i < LaneCountFromFormat(vform) * count; i++) {
   2932       dst.SetUint(kFormatVnD, i, d[i % count]);
   2933     }
   2934   } else {
   2935     int lane_count = LaneCountFromFormat(vform);
   2936     uint64_t value = src.Uint(vform, src_index);
   2937     dst.ClearForWrite(vform);
   2938     for (int i = 0; i < lane_count; ++i) {
   2939       dst.SetUint(vform, i, value);
   2940     }
   2941   }
   2942   return dst;
   2943 }
   2944 
   2945 LogicVRegister Simulator::dup_elements_to_segments(VectorFormat vform,
   2946                                                    LogicVRegister dst,
   2947                                                    const LogicVRegister& src,
   2948                                                    int src_index) {
   2949   // In SVE, a segment is a 128-bit portion of a vector, like a Q register,
   2950   // whereas in NEON, the size of segment is equal to the size of register
   2951   // itself.
   2952   int segment_size = std::min(kQRegSize, RegisterSizeInBitsFromFormat(vform));
   2953   VIXL_ASSERT(IsMultiple(segment_size, LaneSizeInBitsFromFormat(vform)));
   2954   int lanes_per_segment = segment_size / LaneSizeInBitsFromFormat(vform);
   2955 
   2956   VIXL_ASSERT(src_index >= 0);
   2957   VIXL_ASSERT(src_index < lanes_per_segment);
   2958 
   2959   dst.ClearForWrite(vform);
   2960   for (int j = 0; j < LaneCountFromFormat(vform); j += lanes_per_segment) {
   2961     uint64_t value = src.Uint(vform, j + src_index);
   2962     for (int i = 0; i < lanes_per_segment; i++) {
   2963       dst.SetUint(vform, j + i, value);
   2964     }
   2965   }
   2966   return dst;
   2967 }
   2968 
   2969 LogicVRegister Simulator::dup_elements_to_segments(
   2970     VectorFormat vform,
   2971     LogicVRegister dst,
   2972     const std::pair<int, int>& src_and_index) {
   2973   return dup_elements_to_segments(vform,
   2974                                   dst,
   2975                                   ReadVRegister(src_and_index.first),
   2976                                   src_and_index.second);
   2977 }
   2978 
   2979 LogicVRegister Simulator::dup_immediate(VectorFormat vform,
   2980                                         LogicVRegister dst,
   2981                                         uint64_t imm) {
   2982   int lane_count = LaneCountFromFormat(vform);
   2983   uint64_t value = imm & MaxUintFromFormat(vform);
   2984   dst.ClearForWrite(vform);
   2985   for (int i = 0; i < lane_count; ++i) {
   2986     dst.SetUint(vform, i, value);
   2987   }
   2988   return dst;
   2989 }
   2990 
   2991 
   2992 LogicVRegister Simulator::ins_element(VectorFormat vform,
   2993                                       LogicVRegister dst,
   2994                                       int dst_index,
   2995                                       const LogicVRegister& src,
   2996                                       int src_index) {
   2997   dst.SetUint(vform, dst_index, src.Uint(vform, src_index));
   2998   return dst;
   2999 }
   3000 
   3001 
   3002 LogicVRegister Simulator::ins_immediate(VectorFormat vform,
   3003                                         LogicVRegister dst,
   3004                                         int dst_index,
   3005                                         uint64_t imm) {
   3006   uint64_t value = imm & MaxUintFromFormat(vform);
   3007   dst.SetUint(vform, dst_index, value);
   3008   return dst;
   3009 }
   3010 
   3011 
   3012 LogicVRegister Simulator::index(VectorFormat vform,
   3013                                 LogicVRegister dst,
   3014                                 uint64_t start,
   3015                                 uint64_t step) {
   3016   VIXL_ASSERT(IsSVEFormat(vform));
   3017   uint64_t value = start;
   3018   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   3019     dst.SetUint(vform, i, value);
   3020     value += step;
   3021   }
   3022   return dst;
   3023 }
   3024 
   3025 
   3026 LogicVRegister Simulator::insr(VectorFormat vform,
   3027                                LogicVRegister dst,
   3028                                uint64_t imm) {
   3029   VIXL_ASSERT(IsSVEFormat(vform));
   3030   for (int i = LaneCountFromFormat(vform) - 1; i > 0; i--) {
   3031     dst.SetUint(vform, i, dst.Uint(vform, i - 1));
   3032   }
   3033   dst.SetUint(vform, 0, imm);
   3034   return dst;
   3035 }
   3036 
   3037 
   3038 LogicVRegister Simulator::mov(VectorFormat vform,
   3039                               LogicVRegister dst,
   3040                               const LogicVRegister& src) {
   3041   dst.ClearForWrite(vform);
   3042   for (int lane = 0; lane < LaneCountFromFormat(vform); lane++) {
   3043     dst.SetUint(vform, lane, src.Uint(vform, lane));
   3044   }
   3045   return dst;
   3046 }
   3047 
   3048 
   3049 LogicPRegister Simulator::mov(LogicPRegister dst, const LogicPRegister& src) {
   3050   // Avoid a copy if the registers already alias.
   3051   if (dst.Aliases(src)) return dst;
   3052 
   3053   for (int i = 0; i < dst.GetChunkCount(); i++) {
   3054     dst.SetChunk(i, src.GetChunk(i));
   3055   }
   3056   return dst;
   3057 }
   3058 
   3059 
   3060 LogicVRegister Simulator::mov_merging(VectorFormat vform,
   3061                                       LogicVRegister dst,
   3062                                       const SimPRegister& pg,
   3063                                       const LogicVRegister& src) {
   3064   return sel(vform, dst, pg, src, dst);
   3065 }
   3066 
   3067 LogicVRegister Simulator::mov_zeroing(VectorFormat vform,
   3068                                       LogicVRegister dst,
   3069                                       const SimPRegister& pg,
   3070                                       const LogicVRegister& src) {
   3071   SimVRegister zero;
   3072   dup_immediate(vform, zero, 0);
   3073   return sel(vform, dst, pg, src, zero);
   3074 }
   3075 
   3076 LogicVRegister Simulator::mov_alternating(VectorFormat vform,
   3077                                           LogicVRegister dst,
   3078                                           const LogicVRegister& src,
   3079                                           int start_at) {
   3080   VIXL_ASSERT((start_at == 0) || (start_at == 1));
   3081   for (int i = start_at; i < LaneCountFromFormat(vform); i += 2) {
   3082     dst.SetUint(vform, i, src.Uint(vform, i));
   3083   }
   3084   return dst;
   3085 }
   3086 
   3087 LogicPRegister Simulator::mov_merging(LogicPRegister dst,
   3088                                       const LogicPRegister& pg,
   3089                                       const LogicPRegister& src) {
   3090   return sel(dst, pg, src, dst);
   3091 }
   3092 
   3093 LogicPRegister Simulator::mov_zeroing(LogicPRegister dst,
   3094                                       const LogicPRegister& pg,
   3095                                       const LogicPRegister& src) {
   3096   SimPRegister all_false;
   3097   return sel(dst, pg, src, pfalse(all_false));
   3098 }
   3099 
   3100 LogicVRegister Simulator::movi(VectorFormat vform,
   3101                                LogicVRegister dst,
   3102                                uint64_t imm) {
   3103   int lane_count = LaneCountFromFormat(vform);
   3104   dst.ClearForWrite(vform);
   3105   for (int i = 0; i < lane_count; ++i) {
   3106     dst.SetUint(vform, i, imm);
   3107   }
   3108   return dst;
   3109 }
   3110 
   3111 
   3112 LogicVRegister Simulator::mvni(VectorFormat vform,
   3113                                LogicVRegister dst,
   3114                                uint64_t imm) {
   3115   int lane_count = LaneCountFromFormat(vform);
   3116   dst.ClearForWrite(vform);
   3117   for (int i = 0; i < lane_count; ++i) {
   3118     dst.SetUint(vform, i, ~imm);
   3119   }
   3120   return dst;
   3121 }
   3122 
   3123 
   3124 LogicVRegister Simulator::orr(VectorFormat vform,
   3125                               LogicVRegister dst,
   3126                               const LogicVRegister& src,
   3127                               uint64_t imm) {
   3128   uint64_t result[16];
   3129   int lane_count = LaneCountFromFormat(vform);
   3130   for (int i = 0; i < lane_count; ++i) {
   3131     result[i] = src.Uint(vform, i) | imm;
   3132   }
   3133   dst.ClearForWrite(vform);
   3134   for (int i = 0; i < lane_count; ++i) {
   3135     dst.SetUint(vform, i, result[i]);
   3136   }
   3137   return dst;
   3138 }
   3139 
   3140 
   3141 LogicVRegister Simulator::uxtl(VectorFormat vform,
   3142                                LogicVRegister dst,
   3143                                const LogicVRegister& src,
   3144                                bool is_2) {
   3145   VectorFormat vform_half = VectorFormatHalfWidth(vform);
   3146   int lane_count = LaneCountFromFormat(vform);
   3147   int src_offset = is_2 ? lane_count : 0;
   3148 
   3149   dst.ClearForWrite(vform);
   3150   for (int i = 0; i < lane_count; i++) {
   3151     dst.SetUint(vform, i, src.Uint(vform_half, src_offset + i));
   3152   }
   3153   return dst;
   3154 }
   3155 
   3156 
   3157 LogicVRegister Simulator::sxtl(VectorFormat vform,
   3158                                LogicVRegister dst,
   3159                                const LogicVRegister& src,
   3160                                bool is_2) {
   3161   VectorFormat vform_half = VectorFormatHalfWidth(vform);
   3162   int lane_count = LaneCountFromFormat(vform);
   3163   int src_offset = is_2 ? lane_count : 0;
   3164 
   3165   dst.ClearForWrite(vform);
   3166   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   3167     dst.SetInt(vform, i, src.Int(vform_half, src_offset + i));
   3168   }
   3169   return dst;
   3170 }
   3171 
   3172 
   3173 LogicVRegister Simulator::uxtl2(VectorFormat vform,
   3174                                 LogicVRegister dst,
   3175                                 const LogicVRegister& src) {
   3176   return uxtl(vform, dst, src, /* is_2 = */ true);
   3177 }
   3178 
   3179 
   3180 LogicVRegister Simulator::sxtl2(VectorFormat vform,
   3181                                 LogicVRegister dst,
   3182                                 const LogicVRegister& src) {
   3183   return sxtl(vform, dst, src, /* is_2 = */ true);
   3184 }
   3185 
   3186 
   3187 LogicVRegister Simulator::uxt(VectorFormat vform,
   3188                               LogicVRegister dst,
   3189                               const LogicVRegister& src,
   3190                               unsigned from_size_in_bits) {
   3191   int lane_count = LaneCountFromFormat(vform);
   3192   uint64_t mask = GetUintMask(from_size_in_bits);
   3193 
   3194   dst.ClearForWrite(vform);
   3195   for (int i = 0; i < lane_count; i++) {
   3196     dst.SetInt(vform, i, src.Uint(vform, i) & mask);
   3197   }
   3198   return dst;
   3199 }
   3200 
   3201 
   3202 LogicVRegister Simulator::sxt(VectorFormat vform,
   3203                               LogicVRegister dst,
   3204                               const LogicVRegister& src,
   3205                               unsigned from_size_in_bits) {
   3206   int lane_count = LaneCountFromFormat(vform);
   3207 
   3208   dst.ClearForWrite(vform);
   3209   for (int i = 0; i < lane_count; i++) {
   3210     uint64_t value =
   3211         ExtractSignedBitfield64(from_size_in_bits - 1, 0, src.Uint(vform, i));
   3212     dst.SetInt(vform, i, value);
   3213   }
   3214   return dst;
   3215 }
   3216 
   3217 
   3218 LogicVRegister Simulator::shrn(VectorFormat vform,
   3219                                LogicVRegister dst,
   3220                                const LogicVRegister& src,
   3221                                int shift) {
   3222   SimVRegister temp;
   3223   VectorFormat vform_src = VectorFormatDoubleWidth(vform);
   3224   VectorFormat vform_dst = vform;
   3225   LogicVRegister shifted_src = ushr(vform_src, temp, src, shift);
   3226   return extractnarrow(vform_dst, dst, false, shifted_src, false);
   3227 }
   3228 
   3229 
   3230 LogicVRegister Simulator::shrn2(VectorFormat vform,
   3231                                 LogicVRegister dst,
   3232                                 const LogicVRegister& src,
   3233                                 int shift) {
   3234   SimVRegister temp;
   3235   VectorFormat vformsrc = VectorFormatDoubleWidth(VectorFormatHalfLanes(vform));
   3236   VectorFormat vformdst = vform;
   3237   LogicVRegister shifted_src = ushr(vformsrc, temp, src, shift);
   3238   return extractnarrow(vformdst, dst, false, shifted_src, false);
   3239 }
   3240 
   3241 
   3242 LogicVRegister Simulator::rshrn(VectorFormat vform,
   3243                                 LogicVRegister dst,
   3244                                 const LogicVRegister& src,
   3245                                 int shift) {
   3246   SimVRegister temp;
   3247   VectorFormat vformsrc = VectorFormatDoubleWidth(vform);
   3248   VectorFormat vformdst = vform;
   3249   LogicVRegister shifted_src = ushr(vformsrc, temp, src, shift).Round(vformsrc);
   3250   return extractnarrow(vformdst, dst, false, shifted_src, false);
   3251 }
   3252 
   3253 
   3254 LogicVRegister Simulator::rshrn2(VectorFormat vform,
   3255                                  LogicVRegister dst,
   3256                                  const LogicVRegister& src,
   3257                                  int shift) {
   3258   SimVRegister temp;
   3259   VectorFormat vformsrc = VectorFormatDoubleWidth(VectorFormatHalfLanes(vform));
   3260   VectorFormat vformdst = vform;
   3261   LogicVRegister shifted_src = ushr(vformsrc, temp, src, shift).Round(vformsrc);
   3262   return extractnarrow(vformdst, dst, false, shifted_src, false);
   3263 }
   3264 
   3265 LogicVRegister Simulator::Table(VectorFormat vform,
   3266                                 LogicVRegister dst,
   3267                                 const LogicVRegister& ind,
   3268                                 bool zero_out_of_bounds,
   3269                                 const LogicVRegister* tab1,
   3270                                 const LogicVRegister* tab2,
   3271                                 const LogicVRegister* tab3,
   3272                                 const LogicVRegister* tab4) {
   3273   VIXL_ASSERT(tab1 != NULL);
   3274   int lane_count = LaneCountFromFormat(vform);
   3275   VIXL_ASSERT((tab3 == NULL) || (lane_count <= 16));
   3276   uint64_t table[kZRegMaxSizeInBytes * 2];
   3277   uint64_t result[kZRegMaxSizeInBytes];
   3278 
   3279   // For Neon, the table source registers are always 16B, and Neon allows only
   3280   // 8B or 16B vform for the destination, so infer the table format from the
   3281   // destination.
   3282   VectorFormat vform_tab = (vform == kFormat8B) ? kFormat16B : vform;
   3283 
   3284   uint64_t tab_size = tab1->UintArray(vform_tab, &table[0]);
   3285   if (tab2 != NULL) tab_size += tab2->UintArray(vform_tab, &table[tab_size]);
   3286   if (tab3 != NULL) tab_size += tab3->UintArray(vform_tab, &table[tab_size]);
   3287   if (tab4 != NULL) tab_size += tab4->UintArray(vform_tab, &table[tab_size]);
   3288 
   3289   for (int i = 0; i < lane_count; i++) {
   3290     uint64_t index = ind.Uint(vform, i);
   3291     result[i] = zero_out_of_bounds ? 0 : dst.Uint(vform, i);
   3292     if (index < tab_size) result[i] = table[index];
   3293   }
   3294   dst.SetUintArray(vform, result);
   3295   return dst;
   3296 }
   3297 
   3298 LogicVRegister Simulator::tbl(VectorFormat vform,
   3299                               LogicVRegister dst,
   3300                               const LogicVRegister& tab,
   3301                               const LogicVRegister& ind) {
   3302   return Table(vform, dst, ind, true, &tab);
   3303 }
   3304 
   3305 
   3306 LogicVRegister Simulator::tbl(VectorFormat vform,
   3307                               LogicVRegister dst,
   3308                               const LogicVRegister& tab,
   3309                               const LogicVRegister& tab2,
   3310                               const LogicVRegister& ind) {
   3311   return Table(vform, dst, ind, true, &tab, &tab2);
   3312 }
   3313 
   3314 
   3315 LogicVRegister Simulator::tbl(VectorFormat vform,
   3316                               LogicVRegister dst,
   3317                               const LogicVRegister& tab,
   3318                               const LogicVRegister& tab2,
   3319                               const LogicVRegister& tab3,
   3320                               const LogicVRegister& ind) {
   3321   return Table(vform, dst, ind, true, &tab, &tab2, &tab3);
   3322 }
   3323 
   3324 
   3325 LogicVRegister Simulator::tbl(VectorFormat vform,
   3326                               LogicVRegister dst,
   3327                               const LogicVRegister& tab,
   3328                               const LogicVRegister& tab2,
   3329                               const LogicVRegister& tab3,
   3330                               const LogicVRegister& tab4,
   3331                               const LogicVRegister& ind) {
   3332   return Table(vform, dst, ind, true, &tab, &tab2, &tab3, &tab4);
   3333 }
   3334 
   3335 
   3336 LogicVRegister Simulator::tbx(VectorFormat vform,
   3337                               LogicVRegister dst,
   3338                               const LogicVRegister& tab,
   3339                               const LogicVRegister& ind) {
   3340   return Table(vform, dst, ind, false, &tab);
   3341 }
   3342 
   3343 
   3344 LogicVRegister Simulator::tbx(VectorFormat vform,
   3345                               LogicVRegister dst,
   3346                               const LogicVRegister& tab,
   3347                               const LogicVRegister& tab2,
   3348                               const LogicVRegister& ind) {
   3349   return Table(vform, dst, ind, false, &tab, &tab2);
   3350 }
   3351 
   3352 
   3353 LogicVRegister Simulator::tbx(VectorFormat vform,
   3354                               LogicVRegister dst,
   3355                               const LogicVRegister& tab,
   3356                               const LogicVRegister& tab2,
   3357                               const LogicVRegister& tab3,
   3358                               const LogicVRegister& ind) {
   3359   return Table(vform, dst, ind, false, &tab, &tab2, &tab3);
   3360 }
   3361 
   3362 
   3363 LogicVRegister Simulator::tbx(VectorFormat vform,
   3364                               LogicVRegister dst,
   3365                               const LogicVRegister& tab,
   3366                               const LogicVRegister& tab2,
   3367                               const LogicVRegister& tab3,
   3368                               const LogicVRegister& tab4,
   3369                               const LogicVRegister& ind) {
   3370   return Table(vform, dst, ind, false, &tab, &tab2, &tab3, &tab4);
   3371 }
   3372 
   3373 
   3374 LogicVRegister Simulator::uqshrn(VectorFormat vform,
   3375                                  LogicVRegister dst,
   3376                                  const LogicVRegister& src,
   3377                                  int shift) {
   3378   return shrn(vform, dst, src, shift).UnsignedSaturate(vform);
   3379 }
   3380 
   3381 
   3382 LogicVRegister Simulator::uqshrn2(VectorFormat vform,
   3383                                   LogicVRegister dst,
   3384                                   const LogicVRegister& src,
   3385                                   int shift) {
   3386   return shrn2(vform, dst, src, shift).UnsignedSaturate(vform);
   3387 }
   3388 
   3389 
   3390 LogicVRegister Simulator::uqrshrn(VectorFormat vform,
   3391                                   LogicVRegister dst,
   3392                                   const LogicVRegister& src,
   3393                                   int shift) {
   3394   return rshrn(vform, dst, src, shift).UnsignedSaturate(vform);
   3395 }
   3396 
   3397 
   3398 LogicVRegister Simulator::uqrshrn2(VectorFormat vform,
   3399                                    LogicVRegister dst,
   3400                                    const LogicVRegister& src,
   3401                                    int shift) {
   3402   return rshrn2(vform, dst, src, shift).UnsignedSaturate(vform);
   3403 }
   3404 
   3405 
   3406 LogicVRegister Simulator::sqshrn(VectorFormat vform,
   3407                                  LogicVRegister dst,
   3408                                  const LogicVRegister& src,
   3409                                  int shift) {
   3410   SimVRegister temp;
   3411   VectorFormat vformsrc = VectorFormatDoubleWidth(vform);
   3412   VectorFormat vformdst = vform;
   3413   LogicVRegister shifted_src = sshr(vformsrc, temp, src, shift);
   3414   return sqxtn(vformdst, dst, shifted_src);
   3415 }
   3416 
   3417 
   3418 LogicVRegister Simulator::sqshrn2(VectorFormat vform,
   3419                                   LogicVRegister dst,
   3420                                   const LogicVRegister& src,
   3421                                   int shift) {
   3422   SimVRegister temp;
   3423   VectorFormat vformsrc = VectorFormatDoubleWidth(VectorFormatHalfLanes(vform));
   3424   VectorFormat vformdst = vform;
   3425   LogicVRegister shifted_src = sshr(vformsrc, temp, src, shift);
   3426   return sqxtn(vformdst, dst, shifted_src);
   3427 }
   3428 
   3429 
   3430 LogicVRegister Simulator::sqrshrn(VectorFormat vform,
   3431                                   LogicVRegister dst,
   3432                                   const LogicVRegister& src,
   3433                                   int shift) {
   3434   SimVRegister temp;
   3435   VectorFormat vformsrc = VectorFormatDoubleWidth(vform);
   3436   VectorFormat vformdst = vform;
   3437   LogicVRegister shifted_src = sshr(vformsrc, temp, src, shift).Round(vformsrc);
   3438   return sqxtn(vformdst, dst, shifted_src);
   3439 }
   3440 
   3441 
   3442 LogicVRegister Simulator::sqrshrn2(VectorFormat vform,
   3443                                    LogicVRegister dst,
   3444                                    const LogicVRegister& src,
   3445                                    int shift) {
   3446   SimVRegister temp;
   3447   VectorFormat vformsrc = VectorFormatDoubleWidth(VectorFormatHalfLanes(vform));
   3448   VectorFormat vformdst = vform;
   3449   LogicVRegister shifted_src = sshr(vformsrc, temp, src, shift).Round(vformsrc);
   3450   return sqxtn(vformdst, dst, shifted_src);
   3451 }
   3452 
   3453 
   3454 LogicVRegister Simulator::sqshrun(VectorFormat vform,
   3455                                   LogicVRegister dst,
   3456                                   const LogicVRegister& src,
   3457                                   int shift) {
   3458   SimVRegister temp;
   3459   VectorFormat vformsrc = VectorFormatDoubleWidth(vform);
   3460   VectorFormat vformdst = vform;
   3461   LogicVRegister shifted_src = sshr(vformsrc, temp, src, shift);
   3462   return sqxtun(vformdst, dst, shifted_src);
   3463 }
   3464 
   3465 
   3466 LogicVRegister Simulator::sqshrun2(VectorFormat vform,
   3467                                    LogicVRegister dst,
   3468                                    const LogicVRegister& src,
   3469                                    int shift) {
   3470   SimVRegister temp;
   3471   VectorFormat vformsrc = VectorFormatDoubleWidth(VectorFormatHalfLanes(vform));
   3472   VectorFormat vformdst = vform;
   3473   LogicVRegister shifted_src = sshr(vformsrc, temp, src, shift);
   3474   return sqxtun(vformdst, dst, shifted_src);
   3475 }
   3476 
   3477 
   3478 LogicVRegister Simulator::sqrshrun(VectorFormat vform,
   3479                                    LogicVRegister dst,
   3480                                    const LogicVRegister& src,
   3481                                    int shift) {
   3482   SimVRegister temp;
   3483   VectorFormat vformsrc = VectorFormatDoubleWidth(vform);
   3484   VectorFormat vformdst = vform;
   3485   LogicVRegister shifted_src = sshr(vformsrc, temp, src, shift).Round(vformsrc);
   3486   return sqxtun(vformdst, dst, shifted_src);
   3487 }
   3488 
   3489 
   3490 LogicVRegister Simulator::sqrshrun2(VectorFormat vform,
   3491                                     LogicVRegister dst,
   3492                                     const LogicVRegister& src,
   3493                                     int shift) {
   3494   SimVRegister temp;
   3495   VectorFormat vformsrc = VectorFormatDoubleWidth(VectorFormatHalfLanes(vform));
   3496   VectorFormat vformdst = vform;
   3497   LogicVRegister shifted_src = sshr(vformsrc, temp, src, shift).Round(vformsrc);
   3498   return sqxtun(vformdst, dst, shifted_src);
   3499 }
   3500 
   3501 
   3502 LogicVRegister Simulator::uaddl(VectorFormat vform,
   3503                                 LogicVRegister dst,
   3504                                 const LogicVRegister& src1,
   3505                                 const LogicVRegister& src2) {
   3506   SimVRegister temp1, temp2;
   3507   uxtl(vform, temp1, src1);
   3508   uxtl(vform, temp2, src2);
   3509   add(vform, dst, temp1, temp2);
   3510   return dst;
   3511 }
   3512 
   3513 
   3514 LogicVRegister Simulator::uaddl2(VectorFormat vform,
   3515                                  LogicVRegister dst,
   3516                                  const LogicVRegister& src1,
   3517                                  const LogicVRegister& src2) {
   3518   SimVRegister temp1, temp2;
   3519   uxtl2(vform, temp1, src1);
   3520   uxtl2(vform, temp2, src2);
   3521   add(vform, dst, temp1, temp2);
   3522   return dst;
   3523 }
   3524 
   3525 
   3526 LogicVRegister Simulator::uaddw(VectorFormat vform,
   3527                                 LogicVRegister dst,
   3528                                 const LogicVRegister& src1,
   3529                                 const LogicVRegister& src2) {
   3530   SimVRegister temp;
   3531   uxtl(vform, temp, src2);
   3532   add(vform, dst, src1, temp);
   3533   return dst;
   3534 }
   3535 
   3536 
   3537 LogicVRegister Simulator::uaddw2(VectorFormat vform,
   3538                                  LogicVRegister dst,
   3539                                  const LogicVRegister& src1,
   3540                                  const LogicVRegister& src2) {
   3541   SimVRegister temp;
   3542   uxtl2(vform, temp, src2);
   3543   add(vform, dst, src1, temp);
   3544   return dst;
   3545 }
   3546 
   3547 
   3548 LogicVRegister Simulator::saddl(VectorFormat vform,
   3549                                 LogicVRegister dst,
   3550                                 const LogicVRegister& src1,
   3551                                 const LogicVRegister& src2) {
   3552   SimVRegister temp1, temp2;
   3553   sxtl(vform, temp1, src1);
   3554   sxtl(vform, temp2, src2);
   3555   add(vform, dst, temp1, temp2);
   3556   return dst;
   3557 }
   3558 
   3559 
   3560 LogicVRegister Simulator::saddl2(VectorFormat vform,
   3561                                  LogicVRegister dst,
   3562                                  const LogicVRegister& src1,
   3563                                  const LogicVRegister& src2) {
   3564   SimVRegister temp1, temp2;
   3565   sxtl2(vform, temp1, src1);
   3566   sxtl2(vform, temp2, src2);
   3567   add(vform, dst, temp1, temp2);
   3568   return dst;
   3569 }
   3570 
   3571 
   3572 LogicVRegister Simulator::saddw(VectorFormat vform,
   3573                                 LogicVRegister dst,
   3574                                 const LogicVRegister& src1,
   3575                                 const LogicVRegister& src2) {
   3576   SimVRegister temp;
   3577   sxtl(vform, temp, src2);
   3578   add(vform, dst, src1, temp);
   3579   return dst;
   3580 }
   3581 
   3582 
   3583 LogicVRegister Simulator::saddw2(VectorFormat vform,
   3584                                  LogicVRegister dst,
   3585                                  const LogicVRegister& src1,
   3586                                  const LogicVRegister& src2) {
   3587   SimVRegister temp;
   3588   sxtl2(vform, temp, src2);
   3589   add(vform, dst, src1, temp);
   3590   return dst;
   3591 }
   3592 
   3593 
   3594 LogicVRegister Simulator::usubl(VectorFormat vform,
   3595                                 LogicVRegister dst,
   3596                                 const LogicVRegister& src1,
   3597                                 const LogicVRegister& src2) {
   3598   SimVRegister temp1, temp2;
   3599   uxtl(vform, temp1, src1);
   3600   uxtl(vform, temp2, src2);
   3601   sub(vform, dst, temp1, temp2);
   3602   return dst;
   3603 }
   3604 
   3605 
   3606 LogicVRegister Simulator::usubl2(VectorFormat vform,
   3607                                  LogicVRegister dst,
   3608                                  const LogicVRegister& src1,
   3609                                  const LogicVRegister& src2) {
   3610   SimVRegister temp1, temp2;
   3611   uxtl2(vform, temp1, src1);
   3612   uxtl2(vform, temp2, src2);
   3613   sub(vform, dst, temp1, temp2);
   3614   return dst;
   3615 }
   3616 
   3617 
   3618 LogicVRegister Simulator::usubw(VectorFormat vform,
   3619                                 LogicVRegister dst,
   3620                                 const LogicVRegister& src1,
   3621                                 const LogicVRegister& src2) {
   3622   SimVRegister temp;
   3623   uxtl(vform, temp, src2);
   3624   sub(vform, dst, src1, temp);
   3625   return dst;
   3626 }
   3627 
   3628 
   3629 LogicVRegister Simulator::usubw2(VectorFormat vform,
   3630                                  LogicVRegister dst,
   3631                                  const LogicVRegister& src1,
   3632                                  const LogicVRegister& src2) {
   3633   SimVRegister temp;
   3634   uxtl2(vform, temp, src2);
   3635   sub(vform, dst, src1, temp);
   3636   return dst;
   3637 }
   3638 
   3639 
   3640 LogicVRegister Simulator::ssubl(VectorFormat vform,
   3641                                 LogicVRegister dst,
   3642                                 const LogicVRegister& src1,
   3643                                 const LogicVRegister& src2) {
   3644   SimVRegister temp1, temp2;
   3645   sxtl(vform, temp1, src1);
   3646   sxtl(vform, temp2, src2);
   3647   sub(vform, dst, temp1, temp2);
   3648   return dst;
   3649 }
   3650 
   3651 
   3652 LogicVRegister Simulator::ssubl2(VectorFormat vform,
   3653                                  LogicVRegister dst,
   3654                                  const LogicVRegister& src1,
   3655                                  const LogicVRegister& src2) {
   3656   SimVRegister temp1, temp2;
   3657   sxtl2(vform, temp1, src1);
   3658   sxtl2(vform, temp2, src2);
   3659   sub(vform, dst, temp1, temp2);
   3660   return dst;
   3661 }
   3662 
   3663 
   3664 LogicVRegister Simulator::ssubw(VectorFormat vform,
   3665                                 LogicVRegister dst,
   3666                                 const LogicVRegister& src1,
   3667                                 const LogicVRegister& src2) {
   3668   SimVRegister temp;
   3669   sxtl(vform, temp, src2);
   3670   sub(vform, dst, src1, temp);
   3671   return dst;
   3672 }
   3673 
   3674 
   3675 LogicVRegister Simulator::ssubw2(VectorFormat vform,
   3676                                  LogicVRegister dst,
   3677                                  const LogicVRegister& src1,
   3678                                  const LogicVRegister& src2) {
   3679   SimVRegister temp;
   3680   sxtl2(vform, temp, src2);
   3681   sub(vform, dst, src1, temp);
   3682   return dst;
   3683 }
   3684 
   3685 
   3686 LogicVRegister Simulator::uabal(VectorFormat vform,
   3687                                 LogicVRegister dst,
   3688                                 const LogicVRegister& src1,
   3689                                 const LogicVRegister& src2) {
   3690   SimVRegister temp1, temp2;
   3691   uxtl(vform, temp1, src1);
   3692   uxtl(vform, temp2, src2);
   3693   uaba(vform, dst, temp1, temp2);
   3694   return dst;
   3695 }
   3696 
   3697 
   3698 LogicVRegister Simulator::uabal2(VectorFormat vform,
   3699                                  LogicVRegister dst,
   3700                                  const LogicVRegister& src1,
   3701                                  const LogicVRegister& src2) {
   3702   SimVRegister temp1, temp2;
   3703   uxtl2(vform, temp1, src1);
   3704   uxtl2(vform, temp2, src2);
   3705   uaba(vform, dst, temp1, temp2);
   3706   return dst;
   3707 }
   3708 
   3709 
   3710 LogicVRegister Simulator::sabal(VectorFormat vform,
   3711                                 LogicVRegister dst,
   3712                                 const LogicVRegister& src1,
   3713                                 const LogicVRegister& src2) {
   3714   SimVRegister temp1, temp2;
   3715   sxtl(vform, temp1, src1);
   3716   sxtl(vform, temp2, src2);
   3717   saba(vform, dst, temp1, temp2);
   3718   return dst;
   3719 }
   3720 
   3721 
   3722 LogicVRegister Simulator::sabal2(VectorFormat vform,
   3723                                  LogicVRegister dst,
   3724                                  const LogicVRegister& src1,
   3725                                  const LogicVRegister& src2) {
   3726   SimVRegister temp1, temp2;
   3727   sxtl2(vform, temp1, src1);
   3728   sxtl2(vform, temp2, src2);
   3729   saba(vform, dst, temp1, temp2);
   3730   return dst;
   3731 }
   3732 
   3733 
   3734 LogicVRegister Simulator::uabdl(VectorFormat vform,
   3735                                 LogicVRegister dst,
   3736                                 const LogicVRegister& src1,
   3737                                 const LogicVRegister& src2) {
   3738   SimVRegister temp1, temp2;
   3739   uxtl(vform, temp1, src1);
   3740   uxtl(vform, temp2, src2);
   3741   absdiff(vform, dst, temp1, temp2, false);
   3742   return dst;
   3743 }
   3744 
   3745 
   3746 LogicVRegister Simulator::uabdl2(VectorFormat vform,
   3747                                  LogicVRegister dst,
   3748                                  const LogicVRegister& src1,
   3749                                  const LogicVRegister& src2) {
   3750   SimVRegister temp1, temp2;
   3751   uxtl2(vform, temp1, src1);
   3752   uxtl2(vform, temp2, src2);
   3753   absdiff(vform, dst, temp1, temp2, false);
   3754   return dst;
   3755 }
   3756 
   3757 
   3758 LogicVRegister Simulator::sabdl(VectorFormat vform,
   3759                                 LogicVRegister dst,
   3760                                 const LogicVRegister& src1,
   3761                                 const LogicVRegister& src2) {
   3762   SimVRegister temp1, temp2;
   3763   sxtl(vform, temp1, src1);
   3764   sxtl(vform, temp2, src2);
   3765   absdiff(vform, dst, temp1, temp2, true);
   3766   return dst;
   3767 }
   3768 
   3769 
   3770 LogicVRegister Simulator::sabdl2(VectorFormat vform,
   3771                                  LogicVRegister dst,
   3772                                  const LogicVRegister& src1,
   3773                                  const LogicVRegister& src2) {
   3774   SimVRegister temp1, temp2;
   3775   sxtl2(vform, temp1, src1);
   3776   sxtl2(vform, temp2, src2);
   3777   absdiff(vform, dst, temp1, temp2, true);
   3778   return dst;
   3779 }
   3780 
   3781 
   3782 LogicVRegister Simulator::umull(VectorFormat vform,
   3783                                 LogicVRegister dst,
   3784                                 const LogicVRegister& src1,
   3785                                 const LogicVRegister& src2,
   3786                                 bool is_2) {
   3787   SimVRegister temp1, temp2;
   3788   uxtl(vform, temp1, src1, is_2);
   3789   uxtl(vform, temp2, src2, is_2);
   3790   mul(vform, dst, temp1, temp2);
   3791   return dst;
   3792 }
   3793 
   3794 
   3795 LogicVRegister Simulator::umull2(VectorFormat vform,
   3796                                  LogicVRegister dst,
   3797                                  const LogicVRegister& src1,
   3798                                  const LogicVRegister& src2) {
   3799   return umull(vform, dst, src1, src2, /* is_2 = */ true);
   3800 }
   3801 
   3802 
   3803 LogicVRegister Simulator::smull(VectorFormat vform,
   3804                                 LogicVRegister dst,
   3805                                 const LogicVRegister& src1,
   3806                                 const LogicVRegister& src2,
   3807                                 bool is_2) {
   3808   SimVRegister temp1, temp2;
   3809   sxtl(vform, temp1, src1, is_2);
   3810   sxtl(vform, temp2, src2, is_2);
   3811   mul(vform, dst, temp1, temp2);
   3812   return dst;
   3813 }
   3814 
   3815 
   3816 LogicVRegister Simulator::smull2(VectorFormat vform,
   3817                                  LogicVRegister dst,
   3818                                  const LogicVRegister& src1,
   3819                                  const LogicVRegister& src2) {
   3820   return smull(vform, dst, src1, src2, /* is_2 = */ true);
   3821 }
   3822 
   3823 
   3824 LogicVRegister Simulator::umlsl(VectorFormat vform,
   3825                                 LogicVRegister dst,
   3826                                 const LogicVRegister& src1,
   3827                                 const LogicVRegister& src2,
   3828                                 bool is_2) {
   3829   SimVRegister temp1, temp2;
   3830   uxtl(vform, temp1, src1, is_2);
   3831   uxtl(vform, temp2, src2, is_2);
   3832   mls(vform, dst, dst, temp1, temp2);
   3833   return dst;
   3834 }
   3835 
   3836 
   3837 LogicVRegister Simulator::umlsl2(VectorFormat vform,
   3838                                  LogicVRegister dst,
   3839                                  const LogicVRegister& src1,
   3840                                  const LogicVRegister& src2) {
   3841   return umlsl(vform, dst, src1, src2, /* is_2 = */ true);
   3842 }
   3843 
   3844 
   3845 LogicVRegister Simulator::smlsl(VectorFormat vform,
   3846                                 LogicVRegister dst,
   3847                                 const LogicVRegister& src1,
   3848                                 const LogicVRegister& src2,
   3849                                 bool is_2) {
   3850   SimVRegister temp1, temp2;
   3851   sxtl(vform, temp1, src1, is_2);
   3852   sxtl(vform, temp2, src2, is_2);
   3853   mls(vform, dst, dst, temp1, temp2);
   3854   return dst;
   3855 }
   3856 
   3857 
   3858 LogicVRegister Simulator::smlsl2(VectorFormat vform,
   3859                                  LogicVRegister dst,
   3860                                  const LogicVRegister& src1,
   3861                                  const LogicVRegister& src2) {
   3862   return smlsl(vform, dst, src1, src2, /* is_2 = */ true);
   3863 }
   3864 
   3865 
   3866 LogicVRegister Simulator::umlal(VectorFormat vform,
   3867                                 LogicVRegister dst,
   3868                                 const LogicVRegister& src1,
   3869                                 const LogicVRegister& src2,
   3870                                 bool is_2) {
   3871   SimVRegister temp1, temp2;
   3872   uxtl(vform, temp1, src1, is_2);
   3873   uxtl(vform, temp2, src2, is_2);
   3874   mla(vform, dst, dst, temp1, temp2);
   3875   return dst;
   3876 }
   3877 
   3878 
   3879 LogicVRegister Simulator::umlal2(VectorFormat vform,
   3880                                  LogicVRegister dst,
   3881                                  const LogicVRegister& src1,
   3882                                  const LogicVRegister& src2) {
   3883   return umlal(vform, dst, src1, src2, /* is_2 = */ true);
   3884 }
   3885 
   3886 
   3887 LogicVRegister Simulator::smlal(VectorFormat vform,
   3888                                 LogicVRegister dst,
   3889                                 const LogicVRegister& src1,
   3890                                 const LogicVRegister& src2,
   3891                                 bool is_2) {
   3892   SimVRegister temp1, temp2;
   3893   sxtl(vform, temp1, src1, is_2);
   3894   sxtl(vform, temp2, src2, is_2);
   3895   mla(vform, dst, dst, temp1, temp2);
   3896   return dst;
   3897 }
   3898 
   3899 
   3900 LogicVRegister Simulator::smlal2(VectorFormat vform,
   3901                                  LogicVRegister dst,
   3902                                  const LogicVRegister& src1,
   3903                                  const LogicVRegister& src2) {
   3904   return smlal(vform, dst, src1, src2, /* is_2 = */ true);
   3905 }
   3906 
   3907 
   3908 LogicVRegister Simulator::sqdmlal(VectorFormat vform,
   3909                                   LogicVRegister dst,
   3910                                   const LogicVRegister& src1,
   3911                                   const LogicVRegister& src2,
   3912                                   bool is_2) {
   3913   SimVRegister temp;
   3914   LogicVRegister product = sqdmull(vform, temp, src1, src2, is_2);
   3915   return add(vform, dst, dst, product).SignedSaturate(vform);
   3916 }
   3917 
   3918 
   3919 LogicVRegister Simulator::sqdmlal2(VectorFormat vform,
   3920                                    LogicVRegister dst,
   3921                                    const LogicVRegister& src1,
   3922                                    const LogicVRegister& src2) {
   3923   return sqdmlal(vform, dst, src1, src2, /* is_2 = */ true);
   3924 }
   3925 
   3926 
   3927 LogicVRegister Simulator::sqdmlsl(VectorFormat vform,
   3928                                   LogicVRegister dst,
   3929                                   const LogicVRegister& src1,
   3930                                   const LogicVRegister& src2,
   3931                                   bool is_2) {
   3932   SimVRegister temp;
   3933   LogicVRegister product = sqdmull(vform, temp, src1, src2, is_2);
   3934   return sub(vform, dst, dst, product).SignedSaturate(vform);
   3935 }
   3936 
   3937 
   3938 LogicVRegister Simulator::sqdmlsl2(VectorFormat vform,
   3939                                    LogicVRegister dst,
   3940                                    const LogicVRegister& src1,
   3941                                    const LogicVRegister& src2) {
   3942   return sqdmlsl(vform, dst, src1, src2, /* is_2 = */ true);
   3943 }
   3944 
   3945 
   3946 LogicVRegister Simulator::sqdmull(VectorFormat vform,
   3947                                   LogicVRegister dst,
   3948                                   const LogicVRegister& src1,
   3949                                   const LogicVRegister& src2,
   3950                                   bool is_2) {
   3951   SimVRegister temp;
   3952   LogicVRegister product = smull(vform, temp, src1, src2, is_2);
   3953   return add(vform, dst, product, product).SignedSaturate(vform);
   3954 }
   3955 
   3956 
   3957 LogicVRegister Simulator::sqdmull2(VectorFormat vform,
   3958                                    LogicVRegister dst,
   3959                                    const LogicVRegister& src1,
   3960                                    const LogicVRegister& src2) {
   3961   return sqdmull(vform, dst, src1, src2, /* is_2 = */ true);
   3962 }
   3963 
   3964 LogicVRegister Simulator::sqrdmulh(VectorFormat vform,
   3965                                    LogicVRegister dst,
   3966                                    const LogicVRegister& src1,
   3967                                    const LogicVRegister& src2,
   3968                                    bool round) {
   3969   int esize = LaneSizeInBitsFromFormat(vform);
   3970 
   3971   SimVRegister temp_lo, temp_hi;
   3972 
   3973   // Compute low and high multiplication results.
   3974   mul(vform, temp_lo, src1, src2);
   3975   smulh(vform, temp_hi, src1, src2);
   3976 
   3977   // Double by shifting high half, and adding in most-significant bit of low
   3978   // half.
   3979   shl(vform, temp_hi, temp_hi, 1);
   3980   usra(vform, temp_hi, temp_lo, esize - 1);
   3981 
   3982   if (round) {
   3983     // Add the second (due to doubling) most-significant bit of the low half
   3984     // into the result.
   3985     shl(vform, temp_lo, temp_lo, 1);
   3986     usra(vform, temp_hi, temp_lo, esize - 1);
   3987   }
   3988 
   3989   SimPRegister not_sat;
   3990   LogicPRegister ptemp(not_sat);
   3991   dst.ClearForWrite(vform);
   3992   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   3993     // Saturation only occurs when src1 = src2 = minimum representable value.
   3994     // Check this as a special case.
   3995     ptemp.SetActive(vform, i, true);
   3996     if ((src1.Int(vform, i) == MinIntFromFormat(vform)) &&
   3997         (src2.Int(vform, i) == MinIntFromFormat(vform))) {
   3998       ptemp.SetActive(vform, i, false);
   3999     }
   4000     dst.SetInt(vform, i, MaxIntFromFormat(vform));
   4001   }
   4002 
   4003   mov_merging(vform, dst, not_sat, temp_hi);
   4004   return dst;
   4005 }
   4006 
   4007 
   4008 LogicVRegister Simulator::dot(VectorFormat vform,
   4009                               LogicVRegister dst,
   4010                               const LogicVRegister& src1,
   4011                               const LogicVRegister& src2,
   4012                               bool is_src1_signed,
   4013                               bool is_src2_signed) {
   4014   VectorFormat quarter_vform =
   4015       VectorFormatHalfWidthDoubleLanes(VectorFormatHalfWidthDoubleLanes(vform));
   4016 
   4017   dst.ClearForWrite(vform);
   4018   for (int e = 0; e < LaneCountFromFormat(vform); e++) {
   4019     uint64_t result = 0;
   4020     int64_t element1, element2;
   4021     for (int i = 0; i < 4; i++) {
   4022       int index = 4 * e + i;
   4023       if (is_src1_signed) {
   4024         element1 = src1.Int(quarter_vform, index);
   4025       } else {
   4026         element1 = src1.Uint(quarter_vform, index);
   4027       }
   4028       if (is_src2_signed) {
   4029         element2 = src2.Int(quarter_vform, index);
   4030       } else {
   4031         element2 = src2.Uint(quarter_vform, index);
   4032       }
   4033       result += element1 * element2;
   4034     }
   4035     dst.SetUint(vform, e, result + dst.Uint(vform, e));
   4036   }
   4037   return dst;
   4038 }
   4039 
   4040 
   4041 LogicVRegister Simulator::sdot(VectorFormat vform,
   4042                                LogicVRegister dst,
   4043                                const LogicVRegister& src1,
   4044                                const LogicVRegister& src2) {
   4045   return dot(vform, dst, src1, src2, true, true);
   4046 }
   4047 
   4048 
   4049 LogicVRegister Simulator::udot(VectorFormat vform,
   4050                                LogicVRegister dst,
   4051                                const LogicVRegister& src1,
   4052                                const LogicVRegister& src2) {
   4053   return dot(vform, dst, src1, src2, false, false);
   4054 }
   4055 
   4056 LogicVRegister Simulator::usdot(VectorFormat vform,
   4057                                 LogicVRegister dst,
   4058                                 const LogicVRegister& src1,
   4059                                 const LogicVRegister& src2) {
   4060   return dot(vform, dst, src1, src2, false, true);
   4061 }
   4062 
   4063 LogicVRegister Simulator::cdot(VectorFormat vform,
   4064                                LogicVRegister dst,
   4065                                const LogicVRegister& acc,
   4066                                const LogicVRegister& src1,
   4067                                const LogicVRegister& src2,
   4068                                int rot) {
   4069   VIXL_ASSERT((rot == 0) || (rot == 90) || (rot == 180) || (rot == 270));
   4070   VectorFormat quarter_vform =
   4071       VectorFormatHalfWidthDoubleLanes(VectorFormatHalfWidthDoubleLanes(vform));
   4072 
   4073   int sel_a = ((rot == 0) || (rot == 180)) ? 0 : 1;
   4074   int sel_b = 1 - sel_a;
   4075   int sub_i = ((rot == 90) || (rot == 180)) ? 1 : -1;
   4076 
   4077   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   4078     int64_t result = acc.Int(vform, i);
   4079     for (int j = 0; j < 2; j++) {
   4080       int64_t r1 = src1.Int(quarter_vform, (4 * i) + (2 * j) + 0);
   4081       int64_t i1 = src1.Int(quarter_vform, (4 * i) + (2 * j) + 1);
   4082       int64_t r2 = src2.Int(quarter_vform, (4 * i) + (2 * j) + sel_a);
   4083       int64_t i2 = src2.Int(quarter_vform, (4 * i) + (2 * j) + sel_b);
   4084       result += (r1 * r2) + (sub_i * i1 * i2);
   4085     }
   4086     dst.SetInt(vform, i, result);
   4087   }
   4088   return dst;
   4089 }
   4090 
   4091 LogicVRegister Simulator::sqrdcmlah(VectorFormat vform,
   4092                                     LogicVRegister dst,
   4093                                     const LogicVRegister& srca,
   4094                                     const LogicVRegister& src1,
   4095                                     const LogicVRegister& src2,
   4096                                     int rot) {
   4097   SimVRegister src1_a, src1_b;
   4098   SimVRegister src2_a, src2_b;
   4099   SimVRegister srca_i, srca_r;
   4100   SimVRegister zero, temp;
   4101   zero.Clear();
   4102 
   4103   if ((rot == 0) || (rot == 180)) {
   4104     uzp1(vform, src1_a, src1, zero);
   4105     uzp1(vform, src2_a, src2, zero);
   4106     uzp2(vform, src2_b, src2, zero);
   4107   } else {
   4108     uzp2(vform, src1_a, src1, zero);
   4109     uzp2(vform, src2_a, src2, zero);
   4110     uzp1(vform, src2_b, src2, zero);
   4111   }
   4112 
   4113   uzp1(vform, srca_r, srca, zero);
   4114   uzp2(vform, srca_i, srca, zero);
   4115 
   4116   bool sub_r = (rot == 90) || (rot == 180);
   4117   bool sub_i = (rot == 180) || (rot == 270);
   4118 
   4119   const bool round = true;
   4120   sqrdmlash(vform, srca_r, src1_a, src2_a, round, sub_r);
   4121   sqrdmlash(vform, srca_i, src1_a, src2_b, round, sub_i);
   4122   zip1(vform, dst, srca_r, srca_i);
   4123   return dst;
   4124 }
   4125 
   4126 LogicVRegister Simulator::sqrdcmlah(VectorFormat vform,
   4127                                     LogicVRegister dst,
   4128                                     const LogicVRegister& srca,
   4129                                     const LogicVRegister& src1,
   4130                                     const LogicVRegister& src2,
   4131                                     int index,
   4132                                     int rot) {
   4133   SimVRegister temp;
   4134   dup_elements_to_segments(VectorFormatDoubleWidth(vform), temp, src2, index);
   4135   return sqrdcmlah(vform, dst, srca, src1, temp, rot);
   4136 }
   4137 
   4138 LogicVRegister Simulator::sqrdmlash_d(VectorFormat vform,
   4139                                       LogicVRegister dst,
   4140                                       const LogicVRegister& src1,
   4141                                       const LogicVRegister& src2,
   4142                                       bool round,
   4143                                       bool sub_op) {
   4144   // 2 * INT_64_MIN * INT_64_MIN causes INT_128 to overflow.
   4145   // To avoid this, we use:
   4146   //     (dst << (esize - 1) + src1 * src2 + 1 << (esize - 2)) >> (esize - 1)
   4147   // which is same as:
   4148   //     (dst << esize + 2 * src1 * src2 + 1 << (esize - 1)) >> esize.
   4149 
   4150   VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kDRegSize);
   4151   int esize = kDRegSize;
   4152   vixl_uint128_t round_const, accum;
   4153   round_const.first = 0;
   4154   if (round) {
   4155     round_const.second = UINT64_C(1) << (esize - 2);
   4156   } else {
   4157     round_const.second = 0;
   4158   }
   4159 
   4160   dst.ClearForWrite(vform);
   4161   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   4162     // Shift the whole value left by `esize - 1` bits.
   4163     accum.first = dst.Int(vform, i) >> 1;
   4164     accum.second = dst.Int(vform, i) << (esize - 1);
   4165 
   4166     vixl_uint128_t product = Mul64(src1.Int(vform, i), src2.Int(vform, i));
   4167 
   4168     if (sub_op) {
   4169       product = Neg128(product);
   4170     }
   4171     accum = Add128(accum, product);
   4172 
   4173     // Perform rounding.
   4174     accum = Add128(accum, round_const);
   4175 
   4176     // Arithmetic shift the whole value right by `esize - 1` bits.
   4177     accum.second = (accum.first << 1) | (accum.second >> (esize - 1));
   4178     accum.first = -(accum.first >> (esize - 1));
   4179 
   4180     // Perform saturation.
   4181     bool is_pos = (accum.first == 0) ? true : false;
   4182     if (is_pos &&
   4183         (accum.second > static_cast<uint64_t>(MaxIntFromFormat(vform)))) {
   4184       accum.second = MaxIntFromFormat(vform);
   4185     } else if (!is_pos && (accum.second <
   4186                            static_cast<uint64_t>(MinIntFromFormat(vform)))) {
   4187       accum.second = MinIntFromFormat(vform);
   4188     }
   4189 
   4190     dst.SetInt(vform, i, accum.second);
   4191   }
   4192 
   4193   return dst;
   4194 }
   4195 
   4196 LogicVRegister Simulator::sqrdmlash(VectorFormat vform,
   4197                                     LogicVRegister dst,
   4198                                     const LogicVRegister& src1,
   4199                                     const LogicVRegister& src2,
   4200                                     bool round,
   4201                                     bool sub_op) {
   4202   // 2 * INT_32_MIN * INT_32_MIN causes int64_t to overflow.
   4203   // To avoid this, we use:
   4204   //     (dst << (esize - 1) + src1 * src2 + 1 << (esize - 2)) >> (esize - 1)
   4205   // which is same as:
   4206   //     (dst << esize + 2 * src1 * src2 + 1 << (esize - 1)) >> esize.
   4207 
   4208   if (vform == kFormatVnD) {
   4209     return sqrdmlash_d(vform, dst, src1, src2, round, sub_op);
   4210   }
   4211 
   4212   int esize = LaneSizeInBitsFromFormat(vform);
   4213   int round_const = round ? (1 << (esize - 2)) : 0;
   4214   int64_t accum;
   4215 
   4216   dst.ClearForWrite(vform);
   4217   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   4218     accum = dst.Int(vform, i) << (esize - 1);
   4219     if (sub_op) {
   4220       accum -= src1.Int(vform, i) * src2.Int(vform, i);
   4221     } else {
   4222       accum += src1.Int(vform, i) * src2.Int(vform, i);
   4223     }
   4224     accum += round_const;
   4225     accum = accum >> (esize - 1);
   4226 
   4227     if (accum > MaxIntFromFormat(vform)) {
   4228       accum = MaxIntFromFormat(vform);
   4229     } else if (accum < MinIntFromFormat(vform)) {
   4230       accum = MinIntFromFormat(vform);
   4231     }
   4232     dst.SetInt(vform, i, accum);
   4233   }
   4234   return dst;
   4235 }
   4236 
   4237 
   4238 LogicVRegister Simulator::sqrdmlah(VectorFormat vform,
   4239                                    LogicVRegister dst,
   4240                                    const LogicVRegister& src1,
   4241                                    const LogicVRegister& src2,
   4242                                    bool round) {
   4243   return sqrdmlash(vform, dst, src1, src2, round, false);
   4244 }
   4245 
   4246 
   4247 LogicVRegister Simulator::sqrdmlsh(VectorFormat vform,
   4248                                    LogicVRegister dst,
   4249                                    const LogicVRegister& src1,
   4250                                    const LogicVRegister& src2,
   4251                                    bool round) {
   4252   return sqrdmlash(vform, dst, src1, src2, round, true);
   4253 }
   4254 
   4255 
   4256 LogicVRegister Simulator::sqdmulh(VectorFormat vform,
   4257                                   LogicVRegister dst,
   4258                                   const LogicVRegister& src1,
   4259                                   const LogicVRegister& src2) {
   4260   return sqrdmulh(vform, dst, src1, src2, false);
   4261 }
   4262 
   4263 
   4264 LogicVRegister Simulator::addhn(VectorFormat vform,
   4265                                 LogicVRegister dst,
   4266                                 const LogicVRegister& src1,
   4267                                 const LogicVRegister& src2) {
   4268   SimVRegister temp;
   4269   add(VectorFormatDoubleWidth(vform), temp, src1, src2);
   4270   shrn(vform, dst, temp, LaneSizeInBitsFromFormat(vform));
   4271   return dst;
   4272 }
   4273 
   4274 
   4275 LogicVRegister Simulator::addhn2(VectorFormat vform,
   4276                                  LogicVRegister dst,
   4277                                  const LogicVRegister& src1,
   4278                                  const LogicVRegister& src2) {
   4279   SimVRegister temp;
   4280   add(VectorFormatDoubleWidth(VectorFormatHalfLanes(vform)), temp, src1, src2);
   4281   shrn2(vform, dst, temp, LaneSizeInBitsFromFormat(vform));
   4282   return dst;
   4283 }
   4284 
   4285 
   4286 LogicVRegister Simulator::raddhn(VectorFormat vform,
   4287                                  LogicVRegister dst,
   4288                                  const LogicVRegister& src1,
   4289                                  const LogicVRegister& src2) {
   4290   SimVRegister temp;
   4291   add(VectorFormatDoubleWidth(vform), temp, src1, src2);
   4292   rshrn(vform, dst, temp, LaneSizeInBitsFromFormat(vform));
   4293   return dst;
   4294 }
   4295 
   4296 
   4297 LogicVRegister Simulator::raddhn2(VectorFormat vform,
   4298                                   LogicVRegister dst,
   4299                                   const LogicVRegister& src1,
   4300                                   const LogicVRegister& src2) {
   4301   SimVRegister temp;
   4302   add(VectorFormatDoubleWidth(VectorFormatHalfLanes(vform)), temp, src1, src2);
   4303   rshrn2(vform, dst, temp, LaneSizeInBitsFromFormat(vform));
   4304   return dst;
   4305 }
   4306 
   4307 
   4308 LogicVRegister Simulator::subhn(VectorFormat vform,
   4309                                 LogicVRegister dst,
   4310                                 const LogicVRegister& src1,
   4311                                 const LogicVRegister& src2) {
   4312   SimVRegister temp;
   4313   sub(VectorFormatDoubleWidth(vform), temp, src1, src2);
   4314   shrn(vform, dst, temp, LaneSizeInBitsFromFormat(vform));
   4315   return dst;
   4316 }
   4317 
   4318 
   4319 LogicVRegister Simulator::subhn2(VectorFormat vform,
   4320                                  LogicVRegister dst,
   4321                                  const LogicVRegister& src1,
   4322                                  const LogicVRegister& src2) {
   4323   SimVRegister temp;
   4324   sub(VectorFormatDoubleWidth(VectorFormatHalfLanes(vform)), temp, src1, src2);
   4325   shrn2(vform, dst, temp, LaneSizeInBitsFromFormat(vform));
   4326   return dst;
   4327 }
   4328 
   4329 
   4330 LogicVRegister Simulator::rsubhn(VectorFormat vform,
   4331                                  LogicVRegister dst,
   4332                                  const LogicVRegister& src1,
   4333                                  const LogicVRegister& src2) {
   4334   SimVRegister temp;
   4335   sub(VectorFormatDoubleWidth(vform), temp, src1, src2);
   4336   rshrn(vform, dst, temp, LaneSizeInBitsFromFormat(vform));
   4337   return dst;
   4338 }
   4339 
   4340 
   4341 LogicVRegister Simulator::rsubhn2(VectorFormat vform,
   4342                                   LogicVRegister dst,
   4343                                   const LogicVRegister& src1,
   4344                                   const LogicVRegister& src2) {
   4345   SimVRegister temp;
   4346   sub(VectorFormatDoubleWidth(VectorFormatHalfLanes(vform)), temp, src1, src2);
   4347   rshrn2(vform, dst, temp, LaneSizeInBitsFromFormat(vform));
   4348   return dst;
   4349 }
   4350 
   4351 
   4352 LogicVRegister Simulator::trn1(VectorFormat vform,
   4353                                LogicVRegister dst,
   4354                                const LogicVRegister& src1,
   4355                                const LogicVRegister& src2) {
   4356   uint64_t result[kZRegMaxSizeInBytes] = {};
   4357   int lane_count = LaneCountFromFormat(vform);
   4358   int pairs = lane_count / 2;
   4359   for (int i = 0; i < pairs; ++i) {
   4360     result[2 * i] = src1.Uint(vform, 2 * i);
   4361     result[(2 * i) + 1] = src2.Uint(vform, 2 * i);
   4362   }
   4363 
   4364   dst.ClearForWrite(vform);
   4365   for (int i = 0; i < lane_count; ++i) {
   4366     dst.SetUint(vform, i, result[i]);
   4367   }
   4368   return dst;
   4369 }
   4370 
   4371 
   4372 LogicVRegister Simulator::trn2(VectorFormat vform,
   4373                                LogicVRegister dst,
   4374                                const LogicVRegister& src1,
   4375                                const LogicVRegister& src2) {
   4376   uint64_t result[kZRegMaxSizeInBytes] = {};
   4377   int lane_count = LaneCountFromFormat(vform);
   4378   int pairs = lane_count / 2;
   4379   for (int i = 0; i < pairs; ++i) {
   4380     result[2 * i] = src1.Uint(vform, (2 * i) + 1);
   4381     result[(2 * i) + 1] = src2.Uint(vform, (2 * i) + 1);
   4382   }
   4383 
   4384   dst.ClearForWrite(vform);
   4385   for (int i = 0; i < lane_count; ++i) {
   4386     dst.SetUint(vform, i, result[i]);
   4387   }
   4388   return dst;
   4389 }
   4390 
   4391 
   4392 LogicVRegister Simulator::zip1(VectorFormat vform,
   4393                                LogicVRegister dst,
   4394                                const LogicVRegister& src1,
   4395                                const LogicVRegister& src2) {
   4396   uint64_t result[kZRegMaxSizeInBytes] = {};
   4397   int lane_count = LaneCountFromFormat(vform);
   4398   int pairs = lane_count / 2;
   4399   for (int i = 0; i < pairs; ++i) {
   4400     result[2 * i] = src1.Uint(vform, i);
   4401     result[(2 * i) + 1] = src2.Uint(vform, i);
   4402   }
   4403 
   4404   dst.ClearForWrite(vform);
   4405   for (int i = 0; i < lane_count; ++i) {
   4406     dst.SetUint(vform, i, result[i]);
   4407   }
   4408   return dst;
   4409 }
   4410 
   4411 
   4412 LogicVRegister Simulator::zip2(VectorFormat vform,
   4413                                LogicVRegister dst,
   4414                                const LogicVRegister& src1,
   4415                                const LogicVRegister& src2) {
   4416   uint64_t result[kZRegMaxSizeInBytes] = {};
   4417   int lane_count = LaneCountFromFormat(vform);
   4418   int pairs = lane_count / 2;
   4419   for (int i = 0; i < pairs; ++i) {
   4420     result[2 * i] = src1.Uint(vform, pairs + i);
   4421     result[(2 * i) + 1] = src2.Uint(vform, pairs + i);
   4422   }
   4423 
   4424   dst.ClearForWrite(vform);
   4425   for (int i = 0; i < lane_count; ++i) {
   4426     dst.SetUint(vform, i, result[i]);
   4427   }
   4428   return dst;
   4429 }
   4430 
   4431 
   4432 LogicVRegister Simulator::uzp1(VectorFormat vform,
   4433                                LogicVRegister dst,
   4434                                const LogicVRegister& src1,
   4435                                const LogicVRegister& src2) {
   4436   uint64_t result[kZRegMaxSizeInBytes * 2];
   4437   int lane_count = LaneCountFromFormat(vform);
   4438   for (int i = 0; i < lane_count; ++i) {
   4439     result[i] = src1.Uint(vform, i);
   4440     result[lane_count + i] = src2.Uint(vform, i);
   4441   }
   4442 
   4443   dst.ClearForWrite(vform);
   4444   for (int i = 0; i < lane_count; ++i) {
   4445     dst.SetUint(vform, i, result[2 * i]);
   4446   }
   4447   return dst;
   4448 }
   4449 
   4450 
   4451 LogicVRegister Simulator::uzp2(VectorFormat vform,
   4452                                LogicVRegister dst,
   4453                                const LogicVRegister& src1,
   4454                                const LogicVRegister& src2) {
   4455   uint64_t result[kZRegMaxSizeInBytes * 2];
   4456   int lane_count = LaneCountFromFormat(vform);
   4457   for (int i = 0; i < lane_count; ++i) {
   4458     result[i] = src1.Uint(vform, i);
   4459     result[lane_count + i] = src2.Uint(vform, i);
   4460   }
   4461 
   4462   dst.ClearForWrite(vform);
   4463   for (int i = 0; i < lane_count; ++i) {
   4464     dst.SetUint(vform, i, result[(2 * i) + 1]);
   4465   }
   4466   return dst;
   4467 }
   4468 
   4469 LogicVRegister Simulator::interleave_top_bottom(VectorFormat vform,
   4470                                                 LogicVRegister dst,
   4471                                                 const LogicVRegister& src) {
   4472   // Interleave the top and bottom half of a vector, ie. for a vector:
   4473   //
   4474   //   [ ... | F | D | B | ... | E | C | A ]
   4475   //
   4476   // where B is the first element in the top half of the vector, produce a
   4477   // result vector:
   4478   //
   4479   //   [ ... | ... | F | E | D | C | B | A ]
   4480 
   4481   uint64_t result[kZRegMaxSizeInBytes] = {};
   4482   int lane_count = LaneCountFromFormat(vform);
   4483   for (int i = 0; i < lane_count; i += 2) {
   4484     result[i] = src.Uint(vform, i / 2);
   4485     result[i + 1] = src.Uint(vform, (lane_count / 2) + (i / 2));
   4486   }
   4487   dst.SetUintArray(vform, result);
   4488   return dst;
   4489 }
   4490 
   4491 template <typename T>
   4492 T Simulator::FPNeg(T op) {
   4493   return -op;
   4494 }
   4495 
   4496 template <typename T>
   4497 T Simulator::FPAdd(T op1, T op2) {
   4498   T result = FPProcessNaNs(op1, op2);
   4499   if (IsNaN(result)) {
   4500     return result;
   4501   }
   4502 
   4503   if (IsInf(op1) && IsInf(op2) && (op1 != op2)) {
   4504     // inf + -inf returns the default NaN.
   4505     FPProcessException();
   4506     return FPDefaultNaN<T>();
   4507   } else {
   4508     // Other cases should be handled by standard arithmetic.
   4509     return op1 + op2;
   4510   }
   4511 }
   4512 
   4513 
   4514 template <typename T>
   4515 T Simulator::FPSub(T op1, T op2) {
   4516   // NaNs should be handled elsewhere.
   4517   VIXL_ASSERT(!IsNaN(op1) && !IsNaN(op2));
   4518 
   4519   if (IsInf(op1) && IsInf(op2) && (op1 == op2)) {
   4520     // inf - inf returns the default NaN.
   4521     FPProcessException();
   4522     return FPDefaultNaN<T>();
   4523   } else {
   4524     // Other cases should be handled by standard arithmetic.
   4525     return op1 - op2;
   4526   }
   4527 }
   4528 
   4529 template <typename T>
   4530 T Simulator::FPMulNaNs(T op1, T op2) {
   4531   T result = FPProcessNaNs(op1, op2);
   4532   return IsNaN(result) ? result : FPMul(op1, op2);
   4533 }
   4534 
   4535 template <typename T>
   4536 T Simulator::FPMul(T op1, T op2) {
   4537   // NaNs should be handled elsewhere.
   4538   VIXL_ASSERT(!IsNaN(op1) && !IsNaN(op2));
   4539 
   4540   if ((IsInf(op1) && (op2 == 0.0)) || (IsInf(op2) && (op1 == 0.0))) {
   4541     // inf * 0.0 returns the default NaN.
   4542     FPProcessException();
   4543     return FPDefaultNaN<T>();
   4544   } else {
   4545     // Other cases should be handled by standard arithmetic.
   4546     return op1 * op2;
   4547   }
   4548 }
   4549 
   4550 
   4551 template <typename T>
   4552 T Simulator::FPMulx(T op1, T op2) {
   4553   if ((IsInf(op1) && (op2 == 0.0)) || (IsInf(op2) && (op1 == 0.0))) {
   4554     // inf * 0.0 returns +/-2.0.
   4555     T two = 2.0;
   4556     return copysign(1.0, op1) * copysign(1.0, op2) * two;
   4557   }
   4558   return FPMul(op1, op2);
   4559 }
   4560 
   4561 
   4562 template <typename T>
   4563 T Simulator::FPMulAdd(T a, T op1, T op2) {
   4564   T result = FPProcessNaNs3(a, op1, op2);
   4565 
   4566   T sign_a = copysign(1.0, a);
   4567   T sign_prod = copysign(1.0, op1) * copysign(1.0, op2);
   4568   bool isinf_prod = IsInf(op1) || IsInf(op2);
   4569   bool operation_generates_nan =
   4570       (IsInf(op1) && (op2 == 0.0)) ||                     // inf * 0.0
   4571       (IsInf(op2) && (op1 == 0.0)) ||                     // 0.0 * inf
   4572       (IsInf(a) && isinf_prod && (sign_a != sign_prod));  // inf - inf
   4573 
   4574   if (IsNaN(result)) {
   4575     // Generated NaNs override quiet NaNs propagated from a.
   4576     if (operation_generates_nan && IsQuietNaN(a)) {
   4577       FPProcessException();
   4578       return FPDefaultNaN<T>();
   4579     } else {
   4580       return result;
   4581     }
   4582   }
   4583 
   4584   // If the operation would produce a NaN, return the default NaN.
   4585   if (operation_generates_nan) {
   4586     FPProcessException();
   4587     return FPDefaultNaN<T>();
   4588   }
   4589 
   4590   // Work around broken fma implementations for exact zero results: The sign of
   4591   // exact 0.0 results is positive unless both a and op1 * op2 are negative.
   4592   if (((op1 == 0.0) || (op2 == 0.0)) && (a == 0.0)) {
   4593     return ((sign_a < T(0.0)) && (sign_prod < T(0.0))) ? -0.0 : 0.0;
   4594   }
   4595 
   4596   result = FusedMultiplyAdd(op1, op2, a);
   4597   VIXL_ASSERT(!IsNaN(result));
   4598 
   4599   // Work around broken fma implementations for rounded zero results: If a is
   4600   // 0.0, the sign of the result is the sign of op1 * op2 before rounding.
   4601   if ((a == 0.0) && (result == 0.0)) {
   4602     return copysign(0.0, sign_prod);
   4603   }
   4604 
   4605   return result;
   4606 }
   4607 
   4608 
   4609 template <typename T>
   4610 T Simulator::FPDiv(T op1, T op2) {
   4611   // NaNs should be handled elsewhere.
   4612   VIXL_ASSERT(!IsNaN(op1) && !IsNaN(op2));
   4613 
   4614   if ((IsInf(op1) && IsInf(op2)) || ((op1 == 0.0) && (op2 == 0.0))) {
   4615     // inf / inf and 0.0 / 0.0 return the default NaN.
   4616     FPProcessException();
   4617     return FPDefaultNaN<T>();
   4618   } else {
   4619     if (op2 == 0.0) {
   4620       FPProcessException();
   4621       if (!IsNaN(op1)) {
   4622         double op1_sign = copysign(1.0, op1);
   4623         double op2_sign = copysign(1.0, op2);
   4624         return static_cast<T>(op1_sign * op2_sign * kFP64PositiveInfinity);
   4625       }
   4626     }
   4627 
   4628     // Other cases should be handled by standard arithmetic.
   4629     return op1 / op2;
   4630   }
   4631 }
   4632 
   4633 
   4634 template <typename T>
   4635 T Simulator::FPSqrt(T op) {
   4636   if (IsNaN(op)) {
   4637     return FPProcessNaN(op);
   4638   } else if (op < T(0.0)) {
   4639     FPProcessException();
   4640     return FPDefaultNaN<T>();
   4641   } else {
   4642     return sqrt(op);
   4643   }
   4644 }
   4645 
   4646 
   4647 template <typename T>
   4648 T Simulator::FPMax(T a, T b) {
   4649   T result = FPProcessNaNs(a, b);
   4650   if (IsNaN(result)) return result;
   4651 
   4652   if ((a == 0.0) && (b == 0.0) && (copysign(1.0, a) != copysign(1.0, b))) {
   4653     // a and b are zero, and the sign differs: return +0.0.
   4654     return 0.0;
   4655   } else {
   4656     return (a > b) ? a : b;
   4657   }
   4658 }
   4659 
   4660 
   4661 template <typename T>
   4662 T Simulator::FPMaxNM(T a, T b) {
   4663   if (IsQuietNaN(a) && !IsQuietNaN(b)) {
   4664     a = kFP64NegativeInfinity;
   4665   } else if (!IsQuietNaN(a) && IsQuietNaN(b)) {
   4666     b = kFP64NegativeInfinity;
   4667   }
   4668 
   4669   T result = FPProcessNaNs(a, b);
   4670   return IsNaN(result) ? result : FPMax(a, b);
   4671 }
   4672 
   4673 
   4674 template <typename T>
   4675 T Simulator::FPMin(T a, T b) {
   4676   T result = FPProcessNaNs(a, b);
   4677   if (IsNaN(result)) return result;
   4678 
   4679   if ((a == 0.0) && (b == 0.0) && (copysign(1.0, a) != copysign(1.0, b))) {
   4680     // a and b are zero, and the sign differs: return -0.0.
   4681     return -0.0;
   4682   } else {
   4683     return (a < b) ? a : b;
   4684   }
   4685 }
   4686 
   4687 
   4688 template <typename T>
   4689 T Simulator::FPMinNM(T a, T b) {
   4690   if (IsQuietNaN(a) && !IsQuietNaN(b)) {
   4691     a = kFP64PositiveInfinity;
   4692   } else if (!IsQuietNaN(a) && IsQuietNaN(b)) {
   4693     b = kFP64PositiveInfinity;
   4694   }
   4695 
   4696   T result = FPProcessNaNs(a, b);
   4697   return IsNaN(result) ? result : FPMin(a, b);
   4698 }
   4699 
   4700 
   4701 template <typename T>
   4702 T Simulator::FPRecipStepFused(T op1, T op2) {
   4703   const T two = 2.0;
   4704   if ((IsInf(op1) && (op2 == 0.0)) || ((op1 == 0.0) && (IsInf(op2)))) {
   4705     return two;
   4706   } else if (IsInf(op1) || IsInf(op2)) {
   4707     // Return +inf if signs match, otherwise -inf.
   4708     return ((op1 >= 0.0) == (op2 >= 0.0)) ? kFP64PositiveInfinity
   4709                                           : kFP64NegativeInfinity;
   4710   } else {
   4711     return FusedMultiplyAdd(op1, op2, two);
   4712   }
   4713 }
   4714 
   4715 template <typename T>
   4716 bool IsNormal(T value) {
   4717   return std::isnormal(value);
   4718 }
   4719 
   4720 template <>
   4721 bool IsNormal(SimFloat16 value) {
   4722   uint16_t rawbits = Float16ToRawbits(value);
   4723   uint16_t exp_mask = 0x7c00;
   4724   // Check that the exponent is neither all zeroes or all ones.
   4725   return ((rawbits & exp_mask) != 0) && ((~rawbits & exp_mask) != 0);
   4726 }
   4727 
   4728 
   4729 template <typename T>
   4730 T Simulator::FPRSqrtStepFused(T op1, T op2) {
   4731   const T one_point_five = 1.5;
   4732   const T two = 2.0;
   4733 
   4734   if ((IsInf(op1) && (op2 == 0.0)) || ((op1 == 0.0) && (IsInf(op2)))) {
   4735     return one_point_five;
   4736   } else if (IsInf(op1) || IsInf(op2)) {
   4737     // Return +inf if signs match, otherwise -inf.
   4738     return ((op1 >= 0.0) == (op2 >= 0.0)) ? kFP64PositiveInfinity
   4739                                           : kFP64NegativeInfinity;
   4740   } else {
   4741     // The multiply-add-halve operation must be fully fused, so avoid interim
   4742     // rounding by checking which operand can be losslessly divided by two
   4743     // before doing the multiply-add.
   4744     if (IsNormal(op1 / two)) {
   4745       return FusedMultiplyAdd(op1 / two, op2, one_point_five);
   4746     } else if (IsNormal(op2 / two)) {
   4747       return FusedMultiplyAdd(op1, op2 / two, one_point_five);
   4748     } else {
   4749       // Neither operand is normal after halving: the result is dominated by
   4750       // the addition term, so just return that.
   4751       return one_point_five;
   4752     }
   4753   }
   4754 }
   4755 
   4756 int32_t Simulator::FPToFixedJS(double value) {
   4757   // The Z-flag is set when the conversion from double precision floating-point
   4758   // to 32-bit integer is exact. If the source value is +/-Infinity, -0.0, NaN,
   4759   // outside the bounds of a 32-bit integer, or isn't an exact integer then the
   4760   // Z-flag is unset.
   4761   int Z = 1;
   4762   int32_t result;
   4763 
   4764   if ((value == 0.0) || (value == kFP64PositiveInfinity) ||
   4765       (value == kFP64NegativeInfinity)) {
   4766     // +/- zero and infinity all return zero, however -0 and +/- Infinity also
   4767     // unset the Z-flag.
   4768     result = 0.0;
   4769     if ((value != 0.0) || std::signbit(value)) {
   4770       Z = 0;
   4771     }
   4772   } else if (std::isnan(value)) {
   4773     // NaN values unset the Z-flag and set the result to 0.
   4774     FPProcessNaN(value);
   4775     result = 0;
   4776     Z = 0;
   4777   } else {
   4778     // All other values are converted to an integer representation, rounded
   4779     // toward zero.
   4780     double int_result = std::floor(value);
   4781     double error = value - int_result;
   4782 
   4783     if ((error != 0.0) && (int_result < 0.0)) {
   4784       int_result++;
   4785     }
   4786 
   4787     // Constrain the value into the range [INT32_MIN, INT32_MAX]. We can almost
   4788     // write a one-liner with std::round, but the behaviour on ties is incorrect
   4789     // for our purposes.
   4790     double mod_const = static_cast<double>(UINT64_C(1) << 32);
   4791     double mod_error =
   4792         (int_result / mod_const) - std::floor(int_result / mod_const);
   4793     double constrained;
   4794     if (mod_error == 0.5) {
   4795       constrained = INT32_MIN;
   4796     } else {
   4797       constrained = int_result - mod_const * round(int_result / mod_const);
   4798     }
   4799 
   4800     VIXL_ASSERT(std::floor(constrained) == constrained);
   4801     VIXL_ASSERT(constrained >= INT32_MIN);
   4802     VIXL_ASSERT(constrained <= INT32_MAX);
   4803 
   4804     // Take the bottom 32 bits of the result as a 32-bit integer.
   4805     result = static_cast<int32_t>(constrained);
   4806 
   4807     if ((int_result < INT32_MIN) || (int_result > INT32_MAX) ||
   4808         (error != 0.0)) {
   4809       // If the integer result is out of range or the conversion isn't exact,
   4810       // take exception and unset the Z-flag.
   4811       FPProcessException();
   4812       Z = 0;
   4813     }
   4814   }
   4815 
   4816   ReadNzcv().SetN(0);
   4817   ReadNzcv().SetZ(Z);
   4818   ReadNzcv().SetC(0);
   4819   ReadNzcv().SetV(0);
   4820 
   4821   return result;
   4822 }
   4823 
   4824 double Simulator::FPRoundIntCommon(double value, FPRounding round_mode) {
   4825   VIXL_ASSERT((value != kFP64PositiveInfinity) &&
   4826               (value != kFP64NegativeInfinity));
   4827   VIXL_ASSERT(!IsNaN(value));
   4828 
   4829   double int_result = std::floor(value);
   4830   double error = value - int_result;
   4831   switch (round_mode) {
   4832     case FPTieAway: {
   4833       // Take care of correctly handling the range ]-0.5, -0.0], which must
   4834       // yield -0.0.
   4835       if ((-0.5 < value) && (value < 0.0)) {
   4836         int_result = -0.0;
   4837 
   4838       } else if ((error > 0.5) || ((error == 0.5) && (int_result >= 0.0))) {
   4839         // If the error is greater than 0.5, or is equal to 0.5 and the integer
   4840         // result is positive, round up.
   4841         int_result++;
   4842       }
   4843       break;
   4844     }
   4845     case FPTieEven: {
   4846       // Take care of correctly handling the range [-0.5, -0.0], which must
   4847       // yield -0.0.
   4848       if ((-0.5 <= value) && (value < 0.0)) {
   4849         int_result = -0.0;
   4850 
   4851         // If the error is greater than 0.5, or is equal to 0.5 and the integer
   4852         // result is odd, round up.
   4853       } else if ((error > 0.5) ||
   4854                  ((error == 0.5) && (std::fmod(int_result, 2) != 0))) {
   4855         int_result++;
   4856       }
   4857       break;
   4858     }
   4859     case FPZero: {
   4860       // If value>0 then we take floor(value)
   4861       // otherwise, ceil(value).
   4862       if (value < 0) {
   4863         int_result = ceil(value);
   4864       }
   4865       break;
   4866     }
   4867     case FPNegativeInfinity: {
   4868       // We always use floor(value).
   4869       break;
   4870     }
   4871     case FPPositiveInfinity: {
   4872       // Take care of correctly handling the range ]-1.0, -0.0], which must
   4873       // yield -0.0.
   4874       if ((-1.0 < value) && (value < 0.0)) {
   4875         int_result = -0.0;
   4876 
   4877         // If the error is non-zero, round up.
   4878       } else if (error > 0.0) {
   4879         int_result++;
   4880       }
   4881       break;
   4882     }
   4883     default:
   4884       VIXL_UNIMPLEMENTED();
   4885   }
   4886   return int_result;
   4887 }
   4888 
   4889 double Simulator::FPRoundInt(double value, FPRounding round_mode) {
   4890   if ((value == 0.0) || (value == kFP64PositiveInfinity) ||
   4891       (value == kFP64NegativeInfinity)) {
   4892     return value;
   4893   } else if (IsNaN(value)) {
   4894     return FPProcessNaN(value);
   4895   }
   4896   return FPRoundIntCommon(value, round_mode);
   4897 }
   4898 
   4899 double Simulator::FPRoundInt(double value,
   4900                              FPRounding round_mode,
   4901                              FrintMode frint_mode) {
   4902   if (frint_mode == kFrintToInteger) {
   4903     return FPRoundInt(value, round_mode);
   4904   }
   4905 
   4906   VIXL_ASSERT((frint_mode == kFrintToInt32) || (frint_mode == kFrintToInt64));
   4907 
   4908   if (value == 0.0) {
   4909     return value;
   4910   }
   4911 
   4912   if ((value == kFP64PositiveInfinity) || (value == kFP64NegativeInfinity) ||
   4913       IsNaN(value)) {
   4914     if (frint_mode == kFrintToInt32) {
   4915       return INT32_MIN;
   4916     } else {
   4917       return INT64_MIN;
   4918     }
   4919   }
   4920 
   4921   double result = FPRoundIntCommon(value, round_mode);
   4922 
   4923   // We want to compare `result > INT64_MAX` below, but INT64_MAX isn't exactly
   4924   // representable as a double, and is rounded to (INT64_MAX + 1) when
   4925   // converted. To avoid this, we compare `result >= int64_max_plus_one`
   4926   // instead; this is safe because `result` is known to be integral, and
   4927   // `int64_max_plus_one` is exactly representable as a double.
   4928   constexpr uint64_t int64_max_plus_one = static_cast<uint64_t>(INT64_MAX) + 1;
   4929   VIXL_STATIC_ASSERT(static_cast<uint64_t>(static_cast<double>(
   4930                          int64_max_plus_one)) == int64_max_plus_one);
   4931 
   4932   if (frint_mode == kFrintToInt32) {
   4933     if ((result > INT32_MAX) || (result < INT32_MIN)) {
   4934       return INT32_MIN;
   4935     }
   4936   } else if ((result >= int64_max_plus_one) || (result < INT64_MIN)) {
   4937     return INT64_MIN;
   4938   }
   4939 
   4940   return result;
   4941 }
   4942 
   4943 int16_t Simulator::FPToInt16(double value, FPRounding rmode) {
   4944   value = FPRoundInt(value, rmode);
   4945   if (value >= kHMaxInt) {
   4946     return kHMaxInt;
   4947   } else if (value < kHMinInt) {
   4948     return kHMinInt;
   4949   }
   4950   return IsNaN(value) ? 0 : static_cast<int16_t>(value);
   4951 }
   4952 
   4953 
   4954 int32_t Simulator::FPToInt32(double value, FPRounding rmode) {
   4955   value = FPRoundInt(value, rmode);
   4956   if (value >= kWMaxInt) {
   4957     return kWMaxInt;
   4958   } else if (value < kWMinInt) {
   4959     return kWMinInt;
   4960   }
   4961   return IsNaN(value) ? 0 : static_cast<int32_t>(value);
   4962 }
   4963 
   4964 
   4965 int64_t Simulator::FPToInt64(double value, FPRounding rmode) {
   4966   value = FPRoundInt(value, rmode);
   4967   // This is equivalent to "if (value >= kXMaxInt)" but avoids rounding issues
   4968   // as a result of kMaxInt not being representable as a double.
   4969   if (value >= 9223372036854775808.) {
   4970     return kXMaxInt;
   4971   } else if (value < kXMinInt) {
   4972     return kXMinInt;
   4973   }
   4974   return IsNaN(value) ? 0 : static_cast<int64_t>(value);
   4975 }
   4976 
   4977 
   4978 uint16_t Simulator::FPToUInt16(double value, FPRounding rmode) {
   4979   value = FPRoundInt(value, rmode);
   4980   if (value >= kHMaxUInt) {
   4981     return kHMaxUInt;
   4982   } else if (value < 0.0) {
   4983     return 0;
   4984   }
   4985   return IsNaN(value) ? 0 : static_cast<uint16_t>(value);
   4986 }
   4987 
   4988 
   4989 uint32_t Simulator::FPToUInt32(double value, FPRounding rmode) {
   4990   value = FPRoundInt(value, rmode);
   4991   if (value >= kWMaxUInt) {
   4992     return kWMaxUInt;
   4993   } else if (value < 0.0) {
   4994     return 0;
   4995   }
   4996   return IsNaN(value) ? 0 : static_cast<uint32_t>(value);
   4997 }
   4998 
   4999 
   5000 uint64_t Simulator::FPToUInt64(double value, FPRounding rmode) {
   5001   value = FPRoundInt(value, rmode);
   5002   // This is equivalent to "if (value >= kXMaxUInt)" but avoids rounding issues
   5003   // as a result of kMaxUInt not being representable as a double.
   5004   if (value >= 18446744073709551616.) {
   5005     return kXMaxUInt;
   5006   } else if (value < 0.0) {
   5007     return 0;
   5008   }
   5009   return IsNaN(value) ? 0 : static_cast<uint64_t>(value);
   5010 }
   5011 
   5012 
   5013 #define DEFINE_NEON_FP_VECTOR_OP(FN, OP, PROCNAN)                \
   5014   template <typename T>                                          \
   5015   LogicVRegister Simulator::FN(VectorFormat vform,               \
   5016                                LogicVRegister dst,               \
   5017                                const LogicVRegister& src1,       \
   5018                                const LogicVRegister& src2) {     \
   5019     dst.ClearForWrite(vform);                                    \
   5020     for (int i = 0; i < LaneCountFromFormat(vform); i++) {       \
   5021       T op1 = src1.Float<T>(i);                                  \
   5022       T op2 = src2.Float<T>(i);                                  \
   5023       T result;                                                  \
   5024       if (PROCNAN) {                                             \
   5025         result = FPProcessNaNs(op1, op2);                        \
   5026         if (!IsNaN(result)) {                                    \
   5027           result = OP(op1, op2);                                 \
   5028         }                                                        \
   5029       } else {                                                   \
   5030         result = OP(op1, op2);                                   \
   5031       }                                                          \
   5032       dst.SetFloat(vform, i, result);                            \
   5033     }                                                            \
   5034     return dst;                                                  \
   5035   }                                                              \
   5036                                                                  \
   5037   LogicVRegister Simulator::FN(VectorFormat vform,               \
   5038                                LogicVRegister dst,               \
   5039                                const LogicVRegister& src1,       \
   5040                                const LogicVRegister& src2) {     \
   5041     if (LaneSizeInBitsFromFormat(vform) == kHRegSize) {          \
   5042       FN<SimFloat16>(vform, dst, src1, src2);                    \
   5043     } else if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {   \
   5044       FN<float>(vform, dst, src1, src2);                         \
   5045     } else {                                                     \
   5046       VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kDRegSize); \
   5047       FN<double>(vform, dst, src1, src2);                        \
   5048     }                                                            \
   5049     return dst;                                                  \
   5050   }
   5051 NEON_FP3SAME_LIST(DEFINE_NEON_FP_VECTOR_OP)
   5052 #undef DEFINE_NEON_FP_VECTOR_OP
   5053 
   5054 
   5055 LogicVRegister Simulator::fnmul(VectorFormat vform,
   5056                                 LogicVRegister dst,
   5057                                 const LogicVRegister& src1,
   5058                                 const LogicVRegister& src2) {
   5059   SimVRegister temp;
   5060   LogicVRegister product = fmul(vform, temp, src1, src2);
   5061   return fneg(vform, dst, product);
   5062 }
   5063 
   5064 
   5065 template <typename T>
   5066 LogicVRegister Simulator::frecps(VectorFormat vform,
   5067                                  LogicVRegister dst,
   5068                                  const LogicVRegister& src1,
   5069                                  const LogicVRegister& src2) {
   5070   dst.ClearForWrite(vform);
   5071   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   5072     T op1 = -src1.Float<T>(i);
   5073     T op2 = src2.Float<T>(i);
   5074     T result = FPProcessNaNs(op1, op2);
   5075     dst.SetFloat(vform, i, IsNaN(result) ? result : FPRecipStepFused(op1, op2));
   5076   }
   5077   return dst;
   5078 }
   5079 
   5080 
   5081 LogicVRegister Simulator::frecps(VectorFormat vform,
   5082                                  LogicVRegister dst,
   5083                                  const LogicVRegister& src1,
   5084                                  const LogicVRegister& src2) {
   5085   if (LaneSizeInBitsFromFormat(vform) == kHRegSize) {
   5086     frecps<SimFloat16>(vform, dst, src1, src2);
   5087   } else if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
   5088     frecps<float>(vform, dst, src1, src2);
   5089   } else {
   5090     VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kDRegSize);
   5091     frecps<double>(vform, dst, src1, src2);
   5092   }
   5093   return dst;
   5094 }
   5095 
   5096 
   5097 template <typename T>
   5098 LogicVRegister Simulator::frsqrts(VectorFormat vform,
   5099                                   LogicVRegister dst,
   5100                                   const LogicVRegister& src1,
   5101                                   const LogicVRegister& src2) {
   5102   dst.ClearForWrite(vform);
   5103   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   5104     T op1 = -src1.Float<T>(i);
   5105     T op2 = src2.Float<T>(i);
   5106     T result = FPProcessNaNs(op1, op2);
   5107     dst.SetFloat(vform, i, IsNaN(result) ? result : FPRSqrtStepFused(op1, op2));
   5108   }
   5109   return dst;
   5110 }
   5111 
   5112 
   5113 LogicVRegister Simulator::frsqrts(VectorFormat vform,
   5114                                   LogicVRegister dst,
   5115                                   const LogicVRegister& src1,
   5116                                   const LogicVRegister& src2) {
   5117   if (LaneSizeInBitsFromFormat(vform) == kHRegSize) {
   5118     frsqrts<SimFloat16>(vform, dst, src1, src2);
   5119   } else if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
   5120     frsqrts<float>(vform, dst, src1, src2);
   5121   } else {
   5122     VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kDRegSize);
   5123     frsqrts<double>(vform, dst, src1, src2);
   5124   }
   5125   return dst;
   5126 }
   5127 
   5128 
   5129 template <typename T>
   5130 LogicVRegister Simulator::fcmp(VectorFormat vform,
   5131                                LogicVRegister dst,
   5132                                const LogicVRegister& src1,
   5133                                const LogicVRegister& src2,
   5134                                Condition cond) {
   5135   dst.ClearForWrite(vform);
   5136   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   5137     bool result = false;
   5138     T op1 = src1.Float<T>(i);
   5139     T op2 = src2.Float<T>(i);
   5140     bool unordered = IsNaN(FPProcessNaNs(op1, op2));
   5141 
   5142     switch (cond) {
   5143       case eq:
   5144         result = (op1 == op2);
   5145         break;
   5146       case ge:
   5147         result = (op1 >= op2);
   5148         break;
   5149       case gt:
   5150         result = (op1 > op2);
   5151         break;
   5152       case le:
   5153         result = (op1 <= op2);
   5154         break;
   5155       case lt:
   5156         result = (op1 < op2);
   5157         break;
   5158       case ne:
   5159         result = (op1 != op2);
   5160         break;
   5161       case uo:
   5162         result = unordered;
   5163         break;
   5164       default:
   5165         // Other conditions are defined in terms of those above.
   5166         VIXL_UNREACHABLE();
   5167         break;
   5168     }
   5169 
   5170     if (result && unordered) {
   5171       // Only `uo` and `ne` can be true for unordered comparisons.
   5172       VIXL_ASSERT((cond == uo) || (cond == ne));
   5173     }
   5174 
   5175     dst.SetUint(vform, i, result ? MaxUintFromFormat(vform) : 0);
   5176   }
   5177   return dst;
   5178 }
   5179 
   5180 
   5181 LogicVRegister Simulator::fcmp(VectorFormat vform,
   5182                                LogicVRegister dst,
   5183                                const LogicVRegister& src1,
   5184                                const LogicVRegister& src2,
   5185                                Condition cond) {
   5186   if (LaneSizeInBitsFromFormat(vform) == kHRegSize) {
   5187     fcmp<SimFloat16>(vform, dst, src1, src2, cond);
   5188   } else if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
   5189     fcmp<float>(vform, dst, src1, src2, cond);
   5190   } else {
   5191     VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kDRegSize);
   5192     fcmp<double>(vform, dst, src1, src2, cond);
   5193   }
   5194   return dst;
   5195 }
   5196 
   5197 
   5198 LogicVRegister Simulator::fcmp_zero(VectorFormat vform,
   5199                                     LogicVRegister dst,
   5200                                     const LogicVRegister& src,
   5201                                     Condition cond) {
   5202   SimVRegister temp;
   5203   if (LaneSizeInBitsFromFormat(vform) == kHRegSize) {
   5204     LogicVRegister zero_reg =
   5205         dup_immediate(vform, temp, Float16ToRawbits(SimFloat16(0.0)));
   5206     fcmp<SimFloat16>(vform, dst, src, zero_reg, cond);
   5207   } else if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
   5208     LogicVRegister zero_reg = dup_immediate(vform, temp, FloatToRawbits(0.0));
   5209     fcmp<float>(vform, dst, src, zero_reg, cond);
   5210   } else {
   5211     VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kDRegSize);
   5212     LogicVRegister zero_reg = dup_immediate(vform, temp, DoubleToRawbits(0.0));
   5213     fcmp<double>(vform, dst, src, zero_reg, cond);
   5214   }
   5215   return dst;
   5216 }
   5217 
   5218 
   5219 LogicVRegister Simulator::fabscmp(VectorFormat vform,
   5220                                   LogicVRegister dst,
   5221                                   const LogicVRegister& src1,
   5222                                   const LogicVRegister& src2,
   5223                                   Condition cond) {
   5224   SimVRegister temp1, temp2;
   5225   if (LaneSizeInBitsFromFormat(vform) == kHRegSize) {
   5226     LogicVRegister abs_src1 = fabs_<SimFloat16>(vform, temp1, src1);
   5227     LogicVRegister abs_src2 = fabs_<SimFloat16>(vform, temp2, src2);
   5228     fcmp<SimFloat16>(vform, dst, abs_src1, abs_src2, cond);
   5229   } else if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
   5230     LogicVRegister abs_src1 = fabs_<float>(vform, temp1, src1);
   5231     LogicVRegister abs_src2 = fabs_<float>(vform, temp2, src2);
   5232     fcmp<float>(vform, dst, abs_src1, abs_src2, cond);
   5233   } else {
   5234     VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kDRegSize);
   5235     LogicVRegister abs_src1 = fabs_<double>(vform, temp1, src1);
   5236     LogicVRegister abs_src2 = fabs_<double>(vform, temp2, src2);
   5237     fcmp<double>(vform, dst, abs_src1, abs_src2, cond);
   5238   }
   5239   return dst;
   5240 }
   5241 
   5242 
   5243 template <typename T>
   5244 LogicVRegister Simulator::fmla(VectorFormat vform,
   5245                                LogicVRegister dst,
   5246                                const LogicVRegister& srca,
   5247                                const LogicVRegister& src1,
   5248                                const LogicVRegister& src2) {
   5249   dst.ClearForWrite(vform);
   5250   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   5251     T op1 = src1.Float<T>(i);
   5252     T op2 = src2.Float<T>(i);
   5253     T acc = srca.Float<T>(i);
   5254     T result = FPMulAdd(acc, op1, op2);
   5255     dst.SetFloat(vform, i, result);
   5256   }
   5257   return dst;
   5258 }
   5259 
   5260 
   5261 LogicVRegister Simulator::fmla(VectorFormat vform,
   5262                                LogicVRegister dst,
   5263                                const LogicVRegister& srca,
   5264                                const LogicVRegister& src1,
   5265                                const LogicVRegister& src2) {
   5266   if (LaneSizeInBitsFromFormat(vform) == kHRegSize) {
   5267     fmla<SimFloat16>(vform, dst, srca, src1, src2);
   5268   } else if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
   5269     fmla<float>(vform, dst, srca, src1, src2);
   5270   } else {
   5271     VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kDRegSize);
   5272     fmla<double>(vform, dst, srca, src1, src2);
   5273   }
   5274   return dst;
   5275 }
   5276 
   5277 
   5278 template <typename T>
   5279 LogicVRegister Simulator::fmls(VectorFormat vform,
   5280                                LogicVRegister dst,
   5281                                const LogicVRegister& srca,
   5282                                const LogicVRegister& src1,
   5283                                const LogicVRegister& src2) {
   5284   dst.ClearForWrite(vform);
   5285   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   5286     T op1 = -src1.Float<T>(i);
   5287     T op2 = src2.Float<T>(i);
   5288     T acc = srca.Float<T>(i);
   5289     T result = FPMulAdd(acc, op1, op2);
   5290     dst.SetFloat(i, result);
   5291   }
   5292   return dst;
   5293 }
   5294 
   5295 
   5296 LogicVRegister Simulator::fmls(VectorFormat vform,
   5297                                LogicVRegister dst,
   5298                                const LogicVRegister& srca,
   5299                                const LogicVRegister& src1,
   5300                                const LogicVRegister& src2) {
   5301   if (LaneSizeInBitsFromFormat(vform) == kHRegSize) {
   5302     fmls<SimFloat16>(vform, dst, srca, src1, src2);
   5303   } else if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
   5304     fmls<float>(vform, dst, srca, src1, src2);
   5305   } else {
   5306     VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kDRegSize);
   5307     fmls<double>(vform, dst, srca, src1, src2);
   5308   }
   5309   return dst;
   5310 }
   5311 
   5312 
   5313 LogicVRegister Simulator::fmlal(VectorFormat vform,
   5314                                 LogicVRegister dst,
   5315                                 const LogicVRegister& src1,
   5316                                 const LogicVRegister& src2) {
   5317   VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kSRegSize);
   5318   dst.ClearForWrite(vform);
   5319   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   5320     float op1 = FPToFloat(src1.Float<SimFloat16>(i), kIgnoreDefaultNaN);
   5321     float op2 = FPToFloat(src2.Float<SimFloat16>(i), kIgnoreDefaultNaN);
   5322     float acc = dst.Float<float>(i);
   5323     float result = FPMulAdd(acc, op1, op2);
   5324     dst.SetFloat(i, result);
   5325   }
   5326   return dst;
   5327 }
   5328 
   5329 
   5330 LogicVRegister Simulator::fmlal2(VectorFormat vform,
   5331                                  LogicVRegister dst,
   5332                                  const LogicVRegister& src1,
   5333                                  const LogicVRegister& src2) {
   5334   VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kSRegSize);
   5335   dst.ClearForWrite(vform);
   5336   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   5337     int src = i + LaneCountFromFormat(vform);
   5338     float op1 = FPToFloat(src1.Float<SimFloat16>(src), kIgnoreDefaultNaN);
   5339     float op2 = FPToFloat(src2.Float<SimFloat16>(src), kIgnoreDefaultNaN);
   5340     float acc = dst.Float<float>(i);
   5341     float result = FPMulAdd(acc, op1, op2);
   5342     dst.SetFloat(i, result);
   5343   }
   5344   return dst;
   5345 }
   5346 
   5347 
   5348 LogicVRegister Simulator::fmlsl(VectorFormat vform,
   5349                                 LogicVRegister dst,
   5350                                 const LogicVRegister& src1,
   5351                                 const LogicVRegister& src2) {
   5352   VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kSRegSize);
   5353   dst.ClearForWrite(vform);
   5354   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   5355     float op1 = -FPToFloat(src1.Float<SimFloat16>(i), kIgnoreDefaultNaN);
   5356     float op2 = FPToFloat(src2.Float<SimFloat16>(i), kIgnoreDefaultNaN);
   5357     float acc = dst.Float<float>(i);
   5358     float result = FPMulAdd(acc, op1, op2);
   5359     dst.SetFloat(i, result);
   5360   }
   5361   return dst;
   5362 }
   5363 
   5364 
   5365 LogicVRegister Simulator::fmlsl2(VectorFormat vform,
   5366                                  LogicVRegister dst,
   5367                                  const LogicVRegister& src1,
   5368                                  const LogicVRegister& src2) {
   5369   VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kSRegSize);
   5370   dst.ClearForWrite(vform);
   5371   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   5372     int src = i + LaneCountFromFormat(vform);
   5373     float op1 = -FPToFloat(src1.Float<SimFloat16>(src), kIgnoreDefaultNaN);
   5374     float op2 = FPToFloat(src2.Float<SimFloat16>(src), kIgnoreDefaultNaN);
   5375     float acc = dst.Float<float>(i);
   5376     float result = FPMulAdd(acc, op1, op2);
   5377     dst.SetFloat(i, result);
   5378   }
   5379   return dst;
   5380 }
   5381 
   5382 
   5383 LogicVRegister Simulator::fmlal(VectorFormat vform,
   5384                                 LogicVRegister dst,
   5385                                 const LogicVRegister& src1,
   5386                                 const LogicVRegister& src2,
   5387                                 int index) {
   5388   VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kSRegSize);
   5389   dst.ClearForWrite(vform);
   5390   float op2 = FPToFloat(src2.Float<SimFloat16>(index), kIgnoreDefaultNaN);
   5391   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   5392     float op1 = FPToFloat(src1.Float<SimFloat16>(i), kIgnoreDefaultNaN);
   5393     float acc = dst.Float<float>(i);
   5394     float result = FPMulAdd(acc, op1, op2);
   5395     dst.SetFloat(i, result);
   5396   }
   5397   return dst;
   5398 }
   5399 
   5400 
   5401 LogicVRegister Simulator::fmlal2(VectorFormat vform,
   5402                                  LogicVRegister dst,
   5403                                  const LogicVRegister& src1,
   5404                                  const LogicVRegister& src2,
   5405                                  int index) {
   5406   VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kSRegSize);
   5407   dst.ClearForWrite(vform);
   5408   float op2 = FPToFloat(src2.Float<SimFloat16>(index), kIgnoreDefaultNaN);
   5409   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   5410     int src = i + LaneCountFromFormat(vform);
   5411     float op1 = FPToFloat(src1.Float<SimFloat16>(src), kIgnoreDefaultNaN);
   5412     float acc = dst.Float<float>(i);
   5413     float result = FPMulAdd(acc, op1, op2);
   5414     dst.SetFloat(i, result);
   5415   }
   5416   return dst;
   5417 }
   5418 
   5419 
   5420 LogicVRegister Simulator::fmlsl(VectorFormat vform,
   5421                                 LogicVRegister dst,
   5422                                 const LogicVRegister& src1,
   5423                                 const LogicVRegister& src2,
   5424                                 int index) {
   5425   VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kSRegSize);
   5426   dst.ClearForWrite(vform);
   5427   float op2 = FPToFloat(src2.Float<SimFloat16>(index), kIgnoreDefaultNaN);
   5428   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   5429     float op1 = -FPToFloat(src1.Float<SimFloat16>(i), kIgnoreDefaultNaN);
   5430     float acc = dst.Float<float>(i);
   5431     float result = FPMulAdd(acc, op1, op2);
   5432     dst.SetFloat(i, result);
   5433   }
   5434   return dst;
   5435 }
   5436 
   5437 
   5438 LogicVRegister Simulator::fmlsl2(VectorFormat vform,
   5439                                  LogicVRegister dst,
   5440                                  const LogicVRegister& src1,
   5441                                  const LogicVRegister& src2,
   5442                                  int index) {
   5443   VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kSRegSize);
   5444   dst.ClearForWrite(vform);
   5445   float op2 = FPToFloat(src2.Float<SimFloat16>(index), kIgnoreDefaultNaN);
   5446   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   5447     int src = i + LaneCountFromFormat(vform);
   5448     float op1 = -FPToFloat(src1.Float<SimFloat16>(src), kIgnoreDefaultNaN);
   5449     float acc = dst.Float<float>(i);
   5450     float result = FPMulAdd(acc, op1, op2);
   5451     dst.SetFloat(i, result);
   5452   }
   5453   return dst;
   5454 }
   5455 
   5456 
   5457 template <typename T>
   5458 LogicVRegister Simulator::fneg(VectorFormat vform,
   5459                                LogicVRegister dst,
   5460                                const LogicVRegister& src) {
   5461   dst.ClearForWrite(vform);
   5462   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   5463     T op = src.Float<T>(i);
   5464     op = -op;
   5465     dst.SetFloat(i, op);
   5466   }
   5467   return dst;
   5468 }
   5469 
   5470 
   5471 LogicVRegister Simulator::fneg(VectorFormat vform,
   5472                                LogicVRegister dst,
   5473                                const LogicVRegister& src) {
   5474   if (LaneSizeInBitsFromFormat(vform) == kHRegSize) {
   5475     fneg<SimFloat16>(vform, dst, src);
   5476   } else if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
   5477     fneg<float>(vform, dst, src);
   5478   } else {
   5479     VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kDRegSize);
   5480     fneg<double>(vform, dst, src);
   5481   }
   5482   return dst;
   5483 }
   5484 
   5485 
   5486 template <typename T>
   5487 LogicVRegister Simulator::fabs_(VectorFormat vform,
   5488                                 LogicVRegister dst,
   5489                                 const LogicVRegister& src) {
   5490   dst.ClearForWrite(vform);
   5491   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   5492     T op = src.Float<T>(i);
   5493     if (copysign(1.0, op) < 0.0) {
   5494       op = -op;
   5495     }
   5496     dst.SetFloat(i, op);
   5497   }
   5498   return dst;
   5499 }
   5500 
   5501 
   5502 LogicVRegister Simulator::fabs_(VectorFormat vform,
   5503                                 LogicVRegister dst,
   5504                                 const LogicVRegister& src) {
   5505   if (LaneSizeInBitsFromFormat(vform) == kHRegSize) {
   5506     fabs_<SimFloat16>(vform, dst, src);
   5507   } else if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
   5508     fabs_<float>(vform, dst, src);
   5509   } else {
   5510     VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kDRegSize);
   5511     fabs_<double>(vform, dst, src);
   5512   }
   5513   return dst;
   5514 }
   5515 
   5516 
   5517 LogicVRegister Simulator::fabd(VectorFormat vform,
   5518                                LogicVRegister dst,
   5519                                const LogicVRegister& src1,
   5520                                const LogicVRegister& src2) {
   5521   SimVRegister temp;
   5522   fsub(vform, temp, src1, src2);
   5523   fabs_(vform, dst, temp);
   5524   return dst;
   5525 }
   5526 
   5527 
   5528 LogicVRegister Simulator::fsqrt(VectorFormat vform,
   5529                                 LogicVRegister dst,
   5530                                 const LogicVRegister& src) {
   5531   dst.ClearForWrite(vform);
   5532   if (LaneSizeInBitsFromFormat(vform) == kHRegSize) {
   5533     for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   5534       SimFloat16 result = FPSqrt(src.Float<SimFloat16>(i));
   5535       dst.SetFloat(i, result);
   5536     }
   5537   } else if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
   5538     for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   5539       float result = FPSqrt(src.Float<float>(i));
   5540       dst.SetFloat(i, result);
   5541     }
   5542   } else {
   5543     VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kDRegSize);
   5544     for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   5545       double result = FPSqrt(src.Float<double>(i));
   5546       dst.SetFloat(i, result);
   5547     }
   5548   }
   5549   return dst;
   5550 }
   5551 
   5552 
// Defines two Simulator member overloads named FNP for each entry of
// NEON_FPPAIRWISE_LIST (that list is defined outside this part of the file).
//
//  * Vector form FNP(vform, dst, src1, src2): uzp1 and uzp2 are applied to
//    (src1, src2) to produce two scratch registers, and the element-wise
//    operation FN is applied to those scratch registers, writing dst. For SVE
//    formats the result is then passed through interleave_top_bottom.
//
//  * Scalar form FNP(vform, dst, src): vform must be kFormatH, kFormatS or
//    kFormatD. OP is applied to lanes 0 and 1 of src and the result is stored
//    in lane 0 of dst. Both source lanes are read before dst is written, and
//    dst.ClearForWrite(vform) is only called afterwards.
//    NOTE(review): the late ClearForWrite presumably keeps src lane 1 readable
//    when src and dst are the same register - confirm against ClearForWrite.
#define DEFINE_NEON_FP_PAIR_OP(FNP, FN, OP)                                    \
  LogicVRegister Simulator::FNP(VectorFormat vform,                            \
                                LogicVRegister dst,                            \
                                const LogicVRegister& src1,                    \
                                const LogicVRegister& src2) {                  \
    SimVRegister temp1, temp2;                                                 \
    uzp1(vform, temp1, src1, src2);                                            \
    uzp2(vform, temp2, src1, src2);                                            \
    FN(vform, dst, temp1, temp2);                                              \
    if (IsSVEFormat(vform)) {                                                  \
      interleave_top_bottom(vform, dst, dst);                                  \
    }                                                                          \
    return dst;                                                                \
  }                                                                            \
                                                                               \
  LogicVRegister Simulator::FNP(VectorFormat vform,                            \
                                LogicVRegister dst,                            \
                                const LogicVRegister& src) {                   \
    if (vform == kFormatH) {                                                   \
      SimFloat16 result(OP(SimFloat16(RawbitsToFloat16(src.Uint(vform, 0))),   \
                           SimFloat16(RawbitsToFloat16(src.Uint(vform, 1))))); \
      dst.SetUint(vform, 0, Float16ToRawbits(result));                         \
    } else if (vform == kFormatS) {                                            \
      float result = OP(src.Float<float>(0), src.Float<float>(1));             \
      dst.SetFloat(0, result);                                                 \
    } else {                                                                   \
      VIXL_ASSERT(vform == kFormatD);                                          \
      double result = OP(src.Float<double>(0), src.Float<double>(1));          \
      dst.SetFloat(0, result);                                                 \
    }                                                                          \
    dst.ClearForWrite(vform);                                                  \
    return dst;                                                                \
  }
// Instantiate both overloads for every listed pairwise operation.
NEON_FPPAIRWISE_LIST(DEFINE_NEON_FP_PAIR_OP)
#undef DEFINE_NEON_FP_PAIR_OP
   5588 
   5589 template <typename T>
   5590 LogicVRegister Simulator::FPPairedAcrossHelper(VectorFormat vform,
   5591                                                LogicVRegister dst,
   5592                                                const LogicVRegister& src,
   5593                                                typename TFPPairOp<T>::type fn,
   5594                                                uint64_t inactive_value) {
   5595   int lane_count = LaneCountFromFormat(vform);
   5596   T result[kZRegMaxSizeInBytes / sizeof(T)];
   5597   // Copy the source vector into a working array. Initialise the unused elements
   5598   // at the end of the array to the same value that a false predicate would set.
   5599   for (int i = 0; i < static_cast<int>(ArrayLength(result)); i++) {
   5600     result[i] = (i < lane_count)
   5601                     ? src.Float<T>(i)
   5602                     : RawbitsWithSizeToFP<T>(sizeof(T) * 8, inactive_value);
   5603   }
   5604 
   5605   // Pairwise reduce the elements to a single value, using the pair op function
   5606   // argument.
   5607   for (int step = 1; step < lane_count; step *= 2) {
   5608     for (int i = 0; i < lane_count; i += step * 2) {
   5609       result[i] = (this->*fn)(result[i], result[i + step]);
   5610     }
   5611   }
   5612   dst.ClearForWrite(ScalarFormatFromFormat(vform));
   5613   dst.SetFloat<T>(0, result[0]);
   5614   return dst;
   5615 }
   5616 
   5617 LogicVRegister Simulator::FPPairedAcrossHelper(
   5618     VectorFormat vform,
   5619     LogicVRegister dst,
   5620     const LogicVRegister& src,
   5621     typename TFPPairOp<SimFloat16>::type fn16,
   5622     typename TFPPairOp<float>::type fn32,
   5623     typename TFPPairOp<double>::type fn64,
   5624     uint64_t inactive_value) {
   5625   switch (LaneSizeInBitsFromFormat(vform)) {
   5626     case kHRegSize:
   5627       return FPPairedAcrossHelper<SimFloat16>(vform,
   5628                                               dst,
   5629                                               src,
   5630                                               fn16,
   5631                                               inactive_value);
   5632     case kSRegSize:
   5633       return FPPairedAcrossHelper<float>(vform, dst, src, fn32, inactive_value);
   5634     default:
   5635       VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kDRegSize);
   5636       return FPPairedAcrossHelper<double>(vform,
   5637                                           dst,
   5638                                           src,
   5639                                           fn64,
   5640                                           inactive_value);
   5641   }
   5642 }
   5643 
   5644 LogicVRegister Simulator::faddv(VectorFormat vform,
   5645                                 LogicVRegister dst,
   5646                                 const LogicVRegister& src) {
   5647   return FPPairedAcrossHelper(vform,
   5648                               dst,
   5649                               src,
   5650                               &Simulator::FPAdd<SimFloat16>,
   5651                               &Simulator::FPAdd<float>,
   5652                               &Simulator::FPAdd<double>,
   5653                               0);
   5654 }
   5655 
   5656 LogicVRegister Simulator::fmaxv(VectorFormat vform,
   5657                                 LogicVRegister dst,
   5658                                 const LogicVRegister& src) {
   5659   int lane_size = LaneSizeInBitsFromFormat(vform);
   5660   uint64_t inactive_value =
   5661       FPToRawbitsWithSize(lane_size, kFP64NegativeInfinity);
   5662   return FPPairedAcrossHelper(vform,
   5663                               dst,
   5664                               src,
   5665                               &Simulator::FPMax<SimFloat16>,
   5666                               &Simulator::FPMax<float>,
   5667                               &Simulator::FPMax<double>,
   5668                               inactive_value);
   5669 }
   5670 
   5671 
   5672 LogicVRegister Simulator::fminv(VectorFormat vform,
   5673                                 LogicVRegister dst,
   5674                                 const LogicVRegister& src) {
   5675   int lane_size = LaneSizeInBitsFromFormat(vform);
   5676   uint64_t inactive_value =
   5677       FPToRawbitsWithSize(lane_size, kFP64PositiveInfinity);
   5678   return FPPairedAcrossHelper(vform,
   5679                               dst,
   5680                               src,
   5681                               &Simulator::FPMin<SimFloat16>,
   5682                               &Simulator::FPMin<float>,
   5683                               &Simulator::FPMin<double>,
   5684                               inactive_value);
   5685 }
   5686 
   5687 
   5688 LogicVRegister Simulator::fmaxnmv(VectorFormat vform,
   5689                                   LogicVRegister dst,
   5690                                   const LogicVRegister& src) {
   5691   int lane_size = LaneSizeInBitsFromFormat(vform);
   5692   uint64_t inactive_value = FPToRawbitsWithSize(lane_size, kFP64DefaultNaN);
   5693   return FPPairedAcrossHelper(vform,
   5694                               dst,
   5695                               src,
   5696                               &Simulator::FPMaxNM<SimFloat16>,
   5697                               &Simulator::FPMaxNM<float>,
   5698                               &Simulator::FPMaxNM<double>,
   5699                               inactive_value);
   5700 }
   5701 
   5702 
   5703 LogicVRegister Simulator::fminnmv(VectorFormat vform,
   5704                                   LogicVRegister dst,
   5705                                   const LogicVRegister& src) {
   5706   int lane_size = LaneSizeInBitsFromFormat(vform);
   5707   uint64_t inactive_value = FPToRawbitsWithSize(lane_size, kFP64DefaultNaN);
   5708   return FPPairedAcrossHelper(vform,
   5709                               dst,
   5710                               src,
   5711                               &Simulator::FPMinNM<SimFloat16>,
   5712                               &Simulator::FPMinNM<float>,
   5713                               &Simulator::FPMinNM<double>,
   5714                               inactive_value);
   5715 }
   5716 
   5717 
   5718 LogicVRegister Simulator::fmul(VectorFormat vform,
   5719                                LogicVRegister dst,
   5720                                const LogicVRegister& src1,
   5721                                const LogicVRegister& src2,
   5722                                int index) {
   5723   dst.ClearForWrite(vform);
   5724   SimVRegister temp;
   5725   if (LaneSizeInBitsFromFormat(vform) == kHRegSize) {
   5726     LogicVRegister index_reg = dup_element(kFormat8H, temp, src2, index);
   5727     fmul<SimFloat16>(vform, dst, src1, index_reg);
   5728   } else if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
   5729     LogicVRegister index_reg = dup_element(kFormat4S, temp, src2, index);
   5730     fmul<float>(vform, dst, src1, index_reg);
   5731   } else {
   5732     VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kDRegSize);
   5733     LogicVRegister index_reg = dup_element(kFormat2D, temp, src2, index);
   5734     fmul<double>(vform, dst, src1, index_reg);
   5735   }
   5736   return dst;
   5737 }
   5738 
   5739 
   5740 LogicVRegister Simulator::fmla(VectorFormat vform,
   5741                                LogicVRegister dst,
   5742                                const LogicVRegister& src1,
   5743                                const LogicVRegister& src2,
   5744                                int index) {
   5745   dst.ClearForWrite(vform);
   5746   SimVRegister temp;
   5747   if (LaneSizeInBitsFromFormat(vform) == kHRegSize) {
   5748     LogicVRegister index_reg = dup_element(kFormat8H, temp, src2, index);
   5749     fmla<SimFloat16>(vform, dst, dst, src1, index_reg);
   5750   } else if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
   5751     LogicVRegister index_reg = dup_element(kFormat4S, temp, src2, index);
   5752     fmla<float>(vform, dst, dst, src1, index_reg);
   5753   } else {
   5754     VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kDRegSize);
   5755     LogicVRegister index_reg = dup_element(kFormat2D, temp, src2, index);
   5756     fmla<double>(vform, dst, dst, src1, index_reg);
   5757   }
   5758   return dst;
   5759 }
   5760 
   5761 
   5762 LogicVRegister Simulator::fmls(VectorFormat vform,
   5763                                LogicVRegister dst,
   5764                                const LogicVRegister& src1,
   5765                                const LogicVRegister& src2,
   5766                                int index) {
   5767   dst.ClearForWrite(vform);
   5768   SimVRegister temp;
   5769   if (LaneSizeInBitsFromFormat(vform) == kHRegSize) {
   5770     LogicVRegister index_reg = dup_element(kFormat8H, temp, src2, index);
   5771     fmls<SimFloat16>(vform, dst, dst, src1, index_reg);
   5772   } else if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
   5773     LogicVRegister index_reg = dup_element(kFormat4S, temp, src2, index);
   5774     fmls<float>(vform, dst, dst, src1, index_reg);
   5775   } else {
   5776     VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kDRegSize);
   5777     LogicVRegister index_reg = dup_element(kFormat2D, temp, src2, index);
   5778     fmls<double>(vform, dst, dst, src1, index_reg);
   5779   }
   5780   return dst;
   5781 }
   5782 
   5783 
   5784 LogicVRegister Simulator::fmulx(VectorFormat vform,
   5785                                 LogicVRegister dst,
   5786                                 const LogicVRegister& src1,
   5787                                 const LogicVRegister& src2,
   5788                                 int index) {
   5789   dst.ClearForWrite(vform);
   5790   SimVRegister temp;
   5791   if (LaneSizeInBitsFromFormat(vform) == kHRegSize) {
   5792     LogicVRegister index_reg = dup_element(kFormat8H, temp, src2, index);
   5793     fmulx<SimFloat16>(vform, dst, src1, index_reg);
   5794   } else if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
   5795     LogicVRegister index_reg = dup_element(kFormat4S, temp, src2, index);
   5796     fmulx<float>(vform, dst, src1, index_reg);
   5797   } else {
   5798     VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kDRegSize);
   5799     LogicVRegister index_reg = dup_element(kFormat2D, temp, src2, index);
   5800     fmulx<double>(vform, dst, src1, index_reg);
   5801   }
   5802   return dst;
   5803 }
   5804 
   5805 
   5806 LogicVRegister Simulator::frint(VectorFormat vform,
   5807                                 LogicVRegister dst,
   5808                                 const LogicVRegister& src,
   5809                                 FPRounding rounding_mode,
   5810                                 bool inexact_exception,
   5811                                 FrintMode frint_mode) {
   5812   dst.ClearForWrite(vform);
   5813   if (LaneSizeInBitsFromFormat(vform) == kHRegSize) {
   5814     VIXL_ASSERT(frint_mode == kFrintToInteger);
   5815     for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   5816       SimFloat16 input = src.Float<SimFloat16>(i);
   5817       SimFloat16 rounded = FPRoundInt(input, rounding_mode);
   5818       if (inexact_exception && !IsNaN(input) && (input != rounded)) {
   5819         FPProcessException();
   5820       }
   5821       dst.SetFloat<SimFloat16>(i, rounded);
   5822     }
   5823   } else if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
   5824     for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   5825       float input = src.Float<float>(i);
   5826       float rounded = FPRoundInt(input, rounding_mode, frint_mode);
   5827 
   5828       if (inexact_exception && !IsNaN(input) && (input != rounded)) {
   5829         FPProcessException();
   5830       }
   5831       dst.SetFloat<float>(i, rounded);
   5832     }
   5833   } else {
   5834     VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kDRegSize);
   5835     for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   5836       double input = src.Float<double>(i);
   5837       double rounded = FPRoundInt(input, rounding_mode, frint_mode);
   5838       if (inexact_exception && !IsNaN(input) && (input != rounded)) {
   5839         FPProcessException();
   5840       }
   5841       dst.SetFloat<double>(i, rounded);
   5842     }
   5843   }
   5844   return dst;
   5845 }
   5846 
   5847 LogicVRegister Simulator::fcvt(VectorFormat dst_vform,
   5848                                VectorFormat src_vform,
   5849                                LogicVRegister dst,
   5850                                const LogicPRegister& pg,
   5851                                const LogicVRegister& src) {
   5852   unsigned dst_data_size_in_bits = LaneSizeInBitsFromFormat(dst_vform);
   5853   unsigned src_data_size_in_bits = LaneSizeInBitsFromFormat(src_vform);
   5854   VectorFormat vform = SVEFormatFromLaneSizeInBits(
   5855       std::max(dst_data_size_in_bits, src_data_size_in_bits));
   5856 
   5857   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   5858     if (!pg.IsActive(vform, i)) continue;
   5859 
   5860     uint64_t src_raw_bits = ExtractUnsignedBitfield64(src_data_size_in_bits - 1,
   5861                                                       0,
   5862                                                       src.Uint(vform, i));
   5863     double dst_value =
   5864         RawbitsWithSizeToFP<double>(src_data_size_in_bits, src_raw_bits);
   5865 
   5866     uint64_t dst_raw_bits =
   5867         FPToRawbitsWithSize(dst_data_size_in_bits, dst_value);
   5868 
   5869     dst.SetUint(vform, i, dst_raw_bits);
   5870   }
   5871 
   5872   return dst;
   5873 }
   5874 
   5875 LogicVRegister Simulator::fcvts(VectorFormat vform,
   5876                                 unsigned dst_data_size_in_bits,
   5877                                 unsigned src_data_size_in_bits,
   5878                                 LogicVRegister dst,
   5879                                 const LogicPRegister& pg,
   5880                                 const LogicVRegister& src,
   5881                                 FPRounding round,
   5882                                 int fbits) {
   5883   VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) >= dst_data_size_in_bits);
   5884   VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) >= src_data_size_in_bits);
   5885 
   5886   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   5887     if (!pg.IsActive(vform, i)) continue;
   5888 
   5889     uint64_t value = ExtractUnsignedBitfield64(src_data_size_in_bits - 1,
   5890                                                0,
   5891                                                src.Uint(vform, i));
   5892     double result = RawbitsWithSizeToFP<double>(src_data_size_in_bits, value) *
   5893                     std::pow(2.0, fbits);
   5894 
   5895     switch (dst_data_size_in_bits) {
   5896       case kHRegSize:
   5897         dst.SetInt(vform, i, FPToInt16(result, round));
   5898         break;
   5899       case kSRegSize:
   5900         dst.SetInt(vform, i, FPToInt32(result, round));
   5901         break;
   5902       case kDRegSize:
   5903         dst.SetInt(vform, i, FPToInt64(result, round));
   5904         break;
   5905       default:
   5906         VIXL_UNIMPLEMENTED();
   5907         break;
   5908     }
   5909   }
   5910 
   5911   return dst;
   5912 }
   5913 
   5914 LogicVRegister Simulator::fcvts(VectorFormat vform,
   5915                                 LogicVRegister dst,
   5916                                 const LogicVRegister& src,
   5917                                 FPRounding round,
   5918                                 int fbits) {
   5919   dst.ClearForWrite(vform);
   5920   return fcvts(vform,
   5921                LaneSizeInBitsFromFormat(vform),
   5922                LaneSizeInBitsFromFormat(vform),
   5923                dst,
   5924                GetPTrue(),
   5925                src,
   5926                round,
   5927                fbits);
   5928 }
   5929 
   5930 LogicVRegister Simulator::fcvtu(VectorFormat vform,
   5931                                 unsigned dst_data_size_in_bits,
   5932                                 unsigned src_data_size_in_bits,
   5933                                 LogicVRegister dst,
   5934                                 const LogicPRegister& pg,
   5935                                 const LogicVRegister& src,
   5936                                 FPRounding round,
   5937                                 int fbits) {
   5938   VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) >= dst_data_size_in_bits);
   5939   VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) >= src_data_size_in_bits);
   5940 
   5941   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   5942     if (!pg.IsActive(vform, i)) continue;
   5943 
   5944     uint64_t value = ExtractUnsignedBitfield64(src_data_size_in_bits - 1,
   5945                                                0,
   5946                                                src.Uint(vform, i));
   5947     double result = RawbitsWithSizeToFP<double>(src_data_size_in_bits, value) *
   5948                     std::pow(2.0, fbits);
   5949 
   5950     switch (dst_data_size_in_bits) {
   5951       case kHRegSize:
   5952         dst.SetUint(vform, i, FPToUInt16(result, round));
   5953         break;
   5954       case kSRegSize:
   5955         dst.SetUint(vform, i, FPToUInt32(result, round));
   5956         break;
   5957       case kDRegSize:
   5958         dst.SetUint(vform, i, FPToUInt64(result, round));
   5959         break;
   5960       default:
   5961         VIXL_UNIMPLEMENTED();
   5962         break;
   5963     }
   5964   }
   5965 
   5966   return dst;
   5967 }
   5968 
   5969 LogicVRegister Simulator::fcvtu(VectorFormat vform,
   5970                                 LogicVRegister dst,
   5971                                 const LogicVRegister& src,
   5972                                 FPRounding round,
   5973                                 int fbits) {
   5974   dst.ClearForWrite(vform);
   5975   return fcvtu(vform,
   5976                LaneSizeInBitsFromFormat(vform),
   5977                LaneSizeInBitsFromFormat(vform),
   5978                dst,
   5979                GetPTrue(),
   5980                src,
   5981                round,
   5982                fbits);
   5983 }
   5984 
   5985 LogicVRegister Simulator::fcvtl(VectorFormat vform,
   5986                                 LogicVRegister dst,
   5987                                 const LogicVRegister& src) {
   5988   dst.ClearForWrite(vform);
   5989   if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
   5990     for (int i = LaneCountFromFormat(vform) - 1; i >= 0; i--) {
   5991       // TODO: Full support for SimFloat16 in SimRegister(s).
   5992       dst.SetFloat(i,
   5993                    FPToFloat(RawbitsToFloat16(src.Float<uint16_t>(i)),
   5994                              ReadDN()));
   5995     }
   5996   } else {
   5997     VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kDRegSize);
   5998     for (int i = LaneCountFromFormat(vform) - 1; i >= 0; i--) {
   5999       dst.SetFloat(i, FPToDouble(src.Float<float>(i), ReadDN()));
   6000     }
   6001   }
   6002   return dst;
   6003 }
   6004 
   6005 
   6006 LogicVRegister Simulator::fcvtl2(VectorFormat vform,
   6007                                  LogicVRegister dst,
   6008                                  const LogicVRegister& src) {
   6009   dst.ClearForWrite(vform);
   6010   int lane_count = LaneCountFromFormat(vform);
   6011   if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
   6012     for (int i = 0; i < lane_count; i++) {
   6013       // TODO: Full support for SimFloat16 in SimRegister(s).
   6014       dst.SetFloat(i,
   6015                    FPToFloat(RawbitsToFloat16(
   6016                                  src.Float<uint16_t>(i + lane_count)),
   6017                              ReadDN()));
   6018     }
   6019   } else {
   6020     VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kDRegSize);
   6021     for (int i = 0; i < lane_count; i++) {
   6022       dst.SetFloat(i, FPToDouble(src.Float<float>(i + lane_count), ReadDN()));
   6023     }
   6024   }
   6025   return dst;
   6026 }
   6027 
   6028 
   6029 LogicVRegister Simulator::fcvtn(VectorFormat vform,
   6030                                 LogicVRegister dst,
   6031                                 const LogicVRegister& src) {
   6032   SimVRegister tmp;
   6033   LogicVRegister srctmp = mov(kFormat2D, tmp, src);
   6034   dst.ClearForWrite(vform);
   6035   if (LaneSizeInBitsFromFormat(vform) == kHRegSize) {
   6036     for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   6037       dst.SetFloat(i,
   6038                    Float16ToRawbits(FPToFloat16(srctmp.Float<float>(i),
   6039                                                 FPTieEven,
   6040                                                 ReadDN())));
   6041     }
   6042   } else {
   6043     VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kSRegSize);
   6044     for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   6045       dst.SetFloat(i, FPToFloat(srctmp.Float<double>(i), FPTieEven, ReadDN()));
   6046     }
   6047   }
   6048   return dst;
   6049 }
   6050 
   6051 
   6052 LogicVRegister Simulator::fcvtn2(VectorFormat vform,
   6053                                  LogicVRegister dst,
   6054                                  const LogicVRegister& src) {
   6055   dst.ClearForWrite(vform);
   6056   int lane_count = LaneCountFromFormat(vform) / 2;
   6057   if (LaneSizeInBitsFromFormat(vform) == kHRegSize) {
   6058     for (int i = lane_count - 1; i >= 0; i--) {
   6059       dst.SetFloat(i + lane_count,
   6060                    Float16ToRawbits(
   6061                        FPToFloat16(src.Float<float>(i), FPTieEven, ReadDN())));
   6062     }
   6063   } else {
   6064     VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kSRegSize);
   6065     for (int i = lane_count - 1; i >= 0; i--) {
   6066       dst.SetFloat(i + lane_count,
   6067                    FPToFloat(src.Float<double>(i), FPTieEven, ReadDN()));
   6068     }
   6069   }
   6070   return dst;
   6071 }
   6072 
   6073 
   6074 LogicVRegister Simulator::fcvtxn(VectorFormat vform,
   6075                                  LogicVRegister dst,
   6076                                  const LogicVRegister& src) {
   6077   SimVRegister tmp;
   6078   LogicVRegister srctmp = mov(kFormat2D, tmp, src);
   6079   int input_lane_count = LaneCountFromFormat(vform);
   6080   if (IsSVEFormat(vform)) {
   6081     mov(kFormatVnB, tmp, src);
   6082     input_lane_count /= 2;
   6083   }
   6084 
   6085   dst.ClearForWrite(vform);
   6086   VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kSRegSize);
   6087 
   6088   for (int i = 0; i < input_lane_count; i++) {
   6089     dst.SetFloat(i, FPToFloat(srctmp.Float<double>(i), FPRoundOdd, ReadDN()));
   6090   }
   6091   return dst;
   6092 }
   6093 
   6094 
   6095 LogicVRegister Simulator::fcvtxn2(VectorFormat vform,
   6096                                   LogicVRegister dst,
   6097                                   const LogicVRegister& src) {
   6098   VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kSRegSize);
   6099   dst.ClearForWrite(vform);
   6100   int lane_count = LaneCountFromFormat(vform) / 2;
   6101   for (int i = lane_count - 1; i >= 0; i--) {
   6102     dst.SetFloat(i + lane_count,
   6103                  FPToFloat(src.Float<double>(i), FPRoundOdd, ReadDN()));
   6104   }
   6105   return dst;
   6106 }
   6107 
   6108 
   6109 // Based on reference C function recip_sqrt_estimate from ARM ARM.
   6110 double Simulator::recip_sqrt_estimate(double a) {
   6111   int quot0, quot1, s;
   6112   double r;
   6113   if (a < 0.5) {
   6114     quot0 = static_cast<int>(a * 512.0);
   6115     r = 1.0 / sqrt((static_cast<double>(quot0) + 0.5) / 512.0);
   6116   } else {
   6117     quot1 = static_cast<int>(a * 256.0);
   6118     r = 1.0 / sqrt((static_cast<double>(quot1) + 0.5) / 256.0);
   6119   }
   6120   s = static_cast<int>(256.0 * r + 0.5);
   6121   return static_cast<double>(s) / 256.0;
   6122 }
   6123 
   6124 
   6125 static inline uint64_t Bits(uint64_t val, int start_bit, int end_bit) {
   6126   return ExtractUnsignedBitfield64(start_bit, end_bit, val);
   6127 }
   6128 
   6129 
   6130 template <typename T>
   6131 T Simulator::FPRecipSqrtEstimate(T op) {
   6132   if (IsNaN(op)) {
   6133     return FPProcessNaN(op);
   6134   } else if (op == 0.0) {
   6135     if (copysign(1.0, op) < 0.0) {
   6136       return kFP64NegativeInfinity;
   6137     } else {
   6138       return kFP64PositiveInfinity;
   6139     }
   6140   } else if (copysign(1.0, op) < 0.0) {
   6141     FPProcessException();
   6142     return FPDefaultNaN<T>();
   6143   } else if (IsInf(op)) {
   6144     return 0.0;
   6145   } else {
   6146     uint64_t fraction;
   6147     int exp, result_exp;
   6148 
   6149     if (IsFloat16<T>()) {
   6150       exp = Float16Exp(op);
   6151       fraction = Float16Mantissa(op);
   6152       fraction <<= 42;
   6153     } else if (IsFloat32<T>()) {
   6154       exp = FloatExp(op);
   6155       fraction = FloatMantissa(op);
   6156       fraction <<= 29;
   6157     } else {
   6158       VIXL_ASSERT(IsFloat64<T>());
   6159       exp = DoubleExp(op);
   6160       fraction = DoubleMantissa(op);
   6161     }
   6162 
   6163     if (exp == 0) {
   6164       while (Bits(fraction, 51, 51) == 0) {
   6165         fraction = Bits(fraction, 50, 0) << 1;
   6166         exp -= 1;
   6167       }
   6168       fraction = Bits(fraction, 50, 0) << 1;
   6169     }
   6170 
   6171     double scaled;
   6172     if (Bits(exp, 0, 0) == 0) {
   6173       scaled = DoublePack(0, 1022, Bits(fraction, 51, 44) << 44);
   6174     } else {
   6175       scaled = DoublePack(0, 1021, Bits(fraction, 51, 44) << 44);
   6176     }
   6177 
   6178     if (IsFloat16<T>()) {
   6179       result_exp = (44 - exp) / 2;
   6180     } else if (IsFloat32<T>()) {
   6181       result_exp = (380 - exp) / 2;
   6182     } else {
   6183       VIXL_ASSERT(IsFloat64<T>());
   6184       result_exp = (3068 - exp) / 2;
   6185     }
   6186 
   6187     uint64_t estimate = DoubleToRawbits(recip_sqrt_estimate(scaled));
   6188 
   6189     if (IsFloat16<T>()) {
   6190       uint16_t exp_bits = static_cast<uint16_t>(Bits(result_exp, 4, 0));
   6191       uint16_t est_bits = static_cast<uint16_t>(Bits(estimate, 51, 42));
   6192       return Float16Pack(0, exp_bits, est_bits);
   6193     } else if (IsFloat32<T>()) {
   6194       uint32_t exp_bits = static_cast<uint32_t>(Bits(result_exp, 7, 0));
   6195       uint32_t est_bits = static_cast<uint32_t>(Bits(estimate, 51, 29));
   6196       return FloatPack(0, exp_bits, est_bits);
   6197     } else {
   6198       VIXL_ASSERT(IsFloat64<T>());
   6199       return DoublePack(0, Bits(result_exp, 10, 0), Bits(estimate, 51, 0));
   6200     }
   6201   }
   6202 }
   6203 
   6204 
   6205 LogicVRegister Simulator::frsqrte(VectorFormat vform,
   6206                                   LogicVRegister dst,
   6207                                   const LogicVRegister& src) {
   6208   dst.ClearForWrite(vform);
   6209   if (LaneSizeInBitsFromFormat(vform) == kHRegSize) {
   6210     for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   6211       SimFloat16 input = src.Float<SimFloat16>(i);
   6212       dst.SetFloat(vform, i, FPRecipSqrtEstimate<SimFloat16>(input));
   6213     }
   6214   } else if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
   6215     for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   6216       float input = src.Float<float>(i);
   6217       dst.SetFloat(vform, i, FPRecipSqrtEstimate<float>(input));
   6218     }
   6219   } else {
   6220     VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kDRegSize);
   6221     for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   6222       double input = src.Float<double>(i);
   6223       dst.SetFloat(vform, i, FPRecipSqrtEstimate<double>(input));
   6224     }
   6225   }
   6226   return dst;
   6227 }
   6228 
   6229 template <typename T>
   6230 T Simulator::FPRecipEstimate(T op, FPRounding rounding) {
   6231   uint32_t sign;
   6232 
   6233   if (IsFloat16<T>()) {
   6234     sign = Float16Sign(op);
   6235   } else if (IsFloat32<T>()) {
   6236     sign = FloatSign(op);
   6237   } else {
   6238     VIXL_ASSERT(IsFloat64<T>());
   6239     sign = DoubleSign(op);
   6240   }
   6241 
   6242   if (IsNaN(op)) {
   6243     return FPProcessNaN(op);
   6244   } else if (IsInf(op)) {
   6245     return (sign == 1) ? -0.0 : 0.0;
   6246   } else if (op == 0.0) {
   6247     FPProcessException();  // FPExc_DivideByZero exception.
   6248     return (sign == 1) ? kFP64NegativeInfinity : kFP64PositiveInfinity;
   6249   } else if ((IsFloat16<T>() && (std::fabs(op) < std::pow(2.0, -16.0))) ||
   6250              (IsFloat32<T>() && (std::fabs(op) < std::pow(2.0, -128.0))) ||
   6251              (IsFloat64<T>() && (std::fabs(op) < std::pow(2.0, -1024.0)))) {
   6252     bool overflow_to_inf = false;
   6253     switch (rounding) {
   6254       case FPTieEven:
   6255         overflow_to_inf = true;
   6256         break;
   6257       case FPPositiveInfinity:
   6258         overflow_to_inf = (sign == 0);
   6259         break;
   6260       case FPNegativeInfinity:
   6261         overflow_to_inf = (sign == 1);
   6262         break;
   6263       case FPZero:
   6264         overflow_to_inf = false;
   6265         break;
   6266       default:
   6267         break;
   6268     }
   6269     FPProcessException();  // FPExc_Overflow and FPExc_Inexact.
   6270     if (overflow_to_inf) {
   6271       return (sign == 1) ? kFP64NegativeInfinity : kFP64PositiveInfinity;
   6272     } else {
   6273       // Return FPMaxNormal(sign).
   6274       if (IsFloat16<T>()) {
   6275         return Float16Pack(sign, 0x1f, 0x3ff);
   6276       } else if (IsFloat32<T>()) {
   6277         return FloatPack(sign, 0xfe, 0x07fffff);
   6278       } else {
   6279         VIXL_ASSERT(IsFloat64<T>());
   6280         return DoublePack(sign, 0x7fe, 0x0fffffffffffffl);
   6281       }
   6282     }
   6283   } else {
   6284     uint64_t fraction;
   6285     int exp, result_exp;
   6286 
   6287     if (IsFloat16<T>()) {
   6288       sign = Float16Sign(op);
   6289       exp = Float16Exp(op);
   6290       fraction = Float16Mantissa(op);
   6291       fraction <<= 42;
   6292     } else if (IsFloat32<T>()) {
   6293       sign = FloatSign(op);
   6294       exp = FloatExp(op);
   6295       fraction = FloatMantissa(op);
   6296       fraction <<= 29;
   6297     } else {
   6298       VIXL_ASSERT(IsFloat64<T>());
   6299       sign = DoubleSign(op);
   6300       exp = DoubleExp(op);
   6301       fraction = DoubleMantissa(op);
   6302     }
   6303 
   6304     if (exp == 0) {
   6305       if (Bits(fraction, 51, 51) == 0) {
   6306         exp -= 1;
   6307         fraction = Bits(fraction, 49, 0) << 2;
   6308       } else {
   6309         fraction = Bits(fraction, 50, 0) << 1;
   6310       }
   6311     }
   6312 
   6313     double scaled = DoublePack(0, 1022, Bits(fraction, 51, 44) << 44);
   6314 
   6315     if (IsFloat16<T>()) {
   6316       result_exp = (29 - exp);  // In range 29-30 = -1 to 29+1 = 30.
   6317     } else if (IsFloat32<T>()) {
   6318       result_exp = (253 - exp);  // In range 253-254 = -1 to 253+1 = 254.
   6319     } else {
   6320       VIXL_ASSERT(IsFloat64<T>());
   6321       result_exp = (2045 - exp);  // In range 2045-2046 = -1 to 2045+1 = 2046.
   6322     }
   6323 
   6324     double estimate = recip_estimate(scaled);
   6325 
   6326     fraction = DoubleMantissa(estimate);
   6327     if (result_exp == 0) {
   6328       fraction = (UINT64_C(1) << 51) | Bits(fraction, 51, 1);
   6329     } else if (result_exp == -1) {
   6330       fraction = (UINT64_C(1) << 50) | Bits(fraction, 51, 2);
   6331       result_exp = 0;
   6332     }
   6333     if (IsFloat16<T>()) {
   6334       uint16_t exp_bits = static_cast<uint16_t>(Bits(result_exp, 4, 0));
   6335       uint16_t frac_bits = static_cast<uint16_t>(Bits(fraction, 51, 42));
   6336       return Float16Pack(sign, exp_bits, frac_bits);
   6337     } else if (IsFloat32<T>()) {
   6338       uint32_t exp_bits = static_cast<uint32_t>(Bits(result_exp, 7, 0));
   6339       uint32_t frac_bits = static_cast<uint32_t>(Bits(fraction, 51, 29));
   6340       return FloatPack(sign, exp_bits, frac_bits);
   6341     } else {
   6342       VIXL_ASSERT(IsFloat64<T>());
   6343       return DoublePack(sign, Bits(result_exp, 10, 0), Bits(fraction, 51, 0));
   6344     }
   6345   }
   6346 }
   6347 
   6348 
   6349 LogicVRegister Simulator::frecpe(VectorFormat vform,
   6350                                  LogicVRegister dst,
   6351                                  const LogicVRegister& src,
   6352                                  FPRounding round) {
   6353   dst.ClearForWrite(vform);
   6354   if (LaneSizeInBitsFromFormat(vform) == kHRegSize) {
   6355     for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   6356       SimFloat16 input = src.Float<SimFloat16>(i);
   6357       dst.SetFloat(vform, i, FPRecipEstimate<SimFloat16>(input, round));
   6358     }
   6359   } else if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
   6360     for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   6361       float input = src.Float<float>(i);
   6362       dst.SetFloat(vform, i, FPRecipEstimate<float>(input, round));
   6363     }
   6364   } else {
   6365     VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kDRegSize);
   6366     for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   6367       double input = src.Float<double>(i);
   6368       dst.SetFloat(vform, i, FPRecipEstimate<double>(input, round));
   6369     }
   6370   }
   6371   return dst;
   6372 }
   6373 
   6374 
   6375 LogicVRegister Simulator::ursqrte(VectorFormat vform,
   6376                                   LogicVRegister dst,
   6377                                   const LogicVRegister& src) {
   6378   dst.ClearForWrite(vform);
   6379   uint64_t operand;
   6380   uint32_t result;
   6381   double dp_operand, dp_result;
   6382   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   6383     operand = src.Uint(vform, i);
   6384     if (operand <= 0x3FFFFFFF) {
   6385       result = 0xFFFFFFFF;
   6386     } else {
   6387       dp_operand = operand * std::pow(2.0, -32);
   6388       dp_result = recip_sqrt_estimate(dp_operand) * std::pow(2.0, 31);
   6389       result = static_cast<uint32_t>(dp_result);
   6390     }
   6391     dst.SetUint(vform, i, result);
   6392   }
   6393   return dst;
   6394 }
   6395 
   6396 
   6397 // Based on reference C function recip_estimate from ARM ARM.
   6398 double Simulator::recip_estimate(double a) {
   6399   int q, s;
   6400   double r;
   6401   q = static_cast<int>(a * 512.0);
   6402   r = 1.0 / ((static_cast<double>(q) + 0.5) / 512.0);
   6403   s = static_cast<int>(256.0 * r + 0.5);
   6404   return static_cast<double>(s) / 256.0;
   6405 }
   6406 
   6407 
   6408 LogicVRegister Simulator::urecpe(VectorFormat vform,
   6409                                  LogicVRegister dst,
   6410                                  const LogicVRegister& src) {
   6411   dst.ClearForWrite(vform);
   6412   uint64_t operand;
   6413   uint32_t result;
   6414   double dp_operand, dp_result;
   6415   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   6416     operand = src.Uint(vform, i);
   6417     if (operand <= 0x7FFFFFFF) {
   6418       result = 0xFFFFFFFF;
   6419     } else {
   6420       dp_operand = operand * std::pow(2.0, -32);
   6421       dp_result = recip_estimate(dp_operand) * std::pow(2.0, 31);
   6422       result = static_cast<uint32_t>(dp_result);
   6423     }
   6424     dst.SetUint(vform, i, result);
   6425   }
   6426   return dst;
   6427 }
   6428 
   6429 LogicPRegister Simulator::pfalse(LogicPRegister dst) {
   6430   dst.Clear();
   6431   return dst;
   6432 }
   6433 
   6434 LogicPRegister Simulator::pfirst(LogicPRegister dst,
   6435                                  const LogicPRegister& pg,
   6436                                  const LogicPRegister& src) {
   6437   int first_pg = GetFirstActive(kFormatVnB, pg);
   6438   VIXL_ASSERT(first_pg < LaneCountFromFormat(kFormatVnB));
   6439   mov(dst, src);
   6440   if (first_pg >= 0) dst.SetActive(kFormatVnB, first_pg, true);
   6441   return dst;
   6442 }
   6443 
   6444 LogicPRegister Simulator::ptrue(VectorFormat vform,
   6445                                 LogicPRegister dst,
   6446                                 int pattern) {
   6447   int count = GetPredicateConstraintLaneCount(vform, pattern);
   6448   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   6449     dst.SetActive(vform, i, i < count);
   6450   }
   6451   return dst;
   6452 }
   6453 
   6454 LogicPRegister Simulator::pnext(VectorFormat vform,
   6455                                 LogicPRegister dst,
   6456                                 const LogicPRegister& pg,
   6457                                 const LogicPRegister& src) {
   6458   int next = GetLastActive(vform, src) + 1;
   6459   while (next < LaneCountFromFormat(vform)) {
   6460     if (pg.IsActive(vform, next)) break;
   6461     next++;
   6462   }
   6463 
   6464   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   6465     dst.SetActive(vform, i, (i == next));
   6466   }
   6467   return dst;
   6468 }
   6469 
   6470 template <typename T>
   6471 LogicVRegister Simulator::frecpx(VectorFormat vform,
   6472                                  LogicVRegister dst,
   6473                                  const LogicVRegister& src) {
   6474   dst.ClearForWrite(vform);
   6475   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   6476     T op = src.Float<T>(i);
   6477     T result;
   6478     if (IsNaN(op)) {
   6479       result = FPProcessNaN(op);
   6480     } else {
   6481       int exp;
   6482       uint32_t sign;
   6483       if (IsFloat16<T>()) {
   6484         sign = Float16Sign(op);
   6485         exp = Float16Exp(op);
   6486         exp = (exp == 0) ? (0x1F - 1) : static_cast<int>(Bits(~exp, 4, 0));
   6487         result = Float16Pack(sign, exp, 0);
   6488       } else if (IsFloat32<T>()) {
   6489         sign = FloatSign(op);
   6490         exp = FloatExp(op);
   6491         exp = (exp == 0) ? (0xFF - 1) : static_cast<int>(Bits(~exp, 7, 0));
   6492         result = FloatPack(sign, exp, 0);
   6493       } else {
   6494         VIXL_ASSERT(IsFloat64<T>());
   6495         sign = DoubleSign(op);
   6496         exp = DoubleExp(op);
   6497         exp = (exp == 0) ? (0x7FF - 1) : static_cast<int>(Bits(~exp, 10, 0));
   6498         result = DoublePack(sign, exp, 0);
   6499       }
   6500     }
   6501     dst.SetFloat(i, result);
   6502   }
   6503   return dst;
   6504 }
   6505 
   6506 
   6507 LogicVRegister Simulator::frecpx(VectorFormat vform,
   6508                                  LogicVRegister dst,
   6509                                  const LogicVRegister& src) {
   6510   if (LaneSizeInBitsFromFormat(vform) == kHRegSize) {
   6511     frecpx<SimFloat16>(vform, dst, src);
   6512   } else if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
   6513     frecpx<float>(vform, dst, src);
   6514   } else {
   6515     VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kDRegSize);
   6516     frecpx<double>(vform, dst, src);
   6517   }
   6518   return dst;
   6519 }
   6520 
   6521 LogicVRegister Simulator::flogb(VectorFormat vform,
   6522                                 LogicVRegister dst,
   6523                                 const LogicVRegister& src) {
   6524   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   6525     double op = 0.0;
   6526     switch (vform) {
   6527       case kFormatVnH:
   6528         op = FPToDouble(src.Float<SimFloat16>(i), kIgnoreDefaultNaN);
   6529         break;
   6530       case kFormatVnS:
   6531         op = src.Float<float>(i);
   6532         break;
   6533       case kFormatVnD:
   6534         op = src.Float<double>(i);
   6535         break;
   6536       default:
   6537         VIXL_UNREACHABLE();
   6538     }
   6539 
   6540     switch (std::fpclassify(op)) {
   6541       case FP_INFINITE:
   6542         dst.SetInt(vform, i, MaxIntFromFormat(vform));
   6543         break;
   6544       case FP_NAN:
   6545       case FP_ZERO:
   6546         dst.SetInt(vform, i, MinIntFromFormat(vform));
   6547         break;
   6548       case FP_SUBNORMAL: {
   6549         // DoubleMantissa returns the mantissa of its input, leaving 12 zero
   6550         // bits where the sign and exponent would be. We subtract 12 to
   6551         // find the number of leading zero bits in the mantissa itself.
   6552         int64_t mant_zero_count = CountLeadingZeros(DoubleMantissa(op)) - 12;
   6553         // Log2 of a subnormal is the lowest exponent a normal number can
   6554         // represent, together with the zeros in the mantissa.
   6555         dst.SetInt(vform, i, -1023 - mant_zero_count);
   6556         break;
   6557       }
   6558       case FP_NORMAL:
   6559         // Log2 of a normal number is the exponent minus the bias.
   6560         dst.SetInt(vform, i, static_cast<int64_t>(DoubleExp(op)) - 1023);
   6561         break;
   6562     }
   6563   }
   6564   return dst;
   6565 }
   6566 
   6567 LogicVRegister Simulator::ftsmul(VectorFormat vform,
   6568                                  LogicVRegister dst,
   6569                                  const LogicVRegister& src1,
   6570                                  const LogicVRegister& src2) {
   6571   SimVRegister maybe_neg_src1;
   6572 
   6573   // The bottom bit of src2 controls the sign of the result. Use it to
   6574   // conditionally invert the sign of one `fmul` operand.
   6575   shl(vform, maybe_neg_src1, src2, LaneSizeInBitsFromFormat(vform) - 1);
   6576   eor(vform, maybe_neg_src1, maybe_neg_src1, src1);
   6577 
   6578   // Multiply src1 by the modified neg_src1, which is potentially its negation.
   6579   // In the case of NaNs, NaN * -NaN will return the first NaN intact, so src1,
   6580   // rather than neg_src1, must be the first source argument.
   6581   fmul(vform, dst, src1, maybe_neg_src1);
   6582 
   6583   return dst;
   6584 }
   6585 
   6586 LogicVRegister Simulator::ftssel(VectorFormat vform,
   6587                                  LogicVRegister dst,
   6588                                  const LogicVRegister& src1,
   6589                                  const LogicVRegister& src2) {
   6590   unsigned lane_bits = LaneSizeInBitsFromFormat(vform);
   6591   uint64_t sign_bit = UINT64_C(1) << (lane_bits - 1);
   6592   uint64_t one;
   6593 
   6594   if (lane_bits == kHRegSize) {
   6595     one = Float16ToRawbits(Float16(1.0));
   6596   } else if (lane_bits == kSRegSize) {
   6597     one = FloatToRawbits(1.0);
   6598   } else {
   6599     VIXL_ASSERT(lane_bits == kDRegSize);
   6600     one = DoubleToRawbits(1.0);
   6601   }
   6602 
   6603   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   6604     // Use integer accessors for this operation, as this is a data manipulation
   6605     // task requiring no calculation.
   6606     uint64_t op = src1.Uint(vform, i);
   6607 
   6608     // Only the bottom two bits of the src2 register are significant, indicating
   6609     // the quadrant. Bit 0 controls whether src1 or 1.0 is written to dst. Bit 1
   6610     // determines the sign of the value written to dst.
   6611     uint64_t q = src2.Uint(vform, i);
   6612     if ((q & 1) == 1) op = one;
   6613     if ((q & 2) == 2) op ^= sign_bit;
   6614 
   6615     dst.SetUint(vform, i, op);
   6616   }
   6617 
   6618   return dst;
   6619 }
   6620 
   6621 template <typename T>
   6622 LogicVRegister Simulator::FTMaddHelper(VectorFormat vform,
   6623                                        LogicVRegister dst,
   6624                                        const LogicVRegister& src1,
   6625                                        const LogicVRegister& src2,
   6626                                        uint64_t coeff_pos,
   6627                                        uint64_t coeff_neg) {
   6628   SimVRegister zero;
   6629   dup_immediate(kFormatVnB, zero, 0);
   6630 
   6631   SimVRegister cf;
   6632   SimVRegister cfn;
   6633   dup_immediate(vform, cf, coeff_pos);
   6634   dup_immediate(vform, cfn, coeff_neg);
   6635 
   6636   // The specification requires testing the top bit of the raw value, rather
   6637   // than the sign of the floating point number, so use an integer comparison
   6638   // here.
   6639   SimPRegister is_neg;
   6640   SVEIntCompareVectorsHelper(lt,
   6641                              vform,
   6642                              is_neg,
   6643                              GetPTrue(),
   6644                              src2,
   6645                              zero,
   6646                              false,
   6647                              LeaveFlags);
   6648   mov_merging(vform, cf, is_neg, cfn);
   6649 
   6650   SimVRegister temp;
   6651   fabs_<T>(vform, temp, src2);
   6652   fmla<T>(vform, cf, cf, src1, temp);
   6653   mov(vform, dst, cf);
   6654   return dst;
   6655 }
   6656 
   6657 
   6658 LogicVRegister Simulator::ftmad(VectorFormat vform,
   6659                                 LogicVRegister dst,
   6660                                 const LogicVRegister& src1,
   6661                                 const LogicVRegister& src2,
   6662                                 unsigned index) {
   6663   static const uint64_t ftmad_coeff16[] = {0x3c00,
   6664                                            0xb155,
   6665                                            0x2030,
   6666                                            0x0000,
   6667                                            0x0000,
   6668                                            0x0000,
   6669                                            0x0000,
   6670                                            0x0000,
   6671                                            0x3c00,
   6672                                            0xb800,
   6673                                            0x293a,
   6674                                            0x0000,
   6675                                            0x0000,
   6676                                            0x0000,
   6677                                            0x0000,
   6678                                            0x0000};
   6679 
   6680   static const uint64_t ftmad_coeff32[] = {0x3f800000,
   6681                                            0xbe2aaaab,
   6682                                            0x3c088886,
   6683                                            0xb95008b9,
   6684                                            0x36369d6d,
   6685                                            0x00000000,
   6686                                            0x00000000,
   6687                                            0x00000000,
   6688                                            0x3f800000,
   6689                                            0xbf000000,
   6690                                            0x3d2aaaa6,
   6691                                            0xbab60705,
   6692                                            0x37cd37cc,
   6693                                            0x00000000,
   6694                                            0x00000000,
   6695                                            0x00000000};
   6696 
   6697   static const uint64_t ftmad_coeff64[] = {0x3ff0000000000000,
   6698                                            0xbfc5555555555543,
   6699                                            0x3f8111111110f30c,
   6700                                            0xbf2a01a019b92fc6,
   6701                                            0x3ec71de351f3d22b,
   6702                                            0xbe5ae5e2b60f7b91,
   6703                                            0x3de5d8408868552f,
   6704                                            0x0000000000000000,
   6705                                            0x3ff0000000000000,
   6706                                            0xbfe0000000000000,
   6707                                            0x3fa5555555555536,
   6708                                            0xbf56c16c16c13a0b,
   6709                                            0x3efa01a019b1e8d8,
   6710                                            0xbe927e4f7282f468,
   6711                                            0x3e21ee96d2641b13,
   6712                                            0xbda8f76380fbb401};
   6713   VIXL_ASSERT((index + 8) < ArrayLength(ftmad_coeff64));
   6714   VIXL_ASSERT(ArrayLength(ftmad_coeff16) == ArrayLength(ftmad_coeff64));
   6715   VIXL_ASSERT(ArrayLength(ftmad_coeff32) == ArrayLength(ftmad_coeff64));
   6716 
   6717   if (LaneSizeInBitsFromFormat(vform) == kHRegSize) {
   6718     FTMaddHelper<SimFloat16>(vform,
   6719                              dst,
   6720                              src1,
   6721                              src2,
   6722                              ftmad_coeff16[index],
   6723                              ftmad_coeff16[index + 8]);
   6724   } else if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
   6725     FTMaddHelper<float>(vform,
   6726                         dst,
   6727                         src1,
   6728                         src2,
   6729                         ftmad_coeff32[index],
   6730                         ftmad_coeff32[index + 8]);
   6731   } else {
   6732     VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kDRegSize);
   6733     FTMaddHelper<double>(vform,
   6734                          dst,
   6735                          src1,
   6736                          src2,
   6737                          ftmad_coeff64[index],
   6738                          ftmad_coeff64[index + 8]);
   6739   }
   6740   return dst;
   6741 }
   6742 
   6743 LogicVRegister Simulator::fexpa(VectorFormat vform,
   6744                                 LogicVRegister dst,
   6745                                 const LogicVRegister& src) {
   6746   static const uint64_t fexpa_coeff16[] = {0x0000, 0x0016, 0x002d, 0x0045,
   6747                                            0x005d, 0x0075, 0x008e, 0x00a8,
   6748                                            0x00c2, 0x00dc, 0x00f8, 0x0114,
   6749                                            0x0130, 0x014d, 0x016b, 0x0189,
   6750                                            0x01a8, 0x01c8, 0x01e8, 0x0209,
   6751                                            0x022b, 0x024e, 0x0271, 0x0295,
   6752                                            0x02ba, 0x02e0, 0x0306, 0x032e,
   6753                                            0x0356, 0x037f, 0x03a9, 0x03d4};
   6754 
   6755   static const uint64_t fexpa_coeff32[] =
   6756       {0x000000, 0x0164d2, 0x02cd87, 0x043a29, 0x05aac3, 0x071f62, 0x08980f,
   6757        0x0a14d5, 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc, 0x11c3d3, 0x135a2b,
   6758        0x14f4f0, 0x16942d, 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda, 0x1ef532,
   6759        0x20b051, 0x227043, 0x243516, 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
   6760        0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4, 0x3504f3, 0x36fd92, 0x38fbaf,
   6761        0x3aff5b, 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd, 0x45672a, 0x478d75,
   6762        0x49b9be, 0x4bec15, 0x4e248c, 0x506334, 0x52a81e, 0x54f35b, 0x5744fd,
   6763        0x599d16, 0x5bfbb8, 0x5e60f5, 0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
   6764        0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177, 0x75257d, 0x77d0df, 0x7a83b3,
   6765        0x7d3e0c};
   6766 
   6767   static const uint64_t fexpa_coeff64[] =
   6768       {0X0000000000000, 0X02c9a3e778061, 0X059b0d3158574, 0X0874518759bc8,
   6769        0X0b5586cf9890f, 0X0e3ec32d3d1a2, 0X11301d0125b51, 0X1429aaea92de0,
   6770        0X172b83c7d517b, 0X1a35beb6fcb75, 0X1d4873168b9aa, 0X2063b88628cd6,
   6771        0X2387a6e756238, 0X26b4565e27cdd, 0X29e9df51fdee1, 0X2d285a6e4030b,
   6772        0X306fe0a31b715, 0X33c08b26416ff, 0X371a7373aa9cb, 0X3a7db34e59ff7,
   6773        0X3dea64c123422, 0X4160a21f72e2a, 0X44e086061892d, 0X486a2b5c13cd0,
   6774        0X4bfdad5362a27, 0X4f9b2769d2ca7, 0X5342b569d4f82, 0X56f4736b527da,
   6775        0X5ab07dd485429, 0X5e76f15ad2148, 0X6247eb03a5585, 0X6623882552225,
   6776        0X6a09e667f3bcd, 0X6dfb23c651a2f, 0X71f75e8ec5f74, 0X75feb564267c9,
   6777        0X7a11473eb0187, 0X7e2f336cf4e62, 0X82589994cce13, 0X868d99b4492ed,
   6778        0X8ace5422aa0db, 0X8f1ae99157736, 0X93737b0cdc5e5, 0X97d829fde4e50,
   6779        0X9c49182a3f090, 0Xa0c667b5de565, 0Xa5503b23e255d, 0Xa9e6b5579fdbf,
   6780        0Xae89f995ad3ad, 0Xb33a2b84f15fb, 0Xb7f76f2fb5e47, 0Xbcc1e904bc1d2,
   6781        0Xc199bdd85529c, 0Xc67f12e57d14b, 0Xcb720dcef9069, 0Xd072d4a07897c,
   6782        0Xd5818dcfba487, 0Xda9e603db3285, 0Xdfc97337b9b5f, 0Xe502ee78b3ff6,
   6783        0Xea4afa2a490da, 0Xefa1bee615a27, 0Xf50765b6e4540, 0Xfa7c1819e90d8};
   6784 
   6785   unsigned lane_size = LaneSizeInBitsFromFormat(vform);
   6786   int index_highbit = 5;
   6787   int op_highbit, op_shift;
   6788   const uint64_t* fexpa_coeff;
   6789 
   6790   if (lane_size == kHRegSize) {
   6791     index_highbit = 4;
   6792     VIXL_ASSERT(ArrayLength(fexpa_coeff16) == (1U << (index_highbit + 1)));
   6793     fexpa_coeff = fexpa_coeff16;
   6794     op_highbit = 9;
   6795     op_shift = 10;
   6796   } else if (lane_size == kSRegSize) {
   6797     VIXL_ASSERT(ArrayLength(fexpa_coeff32) == (1U << (index_highbit + 1)));
   6798     fexpa_coeff = fexpa_coeff32;
   6799     op_highbit = 13;
   6800     op_shift = 23;
   6801   } else {
   6802     VIXL_ASSERT(lane_size == kDRegSize);
   6803     VIXL_ASSERT(ArrayLength(fexpa_coeff64) == (1U << (index_highbit + 1)));
   6804     fexpa_coeff = fexpa_coeff64;
   6805     op_highbit = 16;
   6806     op_shift = 52;
   6807   }
   6808 
   6809   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   6810     uint64_t op = src.Uint(vform, i);
   6811     uint64_t result = fexpa_coeff[Bits(op, index_highbit, 0)];
   6812     result |= (Bits(op, op_highbit, index_highbit + 1) << op_shift);
   6813     dst.SetUint(vform, i, result);
   6814   }
   6815   return dst;
   6816 }
   6817 
   6818 template <typename T>
   6819 LogicVRegister Simulator::fscale(VectorFormat vform,
   6820                                  LogicVRegister dst,
   6821                                  const LogicVRegister& src1,
   6822                                  const LogicVRegister& src2) {
   6823   T two = T(2.0);
   6824   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   6825     T src1_val = src1.Float<T>(i);
   6826     if (!IsNaN(src1_val)) {
   6827       int64_t scale = src2.Int(vform, i);
   6828       // TODO: this is a low-performance implementation, but it's simple and
   6829       // less likely to be buggy. Consider replacing it with something faster.
   6830 
   6831       // Scales outside of these bounds become infinity or zero, so there's no
   6832       // point iterating further.
   6833       scale = std::min<int64_t>(std::max<int64_t>(scale, -2048), 2048);
   6834 
   6835       // Compute src1_val * 2 ^ scale. If scale is positive, multiply by two and
   6836       // decrement scale until it's zero.
   6837       while (scale-- > 0) {
   6838         src1_val = FPMul(src1_val, two);
   6839       }
   6840 
   6841       // If scale is negative, divide by two and increment scale until it's
   6842       // zero. Initially, scale is (src2 - 1), so we pre-increment.
   6843       while (++scale < 0) {
   6844         src1_val = FPDiv(src1_val, two);
   6845       }
   6846     }
   6847     dst.SetFloat<T>(i, src1_val);
   6848   }
   6849   return dst;
   6850 }
   6851 
   6852 LogicVRegister Simulator::fscale(VectorFormat vform,
   6853                                  LogicVRegister dst,
   6854                                  const LogicVRegister& src1,
   6855                                  const LogicVRegister& src2) {
   6856   if (LaneSizeInBitsFromFormat(vform) == kHRegSize) {
   6857     fscale<SimFloat16>(vform, dst, src1, src2);
   6858   } else if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
   6859     fscale<float>(vform, dst, src1, src2);
   6860   } else {
   6861     VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kDRegSize);
   6862     fscale<double>(vform, dst, src1, src2);
   6863   }
   6864   return dst;
   6865 }
   6866 
   6867 LogicVRegister Simulator::scvtf(VectorFormat vform,
   6868                                 unsigned dst_data_size_in_bits,
   6869                                 unsigned src_data_size_in_bits,
   6870                                 LogicVRegister dst,
   6871                                 const LogicPRegister& pg,
   6872                                 const LogicVRegister& src,
   6873                                 FPRounding round,
   6874                                 int fbits) {
   6875   VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) >= dst_data_size_in_bits);
   6876   VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) >= src_data_size_in_bits);
   6877   dst.ClearForWrite(vform);
   6878 
   6879   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   6880     if (!pg.IsActive(vform, i)) continue;
   6881 
   6882     int64_t value = ExtractSignedBitfield64(src_data_size_in_bits - 1,
   6883                                             0,
   6884                                             src.Uint(vform, i));
   6885 
   6886     switch (dst_data_size_in_bits) {
   6887       case kHRegSize: {
   6888         SimFloat16 result = FixedToFloat16(value, fbits, round);
   6889         dst.SetUint(vform, i, Float16ToRawbits(result));
   6890         break;
   6891       }
   6892       case kSRegSize: {
   6893         float result = FixedToFloat(value, fbits, round);
   6894         dst.SetUint(vform, i, FloatToRawbits(result));
   6895         break;
   6896       }
   6897       case kDRegSize: {
   6898         double result = FixedToDouble(value, fbits, round);
   6899         dst.SetUint(vform, i, DoubleToRawbits(result));
   6900         break;
   6901       }
   6902       default:
   6903         VIXL_UNIMPLEMENTED();
   6904         break;
   6905     }
   6906   }
   6907 
   6908   return dst;
   6909 }
   6910 
   6911 LogicVRegister Simulator::scvtf(VectorFormat vform,
   6912                                 LogicVRegister dst,
   6913                                 const LogicVRegister& src,
   6914                                 int fbits,
   6915                                 FPRounding round) {
   6916   return scvtf(vform,
   6917                LaneSizeInBitsFromFormat(vform),
   6918                LaneSizeInBitsFromFormat(vform),
   6919                dst,
   6920                GetPTrue(),
   6921                src,
   6922                round,
   6923                fbits);
   6924 }
   6925 
   6926 LogicVRegister Simulator::ucvtf(VectorFormat vform,
   6927                                 unsigned dst_data_size_in_bits,
   6928                                 unsigned src_data_size_in_bits,
   6929                                 LogicVRegister dst,
   6930                                 const LogicPRegister& pg,
   6931                                 const LogicVRegister& src,
   6932                                 FPRounding round,
   6933                                 int fbits) {
   6934   VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) >= dst_data_size_in_bits);
   6935   VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) >= src_data_size_in_bits);
   6936   dst.ClearForWrite(vform);
   6937 
   6938   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   6939     if (!pg.IsActive(vform, i)) continue;
   6940 
   6941     uint64_t value = ExtractUnsignedBitfield64(src_data_size_in_bits - 1,
   6942                                                0,
   6943                                                src.Uint(vform, i));
   6944 
   6945     switch (dst_data_size_in_bits) {
   6946       case kHRegSize: {
   6947         SimFloat16 result = UFixedToFloat16(value, fbits, round);
   6948         dst.SetUint(vform, i, Float16ToRawbits(result));
   6949         break;
   6950       }
   6951       case kSRegSize: {
   6952         float result = UFixedToFloat(value, fbits, round);
   6953         dst.SetUint(vform, i, FloatToRawbits(result));
   6954         break;
   6955       }
   6956       case kDRegSize: {
   6957         double result = UFixedToDouble(value, fbits, round);
   6958         dst.SetUint(vform, i, DoubleToRawbits(result));
   6959         break;
   6960       }
   6961       default:
   6962         VIXL_UNIMPLEMENTED();
   6963         break;
   6964     }
   6965   }
   6966 
   6967   return dst;
   6968 }
   6969 
   6970 LogicVRegister Simulator::ucvtf(VectorFormat vform,
   6971                                 LogicVRegister dst,
   6972                                 const LogicVRegister& src,
   6973                                 int fbits,
   6974                                 FPRounding round) {
   6975   return ucvtf(vform,
   6976                LaneSizeInBitsFromFormat(vform),
   6977                LaneSizeInBitsFromFormat(vform),
   6978                dst,
   6979                GetPTrue(),
   6980                src,
   6981                round,
   6982                fbits);
   6983 }
   6984 
   6985 LogicVRegister Simulator::unpk(VectorFormat vform,
   6986                                LogicVRegister dst,
   6987                                const LogicVRegister& src,
   6988                                UnpackType unpack_type,
   6989                                ExtendType extend_type) {
   6990   VectorFormat vform_half = VectorFormatHalfWidth(vform);
   6991   const int lane_count = LaneCountFromFormat(vform);
   6992   const int src_start_lane = (unpack_type == kLoHalf) ? 0 : lane_count;
   6993 
   6994   switch (extend_type) {
   6995     case kSignedExtend: {
   6996       int64_t result[kZRegMaxSizeInBytes];
   6997       for (int i = 0; i < lane_count; ++i) {
   6998         result[i] = src.Int(vform_half, i + src_start_lane);
   6999       }
   7000       for (int i = 0; i < lane_count; ++i) {
   7001         dst.SetInt(vform, i, result[i]);
   7002       }
   7003       break;
   7004     }
   7005     case kUnsignedExtend: {
   7006       uint64_t result[kZRegMaxSizeInBytes];
   7007       for (int i = 0; i < lane_count; ++i) {
   7008         result[i] = src.Uint(vform_half, i + src_start_lane);
   7009       }
   7010       for (int i = 0; i < lane_count; ++i) {
   7011         dst.SetUint(vform, i, result[i]);
   7012       }
   7013       break;
   7014     }
   7015     default:
   7016       VIXL_UNREACHABLE();
   7017   }
   7018   return dst;
   7019 }
   7020 
   7021 LogicPRegister Simulator::SVEIntCompareVectorsHelper(Condition cond,
   7022                                                      VectorFormat vform,
   7023                                                      LogicPRegister dst,
   7024                                                      const LogicPRegister& mask,
   7025                                                      const LogicVRegister& src1,
   7026                                                      const LogicVRegister& src2,
   7027                                                      bool is_wide_elements,
   7028                                                      FlagsUpdate flags) {
   7029   for (int lane = 0; lane < LaneCountFromFormat(vform); lane++) {
   7030     bool result = false;
   7031     if (mask.IsActive(vform, lane)) {
   7032       int64_t op1 = 0xbadbeef;
   7033       int64_t op2 = 0xbadbeef;
   7034       int d_lane = (lane * LaneSizeInBitsFromFormat(vform)) / kDRegSize;
   7035       switch (cond) {
   7036         case eq:
   7037         case ge:
   7038         case gt:
   7039         case lt:
   7040         case le:
   7041         case ne:
   7042           op1 = src1.Int(vform, lane);
   7043           op2 = is_wide_elements ? src2.Int(kFormatVnD, d_lane)
   7044                                  : src2.Int(vform, lane);
   7045           break;
   7046         case hi:
   7047         case hs:
   7048         case ls:
   7049         case lo:
   7050           op1 = src1.Uint(vform, lane);
   7051           op2 = is_wide_elements ? src2.Uint(kFormatVnD, d_lane)
   7052                                  : src2.Uint(vform, lane);
   7053           break;
   7054         default:
   7055           VIXL_UNREACHABLE();
   7056       }
   7057 
   7058       switch (cond) {
   7059         case eq:
   7060           result = (op1 == op2);
   7061           break;
   7062         case ne:
   7063           result = (op1 != op2);
   7064           break;
   7065         case ge:
   7066           result = (op1 >= op2);
   7067           break;
   7068         case gt:
   7069           result = (op1 > op2);
   7070           break;
   7071         case le:
   7072           result = (op1 <= op2);
   7073           break;
   7074         case lt:
   7075           result = (op1 < op2);
   7076           break;
   7077         case hs:
   7078           result = (static_cast<uint64_t>(op1) >= static_cast<uint64_t>(op2));
   7079           break;
   7080         case hi:
   7081           result = (static_cast<uint64_t>(op1) > static_cast<uint64_t>(op2));
   7082           break;
   7083         case ls:
   7084           result = (static_cast<uint64_t>(op1) <= static_cast<uint64_t>(op2));
   7085           break;
   7086         case lo:
   7087           result = (static_cast<uint64_t>(op1) < static_cast<uint64_t>(op2));
   7088           break;
   7089         default:
   7090           VIXL_UNREACHABLE();
   7091       }
   7092     }
   7093     dst.SetActive(vform, lane, result);
   7094   }
   7095 
   7096   if (flags == SetFlags) PredTest(vform, mask, dst);
   7097 
   7098   return dst;
   7099 }
   7100 
   7101 LogicVRegister Simulator::SVEBitwiseShiftHelper(Shift shift_op,
   7102                                                 VectorFormat vform,
   7103                                                 LogicVRegister dst,
   7104                                                 const LogicVRegister& src1,
   7105                                                 const LogicVRegister& src2,
   7106                                                 bool is_wide_elements) {
   7107   unsigned lane_size = LaneSizeInBitsFromFormat(vform);
   7108   VectorFormat shift_vform = is_wide_elements ? kFormatVnD : vform;
   7109 
   7110   for (int lane = 0; lane < LaneCountFromFormat(vform); lane++) {
   7111     int shift_src_lane = lane;
   7112     if (is_wide_elements) {
   7113       // If the shift amount comes from wide elements, select the D-sized lane
   7114       // which occupies the corresponding lanes of the value to be shifted.
   7115       shift_src_lane = (lane * lane_size) / kDRegSize;
   7116     }
   7117     uint64_t shift_amount = src2.Uint(shift_vform, shift_src_lane);
   7118 
   7119     // Saturate shift_amount to the size of the lane that will be shifted.
   7120     if (shift_amount > lane_size) shift_amount = lane_size;
   7121 
   7122     uint64_t value = src1.Uint(vform, lane);
   7123     int64_t result = ShiftOperand(lane_size,
   7124                                   value,
   7125                                   shift_op,
   7126                                   static_cast<unsigned>(shift_amount));
   7127     dst.SetUint(vform, lane, result);
   7128   }
   7129 
   7130   return dst;
   7131 }
   7132 
   7133 LogicVRegister Simulator::asrd(VectorFormat vform,
   7134                                LogicVRegister dst,
   7135                                const LogicVRegister& src1,
   7136                                int shift) {
   7137   VIXL_ASSERT((shift > 0) && (static_cast<unsigned>(shift) <=
   7138                               LaneSizeInBitsFromFormat(vform)));
   7139 
   7140   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   7141     int64_t value = src1.Int(vform, i);
   7142     if (shift <= 63) {
   7143       if (value < 0) {
   7144         // The max possible mask is 0x7fff'ffff'ffff'ffff, which can be safely
   7145         // cast to int64_t, and cannot cause signed overflow in the result.
   7146         value = value + GetUintMask(shift);
   7147       }
   7148       value = ShiftOperand(kDRegSize, value, ASR, shift);
   7149     } else {
   7150       value = 0;
   7151     }
   7152     dst.SetInt(vform, i, value);
   7153   }
   7154   return dst;
   7155 }
   7156 
   7157 LogicVRegister Simulator::SVEBitwiseLogicalUnpredicatedHelper(
   7158     LogicalOp logical_op,
   7159     VectorFormat vform,
   7160     LogicVRegister zd,
   7161     const LogicVRegister& zn,
   7162     const LogicVRegister& zm) {
   7163   VIXL_ASSERT(IsSVEFormat(vform));
   7164   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   7165     uint64_t op1 = zn.Uint(vform, i);
   7166     uint64_t op2 = zm.Uint(vform, i);
   7167     uint64_t result = 0;
   7168     switch (logical_op) {
   7169       case AND:
   7170         result = op1 & op2;
   7171         break;
   7172       case BIC:
   7173         result = op1 & ~op2;
   7174         break;
   7175       case EOR:
   7176         result = op1 ^ op2;
   7177         break;
   7178       case ORR:
   7179         result = op1 | op2;
   7180         break;
   7181       default:
   7182         VIXL_UNIMPLEMENTED();
   7183     }
   7184     zd.SetUint(vform, i, result);
   7185   }
   7186 
   7187   return zd;
   7188 }
   7189 
   7190 LogicPRegister Simulator::SVEPredicateLogicalHelper(SVEPredicateLogicalOp op,
   7191                                                     LogicPRegister pd,
   7192                                                     const LogicPRegister& pn,
   7193                                                     const LogicPRegister& pm) {
   7194   for (int i = 0; i < pn.GetChunkCount(); i++) {
   7195     LogicPRegister::ChunkType op1 = pn.GetChunk(i);
   7196     LogicPRegister::ChunkType op2 = pm.GetChunk(i);
   7197     LogicPRegister::ChunkType result = 0;
   7198     switch (op) {
   7199       case ANDS_p_p_pp_z:
   7200       case AND_p_p_pp_z:
   7201         result = op1 & op2;
   7202         break;
   7203       case BICS_p_p_pp_z:
   7204       case BIC_p_p_pp_z:
   7205         result = op1 & ~op2;
   7206         break;
   7207       case EORS_p_p_pp_z:
   7208       case EOR_p_p_pp_z:
   7209         result = op1 ^ op2;
   7210         break;
   7211       case NANDS_p_p_pp_z:
   7212       case NAND_p_p_pp_z:
   7213         result = ~(op1 & op2);
   7214         break;
   7215       case NORS_p_p_pp_z:
   7216       case NOR_p_p_pp_z:
   7217         result = ~(op1 | op2);
   7218         break;
   7219       case ORNS_p_p_pp_z:
   7220       case ORN_p_p_pp_z:
   7221         result = op1 | ~op2;
   7222         break;
   7223       case ORRS_p_p_pp_z:
   7224       case ORR_p_p_pp_z:
   7225         result = op1 | op2;
   7226         break;
   7227       default:
   7228         VIXL_UNIMPLEMENTED();
   7229     }
   7230     pd.SetChunk(i, result);
   7231   }
   7232   return pd;
   7233 }
   7234 
   7235 LogicVRegister Simulator::SVEBitwiseImmHelper(
   7236     SVEBitwiseLogicalWithImm_UnpredicatedOp op,
   7237     VectorFormat vform,
   7238     LogicVRegister zd,
   7239     uint64_t imm) {
   7240   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   7241     uint64_t op1 = zd.Uint(vform, i);
   7242     uint64_t result = 0;
   7243     switch (op) {
   7244       case AND_z_zi:
   7245         result = op1 & imm;
   7246         break;
   7247       case EOR_z_zi:
   7248         result = op1 ^ imm;
   7249         break;
   7250       case ORR_z_zi:
   7251         result = op1 | imm;
   7252         break;
   7253       default:
   7254         VIXL_UNIMPLEMENTED();
   7255     }
   7256     zd.SetUint(vform, i, result);
   7257   }
   7258 
   7259   return zd;
   7260 }
   7261 
   7262 void Simulator::SVEStructuredStoreHelper(VectorFormat vform,
   7263                                          const LogicPRegister& pg,
   7264                                          unsigned zt_code,
   7265                                          const LogicSVEAddressVector& addr) {
   7266   VIXL_ASSERT(zt_code < kNumberOfZRegisters);
   7267 
   7268   int esize_in_bytes_log2 = LaneSizeInBytesLog2FromFormat(vform);
   7269   int msize_in_bytes_log2 = addr.GetMsizeInBytesLog2();
   7270   int msize_in_bytes = addr.GetMsizeInBytes();
   7271   int reg_count = addr.GetRegCount();
   7272 
   7273   VIXL_ASSERT(esize_in_bytes_log2 >= msize_in_bytes_log2);
   7274   VIXL_ASSERT((reg_count >= 1) && (reg_count <= 4));
   7275 
   7276   unsigned zt_codes[4] = {zt_code,
   7277                           (zt_code + 1) % kNumberOfZRegisters,
   7278                           (zt_code + 2) % kNumberOfZRegisters,
   7279                           (zt_code + 3) % kNumberOfZRegisters};
   7280 
   7281   LogicVRegister zt[4] = {
   7282       ReadVRegister(zt_codes[0]),
   7283       ReadVRegister(zt_codes[1]),
   7284       ReadVRegister(zt_codes[2]),
   7285       ReadVRegister(zt_codes[3]),
   7286   };
   7287 
   7288   // For unpacked forms (e.g. `st1b { z0.h }, ...`, the upper parts of the lanes
   7289   // are ignored, so read the source register using the VectorFormat that
   7290   // corresponds with the storage format, and multiply the index accordingly.
   7291   VectorFormat unpack_vform =
   7292       SVEFormatFromLaneSizeInBytesLog2(msize_in_bytes_log2);
   7293   int unpack_shift = esize_in_bytes_log2 - msize_in_bytes_log2;
   7294 
   7295   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   7296     if (!pg.IsActive(vform, i)) continue;
   7297 
   7298     for (int r = 0; r < reg_count; r++) {
   7299       uint64_t element_address = addr.GetElementAddress(i, r);
   7300       if (!StoreLane(zt[r], unpack_vform, i << unpack_shift, element_address)) {
   7301         return;
   7302       }
   7303     }
   7304   }
   7305 
   7306   if (ShouldTraceWrites()) {
   7307     PrintRegisterFormat format = GetPrintRegisterFormat(vform);
   7308     if (esize_in_bytes_log2 == msize_in_bytes_log2) {
   7309       // Use an FP format where it's likely that we're accessing FP data.
   7310       format = GetPrintRegisterFormatTryFP(format);
   7311     }
   7312     // Stores don't represent a change to the source register's value, so only
   7313     // print the relevant part of the value.
   7314     format = GetPrintRegPartial(format);
   7315 
   7316     PrintZStructAccess(zt_code,
   7317                        reg_count,
   7318                        pg,
   7319                        format,
   7320                        msize_in_bytes,
   7321                        "->",
   7322                        addr);
   7323   }
   7324 }
   7325 
   7326 bool Simulator::SVEStructuredLoadHelper(VectorFormat vform,
   7327                                         const LogicPRegister& pg,
   7328                                         unsigned zt_code,
   7329                                         const LogicSVEAddressVector& addr,
   7330                                         bool is_signed) {
   7331   int esize_in_bytes_log2 = LaneSizeInBytesLog2FromFormat(vform);
   7332   int msize_in_bytes_log2 = addr.GetMsizeInBytesLog2();
   7333   int msize_in_bytes = addr.GetMsizeInBytes();
   7334   int reg_count = addr.GetRegCount();
   7335 
   7336   VIXL_ASSERT(zt_code < kNumberOfZRegisters);
   7337   VIXL_ASSERT(esize_in_bytes_log2 >= msize_in_bytes_log2);
   7338   VIXL_ASSERT((reg_count >= 1) && (reg_count <= 4));
   7339 
   7340   unsigned zt_codes[4] = {zt_code,
   7341                           (zt_code + 1) % kNumberOfZRegisters,
   7342                           (zt_code + 2) % kNumberOfZRegisters,
   7343                           (zt_code + 3) % kNumberOfZRegisters};
   7344   LogicVRegister zt[4] = {
   7345       ReadVRegister(zt_codes[0]),
   7346       ReadVRegister(zt_codes[1]),
   7347       ReadVRegister(zt_codes[2]),
   7348       ReadVRegister(zt_codes[3]),
   7349   };
   7350 
   7351   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   7352     for (int r = 0; r < reg_count; r++) {
   7353       uint64_t element_address = addr.GetElementAddress(i, r);
   7354 
   7355       if (!pg.IsActive(vform, i)) {
   7356         zt[r].SetUint(vform, i, 0);
   7357         continue;
   7358       }
   7359 
   7360       if (is_signed) {
   7361         if (!LoadIntToLane(zt[r], vform, msize_in_bytes, i, element_address)) {
   7362           return false;
   7363         }
   7364       } else {
   7365         if (!LoadUintToLane(zt[r], vform, msize_in_bytes, i, element_address)) {
   7366           return false;
   7367         }
   7368       }
   7369     }
   7370   }
   7371 
   7372   if (ShouldTraceVRegs()) {
   7373     PrintRegisterFormat format = GetPrintRegisterFormat(vform);
   7374     if ((esize_in_bytes_log2 == msize_in_bytes_log2) && !is_signed) {
   7375       // Use an FP format where it's likely that we're accessing FP data.
   7376       format = GetPrintRegisterFormatTryFP(format);
   7377     }
   7378     PrintZStructAccess(zt_code,
   7379                        reg_count,
   7380                        pg,
   7381                        format,
   7382                        msize_in_bytes,
   7383                        "<-",
   7384                        addr);
   7385   }
   7386   return true;
   7387 }
   7388 
   7389 LogicPRegister Simulator::brka(LogicPRegister pd,
   7390                                const LogicPRegister& pg,
   7391                                const LogicPRegister& pn) {
   7392   bool break_ = false;
   7393   for (int i = 0; i < LaneCountFromFormat(kFormatVnB); i++) {
   7394     if (pg.IsActive(kFormatVnB, i)) {
   7395       pd.SetActive(kFormatVnB, i, !break_);
   7396       break_ |= pn.IsActive(kFormatVnB, i);
   7397     }
   7398   }
   7399 
   7400   return pd;
   7401 }
   7402 
   7403 LogicPRegister Simulator::brkb(LogicPRegister pd,
   7404                                const LogicPRegister& pg,
   7405                                const LogicPRegister& pn) {
   7406   bool break_ = false;
   7407   for (int i = 0; i < LaneCountFromFormat(kFormatVnB); i++) {
   7408     if (pg.IsActive(kFormatVnB, i)) {
   7409       break_ |= pn.IsActive(kFormatVnB, i);
   7410       pd.SetActive(kFormatVnB, i, !break_);
   7411     }
   7412   }
   7413 
   7414   return pd;
   7415 }
   7416 
   7417 LogicPRegister Simulator::brkn(LogicPRegister pdm,
   7418                                const LogicPRegister& pg,
   7419                                const LogicPRegister& pn) {
   7420   if (!IsLastActive(kFormatVnB, pg, pn)) {
   7421     pfalse(pdm);
   7422   }
   7423   return pdm;
   7424 }
   7425 
   7426 LogicPRegister Simulator::brkpa(LogicPRegister pd,
   7427                                 const LogicPRegister& pg,
   7428                                 const LogicPRegister& pn,
   7429                                 const LogicPRegister& pm) {
   7430   bool last_active = IsLastActive(kFormatVnB, pg, pn);
   7431 
   7432   for (int i = 0; i < LaneCountFromFormat(kFormatVnB); i++) {
   7433     bool active = false;
   7434     if (pg.IsActive(kFormatVnB, i)) {
   7435       active = last_active;
   7436       last_active = last_active && !pm.IsActive(kFormatVnB, i);
   7437     }
   7438     pd.SetActive(kFormatVnB, i, active);
   7439   }
   7440 
   7441   return pd;
   7442 }
   7443 
   7444 LogicPRegister Simulator::brkpb(LogicPRegister pd,
   7445                                 const LogicPRegister& pg,
   7446                                 const LogicPRegister& pn,
   7447                                 const LogicPRegister& pm) {
   7448   bool last_active = IsLastActive(kFormatVnB, pg, pn);
   7449 
   7450   for (int i = 0; i < LaneCountFromFormat(kFormatVnB); i++) {
   7451     bool active = false;
   7452     if (pg.IsActive(kFormatVnB, i)) {
   7453       last_active = last_active && !pm.IsActive(kFormatVnB, i);
   7454       active = last_active;
   7455     }
   7456     pd.SetActive(kFormatVnB, i, active);
   7457   }
   7458 
   7459   return pd;
   7460 }
   7461 
   7462 void Simulator::SVEFaultTolerantLoadHelper(VectorFormat vform,
   7463                                            const LogicPRegister& pg,
   7464                                            unsigned zt_code,
   7465                                            const LogicSVEAddressVector& addr,
   7466                                            SVEFaultTolerantLoadType type,
   7467                                            bool is_signed) {
   7468   int esize_in_bytes = LaneSizeInBytesFromFormat(vform);
   7469   int msize_in_bits = addr.GetMsizeInBits();
   7470   int msize_in_bytes = addr.GetMsizeInBytes();
   7471 
   7472   VIXL_ASSERT(zt_code < kNumberOfZRegisters);
   7473   VIXL_ASSERT(esize_in_bytes >= msize_in_bytes);
   7474   VIXL_ASSERT(addr.GetRegCount() == 1);
   7475 
   7476   LogicVRegister zt = ReadVRegister(zt_code);
   7477   LogicPRegister ffr = ReadFFR();
   7478 
   7479   // Non-faulting loads are allowed to fail arbitrarily. To stress user
   7480   // code, fail a random element in roughly one in eight full-vector loads.
   7481   uint32_t rnd = static_cast<uint32_t>(jrand48(rand_state_));
   7482   int fake_fault_at_lane = rnd % (LaneCountFromFormat(vform) * 8);
   7483 
   7484   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   7485     uint64_t value = 0;
   7486 
   7487     if (pg.IsActive(vform, i)) {
   7488       uint64_t element_address = addr.GetElementAddress(i, 0);
   7489 
   7490       if (type == kSVEFirstFaultLoad) {
   7491         // First-faulting loads always load the first active element, regardless
   7492         // of FFR. The result will be discarded if its FFR lane is inactive, but
   7493         // it could still generate a fault.
   7494         VIXL_DEFINE_OR_RETURN(mem_result,
   7495                               MemReadUint(msize_in_bytes, element_address));
   7496         value = mem_result;
   7497         // All subsequent elements have non-fault semantics.
   7498         type = kSVENonFaultLoad;
   7499 
   7500       } else if (ffr.IsActive(vform, i)) {
   7501         // Simulation of fault-tolerant loads relies on system calls, and is
   7502         // likely to be relatively slow, so we only actually perform the load if
   7503         // its FFR lane is active.
   7504 
   7505         bool can_read = (i < fake_fault_at_lane) &&
   7506                         CanReadMemory(element_address, msize_in_bytes);
   7507         if (can_read) {
   7508           VIXL_DEFINE_OR_RETURN(mem_result,
   7509                                 MemReadUint(msize_in_bytes, element_address));
   7510           value = mem_result;
   7511         } else {
   7512           // Propagate the fault to the end of FFR.
   7513           for (int j = i; j < LaneCountFromFormat(vform); j++) {
   7514             ffr.SetActive(vform, j, false);
   7515           }
   7516         }
   7517       }
   7518     }
   7519 
   7520     // The architecture permits a few possible results for inactive FFR lanes
   7521     // (including those caused by a fault in this instruction). We choose to
   7522     // leave the register value unchanged (like merging predication) because
   7523     // no other input to this instruction can have the same behaviour.
   7524     //
   7525     // Note that this behaviour takes precedence over pg's zeroing predication.
   7526 
   7527     if (ffr.IsActive(vform, i)) {
   7528       int msb = msize_in_bits - 1;
   7529       if (is_signed) {
   7530         zt.SetInt(vform, i, ExtractSignedBitfield64(msb, 0, value));
   7531       } else {
   7532         zt.SetUint(vform, i, ExtractUnsignedBitfield64(msb, 0, value));
   7533       }
   7534     }
   7535   }
   7536 
   7537   if (ShouldTraceVRegs()) {
   7538     PrintRegisterFormat format = GetPrintRegisterFormat(vform);
   7539     if ((esize_in_bytes == msize_in_bytes) && !is_signed) {
   7540       // Use an FP format where it's likely that we're accessing FP data.
   7541       format = GetPrintRegisterFormatTryFP(format);
   7542     }
   7543     // Log accessed lanes that are active in both pg and ffr. PrintZStructAccess
   7544     // expects a single mask, so combine the two predicates.
   7545     SimPRegister mask;
   7546     SVEPredicateLogicalHelper(AND_p_p_pp_z, mask, pg, ffr);
   7547     PrintZStructAccess(zt_code, 1, mask, format, msize_in_bytes, "<-", addr);
   7548   }
   7549 }
   7550 
   7551 void Simulator::SVEGatherLoadScalarPlusVectorHelper(const Instruction* instr,
   7552                                                     VectorFormat vform,
   7553                                                     SVEOffsetModifier mod) {
   7554   bool is_signed = instr->ExtractBit(14) == 0;
   7555   bool is_ff = instr->ExtractBit(13) == 1;
   7556   // Note that these instructions don't use the Dtype encoding.
   7557   int msize_in_bytes_log2 = instr->ExtractBits(24, 23);
   7558   int scale = instr->ExtractBit(21) * msize_in_bytes_log2;
   7559   uint64_t base = ReadXRegister(instr->GetRn(), Reg31IsStackPointer);
   7560   LogicSVEAddressVector addr(base,
   7561                              &ReadVRegister(instr->GetRm()),
   7562                              vform,
   7563                              mod,
   7564                              scale);
   7565   addr.SetMsizeInBytesLog2(msize_in_bytes_log2);
   7566   if (is_ff) {
   7567     SVEFaultTolerantLoadHelper(vform,
   7568                                ReadPRegister(instr->GetPgLow8()),
   7569                                instr->GetRt(),
   7570                                addr,
   7571                                kSVEFirstFaultLoad,
   7572                                is_signed);
   7573   } else {
   7574     SVEStructuredLoadHelper(vform,
   7575                             ReadPRegister(instr->GetPgLow8()),
   7576                             instr->GetRt(),
   7577                             addr,
   7578                             is_signed);
   7579   }
   7580 }
   7581 
   7582 int Simulator::GetFirstActive(VectorFormat vform,
   7583                               const LogicPRegister& pg) const {
   7584   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   7585     if (pg.IsActive(vform, i)) return i;
   7586   }
   7587   return -1;
   7588 }
   7589 
   7590 int Simulator::GetLastActive(VectorFormat vform,
   7591                              const LogicPRegister& pg) const {
   7592   for (int i = LaneCountFromFormat(vform) - 1; i >= 0; i--) {
   7593     if (pg.IsActive(vform, i)) return i;
   7594   }
   7595   return -1;
   7596 }
   7597 
   7598 int Simulator::CountActiveLanes(VectorFormat vform,
   7599                                 const LogicPRegister& pg) const {
   7600   int count = 0;
   7601   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   7602     count += pg.IsActive(vform, i) ? 1 : 0;
   7603   }
   7604   return count;
   7605 }
   7606 
   7607 int Simulator::CountActiveAndTrueLanes(VectorFormat vform,
   7608                                        const LogicPRegister& pg,
   7609                                        const LogicPRegister& pn) const {
   7610   int count = 0;
   7611   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   7612     count += (pg.IsActive(vform, i) && pn.IsActive(vform, i)) ? 1 : 0;
   7613   }
   7614   return count;
   7615 }
   7616 
   7617 int Simulator::GetPredicateConstraintLaneCount(VectorFormat vform,
   7618                                                int pattern) const {
   7619   VIXL_ASSERT(IsSVEFormat(vform));
   7620   int all = LaneCountFromFormat(vform);
   7621   VIXL_ASSERT(all > 0);
   7622 
   7623   switch (pattern) {
   7624     case SVE_VL1:
   7625     case SVE_VL2:
   7626     case SVE_VL3:
   7627     case SVE_VL4:
   7628     case SVE_VL5:
   7629     case SVE_VL6:
   7630     case SVE_VL7:
   7631     case SVE_VL8:
   7632       // VL1-VL8 are encoded directly.
   7633       VIXL_STATIC_ASSERT(SVE_VL1 == 1);
   7634       VIXL_STATIC_ASSERT(SVE_VL8 == 8);
   7635       return (pattern <= all) ? pattern : 0;
   7636     case SVE_VL16:
   7637     case SVE_VL32:
   7638     case SVE_VL64:
   7639     case SVE_VL128:
   7640     case SVE_VL256: {
   7641       // VL16-VL256 are encoded as log2(N) + c.
   7642       int min = 16 << (pattern - SVE_VL16);
   7643       return (min <= all) ? min : 0;
   7644     }
   7645     // Special cases.
   7646     case SVE_POW2:
   7647       return 1 << HighestSetBitPosition(all);
   7648     case SVE_MUL4:
   7649       return all - (all % 4);
   7650     case SVE_MUL3:
   7651       return all - (all % 3);
   7652     case SVE_ALL:
   7653       return all;
   7654   }
   7655   // Unnamed cases architecturally return 0.
   7656   return 0;
   7657 }
   7658 
   7659 LogicPRegister Simulator::match(VectorFormat vform,
   7660                                 LogicPRegister dst,
   7661                                 const LogicVRegister& haystack,
   7662                                 const LogicVRegister& needles,
   7663                                 bool negate_match) {
   7664   SimVRegister ztemp;
   7665   SimPRegister ptemp;
   7666 
   7667   pfalse(dst);
   7668   int lanes_per_segment = kQRegSize / LaneSizeInBitsFromFormat(vform);
   7669   for (int i = 0; i < lanes_per_segment; i++) {
   7670     dup_elements_to_segments(vform, ztemp, needles, i);
   7671     SVEIntCompareVectorsHelper(eq,
   7672                                vform,
   7673                                ptemp,
   7674                                GetPTrue(),
   7675                                haystack,
   7676                                ztemp,
   7677                                false,
   7678                                LeaveFlags);
   7679     SVEPredicateLogicalHelper(ORR_p_p_pp_z, dst, dst, ptemp);
   7680   }
   7681   if (negate_match) {
   7682     ptrue(vform, ptemp, SVE_ALL);
   7683     SVEPredicateLogicalHelper(EOR_p_p_pp_z, dst, dst, ptemp);
   7684   }
   7685   return dst;
   7686 }
   7687 
   7688 uint64_t LogicSVEAddressVector::GetStructAddress(int lane) const {
   7689   if (IsContiguous()) {
   7690     return base_ + (lane * GetRegCount()) * GetMsizeInBytes();
   7691   }
   7692 
   7693   VIXL_ASSERT(IsScatterGather());
   7694   VIXL_ASSERT(vector_ != NULL);
   7695 
   7696   // For scatter-gather accesses, we need to extract the offset from vector_,
   7697   // and apply modifiers.
   7698 
   7699   uint64_t offset = 0;
   7700   switch (vector_form_) {
   7701     case kFormatVnS:
   7702       offset = vector_->GetLane<uint32_t>(lane);
   7703       break;
   7704     case kFormatVnD:
   7705       offset = vector_->GetLane<uint64_t>(lane);
   7706       break;
   7707     default:
   7708       VIXL_UNIMPLEMENTED();
   7709       break;
   7710   }
   7711 
   7712   switch (vector_mod_) {
   7713     case SVE_MUL_VL:
   7714       VIXL_UNIMPLEMENTED();
   7715       break;
   7716     case SVE_LSL:
   7717       // We apply the shift below. There's nothing to do here.
   7718       break;
   7719     case NO_SVE_OFFSET_MODIFIER:
   7720       VIXL_ASSERT(vector_shift_ == 0);
   7721       break;
   7722     case SVE_UXTW:
   7723       offset = ExtractUnsignedBitfield64(kWRegSize - 1, 0, offset);
   7724       break;
   7725     case SVE_SXTW:
   7726       offset = ExtractSignedBitfield64(kWRegSize - 1, 0, offset);
   7727       break;
   7728   }
   7729 
   7730   return base_ + (offset << vector_shift_);
   7731 }
   7732 
   7733 LogicVRegister Simulator::pack_odd_elements(VectorFormat vform,
   7734                                             LogicVRegister dst,
   7735                                             const LogicVRegister& src) {
   7736   SimVRegister zero;
   7737   zero.Clear();
   7738   return uzp2(vform, dst, src, zero);
   7739 }
   7740 
   7741 LogicVRegister Simulator::pack_even_elements(VectorFormat vform,
   7742                                              LogicVRegister dst,
   7743                                              const LogicVRegister& src) {
   7744   SimVRegister zero;
   7745   zero.Clear();
   7746   return uzp1(vform, dst, src, zero);
   7747 }
   7748 
   7749 LogicVRegister Simulator::adcl(VectorFormat vform,
   7750                                LogicVRegister dst,
   7751                                const LogicVRegister& src1,
   7752                                const LogicVRegister& src2,
   7753                                bool top) {
   7754   unsigned reg_size = LaneSizeInBitsFromFormat(vform);
   7755   VIXL_ASSERT((reg_size == kSRegSize) || (reg_size == kDRegSize));
   7756 
   7757   for (int i = 0; i < LaneCountFromFormat(vform); i += 2) {
   7758     uint64_t left = src1.Uint(vform, i + (top ? 1 : 0));
   7759     uint64_t right = dst.Uint(vform, i);
   7760     unsigned carry_in = src2.Uint(vform, i + 1) & 1;
   7761     std::pair<uint64_t, uint8_t> val_and_flags =
   7762         AddWithCarry(reg_size, left, right, carry_in);
   7763 
   7764     // Set even lanes to the result of the addition.
   7765     dst.SetUint(vform, i, val_and_flags.first);
   7766 
   7767     // Set odd lanes to the carry flag from the addition.
   7768     uint64_t carry_out = (val_and_flags.second >> 1) & 1;
   7769     dst.SetUint(vform, i + 1, carry_out);
   7770   }
   7771   return dst;
   7772 }
   7773 
   7774 // Multiply the 2x8 8-bit matrix in src1 by the 8x2 8-bit matrix in src2, add
   7775 // the 2x2 32-bit result to the matrix in srcdst, and write back to srcdst.
   7776 //
   7777 // Matrices of the form:
   7778 //
   7779 //  src1 = ( a b c d e f g h )  src2 = ( A B )
   7780 //         ( i j k l m n o p )         ( C D )
   7781 //                                     ( E F )
   7782 //                                     ( G H )
   7783 //                                     ( I J )
   7784 //                                     ( K L )
   7785 //                                     ( M N )
   7786 //                                     ( O P )
   7787 //
   7788 // Are stored in the input vector registers as:
   7789 //
   7790 //           15  14  13  12  11  10  9   8   7   6   5   4   3   2   1   0
   7791 //  src1 = [ p | o | n | m | l | k | j | i | h | g | f | e | d | c | b | a ]
   7792 //  src2 = [ P | N | L | J | H | F | D | B | O | M | K | I | G | E | C | A ]
   7793 //
   7794 LogicVRegister Simulator::matmul(VectorFormat vform_dst,
   7795                                  LogicVRegister srcdst,
   7796                                  const LogicVRegister& src1,
   7797                                  const LogicVRegister& src2,
   7798                                  bool src1_signed,
   7799                                  bool src2_signed) {
   7800   // Two destination forms are supported: Q register containing four S-sized
   7801   // elements (4S) and Z register containing n S-sized elements (VnS).
   7802   VIXL_ASSERT((vform_dst == kFormat4S) || (vform_dst == kFormatVnS));
   7803   VectorFormat vform_src = kFormatVnB;
   7804   int b_per_segment = kQRegSize / kBRegSize;
   7805   int s_per_segment = kQRegSize / kSRegSize;
   7806   int64_t result[kZRegMaxSizeInBytes / kSRegSizeInBytes] = {};
   7807   int segment_count = LaneCountFromFormat(vform_dst) / 4;
   7808   for (int seg = 0; seg < segment_count; seg++) {
   7809     for (int i = 0; i < 2; i++) {
   7810       for (int j = 0; j < 2; j++) {
   7811         int dstidx = (2 * i) + j + (seg * s_per_segment);
   7812         int64_t sum = srcdst.Int(vform_dst, dstidx);
   7813         for (int k = 0; k < 8; k++) {
   7814           int idx1 = (8 * i) + k + (seg * b_per_segment);
   7815           int idx2 = (8 * j) + k + (seg * b_per_segment);
   7816           int64_t e1 = src1_signed ? src1.Int(vform_src, idx1)
   7817                                    : src1.Uint(vform_src, idx1);
   7818           int64_t e2 = src2_signed ? src2.Int(vform_src, idx2)
   7819                                    : src2.Uint(vform_src, idx2);
   7820           sum += e1 * e2;
   7821         }
   7822         result[dstidx] = sum;
   7823       }
   7824     }
   7825   }
   7826   srcdst.SetIntArray(vform_dst, result);
   7827   return srcdst;
   7828 }
   7829 
   7830 // Multiply the 2x2 FP matrix in src1 by the 2x2 FP matrix in src2, add the 2x2
   7831 // result to the matrix in srcdst, and write back to srcdst.
   7832 //
   7833 // Matrices of the form:
   7834 //
   7835 //  src1 = ( a b )  src2 = ( A B )
   7836 //         ( c d )         ( C D )
   7837 //
   7838 // Are stored in the input vector registers as:
   7839 //
   7840 //           3   2   1   0
   7841 //  src1 = [ d | c | b | a ]
   7842 //  src2 = [ D | B | C | A ]
   7843 //
   7844 template <typename T>
   7845 LogicVRegister Simulator::fmatmul(VectorFormat vform,
   7846                                   LogicVRegister srcdst,
   7847                                   const LogicVRegister& src1,
   7848                                   const LogicVRegister& src2) {
   7849   T result[kZRegMaxSizeInBytes / sizeof(T)];
   7850   int T_per_segment = 4;
   7851   int segment_count = GetVectorLengthInBytes() / (T_per_segment * sizeof(T));
   7852   for (int seg = 0; seg < segment_count; seg++) {
   7853     int segoff = seg * T_per_segment;
   7854     for (int i = 0; i < 2; i++) {
   7855       for (int j = 0; j < 2; j++) {
   7856         T prod0 = FPMulNaNs(src1.Float<T>(2 * i + 0 + segoff),
   7857                             src2.Float<T>(2 * j + 0 + segoff));
   7858         T prod1 = FPMulNaNs(src1.Float<T>(2 * i + 1 + segoff),
   7859                             src2.Float<T>(2 * j + 1 + segoff));
   7860         T sum = FPAdd(srcdst.Float<T>(2 * i + j + segoff), prod0);
   7861         result[2 * i + j + segoff] = FPAdd(sum, prod1);
   7862       }
   7863     }
   7864   }
   7865   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
   7866     // Elements outside a multiple of 4T are set to zero. This happens only
   7867     // for double precision operations, when the VL is a multiple of 128 bits,
   7868     // but not a multiple of 256 bits.
   7869     T value = (i < (T_per_segment * segment_count)) ? result[i] : 0;
   7870     srcdst.SetFloat<T>(vform, i, value);
   7871   }
   7872   return srcdst;
   7873 }
   7874 
   7875 LogicVRegister Simulator::fmatmul(VectorFormat vform,
   7876                                   LogicVRegister dst,
   7877                                   const LogicVRegister& src1,
   7878                                   const LogicVRegister& src2) {
   7879   if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
   7880     fmatmul<float>(vform, dst, src1, src2);
   7881   } else {
   7882     VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kDRegSize);
   7883     fmatmul<double>(vform, dst, src1, src2);
   7884   }
   7885   return dst;
   7886 }
   7887 
   7888 }  // namespace aarch64
   7889 }  // namespace vixl
   7890 
   7891 #endif  // VIXL_INCLUDE_SIMULATOR_AARCH64
	duckstation duckstation, but archived from the revision just before upstream changed it to a proprietary software project, this version is the libre one
	git clone https://git.neptards.moe/u3shit/duckstation.git
	Log \| Files \| Refs \| README \| LICENSE